/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things into the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But what about the ringbuffer control registers (head, tail, etc.)?
 * Shouldn't we just need a set of those per engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated.
 * The local default context for each opened fd is more complex, because we
 * don't know at creation time which engine is going to use it. To handle
 * this, we have implemented a deferred creation of LR contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one. This
 * request will then be resubmitted along with a new request for a different context,
 * which will cause the hardware to continue executing the second request and queue
 * the new request (the GPU detects the condition of a context getting preempted
 * with the same context and optimizes the context switch flow by not doing
 * preemption, but just sampling the new tail pointer).
 *
 */
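/*
 * Illustrative sketch (not driver code): a simplified view of how the two
 * ELSP slots are filled from the head of a per-engine queue, as described
 * above. The helpers used here (queue_head(), queue_next(), elsp_write())
 * are hypothetical; the real logic lives in execlists_dequeue() and
 * execlists_submit_ports() below. This only shows the "a context may not
 * appear twice in an execution list" coalescing rule in isolation:
 *
 *	struct i915_request *first, *second;
 *
 *	first = queue_head(queue);
 *	second = queue_next(queue, first);
 *
 *	// Fold consecutive requests from the same context into the first
 *	// slot (only the final RING_TAIL matters) until the second slot
 *	// holds a different context or runs empty.
 *	while (second && second->context == first->context) {
 *		first = second;
 *		second = queue_next(queue, second);
 *	}
 *
 *	elsp_write(engine, first, second); // second may be NULL
 */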
#include <linux/interrupt.h>

#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_trace.h"
#include "i915_vgpu.h"
#include "intel_breadcrumbs.h"
#include "intel_context.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_gt_requests.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_reset.h"
#include "intel_ring.h"
#include "intel_workarounds.h"
#include "shmem_utils.h"

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)

#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
#define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
#define GEN12_IDLE_CTX_ID		0x7FF
#define GEN12_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */

struct virtual_engine {
	struct intel_engine_cs base;
	struct intel_context context;
	struct rcu_work rcu;

	/*
	 * We allow only a single request through the virtual engine at a time
	 * (each request in the timeline waits for the completion fence of
	 * the previous before being submitted). By restricting ourselves to
	 * only submitting a single request, each request is placed on to a
	 * physical engine to maximise load spreading (by virtue of the late
	 * greedy scheduling -- each real engine takes the next available
	 * request upon idling).
	 */
	struct i915_request *request;

	/*
	 * We keep an rbtree of available virtual engines inside each physical
	 * engine, sorted by priority. Here we preallocate the nodes we need
	 * for the virtual engine, indexed by physical_engine->id.
	 */
	struct ve_node {
		struct rb_node rb;
		int prio;
	} nodes[I915_NUM_ENGINES];

	/*
	 * Keep track of bonded pairs -- restrictions upon our selection of
	 * physical engines any particular request may be submitted to.
	 * If we receive a submit-fence from a master engine, we will only
	 * use one of sibling_mask physical engines.
213 */ 214 struct ve_bond { 215 const struct intel_engine_cs *master; 216 intel_engine_mask_t sibling_mask; 217 } *bonds; 218 unsigned int num_bonds; 219 220 /* And finally, which physical engines this virtual engine maps onto. */ 221 unsigned int num_siblings; 222 struct intel_engine_cs *siblings[]; 223 }; 224 225 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine) 226 { 227 GEM_BUG_ON(!intel_engine_is_virtual(engine)); 228 return container_of(engine, struct virtual_engine, base); 229 } 230 231 static int __execlists_context_alloc(struct intel_context *ce, 232 struct intel_engine_cs *engine); 233 234 static void execlists_init_reg_state(u32 *reg_state, 235 const struct intel_context *ce, 236 const struct intel_engine_cs *engine, 237 const struct intel_ring *ring, 238 bool close); 239 static void 240 __execlists_update_reg_state(const struct intel_context *ce, 241 const struct intel_engine_cs *engine, 242 u32 head); 243 244 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine) 245 { 246 if (INTEL_GEN(engine->i915) >= 12) 247 return 0x60; 248 else if (INTEL_GEN(engine->i915) >= 9) 249 return 0x54; 250 else if (engine->class == RENDER_CLASS) 251 return 0x58; 252 else 253 return -1; 254 } 255 256 static int lrc_ring_gpr0(const struct intel_engine_cs *engine) 257 { 258 if (INTEL_GEN(engine->i915) >= 12) 259 return 0x74; 260 else if (INTEL_GEN(engine->i915) >= 9) 261 return 0x68; 262 else if (engine->class == RENDER_CLASS) 263 return 0xd8; 264 else 265 return -1; 266 } 267 268 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine) 269 { 270 if (INTEL_GEN(engine->i915) >= 12) 271 return 0x12; 272 else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS) 273 return 0x18; 274 else 275 return -1; 276 } 277 278 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine) 279 { 280 int x; 281 282 x = lrc_ring_wa_bb_per_ctx(engine); 283 if (x < 0) 284 return x; 285 286 return x + 2; 287 } 288 289 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine) 290 { 291 int x; 292 293 x = lrc_ring_indirect_ptr(engine); 294 if (x < 0) 295 return x; 296 297 return x + 2; 298 } 299 300 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine) 301 { 302 if (engine->class != RENDER_CLASS) 303 return -1; 304 305 if (INTEL_GEN(engine->i915) >= 12) 306 return 0xb6; 307 else if (INTEL_GEN(engine->i915) >= 11) 308 return 0xaa; 309 else 310 return -1; 311 } 312 313 static u32 314 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine) 315 { 316 switch (INTEL_GEN(engine->i915)) { 317 default: 318 MISSING_CASE(INTEL_GEN(engine->i915)); 319 fallthrough; 320 case 12: 321 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 322 case 11: 323 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 324 case 10: 325 return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 326 case 9: 327 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 328 case 8: 329 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 330 } 331 } 332 333 static void 334 lrc_ring_setup_indirect_ctx(u32 *regs, 335 const struct intel_engine_cs *engine, 336 u32 ctx_bb_ggtt_addr, 337 u32 size) 338 { 339 GEM_BUG_ON(!size); 340 GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES)); 341 GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1); 342 regs[lrc_ring_indirect_ptr(engine) + 1] = 343 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES); 344 345 GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1); 346 regs[lrc_ring_indirect_offset(engine) + 1] = 347 
lrc_ring_indirect_offset_default(engine) << 6; 348 } 349 350 static u32 intel_context_get_runtime(const struct intel_context *ce) 351 { 352 /* 353 * We can use either ppHWSP[16] which is recorded before the context 354 * switch (and so excludes the cost of context switches) or use the 355 * value from the context image itself, which is saved/restored earlier 356 * and so includes the cost of the save. 357 */ 358 return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]); 359 } 360 361 static void mark_eio(struct i915_request *rq) 362 { 363 if (i915_request_completed(rq)) 364 return; 365 366 GEM_BUG_ON(i915_request_signaled(rq)); 367 368 i915_request_set_error_once(rq, -EIO); 369 i915_request_mark_complete(rq); 370 } 371 372 static struct i915_request * 373 active_request(const struct intel_timeline * const tl, struct i915_request *rq) 374 { 375 struct i915_request *active = rq; 376 377 rcu_read_lock(); 378 list_for_each_entry_continue_reverse(rq, &tl->requests, link) { 379 if (i915_request_completed(rq)) 380 break; 381 382 active = rq; 383 } 384 rcu_read_unlock(); 385 386 return active; 387 } 388 389 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine) 390 { 391 return (i915_ggtt_offset(engine->status_page.vma) + 392 I915_GEM_HWS_PREEMPT_ADDR); 393 } 394 395 static inline void 396 ring_set_paused(const struct intel_engine_cs *engine, int state) 397 { 398 /* 399 * We inspect HWS_PREEMPT with a semaphore inside 400 * engine->emit_fini_breadcrumb. If the dword is true, 401 * the ring is paused as the semaphore will busywait 402 * until the dword is false. 403 */ 404 engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state; 405 if (state) 406 wmb(); 407 } 408 409 static inline struct i915_priolist *to_priolist(struct rb_node *rb) 410 { 411 return rb_entry(rb, struct i915_priolist, node); 412 } 413 414 static inline int rq_prio(const struct i915_request *rq) 415 { 416 return READ_ONCE(rq->sched.attr.priority); 417 } 418 419 static int effective_prio(const struct i915_request *rq) 420 { 421 int prio = rq_prio(rq); 422 423 /* 424 * If this request is special and must not be interrupted at any 425 * cost, so be it. Note we are only checking the most recent request 426 * in the context and so may be masking an earlier vip request. It 427 * is hoped that under the conditions where nopreempt is used, this 428 * will not matter (i.e. all requests to that context will be 429 * nopreempt for as long as desired). 430 */ 431 if (i915_request_has_nopreempt(rq)) 432 prio = I915_PRIORITY_UNPREEMPTABLE; 433 434 return prio; 435 } 436 437 static int queue_prio(const struct intel_engine_execlists *execlists) 438 { 439 struct i915_priolist *p; 440 struct rb_node *rb; 441 442 rb = rb_first_cached(&execlists->queue); 443 if (!rb) 444 return INT_MIN; 445 446 /* 447 * As the priolist[] are inverted, with the highest priority in [0], 448 * we have to flip the index value to become priority. 449 */ 450 p = to_priolist(rb); 451 if (!I915_USER_PRIORITY_SHIFT) 452 return p->priority; 453 454 return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used); 455 } 456 457 static inline bool need_preempt(const struct intel_engine_cs *engine, 458 const struct i915_request *rq, 459 struct rb_node *rb) 460 { 461 int last_prio; 462 463 if (!intel_engine_has_semaphores(engine)) 464 return false; 465 466 /* 467 * Check if the current priority hint merits a preemption attempt. 
	 *
	 * We record the highest value priority we saw during rescheduling
	 * prior to this dequeue, therefore we know that if it is strictly
	 * less than the current tail of ELSP[0], we do not need to force
	 * a preempt-to-idle cycle.
	 *
	 * However, the priority hint is a mere hint that we may need to
	 * preempt. If that hint is stale or we may be trying to preempt
	 * ourselves, ignore the request.
	 *
	 * More naturally we would write
	 *	prio >= max(0, last);
	 * except that we wish to prevent triggering preemption at the same
	 * priority level: the task that is running should remain running
	 * to preserve FIFO ordering of dependencies.
	 */
	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
	if (engine->execlists.queue_priority_hint <= last_prio)
		return false;

	/*
	 * Check against the first request in ELSP[1], it will, thanks to the
	 * power of PI, be the highest priority of that context.
	 */
	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
		return true;

	if (rb) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		bool preempt = false;

		if (engine == ve->siblings[0]) { /* only preempt one sibling */
			struct i915_request *next;

			rcu_read_lock();
			next = READ_ONCE(ve->request);
			if (next)
				preempt = rq_prio(next) > last_prio;
			rcu_read_unlock();
		}

		if (preempt)
			return preempt;
	}

	/*
	 * If the inflight context did not trigger the preemption, then maybe
	 * it was the set of queued requests? Pick the highest priority in
	 * the queue (the first active priolist) and see if it deserves to be
	 * running instead of ELSP[0].
	 *
	 * The highest priority request in the queue cannot be either
	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
	 * context, its priority would not exceed ELSP[0] aka last_prio.
	 */
	return queue_prio(&engine->execlists) > last_prio;
}

__maybe_unused static inline bool
assert_priority_queue(const struct i915_request *prev,
		      const struct i915_request *next)
{
	/*
	 * Without preemption, the prev may refer to the still active element
	 * which we refuse to let go.
	 *
	 * Even with preemption, there are times when we think it is better not
	 * to preempt and leave an ostensibly lower priority request in flight.
	 */
	if (i915_request_is_active(prev))
		return true;

	return rq_prio(prev) >= rq_prio(next);
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
550 * 551 * This is what a descriptor looks like, from LSB to MSB:: 552 * 553 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template) 554 * bits 12-31: LRCA, GTT address of (the HWSP of) this context 555 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC) 556 * bits 53-54: mbz, reserved for use by hardware 557 * bits 55-63: group ID, currently unused and set to 0 558 * 559 * Starting from Gen11, the upper dword of the descriptor has a new format: 560 * 561 * bits 32-36: reserved 562 * bits 37-47: SW context ID 563 * bits 48:53: engine instance 564 * bit 54: mbz, reserved for use by hardware 565 * bits 55-60: SW counter 566 * bits 61-63: engine class 567 * 568 * engine info, SW context ID and SW counter need to form a unique number 569 * (Context ID) per lrc. 570 */ 571 static u32 572 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine) 573 { 574 u32 desc; 575 576 desc = INTEL_LEGACY_32B_CONTEXT; 577 if (i915_vm_is_4lvl(ce->vm)) 578 desc = INTEL_LEGACY_64B_CONTEXT; 579 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT; 580 581 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE; 582 if (IS_GEN(engine->i915, 8)) 583 desc |= GEN8_CTX_L3LLC_COHERENT; 584 585 return i915_ggtt_offset(ce->state) | desc; 586 } 587 588 static inline unsigned int dword_in_page(void *addr) 589 { 590 return offset_in_page(addr) / sizeof(u32); 591 } 592 593 static void set_offsets(u32 *regs, 594 const u8 *data, 595 const struct intel_engine_cs *engine, 596 bool clear) 597 #define NOP(x) (BIT(7) | (x)) 598 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6))) 599 #define POSTED BIT(0) 600 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200)) 601 #define REG16(x) \ 602 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \ 603 (((x) >> 2) & 0x7f) 604 #define END(total_state_size) 0, (total_state_size) 605 { 606 const u32 base = engine->mmio_base; 607 608 while (*data) { 609 u8 count, flags; 610 611 if (*data & BIT(7)) { /* skip */ 612 count = *data++ & ~BIT(7); 613 if (clear) 614 memset32(regs, MI_NOOP, count); 615 regs += count; 616 continue; 617 } 618 619 count = *data & 0x3f; 620 flags = *data >> 6; 621 data++; 622 623 *regs = MI_LOAD_REGISTER_IMM(count); 624 if (flags & POSTED) 625 *regs |= MI_LRI_FORCE_POSTED; 626 if (INTEL_GEN(engine->i915) >= 11) 627 *regs |= MI_LRI_LRM_CS_MMIO; 628 regs++; 629 630 GEM_BUG_ON(!count); 631 do { 632 u32 offset = 0; 633 u8 v; 634 635 do { 636 v = *data++; 637 offset <<= 7; 638 offset |= v & ~BIT(7); 639 } while (v & BIT(7)); 640 641 regs[0] = base + (offset << 2); 642 if (clear) 643 regs[1] = 0; 644 regs += 2; 645 } while (--count); 646 } 647 648 if (clear) { 649 u8 count = *++data; 650 651 /* Clear past the tail for HW access */ 652 GEM_BUG_ON(dword_in_page(regs) > count); 653 memset32(regs, MI_NOOP, count - dword_in_page(regs)); 654 655 /* Close the batch; used mainly by live_lrc_layout() */ 656 *regs = MI_BATCH_BUFFER_END; 657 if (INTEL_GEN(engine->i915) >= 10) 658 *regs |= BIT(0); 659 } 660 } 661 662 static const u8 gen8_xcs_offsets[] = { 663 NOP(1), 664 LRI(11, 0), 665 REG16(0x244), 666 REG(0x034), 667 REG(0x030), 668 REG(0x038), 669 REG(0x03c), 670 REG(0x168), 671 REG(0x140), 672 REG(0x110), 673 REG(0x11c), 674 REG(0x114), 675 REG(0x118), 676 677 NOP(9), 678 LRI(9, 0), 679 REG16(0x3a8), 680 REG16(0x28c), 681 REG16(0x288), 682 REG16(0x284), 683 REG16(0x280), 684 REG16(0x27c), 685 REG16(0x278), 686 REG16(0x274), 687 REG16(0x270), 688 689 NOP(13), 690 LRI(2, 0), 691 REG16(0x200), 692 REG(0x028), 693 694 END(80) 
695 }; 696 697 static const u8 gen9_xcs_offsets[] = { 698 NOP(1), 699 LRI(14, POSTED), 700 REG16(0x244), 701 REG(0x034), 702 REG(0x030), 703 REG(0x038), 704 REG(0x03c), 705 REG(0x168), 706 REG(0x140), 707 REG(0x110), 708 REG(0x11c), 709 REG(0x114), 710 REG(0x118), 711 REG(0x1c0), 712 REG(0x1c4), 713 REG(0x1c8), 714 715 NOP(3), 716 LRI(9, POSTED), 717 REG16(0x3a8), 718 REG16(0x28c), 719 REG16(0x288), 720 REG16(0x284), 721 REG16(0x280), 722 REG16(0x27c), 723 REG16(0x278), 724 REG16(0x274), 725 REG16(0x270), 726 727 NOP(13), 728 LRI(1, POSTED), 729 REG16(0x200), 730 731 NOP(13), 732 LRI(44, POSTED), 733 REG(0x028), 734 REG(0x09c), 735 REG(0x0c0), 736 REG(0x178), 737 REG(0x17c), 738 REG16(0x358), 739 REG(0x170), 740 REG(0x150), 741 REG(0x154), 742 REG(0x158), 743 REG16(0x41c), 744 REG16(0x600), 745 REG16(0x604), 746 REG16(0x608), 747 REG16(0x60c), 748 REG16(0x610), 749 REG16(0x614), 750 REG16(0x618), 751 REG16(0x61c), 752 REG16(0x620), 753 REG16(0x624), 754 REG16(0x628), 755 REG16(0x62c), 756 REG16(0x630), 757 REG16(0x634), 758 REG16(0x638), 759 REG16(0x63c), 760 REG16(0x640), 761 REG16(0x644), 762 REG16(0x648), 763 REG16(0x64c), 764 REG16(0x650), 765 REG16(0x654), 766 REG16(0x658), 767 REG16(0x65c), 768 REG16(0x660), 769 REG16(0x664), 770 REG16(0x668), 771 REG16(0x66c), 772 REG16(0x670), 773 REG16(0x674), 774 REG16(0x678), 775 REG16(0x67c), 776 REG(0x068), 777 778 END(176) 779 }; 780 781 static const u8 gen12_xcs_offsets[] = { 782 NOP(1), 783 LRI(13, POSTED), 784 REG16(0x244), 785 REG(0x034), 786 REG(0x030), 787 REG(0x038), 788 REG(0x03c), 789 REG(0x168), 790 REG(0x140), 791 REG(0x110), 792 REG(0x1c0), 793 REG(0x1c4), 794 REG(0x1c8), 795 REG(0x180), 796 REG16(0x2b4), 797 798 NOP(5), 799 LRI(9, POSTED), 800 REG16(0x3a8), 801 REG16(0x28c), 802 REG16(0x288), 803 REG16(0x284), 804 REG16(0x280), 805 REG16(0x27c), 806 REG16(0x278), 807 REG16(0x274), 808 REG16(0x270), 809 810 END(80) 811 }; 812 813 static const u8 gen8_rcs_offsets[] = { 814 NOP(1), 815 LRI(14, POSTED), 816 REG16(0x244), 817 REG(0x034), 818 REG(0x030), 819 REG(0x038), 820 REG(0x03c), 821 REG(0x168), 822 REG(0x140), 823 REG(0x110), 824 REG(0x11c), 825 REG(0x114), 826 REG(0x118), 827 REG(0x1c0), 828 REG(0x1c4), 829 REG(0x1c8), 830 831 NOP(3), 832 LRI(9, POSTED), 833 REG16(0x3a8), 834 REG16(0x28c), 835 REG16(0x288), 836 REG16(0x284), 837 REG16(0x280), 838 REG16(0x27c), 839 REG16(0x278), 840 REG16(0x274), 841 REG16(0x270), 842 843 NOP(13), 844 LRI(1, 0), 845 REG(0x0c8), 846 847 END(80) 848 }; 849 850 static const u8 gen9_rcs_offsets[] = { 851 NOP(1), 852 LRI(14, POSTED), 853 REG16(0x244), 854 REG(0x34), 855 REG(0x30), 856 REG(0x38), 857 REG(0x3c), 858 REG(0x168), 859 REG(0x140), 860 REG(0x110), 861 REG(0x11c), 862 REG(0x114), 863 REG(0x118), 864 REG(0x1c0), 865 REG(0x1c4), 866 REG(0x1c8), 867 868 NOP(3), 869 LRI(9, POSTED), 870 REG16(0x3a8), 871 REG16(0x28c), 872 REG16(0x288), 873 REG16(0x284), 874 REG16(0x280), 875 REG16(0x27c), 876 REG16(0x278), 877 REG16(0x274), 878 REG16(0x270), 879 880 NOP(13), 881 LRI(1, 0), 882 REG(0xc8), 883 884 NOP(13), 885 LRI(44, POSTED), 886 REG(0x28), 887 REG(0x9c), 888 REG(0xc0), 889 REG(0x178), 890 REG(0x17c), 891 REG16(0x358), 892 REG(0x170), 893 REG(0x150), 894 REG(0x154), 895 REG(0x158), 896 REG16(0x41c), 897 REG16(0x600), 898 REG16(0x604), 899 REG16(0x608), 900 REG16(0x60c), 901 REG16(0x610), 902 REG16(0x614), 903 REG16(0x618), 904 REG16(0x61c), 905 REG16(0x620), 906 REG16(0x624), 907 REG16(0x628), 908 REG16(0x62c), 909 REG16(0x630), 910 REG16(0x634), 911 REG16(0x638), 912 REG16(0x63c), 913 
REG16(0x640), 914 REG16(0x644), 915 REG16(0x648), 916 REG16(0x64c), 917 REG16(0x650), 918 REG16(0x654), 919 REG16(0x658), 920 REG16(0x65c), 921 REG16(0x660), 922 REG16(0x664), 923 REG16(0x668), 924 REG16(0x66c), 925 REG16(0x670), 926 REG16(0x674), 927 REG16(0x678), 928 REG16(0x67c), 929 REG(0x68), 930 931 END(176) 932 }; 933 934 static const u8 gen11_rcs_offsets[] = { 935 NOP(1), 936 LRI(15, POSTED), 937 REG16(0x244), 938 REG(0x034), 939 REG(0x030), 940 REG(0x038), 941 REG(0x03c), 942 REG(0x168), 943 REG(0x140), 944 REG(0x110), 945 REG(0x11c), 946 REG(0x114), 947 REG(0x118), 948 REG(0x1c0), 949 REG(0x1c4), 950 REG(0x1c8), 951 REG(0x180), 952 953 NOP(1), 954 LRI(9, POSTED), 955 REG16(0x3a8), 956 REG16(0x28c), 957 REG16(0x288), 958 REG16(0x284), 959 REG16(0x280), 960 REG16(0x27c), 961 REG16(0x278), 962 REG16(0x274), 963 REG16(0x270), 964 965 LRI(1, POSTED), 966 REG(0x1b0), 967 968 NOP(10), 969 LRI(1, 0), 970 REG(0x0c8), 971 972 END(80) 973 }; 974 975 static const u8 gen12_rcs_offsets[] = { 976 NOP(1), 977 LRI(13, POSTED), 978 REG16(0x244), 979 REG(0x034), 980 REG(0x030), 981 REG(0x038), 982 REG(0x03c), 983 REG(0x168), 984 REG(0x140), 985 REG(0x110), 986 REG(0x1c0), 987 REG(0x1c4), 988 REG(0x1c8), 989 REG(0x180), 990 REG16(0x2b4), 991 992 NOP(5), 993 LRI(9, POSTED), 994 REG16(0x3a8), 995 REG16(0x28c), 996 REG16(0x288), 997 REG16(0x284), 998 REG16(0x280), 999 REG16(0x27c), 1000 REG16(0x278), 1001 REG16(0x274), 1002 REG16(0x270), 1003 1004 LRI(3, POSTED), 1005 REG(0x1b0), 1006 REG16(0x5a8), 1007 REG16(0x5ac), 1008 1009 NOP(6), 1010 LRI(1, 0), 1011 REG(0x0c8), 1012 NOP(3 + 9 + 1), 1013 1014 LRI(51, POSTED), 1015 REG16(0x588), 1016 REG16(0x588), 1017 REG16(0x588), 1018 REG16(0x588), 1019 REG16(0x588), 1020 REG16(0x588), 1021 REG(0x028), 1022 REG(0x09c), 1023 REG(0x0c0), 1024 REG(0x178), 1025 REG(0x17c), 1026 REG16(0x358), 1027 REG(0x170), 1028 REG(0x150), 1029 REG(0x154), 1030 REG(0x158), 1031 REG16(0x41c), 1032 REG16(0x600), 1033 REG16(0x604), 1034 REG16(0x608), 1035 REG16(0x60c), 1036 REG16(0x610), 1037 REG16(0x614), 1038 REG16(0x618), 1039 REG16(0x61c), 1040 REG16(0x620), 1041 REG16(0x624), 1042 REG16(0x628), 1043 REG16(0x62c), 1044 REG16(0x630), 1045 REG16(0x634), 1046 REG16(0x638), 1047 REG16(0x63c), 1048 REG16(0x640), 1049 REG16(0x644), 1050 REG16(0x648), 1051 REG16(0x64c), 1052 REG16(0x650), 1053 REG16(0x654), 1054 REG16(0x658), 1055 REG16(0x65c), 1056 REG16(0x660), 1057 REG16(0x664), 1058 REG16(0x668), 1059 REG16(0x66c), 1060 REG16(0x670), 1061 REG16(0x674), 1062 REG16(0x678), 1063 REG16(0x67c), 1064 REG(0x068), 1065 REG(0x084), 1066 NOP(1), 1067 1068 END(192) 1069 }; 1070 1071 #undef END 1072 #undef REG16 1073 #undef REG 1074 #undef LRI 1075 #undef NOP 1076 1077 static const u8 *reg_offsets(const struct intel_engine_cs *engine) 1078 { 1079 /* 1080 * The gen12+ lists only have the registers we program in the basic 1081 * default state. We rely on the context image using relative 1082 * addressing to automatic fixup the register state between the 1083 * physical engines for virtual engine. 
1084 */ 1085 GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 && 1086 !intel_engine_has_relative_mmio(engine)); 1087 1088 if (engine->class == RENDER_CLASS) { 1089 if (INTEL_GEN(engine->i915) >= 12) 1090 return gen12_rcs_offsets; 1091 else if (INTEL_GEN(engine->i915) >= 11) 1092 return gen11_rcs_offsets; 1093 else if (INTEL_GEN(engine->i915) >= 9) 1094 return gen9_rcs_offsets; 1095 else 1096 return gen8_rcs_offsets; 1097 } else { 1098 if (INTEL_GEN(engine->i915) >= 12) 1099 return gen12_xcs_offsets; 1100 else if (INTEL_GEN(engine->i915) >= 9) 1101 return gen9_xcs_offsets; 1102 else 1103 return gen8_xcs_offsets; 1104 } 1105 } 1106 1107 static struct i915_request * 1108 __unwind_incomplete_requests(struct intel_engine_cs *engine) 1109 { 1110 struct i915_request *rq, *rn, *active = NULL; 1111 struct list_head *pl; 1112 int prio = I915_PRIORITY_INVALID; 1113 1114 lockdep_assert_held(&engine->active.lock); 1115 1116 list_for_each_entry_safe_reverse(rq, rn, 1117 &engine->active.requests, 1118 sched.link) { 1119 if (i915_request_completed(rq)) 1120 continue; /* XXX */ 1121 1122 __i915_request_unsubmit(rq); 1123 1124 /* 1125 * Push the request back into the queue for later resubmission. 1126 * If this request is not native to this physical engine (i.e. 1127 * it came from a virtual source), push it back onto the virtual 1128 * engine so that it can be moved across onto another physical 1129 * engine as load dictates. 1130 */ 1131 if (likely(rq->execution_mask == engine->mask)) { 1132 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID); 1133 if (rq_prio(rq) != prio) { 1134 prio = rq_prio(rq); 1135 pl = i915_sched_lookup_priolist(engine, prio); 1136 } 1137 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 1138 1139 list_move(&rq->sched.link, pl); 1140 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 1141 1142 /* Check in case we rollback so far we wrap [size/2] */ 1143 if (intel_ring_direction(rq->ring, 1144 rq->tail, 1145 rq->ring->tail + 8) > 0) 1146 rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE; 1147 1148 active = rq; 1149 } else { 1150 struct intel_engine_cs *owner = rq->context->engine; 1151 1152 WRITE_ONCE(rq->engine, owner); 1153 owner->submit_request(rq); 1154 active = NULL; 1155 } 1156 } 1157 1158 return active; 1159 } 1160 1161 struct i915_request * 1162 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists) 1163 { 1164 struct intel_engine_cs *engine = 1165 container_of(execlists, typeof(*engine), execlists); 1166 1167 return __unwind_incomplete_requests(engine); 1168 } 1169 1170 static inline void 1171 execlists_context_status_change(struct i915_request *rq, unsigned long status) 1172 { 1173 /* 1174 * Only used when GVT-g is enabled now. When GVT-g is disabled, 1175 * The compiler should eliminate this function as dead-code. 
1176 */ 1177 if (!IS_ENABLED(CONFIG_DRM_I915_GVT)) 1178 return; 1179 1180 atomic_notifier_call_chain(&rq->engine->context_status_notifier, 1181 status, rq); 1182 } 1183 1184 static void intel_engine_context_in(struct intel_engine_cs *engine) 1185 { 1186 unsigned long flags; 1187 1188 if (atomic_add_unless(&engine->stats.active, 1, 0)) 1189 return; 1190 1191 write_seqlock_irqsave(&engine->stats.lock, flags); 1192 if (!atomic_add_unless(&engine->stats.active, 1, 0)) { 1193 engine->stats.start = ktime_get(); 1194 atomic_inc(&engine->stats.active); 1195 } 1196 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1197 } 1198 1199 static void intel_engine_context_out(struct intel_engine_cs *engine) 1200 { 1201 unsigned long flags; 1202 1203 GEM_BUG_ON(!atomic_read(&engine->stats.active)); 1204 1205 if (atomic_add_unless(&engine->stats.active, -1, 1)) 1206 return; 1207 1208 write_seqlock_irqsave(&engine->stats.lock, flags); 1209 if (atomic_dec_and_test(&engine->stats.active)) { 1210 engine->stats.total = 1211 ktime_add(engine->stats.total, 1212 ktime_sub(ktime_get(), engine->stats.start)); 1213 } 1214 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1215 } 1216 1217 static void 1218 execlists_check_context(const struct intel_context *ce, 1219 const struct intel_engine_cs *engine) 1220 { 1221 const struct intel_ring *ring = ce->ring; 1222 u32 *regs = ce->lrc_reg_state; 1223 bool valid = true; 1224 int x; 1225 1226 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) { 1227 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n", 1228 engine->name, 1229 regs[CTX_RING_START], 1230 i915_ggtt_offset(ring->vma)); 1231 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 1232 valid = false; 1233 } 1234 1235 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) != 1236 (RING_CTL_SIZE(ring->size) | RING_VALID)) { 1237 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n", 1238 engine->name, 1239 regs[CTX_RING_CTL], 1240 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID)); 1241 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 1242 valid = false; 1243 } 1244 1245 x = lrc_ring_mi_mode(engine); 1246 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) { 1247 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n", 1248 engine->name, regs[x + 1]); 1249 regs[x + 1] &= ~STOP_RING; 1250 regs[x + 1] |= STOP_RING << 16; 1251 valid = false; 1252 } 1253 1254 WARN_ONCE(!valid, "Invalid lrc state found before submission\n"); 1255 } 1256 1257 static void restore_default_state(struct intel_context *ce, 1258 struct intel_engine_cs *engine) 1259 { 1260 u32 *regs; 1261 1262 regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE); 1263 execlists_init_reg_state(regs, ce, engine, ce->ring, true); 1264 1265 ce->runtime.last = intel_context_get_runtime(ce); 1266 } 1267 1268 static void reset_active(struct i915_request *rq, 1269 struct intel_engine_cs *engine) 1270 { 1271 struct intel_context * const ce = rq->context; 1272 u32 head; 1273 1274 /* 1275 * The executing context has been cancelled. We want to prevent 1276 * further execution along this context and propagate the error on 1277 * to anything depending on its results. 1278 * 1279 * In __i915_request_submit(), we apply the -EIO and remove the 1280 * requests' payloads for any banned requests. But first, we must 1281 * rewind the context back to the start of the incomplete request so 1282 * that we do not jump back into the middle of the batch. 
1283 * 1284 * We preserve the breadcrumbs and semaphores of the incomplete 1285 * requests so that inter-timeline dependencies (i.e other timelines) 1286 * remain correctly ordered. And we defer to __i915_request_submit() 1287 * so that all asynchronous waits are correctly handled. 1288 */ 1289 ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n", 1290 rq->fence.context, rq->fence.seqno); 1291 1292 /* On resubmission of the active request, payload will be scrubbed */ 1293 if (i915_request_completed(rq)) 1294 head = rq->tail; 1295 else 1296 head = active_request(ce->timeline, rq)->head; 1297 head = intel_ring_wrap(ce->ring, head); 1298 1299 /* Scrub the context image to prevent replaying the previous batch */ 1300 restore_default_state(ce, engine); 1301 __execlists_update_reg_state(ce, engine, head); 1302 1303 /* We've switched away, so this should be a no-op, but intent matters */ 1304 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; 1305 } 1306 1307 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt) 1308 { 1309 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1310 ce->runtime.num_underflow += dt < 0; 1311 ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt); 1312 #endif 1313 } 1314 1315 static void intel_context_update_runtime(struct intel_context *ce) 1316 { 1317 u32 old; 1318 s32 dt; 1319 1320 if (intel_context_is_barrier(ce)) 1321 return; 1322 1323 old = ce->runtime.last; 1324 ce->runtime.last = intel_context_get_runtime(ce); 1325 dt = ce->runtime.last - old; 1326 1327 if (unlikely(dt <= 0)) { 1328 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n", 1329 old, ce->runtime.last, dt); 1330 st_update_runtime_underflow(ce, dt); 1331 return; 1332 } 1333 1334 ewma_runtime_add(&ce->runtime.avg, dt); 1335 ce->runtime.total += dt; 1336 } 1337 1338 static inline struct intel_engine_cs * 1339 __execlists_schedule_in(struct i915_request *rq) 1340 { 1341 struct intel_engine_cs * const engine = rq->engine; 1342 struct intel_context * const ce = rq->context; 1343 1344 intel_context_get(ce); 1345 1346 if (unlikely(intel_context_is_banned(ce))) 1347 reset_active(rq, engine); 1348 1349 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 1350 execlists_check_context(ce, engine); 1351 1352 if (ce->tag) { 1353 /* Use a fixed tag for OA and friends */ 1354 GEM_BUG_ON(ce->tag <= BITS_PER_LONG); 1355 ce->lrc.ccid = ce->tag; 1356 } else { 1357 /* We don't need a strict matching tag, just different values */ 1358 unsigned int tag = ffs(READ_ONCE(engine->context_tag)); 1359 1360 GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG); 1361 clear_bit(tag - 1, &engine->context_tag); 1362 ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32); 1363 1364 BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID); 1365 } 1366 1367 ce->lrc.ccid |= engine->execlists.ccid; 1368 1369 __intel_gt_pm_get(engine->gt); 1370 if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active)) 1371 intel_uncore_forcewake_get(engine->uncore, engine->fw_domain); 1372 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN); 1373 intel_engine_context_in(engine); 1374 1375 return engine; 1376 } 1377 1378 static inline struct i915_request * 1379 execlists_schedule_in(struct i915_request *rq, int idx) 1380 { 1381 struct intel_context * const ce = rq->context; 1382 struct intel_engine_cs *old; 1383 1384 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine)); 1385 trace_i915_request_in(rq, idx); 1386 1387 old = READ_ONCE(ce->inflight); 1388 do { 1389 if (!old) { 1390 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq)); 1391 break; 1392 } 1393 
} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old))); 1394 1395 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine); 1396 return i915_request_get(rq); 1397 } 1398 1399 static void kick_siblings(struct i915_request *rq, struct intel_context *ce) 1400 { 1401 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 1402 struct i915_request *next = READ_ONCE(ve->request); 1403 1404 if (next == rq || (next && next->execution_mask & ~rq->execution_mask)) 1405 tasklet_hi_schedule(&ve->base.execlists.tasklet); 1406 } 1407 1408 static inline void 1409 __execlists_schedule_out(struct i915_request *rq, 1410 struct intel_engine_cs * const engine, 1411 unsigned int ccid) 1412 { 1413 struct intel_context * const ce = rq->context; 1414 1415 /* 1416 * NB process_csb() is not under the engine->active.lock and hence 1417 * schedule_out can race with schedule_in meaning that we should 1418 * refrain from doing non-trivial work here. 1419 */ 1420 1421 /* 1422 * If we have just completed this context, the engine may now be 1423 * idle and we want to re-enter powersaving. 1424 */ 1425 if (list_is_last_rcu(&rq->link, &ce->timeline->requests) && 1426 i915_request_completed(rq)) 1427 intel_engine_add_retire(engine, ce->timeline); 1428 1429 ccid >>= GEN11_SW_CTX_ID_SHIFT - 32; 1430 ccid &= GEN12_MAX_CONTEXT_HW_ID; 1431 if (ccid < BITS_PER_LONG) { 1432 GEM_BUG_ON(ccid == 0); 1433 GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag)); 1434 set_bit(ccid - 1, &engine->context_tag); 1435 } 1436 1437 intel_context_update_runtime(ce); 1438 intel_engine_context_out(engine); 1439 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT); 1440 if (engine->fw_domain && !atomic_dec_return(&engine->fw_active)) 1441 intel_uncore_forcewake_put(engine->uncore, engine->fw_domain); 1442 intel_gt_pm_put_async(engine->gt); 1443 1444 /* 1445 * If this is part of a virtual engine, its next request may 1446 * have been blocked waiting for access to the active context. 1447 * We have to kick all the siblings again in case we need to 1448 * switch (e.g. the next request is not runnable on this 1449 * engine). Hopefully, we will already have submitted the next 1450 * request before the tasklet runs and do not need to rebuild 1451 * each virtual tree and kick everyone again. 1452 */ 1453 if (ce->engine != engine) 1454 kick_siblings(rq, ce); 1455 1456 intel_context_put(ce); 1457 } 1458 1459 static inline void 1460 execlists_schedule_out(struct i915_request *rq) 1461 { 1462 struct intel_context * const ce = rq->context; 1463 struct intel_engine_cs *cur, *old; 1464 u32 ccid; 1465 1466 trace_i915_request_out(rq); 1467 1468 ccid = rq->context->lrc.ccid; 1469 old = READ_ONCE(ce->inflight); 1470 do 1471 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL; 1472 while (!try_cmpxchg(&ce->inflight, &old, cur)); 1473 if (!cur) 1474 __execlists_schedule_out(rq, old, ccid); 1475 1476 i915_request_put(rq); 1477 } 1478 1479 static u64 execlists_update_context(struct i915_request *rq) 1480 { 1481 struct intel_context *ce = rq->context; 1482 u64 desc = ce->lrc.desc; 1483 u32 tail, prev; 1484 1485 /* 1486 * WaIdleLiteRestore:bdw,skl 1487 * 1488 * We should never submit the context with the same RING_TAIL twice 1489 * just in case we submit an empty ring, which confuses the HW. 1490 * 1491 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of 1492 * the normal request to be able to always advance the RING_TAIL on 1493 * subsequent resubmissions (for lite restore). 
Should that fail us, 1494 * and we try and submit the same tail again, force the context 1495 * reload. 1496 * 1497 * If we need to return to a preempted context, we need to skip the 1498 * lite-restore and force it to reload the RING_TAIL. Otherwise, the 1499 * HW has a tendency to ignore us rewinding the TAIL to the end of 1500 * an earlier request. 1501 */ 1502 GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail); 1503 prev = rq->ring->tail; 1504 tail = intel_ring_set_tail(rq->ring, rq->tail); 1505 if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0)) 1506 desc |= CTX_DESC_FORCE_RESTORE; 1507 ce->lrc_reg_state[CTX_RING_TAIL] = tail; 1508 rq->tail = rq->wa_tail; 1509 1510 /* 1511 * Make sure the context image is complete before we submit it to HW. 1512 * 1513 * Ostensibly, writes (including the WCB) should be flushed prior to 1514 * an uncached write such as our mmio register access, the empirical 1515 * evidence (esp. on Braswell) suggests that the WC write into memory 1516 * may not be visible to the HW prior to the completion of the UC 1517 * register write and that we may begin execution from the context 1518 * before its image is complete leading to invalid PD chasing. 1519 */ 1520 wmb(); 1521 1522 ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE; 1523 return desc; 1524 } 1525 1526 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port) 1527 { 1528 if (execlists->ctrl_reg) { 1529 writel(lower_32_bits(desc), execlists->submit_reg + port * 2); 1530 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1); 1531 } else { 1532 writel(upper_32_bits(desc), execlists->submit_reg); 1533 writel(lower_32_bits(desc), execlists->submit_reg); 1534 } 1535 } 1536 1537 static __maybe_unused char * 1538 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq) 1539 { 1540 if (!rq) 1541 return ""; 1542 1543 snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d", 1544 prefix, 1545 rq->context->lrc.ccid, 1546 rq->fence.context, rq->fence.seqno, 1547 i915_request_completed(rq) ? "!" : 1548 i915_request_started(rq) ? 
"*" : 1549 "", 1550 rq_prio(rq)); 1551 1552 return buf; 1553 } 1554 1555 static __maybe_unused void 1556 trace_ports(const struct intel_engine_execlists *execlists, 1557 const char *msg, 1558 struct i915_request * const *ports) 1559 { 1560 const struct intel_engine_cs *engine = 1561 container_of(execlists, typeof(*engine), execlists); 1562 char __maybe_unused p0[40], p1[40]; 1563 1564 if (!ports[0]) 1565 return; 1566 1567 ENGINE_TRACE(engine, "%s { %s%s }\n", msg, 1568 dump_port(p0, sizeof(p0), "", ports[0]), 1569 dump_port(p1, sizeof(p1), ", ", ports[1])); 1570 } 1571 1572 static inline bool 1573 reset_in_progress(const struct intel_engine_execlists *execlists) 1574 { 1575 return unlikely(!__tasklet_is_enabled(&execlists->tasklet)); 1576 } 1577 1578 static __maybe_unused bool 1579 assert_pending_valid(const struct intel_engine_execlists *execlists, 1580 const char *msg) 1581 { 1582 struct intel_engine_cs *engine = 1583 container_of(execlists, typeof(*engine), execlists); 1584 struct i915_request * const *port, *rq; 1585 struct intel_context *ce = NULL; 1586 bool sentinel = false; 1587 u32 ccid = -1; 1588 1589 trace_ports(execlists, msg, execlists->pending); 1590 1591 /* We may be messing around with the lists during reset, lalala */ 1592 if (reset_in_progress(execlists)) 1593 return true; 1594 1595 if (!execlists->pending[0]) { 1596 GEM_TRACE_ERR("%s: Nothing pending for promotion!\n", 1597 engine->name); 1598 return false; 1599 } 1600 1601 if (execlists->pending[execlists_num_ports(execlists)]) { 1602 GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n", 1603 engine->name, execlists_num_ports(execlists)); 1604 return false; 1605 } 1606 1607 for (port = execlists->pending; (rq = *port); port++) { 1608 unsigned long flags; 1609 bool ok = true; 1610 1611 GEM_BUG_ON(!kref_read(&rq->fence.refcount)); 1612 GEM_BUG_ON(!i915_request_is_active(rq)); 1613 1614 if (ce == rq->context) { 1615 GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n", 1616 engine->name, 1617 ce->timeline->fence_context, 1618 port - execlists->pending); 1619 return false; 1620 } 1621 ce = rq->context; 1622 1623 if (ccid == ce->lrc.ccid) { 1624 GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n", 1625 engine->name, 1626 ccid, ce->timeline->fence_context, 1627 port - execlists->pending); 1628 return false; 1629 } 1630 ccid = ce->lrc.ccid; 1631 1632 /* 1633 * Sentinels are supposed to be the last request so they flush 1634 * the current execution off the HW. Check that they are the only 1635 * request in the pending submission. 1636 */ 1637 if (sentinel) { 1638 GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n", 1639 engine->name, 1640 ce->timeline->fence_context, 1641 port - execlists->pending); 1642 return false; 1643 } 1644 sentinel = i915_request_has_sentinel(rq); 1645 1646 /* Hold tightly onto the lock to prevent concurrent retires! 
*/ 1647 if (!spin_trylock_irqsave(&rq->lock, flags)) 1648 continue; 1649 1650 if (i915_request_completed(rq)) 1651 goto unlock; 1652 1653 if (i915_active_is_idle(&ce->active) && 1654 !intel_context_is_barrier(ce)) { 1655 GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n", 1656 engine->name, 1657 ce->timeline->fence_context, 1658 port - execlists->pending); 1659 ok = false; 1660 goto unlock; 1661 } 1662 1663 if (!i915_vma_is_pinned(ce->state)) { 1664 GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n", 1665 engine->name, 1666 ce->timeline->fence_context, 1667 port - execlists->pending); 1668 ok = false; 1669 goto unlock; 1670 } 1671 1672 if (!i915_vma_is_pinned(ce->ring->vma)) { 1673 GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n", 1674 engine->name, 1675 ce->timeline->fence_context, 1676 port - execlists->pending); 1677 ok = false; 1678 goto unlock; 1679 } 1680 1681 unlock: 1682 spin_unlock_irqrestore(&rq->lock, flags); 1683 if (!ok) 1684 return false; 1685 } 1686 1687 return ce; 1688 } 1689 1690 static void execlists_submit_ports(struct intel_engine_cs *engine) 1691 { 1692 struct intel_engine_execlists *execlists = &engine->execlists; 1693 unsigned int n; 1694 1695 GEM_BUG_ON(!assert_pending_valid(execlists, "submit")); 1696 1697 /* 1698 * We can skip acquiring intel_runtime_pm_get() here as it was taken 1699 * on our behalf by the request (see i915_gem_mark_busy()) and it will 1700 * not be relinquished until the device is idle (see 1701 * i915_gem_idle_work_handler()). As a precaution, we make sure 1702 * that all ELSP are drained i.e. we have processed the CSB, 1703 * before allowing ourselves to idle and calling intel_runtime_pm_put(). 1704 */ 1705 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 1706 1707 /* 1708 * ELSQ note: the submit queue is not cleared after being submitted 1709 * to the HW so we need to make sure we always clean it up. This is 1710 * currently ensured by the fact that we always write the same number 1711 * of elsq entries, keep this in mind before changing the loop below. 1712 */ 1713 for (n = execlists_num_ports(execlists); n--; ) { 1714 struct i915_request *rq = execlists->pending[n]; 1715 1716 write_desc(execlists, 1717 rq ? execlists_update_context(rq) : 0, 1718 n); 1719 } 1720 1721 /* we need to manually load the submit queue */ 1722 if (execlists->ctrl_reg) 1723 writel(EL_CTRL_LOAD, execlists->ctrl_reg); 1724 } 1725 1726 static bool ctx_single_port_submission(const struct intel_context *ce) 1727 { 1728 return (IS_ENABLED(CONFIG_DRM_I915_GVT) && 1729 intel_context_force_single_submission(ce)); 1730 } 1731 1732 static bool can_merge_ctx(const struct intel_context *prev, 1733 const struct intel_context *next) 1734 { 1735 if (prev != next) 1736 return false; 1737 1738 if (ctx_single_port_submission(prev)) 1739 return false; 1740 1741 return true; 1742 } 1743 1744 static unsigned long i915_request_flags(const struct i915_request *rq) 1745 { 1746 return READ_ONCE(rq->fence.flags); 1747 } 1748 1749 static bool can_merge_rq(const struct i915_request *prev, 1750 const struct i915_request *next) 1751 { 1752 GEM_BUG_ON(prev == next); 1753 GEM_BUG_ON(!assert_priority_queue(prev, next)); 1754 1755 /* 1756 * We do not submit known completed requests. Therefore if the next 1757 * request is already completed, we can pretend to merge it in 1758 * with the previous context (and we will skip updating the ELSP 1759 * and tracking). 
Thus hopefully keeping the ELSP full with active
	 * contexts, despite the best efforts of preempt-to-busy to confuse
	 * us.
	 */
	if (i915_request_completed(next))
		return true;

	if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
		      BIT(I915_FENCE_FLAG_SENTINEL))))
		return false;

	if (!can_merge_ctx(prev->context, next->context))
		return false;

	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
	return true;
}

static void virtual_update_register_offsets(u32 *regs,
					    struct intel_engine_cs *engine)
{
	set_offsets(regs, reg_offsets(engine), engine, false);
}

static bool virtual_matches(const struct virtual_engine *ve,
			    const struct i915_request *rq,
			    const struct intel_engine_cs *engine)
{
	const struct intel_engine_cs *inflight;

	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
		return false;

	/*
	 * We track when the HW has completed saving the context image
	 * (i.e. when we have seen the final CS event switching out of
	 * the context) and must not overwrite the context image before
	 * then. This restricts us to only using the active engine
	 * while the previous virtualized request is inflight (so
	 * we reuse the register offsets). This is a very small
	 * hysteresis on the greedy selection algorithm.
	 */
	inflight = intel_context_inflight(&ve->context);
	if (inflight && inflight != engine)
		return false;

	return true;
}

static void virtual_xfer_context(struct virtual_engine *ve,
				 struct intel_engine_cs *engine)
{
	unsigned int n;

	if (likely(engine == ve->siblings[0]))
		return;

	GEM_BUG_ON(READ_ONCE(ve->context.inflight));
	if (!intel_engine_has_relative_mmio(engine))
		virtual_update_register_offsets(ve->context.lrc_reg_state,
						engine);

	/*
	 * Move the bound engine to the top of the list for
	 * future execution. We then kick this tasklet first
	 * before checking others, so that we preferentially
	 * reuse this set of bound registers.
	 */
	for (n = 1; n < ve->num_siblings; n++) {
		if (ve->siblings[n] == engine) {
			swap(ve->siblings[n], ve->siblings[0]);
			break;
		}
	}
}

#define for_each_waiter(p__, rq__) \
	list_for_each_entry_lockless(p__, \
				     &(rq__)->sched.waiters_list, \
				     wait_link)

#define for_each_signaler(p__, rq__) \
	list_for_each_entry_rcu(p__, \
				&(rq__)->sched.signalers_list, \
				signal_link)

static void defer_request(struct i915_request *rq, struct list_head * const pl)
{
	LIST_HEAD(list);

	/*
	 * We want to move the interrupted request to the back of
	 * the round-robin list (i.e. its priority level), but
	 * in doing so, we must then move all requests that were in
	 * flight and were waiting for the interrupted request to
	 * be run after it again.
1856 */ 1857 do { 1858 struct i915_dependency *p; 1859 1860 GEM_BUG_ON(i915_request_is_active(rq)); 1861 list_move_tail(&rq->sched.link, pl); 1862 1863 for_each_waiter(p, rq) { 1864 struct i915_request *w = 1865 container_of(p->waiter, typeof(*w), sched); 1866 1867 if (p->flags & I915_DEPENDENCY_WEAK) 1868 continue; 1869 1870 /* Leave semaphores spinning on the other engines */ 1871 if (w->engine != rq->engine) 1872 continue; 1873 1874 /* No waiter should start before its signaler */ 1875 GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) && 1876 i915_request_started(w) && 1877 !i915_request_completed(rq)); 1878 1879 GEM_BUG_ON(i915_request_is_active(w)); 1880 if (!i915_request_is_ready(w)) 1881 continue; 1882 1883 if (rq_prio(w) < rq_prio(rq)) 1884 continue; 1885 1886 GEM_BUG_ON(rq_prio(w) > rq_prio(rq)); 1887 list_move_tail(&w->sched.link, &list); 1888 } 1889 1890 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 1891 } while (rq); 1892 } 1893 1894 static void defer_active(struct intel_engine_cs *engine) 1895 { 1896 struct i915_request *rq; 1897 1898 rq = __unwind_incomplete_requests(engine); 1899 if (!rq) 1900 return; 1901 1902 defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq))); 1903 } 1904 1905 static bool 1906 need_timeslice(const struct intel_engine_cs *engine, 1907 const struct i915_request *rq, 1908 const struct rb_node *rb) 1909 { 1910 int hint; 1911 1912 if (!intel_engine_has_timeslices(engine)) 1913 return false; 1914 1915 hint = engine->execlists.queue_priority_hint; 1916 1917 if (rb) { 1918 const struct virtual_engine *ve = 1919 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 1920 const struct intel_engine_cs *inflight = 1921 intel_context_inflight(&ve->context); 1922 1923 if (!inflight || inflight == engine) { 1924 struct i915_request *next; 1925 1926 rcu_read_lock(); 1927 next = READ_ONCE(ve->request); 1928 if (next) 1929 hint = max(hint, rq_prio(next)); 1930 rcu_read_unlock(); 1931 } 1932 } 1933 1934 if (!list_is_last(&rq->sched.link, &engine->active.requests)) 1935 hint = max(hint, rq_prio(list_next_entry(rq, sched.link))); 1936 1937 GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE); 1938 return hint >= effective_prio(rq); 1939 } 1940 1941 static bool 1942 timeslice_yield(const struct intel_engine_execlists *el, 1943 const struct i915_request *rq) 1944 { 1945 /* 1946 * Once bitten, forever smitten! 1947 * 1948 * If the active context ever busy-waited on a semaphore, 1949 * it will be treated as a hog until the end of its timeslice (i.e. 1950 * until it is scheduled out and replaced by a new submission, 1951 * possibly even its own lite-restore). The HW only sends an interrupt 1952 * on the first miss, and we do know if that semaphore has been 1953 * signaled, or even if it is now stuck on another semaphore. Play 1954 * safe, yield if it might be stuck -- it will be given a fresh 1955 * timeslice in the near future. 
1956 */ 1957 return rq->context->lrc.ccid == READ_ONCE(el->yield); 1958 } 1959 1960 static bool 1961 timeslice_expired(const struct intel_engine_execlists *el, 1962 const struct i915_request *rq) 1963 { 1964 return timer_expired(&el->timer) || timeslice_yield(el, rq); 1965 } 1966 1967 static int 1968 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq) 1969 { 1970 if (list_is_last(&rq->sched.link, &engine->active.requests)) 1971 return engine->execlists.queue_priority_hint; 1972 1973 return rq_prio(list_next_entry(rq, sched.link)); 1974 } 1975 1976 static inline unsigned long 1977 timeslice(const struct intel_engine_cs *engine) 1978 { 1979 return READ_ONCE(engine->props.timeslice_duration_ms); 1980 } 1981 1982 static unsigned long active_timeslice(const struct intel_engine_cs *engine) 1983 { 1984 const struct intel_engine_execlists *execlists = &engine->execlists; 1985 const struct i915_request *rq = *execlists->active; 1986 1987 if (!rq || i915_request_completed(rq)) 1988 return 0; 1989 1990 if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq)) 1991 return 0; 1992 1993 return timeslice(engine); 1994 } 1995 1996 static void set_timeslice(struct intel_engine_cs *engine) 1997 { 1998 unsigned long duration; 1999 2000 if (!intel_engine_has_timeslices(engine)) 2001 return; 2002 2003 duration = active_timeslice(engine); 2004 ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration); 2005 2006 set_timer_ms(&engine->execlists.timer, duration); 2007 } 2008 2009 static void start_timeslice(struct intel_engine_cs *engine, int prio) 2010 { 2011 struct intel_engine_execlists *execlists = &engine->execlists; 2012 unsigned long duration; 2013 2014 if (!intel_engine_has_timeslices(engine)) 2015 return; 2016 2017 WRITE_ONCE(execlists->switch_priority_hint, prio); 2018 if (prio == INT_MIN) 2019 return; 2020 2021 if (timer_pending(&execlists->timer)) 2022 return; 2023 2024 duration = timeslice(engine); 2025 ENGINE_TRACE(engine, 2026 "start timeslicing, prio:%d, interval:%lu", 2027 prio, duration); 2028 2029 set_timer_ms(&execlists->timer, duration); 2030 } 2031 2032 static void record_preemption(struct intel_engine_execlists *execlists) 2033 { 2034 (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++); 2035 } 2036 2037 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine, 2038 const struct i915_request *rq) 2039 { 2040 if (!rq) 2041 return 0; 2042 2043 /* Force a fast reset for terminated contexts (ignoring sysfs!) */ 2044 if (unlikely(intel_context_is_banned(rq->context))) 2045 return 1; 2046 2047 return READ_ONCE(engine->props.preempt_timeout_ms); 2048 } 2049 2050 static void set_preempt_timeout(struct intel_engine_cs *engine, 2051 const struct i915_request *rq) 2052 { 2053 if (!intel_engine_has_preempt_reset(engine)) 2054 return; 2055 2056 set_timer_ms(&engine->execlists.preempt, 2057 active_preempt_timeout(engine, rq)); 2058 } 2059 2060 static inline void clear_ports(struct i915_request **ports, int count) 2061 { 2062 memset_p((void **)ports, NULL, count); 2063 } 2064 2065 static inline void 2066 copy_ports(struct i915_request **dst, struct i915_request **src, int count) 2067 { 2068 /* A memcpy_p() would be very useful here! 
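 * Until one exists, copy the pointers one at a time with WRITE_ONCE()
 * so that a concurrent reader walking the ports via execlists_active()
 * never observes a torn pointer while the array is republished.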
*/ 2069 while (count--) 2070 WRITE_ONCE(*dst++, *src++); /* avoid write tearing */ 2071 } 2072 2073 static void execlists_dequeue(struct intel_engine_cs *engine) 2074 { 2075 struct intel_engine_execlists * const execlists = &engine->execlists; 2076 struct i915_request **port = execlists->pending; 2077 struct i915_request ** const last_port = port + execlists->port_mask; 2078 struct i915_request * const *active; 2079 struct i915_request *last; 2080 struct rb_node *rb; 2081 bool submit = false; 2082 2083 /* 2084 * Hardware submission is through 2 ports. Conceptually each port 2085 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is 2086 * static for a context, and unique to each, so we only execute 2087 * requests belonging to a single context from each ring. RING_HEAD 2088 * is maintained by the CS in the context image, it marks the place 2089 * where it got up to last time, and through RING_TAIL we tell the CS 2090 * where we want to execute up to this time. 2091 * 2092 * In this list the requests are in order of execution. Consecutive 2093 * requests from the same context are adjacent in the ringbuffer. We 2094 * can combine these requests into a single RING_TAIL update: 2095 * 2096 * RING_HEAD...req1...req2 2097 * ^- RING_TAIL 2098 * since to execute req2 the CS must first execute req1. 2099 * 2100 * Our goal then is to point each port to the end of a consecutive 2101 * sequence of requests as being the most optimal (fewest wake ups 2102 * and context switches) submission. 2103 */ 2104 2105 for (rb = rb_first_cached(&execlists->virtual); rb; ) { 2106 struct virtual_engine *ve = 2107 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 2108 struct i915_request *rq = READ_ONCE(ve->request); 2109 2110 if (!rq) { /* lazily cleanup after another engine handled rq */ 2111 rb_erase_cached(rb, &execlists->virtual); 2112 RB_CLEAR_NODE(rb); 2113 rb = rb_first_cached(&execlists->virtual); 2114 continue; 2115 } 2116 2117 if (!virtual_matches(ve, rq, engine)) { 2118 rb = rb_next(rb); 2119 continue; 2120 } 2121 2122 break; 2123 } 2124 2125 /* 2126 * If the queue is higher priority than the last 2127 * request in the currently active context, submit afresh. 2128 * We will resubmit again afterwards in case we need to split 2129 * the active context to interject the preemption request, 2130 * i.e. we will retrigger preemption following the ack in case 2131 * of trouble. 2132 */ 2133 active = READ_ONCE(execlists->active); 2134 2135 /* 2136 * In theory we can skip over completed contexts that have not 2137 * yet been processed by events (as those events are in flight): 2138 * 2139 * while ((last = *active) && i915_request_completed(last)) 2140 * active++; 2141 * 2142 * However, the GPU cannot handle this as it will ultimately 2143 * find itself trying to jump back into a context it has just 2144 * completed and barf. 2145 */ 2146 2147 if ((last = *active)) { 2148 if (need_preempt(engine, last, rb)) { 2149 if (i915_request_completed(last)) { 2150 tasklet_hi_schedule(&execlists->tasklet); 2151 return; 2152 } 2153 2154 ENGINE_TRACE(engine, 2155 "preempting last=%llx:%lld, prio=%d, hint=%d\n", 2156 last->fence.context, 2157 last->fence.seqno, 2158 last->sched.attr.priority, 2159 execlists->queue_priority_hint); 2160 record_preemption(execlists); 2161 2162 /* 2163 * Don't let the RING_HEAD advance past the breadcrumb 2164 * as we unwind (and until we resubmit) so that we do 2165 * not accidentally tell it to go backwards. 
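 * (ring_set_paused() achieves this by setting the preemption semaphore
 * in the HWSP that the final breadcrumb busy-waits on, so the CS sits
 * at the breadcrumb rather than running ahead while we unwind.)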
2166 */ 2167 ring_set_paused(engine, 1); 2168 2169 /* 2170 * Note that we have not stopped the GPU at this point, 2171 * so we are unwinding the incomplete requests as they 2172 * remain inflight and so by the time we do complete 2173 * the preemption, some of the unwound requests may 2174 * complete! 2175 */ 2176 __unwind_incomplete_requests(engine); 2177 2178 last = NULL; 2179 } else if (need_timeslice(engine, last, rb) && 2180 timeslice_expired(execlists, last)) { 2181 if (i915_request_completed(last)) { 2182 tasklet_hi_schedule(&execlists->tasklet); 2183 return; 2184 } 2185 2186 ENGINE_TRACE(engine, 2187 "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n", 2188 last->fence.context, 2189 last->fence.seqno, 2190 last->sched.attr.priority, 2191 execlists->queue_priority_hint, 2192 yesno(timeslice_yield(execlists, last))); 2193 2194 ring_set_paused(engine, 1); 2195 defer_active(engine); 2196 2197 /* 2198 * Unlike for preemption, if we rewind and continue 2199 * executing the same context as previously active, 2200 * the order of execution will remain the same and 2201 * the tail will only advance. We do not need to 2202 * force a full context restore, as a lite-restore 2203 * is sufficient to resample the monotonic TAIL. 2204 * 2205 * If we switch to any other context, similarly we 2206 * will not rewind TAIL of current context, and 2207 * normal save/restore will preserve state and allow 2208 * us to later continue executing the same request. 2209 */ 2210 last = NULL; 2211 } else { 2212 /* 2213 * Otherwise if we already have a request pending 2214 * for execution after the current one, we can 2215 * just wait until the next CS event before 2216 * queuing more. In either case we will force a 2217 * lite-restore preemption event, but if we wait 2218 * we hopefully coalesce several updates into a single 2219 * submission. 2220 */ 2221 if (!list_is_last(&last->sched.link, 2222 &engine->active.requests)) { 2223 /* 2224 * Even if ELSP[1] is occupied and not worthy 2225 * of timeslices, our queue might be. 2226 */ 2227 start_timeslice(engine, queue_prio(execlists)); 2228 return; 2229 } 2230 } 2231 } 2232 2233 while (rb) { /* XXX virtual is always taking precedence */ 2234 struct virtual_engine *ve = 2235 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 2236 struct i915_request *rq; 2237 2238 spin_lock(&ve->base.active.lock); 2239 2240 rq = ve->request; 2241 if (unlikely(!rq)) { /* lost the race to a sibling */ 2242 spin_unlock(&ve->base.active.lock); 2243 rb_erase_cached(rb, &execlists->virtual); 2244 RB_CLEAR_NODE(rb); 2245 rb = rb_first_cached(&execlists->virtual); 2246 continue; 2247 } 2248 2249 GEM_BUG_ON(rq != ve->request); 2250 GEM_BUG_ON(rq->engine != &ve->base); 2251 GEM_BUG_ON(rq->context != &ve->context); 2252 2253 if (rq_prio(rq) >= queue_prio(execlists)) { 2254 if (!virtual_matches(ve, rq, engine)) { 2255 spin_unlock(&ve->base.active.lock); 2256 rb = rb_next(rb); 2257 continue; 2258 } 2259 2260 if (last && !can_merge_rq(last, rq)) { 2261 spin_unlock(&ve->base.active.lock); 2262 start_timeslice(engine, rq_prio(rq)); 2263 return; /* leave this for another sibling */ 2264 } 2265 2266 ENGINE_TRACE(engine, 2267 "virtual rq=%llx:%lld%s, new engine? %s\n", 2268 rq->fence.context, 2269 rq->fence.seqno, 2270 i915_request_completed(rq) ? "!" : 2271 i915_request_started(rq) ? 
"*" : 2272 "", 2273 yesno(engine != ve->siblings[0])); 2274 2275 WRITE_ONCE(ve->request, NULL); 2276 WRITE_ONCE(ve->base.execlists.queue_priority_hint, 2277 INT_MIN); 2278 rb_erase_cached(rb, &execlists->virtual); 2279 RB_CLEAR_NODE(rb); 2280 2281 GEM_BUG_ON(!(rq->execution_mask & engine->mask)); 2282 WRITE_ONCE(rq->engine, engine); 2283 2284 if (__i915_request_submit(rq)) { 2285 /* 2286 * Only after we confirm that we will submit 2287 * this request (i.e. it has not already 2288 * completed), do we want to update the context. 2289 * 2290 * This serves two purposes. It avoids 2291 * unnecessary work if we are resubmitting an 2292 * already completed request after timeslicing. 2293 * But more importantly, it prevents us altering 2294 * ve->siblings[] on an idle context, where 2295 * we may be using ve->siblings[] in 2296 * virtual_context_enter / virtual_context_exit. 2297 */ 2298 virtual_xfer_context(ve, engine); 2299 GEM_BUG_ON(ve->siblings[0] != engine); 2300 2301 submit = true; 2302 last = rq; 2303 } 2304 i915_request_put(rq); 2305 2306 /* 2307 * Hmm, we have a bunch of virtual engine requests, 2308 * but the first one was already completed (thanks 2309 * preempt-to-busy!). Keep looking at the veng queue 2310 * until we have no more relevant requests (i.e. 2311 * the normal submit queue has higher priority). 2312 */ 2313 if (!submit) { 2314 spin_unlock(&ve->base.active.lock); 2315 rb = rb_first_cached(&execlists->virtual); 2316 continue; 2317 } 2318 } 2319 2320 spin_unlock(&ve->base.active.lock); 2321 break; 2322 } 2323 2324 while ((rb = rb_first_cached(&execlists->queue))) { 2325 struct i915_priolist *p = to_priolist(rb); 2326 struct i915_request *rq, *rn; 2327 int i; 2328 2329 priolist_for_each_request_consume(rq, rn, p, i) { 2330 bool merge = true; 2331 2332 /* 2333 * Can we combine this request with the current port? 2334 * It has to be the same context/ringbuffer and not 2335 * have any exceptions (e.g. GVT saying never to 2336 * combine contexts). 2337 * 2338 * If we can combine the requests, we can execute both 2339 * by updating the RING_TAIL to point to the end of the 2340 * second request, and so we never need to tell the 2341 * hardware about the first. 2342 */ 2343 if (last && !can_merge_rq(last, rq)) { 2344 /* 2345 * If we are on the second port and cannot 2346 * combine this request with the last, then we 2347 * are done. 2348 */ 2349 if (port == last_port) 2350 goto done; 2351 2352 /* 2353 * We must not populate both ELSP[] with the 2354 * same LRCA, i.e. we must submit 2 different 2355 * contexts if we submit 2 ELSP. 2356 */ 2357 if (last->context == rq->context) 2358 goto done; 2359 2360 if (i915_request_has_sentinel(last)) 2361 goto done; 2362 2363 /* 2364 * If GVT overrides us we only ever submit 2365 * port[0], leaving port[1] empty. Note that we 2366 * also have to be careful that we don't queue 2367 * the same context (even though a different 2368 * request) to the second port. 
2369 */ 2370 if (ctx_single_port_submission(last->context) || 2371 ctx_single_port_submission(rq->context)) 2372 goto done; 2373 2374 merge = false; 2375 } 2376 2377 if (__i915_request_submit(rq)) { 2378 if (!merge) { 2379 *port = execlists_schedule_in(last, port - execlists->pending); 2380 port++; 2381 last = NULL; 2382 } 2383 2384 GEM_BUG_ON(last && 2385 !can_merge_ctx(last->context, 2386 rq->context)); 2387 GEM_BUG_ON(last && 2388 i915_seqno_passed(last->fence.seqno, 2389 rq->fence.seqno)); 2390 2391 submit = true; 2392 last = rq; 2393 } 2394 } 2395 2396 rb_erase_cached(&p->node, &execlists->queue); 2397 i915_priolist_free(p); 2398 } 2399 2400 done: 2401 /* 2402 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer. 2403 * 2404 * We choose the priority hint such that if we add a request of greater 2405 * priority than this, we kick the submission tasklet to decide on 2406 * the right order of submitting the requests to hardware. We must 2407 * also be prepared to reorder requests as they are in-flight on the 2408 * HW. We derive the priority hint then as the first "hole" in 2409 * the HW submission ports and if there are no available slots, 2410 * the priority of the lowest executing request, i.e. last. 2411 * 2412 * When we do receive a higher priority request ready to run from the 2413 * user, see queue_request(), the priority hint is bumped to that 2414 * request triggering preemption on the next dequeue (or subsequent 2415 * interrupt for secondary ports). 2416 */ 2417 execlists->queue_priority_hint = queue_prio(execlists); 2418 2419 if (submit) { 2420 *port = execlists_schedule_in(last, port - execlists->pending); 2421 execlists->switch_priority_hint = 2422 switch_prio(engine, *execlists->pending); 2423 2424 /* 2425 * Skip if we ended up with exactly the same set of requests, 2426 * e.g. 
trying to timeslice a pair of ordered contexts 2427 */ 2428 if (!memcmp(active, execlists->pending, 2429 (port - execlists->pending + 1) * sizeof(*port))) { 2430 do 2431 execlists_schedule_out(fetch_and_zero(port)); 2432 while (port-- != execlists->pending); 2433 2434 goto skip_submit; 2435 } 2436 clear_ports(port + 1, last_port - port); 2437 2438 WRITE_ONCE(execlists->yield, -1); 2439 set_preempt_timeout(engine, *active); 2440 execlists_submit_ports(engine); 2441 } else { 2442 start_timeslice(engine, execlists->queue_priority_hint); 2443 skip_submit: 2444 ring_set_paused(engine, 0); 2445 } 2446 } 2447 2448 static void 2449 cancel_port_requests(struct intel_engine_execlists * const execlists) 2450 { 2451 struct i915_request * const *port; 2452 2453 for (port = execlists->pending; *port; port++) 2454 execlists_schedule_out(*port); 2455 clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending)); 2456 2457 /* Mark the end of active before we overwrite *active */ 2458 for (port = xchg(&execlists->active, execlists->pending); *port; port++) 2459 execlists_schedule_out(*port); 2460 clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight)); 2461 2462 smp_wmb(); /* complete the seqlock for execlists_active() */ 2463 WRITE_ONCE(execlists->active, execlists->inflight); 2464 } 2465 2466 static inline void 2467 invalidate_csb_entries(const u64 *first, const u64 *last) 2468 { 2469 clflush((void *)first); 2470 clflush((void *)last); 2471 } 2472 2473 /* 2474 * Starting with Gen12, the status has a new format: 2475 * 2476 * bit 0: switched to new queue 2477 * bit 1: reserved 2478 * bit 2: semaphore wait mode (poll or signal), only valid when 2479 * switch detail is set to "wait on semaphore" 2480 * bits 3-5: engine class 2481 * bits 6-11: engine instance 2482 * bits 12-14: reserved 2483 * bits 15-25: sw context id of the lrc the GT switched to 2484 * bits 26-31: sw counter of the lrc the GT switched to 2485 * bits 32-35: context switch detail 2486 * - 0: ctx complete 2487 * - 1: wait on sync flip 2488 * - 2: wait on vblank 2489 * - 3: wait on scanline 2490 * - 4: wait on semaphore 2491 * - 5: context preempted (not on SEMAPHORE_WAIT or 2492 * WAIT_FOR_EVENT) 2493 * bit 36: reserved 2494 * bits 37-43: wait detail (for switch detail 1 to 4) 2495 * bits 44-46: reserved 2496 * bits 47-57: sw context id of the lrc the GT switched away from 2497 * bits 58-63: sw counter of the lrc the GT switched away from 2498 */ 2499 static inline bool gen12_csb_parse(const u64 *csb) 2500 { 2501 bool ctx_away_valid; 2502 bool new_queue; 2503 u64 entry; 2504 2505 /* HSD#22011248461 */ 2506 entry = READ_ONCE(*csb); 2507 if (unlikely(entry == -1)) { 2508 preempt_disable(); 2509 if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 50)) 2510 GEM_WARN_ON("50us CSB timeout"); 2511 preempt_enable(); 2512 } 2513 WRITE_ONCE(*(u64 *)csb, -1); 2514 2515 ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(entry)); 2516 new_queue = 2517 lower_32_bits(entry) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; 2518 2519 /* 2520 * The context switch detail is not guaranteed to be 5 when a preemption 2521 * occurs, so we can't just check for that. The check below works for 2522 * all the cases we care about, including preemptions of WAIT 2523 * instructions and lite-restore. Preempt-to-idle via the CTRL register 2524 * would require some extra handling, but we don't support that. 
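 *
 * In short, as implemented below: the event is treated as a promotion
 * (a new submission is now running) whenever the outgoing context is
 * invalid or the "switched to new queue" bit is set; anything else is
 * taken to be a plain completion of the context at the head of the
 * ELSP.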
2525 */ 2526 if (!ctx_away_valid || new_queue) { 2527 GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(entry))); 2528 return true; 2529 } 2530 2531 /* 2532 * switch detail = 5 is covered by the case above and we do not expect a 2533 * context switch on an unsuccessful wait instruction since we always 2534 * use polling mode. 2535 */ 2536 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(entry))); 2537 return false; 2538 } 2539 2540 static inline bool gen8_csb_parse(const u64 *csb) 2541 { 2542 return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED); 2543 } 2544 2545 static void process_csb(struct intel_engine_cs *engine) 2546 { 2547 struct intel_engine_execlists * const execlists = &engine->execlists; 2548 const u64 * const buf = execlists->csb_status; 2549 const u8 num_entries = execlists->csb_size; 2550 u8 head, tail; 2551 2552 /* 2553 * As we modify our execlists state tracking we require exclusive 2554 * access. Either we are inside the tasklet, or the tasklet is disabled 2555 * and we assume that is only inside the reset paths and so serialised. 2556 */ 2557 GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) && 2558 !reset_in_progress(execlists)); 2559 GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine)); 2560 2561 /* 2562 * Note that csb_write, csb_status may be either in HWSP or mmio. 2563 * When reading from the csb_write mmio register, we have to be 2564 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is 2565 * the low 4bits. As it happens we know the next 4bits are always 2566 * zero and so we can simply masked off the low u8 of the register 2567 * and treat it identically to reading from the HWSP (without having 2568 * to use explicit shifting and masking, and probably bifurcating 2569 * the code to handle the legacy mmio read). 2570 */ 2571 head = execlists->csb_head; 2572 tail = READ_ONCE(*execlists->csb_write); 2573 if (unlikely(head == tail)) 2574 return; 2575 2576 /* 2577 * We will consume all events from HW, or at least pretend to. 2578 * 2579 * The sequence of events from the HW is deterministic, and derived 2580 * from our writes to the ELSP, with a smidgen of variability for 2581 * the arrival of the asynchronous requests wrt to the inflight 2582 * execution. If the HW sends an event that does not correspond with 2583 * the one we are expecting, we have to abandon all hope as we lose 2584 * all tracking of what the engine is actually executing. We will 2585 * only detect we are out of sequence with the HW when we get an 2586 * 'impossible' event because we have already drained our own 2587 * preemption/promotion queue. If this occurs, we know that we likely 2588 * lost track of execution earlier and must unwind and restart, the 2589 * simplest way is by stop processing the event queue and force the 2590 * engine to reset. 2591 */ 2592 execlists->csb_head = tail; 2593 ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail); 2594 2595 /* 2596 * Hopefully paired with a wmb() in HW! 2597 * 2598 * We must complete the read of the write pointer before any reads 2599 * from the CSB, so that we do not see stale values. Without an rmb 2600 * (lfence) the HW may speculatively perform the CSB[] reads *before* 2601 * we perform the READ_ONCE(*csb_write). 2602 */ 2603 rmb(); 2604 do { 2605 bool promote; 2606 2607 if (++head == num_entries) 2608 head = 0; 2609 2610 /* 2611 * We are flying near dragons again. 2612 * 2613 * We hold a reference to the request in execlist_port[] 2614 * but no more than that. 
We are operating in softirq 2615 * context and so cannot hold any mutex or sleep. That 2616 * prevents us stopping the requests we are processing 2617 * in port[] from being retired simultaneously (the 2618 * breadcrumb will be complete before we see the 2619 * context-switch). As we only hold the reference to the 2620 * request, any pointer chasing underneath the request 2621 * is subject to a potential use-after-free. Thus we 2622 * store all of the bookkeeping within port[] as 2623 * required, and avoid using unguarded pointers beneath 2624 * request itself. The same applies to the atomic 2625 * status notifier. 2626 */ 2627 2628 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n", 2629 head, 2630 upper_32_bits(buf[head]), 2631 lower_32_bits(buf[head])); 2632 2633 if (INTEL_GEN(engine->i915) >= 12) 2634 promote = gen12_csb_parse(buf + head); 2635 else 2636 promote = gen8_csb_parse(buf + head); 2637 if (promote) { 2638 struct i915_request * const *old = execlists->active; 2639 2640 if (GEM_WARN_ON(!*execlists->pending)) { 2641 execlists->error_interrupt |= ERROR_CSB; 2642 break; 2643 } 2644 2645 ring_set_paused(engine, 0); 2646 2647 /* Point active to the new ELSP; prevent overwriting */ 2648 WRITE_ONCE(execlists->active, execlists->pending); 2649 smp_wmb(); /* notify execlists_active() */ 2650 2651 /* cancel old inflight, prepare for switch */ 2652 trace_ports(execlists, "preempted", old); 2653 while (*old) 2654 execlists_schedule_out(*old++); 2655 2656 /* switch pending to inflight */ 2657 GEM_BUG_ON(!assert_pending_valid(execlists, "promote")); 2658 copy_ports(execlists->inflight, 2659 execlists->pending, 2660 execlists_num_ports(execlists)); 2661 smp_wmb(); /* complete the seqlock */ 2662 WRITE_ONCE(execlists->active, execlists->inflight); 2663 2664 /* XXX Magic delay for tgl */ 2665 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 2666 2667 WRITE_ONCE(execlists->pending[0], NULL); 2668 } else { 2669 if (GEM_WARN_ON(!*execlists->active)) { 2670 execlists->error_interrupt |= ERROR_CSB; 2671 break; 2672 } 2673 2674 /* port0 completed, advanced to port1 */ 2675 trace_ports(execlists, "completed", execlists->active); 2676 2677 /* 2678 * We rely on the hardware being strongly 2679 * ordered, that the breadcrumb write is 2680 * coherent (visible from the CPU) before the 2681 * user interrupt is processed. One might assume 2682 * that the breadcrumb write being before the 2683 * user interrupt and the CS event for the context 2684 * switch would therefore be before the CS event 2685 * itself... 
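 * That assumption is not relied upon blindly: the debug check below
 * dumps the ring, request and context state if a context-complete
 * event is seen before the breadcrumb write becomes visible.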
2686 */ 2687 if (GEM_SHOW_DEBUG() && 2688 !i915_request_completed(*execlists->active)) { 2689 struct i915_request *rq = *execlists->active; 2690 const u32 *regs __maybe_unused = 2691 rq->context->lrc_reg_state; 2692 2693 ENGINE_TRACE(engine, 2694 "context completed before request!\n"); 2695 ENGINE_TRACE(engine, 2696 "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n", 2697 ENGINE_READ(engine, RING_START), 2698 ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR, 2699 ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR, 2700 ENGINE_READ(engine, RING_CTL), 2701 ENGINE_READ(engine, RING_MI_MODE)); 2702 ENGINE_TRACE(engine, 2703 "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ", 2704 i915_ggtt_offset(rq->ring->vma), 2705 rq->head, rq->tail, 2706 rq->fence.context, 2707 lower_32_bits(rq->fence.seqno), 2708 hwsp_seqno(rq)); 2709 ENGINE_TRACE(engine, 2710 "ctx:{start:%08x, head:%04x, tail:%04x}, ", 2711 regs[CTX_RING_START], 2712 regs[CTX_RING_HEAD], 2713 regs[CTX_RING_TAIL]); 2714 } 2715 2716 execlists_schedule_out(*execlists->active++); 2717 2718 GEM_BUG_ON(execlists->active - execlists->inflight > 2719 execlists_num_ports(execlists)); 2720 } 2721 } while (head != tail); 2722 2723 set_timeslice(engine); 2724 2725 /* 2726 * Gen11 has proven to fail wrt global observation point between 2727 * entry and tail update, failing on the ordering and thus 2728 * we see an old entry in the context status buffer. 2729 * 2730 * Forcibly evict out entries for the next gpu csb update, 2731 * to increase the odds that we get a fresh entries with non 2732 * working hardware. The cost for doing so comes out mostly with 2733 * the wash as hardware, working or not, will need to do the 2734 * invalidation before. 2735 */ 2736 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]); 2737 } 2738 2739 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine) 2740 { 2741 lockdep_assert_held(&engine->active.lock); 2742 if (!READ_ONCE(engine->execlists.pending[0])) { 2743 rcu_read_lock(); /* protect peeking at execlists->active */ 2744 execlists_dequeue(engine); 2745 rcu_read_unlock(); 2746 } 2747 } 2748 2749 static void __execlists_hold(struct i915_request *rq) 2750 { 2751 LIST_HEAD(list); 2752 2753 do { 2754 struct i915_dependency *p; 2755 2756 if (i915_request_is_active(rq)) 2757 __i915_request_unsubmit(rq); 2758 2759 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2760 list_move_tail(&rq->sched.link, &rq->engine->active.hold); 2761 i915_request_set_hold(rq); 2762 RQ_TRACE(rq, "on hold\n"); 2763 2764 for_each_waiter(p, rq) { 2765 struct i915_request *w = 2766 container_of(p->waiter, typeof(*w), sched); 2767 2768 /* Leave semaphores spinning on the other engines */ 2769 if (w->engine != rq->engine) 2770 continue; 2771 2772 if (!i915_request_is_ready(w)) 2773 continue; 2774 2775 if (i915_request_completed(w)) 2776 continue; 2777 2778 if (i915_request_on_hold(w)) 2779 continue; 2780 2781 list_move_tail(&w->sched.link, &list); 2782 } 2783 2784 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2785 } while (rq); 2786 } 2787 2788 static bool execlists_hold(struct intel_engine_cs *engine, 2789 struct i915_request *rq) 2790 { 2791 spin_lock_irq(&engine->active.lock); 2792 2793 if (i915_request_completed(rq)) { /* too late! 
*/ 2794 rq = NULL; 2795 goto unlock; 2796 } 2797 2798 if (rq->engine != engine) { /* preempted virtual engine */ 2799 struct virtual_engine *ve = to_virtual_engine(rq->engine); 2800 2801 /* 2802 * intel_context_inflight() is only protected by virtue 2803 * of process_csb() being called only by the tasklet (or 2804 * directly from inside reset while the tasklet is suspended). 2805 * Assert that neither of those are allowed to run while we 2806 * poke at the request queues. 2807 */ 2808 GEM_BUG_ON(!reset_in_progress(&engine->execlists)); 2809 2810 /* 2811 * An unsubmitted request along a virtual engine will 2812 * remain on the active (this) engine until we are able 2813 * to process the context switch away (and so mark the 2814 * context as no longer in flight). That cannot have happened 2815 * yet, otherwise we would not be hanging! 2816 */ 2817 spin_lock(&ve->base.active.lock); 2818 GEM_BUG_ON(intel_context_inflight(rq->context) != engine); 2819 GEM_BUG_ON(ve->request != rq); 2820 ve->request = NULL; 2821 spin_unlock(&ve->base.active.lock); 2822 i915_request_put(rq); 2823 2824 rq->engine = engine; 2825 } 2826 2827 /* 2828 * Transfer this request onto the hold queue to prevent it 2829 * being resubmitted to HW (and potentially completed) before we have 2830 * released it. Since we may have already submitted following 2831 * requests, we need to remove those as well. 2832 */ 2833 GEM_BUG_ON(i915_request_on_hold(rq)); 2834 GEM_BUG_ON(rq->engine != engine); 2835 __execlists_hold(rq); 2836 GEM_BUG_ON(list_empty(&engine->active.hold)); 2837 2838 unlock: 2839 spin_unlock_irq(&engine->active.lock); 2840 return rq; 2841 } 2842 2843 static bool hold_request(const struct i915_request *rq) 2844 { 2845 struct i915_dependency *p; 2846 bool result = false; 2847 2848 /* 2849 * If one of our ancestors is on hold, we must also be on hold, 2850 * otherwise we will bypass it and execute before it.
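 * e.g. if A is already on hold (pending error capture, say) and we are
 * now asked to submit B, which waits upon A on this engine, B must
 * join A on the hold list; queueing it normally could let it execute
 * (and even complete) before its signaler is released.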
2851 */ 2852 rcu_read_lock(); 2853 for_each_signaler(p, rq) { 2854 const struct i915_request *s = 2855 container_of(p->signaler, typeof(*s), sched); 2856 2857 if (s->engine != rq->engine) 2858 continue; 2859 2860 result = i915_request_on_hold(s); 2861 if (result) 2862 break; 2863 } 2864 rcu_read_unlock(); 2865 2866 return result; 2867 } 2868 2869 static void __execlists_unhold(struct i915_request *rq) 2870 { 2871 LIST_HEAD(list); 2872 2873 do { 2874 struct i915_dependency *p; 2875 2876 RQ_TRACE(rq, "hold release\n"); 2877 2878 GEM_BUG_ON(!i915_request_on_hold(rq)); 2879 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit)); 2880 2881 i915_request_clear_hold(rq); 2882 list_move_tail(&rq->sched.link, 2883 i915_sched_lookup_priolist(rq->engine, 2884 rq_prio(rq))); 2885 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2886 2887 /* Also release any children on this engine that are ready */ 2888 for_each_waiter(p, rq) { 2889 struct i915_request *w = 2890 container_of(p->waiter, typeof(*w), sched); 2891 2892 /* Propagate any change in error status */ 2893 if (rq->fence.error) 2894 i915_request_set_error_once(w, rq->fence.error); 2895 2896 if (w->engine != rq->engine) 2897 continue; 2898 2899 if (!i915_request_on_hold(w)) 2900 continue; 2901 2902 /* Check that no other parents are also on hold */ 2903 if (hold_request(w)) 2904 continue; 2905 2906 list_move_tail(&w->sched.link, &list); 2907 } 2908 2909 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2910 } while (rq); 2911 } 2912 2913 static void execlists_unhold(struct intel_engine_cs *engine, 2914 struct i915_request *rq) 2915 { 2916 spin_lock_irq(&engine->active.lock); 2917 2918 /* 2919 * Move this request back to the priority queue, and all of its 2920 * children and grandchildren that were suspended along with it. 2921 */ 2922 __execlists_unhold(rq); 2923 2924 if (rq_prio(rq) > engine->execlists.queue_priority_hint) { 2925 engine->execlists.queue_priority_hint = rq_prio(rq); 2926 tasklet_hi_schedule(&engine->execlists.tasklet); 2927 } 2928 2929 spin_unlock_irq(&engine->active.lock); 2930 } 2931 2932 struct execlists_capture { 2933 struct work_struct work; 2934 struct i915_request *rq; 2935 struct i915_gpu_coredump *error; 2936 }; 2937 2938 static void execlists_capture_work(struct work_struct *work) 2939 { 2940 struct execlists_capture *cap = container_of(work, typeof(*cap), work); 2941 const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN; 2942 struct intel_engine_cs *engine = cap->rq->engine; 2943 struct intel_gt_coredump *gt = cap->error->gt; 2944 struct intel_engine_capture_vma *vma; 2945 2946 /* Compress all the objects attached to the request, slow! 
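 * Hence the __GFP_RETRY_MAYFAIL | __GFP_NOWARN gfp above: the capture
 * is best effort and must not invoke the oom-killer, nor spam dmesg,
 * on behalf of a debug facility.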
*/ 2947 vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp); 2948 if (vma) { 2949 struct i915_vma_compress *compress = 2950 i915_vma_capture_prepare(gt); 2951 2952 intel_engine_coredump_add_vma(gt->engine, vma, compress); 2953 i915_vma_capture_finish(gt, compress); 2954 } 2955 2956 gt->simulated = gt->engine->simulated; 2957 cap->error->simulated = gt->simulated; 2958 2959 /* Publish the error state, and announce it to the world */ 2960 i915_error_state_store(cap->error); 2961 i915_gpu_coredump_put(cap->error); 2962 2963 /* Return this request and all that depend upon it for signaling */ 2964 execlists_unhold(engine, cap->rq); 2965 i915_request_put(cap->rq); 2966 2967 kfree(cap); 2968 } 2969 2970 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine) 2971 { 2972 const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; 2973 struct execlists_capture *cap; 2974 2975 cap = kmalloc(sizeof(*cap), gfp); 2976 if (!cap) 2977 return NULL; 2978 2979 cap->error = i915_gpu_coredump_alloc(engine->i915, gfp); 2980 if (!cap->error) 2981 goto err_cap; 2982 2983 cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp); 2984 if (!cap->error->gt) 2985 goto err_gpu; 2986 2987 cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp); 2988 if (!cap->error->gt->engine) 2989 goto err_gt; 2990 2991 return cap; 2992 2993 err_gt: 2994 kfree(cap->error->gt); 2995 err_gpu: 2996 kfree(cap->error); 2997 err_cap: 2998 kfree(cap); 2999 return NULL; 3000 } 3001 3002 static struct i915_request * 3003 active_context(struct intel_engine_cs *engine, u32 ccid) 3004 { 3005 const struct intel_engine_execlists * const el = &engine->execlists; 3006 struct i915_request * const *port, *rq; 3007 3008 /* 3009 * Use the most recent result from process_csb(), but just in case 3010 * we trigger an error (via interrupt) before the first CS event has 3011 * been written, peek at the next submission. 3012 */ 3013 3014 for (port = el->active; (rq = *port); port++) { 3015 if (rq->context->lrc.ccid == ccid) { 3016 ENGINE_TRACE(engine, 3017 "ccid found at active:%zd\n", 3018 port - el->active); 3019 return rq; 3020 } 3021 } 3022 3023 for (port = el->pending; (rq = *port); port++) { 3024 if (rq->context->lrc.ccid == ccid) { 3025 ENGINE_TRACE(engine, 3026 "ccid found at pending:%zd\n", 3027 port - el->pending); 3028 return rq; 3029 } 3030 } 3031 3032 ENGINE_TRACE(engine, "ccid:%x not found\n", ccid); 3033 return NULL; 3034 } 3035 3036 static u32 active_ccid(struct intel_engine_cs *engine) 3037 { 3038 return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI); 3039 } 3040 3041 static void execlists_capture(struct intel_engine_cs *engine) 3042 { 3043 struct execlists_capture *cap; 3044 3045 if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)) 3046 return; 3047 3048 /* 3049 * We need to _quickly_ capture the engine state before we reset. 3050 * We are inside an atomic section (softirq) here and we are delaying 3051 * the forced preemption event. 3052 */ 3053 cap = capture_regs(engine); 3054 if (!cap) 3055 return; 3056 3057 spin_lock_irq(&engine->active.lock); 3058 cap->rq = active_context(engine, active_ccid(engine)); 3059 if (cap->rq) { 3060 cap->rq = active_request(cap->rq->context->timeline, cap->rq); 3061 cap->rq = i915_request_get_rcu(cap->rq); 3062 } 3063 spin_unlock_irq(&engine->active.lock); 3064 if (!cap->rq) 3065 goto err_free; 3066 3067 /* 3068 * Remove the request from the execlists queue, and take ownership 3069 * of the request. 
We pass it to our worker who will _slowly_ compress 3070 * all the pages the _user_ requested for debugging their batch, after 3071 * which we return it to the queue for signaling. 3072 * 3073 * By removing them from the execlists queue, we also remove the 3074 * requests from being processed by __unwind_incomplete_requests() 3075 * during the intel_engine_reset(), and so they will *not* be replayed 3076 * afterwards. 3077 * 3078 * Note that because we have not yet reset the engine at this point, 3079 * it is possible for the request that we have identified as being 3080 * guilty, did in fact complete and we will then hit an arbitration 3081 * point allowing the outstanding preemption to succeed. The likelihood 3082 * of that is very low (as capturing of the engine registers should be 3083 * fast enough to run inside an irq-off atomic section!), so we will 3084 * simply hold that request accountable for being non-preemptible 3085 * long enough to force the reset. 3086 */ 3087 if (!execlists_hold(engine, cap->rq)) 3088 goto err_rq; 3089 3090 INIT_WORK(&cap->work, execlists_capture_work); 3091 schedule_work(&cap->work); 3092 return; 3093 3094 err_rq: 3095 i915_request_put(cap->rq); 3096 err_free: 3097 i915_gpu_coredump_put(cap->error); 3098 kfree(cap); 3099 } 3100 3101 static void execlists_reset(struct intel_engine_cs *engine, const char *msg) 3102 { 3103 const unsigned int bit = I915_RESET_ENGINE + engine->id; 3104 unsigned long *lock = &engine->gt->reset.flags; 3105 3106 if (!intel_has_reset_engine(engine->gt)) 3107 return; 3108 3109 if (test_and_set_bit(bit, lock)) 3110 return; 3111 3112 ENGINE_TRACE(engine, "reset for %s\n", msg); 3113 3114 /* Mark this tasklet as disabled to avoid waiting for it to complete */ 3115 tasklet_disable_nosync(&engine->execlists.tasklet); 3116 3117 ring_set_paused(engine, 1); /* Freeze the current request in place */ 3118 execlists_capture(engine); 3119 intel_engine_reset(engine, msg); 3120 3121 tasklet_enable(&engine->execlists.tasklet); 3122 clear_and_wake_up_bit(bit, lock); 3123 } 3124 3125 static bool preempt_timeout(const struct intel_engine_cs *const engine) 3126 { 3127 const struct timer_list *t = &engine->execlists.preempt; 3128 3129 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT) 3130 return false; 3131 3132 if (!timer_expired(t)) 3133 return false; 3134 3135 return READ_ONCE(engine->execlists.pending[0]); 3136 } 3137 3138 /* 3139 * Check the unread Context Status Buffers and manage the submission of new 3140 * contexts to the ELSP accordingly. 3141 */ 3142 static void execlists_submission_tasklet(unsigned long data) 3143 { 3144 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 3145 bool timeout = preempt_timeout(engine); 3146 3147 process_csb(engine); 3148 3149 if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) { 3150 const char *msg; 3151 3152 /* Generate the error message in priority wrt to the user! 
*/ 3153 if (engine->execlists.error_interrupt & GENMASK(15, 0)) 3154 msg = "CS error"; /* thrown by a user payload */ 3155 else if (engine->execlists.error_interrupt & ERROR_CSB) 3156 msg = "invalid CSB event"; 3157 else 3158 msg = "internal error"; 3159 3160 engine->execlists.error_interrupt = 0; 3161 execlists_reset(engine, msg); 3162 } 3163 3164 if (!READ_ONCE(engine->execlists.pending[0]) || timeout) { 3165 unsigned long flags; 3166 3167 spin_lock_irqsave(&engine->active.lock, flags); 3168 __execlists_submission_tasklet(engine); 3169 spin_unlock_irqrestore(&engine->active.lock, flags); 3170 3171 /* Recheck after serialising with direct-submission */ 3172 if (unlikely(timeout && preempt_timeout(engine))) 3173 execlists_reset(engine, "preemption time out"); 3174 } 3175 } 3176 3177 static void __execlists_kick(struct intel_engine_execlists *execlists) 3178 { 3179 /* Kick the tasklet for some interrupt coalescing and reset handling */ 3180 tasklet_hi_schedule(&execlists->tasklet); 3181 } 3182 3183 #define execlists_kick(t, member) \ 3184 __execlists_kick(container_of(t, struct intel_engine_execlists, member)) 3185 3186 static void execlists_timeslice(struct timer_list *timer) 3187 { 3188 execlists_kick(timer, timer); 3189 } 3190 3191 static void execlists_preempt(struct timer_list *timer) 3192 { 3193 execlists_kick(timer, preempt); 3194 } 3195 3196 static void queue_request(struct intel_engine_cs *engine, 3197 struct i915_request *rq) 3198 { 3199 GEM_BUG_ON(!list_empty(&rq->sched.link)); 3200 list_add_tail(&rq->sched.link, 3201 i915_sched_lookup_priolist(engine, rq_prio(rq))); 3202 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 3203 } 3204 3205 static void __submit_queue_imm(struct intel_engine_cs *engine) 3206 { 3207 struct intel_engine_execlists * const execlists = &engine->execlists; 3208 3209 if (reset_in_progress(execlists)) 3210 return; /* defer until we restart the engine following reset */ 3211 3212 __execlists_submission_tasklet(engine); 3213 } 3214 3215 static void submit_queue(struct intel_engine_cs *engine, 3216 const struct i915_request *rq) 3217 { 3218 struct intel_engine_execlists *execlists = &engine->execlists; 3219 3220 if (rq_prio(rq) <= execlists->queue_priority_hint) 3221 return; 3222 3223 execlists->queue_priority_hint = rq_prio(rq); 3224 __submit_queue_imm(engine); 3225 } 3226 3227 static bool ancestor_on_hold(const struct intel_engine_cs *engine, 3228 const struct i915_request *rq) 3229 { 3230 GEM_BUG_ON(i915_request_on_hold(rq)); 3231 return !list_empty(&engine->active.hold) && hold_request(rq); 3232 } 3233 3234 static void flush_csb(struct intel_engine_cs *engine) 3235 { 3236 struct intel_engine_execlists *el = &engine->execlists; 3237 3238 if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) { 3239 if (!reset_in_progress(el)) 3240 process_csb(engine); 3241 tasklet_unlock(&el->tasklet); 3242 } 3243 } 3244 3245 static void execlists_submit_request(struct i915_request *request) 3246 { 3247 struct intel_engine_cs *engine = request->engine; 3248 unsigned long flags; 3249 3250 /* Hopefully we clear execlists->pending[] to let us through */ 3251 flush_csb(engine); 3252 3253 /* Will be called from irq-context when using foreign fences. 
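 * Hence the irqsave form of the lock below: the submit callback may
 * already be running with interrupts disabled.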
*/ 3254 spin_lock_irqsave(&engine->active.lock, flags); 3255 3256 if (unlikely(ancestor_on_hold(engine, request))) { 3257 RQ_TRACE(request, "ancestor on hold\n"); 3258 list_add_tail(&request->sched.link, &engine->active.hold); 3259 i915_request_set_hold(request); 3260 } else { 3261 queue_request(engine, request); 3262 3263 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 3264 GEM_BUG_ON(list_empty(&request->sched.link)); 3265 3266 submit_queue(engine, request); 3267 } 3268 3269 spin_unlock_irqrestore(&engine->active.lock, flags); 3270 } 3271 3272 static void __execlists_context_fini(struct intel_context *ce) 3273 { 3274 intel_ring_put(ce->ring); 3275 i915_vma_put(ce->state); 3276 } 3277 3278 static void execlists_context_destroy(struct kref *kref) 3279 { 3280 struct intel_context *ce = container_of(kref, typeof(*ce), ref); 3281 3282 GEM_BUG_ON(!i915_active_is_idle(&ce->active)); 3283 GEM_BUG_ON(intel_context_is_pinned(ce)); 3284 3285 if (ce->state) 3286 __execlists_context_fini(ce); 3287 3288 intel_context_fini(ce); 3289 intel_context_free(ce); 3290 } 3291 3292 static void 3293 set_redzone(void *vaddr, const struct intel_engine_cs *engine) 3294 { 3295 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 3296 return; 3297 3298 vaddr += engine->context_size; 3299 3300 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE); 3301 } 3302 3303 static void 3304 check_redzone(const void *vaddr, const struct intel_engine_cs *engine) 3305 { 3306 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 3307 return; 3308 3309 vaddr += engine->context_size; 3310 3311 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE)) 3312 drm_err_once(&engine->i915->drm, 3313 "%s context redzone overwritten!\n", 3314 engine->name); 3315 } 3316 3317 static void execlists_context_unpin(struct intel_context *ce) 3318 { 3319 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET, 3320 ce->engine); 3321 } 3322 3323 static void execlists_context_post_unpin(struct intel_context *ce) 3324 { 3325 i915_gem_object_unpin_map(ce->state->obj); 3326 } 3327 3328 static u32 * 3329 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs) 3330 { 3331 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3332 MI_SRM_LRM_GLOBAL_GTT | 3333 MI_LRI_LRM_CS_MMIO; 3334 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3335 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3336 CTX_TIMESTAMP * sizeof(u32); 3337 *cs++ = 0; 3338 3339 *cs++ = MI_LOAD_REGISTER_REG | 3340 MI_LRR_SOURCE_CS_MMIO | 3341 MI_LRI_LRM_CS_MMIO; 3342 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3343 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 3344 3345 *cs++ = MI_LOAD_REGISTER_REG | 3346 MI_LRR_SOURCE_CS_MMIO | 3347 MI_LRI_LRM_CS_MMIO; 3348 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3349 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 3350 3351 return cs; 3352 } 3353 3354 static u32 * 3355 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs) 3356 { 3357 GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1); 3358 3359 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3360 MI_SRM_LRM_GLOBAL_GTT | 3361 MI_LRI_LRM_CS_MMIO; 3362 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3363 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3364 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32); 3365 *cs++ = 0; 3366 3367 return cs; 3368 } 3369 3370 static u32 * 3371 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs) 3372 { 3373 GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1); 3374 3375 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3376 
MI_SRM_LRM_GLOBAL_GTT | 3377 MI_LRI_LRM_CS_MMIO; 3378 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3379 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3380 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32); 3381 *cs++ = 0; 3382 3383 *cs++ = MI_LOAD_REGISTER_REG | 3384 MI_LRR_SOURCE_CS_MMIO | 3385 MI_LRI_LRM_CS_MMIO; 3386 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3387 *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0)); 3388 3389 return cs; 3390 } 3391 3392 static u32 * 3393 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs) 3394 { 3395 cs = gen12_emit_timestamp_wa(ce, cs); 3396 cs = gen12_emit_cmd_buf_wa(ce, cs); 3397 cs = gen12_emit_restore_scratch(ce, cs); 3398 3399 return cs; 3400 } 3401 3402 static u32 * 3403 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs) 3404 { 3405 cs = gen12_emit_timestamp_wa(ce, cs); 3406 cs = gen12_emit_restore_scratch(ce, cs); 3407 3408 return cs; 3409 } 3410 3411 static inline u32 context_wa_bb_offset(const struct intel_context *ce) 3412 { 3413 return PAGE_SIZE * ce->wa_bb_page; 3414 } 3415 3416 static u32 *context_indirect_bb(const struct intel_context *ce) 3417 { 3418 void *ptr; 3419 3420 GEM_BUG_ON(!ce->wa_bb_page); 3421 3422 ptr = ce->lrc_reg_state; 3423 ptr -= LRC_STATE_OFFSET; /* back to start of context image */ 3424 ptr += context_wa_bb_offset(ce); 3425 3426 return ptr; 3427 } 3428 3429 static void 3430 setup_indirect_ctx_bb(const struct intel_context *ce, 3431 const struct intel_engine_cs *engine, 3432 u32 *(*emit)(const struct intel_context *, u32 *)) 3433 { 3434 u32 * const start = context_indirect_bb(ce); 3435 u32 *cs; 3436 3437 cs = emit(ce, start); 3438 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs)); 3439 while ((unsigned long)cs % CACHELINE_BYTES) 3440 *cs++ = MI_NOOP; 3441 3442 lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine, 3443 i915_ggtt_offset(ce->state) + 3444 context_wa_bb_offset(ce), 3445 (cs - start) * sizeof(*cs)); 3446 } 3447 3448 static void 3449 __execlists_update_reg_state(const struct intel_context *ce, 3450 const struct intel_engine_cs *engine, 3451 u32 head) 3452 { 3453 struct intel_ring *ring = ce->ring; 3454 u32 *regs = ce->lrc_reg_state; 3455 3456 GEM_BUG_ON(!intel_ring_offset_valid(ring, head)); 3457 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); 3458 3459 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 3460 regs[CTX_RING_HEAD] = head; 3461 regs[CTX_RING_TAIL] = ring->tail; 3462 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 3463 3464 /* RPCS */ 3465 if (engine->class == RENDER_CLASS) { 3466 regs[CTX_R_PWR_CLK_STATE] = 3467 intel_sseu_make_rpcs(engine->gt, &ce->sseu); 3468 3469 i915_oa_init_reg_state(ce, engine); 3470 } 3471 3472 if (ce->wa_bb_page) { 3473 u32 *(*fn)(const struct intel_context *ce, u32 *cs); 3474 3475 fn = gen12_emit_indirect_ctx_xcs; 3476 if (ce->engine->class == RENDER_CLASS) 3477 fn = gen12_emit_indirect_ctx_rcs; 3478 3479 /* Mutually exclusive wrt to global indirect bb */ 3480 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size); 3481 setup_indirect_ctx_bb(ce, engine, fn); 3482 } 3483 } 3484 3485 static int 3486 execlists_context_pre_pin(struct intel_context *ce, 3487 struct i915_gem_ww_ctx *ww, void **vaddr) 3488 { 3489 GEM_BUG_ON(!ce->state); 3490 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 3491 3492 *vaddr = i915_gem_object_pin_map(ce->state->obj, 3493 i915_coherent_map_type(ce->engine->i915) | 3494 I915_MAP_OVERRIDE); 3495 3496 return PTR_ERR_OR_ZERO(*vaddr); 3497 } 3498 3499 static int 3500 
__execlists_context_pin(struct intel_context *ce, 3501 struct intel_engine_cs *engine, 3502 void *vaddr) 3503 { 3504 ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE; 3505 ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET; 3506 __execlists_update_reg_state(ce, engine, ce->ring->tail); 3507 3508 return 0; 3509 } 3510 3511 static int execlists_context_pin(struct intel_context *ce, void *vaddr) 3512 { 3513 return __execlists_context_pin(ce, ce->engine, vaddr); 3514 } 3515 3516 static int execlists_context_alloc(struct intel_context *ce) 3517 { 3518 return __execlists_context_alloc(ce, ce->engine); 3519 } 3520 3521 static void execlists_context_reset(struct intel_context *ce) 3522 { 3523 CE_TRACE(ce, "reset\n"); 3524 GEM_BUG_ON(!intel_context_is_pinned(ce)); 3525 3526 intel_ring_reset(ce->ring, ce->ring->emit); 3527 3528 /* Scrub away the garbage */ 3529 execlists_init_reg_state(ce->lrc_reg_state, 3530 ce, ce->engine, ce->ring, true); 3531 __execlists_update_reg_state(ce, ce->engine, ce->ring->tail); 3532 3533 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; 3534 } 3535 3536 static const struct intel_context_ops execlists_context_ops = { 3537 .alloc = execlists_context_alloc, 3538 3539 .pre_pin = execlists_context_pre_pin, 3540 .pin = execlists_context_pin, 3541 .unpin = execlists_context_unpin, 3542 .post_unpin = execlists_context_post_unpin, 3543 3544 .enter = intel_context_enter_engine, 3545 .exit = intel_context_exit_engine, 3546 3547 .reset = execlists_context_reset, 3548 .destroy = execlists_context_destroy, 3549 }; 3550 3551 static u32 hwsp_offset(const struct i915_request *rq) 3552 { 3553 const struct intel_timeline_cacheline *cl; 3554 3555 /* Before the request is executed, the timeline/cachline is fixed */ 3556 3557 cl = rcu_dereference_protected(rq->hwsp_cacheline, 1); 3558 if (cl) 3559 return cl->ggtt_offset; 3560 3561 return rcu_dereference_protected(rq->timeline, 1)->hwsp_offset; 3562 } 3563 3564 static int gen8_emit_init_breadcrumb(struct i915_request *rq) 3565 { 3566 u32 *cs; 3567 3568 GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq)); 3569 if (!i915_request_timeline(rq)->has_initial_breadcrumb) 3570 return 0; 3571 3572 cs = intel_ring_begin(rq, 6); 3573 if (IS_ERR(cs)) 3574 return PTR_ERR(cs); 3575 3576 /* 3577 * Check if we have been preempted before we even get started. 3578 * 3579 * After this point i915_request_started() reports true, even if 3580 * we get preempted and so are no longer running. 3581 */ 3582 *cs++ = MI_ARB_CHECK; 3583 *cs++ = MI_NOOP; 3584 3585 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 3586 *cs++ = hwsp_offset(rq); 3587 *cs++ = 0; 3588 *cs++ = rq->fence.seqno - 1; 3589 3590 intel_ring_advance(rq, cs); 3591 3592 /* Record the updated position of the request's payload */ 3593 rq->infix = intel_ring_offset(rq, cs); 3594 3595 __set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags); 3596 3597 return 0; 3598 } 3599 3600 static int emit_pdps(struct i915_request *rq) 3601 { 3602 const struct intel_engine_cs * const engine = rq->engine; 3603 struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm); 3604 int err, i; 3605 u32 *cs; 3606 3607 GEM_BUG_ON(intel_vgpu_active(rq->engine->i915)); 3608 3609 /* 3610 * Beware ye of the dragons, this sequence is magic! 3611 * 3612 * Small changes to this sequence can cause anything from 3613 * GPU hangs to forcewake errors and machine lockups! 
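 *
 * Roughly: flush the residue of the context load, invalidate (the
 * "magic" forcewake step), then a single force-posted LRI updating
 * the four PDP register pairs, padded with a trailing MI_NOOP.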
3614 */ 3615 3616 /* Flush any residual operations from the context load */ 3617 err = engine->emit_flush(rq, EMIT_FLUSH); 3618 if (err) 3619 return err; 3620 3621 /* Magic required to prevent forcewake errors! */ 3622 err = engine->emit_flush(rq, EMIT_INVALIDATE); 3623 if (err) 3624 return err; 3625 3626 cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2); 3627 if (IS_ERR(cs)) 3628 return PTR_ERR(cs); 3629 3630 /* Ensure the LRI have landed before we invalidate & continue */ 3631 *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED; 3632 for (i = GEN8_3LVL_PDPES; i--; ) { 3633 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i); 3634 u32 base = engine->mmio_base; 3635 3636 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i)); 3637 *cs++ = upper_32_bits(pd_daddr); 3638 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i)); 3639 *cs++ = lower_32_bits(pd_daddr); 3640 } 3641 *cs++ = MI_NOOP; 3642 3643 intel_ring_advance(rq, cs); 3644 3645 return 0; 3646 } 3647 3648 static int execlists_request_alloc(struct i915_request *request) 3649 { 3650 int ret; 3651 3652 GEM_BUG_ON(!intel_context_is_pinned(request->context)); 3653 3654 /* 3655 * Flush enough space to reduce the likelihood of waiting after 3656 * we start building the request - in which case we will just 3657 * have to repeat work. 3658 */ 3659 request->reserved_space += EXECLISTS_REQUEST_SIZE; 3660 3661 /* 3662 * Note that after this point, we have committed to using 3663 * this request as it is being used to both track the 3664 * state of engine initialisation and liveness of the 3665 * golden renderstate above. Think twice before you try 3666 * to cancel/unwind this request now. 3667 */ 3668 3669 if (!i915_vm_is_4lvl(request->context->vm)) { 3670 ret = emit_pdps(request); 3671 if (ret) 3672 return ret; 3673 } 3674 3675 /* Unconditionally invalidate GPU caches and TLBs. */ 3676 ret = request->engine->emit_flush(request, EMIT_INVALIDATE); 3677 if (ret) 3678 return ret; 3679 3680 request->reserved_space -= EXECLISTS_REQUEST_SIZE; 3681 return 0; 3682 } 3683 3684 /* 3685 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after 3686 * PIPE_CONTROL instruction. This is required for the flush to happen correctly 3687 * but there is a slight complication as this is applied in WA batch where the 3688 * values are only initialized once so we cannot take register value at the 3689 * beginning and reuse it further; hence we save its value to memory, upload a 3690 * constant value with bit21 set and then we restore it back with the saved value. 3691 * To simplify the WA, a constant value is formed by using the default value 3692 * of this register. This shouldn't be a problem because we are only modifying 3693 * it for a short period and this batch in non-premptible. We can ofcourse 3694 * use additional instructions that read the actual value of the register 3695 * at that time and set our bit of interest but it makes the WA complicated. 3696 * 3697 * This WA is also required for Gen9 so extracting as a function avoids 3698 * code duplication. 3699 */ 3700 static u32 * 3701 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch) 3702 { 3703 /* NB no one else is allowed to scribble over scratch + 256! 
*/ 3704 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 3705 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3706 *batch++ = intel_gt_scratch_offset(engine->gt, 3707 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 3708 *batch++ = 0; 3709 3710 *batch++ = MI_LOAD_REGISTER_IMM(1); 3711 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3712 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES; 3713 3714 batch = gen8_emit_pipe_control(batch, 3715 PIPE_CONTROL_CS_STALL | 3716 PIPE_CONTROL_DC_FLUSH_ENABLE, 3717 0); 3718 3719 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 3720 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3721 *batch++ = intel_gt_scratch_offset(engine->gt, 3722 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 3723 *batch++ = 0; 3724 3725 return batch; 3726 } 3727 3728 /* 3729 * Typically we only have one indirect_ctx and per_ctx batch buffer which are 3730 * initialized at the beginning and shared across all contexts but this field 3731 * helps us to have multiple batches at different offsets and select them based 3732 * on a criteria. At the moment this batch always start at the beginning of the page 3733 * and at this point we don't have multiple wa_ctx batch buffers. 3734 * 3735 * The number of WA applied are not known at the beginning; we use this field 3736 * to return the no of DWORDS written. 3737 * 3738 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END 3739 * so it adds NOOPs as padding to make it cacheline aligned. 3740 * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together 3741 * makes a complete batch buffer. 3742 */ 3743 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3744 { 3745 /* WaDisableCtxRestoreArbitration:bdw,chv */ 3746 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3747 3748 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ 3749 if (IS_BROADWELL(engine->i915)) 3750 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3751 3752 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ 3753 /* Actual scratch location is at 128 bytes offset */ 3754 batch = gen8_emit_pipe_control(batch, 3755 PIPE_CONTROL_FLUSH_L3 | 3756 PIPE_CONTROL_STORE_DATA_INDEX | 3757 PIPE_CONTROL_CS_STALL | 3758 PIPE_CONTROL_QW_WRITE, 3759 LRC_PPHWSP_SCRATCH_ADDR); 3760 3761 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3762 3763 /* Pad to end of cacheline */ 3764 while ((unsigned long)batch % CACHELINE_BYTES) 3765 *batch++ = MI_NOOP; 3766 3767 /* 3768 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because 3769 * execution depends on the length specified in terms of cache lines 3770 * in the register CTX_RCS_INDIRECT_CTX 3771 */ 3772 3773 return batch; 3774 } 3775 3776 struct lri { 3777 i915_reg_t reg; 3778 u32 value; 3779 }; 3780 3781 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count) 3782 { 3783 GEM_BUG_ON(!count || count > 63); 3784 3785 *batch++ = MI_LOAD_REGISTER_IMM(count); 3786 do { 3787 *batch++ = i915_mmio_reg_offset(lri->reg); 3788 *batch++ = lri->value; 3789 } while (lri++, --count); 3790 *batch++ = MI_NOOP; 3791 3792 return batch; 3793 } 3794 3795 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3796 { 3797 static const struct lri lri[] = { 3798 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */ 3799 { 3800 COMMON_SLICE_CHICKEN2, 3801 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE, 3802 0), 3803 }, 3804 3805 /* BSpec: 11391 */ 3806 { 3807 FF_SLICE_CHICKEN, 3808 
__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, 3809 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), 3810 }, 3811 3812 /* BSpec: 11299 */ 3813 { 3814 _3D_CHICKEN3, 3815 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, 3816 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), 3817 } 3818 }; 3819 3820 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3821 3822 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ 3823 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3824 3825 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */ 3826 batch = gen8_emit_pipe_control(batch, 3827 PIPE_CONTROL_FLUSH_L3 | 3828 PIPE_CONTROL_STORE_DATA_INDEX | 3829 PIPE_CONTROL_CS_STALL | 3830 PIPE_CONTROL_QW_WRITE, 3831 LRC_PPHWSP_SCRATCH_ADDR); 3832 3833 batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); 3834 3835 /* WaMediaPoolStateCmdInWABB:bxt,glk */ 3836 if (HAS_POOLED_EU(engine->i915)) { 3837 /* 3838 * EU pool configuration is setup along with golden context 3839 * during context initialization. This value depends on 3840 * device type (2x6 or 3x6) and needs to be updated based 3841 * on which subslice is disabled especially for 2x6 3842 * devices, however it is safe to load default 3843 * configuration of 3x6 device instead of masking off 3844 * corresponding bits because HW ignores bits of a disabled 3845 * subslice and drops down to appropriate config. Please 3846 * see render_state_setup() in i915_gem_render_state.c for 3847 * possible configurations, to avoid duplication they are 3848 * not shown here again. 3849 */ 3850 *batch++ = GEN9_MEDIA_POOL_STATE; 3851 *batch++ = GEN9_MEDIA_POOL_ENABLE; 3852 *batch++ = 0x00777000; 3853 *batch++ = 0; 3854 *batch++ = 0; 3855 *batch++ = 0; 3856 } 3857 3858 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3859 3860 /* Pad to end of cacheline */ 3861 while ((unsigned long)batch % CACHELINE_BYTES) 3862 *batch++ = MI_NOOP; 3863 3864 return batch; 3865 } 3866 3867 static u32 * 3868 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3869 { 3870 int i; 3871 3872 /* 3873 * WaPipeControlBefore3DStateSamplePattern: cnl 3874 * 3875 * Ensure the engine is idle prior to programming a 3876 * 3DSTATE_SAMPLE_PATTERN during a context restore. 3877 */ 3878 batch = gen8_emit_pipe_control(batch, 3879 PIPE_CONTROL_CS_STALL, 3880 0); 3881 /* 3882 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for 3883 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in 3884 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is 3885 * confusing. Since gen8_emit_pipe_control() already advances the 3886 * batch by 6 dwords, we advance the other 10 here, completing a 3887 * cacheline. It's not clear if the workaround requires this padding 3888 * before other commands, or if it's just the regular padding we would 3889 * already have for the workaround bb, so leave it here for now. 
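         *
         * For reference, the byte arithmetic behind the counts above: 16 dwords
         * is 64 bytes, i.e. one CACHELINE_BYTES worth of commands (assuming the
         * usual 64-byte cacheline), which is also what the generic padding loop
         * at the end of this function rounds up to.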
3890 */ 3891 for (i = 0; i < 10; i++) 3892 *batch++ = MI_NOOP; 3893 3894 /* Pad to end of cacheline */ 3895 while ((unsigned long)batch % CACHELINE_BYTES) 3896 *batch++ = MI_NOOP; 3897 3898 return batch; 3899 } 3900 3901 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE) 3902 3903 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) 3904 { 3905 struct drm_i915_gem_object *obj; 3906 struct i915_vma *vma; 3907 int err; 3908 3909 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE); 3910 if (IS_ERR(obj)) 3911 return PTR_ERR(obj); 3912 3913 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 3914 if (IS_ERR(vma)) { 3915 err = PTR_ERR(vma); 3916 goto err; 3917 } 3918 3919 err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH); 3920 if (err) 3921 goto err; 3922 3923 engine->wa_ctx.vma = vma; 3924 return 0; 3925 3926 err: 3927 i915_gem_object_put(obj); 3928 return err; 3929 } 3930 3931 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine) 3932 { 3933 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 3934 } 3935 3936 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); 3937 3938 static int intel_init_workaround_bb(struct intel_engine_cs *engine) 3939 { 3940 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 3941 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx, 3942 &wa_ctx->per_ctx }; 3943 wa_bb_func_t wa_bb_fn[2]; 3944 void *batch, *batch_ptr; 3945 unsigned int i; 3946 int ret; 3947 3948 if (engine->class != RENDER_CLASS) 3949 return 0; 3950 3951 switch (INTEL_GEN(engine->i915)) { 3952 case 12: 3953 case 11: 3954 return 0; 3955 case 10: 3956 wa_bb_fn[0] = gen10_init_indirectctx_bb; 3957 wa_bb_fn[1] = NULL; 3958 break; 3959 case 9: 3960 wa_bb_fn[0] = gen9_init_indirectctx_bb; 3961 wa_bb_fn[1] = NULL; 3962 break; 3963 case 8: 3964 wa_bb_fn[0] = gen8_init_indirectctx_bb; 3965 wa_bb_fn[1] = NULL; 3966 break; 3967 default: 3968 MISSING_CASE(INTEL_GEN(engine->i915)); 3969 return 0; 3970 } 3971 3972 ret = lrc_setup_wa_ctx(engine); 3973 if (ret) { 3974 drm_dbg(&engine->i915->drm, 3975 "Failed to setup context WA page: %d\n", ret); 3976 return ret; 3977 } 3978 3979 batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB); 3980 3981 /* 3982 * Emit the two workaround batch buffers, recording the offset from the 3983 * start of the workaround batch buffer object for each and their 3984 * respective sizes. 3985 */ 3986 batch_ptr = batch; 3987 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { 3988 wa_bb[i]->offset = batch_ptr - batch; 3989 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, 3990 CACHELINE_BYTES))) { 3991 ret = -EINVAL; 3992 break; 3993 } 3994 if (wa_bb_fn[i]) 3995 batch_ptr = wa_bb_fn[i](engine, batch_ptr); 3996 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); 3997 } 3998 GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE); 3999 4000 __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch); 4001 __i915_gem_object_release_map(wa_ctx->vma->obj); 4002 if (ret) 4003 lrc_destroy_wa_ctx(engine); 4004 4005 return ret; 4006 } 4007 4008 static void reset_csb_pointers(struct intel_engine_cs *engine) 4009 { 4010 struct intel_engine_execlists * const execlists = &engine->execlists; 4011 const unsigned int reset_value = execlists->csb_size - 1; 4012 4013 ring_set_paused(engine, 0); 4014 4015 /* 4016 * Sometimes Icelake forgets to reset its pointers on a GPU reset. 4017 * Bludgeon them with a mmio update to be sure. 
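         *
         * A hedged note on the value written below: RING_CONTEXT_STATUS_PTR
         * appears to follow the usual masked-register convention, with the
         * upper 16 bits acting as a write-enable mask for the pointer fields
         * in the low 16 bits. Writing 0xffff << 16 therefore unmasks both
         * fields, and the read and write pointers are each loaded with
         * reset_value (csb_size - 1), matching the cached csb_head set up
         * just below.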
4018 */ 4019 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, 4020 0xffff << 16 | reset_value << 8 | reset_value); 4021 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 4022 4023 /* 4024 * After a reset, the HW starts writing into CSB entry [0]. We 4025 * therefore have to set our HEAD pointer back one entry so that 4026 * the *first* entry we check is entry 0. To complicate this further, 4027 * as we don't wait for the first interrupt after reset, we have to 4028 * fake the HW write to point back to the last entry so that our 4029 * inline comparison of our cached head position against the last HW 4030 * write works even before the first interrupt. 4031 */ 4032 execlists->csb_head = reset_value; 4033 WRITE_ONCE(*execlists->csb_write, reset_value); 4034 wmb(); /* Make sure this is visible to HW (paranoia?) */ 4035 4036 /* Check that the GPU does indeed update the CSB entries! */ 4037 memset(execlists->csb_status, -1, (reset_value + 1) * sizeof(u64)); 4038 invalidate_csb_entries(&execlists->csb_status[0], 4039 &execlists->csb_status[reset_value]); 4040 4041 /* Once more for luck and our trusty paranoia */ 4042 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, 4043 0xffff << 16 | reset_value << 8 | reset_value); 4044 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 4045 4046 GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value); 4047 } 4048 4049 static void execlists_sanitize(struct intel_engine_cs *engine) 4050 { 4051 /* 4052 * Poison residual state on resume, in case the suspend didn't! 4053 * 4054 * We have to assume that across suspend/resume (or other loss 4055 * of control) that the contents of our pinned buffers has been 4056 * lost, replaced by garbage. Since this doesn't always happen, 4057 * let's poison such state so that we more quickly spot when 4058 * we falsely assume it has been preserved. 4059 */ 4060 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 4061 memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE); 4062 4063 reset_csb_pointers(engine); 4064 4065 /* 4066 * The kernel_context HWSP is stored in the status_page. As above, 4067 * that may be lost on resume/initialisation, and so we need to 4068 * reset the value in the HWSP. 4069 */ 4070 intel_timeline_reset_seqno(engine->kernel_context->timeline); 4071 4072 /* And scrub the dirty cachelines for the HWSP */ 4073 clflush_cache_range(engine->status_page.addr, PAGE_SIZE); 4074 } 4075 4076 static void enable_error_interrupt(struct intel_engine_cs *engine) 4077 { 4078 u32 status; 4079 4080 engine->execlists.error_interrupt = 0; 4081 ENGINE_WRITE(engine, RING_EMR, ~0u); 4082 ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */ 4083 4084 status = ENGINE_READ(engine, RING_ESR); 4085 if (unlikely(status)) { 4086 drm_err(&engine->i915->drm, 4087 "engine '%s' resumed still in error: %08x\n", 4088 engine->name, status); 4089 __intel_gt_reset(engine->gt, engine->mask); 4090 } 4091 4092 /* 4093 * On current gen8+, we have 2 signals to play with 4094 * 4095 * - I915_ERROR_INSTUCTION (bit 0) 4096 * 4097 * Generate an error if the command parser encounters an invalid 4098 * instruction 4099 * 4100 * This is a fatal error. 4101 * 4102 * - CP_PRIV (bit 2) 4103 * 4104 * Generate an error on privilege violation (where the CP replaces 4105 * the instruction with a no-op). This also fires for writes into 4106 * read-only scratch pages. 4107 * 4108 * This is a non-fatal error, parsing continues. 
4109 * 4110 * * there are a few others defined for odd HW that we do not use 4111 * 4112 * Since CP_PRIV fires for cases where we have chosen to ignore the 4113 * error (as the HW is validating and suppressing the mistakes), we 4114 * only unmask the instruction error bit. 4115 */ 4116 ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION); 4117 } 4118 4119 static void enable_execlists(struct intel_engine_cs *engine) 4120 { 4121 u32 mode; 4122 4123 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL); 4124 4125 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */ 4126 4127 if (INTEL_GEN(engine->i915) >= 11) 4128 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE); 4129 else 4130 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE); 4131 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode); 4132 4133 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); 4134 4135 ENGINE_WRITE_FW(engine, 4136 RING_HWS_PGA, 4137 i915_ggtt_offset(engine->status_page.vma)); 4138 ENGINE_POSTING_READ(engine, RING_HWS_PGA); 4139 4140 enable_error_interrupt(engine); 4141 4142 engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0); 4143 } 4144 4145 static bool unexpected_starting_state(struct intel_engine_cs *engine) 4146 { 4147 bool unexpected = false; 4148 4149 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) { 4150 drm_dbg(&engine->i915->drm, 4151 "STOP_RING still set in RING_MI_MODE\n"); 4152 unexpected = true; 4153 } 4154 4155 return unexpected; 4156 } 4157 4158 static int execlists_resume(struct intel_engine_cs *engine) 4159 { 4160 intel_mocs_init_engine(engine); 4161 4162 intel_breadcrumbs_reset(engine->breadcrumbs); 4163 4164 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) { 4165 struct drm_printer p = drm_debug_printer(__func__); 4166 4167 intel_engine_dump(engine, &p, NULL); 4168 } 4169 4170 enable_execlists(engine); 4171 4172 return 0; 4173 } 4174 4175 static void execlists_reset_prepare(struct intel_engine_cs *engine) 4176 { 4177 struct intel_engine_execlists * const execlists = &engine->execlists; 4178 unsigned long flags; 4179 4180 ENGINE_TRACE(engine, "depth<-%d\n", 4181 atomic_read(&execlists->tasklet.count)); 4182 4183 /* 4184 * Prevent request submission to the hardware until we have 4185 * completed the reset in i915_gem_reset_finish(). If a request 4186 * is completed by one engine, it may then queue a request 4187 * to a second via its execlists->tasklet *just* as we are 4188 * calling engine->resume() and also writing the ELSP. 4189 * Turning off the execlists->tasklet until the reset is over 4190 * prevents the race. 4191 */ 4192 __tasklet_disable_sync_once(&execlists->tasklet); 4193 GEM_BUG_ON(!reset_in_progress(execlists)); 4194 4195 /* And flush any current direct submission. */ 4196 spin_lock_irqsave(&engine->active.lock, flags); 4197 spin_unlock_irqrestore(&engine->active.lock, flags); 4198 4199 /* 4200 * We stop engines, otherwise we might get failed reset and a 4201 * dead gpu (on elk). Also as modern gpu as kbl can suffer 4202 * from system hang if batchbuffer is progressing when 4203 * the reset is issued, regardless of READY_TO_RESET ack. 4204 * Thus assume it is best to stop engines on all gens 4205 * where we have a gpu reset. 
4206 * 4207 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) 4208 * 4209 * FIXME: Wa for more modern gens needs to be validated 4210 */ 4211 ring_set_paused(engine, 1); 4212 intel_engine_stop_cs(engine); 4213 4214 engine->execlists.reset_ccid = active_ccid(engine); 4215 } 4216 4217 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine) 4218 { 4219 int x; 4220 4221 x = lrc_ring_mi_mode(engine); 4222 if (x != -1) { 4223 regs[x + 1] &= ~STOP_RING; 4224 regs[x + 1] |= STOP_RING << 16; 4225 } 4226 } 4227 4228 static void __execlists_reset_reg_state(const struct intel_context *ce, 4229 const struct intel_engine_cs *engine) 4230 { 4231 u32 *regs = ce->lrc_reg_state; 4232 4233 __reset_stop_ring(regs, engine); 4234 } 4235 4236 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) 4237 { 4238 struct intel_engine_execlists * const execlists = &engine->execlists; 4239 struct intel_context *ce; 4240 struct i915_request *rq; 4241 u32 head; 4242 4243 mb(); /* paranoia: read the CSB pointers from after the reset */ 4244 clflush(execlists->csb_write); 4245 mb(); 4246 4247 process_csb(engine); /* drain preemption events */ 4248 4249 /* Following the reset, we need to reload the CSB read/write pointers */ 4250 reset_csb_pointers(engine); 4251 4252 /* 4253 * Save the currently executing context, even if we completed 4254 * its request, it was still running at the time of the 4255 * reset and will have been clobbered. 4256 */ 4257 rq = active_context(engine, engine->execlists.reset_ccid); 4258 if (!rq) 4259 goto unwind; 4260 4261 ce = rq->context; 4262 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 4263 4264 if (i915_request_completed(rq)) { 4265 /* Idle context; tidy up the ring so we can restart afresh */ 4266 head = intel_ring_wrap(ce->ring, rq->tail); 4267 goto out_replay; 4268 } 4269 4270 /* We still have requests in-flight; the engine should be active */ 4271 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 4272 4273 /* Context has requests still in-flight; it should not be idle! */ 4274 GEM_BUG_ON(i915_active_is_idle(&ce->active)); 4275 4276 rq = active_request(ce->timeline, rq); 4277 head = intel_ring_wrap(ce->ring, rq->head); 4278 GEM_BUG_ON(head == ce->ring->tail); 4279 4280 /* 4281 * If this request hasn't started yet, e.g. it is waiting on a 4282 * semaphore, we need to avoid skipping the request or else we 4283 * break the signaling chain. However, if the context is corrupt 4284 * the request will not restart and we will be stuck with a wedged 4285 * device. It is quite often the case that if we issue a reset 4286 * while the GPU is loading the context image, that the context 4287 * image becomes corrupt. 4288 * 4289 * Otherwise, if we have not started yet, the request should replay 4290 * perfectly and we do not need to flag the result as being erroneous. 4291 */ 4292 if (!i915_request_started(rq)) 4293 goto out_replay; 4294 4295 /* 4296 * If the request was innocent, we leave the request in the ELSP 4297 * and will try to replay it on restarting. The context image may 4298 * have been corrupted by the reset, in which case we may have 4299 * to service a new GPU hang, but more likely we can continue on 4300 * without impact. 4301 * 4302 * If the request was guilty, we presume the context is corrupt 4303 * and have to at least restore the RING register in the context 4304 * image back to the expected values to skip over the guilty request. 
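         *
         * (The 'stalled' argument passed to __i915_request_reset() below
         * carries this innocent/guilty decision from the caller: it is
         * expected to be set when the engine was actually stuck on the
         * offending workload, and is always true on the cancel/wedge path.)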
4305 */ 4306 __i915_request_reset(rq, stalled); 4307 4308 /* 4309 * We want a simple context + ring to execute the breadcrumb update. 4310 * We cannot rely on the context being intact across the GPU hang, 4311 * so clear it and rebuild just what we need for the breadcrumb. 4312 * All pending requests for this context will be zapped, and any 4313 * future request will be after userspace has had the opportunity 4314 * to recreate its own state. 4315 */ 4316 out_replay: 4317 ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n", 4318 head, ce->ring->tail); 4319 __execlists_reset_reg_state(ce, engine); 4320 __execlists_update_reg_state(ce, engine, head); 4321 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */ 4322 4323 unwind: 4324 /* Push back any incomplete requests for replay after the reset. */ 4325 cancel_port_requests(execlists); 4326 __unwind_incomplete_requests(engine); 4327 } 4328 4329 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled) 4330 { 4331 unsigned long flags; 4332 4333 ENGINE_TRACE(engine, "\n"); 4334 4335 spin_lock_irqsave(&engine->active.lock, flags); 4336 4337 __execlists_reset(engine, stalled); 4338 4339 spin_unlock_irqrestore(&engine->active.lock, flags); 4340 } 4341 4342 static void nop_submission_tasklet(unsigned long data) 4343 { 4344 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 4345 4346 /* The driver is wedged; don't process any more events. */ 4347 WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN); 4348 } 4349 4350 static void execlists_reset_cancel(struct intel_engine_cs *engine) 4351 { 4352 struct intel_engine_execlists * const execlists = &engine->execlists; 4353 struct i915_request *rq, *rn; 4354 struct rb_node *rb; 4355 unsigned long flags; 4356 4357 ENGINE_TRACE(engine, "\n"); 4358 4359 /* 4360 * Before we call engine->cancel_requests(), we should have exclusive 4361 * access to the submission state. This is arranged for us by the 4362 * caller disabling the interrupt generation, the tasklet and other 4363 * threads that may then access the same state, giving us a free hand 4364 * to reset state. However, we still need to let lockdep be aware that 4365 * we know this state may be accessed in hardirq context, so we 4366 * disable the irq around this manipulation and we want to keep 4367 * the spinlock focused on its duties and not accidentally conflate 4368 * coverage to the submission's irq state. (Similarly, although we 4369 * shouldn't need to disable irq around the manipulation of the 4370 * submission's irq state, we also wish to remind ourselves that 4371 * it is irq state.) 4372 */ 4373 spin_lock_irqsave(&engine->active.lock, flags); 4374 4375 __execlists_reset(engine, true); 4376 4377 /* Mark all executing requests as skipped. */ 4378 list_for_each_entry(rq, &engine->active.requests, sched.link) 4379 mark_eio(rq); 4380 4381 /* Flush the queued requests to the timeline list (for retiring). 
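         * Each priolist node in the queue below is drained in turn: every
         * request is marked with -EIO via mark_eio() and then formally
         * submitted so that it reaches its timeline and can be retired,
         * after which the emptied priolist node itself is freed.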
*/ 4382 while ((rb = rb_first_cached(&execlists->queue))) { 4383 struct i915_priolist *p = to_priolist(rb); 4384 int i; 4385 4386 priolist_for_each_request_consume(rq, rn, p, i) { 4387 mark_eio(rq); 4388 __i915_request_submit(rq); 4389 } 4390 4391 rb_erase_cached(&p->node, &execlists->queue); 4392 i915_priolist_free(p); 4393 } 4394 4395 /* On-hold requests will be flushed to timeline upon their release */ 4396 list_for_each_entry(rq, &engine->active.hold, sched.link) 4397 mark_eio(rq); 4398 4399 /* Cancel all attached virtual engines */ 4400 while ((rb = rb_first_cached(&execlists->virtual))) { 4401 struct virtual_engine *ve = 4402 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 4403 4404 rb_erase_cached(rb, &execlists->virtual); 4405 RB_CLEAR_NODE(rb); 4406 4407 spin_lock(&ve->base.active.lock); 4408 rq = fetch_and_zero(&ve->request); 4409 if (rq) { 4410 mark_eio(rq); 4411 4412 rq->engine = engine; 4413 __i915_request_submit(rq); 4414 i915_request_put(rq); 4415 4416 ve->base.execlists.queue_priority_hint = INT_MIN; 4417 } 4418 spin_unlock(&ve->base.active.lock); 4419 } 4420 4421 /* Remaining _unready_ requests will be nop'ed when submitted */ 4422 4423 execlists->queue_priority_hint = INT_MIN; 4424 execlists->queue = RB_ROOT_CACHED; 4425 4426 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet)); 4427 execlists->tasklet.func = nop_submission_tasklet; 4428 4429 spin_unlock_irqrestore(&engine->active.lock, flags); 4430 } 4431 4432 static void execlists_reset_finish(struct intel_engine_cs *engine) 4433 { 4434 struct intel_engine_execlists * const execlists = &engine->execlists; 4435 4436 /* 4437 * After a GPU reset, we may have requests to replay. Do so now while 4438 * we still have the forcewake to be sure that the GPU is not allowed 4439 * to sleep before we restart and reload a context. 4440 */ 4441 GEM_BUG_ON(!reset_in_progress(execlists)); 4442 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root)) 4443 execlists->tasklet.func(execlists->tasklet.data); 4444 4445 if (__tasklet_enable(&execlists->tasklet)) 4446 /* And kick in case we missed a new request submission. */ 4447 tasklet_hi_schedule(&execlists->tasklet); 4448 ENGINE_TRACE(engine, "depth->%d\n", 4449 atomic_read(&execlists->tasklet.count)); 4450 } 4451 4452 static int gen8_emit_bb_start_noarb(struct i915_request *rq, 4453 u64 offset, u32 len, 4454 const unsigned int flags) 4455 { 4456 u32 *cs; 4457 4458 cs = intel_ring_begin(rq, 4); 4459 if (IS_ERR(cs)) 4460 return PTR_ERR(cs); 4461 4462 /* 4463 * WaDisableCtxRestoreArbitration:bdw,chv 4464 * 4465 * We don't need to perform MI_ARB_ENABLE as often as we do (in 4466 * particular all the gen that do not need the w/a at all!), if we 4467 * took care to make sure that on every switch into this context 4468 * (both ordinary and for preemption) that arbitrartion was enabled 4469 * we would be fine. However, for gen8 there is another w/a that 4470 * requires us to not preempt inside GPGPU execution, so we keep 4471 * arbitration disabled for gen8 batches. Arbitration will be 4472 * re-enabled before we close the request 4473 * (engine->emit_fini_breadcrumb). 4474 */ 4475 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 4476 4477 /* FIXME(BDW+): Address space and security selectors. */ 4478 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 4479 (flags & I915_DISPATCH_SECURE ? 
0 : BIT(8)); 4480 *cs++ = lower_32_bits(offset); 4481 *cs++ = upper_32_bits(offset); 4482 4483 intel_ring_advance(rq, cs); 4484 4485 return 0; 4486 } 4487 4488 static int gen8_emit_bb_start(struct i915_request *rq, 4489 u64 offset, u32 len, 4490 const unsigned int flags) 4491 { 4492 u32 *cs; 4493 4494 cs = intel_ring_begin(rq, 6); 4495 if (IS_ERR(cs)) 4496 return PTR_ERR(cs); 4497 4498 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4499 4500 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 4501 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 4502 *cs++ = lower_32_bits(offset); 4503 *cs++ = upper_32_bits(offset); 4504 4505 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 4506 *cs++ = MI_NOOP; 4507 4508 intel_ring_advance(rq, cs); 4509 4510 return 0; 4511 } 4512 4513 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine) 4514 { 4515 ENGINE_WRITE(engine, RING_IMR, 4516 ~(engine->irq_enable_mask | engine->irq_keep_mask)); 4517 ENGINE_POSTING_READ(engine, RING_IMR); 4518 } 4519 4520 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine) 4521 { 4522 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); 4523 } 4524 4525 static int gen8_emit_flush(struct i915_request *request, u32 mode) 4526 { 4527 u32 cmd, *cs; 4528 4529 cs = intel_ring_begin(request, 4); 4530 if (IS_ERR(cs)) 4531 return PTR_ERR(cs); 4532 4533 cmd = MI_FLUSH_DW + 1; 4534 4535 /* We always require a command barrier so that subsequent 4536 * commands, such as breadcrumb interrupts, are strictly ordered 4537 * wrt the contents of the write cache being flushed to memory 4538 * (and thus being coherent from the CPU). 4539 */ 4540 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 4541 4542 if (mode & EMIT_INVALIDATE) { 4543 cmd |= MI_INVALIDATE_TLB; 4544 if (request->engine->class == VIDEO_DECODE_CLASS) 4545 cmd |= MI_INVALIDATE_BSD; 4546 } 4547 4548 *cs++ = cmd; 4549 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 4550 *cs++ = 0; /* upper addr */ 4551 *cs++ = 0; /* value */ 4552 intel_ring_advance(request, cs); 4553 4554 return 0; 4555 } 4556 4557 static int gen8_emit_flush_render(struct i915_request *request, 4558 u32 mode) 4559 { 4560 bool vf_flush_wa = false, dc_flush_wa = false; 4561 u32 *cs, flags = 0; 4562 int len; 4563 4564 flags |= PIPE_CONTROL_CS_STALL; 4565 4566 if (mode & EMIT_FLUSH) { 4567 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4568 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4569 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4570 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4571 } 4572 4573 if (mode & EMIT_INVALIDATE) { 4574 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4575 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4576 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4577 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4578 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4579 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4580 flags |= PIPE_CONTROL_QW_WRITE; 4581 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4582 4583 /* 4584 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL 4585 * pipe control. 
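                 * (That is what the vf_flush_wa path below implements: when
                 * set, an empty gen8_emit_pipe_control(cs, 0, 0) is emitted
                 * ahead of the main flush.)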
4586 */ 4587 if (IS_GEN(request->engine->i915, 9)) 4588 vf_flush_wa = true; 4589 4590 /* WaForGAMHang:kbl */ 4591 if (IS_KBL_GT_REVID(request->engine->i915, 0, KBL_REVID_B0)) 4592 dc_flush_wa = true; 4593 } 4594 4595 len = 6; 4596 4597 if (vf_flush_wa) 4598 len += 6; 4599 4600 if (dc_flush_wa) 4601 len += 12; 4602 4603 cs = intel_ring_begin(request, len); 4604 if (IS_ERR(cs)) 4605 return PTR_ERR(cs); 4606 4607 if (vf_flush_wa) 4608 cs = gen8_emit_pipe_control(cs, 0, 0); 4609 4610 if (dc_flush_wa) 4611 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE, 4612 0); 4613 4614 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4615 4616 if (dc_flush_wa) 4617 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0); 4618 4619 intel_ring_advance(request, cs); 4620 4621 return 0; 4622 } 4623 4624 static int gen11_emit_flush_render(struct i915_request *request, 4625 u32 mode) 4626 { 4627 if (mode & EMIT_FLUSH) { 4628 u32 *cs; 4629 u32 flags = 0; 4630 4631 flags |= PIPE_CONTROL_CS_STALL; 4632 4633 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 4634 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4635 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4636 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4637 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4638 flags |= PIPE_CONTROL_QW_WRITE; 4639 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4640 4641 cs = intel_ring_begin(request, 6); 4642 if (IS_ERR(cs)) 4643 return PTR_ERR(cs); 4644 4645 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4646 intel_ring_advance(request, cs); 4647 } 4648 4649 if (mode & EMIT_INVALIDATE) { 4650 u32 *cs; 4651 u32 flags = 0; 4652 4653 flags |= PIPE_CONTROL_CS_STALL; 4654 4655 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 4656 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4657 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4658 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4659 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4660 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4661 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4662 flags |= PIPE_CONTROL_QW_WRITE; 4663 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4664 4665 cs = intel_ring_begin(request, 6); 4666 if (IS_ERR(cs)) 4667 return PTR_ERR(cs); 4668 4669 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4670 intel_ring_advance(request, cs); 4671 } 4672 4673 return 0; 4674 } 4675 4676 static u32 preparser_disable(bool state) 4677 { 4678 return MI_ARB_CHECK | 1 << 8 | state; 4679 } 4680 4681 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine) 4682 { 4683 static const i915_reg_t vd[] = { 4684 GEN12_VD0_AUX_NV, 4685 GEN12_VD1_AUX_NV, 4686 GEN12_VD2_AUX_NV, 4687 GEN12_VD3_AUX_NV, 4688 }; 4689 4690 static const i915_reg_t ve[] = { 4691 GEN12_VE0_AUX_NV, 4692 GEN12_VE1_AUX_NV, 4693 }; 4694 4695 if (engine->class == VIDEO_DECODE_CLASS) 4696 return vd[engine->instance]; 4697 4698 if (engine->class == VIDEO_ENHANCEMENT_CLASS) 4699 return ve[engine->instance]; 4700 4701 GEM_BUG_ON("unknown aux_inv_reg\n"); 4702 4703 return INVALID_MMIO_REG; 4704 } 4705 4706 static u32 * 4707 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs) 4708 { 4709 *cs++ = MI_LOAD_REGISTER_IMM(1); 4710 *cs++ = i915_mmio_reg_offset(inv_reg); 4711 *cs++ = AUX_INV; 4712 *cs++ = MI_NOOP; 4713 4714 return cs; 4715 } 4716 4717 static int gen12_emit_flush_render(struct i915_request *request, 4718 u32 mode) 4719 { 4720 if (mode & EMIT_FLUSH) { 4721 u32 flags = 0; 4722 u32 *cs; 4723 4724 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 4725 flags |= PIPE_CONTROL_FLUSH_L3; 4726 flags |= 
PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4727 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4728 /* Wa_1409600907:tgl */ 4729 flags |= PIPE_CONTROL_DEPTH_STALL; 4730 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4731 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4732 4733 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4734 flags |= PIPE_CONTROL_QW_WRITE; 4735 4736 flags |= PIPE_CONTROL_CS_STALL; 4737 4738 cs = intel_ring_begin(request, 6); 4739 if (IS_ERR(cs)) 4740 return PTR_ERR(cs); 4741 4742 cs = gen12_emit_pipe_control(cs, 4743 PIPE_CONTROL0_HDC_PIPELINE_FLUSH, 4744 flags, LRC_PPHWSP_SCRATCH_ADDR); 4745 intel_ring_advance(request, cs); 4746 } 4747 4748 if (mode & EMIT_INVALIDATE) { 4749 u32 flags = 0; 4750 u32 *cs; 4751 4752 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 4753 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4754 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4755 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4756 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4757 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4758 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4759 4760 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4761 flags |= PIPE_CONTROL_QW_WRITE; 4762 4763 flags |= PIPE_CONTROL_CS_STALL; 4764 4765 cs = intel_ring_begin(request, 8 + 4); 4766 if (IS_ERR(cs)) 4767 return PTR_ERR(cs); 4768 4769 /* 4770 * Prevent the pre-parser from skipping past the TLB 4771 * invalidate and loading a stale page for the batch 4772 * buffer / request payload. 4773 */ 4774 *cs++ = preparser_disable(true); 4775 4776 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4777 4778 /* hsdes: 1809175790 */ 4779 cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs); 4780 4781 *cs++ = preparser_disable(false); 4782 intel_ring_advance(request, cs); 4783 } 4784 4785 return 0; 4786 } 4787 4788 static int gen12_emit_flush(struct i915_request *request, u32 mode) 4789 { 4790 intel_engine_mask_t aux_inv = 0; 4791 u32 cmd, *cs; 4792 4793 cmd = 4; 4794 if (mode & EMIT_INVALIDATE) 4795 cmd += 2; 4796 if (mode & EMIT_INVALIDATE) 4797 aux_inv = request->engine->mask & ~BIT(BCS0); 4798 if (aux_inv) 4799 cmd += 2 * hweight8(aux_inv) + 2; 4800 4801 cs = intel_ring_begin(request, cmd); 4802 if (IS_ERR(cs)) 4803 return PTR_ERR(cs); 4804 4805 if (mode & EMIT_INVALIDATE) 4806 *cs++ = preparser_disable(true); 4807 4808 cmd = MI_FLUSH_DW + 1; 4809 4810 /* We always require a command barrier so that subsequent 4811 * commands, such as breadcrumb interrupts, are strictly ordered 4812 * wrt the contents of the write cache being flushed to memory 4813 * (and thus being coherent from the CPU). 
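         *
         * (A note on the ring space reserved above: 'cmd' first serves as
         * the dword count passed to intel_ring_begin(), i.e. 4 dwords for
         * the MI_FLUSH_DW packet itself, plus 2 for the preparser
         * disable/enable pair when invalidating, plus 2 * hweight8(aux_inv)
         * + 2 for the aux-invalidate LRI header, its (reg, value) pairs and
         * the trailing NOOP, before being reused as the command dword below.)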
4814 */ 4815 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 4816 4817 if (mode & EMIT_INVALIDATE) { 4818 cmd |= MI_INVALIDATE_TLB; 4819 if (request->engine->class == VIDEO_DECODE_CLASS) 4820 cmd |= MI_INVALIDATE_BSD; 4821 } 4822 4823 *cs++ = cmd; 4824 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 4825 *cs++ = 0; /* upper addr */ 4826 *cs++ = 0; /* value */ 4827 4828 if (aux_inv) { /* hsdes: 1809175790 */ 4829 struct intel_engine_cs *engine; 4830 unsigned int tmp; 4831 4832 *cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv)); 4833 for_each_engine_masked(engine, request->engine->gt, 4834 aux_inv, tmp) { 4835 *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine)); 4836 *cs++ = AUX_INV; 4837 } 4838 *cs++ = MI_NOOP; 4839 } 4840 4841 if (mode & EMIT_INVALIDATE) 4842 *cs++ = preparser_disable(false); 4843 4844 intel_ring_advance(request, cs); 4845 4846 return 0; 4847 } 4848 4849 static void assert_request_valid(struct i915_request *rq) 4850 { 4851 struct intel_ring *ring __maybe_unused = rq->ring; 4852 4853 /* Can we unwind this request without appearing to go forwards? */ 4854 GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0); 4855 } 4856 4857 /* 4858 * Reserve space for 2 NOOPs at the end of each request to be 4859 * used as a workaround for not being allowed to do lite 4860 * restore with HEAD==TAIL (WaIdleLiteRestore). 4861 */ 4862 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) 4863 { 4864 /* Ensure there's always at least one preemption point per-request. */ 4865 *cs++ = MI_ARB_CHECK; 4866 *cs++ = MI_NOOP; 4867 request->wa_tail = intel_ring_offset(request, cs); 4868 4869 /* Check that entire request is less than half the ring */ 4870 assert_request_valid(request); 4871 4872 return cs; 4873 } 4874 4875 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs) 4876 { 4877 *cs++ = MI_SEMAPHORE_WAIT | 4878 MI_SEMAPHORE_GLOBAL_GTT | 4879 MI_SEMAPHORE_POLL | 4880 MI_SEMAPHORE_SAD_EQ_SDD; 4881 *cs++ = 0; 4882 *cs++ = intel_hws_preempt_address(request->engine); 4883 *cs++ = 0; 4884 4885 return cs; 4886 } 4887 4888 static __always_inline u32* 4889 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) 4890 { 4891 *cs++ = MI_USER_INTERRUPT; 4892 4893 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4894 if (intel_engine_has_semaphores(request->engine)) 4895 cs = emit_preempt_busywait(request, cs); 4896 4897 request->tail = intel_ring_offset(request, cs); 4898 assert_ring_tail_valid(request->ring, request->tail); 4899 4900 return gen8_emit_wa_tail(request, cs); 4901 } 4902 4903 static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs) 4904 { 4905 return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0); 4906 } 4907 4908 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) 4909 { 4910 return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs)); 4911 } 4912 4913 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4914 { 4915 cs = gen8_emit_pipe_control(cs, 4916 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4917 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4918 PIPE_CONTROL_DC_FLUSH_ENABLE, 4919 0); 4920 4921 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ 4922 cs = gen8_emit_ggtt_write_rcs(cs, 4923 request->fence.seqno, 4924 hwsp_offset(request), 4925 PIPE_CONTROL_FLUSH_ENABLE | 4926 PIPE_CONTROL_CS_STALL); 4927 4928 return gen8_emit_fini_breadcrumb_tail(request, cs); 4929 } 4930 4931 static u32 * 4932 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 
4933 { 4934 cs = gen8_emit_ggtt_write_rcs(cs, 4935 request->fence.seqno, 4936 hwsp_offset(request), 4937 PIPE_CONTROL_CS_STALL | 4938 PIPE_CONTROL_TILE_CACHE_FLUSH | 4939 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4940 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4941 PIPE_CONTROL_DC_FLUSH_ENABLE | 4942 PIPE_CONTROL_FLUSH_ENABLE); 4943 4944 return gen8_emit_fini_breadcrumb_tail(request, cs); 4945 } 4946 4947 /* 4948 * Note that the CS instruction pre-parser will not stall on the breadcrumb 4949 * flush and will continue pre-fetching the instructions after it before the 4950 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at 4951 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble 4952 * of the next request before the memory has been flushed, we're guaranteed that 4953 * we won't access the batch itself too early. 4954 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands, 4955 * so, if the current request is modifying an instruction in the next request on 4956 * the same intel_context, we might pre-fetch and then execute the pre-update 4957 * instruction. To avoid this, the users of self-modifying code should either 4958 * disable the parser around the code emitting the memory writes, via a new flag 4959 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For 4960 * the in-kernel use-cases we've opted to use a separate context, see 4961 * reloc_gpu() as an example. 4962 * All the above applies only to the instructions themselves. Non-inline data 4963 * used by the instructions is not pre-fetched. 4964 */ 4965 4966 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs) 4967 { 4968 *cs++ = MI_SEMAPHORE_WAIT_TOKEN | 4969 MI_SEMAPHORE_GLOBAL_GTT | 4970 MI_SEMAPHORE_POLL | 4971 MI_SEMAPHORE_SAD_EQ_SDD; 4972 *cs++ = 0; 4973 *cs++ = intel_hws_preempt_address(request->engine); 4974 *cs++ = 0; 4975 *cs++ = 0; 4976 *cs++ = MI_NOOP; 4977 4978 return cs; 4979 } 4980 4981 static __always_inline u32* 4982 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) 4983 { 4984 *cs++ = MI_USER_INTERRUPT; 4985 4986 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4987 if (intel_engine_has_semaphores(request->engine)) 4988 cs = gen12_emit_preempt_busywait(request, cs); 4989 4990 request->tail = intel_ring_offset(request, cs); 4991 assert_ring_tail_valid(request->ring, request->tail); 4992 4993 return gen8_emit_wa_tail(request, cs); 4994 } 4995 4996 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) 4997 { 4998 /* XXX Stalling flush before seqno write; post-sync not */ 4999 cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0)); 5000 return gen12_emit_fini_breadcrumb_tail(rq, cs); 5001 } 5002 5003 static u32 * 5004 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 5005 { 5006 cs = gen12_emit_ggtt_write_rcs(cs, 5007 request->fence.seqno, 5008 hwsp_offset(request), 5009 PIPE_CONTROL0_HDC_PIPELINE_FLUSH, 5010 PIPE_CONTROL_CS_STALL | 5011 PIPE_CONTROL_TILE_CACHE_FLUSH | 5012 PIPE_CONTROL_FLUSH_L3 | 5013 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 5014 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 5015 /* Wa_1409600907:tgl */ 5016 PIPE_CONTROL_DEPTH_STALL | 5017 PIPE_CONTROL_DC_FLUSH_ENABLE | 5018 PIPE_CONTROL_FLUSH_ENABLE); 5019 5020 return gen12_emit_fini_breadcrumb_tail(request, cs); 5021 } 5022 5023 static void execlists_park(struct intel_engine_cs *engine) 5024 { 5025 cancel_timer(&engine->execlists.timer); 5026 cancel_timer(&engine->execlists.preempt); 5027 } 5028 
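/*
 * A hedged sketch, for orientation only, of what the breadcrumb helpers above
 * append to a gen12 xcs request (the rcs variant swaps the plain GGTT write
 * for gen12_emit_ggtt_write_rcs() with its PIPE_CONTROL flush):
 *
 *      __gen8_emit_flush_dw()          stalling flush before the seqno write
 *      emit_xcs_breadcrumb()           GGTT write of rq->fence.seqno to the HWSP
 *      MI_USER_INTERRUPT
 *      MI_ARB_ON_OFF | MI_ARB_ENABLE
 *      gen12_emit_preempt_busywait()   MI_SEMAPHORE_WAIT_TOKEN on the preempt
 *                                      HWSP (only if the engine has semaphores)
 *      gen8_emit_wa_tail()             MI_ARB_CHECK + MI_NOOP
 *
 * rq->tail is recorded before those final two dwords; the extra wa_tail exists
 * so that a lite restore is never submitted with HEAD == TAIL (see the
 * WaIdleLiteRestore comment above).
 */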
5029 void intel_execlists_set_default_submission(struct intel_engine_cs *engine) 5030 { 5031 engine->submit_request = execlists_submit_request; 5032 engine->schedule = i915_schedule; 5033 engine->execlists.tasklet.func = execlists_submission_tasklet; 5034 5035 engine->reset.prepare = execlists_reset_prepare; 5036 engine->reset.rewind = execlists_reset_rewind; 5037 engine->reset.cancel = execlists_reset_cancel; 5038 engine->reset.finish = execlists_reset_finish; 5039 5040 engine->park = execlists_park; 5041 engine->unpark = NULL; 5042 5043 engine->flags |= I915_ENGINE_SUPPORTS_STATS; 5044 if (!intel_vgpu_active(engine->i915)) { 5045 engine->flags |= I915_ENGINE_HAS_SEMAPHORES; 5046 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) { 5047 engine->flags |= I915_ENGINE_HAS_PREEMPTION; 5048 if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION)) 5049 engine->flags |= I915_ENGINE_HAS_TIMESLICES; 5050 } 5051 } 5052 5053 if (INTEL_GEN(engine->i915) >= 12) 5054 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO; 5055 5056 if (intel_engine_has_preemption(engine)) 5057 engine->emit_bb_start = gen8_emit_bb_start; 5058 else 5059 engine->emit_bb_start = gen8_emit_bb_start_noarb; 5060 } 5061 5062 static void execlists_shutdown(struct intel_engine_cs *engine) 5063 { 5064 /* Synchronise with residual timers and any softirq they raise */ 5065 del_timer_sync(&engine->execlists.timer); 5066 del_timer_sync(&engine->execlists.preempt); 5067 tasklet_kill(&engine->execlists.tasklet); 5068 } 5069 5070 static void execlists_release(struct intel_engine_cs *engine) 5071 { 5072 engine->sanitize = NULL; /* no longer in control, nothing to sanitize */ 5073 5074 execlists_shutdown(engine); 5075 5076 intel_engine_cleanup_common(engine); 5077 lrc_destroy_wa_ctx(engine); 5078 } 5079 5080 static void 5081 logical_ring_default_vfuncs(struct intel_engine_cs *engine) 5082 { 5083 /* Default vfuncs which can be overriden by each engine. */ 5084 5085 engine->resume = execlists_resume; 5086 5087 engine->cops = &execlists_context_ops; 5088 engine->request_alloc = execlists_request_alloc; 5089 5090 engine->emit_flush = gen8_emit_flush; 5091 engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb; 5092 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb; 5093 if (INTEL_GEN(engine->i915) >= 12) { 5094 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb; 5095 engine->emit_flush = gen12_emit_flush; 5096 } 5097 engine->set_default_submission = intel_execlists_set_default_submission; 5098 5099 if (INTEL_GEN(engine->i915) < 11) { 5100 engine->irq_enable = gen8_logical_ring_enable_irq; 5101 engine->irq_disable = gen8_logical_ring_disable_irq; 5102 } else { 5103 /* 5104 * TODO: On Gen11 interrupt masks need to be clear 5105 * to allow C6 entry. Keep interrupts enabled at 5106 * and take the hit of generating extra interrupts 5107 * until a more refined solution exists. 
5108 */ 5109 } 5110 } 5111 5112 static inline void 5113 logical_ring_default_irqs(struct intel_engine_cs *engine) 5114 { 5115 unsigned int shift = 0; 5116 5117 if (INTEL_GEN(engine->i915) < 11) { 5118 const u8 irq_shifts[] = { 5119 [RCS0] = GEN8_RCS_IRQ_SHIFT, 5120 [BCS0] = GEN8_BCS_IRQ_SHIFT, 5121 [VCS0] = GEN8_VCS0_IRQ_SHIFT, 5122 [VCS1] = GEN8_VCS1_IRQ_SHIFT, 5123 [VECS0] = GEN8_VECS_IRQ_SHIFT, 5124 }; 5125 5126 shift = irq_shifts[engine->id]; 5127 } 5128 5129 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift; 5130 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift; 5131 engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift; 5132 engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift; 5133 } 5134 5135 static void rcs_submission_override(struct intel_engine_cs *engine) 5136 { 5137 switch (INTEL_GEN(engine->i915)) { 5138 case 12: 5139 engine->emit_flush = gen12_emit_flush_render; 5140 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs; 5141 break; 5142 case 11: 5143 engine->emit_flush = gen11_emit_flush_render; 5144 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs; 5145 break; 5146 default: 5147 engine->emit_flush = gen8_emit_flush_render; 5148 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs; 5149 break; 5150 } 5151 } 5152 5153 int intel_execlists_submission_setup(struct intel_engine_cs *engine) 5154 { 5155 struct intel_engine_execlists * const execlists = &engine->execlists; 5156 struct drm_i915_private *i915 = engine->i915; 5157 struct intel_uncore *uncore = engine->uncore; 5158 u32 base = engine->mmio_base; 5159 5160 tasklet_init(&engine->execlists.tasklet, 5161 execlists_submission_tasklet, (unsigned long)engine); 5162 timer_setup(&engine->execlists.timer, execlists_timeslice, 0); 5163 timer_setup(&engine->execlists.preempt, execlists_preempt, 0); 5164 5165 logical_ring_default_vfuncs(engine); 5166 logical_ring_default_irqs(engine); 5167 5168 if (engine->class == RENDER_CLASS) 5169 rcs_submission_override(engine); 5170 5171 if (intel_init_workaround_bb(engine)) 5172 /* 5173 * We continue even if we fail to initialize WA batch 5174 * because we only expect rare glitches but nothing 5175 * critical to prevent us from using GPU 5176 */ 5177 drm_err(&i915->drm, "WA batch buffer initialization failed\n"); 5178 5179 if (HAS_LOGICAL_RING_ELSQ(i915)) { 5180 execlists->submit_reg = uncore->regs + 5181 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base)); 5182 execlists->ctrl_reg = uncore->regs + 5183 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base)); 5184 } else { 5185 execlists->submit_reg = uncore->regs + 5186 i915_mmio_reg_offset(RING_ELSP(base)); 5187 } 5188 5189 execlists->csb_status = 5190 (u64 *)&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; 5191 5192 execlists->csb_write = 5193 &engine->status_page.addr[intel_hws_csb_write_index(i915)]; 5194 5195 if (INTEL_GEN(i915) < 11) 5196 execlists->csb_size = GEN8_CSB_ENTRIES; 5197 else 5198 execlists->csb_size = GEN11_CSB_ENTRIES; 5199 5200 if (INTEL_GEN(engine->i915) >= 11) { 5201 execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32); 5202 execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32); 5203 } 5204 5205 /* Finally, take ownership and responsibility for cleanup! 
*/ 5206 engine->sanitize = execlists_sanitize; 5207 engine->release = execlists_release; 5208 5209 return 0; 5210 } 5211 5212 static void init_common_reg_state(u32 * const regs, 5213 const struct intel_engine_cs *engine, 5214 const struct intel_ring *ring, 5215 bool inhibit) 5216 { 5217 u32 ctl; 5218 5219 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH); 5220 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); 5221 if (inhibit) 5222 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT; 5223 if (INTEL_GEN(engine->i915) < 11) 5224 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT | 5225 CTX_CTRL_RS_CTX_ENABLE); 5226 regs[CTX_CONTEXT_CONTROL] = ctl; 5227 5228 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 5229 regs[CTX_TIMESTAMP] = 0; 5230 } 5231 5232 static void init_wa_bb_reg_state(u32 * const regs, 5233 const struct intel_engine_cs *engine) 5234 { 5235 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx; 5236 5237 if (wa_ctx->per_ctx.size) { 5238 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 5239 5240 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1); 5241 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] = 5242 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; 5243 } 5244 5245 if (wa_ctx->indirect_ctx.size) { 5246 lrc_ring_setup_indirect_ctx(regs, engine, 5247 i915_ggtt_offset(wa_ctx->vma) + 5248 wa_ctx->indirect_ctx.offset, 5249 wa_ctx->indirect_ctx.size); 5250 } 5251 } 5252 5253 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt) 5254 { 5255 if (i915_vm_is_4lvl(&ppgtt->vm)) { 5256 /* 64b PPGTT (48bit canonical) 5257 * PDP0_DESCRIPTOR contains the base address to PML4 and 5258 * other PDP Descriptors are ignored. 5259 */ 5260 ASSIGN_CTX_PML4(ppgtt, regs); 5261 } else { 5262 ASSIGN_CTX_PDP(ppgtt, regs, 3); 5263 ASSIGN_CTX_PDP(ppgtt, regs, 2); 5264 ASSIGN_CTX_PDP(ppgtt, regs, 1); 5265 ASSIGN_CTX_PDP(ppgtt, regs, 0); 5266 } 5267 } 5268 5269 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm) 5270 { 5271 if (i915_is_ggtt(vm)) 5272 return i915_vm_to_ggtt(vm)->alias; 5273 else 5274 return i915_vm_to_ppgtt(vm); 5275 } 5276 5277 static void execlists_init_reg_state(u32 *regs, 5278 const struct intel_context *ce, 5279 const struct intel_engine_cs *engine, 5280 const struct intel_ring *ring, 5281 bool inhibit) 5282 { 5283 /* 5284 * A context is actually a big batch buffer with several 5285 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The 5286 * values we are setting here are only for the first context restore: 5287 * on a subsequent save, the GPU will recreate this batchbuffer with new 5288 * values (including all the missing MI_LOAD_REGISTER_IMM commands that 5289 * we are not initializing here). 5290 * 5291 * Must keep consistent with virtual_update_register_offsets(). 
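         *
         * As a rough illustration only (the authoritative layout comes from
         * reg_offsets()), the state written by set_offsets() below has the
         * general shape:
         *
         *      MI_LOAD_REGISTER_IMM(N)
         *      <mmio offset> <value>   e.g. ring head/tail/ctl, context control
         *      <mmio offset> <value>   ...
         *      MI_LOAD_REGISTER_IMM(M)
         *      ...
         *
         * which the init_*_reg_state() helpers below then patch with the
         * per-context values (CTX_RING_CTL, the PDPs, the wa_ctx pointers).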
5292 */ 5293 set_offsets(regs, reg_offsets(engine), engine, inhibit); 5294 5295 init_common_reg_state(regs, engine, ring, inhibit); 5296 init_ppgtt_reg_state(regs, vm_alias(ce->vm)); 5297 5298 init_wa_bb_reg_state(regs, engine); 5299 5300 __reset_stop_ring(regs, engine); 5301 } 5302 5303 static int 5304 populate_lr_context(struct intel_context *ce, 5305 struct drm_i915_gem_object *ctx_obj, 5306 struct intel_engine_cs *engine, 5307 struct intel_ring *ring) 5308 { 5309 bool inhibit = true; 5310 void *vaddr; 5311 5312 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); 5313 if (IS_ERR(vaddr)) { 5314 drm_dbg(&engine->i915->drm, "Could not map object pages!\n"); 5315 return PTR_ERR(vaddr); 5316 } 5317 5318 set_redzone(vaddr, engine); 5319 5320 if (engine->default_state) { 5321 shmem_read(engine->default_state, 0, 5322 vaddr, engine->context_size); 5323 __set_bit(CONTEXT_VALID_BIT, &ce->flags); 5324 inhibit = false; 5325 } 5326 5327 /* Clear the ppHWSP (inc. per-context counters) */ 5328 memset(vaddr, 0, PAGE_SIZE); 5329 5330 /* 5331 * The second page of the context object contains some registers which 5332 * must be set up prior to the first execution. 5333 */ 5334 execlists_init_reg_state(vaddr + LRC_STATE_OFFSET, 5335 ce, engine, ring, inhibit); 5336 5337 __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size); 5338 i915_gem_object_unpin_map(ctx_obj); 5339 return 0; 5340 } 5341 5342 static struct intel_timeline *pinned_timeline(struct intel_context *ce) 5343 { 5344 struct intel_timeline *tl = fetch_and_zero(&ce->timeline); 5345 5346 return intel_timeline_create_from_engine(ce->engine, 5347 page_unmask_bits(tl)); 5348 } 5349 5350 static int __execlists_context_alloc(struct intel_context *ce, 5351 struct intel_engine_cs *engine) 5352 { 5353 struct drm_i915_gem_object *ctx_obj; 5354 struct intel_ring *ring; 5355 struct i915_vma *vma; 5356 u32 context_size; 5357 int ret; 5358 5359 GEM_BUG_ON(ce->state); 5360 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE); 5361 5362 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 5363 context_size += I915_GTT_PAGE_SIZE; /* for redzone */ 5364 5365 if (INTEL_GEN(engine->i915) == 12) { 5366 ce->wa_bb_page = context_size / PAGE_SIZE; 5367 context_size += PAGE_SIZE; 5368 } 5369 5370 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size); 5371 if (IS_ERR(ctx_obj)) 5372 return PTR_ERR(ctx_obj); 5373 5374 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL); 5375 if (IS_ERR(vma)) { 5376 ret = PTR_ERR(vma); 5377 goto error_deref_obj; 5378 } 5379 5380 if (!page_mask_bits(ce->timeline)) { 5381 struct intel_timeline *tl; 5382 5383 /* 5384 * Use the static global HWSP for the kernel context, and 5385 * a dynamically allocated cacheline for everyone else. 
5386 */ 5387 if (unlikely(ce->timeline)) 5388 tl = pinned_timeline(ce); 5389 else 5390 tl = intel_timeline_create(engine->gt); 5391 if (IS_ERR(tl)) { 5392 ret = PTR_ERR(tl); 5393 goto error_deref_obj; 5394 } 5395 5396 ce->timeline = tl; 5397 } 5398 5399 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring); 5400 if (IS_ERR(ring)) { 5401 ret = PTR_ERR(ring); 5402 goto error_deref_obj; 5403 } 5404 5405 ret = populate_lr_context(ce, ctx_obj, engine, ring); 5406 if (ret) { 5407 drm_dbg(&engine->i915->drm, 5408 "Failed to populate LRC: %d\n", ret); 5409 goto error_ring_free; 5410 } 5411 5412 ce->ring = ring; 5413 ce->state = vma; 5414 5415 return 0; 5416 5417 error_ring_free: 5418 intel_ring_put(ring); 5419 error_deref_obj: 5420 i915_gem_object_put(ctx_obj); 5421 return ret; 5422 } 5423 5424 static struct list_head *virtual_queue(struct virtual_engine *ve) 5425 { 5426 return &ve->base.execlists.default_priolist.requests[0]; 5427 } 5428 5429 static void rcu_virtual_context_destroy(struct work_struct *wrk) 5430 { 5431 struct virtual_engine *ve = 5432 container_of(wrk, typeof(*ve), rcu.work); 5433 unsigned int n; 5434 5435 GEM_BUG_ON(ve->context.inflight); 5436 5437 /* Preempt-to-busy may leave a stale request behind. */ 5438 if (unlikely(ve->request)) { 5439 struct i915_request *old; 5440 5441 spin_lock_irq(&ve->base.active.lock); 5442 5443 old = fetch_and_zero(&ve->request); 5444 if (old) { 5445 GEM_BUG_ON(!i915_request_completed(old)); 5446 __i915_request_submit(old); 5447 i915_request_put(old); 5448 } 5449 5450 spin_unlock_irq(&ve->base.active.lock); 5451 } 5452 5453 /* 5454 * Flush the tasklet in case it is still running on another core. 5455 * 5456 * This needs to be done before we remove ourselves from the siblings' 5457 * rbtrees as in the case it is running in parallel, it may reinsert 5458 * the rb_node into a sibling. 5459 */ 5460 tasklet_kill(&ve->base.execlists.tasklet); 5461 5462 /* Decouple ourselves from the siblings, no more access allowed. */ 5463 for (n = 0; n < ve->num_siblings; n++) { 5464 struct intel_engine_cs *sibling = ve->siblings[n]; 5465 struct rb_node *node = &ve->nodes[sibling->id].rb; 5466 5467 if (RB_EMPTY_NODE(node)) 5468 continue; 5469 5470 spin_lock_irq(&sibling->active.lock); 5471 5472 /* Detachment is lazily performed in the execlists tasklet */ 5473 if (!RB_EMPTY_NODE(node)) 5474 rb_erase_cached(node, &sibling->execlists.virtual); 5475 5476 spin_unlock_irq(&sibling->active.lock); 5477 } 5478 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet)); 5479 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 5480 5481 if (ve->context.state) 5482 __execlists_context_fini(&ve->context); 5483 intel_context_fini(&ve->context); 5484 5485 intel_breadcrumbs_free(ve->base.breadcrumbs); 5486 intel_engine_free_request_pool(&ve->base); 5487 5488 kfree(ve->bonds); 5489 kfree(ve); 5490 } 5491 5492 static void virtual_context_destroy(struct kref *kref) 5493 { 5494 struct virtual_engine *ve = 5495 container_of(kref, typeof(*ve), context.ref); 5496 5497 GEM_BUG_ON(!list_empty(&ve->context.signals)); 5498 5499 /* 5500 * When destroying the virtual engine, we have to be aware that 5501 * it may still be in use from an hardirq/softirq context causing 5502 * the resubmission of a completed request (background completion 5503 * due to preempt-to-busy). Before we can free the engine, we need 5504 * to flush the submission code and tasklets that are still potentially 5505 * accessing the engine. 
Flushing the tasklets requires process context, 5506 * and since we can guard the resubmit onto the engine with an RCU read 5507 * lock, we can delegate the free of the engine to an RCU worker. 5508 */ 5509 INIT_RCU_WORK(&ve->rcu, rcu_virtual_context_destroy); 5510 queue_rcu_work(system_wq, &ve->rcu); 5511 } 5512 5513 static void virtual_engine_initial_hint(struct virtual_engine *ve) 5514 { 5515 int swp; 5516 5517 /* 5518 * Pick a random sibling on starting to help spread the load around. 5519 * 5520 * New contexts are typically created with exactly the same order 5521 * of siblings, and often started in batches. Due to the way we iterate 5522 * the array of sibling when submitting requests, sibling[0] is 5523 * prioritised for dequeuing. If we make sure that sibling[0] is fairly 5524 * randomised across the system, we also help spread the load by the 5525 * first engine we inspect being different each time. 5526 * 5527 * NB This does not force us to execute on this engine, it will just 5528 * typically be the first we inspect for submission. 5529 */ 5530 swp = prandom_u32_max(ve->num_siblings); 5531 if (swp) 5532 swap(ve->siblings[swp], ve->siblings[0]); 5533 } 5534 5535 static int virtual_context_alloc(struct intel_context *ce) 5536 { 5537 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5538 5539 return __execlists_context_alloc(ce, ve->siblings[0]); 5540 } 5541 5542 static int virtual_context_pin(struct intel_context *ce, void *vaddr) 5543 { 5544 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5545 5546 /* Note: we must use a real engine class for setting up reg state */ 5547 return __execlists_context_pin(ce, ve->siblings[0], vaddr); 5548 } 5549 5550 static void virtual_context_enter(struct intel_context *ce) 5551 { 5552 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5553 unsigned int n; 5554 5555 for (n = 0; n < ve->num_siblings; n++) 5556 intel_engine_pm_get(ve->siblings[n]); 5557 5558 intel_timeline_enter(ce->timeline); 5559 } 5560 5561 static void virtual_context_exit(struct intel_context *ce) 5562 { 5563 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5564 unsigned int n; 5565 5566 intel_timeline_exit(ce->timeline); 5567 5568 for (n = 0; n < ve->num_siblings; n++) 5569 intel_engine_pm_put(ve->siblings[n]); 5570 } 5571 5572 static const struct intel_context_ops virtual_context_ops = { 5573 .alloc = virtual_context_alloc, 5574 5575 .pre_pin = execlists_context_pre_pin, 5576 .pin = virtual_context_pin, 5577 .unpin = execlists_context_unpin, 5578 .post_unpin = execlists_context_post_unpin, 5579 5580 .enter = virtual_context_enter, 5581 .exit = virtual_context_exit, 5582 5583 .destroy = virtual_context_destroy, 5584 }; 5585 5586 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve) 5587 { 5588 struct i915_request *rq; 5589 intel_engine_mask_t mask; 5590 5591 rq = READ_ONCE(ve->request); 5592 if (!rq) 5593 return 0; 5594 5595 /* The rq is ready for submission; rq->execution_mask is now stable. 
static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
{
	struct i915_request *rq;
	intel_engine_mask_t mask;

	rq = READ_ONCE(ve->request);
	if (!rq)
		return 0;

	/* The rq is ready for submission; rq->execution_mask is now stable. */
	mask = rq->execution_mask;
	if (unlikely(!mask)) {
		/* Invalid selection, submit to an arbitrary engine in error */
		i915_request_set_error_once(rq, -ENODEV);
		mask = ve->siblings[0]->mask;
	}

	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
		     rq->fence.context, rq->fence.seqno,
		     mask, ve->base.execlists.queue_priority_hint);

	return mask;
}

static void virtual_submission_tasklet(unsigned long data)
{
	struct virtual_engine * const ve = (struct virtual_engine *)data;
	const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
	intel_engine_mask_t mask;
	unsigned int n;

	rcu_read_lock();
	mask = virtual_submission_mask(ve);
	rcu_read_unlock();
	if (unlikely(!mask))
		return;

	local_irq_disable();
	for (n = 0; n < ve->num_siblings; n++) {
		struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
		struct ve_node * const node = &ve->nodes[sibling->id];
		struct rb_node **parent, *rb;
		bool first;

		if (!READ_ONCE(ve->request))
			break; /* already handled by a sibling's tasklet */

		if (unlikely(!(mask & sibling->mask))) {
			if (!RB_EMPTY_NODE(&node->rb)) {
				spin_lock(&sibling->active.lock);
				rb_erase_cached(&node->rb,
						&sibling->execlists.virtual);
				RB_CLEAR_NODE(&node->rb);
				spin_unlock(&sibling->active.lock);
			}
			continue;
		}

		spin_lock(&sibling->active.lock);

		if (!RB_EMPTY_NODE(&node->rb)) {
			/*
			 * Cheat and avoid rebalancing the tree if we can
			 * reuse this node in situ.
			 */
			first = rb_first_cached(&sibling->execlists.virtual) ==
				&node->rb;
			if (prio == node->prio || (prio > node->prio && first))
				goto submit_engine;

			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
		}

		rb = NULL;
		first = true;
		parent = &sibling->execlists.virtual.rb_root.rb_node;
		while (*parent) {
			struct ve_node *other;

			rb = *parent;
			other = rb_entry(rb, typeof(*other), rb);
			if (prio > other->prio) {
				parent = &rb->rb_left;
			} else {
				parent = &rb->rb_right;
				first = false;
			}
		}

		rb_link_node(&node->rb, rb, parent);
		rb_insert_color_cached(&node->rb,
				       &sibling->execlists.virtual,
				       first);

submit_engine:
		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
		node->prio = prio;
		if (first && prio > sibling->execlists.queue_priority_hint)
			tasklet_hi_schedule(&sibling->execlists.tasklet);

		spin_unlock(&sibling->active.lock);
	}
	local_irq_enable();
}
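/*
 * Illustrative sketch (not compiled into the driver): the cached-rbtree
 * insertion performed per sibling in virtual_submission_tasklet() above,
 * shown in isolation. Nodes are ordered by descending priority and the
 * leftmost (highest priority) node is cached so the sibling's execlists
 * tasklet can peek at it in O(1). The example_node type is hypothetical.
 */
#if 0
struct example_node {
	struct rb_node rb;
	int prio;
};

static void example_insert(struct rb_root_cached *root,
			   struct example_node *node)
{
	struct rb_node **parent = &root->rb_root.rb_node;
	struct rb_node *rb = NULL;
	bool first = true;

	while (*parent) {
		struct example_node *other;

		rb = *parent;
		other = rb_entry(rb, typeof(*other), rb);
		if (node->prio > other->prio) {
			parent = &rb->rb_left;
		} else {
			parent = &rb->rb_right;
			first = false;
		}
	}

	rb_link_node(&node->rb, rb, parent);
	rb_insert_color_cached(&node->rb, root, first);
}
#endif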
static void virtual_submit_request(struct i915_request *rq)
{
	struct virtual_engine *ve = to_virtual_engine(rq->engine);
	struct i915_request *old;
	unsigned long flags;

	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
		     rq->fence.context,
		     rq->fence.seqno);

	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);

	spin_lock_irqsave(&ve->base.active.lock, flags);

	old = ve->request;
	if (old) { /* background completion event from preempt-to-busy */
		GEM_BUG_ON(!i915_request_completed(old));
		__i915_request_submit(old);
		i915_request_put(old);
	}

	if (i915_request_completed(rq)) {
		__i915_request_submit(rq);

		ve->base.execlists.queue_priority_hint = INT_MIN;
		ve->request = NULL;
	} else {
		ve->base.execlists.queue_priority_hint = rq_prio(rq);
		ve->request = i915_request_get(rq);

		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
		list_move_tail(&rq->sched.link, virtual_queue(ve));

		tasklet_hi_schedule(&ve->base.execlists.tasklet);
	}

	spin_unlock_irqrestore(&ve->base.active.lock, flags);
}

static struct ve_bond *
virtual_find_bond(struct virtual_engine *ve,
		  const struct intel_engine_cs *master)
{
	int i;

	for (i = 0; i < ve->num_bonds; i++) {
		if (ve->bonds[i].master == master)
			return &ve->bonds[i];
	}

	return NULL;
}

static void
virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
{
	struct virtual_engine *ve = to_virtual_engine(rq->engine);
	intel_engine_mask_t allowed, exec;
	struct ve_bond *bond;

	allowed = ~to_request(signal)->engine->mask;

	bond = virtual_find_bond(ve, to_request(signal)->engine);
	if (bond)
		allowed &= bond->sibling_mask;

	/* Restrict the bonded request to run on only the available engines */
	exec = READ_ONCE(rq->execution_mask);
	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
		;

	/* Prevent the master from being re-run on the bonded engines */
	to_request(signal)->execution_mask &= ~allowed;
}
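/*
 * Worked example for virtual_bond_execute() above (engine names are
 * illustrative): with a virtual engine spanning {vcs0, vcs1} and a bond
 * attached as (master = rcs0, sibling_mask = vcs1->mask), a signal from
 * rcs0 gives
 *
 *	allowed = ~rcs0->mask & vcs1->mask = vcs1->mask
 *
 * so the bonded request may only run on vcs1, while the engines in 'allowed'
 * (here vcs1) are removed from the master request's own execution_mask to
 * keep master and bonded work on separate engines.
 */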
struct intel_context *
intel_execlists_create_virtual(struct intel_engine_cs **siblings,
			       unsigned int count)
{
	struct virtual_engine *ve;
	unsigned int n;
	int err;

	if (count == 0)
		return ERR_PTR(-EINVAL);

	if (count == 1)
		return intel_context_create(siblings[0]);

	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
	if (!ve)
		return ERR_PTR(-ENOMEM);

	ve->base.i915 = siblings[0]->i915;
	ve->base.gt = siblings[0]->gt;
	ve->base.uncore = siblings[0]->uncore;
	ve->base.id = -1;

	ve->base.class = OTHER_CLASS;
	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;

	/*
	 * The decision on whether to submit a request using semaphores
	 * depends on the saturated state of the engine. We only compute
	 * this during HW submission of the request, and we need this
	 * state to be globally applied to all requests being submitted
	 * to this engine. Virtual engines encompass more than one physical
	 * engine and so we cannot accurately tell in advance if one of those
	 * engines is already saturated and so cannot afford to use a semaphore
	 * and be pessimized in priority for doing so -- if we are the only
	 * context using semaphores after all other clients have stopped, we
	 * will be starved on the saturated system. Such a global switch for
	 * semaphores is less than ideal, but alas is the current compromise.
	 */
	ve->base.saturated = ALL_ENGINES;

	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");

	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
	intel_engine_init_execlists(&ve->base);

	ve->base.cops = &virtual_context_ops;
	ve->base.request_alloc = execlists_request_alloc;

	ve->base.schedule = i915_schedule;
	ve->base.submit_request = virtual_submit_request;
	ve->base.bond_execute = virtual_bond_execute;

	INIT_LIST_HEAD(virtual_queue(ve));
	ve->base.execlists.queue_priority_hint = INT_MIN;
	tasklet_init(&ve->base.execlists.tasklet,
		     virtual_submission_tasklet,
		     (unsigned long)ve);

	intel_context_init(&ve->context, &ve->base);

	ve->base.breadcrumbs = intel_breadcrumbs_create(NULL);
	if (!ve->base.breadcrumbs) {
		err = -ENOMEM;
		goto err_put;
	}

	for (n = 0; n < count; n++) {
		struct intel_engine_cs *sibling = siblings[n];

		GEM_BUG_ON(!is_power_of_2(sibling->mask));
		if (sibling->mask & ve->base.mask) {
			DRM_DEBUG("duplicate %s entry in load balancer\n",
				  sibling->name);
			err = -EINVAL;
			goto err_put;
		}

		/*
		 * The virtual engine implementation is tightly coupled to
		 * the execlists backend -- we push out requests directly
		 * into a tree inside each physical engine. We could support
		 * layering if we handle cloning of the requests and
		 * submitting a copy into each backend.
		 */
		if (sibling->execlists.tasklet.func !=
		    execlists_submission_tasklet) {
			err = -ENODEV;
			goto err_put;
		}

		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);

		ve->siblings[ve->num_siblings++] = sibling;
		ve->base.mask |= sibling->mask;

		/*
		 * All physical engines must be compatible for their emission
		 * functions (as we build the instructions during request
		 * construction and do not alter them before submission
		 * on the physical engine). We use the engine class as a guide
		 * here, although that could be refined.
		 */
		if (ve->base.class != OTHER_CLASS) {
			if (ve->base.class != sibling->class) {
				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
					  sibling->class, ve->base.class);
				err = -EINVAL;
				goto err_put;
			}
			continue;
		}

		ve->base.class = sibling->class;
		ve->base.uabi_class = sibling->uabi_class;
		snprintf(ve->base.name, sizeof(ve->base.name),
			 "v%dx%d", ve->base.class, count);
		ve->base.context_size = sibling->context_size;

		ve->base.emit_bb_start = sibling->emit_bb_start;
		ve->base.emit_flush = sibling->emit_flush;
		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
		ve->base.emit_fini_breadcrumb_dw =
			sibling->emit_fini_breadcrumb_dw;

		ve->base.flags = sibling->flags;
	}

	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;

	virtual_engine_initial_hint(ve);
	return &ve->context;

err_put:
	intel_context_put(&ve->context);
	return ERR_PTR(err);
}
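/*
 * Usage sketch (illustrative only, not a real caller; error handling elided
 * and the example function is hypothetical):
 */
#if 0
static struct intel_context *
example_make_virtual(struct intel_engine_cs *vcs0, struct intel_engine_cs *vcs1)
{
	struct intel_engine_cs *siblings[] = { vcs0, vcs1 };

	/*
	 * Returns an intel_context whose engine load-balances across the
	 * siblings (or a plain context on that engine if count == 1).
	 */
	return intel_execlists_create_virtual(siblings, ARRAY_SIZE(siblings));
}
#endif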
struct intel_context *
intel_execlists_clone_virtual(struct intel_engine_cs *src)
{
	struct virtual_engine *se = to_virtual_engine(src);
	struct intel_context *dst;

	dst = intel_execlists_create_virtual(se->siblings,
					     se->num_siblings);
	if (IS_ERR(dst))
		return dst;

	if (se->num_bonds) {
		struct virtual_engine *de = to_virtual_engine(dst->engine);

		de->bonds = kmemdup(se->bonds,
				    sizeof(*se->bonds) * se->num_bonds,
				    GFP_KERNEL);
		if (!de->bonds) {
			intel_context_put(dst);
			return ERR_PTR(-ENOMEM);
		}

		de->num_bonds = se->num_bonds;
	}

	return dst;
}

int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
				     const struct intel_engine_cs *master,
				     const struct intel_engine_cs *sibling)
{
	struct virtual_engine *ve = to_virtual_engine(engine);
	struct ve_bond *bond;
	int n;

	/* Sanity check the sibling is part of the virtual engine */
	for (n = 0; n < ve->num_siblings; n++)
		if (sibling == ve->siblings[n])
			break;
	if (n == ve->num_siblings)
		return -EINVAL;

	bond = virtual_find_bond(ve, master);
	if (bond) {
		bond->sibling_mask |= sibling->mask;
		return 0;
	}

	bond = krealloc(ve->bonds,
			sizeof(*bond) * (ve->num_bonds + 1),
			GFP_KERNEL);
	if (!bond)
		return -ENOMEM;

	bond[ve->num_bonds].master = master;
	bond[ve->num_bonds].sibling_mask = sibling->mask;

	ve->bonds = bond;
	ve->num_bonds++;

	return 0;
}

struct intel_engine_cs *
intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
				 unsigned int sibling)
{
	struct virtual_engine *ve = to_virtual_engine(engine);

	if (sibling >= ve->num_siblings)
		return NULL;

	return ve->siblings[sibling];
}
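/*
 * intel_execlists_show_requests() dumps, via @show_request, up to @max of the
 * executing ('E'), queued ('Q') and pending virtual ('V') requests on @engine,
 * along with the current priority hints, eliding the middle of each list with
 * a "...skipping..." line when it is longer than @max.
 */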
void intel_execlists_show_requests(struct intel_engine_cs *engine,
				   struct drm_printer *m,
				   void (*show_request)(struct drm_printer *m,
							struct i915_request *rq,
							const char *prefix),
				   unsigned int max)
{
	const struct intel_engine_execlists *execlists = &engine->execlists;
	struct i915_request *rq, *last;
	unsigned long flags;
	unsigned int count;
	struct rb_node *rb;

	spin_lock_irqsave(&engine->active.lock, flags);

	last = NULL;
	count = 0;
	list_for_each_entry(rq, &engine->active.requests, sched.link) {
		if (count++ < max - 1)
			show_request(m, rq, "\t\tE ");
		else
			last = rq;
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d executing requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tE ");
	}

	if (execlists->switch_priority_hint != INT_MIN)
		drm_printf(m, "\t\tSwitch priority hint: %d\n",
			   READ_ONCE(execlists->switch_priority_hint));
	if (execlists->queue_priority_hint != INT_MIN)
		drm_printf(m, "\t\tQueue priority hint: %d\n",
			   READ_ONCE(execlists->queue_priority_hint));

	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
		int i;

		priolist_for_each_request(rq, p, i) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tQ ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d queued requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tQ ");
	}

	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		struct i915_request *rq = READ_ONCE(ve->request);

		if (rq) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tV ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d virtual requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tV ");
	}

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

void intel_lr_context_reset(struct intel_engine_cs *engine,
			    struct intel_context *ce,
			    u32 head,
			    bool scrub)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	/*
	 * We want a simple context + ring to execute the breadcrumb update.
	 * We cannot rely on the context being intact across the GPU hang,
	 * so clear it and rebuild just what we need for the breadcrumb.
	 * All pending requests for this context will be zapped, and any
	 * future request will be after userspace has had the opportunity
	 * to recreate its own state.
	 */
	if (scrub)
		restore_default_state(ce, engine);

	/* Rerun the request; its payload has been neutered (if guilty). */
	__execlists_update_reg_state(ce, engine, head);
}

bool
intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
{
	return engine->set_default_submission ==
	       intel_execlists_set_default_submission;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif