/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things in the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But, what about the ringbuffer control registers (head, tail, etc..)?
 * Shouldn't we need just one set of those per engine command streamer? This
 * is where the name "Logical Rings" starts to make sense: by virtualizing
 * the rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated.
 * The local default context for each opened fd is more complex, because we
 * don't know at creation time which engine is going to use them. To handle
 * this, we have implemented a deferred creation of LR contexts:
 *
 * The local context starts its life as a hollow or blank holder that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a
 * different engine, we allocate/populate a new ringbuffer and context
 * backing object and so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they
 * are only allowed with the render ring, we can allocate & populate them
 * right away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based,
 * method). This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is
 * not updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request
 * queue for the appropriate engine: this structure contains a copy of the
 * context's tail after the request was written to the ring buffer and a
 * pointer to the context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed
 * during a context switch interrupt. In any case, elements on the queue will
 * get sent (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short)
 * with a globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch
 * interrupt. During the interrupt handling, the driver examines the events
 * in the buffer: for each context complete event, if the announced ID matches
 * that on the head of the request queue, then that request is retired and
 * removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front
 * of the queue are next to be submitted but since a context may not occur
 * twice in an execution list, if subsequent requests have the same ID as the
 * first then the two requests must be combined. This is done simply by
 * discarding requests at the head of the queue until either only one request
 * is left (in which case we use a NULL second context) or the first two
 * requests have unique IDs.
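 *
 * A minimal sketch of that pairing rule (illustrative only; queue_pop(),
 * queue_peek() and submit_elsp() are hypothetical helpers, not the driver's
 * actual routines):
 *
 *	first = queue_pop(queue);		/* becomes ELSP[0] */
 *	while ((next = queue_peek(queue)) &&
 *	       next->context == first->context)
 *		first = queue_pop(queue);	/* same context: combine */
 *	second = queue_peek(queue);		/* distinct context, or NULL */
 *	submit_elsp(first, second);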
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single
 * context completes but a second context is still executing, the request for
 * this second context will be at the head of the queue when we remove the
 * first one. This request will then be resubmitted along with a new request
 * for a different context, which will cause the hardware to continue
 * executing the second request and queue the new request (the GPU detects
 * the condition of a context getting preempted with the same context and
 * optimizes the context switch flow by not doing preemption, but just
 * sampling the new tail pointer).
 *
 */
#include <linux/interrupt.h>

#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_trace.h"
#include "i915_vgpu.h"
#include "intel_breadcrumbs.h"
#include "intel_context.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_gt_requests.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_reset.h"
#include "intel_ring.h"
#include "intel_workarounds.h"
#include "shmem_utils.h"

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)

#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
#define GEN12_CSB_SW_CTX_ID_MASK	GENMASK(25, 15)
#define GEN12_IDLE_CTX_ID		0x7FF
#define GEN12_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */

struct virtual_engine {
	struct intel_engine_cs base;
	struct intel_context context;

	/*
	 * We allow only a single request through the virtual engine at a time
	 * (each request in the timeline waits for the completion fence of
	 * the previous before being submitted). By restricting ourselves to
	 * only submitting a single request, each request is placed onto a
	 * physical engine to maximise load spreading (by virtue of the late
	 * greedy scheduling -- each real engine takes the next available
	 * request upon idling).
	 */
	struct i915_request *request;

	/*
	 * We keep an rbtree of available virtual engines inside each physical
	 * engine, sorted by priority. Here we preallocate the nodes we need
	 * for the virtual engine, indexed by physical_engine->id.
	 */
	struct ve_node {
		struct rb_node rb;
		int prio;
	} nodes[I915_NUM_ENGINES];

	/*
	 * Keep track of bonded pairs -- restrictions upon our selection of
	 * physical engines any particular request may be submitted to.
	 * If we receive a submit-fence from a master engine, we will only
	 * use one of the sibling_mask physical engines.
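	 *
	 * For example (illustrative only), a bond whose master is one video
	 * engine and whose sibling_mask names only the other video engine
	 * means that any request whose submit-fence comes from that master is
	 * placed onto the named sibling rather than load-balanced across all
	 * siblings.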
212 */ 213 struct ve_bond { 214 const struct intel_engine_cs *master; 215 intel_engine_mask_t sibling_mask; 216 } *bonds; 217 unsigned int num_bonds; 218 219 /* And finally, which physical engines this virtual engine maps onto. */ 220 unsigned int num_siblings; 221 struct intel_engine_cs *siblings[]; 222 }; 223 224 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine) 225 { 226 GEM_BUG_ON(!intel_engine_is_virtual(engine)); 227 return container_of(engine, struct virtual_engine, base); 228 } 229 230 static int __execlists_context_alloc(struct intel_context *ce, 231 struct intel_engine_cs *engine); 232 233 static void execlists_init_reg_state(u32 *reg_state, 234 const struct intel_context *ce, 235 const struct intel_engine_cs *engine, 236 const struct intel_ring *ring, 237 bool close); 238 static void 239 __execlists_update_reg_state(const struct intel_context *ce, 240 const struct intel_engine_cs *engine, 241 u32 head); 242 243 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine) 244 { 245 if (INTEL_GEN(engine->i915) >= 12) 246 return 0x60; 247 else if (INTEL_GEN(engine->i915) >= 9) 248 return 0x54; 249 else if (engine->class == RENDER_CLASS) 250 return 0x58; 251 else 252 return -1; 253 } 254 255 static int lrc_ring_gpr0(const struct intel_engine_cs *engine) 256 { 257 if (INTEL_GEN(engine->i915) >= 12) 258 return 0x74; 259 else if (INTEL_GEN(engine->i915) >= 9) 260 return 0x68; 261 else if (engine->class == RENDER_CLASS) 262 return 0xd8; 263 else 264 return -1; 265 } 266 267 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine) 268 { 269 if (INTEL_GEN(engine->i915) >= 12) 270 return 0x12; 271 else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS) 272 return 0x18; 273 else 274 return -1; 275 } 276 277 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine) 278 { 279 int x; 280 281 x = lrc_ring_wa_bb_per_ctx(engine); 282 if (x < 0) 283 return x; 284 285 return x + 2; 286 } 287 288 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine) 289 { 290 int x; 291 292 x = lrc_ring_indirect_ptr(engine); 293 if (x < 0) 294 return x; 295 296 return x + 2; 297 } 298 299 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine) 300 { 301 if (engine->class != RENDER_CLASS) 302 return -1; 303 304 if (INTEL_GEN(engine->i915) >= 12) 305 return 0xb6; 306 else if (INTEL_GEN(engine->i915) >= 11) 307 return 0xaa; 308 else 309 return -1; 310 } 311 312 static u32 313 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine) 314 { 315 switch (INTEL_GEN(engine->i915)) { 316 default: 317 MISSING_CASE(INTEL_GEN(engine->i915)); 318 fallthrough; 319 case 12: 320 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 321 case 11: 322 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 323 case 10: 324 return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 325 case 9: 326 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 327 case 8: 328 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 329 } 330 } 331 332 static void 333 lrc_ring_setup_indirect_ctx(u32 *regs, 334 const struct intel_engine_cs *engine, 335 u32 ctx_bb_ggtt_addr, 336 u32 size) 337 { 338 GEM_BUG_ON(!size); 339 GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES)); 340 GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1); 341 regs[lrc_ring_indirect_ptr(engine) + 1] = 342 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES); 343 344 GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1); 345 regs[lrc_ring_indirect_offset(engine) + 1] = 346 
lrc_ring_indirect_offset_default(engine) << 6; 347 } 348 349 static u32 intel_context_get_runtime(const struct intel_context *ce) 350 { 351 /* 352 * We can use either ppHWSP[16] which is recorded before the context 353 * switch (and so excludes the cost of context switches) or use the 354 * value from the context image itself, which is saved/restored earlier 355 * and so includes the cost of the save. 356 */ 357 return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]); 358 } 359 360 static void mark_eio(struct i915_request *rq) 361 { 362 if (i915_request_completed(rq)) 363 return; 364 365 GEM_BUG_ON(i915_request_signaled(rq)); 366 367 i915_request_set_error_once(rq, -EIO); 368 i915_request_mark_complete(rq); 369 } 370 371 static struct i915_request * 372 active_request(const struct intel_timeline * const tl, struct i915_request *rq) 373 { 374 struct i915_request *active = rq; 375 376 rcu_read_lock(); 377 list_for_each_entry_continue_reverse(rq, &tl->requests, link) { 378 if (i915_request_completed(rq)) 379 break; 380 381 active = rq; 382 } 383 rcu_read_unlock(); 384 385 return active; 386 } 387 388 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine) 389 { 390 return (i915_ggtt_offset(engine->status_page.vma) + 391 I915_GEM_HWS_PREEMPT_ADDR); 392 } 393 394 static inline void 395 ring_set_paused(const struct intel_engine_cs *engine, int state) 396 { 397 /* 398 * We inspect HWS_PREEMPT with a semaphore inside 399 * engine->emit_fini_breadcrumb. If the dword is true, 400 * the ring is paused as the semaphore will busywait 401 * until the dword is false. 402 */ 403 engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state; 404 if (state) 405 wmb(); 406 } 407 408 static inline struct i915_priolist *to_priolist(struct rb_node *rb) 409 { 410 return rb_entry(rb, struct i915_priolist, node); 411 } 412 413 static inline int rq_prio(const struct i915_request *rq) 414 { 415 return READ_ONCE(rq->sched.attr.priority); 416 } 417 418 static int effective_prio(const struct i915_request *rq) 419 { 420 int prio = rq_prio(rq); 421 422 /* 423 * If this request is special and must not be interrupted at any 424 * cost, so be it. Note we are only checking the most recent request 425 * in the context and so may be masking an earlier vip request. It 426 * is hoped that under the conditions where nopreempt is used, this 427 * will not matter (i.e. all requests to that context will be 428 * nopreempt for as long as desired). 429 */ 430 if (i915_request_has_nopreempt(rq)) 431 prio = I915_PRIORITY_UNPREEMPTABLE; 432 433 return prio; 434 } 435 436 static int queue_prio(const struct intel_engine_execlists *execlists) 437 { 438 struct i915_priolist *p; 439 struct rb_node *rb; 440 441 rb = rb_first_cached(&execlists->queue); 442 if (!rb) 443 return INT_MIN; 444 445 /* 446 * As the priolist[] are inverted, with the highest priority in [0], 447 * we have to flip the index value to become priority. 448 */ 449 p = to_priolist(rb); 450 if (!I915_USER_PRIORITY_SHIFT) 451 return p->priority; 452 453 return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used); 454 } 455 456 static inline bool need_preempt(const struct intel_engine_cs *engine, 457 const struct i915_request *rq, 458 struct rb_node *rb) 459 { 460 int last_prio; 461 462 if (!intel_engine_has_semaphores(engine)) 463 return false; 464 465 /* 466 * Check if the current priority hint merits a preemption attempt. 
467 * 468 * We record the highest value priority we saw during rescheduling 469 * prior to this dequeue, therefore we know that if it is strictly 470 * less than the current tail of ESLP[0], we do not need to force 471 * a preempt-to-idle cycle. 472 * 473 * However, the priority hint is a mere hint that we may need to 474 * preempt. If that hint is stale or we may be trying to preempt 475 * ourselves, ignore the request. 476 * 477 * More naturally we would write 478 * prio >= max(0, last); 479 * except that we wish to prevent triggering preemption at the same 480 * priority level: the task that is running should remain running 481 * to preserve FIFO ordering of dependencies. 482 */ 483 last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1); 484 if (engine->execlists.queue_priority_hint <= last_prio) 485 return false; 486 487 /* 488 * Check against the first request in ELSP[1], it will, thanks to the 489 * power of PI, be the highest priority of that context. 490 */ 491 if (!list_is_last(&rq->sched.link, &engine->active.requests) && 492 rq_prio(list_next_entry(rq, sched.link)) > last_prio) 493 return true; 494 495 if (rb) { 496 struct virtual_engine *ve = 497 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 498 bool preempt = false; 499 500 if (engine == ve->siblings[0]) { /* only preempt one sibling */ 501 struct i915_request *next; 502 503 rcu_read_lock(); 504 next = READ_ONCE(ve->request); 505 if (next) 506 preempt = rq_prio(next) > last_prio; 507 rcu_read_unlock(); 508 } 509 510 if (preempt) 511 return preempt; 512 } 513 514 /* 515 * If the inflight context did not trigger the preemption, then maybe 516 * it was the set of queued requests? Pick the highest priority in 517 * the queue (the first active priolist) and see if it deserves to be 518 * running instead of ELSP[0]. 519 * 520 * The highest priority request in the queue can not be either 521 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same 522 * context, it's priority would not exceed ELSP[0] aka last_prio. 523 */ 524 return queue_prio(&engine->execlists) > last_prio; 525 } 526 527 __maybe_unused static inline bool 528 assert_priority_queue(const struct i915_request *prev, 529 const struct i915_request *next) 530 { 531 /* 532 * Without preemption, the prev may refer to the still active element 533 * which we refuse to let go. 534 * 535 * Even with preemption, there are times when we think it is better not 536 * to preempt and leave an ostensibly lower priority request in flight. 537 */ 538 if (i915_request_is_active(prev)) 539 return true; 540 541 return rq_prio(prev) >= rq_prio(next); 542 } 543 544 /* 545 * The context descriptor encodes various attributes of a context, 546 * including its GTT address and some flags. Because it's fairly 547 * expensive to calculate, we'll just do it once and cache the result, 548 * which remains valid until the context is unpinned. 
549 * 550 * This is what a descriptor looks like, from LSB to MSB:: 551 * 552 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template) 553 * bits 12-31: LRCA, GTT address of (the HWSP of) this context 554 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC) 555 * bits 53-54: mbz, reserved for use by hardware 556 * bits 55-63: group ID, currently unused and set to 0 557 * 558 * Starting from Gen11, the upper dword of the descriptor has a new format: 559 * 560 * bits 32-36: reserved 561 * bits 37-47: SW context ID 562 * bits 48:53: engine instance 563 * bit 54: mbz, reserved for use by hardware 564 * bits 55-60: SW counter 565 * bits 61-63: engine class 566 * 567 * engine info, SW context ID and SW counter need to form a unique number 568 * (Context ID) per lrc. 569 */ 570 static u32 571 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine) 572 { 573 u32 desc; 574 575 desc = INTEL_LEGACY_32B_CONTEXT; 576 if (i915_vm_is_4lvl(ce->vm)) 577 desc = INTEL_LEGACY_64B_CONTEXT; 578 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT; 579 580 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE; 581 if (IS_GEN(engine->i915, 8)) 582 desc |= GEN8_CTX_L3LLC_COHERENT; 583 584 return i915_ggtt_offset(ce->state) | desc; 585 } 586 587 static inline unsigned int dword_in_page(void *addr) 588 { 589 return offset_in_page(addr) / sizeof(u32); 590 } 591 592 static void set_offsets(u32 *regs, 593 const u8 *data, 594 const struct intel_engine_cs *engine, 595 bool clear) 596 #define NOP(x) (BIT(7) | (x)) 597 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6))) 598 #define POSTED BIT(0) 599 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200)) 600 #define REG16(x) \ 601 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \ 602 (((x) >> 2) & 0x7f) 603 #define END(total_state_size) 0, (total_state_size) 604 { 605 const u32 base = engine->mmio_base; 606 607 while (*data) { 608 u8 count, flags; 609 610 if (*data & BIT(7)) { /* skip */ 611 count = *data++ & ~BIT(7); 612 if (clear) 613 memset32(regs, MI_NOOP, count); 614 regs += count; 615 continue; 616 } 617 618 count = *data & 0x3f; 619 flags = *data >> 6; 620 data++; 621 622 *regs = MI_LOAD_REGISTER_IMM(count); 623 if (flags & POSTED) 624 *regs |= MI_LRI_FORCE_POSTED; 625 if (INTEL_GEN(engine->i915) >= 11) 626 *regs |= MI_LRI_LRM_CS_MMIO; 627 regs++; 628 629 GEM_BUG_ON(!count); 630 do { 631 u32 offset = 0; 632 u8 v; 633 634 do { 635 v = *data++; 636 offset <<= 7; 637 offset |= v & ~BIT(7); 638 } while (v & BIT(7)); 639 640 regs[0] = base + (offset << 2); 641 if (clear) 642 regs[1] = 0; 643 regs += 2; 644 } while (--count); 645 } 646 647 if (clear) { 648 u8 count = *++data; 649 650 /* Clear past the tail for HW access */ 651 GEM_BUG_ON(dword_in_page(regs) > count); 652 memset32(regs, MI_NOOP, count - dword_in_page(regs)); 653 654 /* Close the batch; used mainly by live_lrc_layout() */ 655 *regs = MI_BATCH_BUFFER_END; 656 if (INTEL_GEN(engine->i915) >= 10) 657 *regs |= BIT(0); 658 } 659 } 660 661 static const u8 gen8_xcs_offsets[] = { 662 NOP(1), 663 LRI(11, 0), 664 REG16(0x244), 665 REG(0x034), 666 REG(0x030), 667 REG(0x038), 668 REG(0x03c), 669 REG(0x168), 670 REG(0x140), 671 REG(0x110), 672 REG(0x11c), 673 REG(0x114), 674 REG(0x118), 675 676 NOP(9), 677 LRI(9, 0), 678 REG16(0x3a8), 679 REG16(0x28c), 680 REG16(0x288), 681 REG16(0x284), 682 REG16(0x280), 683 REG16(0x27c), 684 REG16(0x278), 685 REG16(0x274), 686 REG16(0x270), 687 688 NOP(13), 689 LRI(2, 0), 690 REG16(0x200), 691 REG(0x028), 692 693 END(80) 
694 }; 695 696 static const u8 gen9_xcs_offsets[] = { 697 NOP(1), 698 LRI(14, POSTED), 699 REG16(0x244), 700 REG(0x034), 701 REG(0x030), 702 REG(0x038), 703 REG(0x03c), 704 REG(0x168), 705 REG(0x140), 706 REG(0x110), 707 REG(0x11c), 708 REG(0x114), 709 REG(0x118), 710 REG(0x1c0), 711 REG(0x1c4), 712 REG(0x1c8), 713 714 NOP(3), 715 LRI(9, POSTED), 716 REG16(0x3a8), 717 REG16(0x28c), 718 REG16(0x288), 719 REG16(0x284), 720 REG16(0x280), 721 REG16(0x27c), 722 REG16(0x278), 723 REG16(0x274), 724 REG16(0x270), 725 726 NOP(13), 727 LRI(1, POSTED), 728 REG16(0x200), 729 730 NOP(13), 731 LRI(44, POSTED), 732 REG(0x028), 733 REG(0x09c), 734 REG(0x0c0), 735 REG(0x178), 736 REG(0x17c), 737 REG16(0x358), 738 REG(0x170), 739 REG(0x150), 740 REG(0x154), 741 REG(0x158), 742 REG16(0x41c), 743 REG16(0x600), 744 REG16(0x604), 745 REG16(0x608), 746 REG16(0x60c), 747 REG16(0x610), 748 REG16(0x614), 749 REG16(0x618), 750 REG16(0x61c), 751 REG16(0x620), 752 REG16(0x624), 753 REG16(0x628), 754 REG16(0x62c), 755 REG16(0x630), 756 REG16(0x634), 757 REG16(0x638), 758 REG16(0x63c), 759 REG16(0x640), 760 REG16(0x644), 761 REG16(0x648), 762 REG16(0x64c), 763 REG16(0x650), 764 REG16(0x654), 765 REG16(0x658), 766 REG16(0x65c), 767 REG16(0x660), 768 REG16(0x664), 769 REG16(0x668), 770 REG16(0x66c), 771 REG16(0x670), 772 REG16(0x674), 773 REG16(0x678), 774 REG16(0x67c), 775 REG(0x068), 776 777 END(176) 778 }; 779 780 static const u8 gen12_xcs_offsets[] = { 781 NOP(1), 782 LRI(13, POSTED), 783 REG16(0x244), 784 REG(0x034), 785 REG(0x030), 786 REG(0x038), 787 REG(0x03c), 788 REG(0x168), 789 REG(0x140), 790 REG(0x110), 791 REG(0x1c0), 792 REG(0x1c4), 793 REG(0x1c8), 794 REG(0x180), 795 REG16(0x2b4), 796 797 NOP(5), 798 LRI(9, POSTED), 799 REG16(0x3a8), 800 REG16(0x28c), 801 REG16(0x288), 802 REG16(0x284), 803 REG16(0x280), 804 REG16(0x27c), 805 REG16(0x278), 806 REG16(0x274), 807 REG16(0x270), 808 809 END(80) 810 }; 811 812 static const u8 gen8_rcs_offsets[] = { 813 NOP(1), 814 LRI(14, POSTED), 815 REG16(0x244), 816 REG(0x034), 817 REG(0x030), 818 REG(0x038), 819 REG(0x03c), 820 REG(0x168), 821 REG(0x140), 822 REG(0x110), 823 REG(0x11c), 824 REG(0x114), 825 REG(0x118), 826 REG(0x1c0), 827 REG(0x1c4), 828 REG(0x1c8), 829 830 NOP(3), 831 LRI(9, POSTED), 832 REG16(0x3a8), 833 REG16(0x28c), 834 REG16(0x288), 835 REG16(0x284), 836 REG16(0x280), 837 REG16(0x27c), 838 REG16(0x278), 839 REG16(0x274), 840 REG16(0x270), 841 842 NOP(13), 843 LRI(1, 0), 844 REG(0x0c8), 845 846 END(80) 847 }; 848 849 static const u8 gen9_rcs_offsets[] = { 850 NOP(1), 851 LRI(14, POSTED), 852 REG16(0x244), 853 REG(0x34), 854 REG(0x30), 855 REG(0x38), 856 REG(0x3c), 857 REG(0x168), 858 REG(0x140), 859 REG(0x110), 860 REG(0x11c), 861 REG(0x114), 862 REG(0x118), 863 REG(0x1c0), 864 REG(0x1c4), 865 REG(0x1c8), 866 867 NOP(3), 868 LRI(9, POSTED), 869 REG16(0x3a8), 870 REG16(0x28c), 871 REG16(0x288), 872 REG16(0x284), 873 REG16(0x280), 874 REG16(0x27c), 875 REG16(0x278), 876 REG16(0x274), 877 REG16(0x270), 878 879 NOP(13), 880 LRI(1, 0), 881 REG(0xc8), 882 883 NOP(13), 884 LRI(44, POSTED), 885 REG(0x28), 886 REG(0x9c), 887 REG(0xc0), 888 REG(0x178), 889 REG(0x17c), 890 REG16(0x358), 891 REG(0x170), 892 REG(0x150), 893 REG(0x154), 894 REG(0x158), 895 REG16(0x41c), 896 REG16(0x600), 897 REG16(0x604), 898 REG16(0x608), 899 REG16(0x60c), 900 REG16(0x610), 901 REG16(0x614), 902 REG16(0x618), 903 REG16(0x61c), 904 REG16(0x620), 905 REG16(0x624), 906 REG16(0x628), 907 REG16(0x62c), 908 REG16(0x630), 909 REG16(0x634), 910 REG16(0x638), 911 REG16(0x63c), 912 
REG16(0x640), 913 REG16(0x644), 914 REG16(0x648), 915 REG16(0x64c), 916 REG16(0x650), 917 REG16(0x654), 918 REG16(0x658), 919 REG16(0x65c), 920 REG16(0x660), 921 REG16(0x664), 922 REG16(0x668), 923 REG16(0x66c), 924 REG16(0x670), 925 REG16(0x674), 926 REG16(0x678), 927 REG16(0x67c), 928 REG(0x68), 929 930 END(176) 931 }; 932 933 static const u8 gen11_rcs_offsets[] = { 934 NOP(1), 935 LRI(15, POSTED), 936 REG16(0x244), 937 REG(0x034), 938 REG(0x030), 939 REG(0x038), 940 REG(0x03c), 941 REG(0x168), 942 REG(0x140), 943 REG(0x110), 944 REG(0x11c), 945 REG(0x114), 946 REG(0x118), 947 REG(0x1c0), 948 REG(0x1c4), 949 REG(0x1c8), 950 REG(0x180), 951 952 NOP(1), 953 LRI(9, POSTED), 954 REG16(0x3a8), 955 REG16(0x28c), 956 REG16(0x288), 957 REG16(0x284), 958 REG16(0x280), 959 REG16(0x27c), 960 REG16(0x278), 961 REG16(0x274), 962 REG16(0x270), 963 964 LRI(1, POSTED), 965 REG(0x1b0), 966 967 NOP(10), 968 LRI(1, 0), 969 REG(0x0c8), 970 971 END(80) 972 }; 973 974 static const u8 gen12_rcs_offsets[] = { 975 NOP(1), 976 LRI(13, POSTED), 977 REG16(0x244), 978 REG(0x034), 979 REG(0x030), 980 REG(0x038), 981 REG(0x03c), 982 REG(0x168), 983 REG(0x140), 984 REG(0x110), 985 REG(0x1c0), 986 REG(0x1c4), 987 REG(0x1c8), 988 REG(0x180), 989 REG16(0x2b4), 990 991 NOP(5), 992 LRI(9, POSTED), 993 REG16(0x3a8), 994 REG16(0x28c), 995 REG16(0x288), 996 REG16(0x284), 997 REG16(0x280), 998 REG16(0x27c), 999 REG16(0x278), 1000 REG16(0x274), 1001 REG16(0x270), 1002 1003 LRI(3, POSTED), 1004 REG(0x1b0), 1005 REG16(0x5a8), 1006 REG16(0x5ac), 1007 1008 NOP(6), 1009 LRI(1, 0), 1010 REG(0x0c8), 1011 NOP(3 + 9 + 1), 1012 1013 LRI(51, POSTED), 1014 REG16(0x588), 1015 REG16(0x588), 1016 REG16(0x588), 1017 REG16(0x588), 1018 REG16(0x588), 1019 REG16(0x588), 1020 REG(0x028), 1021 REG(0x09c), 1022 REG(0x0c0), 1023 REG(0x178), 1024 REG(0x17c), 1025 REG16(0x358), 1026 REG(0x170), 1027 REG(0x150), 1028 REG(0x154), 1029 REG(0x158), 1030 REG16(0x41c), 1031 REG16(0x600), 1032 REG16(0x604), 1033 REG16(0x608), 1034 REG16(0x60c), 1035 REG16(0x610), 1036 REG16(0x614), 1037 REG16(0x618), 1038 REG16(0x61c), 1039 REG16(0x620), 1040 REG16(0x624), 1041 REG16(0x628), 1042 REG16(0x62c), 1043 REG16(0x630), 1044 REG16(0x634), 1045 REG16(0x638), 1046 REG16(0x63c), 1047 REG16(0x640), 1048 REG16(0x644), 1049 REG16(0x648), 1050 REG16(0x64c), 1051 REG16(0x650), 1052 REG16(0x654), 1053 REG16(0x658), 1054 REG16(0x65c), 1055 REG16(0x660), 1056 REG16(0x664), 1057 REG16(0x668), 1058 REG16(0x66c), 1059 REG16(0x670), 1060 REG16(0x674), 1061 REG16(0x678), 1062 REG16(0x67c), 1063 REG(0x068), 1064 REG(0x084), 1065 NOP(1), 1066 1067 END(192) 1068 }; 1069 1070 #undef END 1071 #undef REG16 1072 #undef REG 1073 #undef LRI 1074 #undef NOP 1075 1076 static const u8 *reg_offsets(const struct intel_engine_cs *engine) 1077 { 1078 /* 1079 * The gen12+ lists only have the registers we program in the basic 1080 * default state. We rely on the context image using relative 1081 * addressing to automatic fixup the register state between the 1082 * physical engines for virtual engine. 
1083 */ 1084 GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 && 1085 !intel_engine_has_relative_mmio(engine)); 1086 1087 if (engine->class == RENDER_CLASS) { 1088 if (INTEL_GEN(engine->i915) >= 12) 1089 return gen12_rcs_offsets; 1090 else if (INTEL_GEN(engine->i915) >= 11) 1091 return gen11_rcs_offsets; 1092 else if (INTEL_GEN(engine->i915) >= 9) 1093 return gen9_rcs_offsets; 1094 else 1095 return gen8_rcs_offsets; 1096 } else { 1097 if (INTEL_GEN(engine->i915) >= 12) 1098 return gen12_xcs_offsets; 1099 else if (INTEL_GEN(engine->i915) >= 9) 1100 return gen9_xcs_offsets; 1101 else 1102 return gen8_xcs_offsets; 1103 } 1104 } 1105 1106 static struct i915_request * 1107 __unwind_incomplete_requests(struct intel_engine_cs *engine) 1108 { 1109 struct i915_request *rq, *rn, *active = NULL; 1110 struct list_head *pl; 1111 int prio = I915_PRIORITY_INVALID; 1112 1113 lockdep_assert_held(&engine->active.lock); 1114 1115 list_for_each_entry_safe_reverse(rq, rn, 1116 &engine->active.requests, 1117 sched.link) { 1118 if (i915_request_completed(rq)) 1119 continue; /* XXX */ 1120 1121 __i915_request_unsubmit(rq); 1122 1123 /* 1124 * Push the request back into the queue for later resubmission. 1125 * If this request is not native to this physical engine (i.e. 1126 * it came from a virtual source), push it back onto the virtual 1127 * engine so that it can be moved across onto another physical 1128 * engine as load dictates. 1129 */ 1130 if (likely(rq->execution_mask == engine->mask)) { 1131 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID); 1132 if (rq_prio(rq) != prio) { 1133 prio = rq_prio(rq); 1134 pl = i915_sched_lookup_priolist(engine, prio); 1135 } 1136 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 1137 1138 list_move(&rq->sched.link, pl); 1139 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 1140 1141 /* Check in case we rollback so far we wrap [size/2] */ 1142 if (intel_ring_direction(rq->ring, 1143 intel_ring_wrap(rq->ring, 1144 rq->tail), 1145 rq->ring->tail) > 0) 1146 rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE; 1147 1148 active = rq; 1149 } else { 1150 struct intel_engine_cs *owner = rq->context->engine; 1151 1152 WRITE_ONCE(rq->engine, owner); 1153 owner->submit_request(rq); 1154 active = NULL; 1155 } 1156 } 1157 1158 return active; 1159 } 1160 1161 struct i915_request * 1162 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists) 1163 { 1164 struct intel_engine_cs *engine = 1165 container_of(execlists, typeof(*engine), execlists); 1166 1167 return __unwind_incomplete_requests(engine); 1168 } 1169 1170 static inline void 1171 execlists_context_status_change(struct i915_request *rq, unsigned long status) 1172 { 1173 /* 1174 * Only used when GVT-g is enabled now. When GVT-g is disabled, 1175 * The compiler should eliminate this function as dead-code. 
1176 */ 1177 if (!IS_ENABLED(CONFIG_DRM_I915_GVT)) 1178 return; 1179 1180 atomic_notifier_call_chain(&rq->engine->context_status_notifier, 1181 status, rq); 1182 } 1183 1184 static void intel_engine_context_in(struct intel_engine_cs *engine) 1185 { 1186 unsigned long flags; 1187 1188 if (atomic_add_unless(&engine->stats.active, 1, 0)) 1189 return; 1190 1191 write_seqlock_irqsave(&engine->stats.lock, flags); 1192 if (!atomic_add_unless(&engine->stats.active, 1, 0)) { 1193 engine->stats.start = ktime_get(); 1194 atomic_inc(&engine->stats.active); 1195 } 1196 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1197 } 1198 1199 static void intel_engine_context_out(struct intel_engine_cs *engine) 1200 { 1201 unsigned long flags; 1202 1203 GEM_BUG_ON(!atomic_read(&engine->stats.active)); 1204 1205 if (atomic_add_unless(&engine->stats.active, -1, 1)) 1206 return; 1207 1208 write_seqlock_irqsave(&engine->stats.lock, flags); 1209 if (atomic_dec_and_test(&engine->stats.active)) { 1210 engine->stats.total = 1211 ktime_add(engine->stats.total, 1212 ktime_sub(ktime_get(), engine->stats.start)); 1213 } 1214 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1215 } 1216 1217 static void 1218 execlists_check_context(const struct intel_context *ce, 1219 const struct intel_engine_cs *engine) 1220 { 1221 const struct intel_ring *ring = ce->ring; 1222 u32 *regs = ce->lrc_reg_state; 1223 bool valid = true; 1224 int x; 1225 1226 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) { 1227 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n", 1228 engine->name, 1229 regs[CTX_RING_START], 1230 i915_ggtt_offset(ring->vma)); 1231 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 1232 valid = false; 1233 } 1234 1235 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) != 1236 (RING_CTL_SIZE(ring->size) | RING_VALID)) { 1237 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n", 1238 engine->name, 1239 regs[CTX_RING_CTL], 1240 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID)); 1241 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 1242 valid = false; 1243 } 1244 1245 x = lrc_ring_mi_mode(engine); 1246 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) { 1247 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n", 1248 engine->name, regs[x + 1]); 1249 regs[x + 1] &= ~STOP_RING; 1250 regs[x + 1] |= STOP_RING << 16; 1251 valid = false; 1252 } 1253 1254 WARN_ONCE(!valid, "Invalid lrc state found before submission\n"); 1255 } 1256 1257 static void restore_default_state(struct intel_context *ce, 1258 struct intel_engine_cs *engine) 1259 { 1260 u32 *regs; 1261 1262 regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE); 1263 execlists_init_reg_state(regs, ce, engine, ce->ring, true); 1264 1265 ce->runtime.last = intel_context_get_runtime(ce); 1266 } 1267 1268 static void reset_active(struct i915_request *rq, 1269 struct intel_engine_cs *engine) 1270 { 1271 struct intel_context * const ce = rq->context; 1272 u32 head; 1273 1274 /* 1275 * The executing context has been cancelled. We want to prevent 1276 * further execution along this context and propagate the error on 1277 * to anything depending on its results. 1278 * 1279 * In __i915_request_submit(), we apply the -EIO and remove the 1280 * requests' payloads for any banned requests. But first, we must 1281 * rewind the context back to the start of the incomplete request so 1282 * that we do not jump back into the middle of the batch. 
1283 * 1284 * We preserve the breadcrumbs and semaphores of the incomplete 1285 * requests so that inter-timeline dependencies (i.e other timelines) 1286 * remain correctly ordered. And we defer to __i915_request_submit() 1287 * so that all asynchronous waits are correctly handled. 1288 */ 1289 ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n", 1290 rq->fence.context, rq->fence.seqno); 1291 1292 /* On resubmission of the active request, payload will be scrubbed */ 1293 if (i915_request_completed(rq)) 1294 head = rq->tail; 1295 else 1296 head = active_request(ce->timeline, rq)->head; 1297 head = intel_ring_wrap(ce->ring, head); 1298 1299 /* Scrub the context image to prevent replaying the previous batch */ 1300 restore_default_state(ce, engine); 1301 __execlists_update_reg_state(ce, engine, head); 1302 1303 /* We've switched away, so this should be a no-op, but intent matters */ 1304 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; 1305 } 1306 1307 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt) 1308 { 1309 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1310 ce->runtime.num_underflow += dt < 0; 1311 ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt); 1312 #endif 1313 } 1314 1315 static void intel_context_update_runtime(struct intel_context *ce) 1316 { 1317 u32 old; 1318 s32 dt; 1319 1320 if (intel_context_is_barrier(ce)) 1321 return; 1322 1323 old = ce->runtime.last; 1324 ce->runtime.last = intel_context_get_runtime(ce); 1325 dt = ce->runtime.last - old; 1326 1327 if (unlikely(dt <= 0)) { 1328 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n", 1329 old, ce->runtime.last, dt); 1330 st_update_runtime_underflow(ce, dt); 1331 return; 1332 } 1333 1334 ewma_runtime_add(&ce->runtime.avg, dt); 1335 ce->runtime.total += dt; 1336 } 1337 1338 static inline struct intel_engine_cs * 1339 __execlists_schedule_in(struct i915_request *rq) 1340 { 1341 struct intel_engine_cs * const engine = rq->engine; 1342 struct intel_context * const ce = rq->context; 1343 1344 intel_context_get(ce); 1345 1346 if (unlikely(intel_context_is_banned(ce))) 1347 reset_active(rq, engine); 1348 1349 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 1350 execlists_check_context(ce, engine); 1351 1352 if (ce->tag) { 1353 /* Use a fixed tag for OA and friends */ 1354 GEM_BUG_ON(ce->tag <= BITS_PER_LONG); 1355 ce->lrc.ccid = ce->tag; 1356 } else { 1357 /* We don't need a strict matching tag, just different values */ 1358 unsigned int tag = ffs(READ_ONCE(engine->context_tag)); 1359 1360 GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG); 1361 clear_bit(tag - 1, &engine->context_tag); 1362 ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32); 1363 1364 BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID); 1365 } 1366 1367 ce->lrc.ccid |= engine->execlists.ccid; 1368 1369 __intel_gt_pm_get(engine->gt); 1370 if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active)) 1371 intel_uncore_forcewake_get(engine->uncore, engine->fw_domain); 1372 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN); 1373 intel_engine_context_in(engine); 1374 1375 return engine; 1376 } 1377 1378 static inline struct i915_request * 1379 execlists_schedule_in(struct i915_request *rq, int idx) 1380 { 1381 struct intel_context * const ce = rq->context; 1382 struct intel_engine_cs *old; 1383 1384 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine)); 1385 trace_i915_request_in(rq, idx); 1386 1387 old = READ_ONCE(ce->inflight); 1388 do { 1389 if (!old) { 1390 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq)); 1391 break; 1392 } 1393 
} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old))); 1394 1395 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine); 1396 return i915_request_get(rq); 1397 } 1398 1399 static void kick_siblings(struct i915_request *rq, struct intel_context *ce) 1400 { 1401 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 1402 struct i915_request *next = READ_ONCE(ve->request); 1403 1404 if (next == rq || (next && next->execution_mask & ~rq->execution_mask)) 1405 tasklet_hi_schedule(&ve->base.execlists.tasklet); 1406 } 1407 1408 static inline void 1409 __execlists_schedule_out(struct i915_request *rq, 1410 struct intel_engine_cs * const engine, 1411 unsigned int ccid) 1412 { 1413 struct intel_context * const ce = rq->context; 1414 1415 /* 1416 * NB process_csb() is not under the engine->active.lock and hence 1417 * schedule_out can race with schedule_in meaning that we should 1418 * refrain from doing non-trivial work here. 1419 */ 1420 1421 /* 1422 * If we have just completed this context, the engine may now be 1423 * idle and we want to re-enter powersaving. 1424 */ 1425 if (list_is_last_rcu(&rq->link, &ce->timeline->requests) && 1426 i915_request_completed(rq)) 1427 intel_engine_add_retire(engine, ce->timeline); 1428 1429 ccid >>= GEN11_SW_CTX_ID_SHIFT - 32; 1430 ccid &= GEN12_MAX_CONTEXT_HW_ID; 1431 if (ccid < BITS_PER_LONG) { 1432 GEM_BUG_ON(ccid == 0); 1433 GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag)); 1434 set_bit(ccid - 1, &engine->context_tag); 1435 } 1436 1437 intel_context_update_runtime(ce); 1438 intel_engine_context_out(engine); 1439 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT); 1440 if (engine->fw_domain && !atomic_dec_return(&engine->fw_active)) 1441 intel_uncore_forcewake_put(engine->uncore, engine->fw_domain); 1442 intel_gt_pm_put_async(engine->gt); 1443 1444 /* 1445 * If this is part of a virtual engine, its next request may 1446 * have been blocked waiting for access to the active context. 1447 * We have to kick all the siblings again in case we need to 1448 * switch (e.g. the next request is not runnable on this 1449 * engine). Hopefully, we will already have submitted the next 1450 * request before the tasklet runs and do not need to rebuild 1451 * each virtual tree and kick everyone again. 1452 */ 1453 if (ce->engine != engine) 1454 kick_siblings(rq, ce); 1455 1456 intel_context_put(ce); 1457 } 1458 1459 static inline void 1460 execlists_schedule_out(struct i915_request *rq) 1461 { 1462 struct intel_context * const ce = rq->context; 1463 struct intel_engine_cs *cur, *old; 1464 u32 ccid; 1465 1466 trace_i915_request_out(rq); 1467 1468 ccid = rq->context->lrc.ccid; 1469 old = READ_ONCE(ce->inflight); 1470 do 1471 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL; 1472 while (!try_cmpxchg(&ce->inflight, &old, cur)); 1473 if (!cur) 1474 __execlists_schedule_out(rq, old, ccid); 1475 1476 i915_request_put(rq); 1477 } 1478 1479 static u64 execlists_update_context(struct i915_request *rq) 1480 { 1481 struct intel_context *ce = rq->context; 1482 u64 desc = ce->lrc.desc; 1483 u32 tail, prev; 1484 1485 /* 1486 * WaIdleLiteRestore:bdw,skl 1487 * 1488 * We should never submit the context with the same RING_TAIL twice 1489 * just in case we submit an empty ring, which confuses the HW. 1490 * 1491 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of 1492 * the normal request to be able to always advance the RING_TAIL on 1493 * subsequent resubmissions (for lite restore). 
Should that fail us, 1494 * and we try and submit the same tail again, force the context 1495 * reload. 1496 * 1497 * If we need to return to a preempted context, we need to skip the 1498 * lite-restore and force it to reload the RING_TAIL. Otherwise, the 1499 * HW has a tendency to ignore us rewinding the TAIL to the end of 1500 * an earlier request. 1501 */ 1502 GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail); 1503 prev = rq->ring->tail; 1504 tail = intel_ring_set_tail(rq->ring, rq->tail); 1505 if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0)) 1506 desc |= CTX_DESC_FORCE_RESTORE; 1507 ce->lrc_reg_state[CTX_RING_TAIL] = tail; 1508 rq->tail = rq->wa_tail; 1509 1510 /* 1511 * Make sure the context image is complete before we submit it to HW. 1512 * 1513 * Ostensibly, writes (including the WCB) should be flushed prior to 1514 * an uncached write such as our mmio register access, the empirical 1515 * evidence (esp. on Braswell) suggests that the WC write into memory 1516 * may not be visible to the HW prior to the completion of the UC 1517 * register write and that we may begin execution from the context 1518 * before its image is complete leading to invalid PD chasing. 1519 */ 1520 wmb(); 1521 1522 ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE; 1523 return desc; 1524 } 1525 1526 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port) 1527 { 1528 if (execlists->ctrl_reg) { 1529 writel(lower_32_bits(desc), execlists->submit_reg + port * 2); 1530 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1); 1531 } else { 1532 writel(upper_32_bits(desc), execlists->submit_reg); 1533 writel(lower_32_bits(desc), execlists->submit_reg); 1534 } 1535 } 1536 1537 static __maybe_unused char * 1538 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq) 1539 { 1540 if (!rq) 1541 return ""; 1542 1543 snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d", 1544 prefix, 1545 rq->context->lrc.ccid, 1546 rq->fence.context, rq->fence.seqno, 1547 i915_request_completed(rq) ? "!" : 1548 i915_request_started(rq) ? 
"*" : 1549 "", 1550 rq_prio(rq)); 1551 1552 return buf; 1553 } 1554 1555 static __maybe_unused void 1556 trace_ports(const struct intel_engine_execlists *execlists, 1557 const char *msg, 1558 struct i915_request * const *ports) 1559 { 1560 const struct intel_engine_cs *engine = 1561 container_of(execlists, typeof(*engine), execlists); 1562 char __maybe_unused p0[40], p1[40]; 1563 1564 if (!ports[0]) 1565 return; 1566 1567 ENGINE_TRACE(engine, "%s { %s%s }\n", msg, 1568 dump_port(p0, sizeof(p0), "", ports[0]), 1569 dump_port(p1, sizeof(p1), ", ", ports[1])); 1570 } 1571 1572 static inline bool 1573 reset_in_progress(const struct intel_engine_execlists *execlists) 1574 { 1575 return unlikely(!__tasklet_is_enabled(&execlists->tasklet)); 1576 } 1577 1578 static __maybe_unused bool 1579 assert_pending_valid(const struct intel_engine_execlists *execlists, 1580 const char *msg) 1581 { 1582 struct intel_engine_cs *engine = 1583 container_of(execlists, typeof(*engine), execlists); 1584 struct i915_request * const *port, *rq; 1585 struct intel_context *ce = NULL; 1586 bool sentinel = false; 1587 u32 ccid = -1; 1588 1589 trace_ports(execlists, msg, execlists->pending); 1590 1591 /* We may be messing around with the lists during reset, lalala */ 1592 if (reset_in_progress(execlists)) 1593 return true; 1594 1595 if (!execlists->pending[0]) { 1596 GEM_TRACE_ERR("%s: Nothing pending for promotion!\n", 1597 engine->name); 1598 return false; 1599 } 1600 1601 if (execlists->pending[execlists_num_ports(execlists)]) { 1602 GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n", 1603 engine->name, execlists_num_ports(execlists)); 1604 return false; 1605 } 1606 1607 for (port = execlists->pending; (rq = *port); port++) { 1608 unsigned long flags; 1609 bool ok = true; 1610 1611 GEM_BUG_ON(!kref_read(&rq->fence.refcount)); 1612 GEM_BUG_ON(!i915_request_is_active(rq)); 1613 1614 if (ce == rq->context) { 1615 GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n", 1616 engine->name, 1617 ce->timeline->fence_context, 1618 port - execlists->pending); 1619 return false; 1620 } 1621 ce = rq->context; 1622 1623 if (ccid == ce->lrc.ccid) { 1624 GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n", 1625 engine->name, 1626 ccid, ce->timeline->fence_context, 1627 port - execlists->pending); 1628 return false; 1629 } 1630 ccid = ce->lrc.ccid; 1631 1632 /* 1633 * Sentinels are supposed to be the last request so they flush 1634 * the current execution off the HW. Check that they are the only 1635 * request in the pending submission. 1636 */ 1637 if (sentinel) { 1638 GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n", 1639 engine->name, 1640 ce->timeline->fence_context, 1641 port - execlists->pending); 1642 return false; 1643 } 1644 sentinel = i915_request_has_sentinel(rq); 1645 1646 /* Hold tightly onto the lock to prevent concurrent retires! 
*/ 1647 if (!spin_trylock_irqsave(&rq->lock, flags)) 1648 continue; 1649 1650 if (i915_request_completed(rq)) 1651 goto unlock; 1652 1653 if (i915_active_is_idle(&ce->active) && 1654 !intel_context_is_barrier(ce)) { 1655 GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n", 1656 engine->name, 1657 ce->timeline->fence_context, 1658 port - execlists->pending); 1659 ok = false; 1660 goto unlock; 1661 } 1662 1663 if (!i915_vma_is_pinned(ce->state)) { 1664 GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n", 1665 engine->name, 1666 ce->timeline->fence_context, 1667 port - execlists->pending); 1668 ok = false; 1669 goto unlock; 1670 } 1671 1672 if (!i915_vma_is_pinned(ce->ring->vma)) { 1673 GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n", 1674 engine->name, 1675 ce->timeline->fence_context, 1676 port - execlists->pending); 1677 ok = false; 1678 goto unlock; 1679 } 1680 1681 unlock: 1682 spin_unlock_irqrestore(&rq->lock, flags); 1683 if (!ok) 1684 return false; 1685 } 1686 1687 return ce; 1688 } 1689 1690 static void execlists_submit_ports(struct intel_engine_cs *engine) 1691 { 1692 struct intel_engine_execlists *execlists = &engine->execlists; 1693 unsigned int n; 1694 1695 GEM_BUG_ON(!assert_pending_valid(execlists, "submit")); 1696 1697 /* 1698 * We can skip acquiring intel_runtime_pm_get() here as it was taken 1699 * on our behalf by the request (see i915_gem_mark_busy()) and it will 1700 * not be relinquished until the device is idle (see 1701 * i915_gem_idle_work_handler()). As a precaution, we make sure 1702 * that all ELSP are drained i.e. we have processed the CSB, 1703 * before allowing ourselves to idle and calling intel_runtime_pm_put(). 1704 */ 1705 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 1706 1707 /* 1708 * ELSQ note: the submit queue is not cleared after being submitted 1709 * to the HW so we need to make sure we always clean it up. This is 1710 * currently ensured by the fact that we always write the same number 1711 * of elsq entries, keep this in mind before changing the loop below. 1712 */ 1713 for (n = execlists_num_ports(execlists); n--; ) { 1714 struct i915_request *rq = execlists->pending[n]; 1715 1716 write_desc(execlists, 1717 rq ? execlists_update_context(rq) : 0, 1718 n); 1719 } 1720 1721 /* we need to manually load the submit queue */ 1722 if (execlists->ctrl_reg) 1723 writel(EL_CTRL_LOAD, execlists->ctrl_reg); 1724 } 1725 1726 static bool ctx_single_port_submission(const struct intel_context *ce) 1727 { 1728 return (IS_ENABLED(CONFIG_DRM_I915_GVT) && 1729 intel_context_force_single_submission(ce)); 1730 } 1731 1732 static bool can_merge_ctx(const struct intel_context *prev, 1733 const struct intel_context *next) 1734 { 1735 if (prev != next) 1736 return false; 1737 1738 if (ctx_single_port_submission(prev)) 1739 return false; 1740 1741 return true; 1742 } 1743 1744 static unsigned long i915_request_flags(const struct i915_request *rq) 1745 { 1746 return READ_ONCE(rq->fence.flags); 1747 } 1748 1749 static bool can_merge_rq(const struct i915_request *prev, 1750 const struct i915_request *next) 1751 { 1752 GEM_BUG_ON(prev == next); 1753 GEM_BUG_ON(!assert_priority_queue(prev, next)); 1754 1755 /* 1756 * We do not submit known completed requests. Therefore if the next 1757 * request is already completed, we can pretend to merge it in 1758 * with the previous context (and we will skip updating the ELSP 1759 * and tracking). 
Thus hopefully keeping the ELSP full with active 1760 * contexts, despite the best efforts of preempt-to-busy to confuse 1761 * us. 1762 */ 1763 if (i915_request_completed(next)) 1764 return true; 1765 1766 if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) & 1767 (BIT(I915_FENCE_FLAG_NOPREEMPT) | 1768 BIT(I915_FENCE_FLAG_SENTINEL)))) 1769 return false; 1770 1771 if (!can_merge_ctx(prev->context, next->context)) 1772 return false; 1773 1774 GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno)); 1775 return true; 1776 } 1777 1778 static void virtual_update_register_offsets(u32 *regs, 1779 struct intel_engine_cs *engine) 1780 { 1781 set_offsets(regs, reg_offsets(engine), engine, false); 1782 } 1783 1784 static bool virtual_matches(const struct virtual_engine *ve, 1785 const struct i915_request *rq, 1786 const struct intel_engine_cs *engine) 1787 { 1788 const struct intel_engine_cs *inflight; 1789 1790 if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */ 1791 return false; 1792 1793 /* 1794 * We track when the HW has completed saving the context image 1795 * (i.e. when we have seen the final CS event switching out of 1796 * the context) and must not overwrite the context image before 1797 * then. This restricts us to only using the active engine 1798 * while the previous virtualized request is inflight (so 1799 * we reuse the register offsets). This is a very small 1800 * hystersis on the greedy seelction algorithm. 1801 */ 1802 inflight = intel_context_inflight(&ve->context); 1803 if (inflight && inflight != engine) 1804 return false; 1805 1806 return true; 1807 } 1808 1809 static void virtual_xfer_context(struct virtual_engine *ve, 1810 struct intel_engine_cs *engine) 1811 { 1812 unsigned int n; 1813 1814 if (likely(engine == ve->siblings[0])) 1815 return; 1816 1817 GEM_BUG_ON(READ_ONCE(ve->context.inflight)); 1818 if (!intel_engine_has_relative_mmio(engine)) 1819 virtual_update_register_offsets(ve->context.lrc_reg_state, 1820 engine); 1821 1822 /* 1823 * Move the bound engine to the top of the list for 1824 * future execution. We then kick this tasklet first 1825 * before checking others, so that we preferentially 1826 * reuse this set of bound registers. 1827 */ 1828 for (n = 1; n < ve->num_siblings; n++) { 1829 if (ve->siblings[n] == engine) { 1830 swap(ve->siblings[n], ve->siblings[0]); 1831 break; 1832 } 1833 } 1834 } 1835 1836 #define for_each_waiter(p__, rq__) \ 1837 list_for_each_entry_lockless(p__, \ 1838 &(rq__)->sched.waiters_list, \ 1839 wait_link) 1840 1841 #define for_each_signaler(p__, rq__) \ 1842 list_for_each_entry_rcu(p__, \ 1843 &(rq__)->sched.signalers_list, \ 1844 signal_link) 1845 1846 static void defer_request(struct i915_request *rq, struct list_head * const pl) 1847 { 1848 LIST_HEAD(list); 1849 1850 /* 1851 * We want to move the interrupted request to the back of 1852 * the round-robin list (i.e. its priority level), but 1853 * in doing so, we must then move all requests that were in 1854 * flight and were waiting for the interrupted request to 1855 * be run after it again. 
1856 */ 1857 do { 1858 struct i915_dependency *p; 1859 1860 GEM_BUG_ON(i915_request_is_active(rq)); 1861 list_move_tail(&rq->sched.link, pl); 1862 1863 for_each_waiter(p, rq) { 1864 struct i915_request *w = 1865 container_of(p->waiter, typeof(*w), sched); 1866 1867 if (p->flags & I915_DEPENDENCY_WEAK) 1868 continue; 1869 1870 /* Leave semaphores spinning on the other engines */ 1871 if (w->engine != rq->engine) 1872 continue; 1873 1874 /* No waiter should start before its signaler */ 1875 GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) && 1876 i915_request_started(w) && 1877 !i915_request_completed(rq)); 1878 1879 GEM_BUG_ON(i915_request_is_active(w)); 1880 if (!i915_request_is_ready(w)) 1881 continue; 1882 1883 if (rq_prio(w) < rq_prio(rq)) 1884 continue; 1885 1886 GEM_BUG_ON(rq_prio(w) > rq_prio(rq)); 1887 list_move_tail(&w->sched.link, &list); 1888 } 1889 1890 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 1891 } while (rq); 1892 } 1893 1894 static void defer_active(struct intel_engine_cs *engine) 1895 { 1896 struct i915_request *rq; 1897 1898 rq = __unwind_incomplete_requests(engine); 1899 if (!rq) 1900 return; 1901 1902 defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq))); 1903 } 1904 1905 static bool 1906 need_timeslice(const struct intel_engine_cs *engine, 1907 const struct i915_request *rq, 1908 const struct rb_node *rb) 1909 { 1910 int hint; 1911 1912 if (!intel_engine_has_timeslices(engine)) 1913 return false; 1914 1915 hint = engine->execlists.queue_priority_hint; 1916 1917 if (rb) { 1918 const struct virtual_engine *ve = 1919 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 1920 const struct intel_engine_cs *inflight = 1921 intel_context_inflight(&ve->context); 1922 1923 if (!inflight || inflight == engine) { 1924 struct i915_request *next; 1925 1926 rcu_read_lock(); 1927 next = READ_ONCE(ve->request); 1928 if (next) 1929 hint = max(hint, rq_prio(next)); 1930 rcu_read_unlock(); 1931 } 1932 } 1933 1934 if (!list_is_last(&rq->sched.link, &engine->active.requests)) 1935 hint = max(hint, rq_prio(list_next_entry(rq, sched.link))); 1936 1937 GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE); 1938 return hint >= effective_prio(rq); 1939 } 1940 1941 static bool 1942 timeslice_yield(const struct intel_engine_execlists *el, 1943 const struct i915_request *rq) 1944 { 1945 /* 1946 * Once bitten, forever smitten! 1947 * 1948 * If the active context ever busy-waited on a semaphore, 1949 * it will be treated as a hog until the end of its timeslice (i.e. 1950 * until it is scheduled out and replaced by a new submission, 1951 * possibly even its own lite-restore). The HW only sends an interrupt 1952 * on the first miss, and we do know if that semaphore has been 1953 * signaled, or even if it is now stuck on another semaphore. Play 1954 * safe, yield if it might be stuck -- it will be given a fresh 1955 * timeslice in the near future. 
1956 */ 1957 return rq->context->lrc.ccid == READ_ONCE(el->yield); 1958 } 1959 1960 static bool 1961 timeslice_expired(const struct intel_engine_execlists *el, 1962 const struct i915_request *rq) 1963 { 1964 return timer_expired(&el->timer) || timeslice_yield(el, rq); 1965 } 1966 1967 static int 1968 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq) 1969 { 1970 if (list_is_last(&rq->sched.link, &engine->active.requests)) 1971 return engine->execlists.queue_priority_hint; 1972 1973 return rq_prio(list_next_entry(rq, sched.link)); 1974 } 1975 1976 static inline unsigned long 1977 timeslice(const struct intel_engine_cs *engine) 1978 { 1979 return READ_ONCE(engine->props.timeslice_duration_ms); 1980 } 1981 1982 static unsigned long active_timeslice(const struct intel_engine_cs *engine) 1983 { 1984 const struct intel_engine_execlists *execlists = &engine->execlists; 1985 const struct i915_request *rq = *execlists->active; 1986 1987 if (!rq || i915_request_completed(rq)) 1988 return 0; 1989 1990 if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq)) 1991 return 0; 1992 1993 return timeslice(engine); 1994 } 1995 1996 static void set_timeslice(struct intel_engine_cs *engine) 1997 { 1998 unsigned long duration; 1999 2000 if (!intel_engine_has_timeslices(engine)) 2001 return; 2002 2003 duration = active_timeslice(engine); 2004 ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration); 2005 2006 set_timer_ms(&engine->execlists.timer, duration); 2007 } 2008 2009 static void start_timeslice(struct intel_engine_cs *engine, int prio) 2010 { 2011 struct intel_engine_execlists *execlists = &engine->execlists; 2012 unsigned long duration; 2013 2014 if (!intel_engine_has_timeslices(engine)) 2015 return; 2016 2017 WRITE_ONCE(execlists->switch_priority_hint, prio); 2018 if (prio == INT_MIN) 2019 return; 2020 2021 if (timer_pending(&execlists->timer)) 2022 return; 2023 2024 duration = timeslice(engine); 2025 ENGINE_TRACE(engine, 2026 "start timeslicing, prio:%d, interval:%lu", 2027 prio, duration); 2028 2029 set_timer_ms(&execlists->timer, duration); 2030 } 2031 2032 static void record_preemption(struct intel_engine_execlists *execlists) 2033 { 2034 (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++); 2035 } 2036 2037 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine, 2038 const struct i915_request *rq) 2039 { 2040 if (!rq) 2041 return 0; 2042 2043 /* Force a fast reset for terminated contexts (ignoring sysfs!) */ 2044 if (unlikely(intel_context_is_banned(rq->context))) 2045 return 1; 2046 2047 return READ_ONCE(engine->props.preempt_timeout_ms); 2048 } 2049 2050 static void set_preempt_timeout(struct intel_engine_cs *engine, 2051 const struct i915_request *rq) 2052 { 2053 if (!intel_engine_has_preempt_reset(engine)) 2054 return; 2055 2056 set_timer_ms(&engine->execlists.preempt, 2057 active_preempt_timeout(engine, rq)); 2058 } 2059 2060 static inline void clear_ports(struct i915_request **ports, int count) 2061 { 2062 memset_p((void **)ports, NULL, count); 2063 } 2064 2065 static inline void 2066 copy_ports(struct i915_request **dst, struct i915_request **src, int count) 2067 { 2068 /* A memcpy_p() would be very useful here! 
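 *
 * (Why the loop below uses WRITE_ONCE() per element rather than a
 * plain memcpy(): the ports are read locklessly by process_csb() and
 * execlists_active(), and an ordinary copy may be torn into smaller
 * stores, letting a reader observe a half-written pointer. One aligned
 * word-sized store per port avoids that.)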
*/ 2069 while (count--) 2070 WRITE_ONCE(*dst++, *src++); /* avoid write tearing */ 2071 } 2072 2073 static void execlists_dequeue(struct intel_engine_cs *engine) 2074 { 2075 struct intel_engine_execlists * const execlists = &engine->execlists; 2076 struct i915_request **port = execlists->pending; 2077 struct i915_request ** const last_port = port + execlists->port_mask; 2078 struct i915_request * const *active; 2079 struct i915_request *last; 2080 struct rb_node *rb; 2081 bool submit = false; 2082 2083 /* 2084 * Hardware submission is through 2 ports. Conceptually each port 2085 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is 2086 * static for a context, and unique to each, so we only execute 2087 * requests belonging to a single context from each ring. RING_HEAD 2088 * is maintained by the CS in the context image, it marks the place 2089 * where it got up to last time, and through RING_TAIL we tell the CS 2090 * where we want to execute up to this time. 2091 * 2092 * In this list the requests are in order of execution. Consecutive 2093 * requests from the same context are adjacent in the ringbuffer. We 2094 * can combine these requests into a single RING_TAIL update: 2095 * 2096 * RING_HEAD...req1...req2 2097 * ^- RING_TAIL 2098 * since to execute req2 the CS must first execute req1. 2099 * 2100 * Our goal then is to point each port to the end of a consecutive 2101 * sequence of requests as being the most optimal (fewest wake ups 2102 * and context switches) submission. 2103 */ 2104 2105 for (rb = rb_first_cached(&execlists->virtual); rb; ) { 2106 struct virtual_engine *ve = 2107 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 2108 struct i915_request *rq = READ_ONCE(ve->request); 2109 2110 if (!rq) { /* lazily cleanup after another engine handled rq */ 2111 rb_erase_cached(rb, &execlists->virtual); 2112 RB_CLEAR_NODE(rb); 2113 rb = rb_first_cached(&execlists->virtual); 2114 continue; 2115 } 2116 2117 if (!virtual_matches(ve, rq, engine)) { 2118 rb = rb_next(rb); 2119 continue; 2120 } 2121 2122 break; 2123 } 2124 2125 /* 2126 * If the queue is higher priority than the last 2127 * request in the currently active context, submit afresh. 2128 * We will resubmit again afterwards in case we need to split 2129 * the active context to interject the preemption request, 2130 * i.e. we will retrigger preemption following the ack in case 2131 * of trouble. 2132 */ 2133 active = READ_ONCE(execlists->active); 2134 2135 /* 2136 * In theory we can skip over completed contexts that have not 2137 * yet been processed by events (as those events are in flight): 2138 * 2139 * while ((last = *active) && i915_request_completed(last)) 2140 * active++; 2141 * 2142 * However, the GPU cannot handle this as it will ultimately 2143 * find itself trying to jump back into a context it has just 2144 * completed and barf. 2145 */ 2146 2147 if ((last = *active)) { 2148 if (need_preempt(engine, last, rb)) { 2149 if (i915_request_completed(last)) { 2150 tasklet_hi_schedule(&execlists->tasklet); 2151 return; 2152 } 2153 2154 ENGINE_TRACE(engine, 2155 "preempting last=%llx:%lld, prio=%d, hint=%d\n", 2156 last->fence.context, 2157 last->fence.seqno, 2158 last->sched.attr.priority, 2159 execlists->queue_priority_hint); 2160 record_preemption(execlists); 2161 2162 /* 2163 * Don't let the RING_HEAD advance past the breadcrumb 2164 * as we unwind (and until we resubmit) so that we do 2165 * not accidentally tell it to go backwards. 
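 *
 * (Roughly how the pause works, as a sketch: ring_set_paused(engine, 1)
 * sets the HWS_PREEMPT dword that the MI_SEMAPHORE_WAIT emitted in each
 * request's fini-breadcrumb polls, so the CS busy-waits at the end of
 * the current request instead of running ahead while we rewind; it is
 * released again with ring_set_paused(engine, 0) once the new ELSP is
 * accepted.)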
2166 */ 2167 ring_set_paused(engine, 1); 2168 2169 /* 2170 * Note that we have not stopped the GPU at this point, 2171 * so we are unwinding the incomplete requests as they 2172 * remain inflight and so by the time we do complete 2173 * the preemption, some of the unwound requests may 2174 * complete! 2175 */ 2176 __unwind_incomplete_requests(engine); 2177 2178 last = NULL; 2179 } else if (need_timeslice(engine, last, rb) && 2180 timeslice_expired(execlists, last)) { 2181 if (i915_request_completed(last)) { 2182 tasklet_hi_schedule(&execlists->tasklet); 2183 return; 2184 } 2185 2186 ENGINE_TRACE(engine, 2187 "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n", 2188 last->fence.context, 2189 last->fence.seqno, 2190 last->sched.attr.priority, 2191 execlists->queue_priority_hint, 2192 yesno(timeslice_yield(execlists, last))); 2193 2194 ring_set_paused(engine, 1); 2195 defer_active(engine); 2196 2197 /* 2198 * Unlike for preemption, if we rewind and continue 2199 * executing the same context as previously active, 2200 * the order of execution will remain the same and 2201 * the tail will only advance. We do not need to 2202 * force a full context restore, as a lite-restore 2203 * is sufficient to resample the monotonic TAIL. 2204 * 2205 * If we switch to any other context, similarly we 2206 * will not rewind TAIL of current context, and 2207 * normal save/restore will preserve state and allow 2208 * us to later continue executing the same request. 2209 */ 2210 last = NULL; 2211 } else { 2212 /* 2213 * Otherwise if we already have a request pending 2214 * for execution after the current one, we can 2215 * just wait until the next CS event before 2216 * queuing more. In either case we will force a 2217 * lite-restore preemption event, but if we wait 2218 * we hopefully coalesce several updates into a single 2219 * submission. 2220 */ 2221 if (!list_is_last(&last->sched.link, 2222 &engine->active.requests)) { 2223 /* 2224 * Even if ELSP[1] is occupied and not worthy 2225 * of timeslices, our queue might be. 2226 */ 2227 start_timeslice(engine, queue_prio(execlists)); 2228 return; 2229 } 2230 } 2231 } 2232 2233 while (rb) { /* XXX virtual is always taking precedence */ 2234 struct virtual_engine *ve = 2235 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 2236 struct i915_request *rq; 2237 2238 spin_lock(&ve->base.active.lock); 2239 2240 rq = ve->request; 2241 if (unlikely(!rq)) { /* lost the race to a sibling */ 2242 spin_unlock(&ve->base.active.lock); 2243 rb_erase_cached(rb, &execlists->virtual); 2244 RB_CLEAR_NODE(rb); 2245 rb = rb_first_cached(&execlists->virtual); 2246 continue; 2247 } 2248 2249 GEM_BUG_ON(rq != ve->request); 2250 GEM_BUG_ON(rq->engine != &ve->base); 2251 GEM_BUG_ON(rq->context != &ve->context); 2252 2253 if (rq_prio(rq) >= queue_prio(execlists)) { 2254 if (!virtual_matches(ve, rq, engine)) { 2255 spin_unlock(&ve->base.active.lock); 2256 rb = rb_next(rb); 2257 continue; 2258 } 2259 2260 if (last && !can_merge_rq(last, rq)) { 2261 spin_unlock(&ve->base.active.lock); 2262 start_timeslice(engine, rq_prio(rq)); 2263 return; /* leave this for another sibling */ 2264 } 2265 2266 ENGINE_TRACE(engine, 2267 "virtual rq=%llx:%lld%s, new engine? %s\n", 2268 rq->fence.context, 2269 rq->fence.seqno, 2270 i915_request_completed(rq) ? "!" : 2271 i915_request_started(rq) ? 
"*" : 2272 "", 2273 yesno(engine != ve->siblings[0])); 2274 2275 WRITE_ONCE(ve->request, NULL); 2276 WRITE_ONCE(ve->base.execlists.queue_priority_hint, 2277 INT_MIN); 2278 rb_erase_cached(rb, &execlists->virtual); 2279 RB_CLEAR_NODE(rb); 2280 2281 GEM_BUG_ON(!(rq->execution_mask & engine->mask)); 2282 WRITE_ONCE(rq->engine, engine); 2283 2284 if (__i915_request_submit(rq)) { 2285 /* 2286 * Only after we confirm that we will submit 2287 * this request (i.e. it has not already 2288 * completed), do we want to update the context. 2289 * 2290 * This serves two purposes. It avoids 2291 * unnecessary work if we are resubmitting an 2292 * already completed request after timeslicing. 2293 * But more importantly, it prevents us altering 2294 * ve->siblings[] on an idle context, where 2295 * we may be using ve->siblings[] in 2296 * virtual_context_enter / virtual_context_exit. 2297 */ 2298 virtual_xfer_context(ve, engine); 2299 GEM_BUG_ON(ve->siblings[0] != engine); 2300 2301 submit = true; 2302 last = rq; 2303 } 2304 i915_request_put(rq); 2305 2306 /* 2307 * Hmm, we have a bunch of virtual engine requests, 2308 * but the first one was already completed (thanks 2309 * preempt-to-busy!). Keep looking at the veng queue 2310 * until we have no more relevant requests (i.e. 2311 * the normal submit queue has higher priority). 2312 */ 2313 if (!submit) { 2314 spin_unlock(&ve->base.active.lock); 2315 rb = rb_first_cached(&execlists->virtual); 2316 continue; 2317 } 2318 } 2319 2320 spin_unlock(&ve->base.active.lock); 2321 break; 2322 } 2323 2324 while ((rb = rb_first_cached(&execlists->queue))) { 2325 struct i915_priolist *p = to_priolist(rb); 2326 struct i915_request *rq, *rn; 2327 int i; 2328 2329 priolist_for_each_request_consume(rq, rn, p, i) { 2330 bool merge = true; 2331 2332 /* 2333 * Can we combine this request with the current port? 2334 * It has to be the same context/ringbuffer and not 2335 * have any exceptions (e.g. GVT saying never to 2336 * combine contexts). 2337 * 2338 * If we can combine the requests, we can execute both 2339 * by updating the RING_TAIL to point to the end of the 2340 * second request, and so we never need to tell the 2341 * hardware about the first. 2342 */ 2343 if (last && !can_merge_rq(last, rq)) { 2344 /* 2345 * If we are on the second port and cannot 2346 * combine this request with the last, then we 2347 * are done. 2348 */ 2349 if (port == last_port) 2350 goto done; 2351 2352 /* 2353 * We must not populate both ELSP[] with the 2354 * same LRCA, i.e. we must submit 2 different 2355 * contexts if we submit 2 ELSP. 2356 */ 2357 if (last->context == rq->context) 2358 goto done; 2359 2360 if (i915_request_has_sentinel(last)) 2361 goto done; 2362 2363 /* 2364 * If GVT overrides us we only ever submit 2365 * port[0], leaving port[1] empty. Note that we 2366 * also have to be careful that we don't queue 2367 * the same context (even though a different 2368 * request) to the second port. 
2369 */ 2370 if (ctx_single_port_submission(last->context) || 2371 ctx_single_port_submission(rq->context)) 2372 goto done; 2373 2374 merge = false; 2375 } 2376 2377 if (__i915_request_submit(rq)) { 2378 if (!merge) { 2379 *port = execlists_schedule_in(last, port - execlists->pending); 2380 port++; 2381 last = NULL; 2382 } 2383 2384 GEM_BUG_ON(last && 2385 !can_merge_ctx(last->context, 2386 rq->context)); 2387 GEM_BUG_ON(last && 2388 i915_seqno_passed(last->fence.seqno, 2389 rq->fence.seqno)); 2390 2391 submit = true; 2392 last = rq; 2393 } 2394 } 2395 2396 rb_erase_cached(&p->node, &execlists->queue); 2397 i915_priolist_free(p); 2398 } 2399 2400 done: 2401 /* 2402 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer. 2403 * 2404 * We choose the priority hint such that if we add a request of greater 2405 * priority than this, we kick the submission tasklet to decide on 2406 * the right order of submitting the requests to hardware. We must 2407 * also be prepared to reorder requests as they are in-flight on the 2408 * HW. We derive the priority hint then as the first "hole" in 2409 * the HW submission ports and if there are no available slots, 2410 * the priority of the lowest executing request, i.e. last. 2411 * 2412 * When we do receive a higher priority request ready to run from the 2413 * user, see queue_request(), the priority hint is bumped to that 2414 * request triggering preemption on the next dequeue (or subsequent 2415 * interrupt for secondary ports). 2416 */ 2417 execlists->queue_priority_hint = queue_prio(execlists); 2418 2419 if (submit) { 2420 *port = execlists_schedule_in(last, port - execlists->pending); 2421 execlists->switch_priority_hint = 2422 switch_prio(engine, *execlists->pending); 2423 2424 /* 2425 * Skip if we ended up with exactly the same set of requests, 2426 * e.g. 
trying to timeslice a pair of ordered contexts 2427 */ 2428 if (!memcmp(active, execlists->pending, 2429 (port - execlists->pending + 1) * sizeof(*port))) { 2430 do 2431 execlists_schedule_out(fetch_and_zero(port)); 2432 while (port-- != execlists->pending); 2433 2434 goto skip_submit; 2435 } 2436 clear_ports(port + 1, last_port - port); 2437 2438 WRITE_ONCE(execlists->yield, -1); 2439 set_preempt_timeout(engine, *active); 2440 execlists_submit_ports(engine); 2441 } else { 2442 start_timeslice(engine, execlists->queue_priority_hint); 2443 skip_submit: 2444 ring_set_paused(engine, 0); 2445 } 2446 } 2447 2448 static void 2449 cancel_port_requests(struct intel_engine_execlists * const execlists) 2450 { 2451 struct i915_request * const *port; 2452 2453 for (port = execlists->pending; *port; port++) 2454 execlists_schedule_out(*port); 2455 clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending)); 2456 2457 /* Mark the end of active before we overwrite *active */ 2458 for (port = xchg(&execlists->active, execlists->pending); *port; port++) 2459 execlists_schedule_out(*port); 2460 clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight)); 2461 2462 smp_wmb(); /* complete the seqlock for execlists_active() */ 2463 WRITE_ONCE(execlists->active, execlists->inflight); 2464 } 2465 2466 static inline void 2467 invalidate_csb_entries(const u64 *first, const u64 *last) 2468 { 2469 clflush((void *)first); 2470 clflush((void *)last); 2471 } 2472 2473 /* 2474 * Starting with Gen12, the status has a new format: 2475 * 2476 * bit 0: switched to new queue 2477 * bit 1: reserved 2478 * bit 2: semaphore wait mode (poll or signal), only valid when 2479 * switch detail is set to "wait on semaphore" 2480 * bits 3-5: engine class 2481 * bits 6-11: engine instance 2482 * bits 12-14: reserved 2483 * bits 15-25: sw context id of the lrc the GT switched to 2484 * bits 26-31: sw counter of the lrc the GT switched to 2485 * bits 32-35: context switch detail 2486 * - 0: ctx complete 2487 * - 1: wait on sync flip 2488 * - 2: wait on vblank 2489 * - 3: wait on scanline 2490 * - 4: wait on semaphore 2491 * - 5: context preempted (not on SEMAPHORE_WAIT or 2492 * WAIT_FOR_EVENT) 2493 * bit 36: reserved 2494 * bits 37-43: wait detail (for switch detail 1 to 4) 2495 * bits 44-46: reserved 2496 * bits 47-57: sw context id of the lrc the GT switched away from 2497 * bits 58-63: sw counter of the lrc the GT switched away from 2498 */ 2499 static inline bool gen12_csb_parse(const u64 csb) 2500 { 2501 bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(csb)); 2502 bool new_queue = 2503 lower_32_bits(csb) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; 2504 2505 /* 2506 * The context switch detail is not guaranteed to be 5 when a preemption 2507 * occurs, so we can't just check for that. The check below works for 2508 * all the cases we care about, including preemptions of WAIT 2509 * instructions and lite-restore. Preempt-to-idle via the CTRL register 2510 * would require some extra handling, but we don't support that. 2511 */ 2512 if (!ctx_away_valid || new_queue) { 2513 GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(csb))); 2514 return true; 2515 } 2516 2517 /* 2518 * switch detail = 5 is covered by the case above and we do not expect a 2519 * context switch on an unsuccessful wait instruction since we always 2520 * use polling mode. 
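 *
 * (Worked illustration of the layout above, editor's sketch: the
 * "away" context id lives in bits 47-57 of the 64b entry, so
 * GEN12_CSB_CTX_VALID(upper_32_bits(csb)) asks "did we switch away
 * from a real context?", while bit 0 of the lower dword is the
 * "switched to new queue" flag; an invalid away-context or a new-queue
 * event means the submission we just wrote was accepted, i.e. promote
 * pending[] to active[].)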
2521 */ 2522 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(csb))); 2523 return false; 2524 } 2525 2526 static inline bool gen8_csb_parse(const u64 csb) 2527 { 2528 return csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED); 2529 } 2530 2531 static noinline u64 2532 wa_csb_read(const struct intel_engine_cs *engine, u64 * const csb) 2533 { 2534 u64 entry; 2535 2536 /* 2537 * Reading from the HWSP has one particular advantage: we can detect 2538 * a stale entry. Since the write into HWSP is broken, we have no reason 2539 * to trust the HW at all, the mmio entry may equally be unordered, so 2540 * we prefer the path that is self-checking and as a last resort, 2541 * return the mmio value. 2542 * 2543 * tgl,dg1:HSDES#22011327657 2544 */ 2545 preempt_disable(); 2546 if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 10)) { 2547 int idx = csb - engine->execlists.csb_status; 2548 int status; 2549 2550 status = GEN8_EXECLISTS_STATUS_BUF; 2551 if (idx >= 6) { 2552 status = GEN11_EXECLISTS_STATUS_BUF2; 2553 idx -= 6; 2554 } 2555 status += sizeof(u64) * idx; 2556 2557 entry = intel_uncore_read64(engine->uncore, 2558 _MMIO(engine->mmio_base + status)); 2559 } 2560 preempt_enable(); 2561 2562 return entry; 2563 } 2564 2565 static inline u64 2566 csb_read(const struct intel_engine_cs *engine, u64 * const csb) 2567 { 2568 u64 entry = READ_ONCE(*csb); 2569 2570 /* 2571 * Unfortunately, the GPU does not always serialise its write 2572 * of the CSB entries before its write of the CSB pointer, at least 2573 * from the perspective of the CPU, using what is known as a Global 2574 * Observation Point. We may read a new CSB tail pointer, but then 2575 * read the stale CSB entries, causing us to misinterpret the 2576 * context-switch events, and eventually declare the GPU hung. 2577 * 2578 * icl:HSDES#1806554093 2579 * tgl:HSDES#22011248461 2580 */ 2581 if (unlikely(entry == -1)) 2582 entry = wa_csb_read(engine, csb); 2583 2584 /* Consume this entry so that we can spot its future reuse. */ 2585 WRITE_ONCE(*csb, -1); 2586 2587 /* ELSP is an implicit wmb() before the GPU wraps and overwrites csb */ 2588 return entry; 2589 } 2590 2591 static void process_csb(struct intel_engine_cs *engine) 2592 { 2593 struct intel_engine_execlists * const execlists = &engine->execlists; 2594 u64 * const buf = execlists->csb_status; 2595 const u8 num_entries = execlists->csb_size; 2596 u8 head, tail; 2597 2598 /* 2599 * As we modify our execlists state tracking we require exclusive 2600 * access. Either we are inside the tasklet, or the tasklet is disabled 2601 * and we assume that is only inside the reset paths and so serialised. 2602 */ 2603 GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) && 2604 !reset_in_progress(execlists)); 2605 GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine)); 2606 2607 /* 2608 * Note that csb_write, csb_status may be either in HWSP or mmio. 2609 * When reading from the csb_write mmio register, we have to be 2610 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is 2611 * the low 4bits. As it happens we know the next 4bits are always 2612 * zero and so we can simply masked off the low u8 of the register 2613 * and treat it identically to reading from the HWSP (without having 2614 * to use explicit shifting and masking, and probably bifurcating 2615 * the code to handle the legacy mmio read). 
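 *
 * (Concretely, as a sketch: the assignment into the u8 'tail' below is
 * what performs that masking -- tail = READ_ONCE(*execlists->csb_write)
 * keeps only the low byte, which equals GEN8_CSB_WRITE_PTR() of the
 * register value because bits 4-7 read as zero.)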
2616 */ 2617 head = execlists->csb_head; 2618 tail = READ_ONCE(*execlists->csb_write); 2619 if (unlikely(head == tail)) 2620 return; 2621 2622 /* 2623 * We will consume all events from HW, or at least pretend to. 2624 * 2625 * The sequence of events from the HW is deterministic, and derived 2626 * from our writes to the ELSP, with a smidgen of variability for 2627 * the arrival of the asynchronous requests wrt the inflight 2628 * execution. If the HW sends an event that does not correspond with 2629 * the one we are expecting, we have to abandon all hope as we lose 2630 * all tracking of what the engine is actually executing. We will 2631 * only detect we are out of sequence with the HW when we get an 2632 * 'impossible' event because we have already drained our own 2633 * preemption/promotion queue. If this occurs, we know that we likely 2634 * lost track of execution earlier and must unwind and restart; the 2635 * simplest way is to stop processing the event queue and force the 2636 * engine to reset. 2637 */ 2638 execlists->csb_head = tail; 2639 ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail); 2640 2641 /* 2642 * Hopefully paired with a wmb() in HW! 2643 * 2644 * We must complete the read of the write pointer before any reads 2645 * from the CSB, so that we do not see stale values. Without an rmb 2646 * (lfence) the HW may speculatively perform the CSB[] reads *before* 2647 * we perform the READ_ONCE(*csb_write). 2648 */ 2649 rmb(); 2650 do { 2651 bool promote; 2652 u64 csb; 2653 2654 if (++head == num_entries) 2655 head = 0; 2656 2657 /* 2658 * We are flying near dragons again. 2659 * 2660 * We hold a reference to the request in execlist_port[] 2661 * but no more than that. We are operating in softirq 2662 * context and so cannot hold any mutex or sleep. That 2663 * means we cannot prevent the requests we are processing 2664 * in port[] from being retired simultaneously (the 2665 * breadcrumb will be complete before we see the 2666 * context-switch). As we only hold the reference to the 2667 * request, any pointer chasing underneath the request 2668 * is subject to a potential use-after-free. Thus we 2669 * store all of the bookkeeping within port[] as 2670 * required, and avoid using unguarded pointers beneath 2671 * request itself. The same applies to the atomic 2672 * status notifier.
2673 */ 2674 2675 csb = csb_read(engine, buf + head); 2676 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n", 2677 head, upper_32_bits(csb), lower_32_bits(csb)); 2678 2679 if (INTEL_GEN(engine->i915) >= 12) 2680 promote = gen12_csb_parse(csb); 2681 else 2682 promote = gen8_csb_parse(csb); 2683 if (promote) { 2684 struct i915_request * const *old = execlists->active; 2685 2686 if (GEM_WARN_ON(!*execlists->pending)) { 2687 execlists->error_interrupt |= ERROR_CSB; 2688 break; 2689 } 2690 2691 ring_set_paused(engine, 0); 2692 2693 /* Point active to the new ELSP; prevent overwriting */ 2694 WRITE_ONCE(execlists->active, execlists->pending); 2695 smp_wmb(); /* notify execlists_active() */ 2696 2697 /* cancel old inflight, prepare for switch */ 2698 trace_ports(execlists, "preempted", old); 2699 while (*old) 2700 execlists_schedule_out(*old++); 2701 2702 /* switch pending to inflight */ 2703 GEM_BUG_ON(!assert_pending_valid(execlists, "promote")); 2704 copy_ports(execlists->inflight, 2705 execlists->pending, 2706 execlists_num_ports(execlists)); 2707 smp_wmb(); /* complete the seqlock */ 2708 WRITE_ONCE(execlists->active, execlists->inflight); 2709 2710 /* XXX Magic delay for tgl */ 2711 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 2712 2713 WRITE_ONCE(execlists->pending[0], NULL); 2714 } else { 2715 if (GEM_WARN_ON(!*execlists->active)) { 2716 execlists->error_interrupt |= ERROR_CSB; 2717 break; 2718 } 2719 2720 /* port0 completed, advanced to port1 */ 2721 trace_ports(execlists, "completed", execlists->active); 2722 2723 /* 2724 * We rely on the hardware being strongly 2725 * ordered, that the breadcrumb write is 2726 * coherent (visible from the CPU) before the 2727 * user interrupt is processed. One might assume 2728 * that the breadcrumb write being before the 2729 * user interrupt and the CS event for the context 2730 * switch would therefore be before the CS event 2731 * itself... 2732 */ 2733 if (GEM_SHOW_DEBUG() && 2734 !i915_request_completed(*execlists->active)) { 2735 struct i915_request *rq = *execlists->active; 2736 const u32 *regs __maybe_unused = 2737 rq->context->lrc_reg_state; 2738 2739 ENGINE_TRACE(engine, 2740 "context completed before request!\n"); 2741 ENGINE_TRACE(engine, 2742 "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n", 2743 ENGINE_READ(engine, RING_START), 2744 ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR, 2745 ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR, 2746 ENGINE_READ(engine, RING_CTL), 2747 ENGINE_READ(engine, RING_MI_MODE)); 2748 ENGINE_TRACE(engine, 2749 "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ", 2750 i915_ggtt_offset(rq->ring->vma), 2751 rq->head, rq->tail, 2752 rq->fence.context, 2753 lower_32_bits(rq->fence.seqno), 2754 hwsp_seqno(rq)); 2755 ENGINE_TRACE(engine, 2756 "ctx:{start:%08x, head:%04x, tail:%04x}, ", 2757 regs[CTX_RING_START], 2758 regs[CTX_RING_HEAD], 2759 regs[CTX_RING_TAIL]); 2760 } 2761 2762 execlists_schedule_out(*execlists->active++); 2763 2764 GEM_BUG_ON(execlists->active - execlists->inflight > 2765 execlists_num_ports(execlists)); 2766 } 2767 } while (head != tail); 2768 2769 set_timeslice(engine); 2770 2771 /* 2772 * Gen11 has proven to fail wrt global observation point between 2773 * entry and tail update, failing on the ordering and thus 2774 * we see an old entry in the context status buffer. 2775 * 2776 * Forcibly evict out entries for the next gpu csb update, 2777 * to increase the odds that we get a fresh entries with non 2778 * working hardware. 
The cost for doing so comes out mostly with 2779 * the wash as hardware, working or not, will need to do the 2780 * invalidation before. 2781 */ 2782 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]); 2783 } 2784 2785 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine) 2786 { 2787 lockdep_assert_held(&engine->active.lock); 2788 if (!READ_ONCE(engine->execlists.pending[0])) { 2789 rcu_read_lock(); /* protect peeking at execlists->active */ 2790 execlists_dequeue(engine); 2791 rcu_read_unlock(); 2792 } 2793 } 2794 2795 static void __execlists_hold(struct i915_request *rq) 2796 { 2797 LIST_HEAD(list); 2798 2799 do { 2800 struct i915_dependency *p; 2801 2802 if (i915_request_is_active(rq)) 2803 __i915_request_unsubmit(rq); 2804 2805 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2806 list_move_tail(&rq->sched.link, &rq->engine->active.hold); 2807 i915_request_set_hold(rq); 2808 RQ_TRACE(rq, "on hold\n"); 2809 2810 for_each_waiter(p, rq) { 2811 struct i915_request *w = 2812 container_of(p->waiter, typeof(*w), sched); 2813 2814 /* Leave semaphores spinning on the other engines */ 2815 if (w->engine != rq->engine) 2816 continue; 2817 2818 if (!i915_request_is_ready(w)) 2819 continue; 2820 2821 if (i915_request_completed(w)) 2822 continue; 2823 2824 if (i915_request_on_hold(w)) 2825 continue; 2826 2827 list_move_tail(&w->sched.link, &list); 2828 } 2829 2830 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2831 } while (rq); 2832 } 2833 2834 static bool execlists_hold(struct intel_engine_cs *engine, 2835 struct i915_request *rq) 2836 { 2837 spin_lock_irq(&engine->active.lock); 2838 2839 if (i915_request_completed(rq)) { /* too late! */ 2840 rq = NULL; 2841 goto unlock; 2842 } 2843 2844 if (rq->engine != engine) { /* preempted virtual engine */ 2845 struct virtual_engine *ve = to_virtual_engine(rq->engine); 2846 2847 /* 2848 * intel_context_inflight() is only protected by virtue 2849 * of process_csb() being called only by the tasklet (or 2850 * directly from inside reset while the tasklet is suspended). 2851 * Assert that neither of those are allowed to run while we 2852 * poke at the request queues. 2853 */ 2854 GEM_BUG_ON(!reset_in_progress(&engine->execlists)); 2855 2856 /* 2857 * An unsubmitted request along a virtual engine will 2858 * remain on the active (this) engine until we are able 2859 * to process the context switch away (and so mark the 2860 * context as no longer in flight). That cannot have happened 2861 * yet, otherwise we would not be hanging! 2862 */ 2863 spin_lock(&ve->base.active.lock); 2864 GEM_BUG_ON(intel_context_inflight(rq->context) != engine); 2865 GEM_BUG_ON(ve->request != rq); 2866 ve->request = NULL; 2867 spin_unlock(&ve->base.active.lock); 2868 i915_request_put(rq); 2869 2870 rq->engine = engine; 2871 } 2872 2873 /* 2874 * Transfer this request onto the hold queue to prevent it 2875 * being resumbitted to HW (and potentially completed) before we have 2876 * released it. Since we may have already submitted following 2877 * requests, we need to remove those as well. 
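 *
 * For example (illustration only): if the hung request A has an
 * already-submitted waiter B on this engine, __execlists_hold() below
 * unsubmits B as well and parks both on engine->active.hold; once the
 * capture worker has finished, execlists_unhold() returns them to the
 * priority lists for resubmission.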
2878 */ 2879 GEM_BUG_ON(i915_request_on_hold(rq)); 2880 GEM_BUG_ON(rq->engine != engine); 2881 __execlists_hold(rq); 2882 GEM_BUG_ON(list_empty(&engine->active.hold)); 2883 2884 unlock: 2885 spin_unlock_irq(&engine->active.lock); 2886 return rq; 2887 } 2888 2889 static bool hold_request(const struct i915_request *rq) 2890 { 2891 struct i915_dependency *p; 2892 bool result = false; 2893 2894 /* 2895 * If one of our ancestors is on hold, we must also be on hold, 2896 * otherwise we will bypass it and execute before it. 2897 */ 2898 rcu_read_lock(); 2899 for_each_signaler(p, rq) { 2900 const struct i915_request *s = 2901 container_of(p->signaler, typeof(*s), sched); 2902 2903 if (s->engine != rq->engine) 2904 continue; 2905 2906 result = i915_request_on_hold(s); 2907 if (result) 2908 break; 2909 } 2910 rcu_read_unlock(); 2911 2912 return result; 2913 } 2914 2915 static void __execlists_unhold(struct i915_request *rq) 2916 { 2917 LIST_HEAD(list); 2918 2919 do { 2920 struct i915_dependency *p; 2921 2922 RQ_TRACE(rq, "hold release\n"); 2923 2924 GEM_BUG_ON(!i915_request_on_hold(rq)); 2925 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit)); 2926 2927 i915_request_clear_hold(rq); 2928 list_move_tail(&rq->sched.link, 2929 i915_sched_lookup_priolist(rq->engine, 2930 rq_prio(rq))); 2931 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2932 2933 /* Also release any children on this engine that are ready */ 2934 for_each_waiter(p, rq) { 2935 struct i915_request *w = 2936 container_of(p->waiter, typeof(*w), sched); 2937 2938 /* Propagate any change in error status */ 2939 if (rq->fence.error) 2940 i915_request_set_error_once(w, rq->fence.error); 2941 2942 if (w->engine != rq->engine) 2943 continue; 2944 2945 if (!i915_request_on_hold(w)) 2946 continue; 2947 2948 /* Check that no other parents are also on hold */ 2949 if (hold_request(w)) 2950 continue; 2951 2952 list_move_tail(&w->sched.link, &list); 2953 } 2954 2955 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2956 } while (rq); 2957 } 2958 2959 static void execlists_unhold(struct intel_engine_cs *engine, 2960 struct i915_request *rq) 2961 { 2962 spin_lock_irq(&engine->active.lock); 2963 2964 /* 2965 * Move this request back to the priority queue, and all of its 2966 * children and grandchildren that were suspended along with it. 2967 */ 2968 __execlists_unhold(rq); 2969 2970 if (rq_prio(rq) > engine->execlists.queue_priority_hint) { 2971 engine->execlists.queue_priority_hint = rq_prio(rq); 2972 tasklet_hi_schedule(&engine->execlists.tasklet); 2973 } 2974 2975 spin_unlock_irq(&engine->active.lock); 2976 } 2977 2978 struct execlists_capture { 2979 struct work_struct work; 2980 struct i915_request *rq; 2981 struct i915_gpu_coredump *error; 2982 }; 2983 2984 static void execlists_capture_work(struct work_struct *work) 2985 { 2986 struct execlists_capture *cap = container_of(work, typeof(*cap), work); 2987 const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN; 2988 struct intel_engine_cs *engine = cap->rq->engine; 2989 struct intel_gt_coredump *gt = cap->error->gt; 2990 struct intel_engine_capture_vma *vma; 2991 2992 /* Compress all the objects attached to the request, slow! 
*/ 2993 vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp); 2994 if (vma) { 2995 struct i915_vma_compress *compress = 2996 i915_vma_capture_prepare(gt); 2997 2998 intel_engine_coredump_add_vma(gt->engine, vma, compress); 2999 i915_vma_capture_finish(gt, compress); 3000 } 3001 3002 gt->simulated = gt->engine->simulated; 3003 cap->error->simulated = gt->simulated; 3004 3005 /* Publish the error state, and announce it to the world */ 3006 i915_error_state_store(cap->error); 3007 i915_gpu_coredump_put(cap->error); 3008 3009 /* Return this request and all that depend upon it for signaling */ 3010 execlists_unhold(engine, cap->rq); 3011 i915_request_put(cap->rq); 3012 3013 kfree(cap); 3014 } 3015 3016 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine) 3017 { 3018 const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; 3019 struct execlists_capture *cap; 3020 3021 cap = kmalloc(sizeof(*cap), gfp); 3022 if (!cap) 3023 return NULL; 3024 3025 cap->error = i915_gpu_coredump_alloc(engine->i915, gfp); 3026 if (!cap->error) 3027 goto err_cap; 3028 3029 cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp); 3030 if (!cap->error->gt) 3031 goto err_gpu; 3032 3033 cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp); 3034 if (!cap->error->gt->engine) 3035 goto err_gt; 3036 3037 return cap; 3038 3039 err_gt: 3040 kfree(cap->error->gt); 3041 err_gpu: 3042 kfree(cap->error); 3043 err_cap: 3044 kfree(cap); 3045 return NULL; 3046 } 3047 3048 static struct i915_request * 3049 active_context(struct intel_engine_cs *engine, u32 ccid) 3050 { 3051 const struct intel_engine_execlists * const el = &engine->execlists; 3052 struct i915_request * const *port, *rq; 3053 3054 /* 3055 * Use the most recent result from process_csb(), but just in case 3056 * we trigger an error (via interrupt) before the first CS event has 3057 * been written, peek at the next submission. 3058 */ 3059 3060 for (port = el->active; (rq = *port); port++) { 3061 if (rq->context->lrc.ccid == ccid) { 3062 ENGINE_TRACE(engine, 3063 "ccid found at active:%zd\n", 3064 port - el->active); 3065 return rq; 3066 } 3067 } 3068 3069 for (port = el->pending; (rq = *port); port++) { 3070 if (rq->context->lrc.ccid == ccid) { 3071 ENGINE_TRACE(engine, 3072 "ccid found at pending:%zd\n", 3073 port - el->pending); 3074 return rq; 3075 } 3076 } 3077 3078 ENGINE_TRACE(engine, "ccid:%x not found\n", ccid); 3079 return NULL; 3080 } 3081 3082 static u32 active_ccid(struct intel_engine_cs *engine) 3083 { 3084 return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI); 3085 } 3086 3087 static void execlists_capture(struct intel_engine_cs *engine) 3088 { 3089 struct execlists_capture *cap; 3090 3091 if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)) 3092 return; 3093 3094 /* 3095 * We need to _quickly_ capture the engine state before we reset. 3096 * We are inside an atomic section (softirq) here and we are delaying 3097 * the forced preemption event. 3098 */ 3099 cap = capture_regs(engine); 3100 if (!cap) 3101 return; 3102 3103 spin_lock_irq(&engine->active.lock); 3104 cap->rq = active_context(engine, active_ccid(engine)); 3105 if (cap->rq) { 3106 cap->rq = active_request(cap->rq->context->timeline, cap->rq); 3107 cap->rq = i915_request_get_rcu(cap->rq); 3108 } 3109 spin_unlock_irq(&engine->active.lock); 3110 if (!cap->rq) 3111 goto err_free; 3112 3113 /* 3114 * Remove the request from the execlists queue, and take ownership 3115 * of the request. 
We pass it to our worker who will _slowly_ compress 3116 * all the pages the _user_ requested for debugging their batch, after 3117 * which we return it to the queue for signaling. 3118 * 3119 * By removing them from the execlists queue, we also remove the 3120 * requests from being processed by __unwind_incomplete_requests() 3121 * during the intel_engine_reset(), and so they will *not* be replayed 3122 * afterwards. 3123 * 3124 * Note that because we have not yet reset the engine at this point, 3125 * it is possible for the request that we have identified as being 3126 * guilty, did in fact complete and we will then hit an arbitration 3127 * point allowing the outstanding preemption to succeed. The likelihood 3128 * of that is very low (as capturing of the engine registers should be 3129 * fast enough to run inside an irq-off atomic section!), so we will 3130 * simply hold that request accountable for being non-preemptible 3131 * long enough to force the reset. 3132 */ 3133 if (!execlists_hold(engine, cap->rq)) 3134 goto err_rq; 3135 3136 INIT_WORK(&cap->work, execlists_capture_work); 3137 schedule_work(&cap->work); 3138 return; 3139 3140 err_rq: 3141 i915_request_put(cap->rq); 3142 err_free: 3143 i915_gpu_coredump_put(cap->error); 3144 kfree(cap); 3145 } 3146 3147 static void execlists_reset(struct intel_engine_cs *engine, const char *msg) 3148 { 3149 const unsigned int bit = I915_RESET_ENGINE + engine->id; 3150 unsigned long *lock = &engine->gt->reset.flags; 3151 3152 if (!intel_has_reset_engine(engine->gt)) 3153 return; 3154 3155 if (test_and_set_bit(bit, lock)) 3156 return; 3157 3158 ENGINE_TRACE(engine, "reset for %s\n", msg); 3159 3160 /* Mark this tasklet as disabled to avoid waiting for it to complete */ 3161 tasklet_disable_nosync(&engine->execlists.tasklet); 3162 3163 ring_set_paused(engine, 1); /* Freeze the current request in place */ 3164 execlists_capture(engine); 3165 intel_engine_reset(engine, msg); 3166 3167 tasklet_enable(&engine->execlists.tasklet); 3168 clear_and_wake_up_bit(bit, lock); 3169 } 3170 3171 static bool preempt_timeout(const struct intel_engine_cs *const engine) 3172 { 3173 const struct timer_list *t = &engine->execlists.preempt; 3174 3175 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT) 3176 return false; 3177 3178 if (!timer_expired(t)) 3179 return false; 3180 3181 return READ_ONCE(engine->execlists.pending[0]); 3182 } 3183 3184 /* 3185 * Check the unread Context Status Buffers and manage the submission of new 3186 * contexts to the ELSP accordingly. 3187 */ 3188 static void execlists_submission_tasklet(unsigned long data) 3189 { 3190 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 3191 bool timeout = preempt_timeout(engine); 3192 3193 process_csb(engine); 3194 3195 if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) { 3196 const char *msg; 3197 3198 /* Generate the error message in priority wrt to the user! 
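 * (i.e. a CS error raised by the user's own payload -- the low 16 bits
 * of error_interrupt -- is reported in preference to our internal
 * invalid-CSB tracking, so the message blames the most specific cause.)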
*/ 3199 if (engine->execlists.error_interrupt & GENMASK(15, 0)) 3200 msg = "CS error"; /* thrown by a user payload */ 3201 else if (engine->execlists.error_interrupt & ERROR_CSB) 3202 msg = "invalid CSB event"; 3203 else 3204 msg = "internal error"; 3205 3206 engine->execlists.error_interrupt = 0; 3207 execlists_reset(engine, msg); 3208 } 3209 3210 if (!READ_ONCE(engine->execlists.pending[0]) || timeout) { 3211 unsigned long flags; 3212 3213 spin_lock_irqsave(&engine->active.lock, flags); 3214 __execlists_submission_tasklet(engine); 3215 spin_unlock_irqrestore(&engine->active.lock, flags); 3216 3217 /* Recheck after serialising with direct-submission */ 3218 if (unlikely(timeout && preempt_timeout(engine))) 3219 execlists_reset(engine, "preemption time out"); 3220 } 3221 } 3222 3223 static void __execlists_kick(struct intel_engine_execlists *execlists) 3224 { 3225 /* Kick the tasklet for some interrupt coalescing and reset handling */ 3226 tasklet_hi_schedule(&execlists->tasklet); 3227 } 3228 3229 #define execlists_kick(t, member) \ 3230 __execlists_kick(container_of(t, struct intel_engine_execlists, member)) 3231 3232 static void execlists_timeslice(struct timer_list *timer) 3233 { 3234 execlists_kick(timer, timer); 3235 } 3236 3237 static void execlists_preempt(struct timer_list *timer) 3238 { 3239 execlists_kick(timer, preempt); 3240 } 3241 3242 static void queue_request(struct intel_engine_cs *engine, 3243 struct i915_request *rq) 3244 { 3245 GEM_BUG_ON(!list_empty(&rq->sched.link)); 3246 list_add_tail(&rq->sched.link, 3247 i915_sched_lookup_priolist(engine, rq_prio(rq))); 3248 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 3249 } 3250 3251 static void __submit_queue_imm(struct intel_engine_cs *engine) 3252 { 3253 struct intel_engine_execlists * const execlists = &engine->execlists; 3254 3255 if (reset_in_progress(execlists)) 3256 return; /* defer until we restart the engine following reset */ 3257 3258 __execlists_submission_tasklet(engine); 3259 } 3260 3261 static void submit_queue(struct intel_engine_cs *engine, 3262 const struct i915_request *rq) 3263 { 3264 struct intel_engine_execlists *execlists = &engine->execlists; 3265 3266 if (rq_prio(rq) <= execlists->queue_priority_hint) 3267 return; 3268 3269 execlists->queue_priority_hint = rq_prio(rq); 3270 __submit_queue_imm(engine); 3271 } 3272 3273 static bool ancestor_on_hold(const struct intel_engine_cs *engine, 3274 const struct i915_request *rq) 3275 { 3276 GEM_BUG_ON(i915_request_on_hold(rq)); 3277 return !list_empty(&engine->active.hold) && hold_request(rq); 3278 } 3279 3280 static void flush_csb(struct intel_engine_cs *engine) 3281 { 3282 struct intel_engine_execlists *el = &engine->execlists; 3283 3284 if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) { 3285 if (!reset_in_progress(el)) 3286 process_csb(engine); 3287 tasklet_unlock(&el->tasklet); 3288 } 3289 } 3290 3291 static void execlists_submit_request(struct i915_request *request) 3292 { 3293 struct intel_engine_cs *engine = request->engine; 3294 unsigned long flags; 3295 3296 /* Hopefully we clear execlists->pending[] to let us through */ 3297 flush_csb(engine); 3298 3299 /* Will be called from irq-context when using foreign fences. 
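 * (e.g. the submit fence of this request may be signalled by a foreign
 * dma-fence from hard irq context, hence the irqsave variant of the
 * lock below and no sleeping on this path -- a clarifying note, not a
 * change in behaviour.)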
*/ 3300 spin_lock_irqsave(&engine->active.lock, flags); 3301 3302 if (unlikely(ancestor_on_hold(engine, request))) { 3303 RQ_TRACE(request, "ancestor on hold\n"); 3304 list_add_tail(&request->sched.link, &engine->active.hold); 3305 i915_request_set_hold(request); 3306 } else { 3307 queue_request(engine, request); 3308 3309 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 3310 GEM_BUG_ON(list_empty(&request->sched.link)); 3311 3312 submit_queue(engine, request); 3313 } 3314 3315 spin_unlock_irqrestore(&engine->active.lock, flags); 3316 } 3317 3318 static void __execlists_context_fini(struct intel_context *ce) 3319 { 3320 intel_ring_put(ce->ring); 3321 i915_vma_put(ce->state); 3322 } 3323 3324 static void execlists_context_destroy(struct kref *kref) 3325 { 3326 struct intel_context *ce = container_of(kref, typeof(*ce), ref); 3327 3328 GEM_BUG_ON(!i915_active_is_idle(&ce->active)); 3329 GEM_BUG_ON(intel_context_is_pinned(ce)); 3330 3331 if (ce->state) 3332 __execlists_context_fini(ce); 3333 3334 intel_context_fini(ce); 3335 intel_context_free(ce); 3336 } 3337 3338 static void 3339 set_redzone(void *vaddr, const struct intel_engine_cs *engine) 3340 { 3341 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 3342 return; 3343 3344 vaddr += engine->context_size; 3345 3346 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE); 3347 } 3348 3349 static void 3350 check_redzone(const void *vaddr, const struct intel_engine_cs *engine) 3351 { 3352 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 3353 return; 3354 3355 vaddr += engine->context_size; 3356 3357 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE)) 3358 drm_err_once(&engine->i915->drm, 3359 "%s context redzone overwritten!\n", 3360 engine->name); 3361 } 3362 3363 static void execlists_context_unpin(struct intel_context *ce) 3364 { 3365 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET, 3366 ce->engine); 3367 } 3368 3369 static void execlists_context_post_unpin(struct intel_context *ce) 3370 { 3371 i915_gem_object_unpin_map(ce->state->obj); 3372 } 3373 3374 static u32 * 3375 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs) 3376 { 3377 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3378 MI_SRM_LRM_GLOBAL_GTT | 3379 MI_LRI_LRM_CS_MMIO; 3380 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3381 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3382 CTX_TIMESTAMP * sizeof(u32); 3383 *cs++ = 0; 3384 3385 *cs++ = MI_LOAD_REGISTER_REG | 3386 MI_LRR_SOURCE_CS_MMIO | 3387 MI_LRI_LRM_CS_MMIO; 3388 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3389 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 3390 3391 *cs++ = MI_LOAD_REGISTER_REG | 3392 MI_LRR_SOURCE_CS_MMIO | 3393 MI_LRI_LRM_CS_MMIO; 3394 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3395 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 3396 3397 return cs; 3398 } 3399 3400 static u32 * 3401 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs) 3402 { 3403 GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1); 3404 3405 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3406 MI_SRM_LRM_GLOBAL_GTT | 3407 MI_LRI_LRM_CS_MMIO; 3408 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3409 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3410 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32); 3411 *cs++ = 0; 3412 3413 return cs; 3414 } 3415 3416 static u32 * 3417 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs) 3418 { 3419 GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1); 3420 3421 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3422 
MI_SRM_LRM_GLOBAL_GTT | 3423 MI_LRI_LRM_CS_MMIO; 3424 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3425 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3426 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32); 3427 *cs++ = 0; 3428 3429 *cs++ = MI_LOAD_REGISTER_REG | 3430 MI_LRR_SOURCE_CS_MMIO | 3431 MI_LRI_LRM_CS_MMIO; 3432 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3433 *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0)); 3434 3435 return cs; 3436 } 3437 3438 static u32 * 3439 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs) 3440 { 3441 cs = gen12_emit_timestamp_wa(ce, cs); 3442 cs = gen12_emit_cmd_buf_wa(ce, cs); 3443 cs = gen12_emit_restore_scratch(ce, cs); 3444 3445 return cs; 3446 } 3447 3448 static u32 * 3449 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs) 3450 { 3451 cs = gen12_emit_timestamp_wa(ce, cs); 3452 cs = gen12_emit_restore_scratch(ce, cs); 3453 3454 return cs; 3455 } 3456 3457 static inline u32 context_wa_bb_offset(const struct intel_context *ce) 3458 { 3459 return PAGE_SIZE * ce->wa_bb_page; 3460 } 3461 3462 static u32 *context_indirect_bb(const struct intel_context *ce) 3463 { 3464 void *ptr; 3465 3466 GEM_BUG_ON(!ce->wa_bb_page); 3467 3468 ptr = ce->lrc_reg_state; 3469 ptr -= LRC_STATE_OFFSET; /* back to start of context image */ 3470 ptr += context_wa_bb_offset(ce); 3471 3472 return ptr; 3473 } 3474 3475 static void 3476 setup_indirect_ctx_bb(const struct intel_context *ce, 3477 const struct intel_engine_cs *engine, 3478 u32 *(*emit)(const struct intel_context *, u32 *)) 3479 { 3480 u32 * const start = context_indirect_bb(ce); 3481 u32 *cs; 3482 3483 cs = emit(ce, start); 3484 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs)); 3485 while ((unsigned long)cs % CACHELINE_BYTES) 3486 *cs++ = MI_NOOP; 3487 3488 lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine, 3489 i915_ggtt_offset(ce->state) + 3490 context_wa_bb_offset(ce), 3491 (cs - start) * sizeof(*cs)); 3492 } 3493 3494 static void 3495 __execlists_update_reg_state(const struct intel_context *ce, 3496 const struct intel_engine_cs *engine, 3497 u32 head) 3498 { 3499 struct intel_ring *ring = ce->ring; 3500 u32 *regs = ce->lrc_reg_state; 3501 3502 GEM_BUG_ON(!intel_ring_offset_valid(ring, head)); 3503 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); 3504 3505 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 3506 regs[CTX_RING_HEAD] = head; 3507 regs[CTX_RING_TAIL] = ring->tail; 3508 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 3509 3510 /* RPCS */ 3511 if (engine->class == RENDER_CLASS) { 3512 regs[CTX_R_PWR_CLK_STATE] = 3513 intel_sseu_make_rpcs(engine->gt, &ce->sseu); 3514 3515 i915_oa_init_reg_state(ce, engine); 3516 } 3517 3518 if (ce->wa_bb_page) { 3519 u32 *(*fn)(const struct intel_context *ce, u32 *cs); 3520 3521 fn = gen12_emit_indirect_ctx_xcs; 3522 if (ce->engine->class == RENDER_CLASS) 3523 fn = gen12_emit_indirect_ctx_rcs; 3524 3525 /* Mutually exclusive wrt to global indirect bb */ 3526 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size); 3527 setup_indirect_ctx_bb(ce, engine, fn); 3528 } 3529 } 3530 3531 static int 3532 execlists_context_pre_pin(struct intel_context *ce, 3533 struct i915_gem_ww_ctx *ww, void **vaddr) 3534 { 3535 GEM_BUG_ON(!ce->state); 3536 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 3537 3538 *vaddr = i915_gem_object_pin_map(ce->state->obj, 3539 i915_coherent_map_type(ce->engine->i915) | 3540 I915_MAP_OVERRIDE); 3541 3542 return PTR_ERR_OR_ZERO(*vaddr); 3543 } 3544 3545 static int 3546 
__execlists_context_pin(struct intel_context *ce, 3547 struct intel_engine_cs *engine, 3548 void *vaddr) 3549 { 3550 ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE; 3551 ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET; 3552 __execlists_update_reg_state(ce, engine, ce->ring->tail); 3553 3554 return 0; 3555 } 3556 3557 static int execlists_context_pin(struct intel_context *ce, void *vaddr) 3558 { 3559 return __execlists_context_pin(ce, ce->engine, vaddr); 3560 } 3561 3562 static int execlists_context_alloc(struct intel_context *ce) 3563 { 3564 return __execlists_context_alloc(ce, ce->engine); 3565 } 3566 3567 static void execlists_context_reset(struct intel_context *ce) 3568 { 3569 CE_TRACE(ce, "reset\n"); 3570 GEM_BUG_ON(!intel_context_is_pinned(ce)); 3571 3572 intel_ring_reset(ce->ring, ce->ring->emit); 3573 3574 /* Scrub away the garbage */ 3575 execlists_init_reg_state(ce->lrc_reg_state, 3576 ce, ce->engine, ce->ring, true); 3577 __execlists_update_reg_state(ce, ce->engine, ce->ring->tail); 3578 3579 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; 3580 } 3581 3582 static const struct intel_context_ops execlists_context_ops = { 3583 .alloc = execlists_context_alloc, 3584 3585 .pre_pin = execlists_context_pre_pin, 3586 .pin = execlists_context_pin, 3587 .unpin = execlists_context_unpin, 3588 .post_unpin = execlists_context_post_unpin, 3589 3590 .enter = intel_context_enter_engine, 3591 .exit = intel_context_exit_engine, 3592 3593 .reset = execlists_context_reset, 3594 .destroy = execlists_context_destroy, 3595 }; 3596 3597 static int gen8_emit_init_breadcrumb(struct i915_request *rq) 3598 { 3599 u32 *cs; 3600 3601 GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq)); 3602 if (!i915_request_timeline(rq)->has_initial_breadcrumb) 3603 return 0; 3604 3605 cs = intel_ring_begin(rq, 6); 3606 if (IS_ERR(cs)) 3607 return PTR_ERR(cs); 3608 3609 /* 3610 * Check if we have been preempted before we even get started. 3611 * 3612 * After this point i915_request_started() reports true, even if 3613 * we get preempted and so are no longer running. 3614 */ 3615 *cs++ = MI_ARB_CHECK; 3616 *cs++ = MI_NOOP; 3617 3618 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 3619 *cs++ = i915_request_timeline(rq)->hwsp_offset; 3620 *cs++ = 0; 3621 *cs++ = rq->fence.seqno - 1; 3622 3623 intel_ring_advance(rq, cs); 3624 3625 /* Record the updated position of the request's payload */ 3626 rq->infix = intel_ring_offset(rq, cs); 3627 3628 __set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags); 3629 3630 return 0; 3631 } 3632 3633 static int emit_pdps(struct i915_request *rq) 3634 { 3635 const struct intel_engine_cs * const engine = rq->engine; 3636 struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm); 3637 int err, i; 3638 u32 *cs; 3639 3640 GEM_BUG_ON(intel_vgpu_active(rq->engine->i915)); 3641 3642 /* 3643 * Beware ye of the dragons, this sequence is magic! 3644 * 3645 * Small changes to this sequence can cause anything from 3646 * GPU hangs to forcewake errors and machine lockups! 3647 */ 3648 3649 /* Flush any residual operations from the context load */ 3650 err = engine->emit_flush(rq, EMIT_FLUSH); 3651 if (err) 3652 return err; 3653 3654 /* Magic required to prevent forcewake errors! 
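 * (Empirically so -- see the "Beware ye of the dragons" note above: the
 * extra EMIT_INVALIDATE issued below, between the post-context-load
 * flush and the PDP LRIs, is what keeps this sequence stable.)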
*/ 3655 err = engine->emit_flush(rq, EMIT_INVALIDATE); 3656 if (err) 3657 return err; 3658 3659 cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2); 3660 if (IS_ERR(cs)) 3661 return PTR_ERR(cs); 3662 3663 /* Ensure the LRI have landed before we invalidate & continue */ 3664 *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED; 3665 for (i = GEN8_3LVL_PDPES; i--; ) { 3666 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i); 3667 u32 base = engine->mmio_base; 3668 3669 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i)); 3670 *cs++ = upper_32_bits(pd_daddr); 3671 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i)); 3672 *cs++ = lower_32_bits(pd_daddr); 3673 } 3674 *cs++ = MI_NOOP; 3675 3676 intel_ring_advance(rq, cs); 3677 3678 return 0; 3679 } 3680 3681 static int execlists_request_alloc(struct i915_request *request) 3682 { 3683 int ret; 3684 3685 GEM_BUG_ON(!intel_context_is_pinned(request->context)); 3686 3687 /* 3688 * Flush enough space to reduce the likelihood of waiting after 3689 * we start building the request - in which case we will just 3690 * have to repeat work. 3691 */ 3692 request->reserved_space += EXECLISTS_REQUEST_SIZE; 3693 3694 /* 3695 * Note that after this point, we have committed to using 3696 * this request as it is being used to both track the 3697 * state of engine initialisation and liveness of the 3698 * golden renderstate above. Think twice before you try 3699 * to cancel/unwind this request now. 3700 */ 3701 3702 if (!i915_vm_is_4lvl(request->context->vm)) { 3703 ret = emit_pdps(request); 3704 if (ret) 3705 return ret; 3706 } 3707 3708 /* Unconditionally invalidate GPU caches and TLBs. */ 3709 ret = request->engine->emit_flush(request, EMIT_INVALIDATE); 3710 if (ret) 3711 return ret; 3712 3713 request->reserved_space -= EXECLISTS_REQUEST_SIZE; 3714 return 0; 3715 } 3716 3717 /* 3718 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the 3719 * PIPE_CONTROL instruction. This is required for the flush to happen correctly, 3720 * but there is a slight complication as this is applied in a WA batch where the 3721 * values are only initialized once so we cannot take the register value at the 3722 * beginning and reuse it further; hence we save its value to memory, upload a 3723 * constant value with bit21 set and then restore it back with the saved value. 3724 * To simplify the WA, a constant value is formed by using the default value 3725 * of this register. This shouldn't be a problem because we are only modifying 3726 * it for a short period and this batch is non-preemptible. We could of course 3727 * use additional instructions that read the actual value of the register 3728 * at that time and set our bit of interest, but that makes the WA more complicated. 3729 * 3730 * This WA is also required for Gen9, so extracting it as a function avoids 3731 * code duplication. 3732 */ 3733 static u32 * 3734 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch) 3735 { 3736 /* NB no one else is allowed to scribble over scratch + 256!
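 * (The emission below is the save/flush/restore described above, as a
 * sketch: SRM GEN8_L3SQCREG4 -> scratch, LRI of the register's default
 * value with GEN8_LQSC_FLUSH_COHERENT_LINES set, PIPE_CONTROL(CS_STALL |
 * DC_FLUSH_ENABLE), then LRM scratch -> GEN8_L3SQCREG4.)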
*/ 3737 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 3738 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3739 *batch++ = intel_gt_scratch_offset(engine->gt, 3740 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 3741 *batch++ = 0; 3742 3743 *batch++ = MI_LOAD_REGISTER_IMM(1); 3744 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3745 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES; 3746 3747 batch = gen8_emit_pipe_control(batch, 3748 PIPE_CONTROL_CS_STALL | 3749 PIPE_CONTROL_DC_FLUSH_ENABLE, 3750 0); 3751 3752 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 3753 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3754 *batch++ = intel_gt_scratch_offset(engine->gt, 3755 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 3756 *batch++ = 0; 3757 3758 return batch; 3759 } 3760 3761 /* 3762 * Typically we only have one indirect_ctx and per_ctx batch buffer which are 3763 * initialized at the beginning and shared across all contexts but this field 3764 * helps us to have multiple batches at different offsets and select them based 3765 * on a criteria. At the moment this batch always start at the beginning of the page 3766 * and at this point we don't have multiple wa_ctx batch buffers. 3767 * 3768 * The number of WA applied are not known at the beginning; we use this field 3769 * to return the no of DWORDS written. 3770 * 3771 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END 3772 * so it adds NOOPs as padding to make it cacheline aligned. 3773 * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together 3774 * makes a complete batch buffer. 3775 */ 3776 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3777 { 3778 /* WaDisableCtxRestoreArbitration:bdw,chv */ 3779 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3780 3781 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ 3782 if (IS_BROADWELL(engine->i915)) 3783 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3784 3785 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ 3786 /* Actual scratch location is at 128 bytes offset */ 3787 batch = gen8_emit_pipe_control(batch, 3788 PIPE_CONTROL_FLUSH_L3 | 3789 PIPE_CONTROL_STORE_DATA_INDEX | 3790 PIPE_CONTROL_CS_STALL | 3791 PIPE_CONTROL_QW_WRITE, 3792 LRC_PPHWSP_SCRATCH_ADDR); 3793 3794 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3795 3796 /* Pad to end of cacheline */ 3797 while ((unsigned long)batch % CACHELINE_BYTES) 3798 *batch++ = MI_NOOP; 3799 3800 /* 3801 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because 3802 * execution depends on the length specified in terms of cache lines 3803 * in the register CTX_RCS_INDIRECT_CTX 3804 */ 3805 3806 return batch; 3807 } 3808 3809 struct lri { 3810 i915_reg_t reg; 3811 u32 value; 3812 }; 3813 3814 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count) 3815 { 3816 GEM_BUG_ON(!count || count > 63); 3817 3818 *batch++ = MI_LOAD_REGISTER_IMM(count); 3819 do { 3820 *batch++ = i915_mmio_reg_offset(lri->reg); 3821 *batch++ = lri->value; 3822 } while (lri++, --count); 3823 *batch++ = MI_NOOP; 3824 3825 return batch; 3826 } 3827 3828 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3829 { 3830 static const struct lri lri[] = { 3831 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */ 3832 { 3833 COMMON_SLICE_CHICKEN2, 3834 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE, 3835 0), 3836 }, 3837 3838 /* BSpec: 11391 */ 3839 { 3840 FF_SLICE_CHICKEN, 3841 
__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, 3842 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), 3843 }, 3844 3845 /* BSpec: 11299 */ 3846 { 3847 _3D_CHICKEN3, 3848 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, 3849 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), 3850 } 3851 }; 3852 3853 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3854 3855 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ 3856 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3857 3858 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */ 3859 batch = gen8_emit_pipe_control(batch, 3860 PIPE_CONTROL_FLUSH_L3 | 3861 PIPE_CONTROL_STORE_DATA_INDEX | 3862 PIPE_CONTROL_CS_STALL | 3863 PIPE_CONTROL_QW_WRITE, 3864 LRC_PPHWSP_SCRATCH_ADDR); 3865 3866 batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); 3867 3868 /* WaMediaPoolStateCmdInWABB:bxt,glk */ 3869 if (HAS_POOLED_EU(engine->i915)) { 3870 /* 3871 * EU pool configuration is setup along with golden context 3872 * during context initialization. This value depends on 3873 * device type (2x6 or 3x6) and needs to be updated based 3874 * on which subslice is disabled especially for 2x6 3875 * devices, however it is safe to load default 3876 * configuration of 3x6 device instead of masking off 3877 * corresponding bits because HW ignores bits of a disabled 3878 * subslice and drops down to appropriate config. Please 3879 * see render_state_setup() in i915_gem_render_state.c for 3880 * possible configurations, to avoid duplication they are 3881 * not shown here again. 3882 */ 3883 *batch++ = GEN9_MEDIA_POOL_STATE; 3884 *batch++ = GEN9_MEDIA_POOL_ENABLE; 3885 *batch++ = 0x00777000; 3886 *batch++ = 0; 3887 *batch++ = 0; 3888 *batch++ = 0; 3889 } 3890 3891 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3892 3893 /* Pad to end of cacheline */ 3894 while ((unsigned long)batch % CACHELINE_BYTES) 3895 *batch++ = MI_NOOP; 3896 3897 return batch; 3898 } 3899 3900 static u32 * 3901 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3902 { 3903 int i; 3904 3905 /* 3906 * WaPipeControlBefore3DStateSamplePattern: cnl 3907 * 3908 * Ensure the engine is idle prior to programming a 3909 * 3DSTATE_SAMPLE_PATTERN during a context restore. 3910 */ 3911 batch = gen8_emit_pipe_control(batch, 3912 PIPE_CONTROL_CS_STALL, 3913 0); 3914 /* 3915 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for 3916 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in 3917 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is 3918 * confusing. Since gen8_emit_pipe_control() already advances the 3919 * batch by 6 dwords, we advance the other 10 here, completing a 3920 * cacheline. It's not clear if the workaround requires this padding 3921 * before other commands, or if it's just the regular padding we would 3922 * already have for the workaround bb, so leave it here for now. 
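	 *
	 * Spelled out, using the figures from the paragraph above: the WA
	 * asks for 4 + 12 = 16 dwords in total, gen8_emit_pipe_control()
	 * has already emitted a 6-dword PIPE_CONTROL, so the loop below
	 * pads with the remaining 16 - 6 = 10 MI_NOOPs.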
3923 */ 3924 for (i = 0; i < 10; i++) 3925 *batch++ = MI_NOOP; 3926 3927 /* Pad to end of cacheline */ 3928 while ((unsigned long)batch % CACHELINE_BYTES) 3929 *batch++ = MI_NOOP; 3930 3931 return batch; 3932 } 3933 3934 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE) 3935 3936 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) 3937 { 3938 struct drm_i915_gem_object *obj; 3939 struct i915_vma *vma; 3940 int err; 3941 3942 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE); 3943 if (IS_ERR(obj)) 3944 return PTR_ERR(obj); 3945 3946 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 3947 if (IS_ERR(vma)) { 3948 err = PTR_ERR(vma); 3949 goto err; 3950 } 3951 3952 err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH); 3953 if (err) 3954 goto err; 3955 3956 engine->wa_ctx.vma = vma; 3957 return 0; 3958 3959 err: 3960 i915_gem_object_put(obj); 3961 return err; 3962 } 3963 3964 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine) 3965 { 3966 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 3967 } 3968 3969 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); 3970 3971 static int intel_init_workaround_bb(struct intel_engine_cs *engine) 3972 { 3973 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 3974 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx, 3975 &wa_ctx->per_ctx }; 3976 wa_bb_func_t wa_bb_fn[2]; 3977 void *batch, *batch_ptr; 3978 unsigned int i; 3979 int ret; 3980 3981 if (engine->class != RENDER_CLASS) 3982 return 0; 3983 3984 switch (INTEL_GEN(engine->i915)) { 3985 case 12: 3986 case 11: 3987 return 0; 3988 case 10: 3989 wa_bb_fn[0] = gen10_init_indirectctx_bb; 3990 wa_bb_fn[1] = NULL; 3991 break; 3992 case 9: 3993 wa_bb_fn[0] = gen9_init_indirectctx_bb; 3994 wa_bb_fn[1] = NULL; 3995 break; 3996 case 8: 3997 wa_bb_fn[0] = gen8_init_indirectctx_bb; 3998 wa_bb_fn[1] = NULL; 3999 break; 4000 default: 4001 MISSING_CASE(INTEL_GEN(engine->i915)); 4002 return 0; 4003 } 4004 4005 ret = lrc_setup_wa_ctx(engine); 4006 if (ret) { 4007 drm_dbg(&engine->i915->drm, 4008 "Failed to setup context WA page: %d\n", ret); 4009 return ret; 4010 } 4011 4012 batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB); 4013 4014 /* 4015 * Emit the two workaround batch buffers, recording the offset from the 4016 * start of the workaround batch buffer object for each and their 4017 * respective sizes. 4018 */ 4019 batch_ptr = batch; 4020 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { 4021 wa_bb[i]->offset = batch_ptr - batch; 4022 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, 4023 CACHELINE_BYTES))) { 4024 ret = -EINVAL; 4025 break; 4026 } 4027 if (wa_bb_fn[i]) 4028 batch_ptr = wa_bb_fn[i](engine, batch_ptr); 4029 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); 4030 } 4031 GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE); 4032 4033 __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch); 4034 __i915_gem_object_release_map(wa_ctx->vma->obj); 4035 if (ret) 4036 lrc_destroy_wa_ctx(engine); 4037 4038 return ret; 4039 } 4040 4041 static void reset_csb_pointers(struct intel_engine_cs *engine) 4042 { 4043 struct intel_engine_execlists * const execlists = &engine->execlists; 4044 const unsigned int reset_value = execlists->csb_size - 1; 4045 4046 ring_set_paused(engine, 0); 4047 4048 /* 4049 * Sometimes Icelake forgets to reset its pointers on a GPU reset. 4050 * Bludgeon them with a mmio update to be sure. 
4051 */ 4052 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, 4053 0xffff << 16 | reset_value << 8 | reset_value); 4054 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 4055 4056 /* 4057 * After a reset, the HW starts writing into CSB entry [0]. We 4058 * therefore have to set our HEAD pointer back one entry so that 4059 * the *first* entry we check is entry 0. To complicate this further, 4060 * as we don't wait for the first interrupt after reset, we have to 4061 * fake the HW write to point back to the last entry so that our 4062 * inline comparison of our cached head position against the last HW 4063 * write works even before the first interrupt. 4064 */ 4065 execlists->csb_head = reset_value; 4066 WRITE_ONCE(*execlists->csb_write, reset_value); 4067 wmb(); /* Make sure this is visible to HW (paranoia?) */ 4068 4069 /* Check that the GPU does indeed update the CSB entries! */ 4070 memset(execlists->csb_status, -1, (reset_value + 1) * sizeof(u64)); 4071 invalidate_csb_entries(&execlists->csb_status[0], 4072 &execlists->csb_status[reset_value]); 4073 4074 /* Once more for luck and our trusty paranoia */ 4075 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, 4076 0xffff << 16 | reset_value << 8 | reset_value); 4077 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 4078 4079 GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value); 4080 } 4081 4082 static void execlists_sanitize(struct intel_engine_cs *engine) 4083 { 4084 /* 4085 * Poison residual state on resume, in case the suspend didn't! 4086 * 4087 * We have to assume that across suspend/resume (or other loss 4088 * of control) that the contents of our pinned buffers has been 4089 * lost, replaced by garbage. Since this doesn't always happen, 4090 * let's poison such state so that we more quickly spot when 4091 * we falsely assume it has been preserved. 4092 */ 4093 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 4094 memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE); 4095 4096 reset_csb_pointers(engine); 4097 4098 /* 4099 * The kernel_context HWSP is stored in the status_page. As above, 4100 * that may be lost on resume/initialisation, and so we need to 4101 * reset the value in the HWSP. 4102 */ 4103 intel_timeline_reset_seqno(engine->kernel_context->timeline); 4104 4105 /* And scrub the dirty cachelines for the HWSP */ 4106 clflush_cache_range(engine->status_page.addr, PAGE_SIZE); 4107 } 4108 4109 static void enable_error_interrupt(struct intel_engine_cs *engine) 4110 { 4111 u32 status; 4112 4113 engine->execlists.error_interrupt = 0; 4114 ENGINE_WRITE(engine, RING_EMR, ~0u); 4115 ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */ 4116 4117 status = ENGINE_READ(engine, RING_ESR); 4118 if (unlikely(status)) { 4119 drm_err(&engine->i915->drm, 4120 "engine '%s' resumed still in error: %08x\n", 4121 engine->name, status); 4122 __intel_gt_reset(engine->gt, engine->mask); 4123 } 4124 4125 /* 4126 * On current gen8+, we have 2 signals to play with 4127 * 4128 * - I915_ERROR_INSTUCTION (bit 0) 4129 * 4130 * Generate an error if the command parser encounters an invalid 4131 * instruction 4132 * 4133 * This is a fatal error. 4134 * 4135 * - CP_PRIV (bit 2) 4136 * 4137 * Generate an error on privilege violation (where the CP replaces 4138 * the instruction with a no-op). This also fires for writes into 4139 * read-only scratch pages. 4140 * 4141 * This is a non-fatal error, parsing continues. 
4142 * 4143 * * there are a few others defined for odd HW that we do not use 4144 * 4145 * Since CP_PRIV fires for cases where we have chosen to ignore the 4146 * error (as the HW is validating and suppressing the mistakes), we 4147 * only unmask the instruction error bit. 4148 */ 4149 ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION); 4150 } 4151 4152 static void enable_execlists(struct intel_engine_cs *engine) 4153 { 4154 u32 mode; 4155 4156 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL); 4157 4158 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */ 4159 4160 if (INTEL_GEN(engine->i915) >= 11) 4161 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE); 4162 else 4163 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE); 4164 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode); 4165 4166 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); 4167 4168 ENGINE_WRITE_FW(engine, 4169 RING_HWS_PGA, 4170 i915_ggtt_offset(engine->status_page.vma)); 4171 ENGINE_POSTING_READ(engine, RING_HWS_PGA); 4172 4173 enable_error_interrupt(engine); 4174 4175 engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0); 4176 } 4177 4178 static bool unexpected_starting_state(struct intel_engine_cs *engine) 4179 { 4180 bool unexpected = false; 4181 4182 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) { 4183 drm_dbg(&engine->i915->drm, 4184 "STOP_RING still set in RING_MI_MODE\n"); 4185 unexpected = true; 4186 } 4187 4188 return unexpected; 4189 } 4190 4191 static int execlists_resume(struct intel_engine_cs *engine) 4192 { 4193 intel_mocs_init_engine(engine); 4194 4195 intel_breadcrumbs_reset(engine->breadcrumbs); 4196 4197 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) { 4198 struct drm_printer p = drm_debug_printer(__func__); 4199 4200 intel_engine_dump(engine, &p, NULL); 4201 } 4202 4203 enable_execlists(engine); 4204 4205 return 0; 4206 } 4207 4208 static void execlists_reset_prepare(struct intel_engine_cs *engine) 4209 { 4210 struct intel_engine_execlists * const execlists = &engine->execlists; 4211 unsigned long flags; 4212 4213 ENGINE_TRACE(engine, "depth<-%d\n", 4214 atomic_read(&execlists->tasklet.count)); 4215 4216 /* 4217 * Prevent request submission to the hardware until we have 4218 * completed the reset in i915_gem_reset_finish(). If a request 4219 * is completed by one engine, it may then queue a request 4220 * to a second via its execlists->tasklet *just* as we are 4221 * calling engine->resume() and also writing the ELSP. 4222 * Turning off the execlists->tasklet until the reset is over 4223 * prevents the race. 4224 */ 4225 __tasklet_disable_sync_once(&execlists->tasklet); 4226 GEM_BUG_ON(!reset_in_progress(execlists)); 4227 4228 /* And flush any current direct submission. */ 4229 spin_lock_irqsave(&engine->active.lock, flags); 4230 spin_unlock_irqrestore(&engine->active.lock, flags); 4231 4232 /* 4233 * We stop engines, otherwise we might get failed reset and a 4234 * dead gpu (on elk). Also as modern gpu as kbl can suffer 4235 * from system hang if batchbuffer is progressing when 4236 * the reset is issued, regardless of READY_TO_RESET ack. 4237 * Thus assume it is best to stop engines on all gens 4238 * where we have a gpu reset. 
4239 * 4240 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) 4241 * 4242 * FIXME: Wa for more modern gens needs to be validated 4243 */ 4244 ring_set_paused(engine, 1); 4245 intel_engine_stop_cs(engine); 4246 4247 engine->execlists.reset_ccid = active_ccid(engine); 4248 } 4249 4250 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine) 4251 { 4252 int x; 4253 4254 x = lrc_ring_mi_mode(engine); 4255 if (x != -1) { 4256 regs[x + 1] &= ~STOP_RING; 4257 regs[x + 1] |= STOP_RING << 16; 4258 } 4259 } 4260 4261 static void __execlists_reset_reg_state(const struct intel_context *ce, 4262 const struct intel_engine_cs *engine) 4263 { 4264 u32 *regs = ce->lrc_reg_state; 4265 4266 __reset_stop_ring(regs, engine); 4267 } 4268 4269 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) 4270 { 4271 struct intel_engine_execlists * const execlists = &engine->execlists; 4272 struct intel_context *ce; 4273 struct i915_request *rq; 4274 u32 head; 4275 4276 mb(); /* paranoia: read the CSB pointers from after the reset */ 4277 clflush(execlists->csb_write); 4278 mb(); 4279 4280 process_csb(engine); /* drain preemption events */ 4281 4282 /* Following the reset, we need to reload the CSB read/write pointers */ 4283 reset_csb_pointers(engine); 4284 4285 /* 4286 * Save the currently executing context, even if we completed 4287 * its request, it was still running at the time of the 4288 * reset and will have been clobbered. 4289 */ 4290 rq = active_context(engine, engine->execlists.reset_ccid); 4291 if (!rq) 4292 goto unwind; 4293 4294 ce = rq->context; 4295 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 4296 4297 if (i915_request_completed(rq)) { 4298 /* Idle context; tidy up the ring so we can restart afresh */ 4299 head = intel_ring_wrap(ce->ring, rq->tail); 4300 goto out_replay; 4301 } 4302 4303 /* We still have requests in-flight; the engine should be active */ 4304 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 4305 4306 /* Context has requests still in-flight; it should not be idle! */ 4307 GEM_BUG_ON(i915_active_is_idle(&ce->active)); 4308 4309 rq = active_request(ce->timeline, rq); 4310 head = intel_ring_wrap(ce->ring, rq->head); 4311 GEM_BUG_ON(head == ce->ring->tail); 4312 4313 /* 4314 * If this request hasn't started yet, e.g. it is waiting on a 4315 * semaphore, we need to avoid skipping the request or else we 4316 * break the signaling chain. However, if the context is corrupt 4317 * the request will not restart and we will be stuck with a wedged 4318 * device. It is quite often the case that if we issue a reset 4319 * while the GPU is loading the context image, that the context 4320 * image becomes corrupt. 4321 * 4322 * Otherwise, if we have not started yet, the request should replay 4323 * perfectly and we do not need to flag the result as being erroneous. 4324 */ 4325 if (!i915_request_started(rq)) 4326 goto out_replay; 4327 4328 /* 4329 * If the request was innocent, we leave the request in the ELSP 4330 * and will try to replay it on restarting. The context image may 4331 * have been corrupted by the reset, in which case we may have 4332 * to service a new GPU hang, but more likely we can continue on 4333 * without impact. 4334 * 4335 * If the request was guilty, we presume the context is corrupt 4336 * and have to at least restore the RING register in the context 4337 * image back to the expected values to skip over the guilty request. 
4338 */ 4339 __i915_request_reset(rq, stalled); 4340 4341 /* 4342 * We want a simple context + ring to execute the breadcrumb update. 4343 * We cannot rely on the context being intact across the GPU hang, 4344 * so clear it and rebuild just what we need for the breadcrumb. 4345 * All pending requests for this context will be zapped, and any 4346 * future request will be after userspace has had the opportunity 4347 * to recreate its own state. 4348 */ 4349 out_replay: 4350 ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n", 4351 head, ce->ring->tail); 4352 __execlists_reset_reg_state(ce, engine); 4353 __execlists_update_reg_state(ce, engine, head); 4354 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */ 4355 4356 unwind: 4357 /* Push back any incomplete requests for replay after the reset. */ 4358 cancel_port_requests(execlists); 4359 __unwind_incomplete_requests(engine); 4360 } 4361 4362 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled) 4363 { 4364 unsigned long flags; 4365 4366 ENGINE_TRACE(engine, "\n"); 4367 4368 spin_lock_irqsave(&engine->active.lock, flags); 4369 4370 __execlists_reset(engine, stalled); 4371 4372 spin_unlock_irqrestore(&engine->active.lock, flags); 4373 } 4374 4375 static void nop_submission_tasklet(unsigned long data) 4376 { 4377 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 4378 4379 /* The driver is wedged; don't process any more events. */ 4380 WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN); 4381 } 4382 4383 static void execlists_reset_cancel(struct intel_engine_cs *engine) 4384 { 4385 struct intel_engine_execlists * const execlists = &engine->execlists; 4386 struct i915_request *rq, *rn; 4387 struct rb_node *rb; 4388 unsigned long flags; 4389 4390 ENGINE_TRACE(engine, "\n"); 4391 4392 /* 4393 * Before we call engine->cancel_requests(), we should have exclusive 4394 * access to the submission state. This is arranged for us by the 4395 * caller disabling the interrupt generation, the tasklet and other 4396 * threads that may then access the same state, giving us a free hand 4397 * to reset state. However, we still need to let lockdep be aware that 4398 * we know this state may be accessed in hardirq context, so we 4399 * disable the irq around this manipulation and we want to keep 4400 * the spinlock focused on its duties and not accidentally conflate 4401 * coverage to the submission's irq state. (Similarly, although we 4402 * shouldn't need to disable irq around the manipulation of the 4403 * submission's irq state, we also wish to remind ourselves that 4404 * it is irq state.) 4405 */ 4406 spin_lock_irqsave(&engine->active.lock, flags); 4407 4408 __execlists_reset(engine, true); 4409 4410 /* Mark all executing requests as skipped. */ 4411 list_for_each_entry(rq, &engine->active.requests, sched.link) 4412 mark_eio(rq); 4413 intel_engine_signal_breadcrumbs(engine); 4414 4415 /* Flush the queued requests to the timeline list (for retiring). 
	 */
	while ((rb = rb_first_cached(&execlists->queue))) {
		struct i915_priolist *p = to_priolist(rb);
		int i;

		priolist_for_each_request_consume(rq, rn, p, i) {
			mark_eio(rq);
			__i915_request_submit(rq);
		}

		rb_erase_cached(&p->node, &execlists->queue);
		i915_priolist_free(p);
	}

	/* On-hold requests will be flushed to timeline upon their release */
	list_for_each_entry(rq, &engine->active.hold, sched.link)
		mark_eio(rq);

	/* Cancel all attached virtual engines */
	while ((rb = rb_first_cached(&execlists->virtual))) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);

		rb_erase_cached(rb, &execlists->virtual);
		RB_CLEAR_NODE(rb);

		spin_lock(&ve->base.active.lock);
		rq = fetch_and_zero(&ve->request);
		if (rq) {
			mark_eio(rq);

			rq->engine = engine;
			__i915_request_submit(rq);
			i915_request_put(rq);

			ve->base.execlists.queue_priority_hint = INT_MIN;
		}
		spin_unlock(&ve->base.active.lock);
	}

	/* Remaining _unready_ requests will be nop'ed when submitted */

	execlists->queue_priority_hint = INT_MIN;
	execlists->queue = RB_ROOT_CACHED;

	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
	execlists->tasklet.func = nop_submission_tasklet;

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

static void execlists_reset_finish(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;

	/*
	 * After a GPU reset, we may have requests to replay. Do so now while
	 * we still have the forcewake to be sure that the GPU is not allowed
	 * to sleep before we restart and reload a context.
	 */
	GEM_BUG_ON(!reset_in_progress(execlists));
	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
		execlists->tasklet.func(execlists->tasklet.data);

	if (__tasklet_enable(&execlists->tasklet))
		/* And kick in case we missed a new request submission. */
		tasklet_hi_schedule(&execlists->tasklet);
	ENGINE_TRACE(engine, "depth->%d\n",
		     atomic_read(&execlists->tasklet.count));
}

static int gen8_emit_bb_start_noarb(struct i915_request *rq,
				    u64 offset, u32 len,
				    const unsigned int flags)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/*
	 * WaDisableCtxRestoreArbitration:bdw,chv
	 *
	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
	 * particular on all the gens that do not need the w/a at all!); if
	 * we took care to make sure that on every switch into this context
	 * (both ordinary and for preemption) arbitration was enabled, we
	 * would be fine. However, for gen8 there is another w/a that
	 * requires us to not preempt inside GPGPU execution, so we keep
	 * arbitration disabled for gen8 batches. Arbitration will be
	 * re-enabled before we close the request
	 * (engine->emit_fini_breadcrumb).
	 */
	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* FIXME(BDW+): Address space and security selectors. */
	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
		(flags & I915_DISPATCH_SECURE ?
0 : BIT(8)); 4514 *cs++ = lower_32_bits(offset); 4515 *cs++ = upper_32_bits(offset); 4516 4517 intel_ring_advance(rq, cs); 4518 4519 return 0; 4520 } 4521 4522 static int gen8_emit_bb_start(struct i915_request *rq, 4523 u64 offset, u32 len, 4524 const unsigned int flags) 4525 { 4526 u32 *cs; 4527 4528 cs = intel_ring_begin(rq, 6); 4529 if (IS_ERR(cs)) 4530 return PTR_ERR(cs); 4531 4532 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4533 4534 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 4535 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 4536 *cs++ = lower_32_bits(offset); 4537 *cs++ = upper_32_bits(offset); 4538 4539 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 4540 *cs++ = MI_NOOP; 4541 4542 intel_ring_advance(rq, cs); 4543 4544 return 0; 4545 } 4546 4547 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine) 4548 { 4549 ENGINE_WRITE(engine, RING_IMR, 4550 ~(engine->irq_enable_mask | engine->irq_keep_mask)); 4551 ENGINE_POSTING_READ(engine, RING_IMR); 4552 } 4553 4554 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine) 4555 { 4556 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); 4557 } 4558 4559 static int gen8_emit_flush(struct i915_request *request, u32 mode) 4560 { 4561 u32 cmd, *cs; 4562 4563 cs = intel_ring_begin(request, 4); 4564 if (IS_ERR(cs)) 4565 return PTR_ERR(cs); 4566 4567 cmd = MI_FLUSH_DW + 1; 4568 4569 /* We always require a command barrier so that subsequent 4570 * commands, such as breadcrumb interrupts, are strictly ordered 4571 * wrt the contents of the write cache being flushed to memory 4572 * (and thus being coherent from the CPU). 4573 */ 4574 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 4575 4576 if (mode & EMIT_INVALIDATE) { 4577 cmd |= MI_INVALIDATE_TLB; 4578 if (request->engine->class == VIDEO_DECODE_CLASS) 4579 cmd |= MI_INVALIDATE_BSD; 4580 } 4581 4582 *cs++ = cmd; 4583 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 4584 *cs++ = 0; /* upper addr */ 4585 *cs++ = 0; /* value */ 4586 intel_ring_advance(request, cs); 4587 4588 return 0; 4589 } 4590 4591 static int gen8_emit_flush_render(struct i915_request *request, 4592 u32 mode) 4593 { 4594 bool vf_flush_wa = false, dc_flush_wa = false; 4595 u32 *cs, flags = 0; 4596 int len; 4597 4598 flags |= PIPE_CONTROL_CS_STALL; 4599 4600 if (mode & EMIT_FLUSH) { 4601 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4602 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4603 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4604 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4605 } 4606 4607 if (mode & EMIT_INVALIDATE) { 4608 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4609 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4610 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4611 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4612 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4613 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4614 flags |= PIPE_CONTROL_QW_WRITE; 4615 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4616 4617 /* 4618 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL 4619 * pipe control. 
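		 * A "NULL" pipe control here simply means a PIPE_CONTROL with
		 * no flags and no post-sync write, i.e. the
		 * gen8_emit_pipe_control(cs, 0, 0) emitted below when
		 * vf_flush_wa is set.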
4620 */ 4621 if (IS_GEN(request->engine->i915, 9)) 4622 vf_flush_wa = true; 4623 4624 /* WaForGAMHang:kbl */ 4625 if (IS_KBL_GT_REVID(request->engine->i915, 0, KBL_REVID_B0)) 4626 dc_flush_wa = true; 4627 } 4628 4629 len = 6; 4630 4631 if (vf_flush_wa) 4632 len += 6; 4633 4634 if (dc_flush_wa) 4635 len += 12; 4636 4637 cs = intel_ring_begin(request, len); 4638 if (IS_ERR(cs)) 4639 return PTR_ERR(cs); 4640 4641 if (vf_flush_wa) 4642 cs = gen8_emit_pipe_control(cs, 0, 0); 4643 4644 if (dc_flush_wa) 4645 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE, 4646 0); 4647 4648 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4649 4650 if (dc_flush_wa) 4651 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0); 4652 4653 intel_ring_advance(request, cs); 4654 4655 return 0; 4656 } 4657 4658 static int gen11_emit_flush_render(struct i915_request *request, 4659 u32 mode) 4660 { 4661 if (mode & EMIT_FLUSH) { 4662 u32 *cs; 4663 u32 flags = 0; 4664 4665 flags |= PIPE_CONTROL_CS_STALL; 4666 4667 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 4668 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4669 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4670 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4671 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4672 flags |= PIPE_CONTROL_QW_WRITE; 4673 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4674 4675 cs = intel_ring_begin(request, 6); 4676 if (IS_ERR(cs)) 4677 return PTR_ERR(cs); 4678 4679 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4680 intel_ring_advance(request, cs); 4681 } 4682 4683 if (mode & EMIT_INVALIDATE) { 4684 u32 *cs; 4685 u32 flags = 0; 4686 4687 flags |= PIPE_CONTROL_CS_STALL; 4688 4689 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 4690 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4691 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4692 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4693 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4694 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4695 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4696 flags |= PIPE_CONTROL_QW_WRITE; 4697 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4698 4699 cs = intel_ring_begin(request, 6); 4700 if (IS_ERR(cs)) 4701 return PTR_ERR(cs); 4702 4703 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4704 intel_ring_advance(request, cs); 4705 } 4706 4707 return 0; 4708 } 4709 4710 static u32 preparser_disable(bool state) 4711 { 4712 return MI_ARB_CHECK | 1 << 8 | state; 4713 } 4714 4715 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine) 4716 { 4717 static const i915_reg_t vd[] = { 4718 GEN12_VD0_AUX_NV, 4719 GEN12_VD1_AUX_NV, 4720 GEN12_VD2_AUX_NV, 4721 GEN12_VD3_AUX_NV, 4722 }; 4723 4724 static const i915_reg_t ve[] = { 4725 GEN12_VE0_AUX_NV, 4726 GEN12_VE1_AUX_NV, 4727 }; 4728 4729 if (engine->class == VIDEO_DECODE_CLASS) 4730 return vd[engine->instance]; 4731 4732 if (engine->class == VIDEO_ENHANCEMENT_CLASS) 4733 return ve[engine->instance]; 4734 4735 GEM_BUG_ON("unknown aux_inv_reg\n"); 4736 4737 return INVALID_MMIO_REG; 4738 } 4739 4740 static u32 * 4741 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs) 4742 { 4743 *cs++ = MI_LOAD_REGISTER_IMM(1); 4744 *cs++ = i915_mmio_reg_offset(inv_reg); 4745 *cs++ = AUX_INV; 4746 *cs++ = MI_NOOP; 4747 4748 return cs; 4749 } 4750 4751 static int gen12_emit_flush_render(struct i915_request *request, 4752 u32 mode) 4753 { 4754 if (mode & EMIT_FLUSH) { 4755 u32 flags = 0; 4756 u32 *cs; 4757 4758 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 4759 flags |= PIPE_CONTROL_FLUSH_L3; 4760 flags |= 
PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4761 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4762 /* Wa_1409600907:tgl */ 4763 flags |= PIPE_CONTROL_DEPTH_STALL; 4764 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4765 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4766 4767 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4768 flags |= PIPE_CONTROL_QW_WRITE; 4769 4770 flags |= PIPE_CONTROL_CS_STALL; 4771 4772 cs = intel_ring_begin(request, 6); 4773 if (IS_ERR(cs)) 4774 return PTR_ERR(cs); 4775 4776 cs = gen12_emit_pipe_control(cs, 4777 PIPE_CONTROL0_HDC_PIPELINE_FLUSH, 4778 flags, LRC_PPHWSP_SCRATCH_ADDR); 4779 intel_ring_advance(request, cs); 4780 } 4781 4782 if (mode & EMIT_INVALIDATE) { 4783 u32 flags = 0; 4784 u32 *cs; 4785 4786 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 4787 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4788 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4789 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4790 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4791 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4792 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4793 4794 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4795 flags |= PIPE_CONTROL_QW_WRITE; 4796 4797 flags |= PIPE_CONTROL_CS_STALL; 4798 4799 cs = intel_ring_begin(request, 8 + 4); 4800 if (IS_ERR(cs)) 4801 return PTR_ERR(cs); 4802 4803 /* 4804 * Prevent the pre-parser from skipping past the TLB 4805 * invalidate and loading a stale page for the batch 4806 * buffer / request payload. 4807 */ 4808 *cs++ = preparser_disable(true); 4809 4810 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4811 4812 /* hsdes: 1809175790 */ 4813 cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs); 4814 4815 *cs++ = preparser_disable(false); 4816 intel_ring_advance(request, cs); 4817 } 4818 4819 return 0; 4820 } 4821 4822 static int gen12_emit_flush(struct i915_request *request, u32 mode) 4823 { 4824 intel_engine_mask_t aux_inv = 0; 4825 u32 cmd, *cs; 4826 4827 cmd = 4; 4828 if (mode & EMIT_INVALIDATE) 4829 cmd += 2; 4830 if (mode & EMIT_INVALIDATE) 4831 aux_inv = request->engine->mask & ~BIT(BCS0); 4832 if (aux_inv) 4833 cmd += 2 * hweight8(aux_inv) + 2; 4834 4835 cs = intel_ring_begin(request, cmd); 4836 if (IS_ERR(cs)) 4837 return PTR_ERR(cs); 4838 4839 if (mode & EMIT_INVALIDATE) 4840 *cs++ = preparser_disable(true); 4841 4842 cmd = MI_FLUSH_DW + 1; 4843 4844 /* We always require a command barrier so that subsequent 4845 * commands, such as breadcrumb interrupts, are strictly ordered 4846 * wrt the contents of the write cache being flushed to memory 4847 * (and thus being coherent from the CPU). 
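	 * In practice that barrier is provided by the post-sync operation set
	 * up below: MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW posts a
	 * dword write into the ppHWSP scratch slot (LRC_PPHWSP_SCRATCH_ADDR).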
4848 */ 4849 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 4850 4851 if (mode & EMIT_INVALIDATE) { 4852 cmd |= MI_INVALIDATE_TLB; 4853 if (request->engine->class == VIDEO_DECODE_CLASS) 4854 cmd |= MI_INVALIDATE_BSD; 4855 } 4856 4857 *cs++ = cmd; 4858 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 4859 *cs++ = 0; /* upper addr */ 4860 *cs++ = 0; /* value */ 4861 4862 if (aux_inv) { /* hsdes: 1809175790 */ 4863 struct intel_engine_cs *engine; 4864 unsigned int tmp; 4865 4866 *cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv)); 4867 for_each_engine_masked(engine, request->engine->gt, 4868 aux_inv, tmp) { 4869 *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine)); 4870 *cs++ = AUX_INV; 4871 } 4872 *cs++ = MI_NOOP; 4873 } 4874 4875 if (mode & EMIT_INVALIDATE) 4876 *cs++ = preparser_disable(false); 4877 4878 intel_ring_advance(request, cs); 4879 4880 return 0; 4881 } 4882 4883 static void assert_request_valid(struct i915_request *rq) 4884 { 4885 struct intel_ring *ring __maybe_unused = rq->ring; 4886 4887 /* Can we unwind this request without appearing to go forwards? */ 4888 GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0); 4889 } 4890 4891 /* 4892 * Reserve space for 2 NOOPs at the end of each request to be 4893 * used as a workaround for not being allowed to do lite 4894 * restore with HEAD==TAIL (WaIdleLiteRestore). 4895 */ 4896 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) 4897 { 4898 /* Ensure there's always at least one preemption point per-request. */ 4899 *cs++ = MI_ARB_CHECK; 4900 *cs++ = MI_NOOP; 4901 request->wa_tail = intel_ring_offset(request, cs); 4902 4903 /* Check that entire request is less than half the ring */ 4904 assert_request_valid(request); 4905 4906 return cs; 4907 } 4908 4909 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs) 4910 { 4911 *cs++ = MI_SEMAPHORE_WAIT | 4912 MI_SEMAPHORE_GLOBAL_GTT | 4913 MI_SEMAPHORE_POLL | 4914 MI_SEMAPHORE_SAD_EQ_SDD; 4915 *cs++ = 0; 4916 *cs++ = intel_hws_preempt_address(request->engine); 4917 *cs++ = 0; 4918 4919 return cs; 4920 } 4921 4922 static __always_inline u32* 4923 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) 4924 { 4925 *cs++ = MI_USER_INTERRUPT; 4926 4927 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4928 if (intel_engine_has_semaphores(request->engine)) 4929 cs = emit_preempt_busywait(request, cs); 4930 4931 request->tail = intel_ring_offset(request, cs); 4932 assert_ring_tail_valid(request->ring, request->tail); 4933 4934 return gen8_emit_wa_tail(request, cs); 4935 } 4936 4937 static u32 *emit_xcs_breadcrumb(struct i915_request *request, u32 *cs) 4938 { 4939 u32 addr = i915_request_active_timeline(request)->hwsp_offset; 4940 4941 return gen8_emit_ggtt_write(cs, request->fence.seqno, addr, 0); 4942 } 4943 4944 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) 4945 { 4946 return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs)); 4947 } 4948 4949 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4950 { 4951 cs = gen8_emit_pipe_control(cs, 4952 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4953 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4954 PIPE_CONTROL_DC_FLUSH_ENABLE, 4955 0); 4956 4957 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ 4958 cs = gen8_emit_ggtt_write_rcs(cs, 4959 request->fence.seqno, 4960 i915_request_active_timeline(request)->hwsp_offset, 4961 PIPE_CONTROL_FLUSH_ENABLE | 4962 PIPE_CONTROL_CS_STALL); 4963 4964 return gen8_emit_fini_breadcrumb_tail(request, cs); 4965 
} 4966 4967 static u32 * 4968 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4969 { 4970 cs = gen8_emit_ggtt_write_rcs(cs, 4971 request->fence.seqno, 4972 i915_request_active_timeline(request)->hwsp_offset, 4973 PIPE_CONTROL_CS_STALL | 4974 PIPE_CONTROL_TILE_CACHE_FLUSH | 4975 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4976 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4977 PIPE_CONTROL_DC_FLUSH_ENABLE | 4978 PIPE_CONTROL_FLUSH_ENABLE); 4979 4980 return gen8_emit_fini_breadcrumb_tail(request, cs); 4981 } 4982 4983 /* 4984 * Note that the CS instruction pre-parser will not stall on the breadcrumb 4985 * flush and will continue pre-fetching the instructions after it before the 4986 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at 4987 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble 4988 * of the next request before the memory has been flushed, we're guaranteed that 4989 * we won't access the batch itself too early. 4990 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands, 4991 * so, if the current request is modifying an instruction in the next request on 4992 * the same intel_context, we might pre-fetch and then execute the pre-update 4993 * instruction. To avoid this, the users of self-modifying code should either 4994 * disable the parser around the code emitting the memory writes, via a new flag 4995 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For 4996 * the in-kernel use-cases we've opted to use a separate context, see 4997 * reloc_gpu() as an example. 4998 * All the above applies only to the instructions themselves. Non-inline data 4999 * used by the instructions is not pre-fetched. 5000 */ 5001 5002 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs) 5003 { 5004 *cs++ = MI_SEMAPHORE_WAIT_TOKEN | 5005 MI_SEMAPHORE_GLOBAL_GTT | 5006 MI_SEMAPHORE_POLL | 5007 MI_SEMAPHORE_SAD_EQ_SDD; 5008 *cs++ = 0; 5009 *cs++ = intel_hws_preempt_address(request->engine); 5010 *cs++ = 0; 5011 *cs++ = 0; 5012 *cs++ = MI_NOOP; 5013 5014 return cs; 5015 } 5016 5017 static __always_inline u32* 5018 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) 5019 { 5020 *cs++ = MI_USER_INTERRUPT; 5021 5022 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 5023 if (intel_engine_has_semaphores(request->engine)) 5024 cs = gen12_emit_preempt_busywait(request, cs); 5025 5026 request->tail = intel_ring_offset(request, cs); 5027 assert_ring_tail_valid(request->ring, request->tail); 5028 5029 return gen8_emit_wa_tail(request, cs); 5030 } 5031 5032 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) 5033 { 5034 return gen12_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs)); 5035 } 5036 5037 static u32 * 5038 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 5039 { 5040 cs = gen12_emit_ggtt_write_rcs(cs, 5041 request->fence.seqno, 5042 i915_request_active_timeline(request)->hwsp_offset, 5043 PIPE_CONTROL0_HDC_PIPELINE_FLUSH, 5044 PIPE_CONTROL_CS_STALL | 5045 PIPE_CONTROL_TILE_CACHE_FLUSH | 5046 PIPE_CONTROL_FLUSH_L3 | 5047 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 5048 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 5049 /* Wa_1409600907:tgl */ 5050 PIPE_CONTROL_DEPTH_STALL | 5051 PIPE_CONTROL_DC_FLUSH_ENABLE | 5052 PIPE_CONTROL_FLUSH_ENABLE); 5053 5054 return gen12_emit_fini_breadcrumb_tail(request, cs); 5055 } 5056 5057 static void execlists_park(struct intel_engine_cs *engine) 5058 { 5059 cancel_timer(&engine->execlists.timer); 5060 
cancel_timer(&engine->execlists.preempt); 5061 } 5062 5063 void intel_execlists_set_default_submission(struct intel_engine_cs *engine) 5064 { 5065 engine->submit_request = execlists_submit_request; 5066 engine->schedule = i915_schedule; 5067 engine->execlists.tasklet.func = execlists_submission_tasklet; 5068 5069 engine->reset.prepare = execlists_reset_prepare; 5070 engine->reset.rewind = execlists_reset_rewind; 5071 engine->reset.cancel = execlists_reset_cancel; 5072 engine->reset.finish = execlists_reset_finish; 5073 5074 engine->park = execlists_park; 5075 engine->unpark = NULL; 5076 5077 engine->flags |= I915_ENGINE_SUPPORTS_STATS; 5078 if (!intel_vgpu_active(engine->i915)) { 5079 engine->flags |= I915_ENGINE_HAS_SEMAPHORES; 5080 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) { 5081 engine->flags |= I915_ENGINE_HAS_PREEMPTION; 5082 if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION)) 5083 engine->flags |= I915_ENGINE_HAS_TIMESLICES; 5084 } 5085 } 5086 5087 if (INTEL_GEN(engine->i915) >= 12) 5088 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO; 5089 5090 if (intel_engine_has_preemption(engine)) 5091 engine->emit_bb_start = gen8_emit_bb_start; 5092 else 5093 engine->emit_bb_start = gen8_emit_bb_start_noarb; 5094 } 5095 5096 static void execlists_shutdown(struct intel_engine_cs *engine) 5097 { 5098 /* Synchronise with residual timers and any softirq they raise */ 5099 del_timer_sync(&engine->execlists.timer); 5100 del_timer_sync(&engine->execlists.preempt); 5101 tasklet_kill(&engine->execlists.tasklet); 5102 } 5103 5104 static void execlists_release(struct intel_engine_cs *engine) 5105 { 5106 engine->sanitize = NULL; /* no longer in control, nothing to sanitize */ 5107 5108 execlists_shutdown(engine); 5109 5110 intel_engine_cleanup_common(engine); 5111 lrc_destroy_wa_ctx(engine); 5112 } 5113 5114 static void 5115 logical_ring_default_vfuncs(struct intel_engine_cs *engine) 5116 { 5117 /* Default vfuncs which can be overriden by each engine. */ 5118 5119 engine->resume = execlists_resume; 5120 5121 engine->cops = &execlists_context_ops; 5122 engine->request_alloc = execlists_request_alloc; 5123 5124 engine->emit_flush = gen8_emit_flush; 5125 engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb; 5126 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb; 5127 if (INTEL_GEN(engine->i915) >= 12) { 5128 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb; 5129 engine->emit_flush = gen12_emit_flush; 5130 } 5131 engine->set_default_submission = intel_execlists_set_default_submission; 5132 5133 if (INTEL_GEN(engine->i915) < 11) { 5134 engine->irq_enable = gen8_logical_ring_enable_irq; 5135 engine->irq_disable = gen8_logical_ring_disable_irq; 5136 } else { 5137 /* 5138 * TODO: On Gen11 interrupt masks need to be clear 5139 * to allow C6 entry. Keep interrupts enabled at 5140 * and take the hit of generating extra interrupts 5141 * until a more refined solution exists. 
5142 */ 5143 } 5144 } 5145 5146 static inline void 5147 logical_ring_default_irqs(struct intel_engine_cs *engine) 5148 { 5149 unsigned int shift = 0; 5150 5151 if (INTEL_GEN(engine->i915) < 11) { 5152 const u8 irq_shifts[] = { 5153 [RCS0] = GEN8_RCS_IRQ_SHIFT, 5154 [BCS0] = GEN8_BCS_IRQ_SHIFT, 5155 [VCS0] = GEN8_VCS0_IRQ_SHIFT, 5156 [VCS1] = GEN8_VCS1_IRQ_SHIFT, 5157 [VECS0] = GEN8_VECS_IRQ_SHIFT, 5158 }; 5159 5160 shift = irq_shifts[engine->id]; 5161 } 5162 5163 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift; 5164 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift; 5165 engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift; 5166 engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift; 5167 } 5168 5169 static void rcs_submission_override(struct intel_engine_cs *engine) 5170 { 5171 switch (INTEL_GEN(engine->i915)) { 5172 case 12: 5173 engine->emit_flush = gen12_emit_flush_render; 5174 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs; 5175 break; 5176 case 11: 5177 engine->emit_flush = gen11_emit_flush_render; 5178 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs; 5179 break; 5180 default: 5181 engine->emit_flush = gen8_emit_flush_render; 5182 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs; 5183 break; 5184 } 5185 } 5186 5187 int intel_execlists_submission_setup(struct intel_engine_cs *engine) 5188 { 5189 struct intel_engine_execlists * const execlists = &engine->execlists; 5190 struct drm_i915_private *i915 = engine->i915; 5191 struct intel_uncore *uncore = engine->uncore; 5192 u32 base = engine->mmio_base; 5193 5194 tasklet_init(&engine->execlists.tasklet, 5195 execlists_submission_tasklet, (unsigned long)engine); 5196 timer_setup(&engine->execlists.timer, execlists_timeslice, 0); 5197 timer_setup(&engine->execlists.preempt, execlists_preempt, 0); 5198 5199 logical_ring_default_vfuncs(engine); 5200 logical_ring_default_irqs(engine); 5201 5202 if (engine->class == RENDER_CLASS) 5203 rcs_submission_override(engine); 5204 5205 if (intel_init_workaround_bb(engine)) 5206 /* 5207 * We continue even if we fail to initialize WA batch 5208 * because we only expect rare glitches but nothing 5209 * critical to prevent us from using GPU 5210 */ 5211 drm_err(&i915->drm, "WA batch buffer initialization failed\n"); 5212 5213 if (HAS_LOGICAL_RING_ELSQ(i915)) { 5214 execlists->submit_reg = uncore->regs + 5215 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base)); 5216 execlists->ctrl_reg = uncore->regs + 5217 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base)); 5218 } else { 5219 execlists->submit_reg = uncore->regs + 5220 i915_mmio_reg_offset(RING_ELSP(base)); 5221 } 5222 5223 execlists->csb_status = 5224 (u64 *)&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; 5225 5226 execlists->csb_write = 5227 &engine->status_page.addr[intel_hws_csb_write_index(i915)]; 5228 5229 if (INTEL_GEN(i915) < 11) 5230 execlists->csb_size = GEN8_CSB_ENTRIES; 5231 else 5232 execlists->csb_size = GEN11_CSB_ENTRIES; 5233 5234 if (INTEL_GEN(engine->i915) >= 11) { 5235 execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32); 5236 execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32); 5237 } 5238 5239 /* Finally, take ownership and responsibility for cleanup! 
*/ 5240 engine->sanitize = execlists_sanitize; 5241 engine->release = execlists_release; 5242 5243 return 0; 5244 } 5245 5246 static void init_common_reg_state(u32 * const regs, 5247 const struct intel_engine_cs *engine, 5248 const struct intel_ring *ring, 5249 bool inhibit) 5250 { 5251 u32 ctl; 5252 5253 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH); 5254 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); 5255 if (inhibit) 5256 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT; 5257 if (INTEL_GEN(engine->i915) < 11) 5258 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT | 5259 CTX_CTRL_RS_CTX_ENABLE); 5260 regs[CTX_CONTEXT_CONTROL] = ctl; 5261 5262 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 5263 regs[CTX_TIMESTAMP] = 0; 5264 } 5265 5266 static void init_wa_bb_reg_state(u32 * const regs, 5267 const struct intel_engine_cs *engine) 5268 { 5269 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx; 5270 5271 if (wa_ctx->per_ctx.size) { 5272 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 5273 5274 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1); 5275 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] = 5276 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; 5277 } 5278 5279 if (wa_ctx->indirect_ctx.size) { 5280 lrc_ring_setup_indirect_ctx(regs, engine, 5281 i915_ggtt_offset(wa_ctx->vma) + 5282 wa_ctx->indirect_ctx.offset, 5283 wa_ctx->indirect_ctx.size); 5284 } 5285 } 5286 5287 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt) 5288 { 5289 if (i915_vm_is_4lvl(&ppgtt->vm)) { 5290 /* 64b PPGTT (48bit canonical) 5291 * PDP0_DESCRIPTOR contains the base address to PML4 and 5292 * other PDP Descriptors are ignored. 5293 */ 5294 ASSIGN_CTX_PML4(ppgtt, regs); 5295 } else { 5296 ASSIGN_CTX_PDP(ppgtt, regs, 3); 5297 ASSIGN_CTX_PDP(ppgtt, regs, 2); 5298 ASSIGN_CTX_PDP(ppgtt, regs, 1); 5299 ASSIGN_CTX_PDP(ppgtt, regs, 0); 5300 } 5301 } 5302 5303 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm) 5304 { 5305 if (i915_is_ggtt(vm)) 5306 return i915_vm_to_ggtt(vm)->alias; 5307 else 5308 return i915_vm_to_ppgtt(vm); 5309 } 5310 5311 static void execlists_init_reg_state(u32 *regs, 5312 const struct intel_context *ce, 5313 const struct intel_engine_cs *engine, 5314 const struct intel_ring *ring, 5315 bool inhibit) 5316 { 5317 /* 5318 * A context is actually a big batch buffer with several 5319 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The 5320 * values we are setting here are only for the first context restore: 5321 * on a subsequent save, the GPU will recreate this batchbuffer with new 5322 * values (including all the missing MI_LOAD_REGISTER_IMM commands that 5323 * we are not initializing here). 5324 * 5325 * Must keep consistent with virtual_update_register_offsets(). 
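	 *
	 * As a rough sketch (not the exact image layout), the register state
	 * therefore looks like:
	 *
	 *	MI_LOAD_REGISTER_IMM(N)
	 *	    reg_offset[0], value[0]
	 *	    reg_offset[1], value[1]
	 *	    ...
	 *	MI_LOAD_REGISTER_IMM(M)
	 *	    ...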
5326 */ 5327 set_offsets(regs, reg_offsets(engine), engine, inhibit); 5328 5329 init_common_reg_state(regs, engine, ring, inhibit); 5330 init_ppgtt_reg_state(regs, vm_alias(ce->vm)); 5331 5332 init_wa_bb_reg_state(regs, engine); 5333 5334 __reset_stop_ring(regs, engine); 5335 } 5336 5337 static int 5338 populate_lr_context(struct intel_context *ce, 5339 struct drm_i915_gem_object *ctx_obj, 5340 struct intel_engine_cs *engine, 5341 struct intel_ring *ring) 5342 { 5343 bool inhibit = true; 5344 void *vaddr; 5345 5346 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); 5347 if (IS_ERR(vaddr)) { 5348 drm_dbg(&engine->i915->drm, "Could not map object pages!\n"); 5349 return PTR_ERR(vaddr); 5350 } 5351 5352 set_redzone(vaddr, engine); 5353 5354 if (engine->default_state) { 5355 shmem_read(engine->default_state, 0, 5356 vaddr, engine->context_size); 5357 __set_bit(CONTEXT_VALID_BIT, &ce->flags); 5358 inhibit = false; 5359 } 5360 5361 /* Clear the ppHWSP (inc. per-context counters) */ 5362 memset(vaddr, 0, PAGE_SIZE); 5363 5364 /* 5365 * The second page of the context object contains some registers which 5366 * must be set up prior to the first execution. 5367 */ 5368 execlists_init_reg_state(vaddr + LRC_STATE_OFFSET, 5369 ce, engine, ring, inhibit); 5370 5371 __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size); 5372 i915_gem_object_unpin_map(ctx_obj); 5373 return 0; 5374 } 5375 5376 static struct intel_timeline *pinned_timeline(struct intel_context *ce) 5377 { 5378 struct intel_timeline *tl = fetch_and_zero(&ce->timeline); 5379 5380 return intel_timeline_create_from_engine(ce->engine, 5381 page_unmask_bits(tl)); 5382 } 5383 5384 static int __execlists_context_alloc(struct intel_context *ce, 5385 struct intel_engine_cs *engine) 5386 { 5387 struct drm_i915_gem_object *ctx_obj; 5388 struct intel_ring *ring; 5389 struct i915_vma *vma; 5390 u32 context_size; 5391 int ret; 5392 5393 GEM_BUG_ON(ce->state); 5394 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE); 5395 5396 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 5397 context_size += I915_GTT_PAGE_SIZE; /* for redzone */ 5398 5399 if (INTEL_GEN(engine->i915) == 12) { 5400 ce->wa_bb_page = context_size / PAGE_SIZE; 5401 context_size += PAGE_SIZE; 5402 } 5403 5404 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size); 5405 if (IS_ERR(ctx_obj)) 5406 return PTR_ERR(ctx_obj); 5407 5408 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL); 5409 if (IS_ERR(vma)) { 5410 ret = PTR_ERR(vma); 5411 goto error_deref_obj; 5412 } 5413 5414 if (!page_mask_bits(ce->timeline)) { 5415 struct intel_timeline *tl; 5416 5417 /* 5418 * Use the static global HWSP for the kernel context, and 5419 * a dynamically allocated cacheline for everyone else. 
5420 */ 5421 if (unlikely(ce->timeline)) 5422 tl = pinned_timeline(ce); 5423 else 5424 tl = intel_timeline_create(engine->gt); 5425 if (IS_ERR(tl)) { 5426 ret = PTR_ERR(tl); 5427 goto error_deref_obj; 5428 } 5429 5430 ce->timeline = tl; 5431 } 5432 5433 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring); 5434 if (IS_ERR(ring)) { 5435 ret = PTR_ERR(ring); 5436 goto error_deref_obj; 5437 } 5438 5439 ret = populate_lr_context(ce, ctx_obj, engine, ring); 5440 if (ret) { 5441 drm_dbg(&engine->i915->drm, 5442 "Failed to populate LRC: %d\n", ret); 5443 goto error_ring_free; 5444 } 5445 5446 ce->ring = ring; 5447 ce->state = vma; 5448 5449 return 0; 5450 5451 error_ring_free: 5452 intel_ring_put(ring); 5453 error_deref_obj: 5454 i915_gem_object_put(ctx_obj); 5455 return ret; 5456 } 5457 5458 static struct list_head *virtual_queue(struct virtual_engine *ve) 5459 { 5460 return &ve->base.execlists.default_priolist.requests[0]; 5461 } 5462 5463 static void virtual_context_destroy(struct kref *kref) 5464 { 5465 struct virtual_engine *ve = 5466 container_of(kref, typeof(*ve), context.ref); 5467 unsigned int n; 5468 5469 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 5470 GEM_BUG_ON(ve->request); 5471 GEM_BUG_ON(ve->context.inflight); 5472 5473 for (n = 0; n < ve->num_siblings; n++) { 5474 struct intel_engine_cs *sibling = ve->siblings[n]; 5475 struct rb_node *node = &ve->nodes[sibling->id].rb; 5476 unsigned long flags; 5477 5478 if (RB_EMPTY_NODE(node)) 5479 continue; 5480 5481 spin_lock_irqsave(&sibling->active.lock, flags); 5482 5483 /* Detachment is lazily performed in the execlists tasklet */ 5484 if (!RB_EMPTY_NODE(node)) 5485 rb_erase_cached(node, &sibling->execlists.virtual); 5486 5487 spin_unlock_irqrestore(&sibling->active.lock, flags); 5488 } 5489 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet)); 5490 5491 if (ve->context.state) 5492 __execlists_context_fini(&ve->context); 5493 intel_context_fini(&ve->context); 5494 5495 intel_engine_free_request_pool(&ve->base); 5496 5497 kfree(ve->bonds); 5498 kfree(ve); 5499 } 5500 5501 static void virtual_engine_initial_hint(struct virtual_engine *ve) 5502 { 5503 int swp; 5504 5505 /* 5506 * Pick a random sibling on starting to help spread the load around. 5507 * 5508 * New contexts are typically created with exactly the same order 5509 * of siblings, and often started in batches. Due to the way we iterate 5510 * the array of sibling when submitting requests, sibling[0] is 5511 * prioritised for dequeuing. If we make sure that sibling[0] is fairly 5512 * randomised across the system, we also help spread the load by the 5513 * first engine we inspect being different each time. 5514 * 5515 * NB This does not force us to execute on this engine, it will just 5516 * typically be the first we inspect for submission. 
5517 */ 5518 swp = prandom_u32_max(ve->num_siblings); 5519 if (swp) 5520 swap(ve->siblings[swp], ve->siblings[0]); 5521 } 5522 5523 static int virtual_context_alloc(struct intel_context *ce) 5524 { 5525 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5526 5527 return __execlists_context_alloc(ce, ve->siblings[0]); 5528 } 5529 5530 static int virtual_context_pin(struct intel_context *ce, void *vaddr) 5531 { 5532 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5533 5534 /* Note: we must use a real engine class for setting up reg state */ 5535 return __execlists_context_pin(ce, ve->siblings[0], vaddr); 5536 } 5537 5538 static void virtual_context_enter(struct intel_context *ce) 5539 { 5540 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5541 unsigned int n; 5542 5543 for (n = 0; n < ve->num_siblings; n++) 5544 intel_engine_pm_get(ve->siblings[n]); 5545 5546 intel_timeline_enter(ce->timeline); 5547 } 5548 5549 static void virtual_context_exit(struct intel_context *ce) 5550 { 5551 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5552 unsigned int n; 5553 5554 intel_timeline_exit(ce->timeline); 5555 5556 for (n = 0; n < ve->num_siblings; n++) 5557 intel_engine_pm_put(ve->siblings[n]); 5558 } 5559 5560 static const struct intel_context_ops virtual_context_ops = { 5561 .alloc = virtual_context_alloc, 5562 5563 .pre_pin = execlists_context_pre_pin, 5564 .pin = virtual_context_pin, 5565 .unpin = execlists_context_unpin, 5566 .post_unpin = execlists_context_post_unpin, 5567 5568 .enter = virtual_context_enter, 5569 .exit = virtual_context_exit, 5570 5571 .destroy = virtual_context_destroy, 5572 }; 5573 5574 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve) 5575 { 5576 struct i915_request *rq; 5577 intel_engine_mask_t mask; 5578 5579 rq = READ_ONCE(ve->request); 5580 if (!rq) 5581 return 0; 5582 5583 /* The rq is ready for submission; rq->execution_mask is now stable. 
*/ 5584 mask = rq->execution_mask; 5585 if (unlikely(!mask)) { 5586 /* Invalid selection, submit to a random engine in error */ 5587 i915_request_set_error_once(rq, -ENODEV); 5588 mask = ve->siblings[0]->mask; 5589 } 5590 5591 ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n", 5592 rq->fence.context, rq->fence.seqno, 5593 mask, ve->base.execlists.queue_priority_hint); 5594 5595 return mask; 5596 } 5597 5598 static void virtual_submission_tasklet(unsigned long data) 5599 { 5600 struct virtual_engine * const ve = (struct virtual_engine *)data; 5601 const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint); 5602 intel_engine_mask_t mask; 5603 unsigned int n; 5604 5605 rcu_read_lock(); 5606 mask = virtual_submission_mask(ve); 5607 rcu_read_unlock(); 5608 if (unlikely(!mask)) 5609 return; 5610 5611 local_irq_disable(); 5612 for (n = 0; n < ve->num_siblings; n++) { 5613 struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]); 5614 struct ve_node * const node = &ve->nodes[sibling->id]; 5615 struct rb_node **parent, *rb; 5616 bool first; 5617 5618 if (!READ_ONCE(ve->request)) 5619 break; /* already handled by a sibling's tasklet */ 5620 5621 if (unlikely(!(mask & sibling->mask))) { 5622 if (!RB_EMPTY_NODE(&node->rb)) { 5623 spin_lock(&sibling->active.lock); 5624 rb_erase_cached(&node->rb, 5625 &sibling->execlists.virtual); 5626 RB_CLEAR_NODE(&node->rb); 5627 spin_unlock(&sibling->active.lock); 5628 } 5629 continue; 5630 } 5631 5632 spin_lock(&sibling->active.lock); 5633 5634 if (!RB_EMPTY_NODE(&node->rb)) { 5635 /* 5636 * Cheat and avoid rebalancing the tree if we can 5637 * reuse this node in situ. 5638 */ 5639 first = rb_first_cached(&sibling->execlists.virtual) == 5640 &node->rb; 5641 if (prio == node->prio || (prio > node->prio && first)) 5642 goto submit_engine; 5643 5644 rb_erase_cached(&node->rb, &sibling->execlists.virtual); 5645 } 5646 5647 rb = NULL; 5648 first = true; 5649 parent = &sibling->execlists.virtual.rb_root.rb_node; 5650 while (*parent) { 5651 struct ve_node *other; 5652 5653 rb = *parent; 5654 other = rb_entry(rb, typeof(*other), rb); 5655 if (prio > other->prio) { 5656 parent = &rb->rb_left; 5657 } else { 5658 parent = &rb->rb_right; 5659 first = false; 5660 } 5661 } 5662 5663 rb_link_node(&node->rb, rb, parent); 5664 rb_insert_color_cached(&node->rb, 5665 &sibling->execlists.virtual, 5666 first); 5667 5668 submit_engine: 5669 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb)); 5670 node->prio = prio; 5671 if (first && prio > sibling->execlists.queue_priority_hint) 5672 tasklet_hi_schedule(&sibling->execlists.tasklet); 5673 5674 spin_unlock(&sibling->active.lock); 5675 } 5676 local_irq_enable(); 5677 } 5678 5679 static void virtual_submit_request(struct i915_request *rq) 5680 { 5681 struct virtual_engine *ve = to_virtual_engine(rq->engine); 5682 struct i915_request *old; 5683 unsigned long flags; 5684 5685 ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n", 5686 rq->fence.context, 5687 rq->fence.seqno); 5688 5689 GEM_BUG_ON(ve->base.submit_request != virtual_submit_request); 5690 5691 spin_lock_irqsave(&ve->base.active.lock, flags); 5692 5693 old = ve->request; 5694 if (old) { /* background completion event from preempt-to-busy */ 5695 GEM_BUG_ON(!i915_request_completed(old)); 5696 __i915_request_submit(old); 5697 i915_request_put(old); 5698 } 5699 5700 if (i915_request_completed(rq)) { 5701 __i915_request_submit(rq); 5702 5703 ve->base.execlists.queue_priority_hint = INT_MIN; 5704 ve->request = NULL; 5705 } else { 5706 ve->base.execlists.queue_priority_hint = rq_prio(rq); 
static void virtual_submit_request(struct i915_request *rq)
{
	struct virtual_engine *ve = to_virtual_engine(rq->engine);
	struct i915_request *old;
	unsigned long flags;

	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
		     rq->fence.context,
		     rq->fence.seqno);

	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);

	spin_lock_irqsave(&ve->base.active.lock, flags);

	old = ve->request;
	if (old) { /* background completion event from preempt-to-busy */
		GEM_BUG_ON(!i915_request_completed(old));
		__i915_request_submit(old);
		i915_request_put(old);
	}

	if (i915_request_completed(rq)) {
		__i915_request_submit(rq);

		ve->base.execlists.queue_priority_hint = INT_MIN;
		ve->request = NULL;
	} else {
		ve->base.execlists.queue_priority_hint = rq_prio(rq);
		ve->request = i915_request_get(rq);

		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
		list_move_tail(&rq->sched.link, virtual_queue(ve));

		tasklet_hi_schedule(&ve->base.execlists.tasklet);
	}

	spin_unlock_irqrestore(&ve->base.active.lock, flags);
}

static struct ve_bond *
virtual_find_bond(struct virtual_engine *ve,
		  const struct intel_engine_cs *master)
{
	int i;

	for (i = 0; i < ve->num_bonds; i++) {
		if (ve->bonds[i].master == master)
			return &ve->bonds[i];
	}

	return NULL;
}

static void
virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
{
	struct virtual_engine *ve = to_virtual_engine(rq->engine);
	intel_engine_mask_t allowed, exec;
	struct ve_bond *bond;

	allowed = ~to_request(signal)->engine->mask;

	bond = virtual_find_bond(ve, to_request(signal)->engine);
	if (bond)
		allowed &= bond->sibling_mask;

	/* Restrict the bonded request to run only on the available engines */
	exec = READ_ONCE(rq->execution_mask);
	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
		;

	/* Prevent the master from being re-run on the bonded engines */
	to_request(signal)->execution_mask &= ~allowed;
}
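/*
 * The cmpxchg loop in virtual_bond_execute() above is simply a lock-free
 * atomic AND: it narrows rq->execution_mask to "exec & allowed" even if the
 * mask is updated concurrently. A sketch of the equivalent logic (illustrative
 * only; intel_engine_mask_t is a u32, so a plain cmpxchg() suffices):
 *
 *	intel_engine_mask_t old, new;
 *
 *	do {
 *		old = READ_ONCE(rq->execution_mask);
 *		new = old & allowed;
 *	} while (cmpxchg(&rq->execution_mask, old, new) != old);
 */
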
struct intel_context *
intel_execlists_create_virtual(struct intel_engine_cs **siblings,
			       unsigned int count)
{
	struct virtual_engine *ve;
	unsigned int n;
	int err;

	if (count == 0)
		return ERR_PTR(-EINVAL);

	if (count == 1)
		return intel_context_create(siblings[0]);

	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
	if (!ve)
		return ERR_PTR(-ENOMEM);

	ve->base.i915 = siblings[0]->i915;
	ve->base.gt = siblings[0]->gt;
	ve->base.uncore = siblings[0]->uncore;
	ve->base.id = -1;

	ve->base.class = OTHER_CLASS;
	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;

	/*
	 * The decision on whether to submit a request using semaphores
	 * depends on the saturated state of the engine. We only compute
	 * this during HW submission of the request, and we need this
	 * state to be globally applied to all requests being submitted
	 * to this engine. Virtual engines encompass more than one physical
	 * engine and so we cannot accurately tell in advance if one of those
	 * engines is already saturated and so cannot afford to use a semaphore
	 * and be pessimized in priority for doing so -- if we are the only
	 * context using semaphores after all other clients have stopped, we
	 * will be starved on the saturated system. Such a global switch for
	 * semaphores is less than ideal, but alas is the current compromise.
	 */
	ve->base.saturated = ALL_ENGINES;

	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");

	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
	intel_engine_init_execlists(&ve->base);

	ve->base.cops = &virtual_context_ops;
	ve->base.request_alloc = execlists_request_alloc;

	ve->base.schedule = i915_schedule;
	ve->base.submit_request = virtual_submit_request;
	ve->base.bond_execute = virtual_bond_execute;

	INIT_LIST_HEAD(virtual_queue(ve));
	ve->base.execlists.queue_priority_hint = INT_MIN;
	tasklet_init(&ve->base.execlists.tasklet,
		     virtual_submission_tasklet,
		     (unsigned long)ve);

	intel_context_init(&ve->context, &ve->base);

	ve->base.breadcrumbs = intel_breadcrumbs_create(NULL);
	if (!ve->base.breadcrumbs) {
		err = -ENOMEM;
		goto err_put;
	}

	for (n = 0; n < count; n++) {
		struct intel_engine_cs *sibling = siblings[n];

		GEM_BUG_ON(!is_power_of_2(sibling->mask));
		if (sibling->mask & ve->base.mask) {
			DRM_DEBUG("duplicate %s entry in load balancer\n",
				  sibling->name);
			err = -EINVAL;
			goto err_put;
		}

		/*
		 * The virtual engine implementation is tightly coupled to
		 * the execlists backend -- we push out requests directly
		 * into a tree inside each physical engine. We could support
		 * layering if we handle cloning of the requests and
		 * submitting a copy into each backend.
		 */
		if (sibling->execlists.tasklet.func !=
		    execlists_submission_tasklet) {
			err = -ENODEV;
			goto err_put;
		}

		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);

		ve->siblings[ve->num_siblings++] = sibling;
		ve->base.mask |= sibling->mask;

		/*
		 * All physical engines must be compatible for their emission
		 * functions (as we build the instructions during request
		 * construction and do not alter them before submission
		 * on the physical engine). We use the engine class as a guide
		 * here, although that could be refined.
		 */
		if (ve->base.class != OTHER_CLASS) {
			if (ve->base.class != sibling->class) {
				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
					  sibling->class, ve->base.class);
				err = -EINVAL;
				goto err_put;
			}
			continue;
		}

		ve->base.class = sibling->class;
		ve->base.uabi_class = sibling->uabi_class;
		snprintf(ve->base.name, sizeof(ve->base.name),
			 "v%dx%d", ve->base.class, count);
		ve->base.context_size = sibling->context_size;

		ve->base.emit_bb_start = sibling->emit_bb_start;
		ve->base.emit_flush = sibling->emit_flush;
		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
		ve->base.emit_fini_breadcrumb_dw =
			sibling->emit_fini_breadcrumb_dw;

		ve->base.flags = sibling->flags;
	}

	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;

	virtual_engine_initial_hint(ve);
	return &ve->context;

err_put:
	intel_context_put(&ve->context);
	return ERR_PTR(err);
}
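/*
 * Usage sketch (illustrative only; "vcs0" and "vcs1" are placeholder engine
 * pointers that the caller is assumed to have looked up, e.g. from the gt's
 * engine array): build a load-balancing context over two video engines and
 * submit through it like any other intel_context:
 *
 *	struct intel_engine_cs *siblings[] = { vcs0, vcs1 };
 *	struct intel_context *ce;
 *
 *	ce = intel_execlists_create_virtual(siblings, ARRAY_SIZE(siblings));
 *	if (IS_ERR(ce))
 *		return PTR_ERR(ce);
 *
 *	// requests created on ce may execute on either vcs0 or vcs1
 */
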
struct intel_context *
intel_execlists_clone_virtual(struct intel_engine_cs *src)
{
	struct virtual_engine *se = to_virtual_engine(src);
	struct intel_context *dst;

	dst = intel_execlists_create_virtual(se->siblings,
					     se->num_siblings);
	if (IS_ERR(dst))
		return dst;

	if (se->num_bonds) {
		struct virtual_engine *de = to_virtual_engine(dst->engine);

		de->bonds = kmemdup(se->bonds,
				    sizeof(*se->bonds) * se->num_bonds,
				    GFP_KERNEL);
		if (!de->bonds) {
			intel_context_put(dst);
			return ERR_PTR(-ENOMEM);
		}

		de->num_bonds = se->num_bonds;
	}

	return dst;
}

int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
				     const struct intel_engine_cs *master,
				     const struct intel_engine_cs *sibling)
{
	struct virtual_engine *ve = to_virtual_engine(engine);
	struct ve_bond *bond;
	int n;

	/* Sanity check that the sibling is part of the virtual engine */
	for (n = 0; n < ve->num_siblings; n++)
		if (sibling == ve->siblings[n])
			break;
	if (n == ve->num_siblings)
		return -EINVAL;

	bond = virtual_find_bond(ve, master);
	if (bond) {
		bond->sibling_mask |= sibling->mask;
		return 0;
	}

	bond = krealloc(ve->bonds,
			sizeof(*bond) * (ve->num_bonds + 1),
			GFP_KERNEL);
	if (!bond)
		return -ENOMEM;

	bond[ve->num_bonds].master = master;
	bond[ve->num_bonds].sibling_mask = sibling->mask;

	ve->bonds = bond;
	ve->num_bonds++;

	return 0;
}
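/*
 * Bond usage sketch (illustrative only; "ve_engine", "vcs0" and "vcs1" are
 * placeholder pointers): a bond restricts which sibling a bonded request may
 * use once its master request has been assigned to a particular physical
 * engine. For example, to steer the bonded half of a submission whose master
 * runs on vcs0 towards vcs1:
 *
 *	err = intel_virtual_engine_attach_bond(ve_engine, vcs0, vcs1);
 *
 * When the master's fence signals, virtual_bond_execute() ANDs the bonded
 * request's execution_mask with the bond's sibling_mask (minus the master's
 * own engine), so only the named siblings remain eligible.
 */
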
last, "\t\tE "); 5990 } 5991 5992 if (execlists->switch_priority_hint != INT_MIN) 5993 drm_printf(m, "\t\tSwitch priority hint: %d\n", 5994 READ_ONCE(execlists->switch_priority_hint)); 5995 if (execlists->queue_priority_hint != INT_MIN) 5996 drm_printf(m, "\t\tQueue priority hint: %d\n", 5997 READ_ONCE(execlists->queue_priority_hint)); 5998 5999 last = NULL; 6000 count = 0; 6001 for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) { 6002 struct i915_priolist *p = rb_entry(rb, typeof(*p), node); 6003 int i; 6004 6005 priolist_for_each_request(rq, p, i) { 6006 if (count++ < max - 1) 6007 show_request(m, rq, "\t\tQ "); 6008 else 6009 last = rq; 6010 } 6011 } 6012 if (last) { 6013 if (count > max) { 6014 drm_printf(m, 6015 "\t\t...skipping %d queued requests...\n", 6016 count - max); 6017 } 6018 show_request(m, last, "\t\tQ "); 6019 } 6020 6021 last = NULL; 6022 count = 0; 6023 for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) { 6024 struct virtual_engine *ve = 6025 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 6026 struct i915_request *rq = READ_ONCE(ve->request); 6027 6028 if (rq) { 6029 if (count++ < max - 1) 6030 show_request(m, rq, "\t\tV "); 6031 else 6032 last = rq; 6033 } 6034 } 6035 if (last) { 6036 if (count > max) { 6037 drm_printf(m, 6038 "\t\t...skipping %d virtual requests...\n", 6039 count - max); 6040 } 6041 show_request(m, last, "\t\tV "); 6042 } 6043 6044 spin_unlock_irqrestore(&engine->active.lock, flags); 6045 } 6046 6047 void intel_lr_context_reset(struct intel_engine_cs *engine, 6048 struct intel_context *ce, 6049 u32 head, 6050 bool scrub) 6051 { 6052 GEM_BUG_ON(!intel_context_is_pinned(ce)); 6053 6054 /* 6055 * We want a simple context + ring to execute the breadcrumb update. 6056 * We cannot rely on the context being intact across the GPU hang, 6057 * so clear it and rebuild just what we need for the breadcrumb. 6058 * All pending requests for this context will be zapped, and any 6059 * future request will be after userspace has had the opportunity 6060 * to recreate its own state. 6061 */ 6062 if (scrub) 6063 restore_default_state(ce, engine); 6064 6065 /* Rerun the request; its payload has been neutered (if guilty). */ 6066 __execlists_update_reg_state(ce, engine, head); 6067 } 6068 6069 bool 6070 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine) 6071 { 6072 return engine->set_default_submission == 6073 intel_execlists_set_default_submission; 6074 } 6075 6076 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 6077 #include "selftest_lrc.c" 6078 #endif 6079