/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things into the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But, what about the ringbuffer control registers (head, tail, etc..)?
 * Shouldn't we just need one set of those per engine command streamer? This
 * is where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated. The local default context for each opened fd is
 * more complex, because we don't know at creation time which engine is going
 * to use them. To handle this, we have implemented a deferred creation of LR
 * contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
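 *
 * As an illustrative sketch of that pairing rule (the helpers named here are
 * hypothetical; the real logic lives in execlists_dequeue() below):
 *
 *	elsp[0] = queue_pop();
 *	while (queue_peek() && same_context(queue_peek(), elsp[0]))
 *		elsp[0] = queue_pop();	(coalesce into a single RING_TAIL)
 *	elsp[1] = queue_peek() ? queue_pop() : NULL;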
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one. This
 * request will then be resubmitted along with a new request for a different context,
 * which will cause the hardware to continue executing the second request and queue
 * the new request (the GPU detects the condition of a context getting preempted
 * with the same context and optimizes the context switch flow by not doing
 * preemption, but just sampling the new tail pointer).
 *
 */
#include <linux/interrupt.h>

#include "gem/i915_gem_context.h"

#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_trace.h"
#include "i915_vgpu.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_gt_requests.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_reset.h"
#include "intel_ring.h"
#include "intel_workarounds.h"

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)

#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
#define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
#define GEN12_IDLE_CTX_ID		0x7FF
#define GEN12_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */
#define WA_TAIL_DWORDS 2
#define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)

struct virtual_engine {
	struct intel_engine_cs base;
	struct intel_context context;

	/*
	 * We allow only a single request through the virtual engine at a time
	 * (each request in the timeline waits for the completion fence of
	 * the previous before being submitted). By restricting ourselves to
	 * only submitting a single request, each request is placed on to a
	 * physical engine to maximise load spreading (by virtue of the late
	 * greedy scheduling -- each real engine takes the next available
	 * request upon idling).
	 */
	struct i915_request *request;

	/*
	 * We keep a rbtree of available virtual engines inside each physical
	 * engine, sorted by priority. Here we preallocate the nodes we need
	 * for the virtual engine, indexed by physical_engine->id.
	 */
	struct ve_node {
		struct rb_node rb;
		int prio;
	} nodes[I915_NUM_ENGINES];

	/*
	 * Keep track of bonded pairs -- restrictions upon our selection
	 * of physical engines any particular request may be submitted to.
	 * If we receive a submit-fence from a master engine, we will only
	 * use one of sibling_mask physical engines.
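	 *
	 * For example (purely illustrative; the actual masks are supplied by
	 * userspace via the engine-bonding extension), a bond of
	 * { master = vcs0, sibling_mask = vcs1->mask } restricts a request
	 * carrying a submit-fence from vcs0 to execute only on vcs1.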
	 */
	struct ve_bond {
		const struct intel_engine_cs *master;
		intel_engine_mask_t sibling_mask;
	} *bonds;
	unsigned int num_bonds;

	/* And finally, which physical engines this virtual engine maps onto. */
	unsigned int num_siblings;
	struct intel_engine_cs *siblings[0];
};

static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
{
	GEM_BUG_ON(!intel_engine_is_virtual(engine));
	return container_of(engine, struct virtual_engine, base);
}

static int __execlists_context_alloc(struct intel_context *ce,
				     struct intel_engine_cs *engine);

static void execlists_init_reg_state(u32 *reg_state,
				     const struct intel_context *ce,
				     const struct intel_engine_cs *engine,
				     const struct intel_ring *ring,
				     bool close);
static void
__execlists_update_reg_state(const struct intel_context *ce,
			     const struct intel_engine_cs *engine);

static void mark_eio(struct i915_request *rq)
{
	if (i915_request_completed(rq))
		return;

	GEM_BUG_ON(i915_request_signaled(rq));

	dma_fence_set_error(&rq->fence, -EIO);
	i915_request_mark_complete(rq);
}

static struct i915_request *
active_request(const struct intel_timeline * const tl, struct i915_request *rq)
{
	struct i915_request *active = rq;

	rcu_read_lock();
	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
		if (i915_request_completed(rq))
			break;

		active = rq;
	}
	rcu_read_unlock();

	return active;
}

static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
{
	return (i915_ggtt_offset(engine->status_page.vma) +
		I915_GEM_HWS_PREEMPT_ADDR);
}

static inline void
ring_set_paused(const struct intel_engine_cs *engine, int state)
{
	/*
	 * We inspect HWS_PREEMPT with a semaphore inside
	 * engine->emit_fini_breadcrumb. If the dword is true,
	 * the ring is paused as the semaphore will busywait
	 * until the dword is false.
	 */
	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
	if (state)
		wmb();
}

static inline struct i915_priolist *to_priolist(struct rb_node *rb)
{
	return rb_entry(rb, struct i915_priolist, node);
}

static inline int rq_prio(const struct i915_request *rq)
{
	return rq->sched.attr.priority;
}

static int effective_prio(const struct i915_request *rq)
{
	int prio = rq_prio(rq);

	/*
	 * If this request is special and must not be interrupted at any
	 * cost, so be it. Note we are only checking the most recent request
	 * in the context and so may be masking an earlier vip request. It
	 * is hoped that under the conditions where nopreempt is used, this
	 * will not matter (i.e. all requests to that context will be
	 * nopreempt for as long as desired).
	 */
	if (i915_request_has_nopreempt(rq))
		prio = I915_PRIORITY_UNPREEMPTABLE;

	/*
	 * On unwinding the active request, we give it a priority bump
	 * if it has completed waiting on any semaphore. If we know that
	 * the request has already started, we can prevent an unwanted
	 * preempt-to-idle cycle by taking that into account now.
	 */
	if (__i915_request_has_started(rq))
		prio |= I915_PRIORITY_NOSEMAPHORE;

	/* Restrict mere WAIT boosts from triggering preemption */
	BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
	return prio | __NO_PREEMPTION;
}

static int queue_prio(const struct intel_engine_execlists *execlists)
{
	struct i915_priolist *p;
	struct rb_node *rb;

	rb = rb_first_cached(&execlists->queue);
	if (!rb)
		return INT_MIN;

	/*
	 * As the priolist[] are inverted, with the highest priority in [0],
	 * we have to flip the index value to become priority.
	 */
	p = to_priolist(rb);
	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
}

static inline bool need_preempt(const struct intel_engine_cs *engine,
				const struct i915_request *rq,
				struct rb_node *rb)
{
	int last_prio;

	if (!intel_engine_has_semaphores(engine))
		return false;

	/*
	 * Check if the current priority hint merits a preemption attempt.
	 *
	 * We record the highest value priority we saw during rescheduling
	 * prior to this dequeue, therefore we know that if it is strictly
	 * less than the current tail of ELSP[0], we do not need to force
	 * a preempt-to-idle cycle.
	 *
	 * However, the priority hint is a mere hint that we may need to
	 * preempt. If that hint is stale or we may be trying to preempt
	 * ourselves, ignore the request.
	 *
	 * More naturally we would write
	 *	prio >= max(0, last);
	 * except that we wish to prevent triggering preemption at the same
	 * priority level: the task that is running should remain running
	 * to preserve FIFO ordering of dependencies.
	 */
	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
	if (engine->execlists.queue_priority_hint <= last_prio)
		return false;

	/*
	 * Check against the first request in ELSP[1], it will, thanks to the
	 * power of PI, be the highest priority of that context.
	 */
	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
		return true;

	if (rb) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		bool preempt = false;

		if (engine == ve->siblings[0]) { /* only preempt one sibling */
			struct i915_request *next;

			rcu_read_lock();
			next = READ_ONCE(ve->request);
			if (next)
				preempt = rq_prio(next) > last_prio;
			rcu_read_unlock();
		}

		if (preempt)
			return preempt;
	}

	/*
	 * If the inflight context did not trigger the preemption, then maybe
	 * it was the set of queued requests? Pick the highest priority in
	 * the queue (the first active priolist) and see if it deserves to be
	 * running instead of ELSP[0].
	 *
	 * The highest priority request in the queue can not be either
	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
	 * context, its priority would not exceed ELSP[0] aka last_prio.
	 */
	return queue_prio(&engine->execlists) > last_prio;
}

__maybe_unused static inline bool
assert_priority_queue(const struct i915_request *prev,
		      const struct i915_request *next)
{
	/*
	 * Without preemption, the prev may refer to the still active element
	 * which we refuse to let go.
	 *
	 * Even with preemption, there are times when we think it is better not
	 * to preempt and leave an ostensibly lower priority request in flight.
	 */
	if (i915_request_is_active(prev))
		return true;

	return rq_prio(prev) >= rq_prio(next);
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36:    reserved
 *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
 *      bit  54:       mbz, reserved for use by hardware
 *      bits 55-60:    SW counter
 *      bits 61-63:    engine class
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
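 *
 * As an illustrative sketch only (using the shift macros referenced elsewhere
 * in this file; sw_ctx_id is a placeholder name), the Gen11+ upper dword is
 * assembled roughly as:
 *
 *	desc |= (u64)sw_ctx_id << GEN11_SW_CTX_ID_SHIFT;		(bits 37-47)
 *	desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;	(bits 48-53)
 *	desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;	(bits 61-63)
 *
 * with the SW context ID portion filled in later, at schedule-in time (see
 * __execlists_schedule_in()).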
 */
static u64
lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
{
	u64 desc;

	desc = INTEL_LEGACY_32B_CONTEXT;
	if (i915_vm_is_4lvl(ce->vm))
		desc = INTEL_LEGACY_64B_CONTEXT;
	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;

	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
	if (IS_GEN(engine->i915, 8))
		desc |= GEN8_CTX_L3LLC_COHERENT;

	desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
	/*
	 * The following 32bits are copied into the OA reports (dword 2).
	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
	 * anything below.
	 */
	if (INTEL_GEN(engine->i915) >= 11) {
		desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
		/* bits 48-53 */

		desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
		/* bits 61-63 */
	}

	return desc;
}

static u32 *set_offsets(u32 *regs,
			const u8 *data,
			const struct intel_engine_cs *engine)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END() 0
{
	const u32 base = engine->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			regs += *data++ & ~BIT(7);
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		if (INTEL_GEN(engine->i915) >= 11)
			*regs |= MI_LRI_CS_MMIO;
		regs++;

		GEM_BUG_ON(!count);
		do {
			u32 offset = 0;
			u8 v;

			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			*regs = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	return regs;
}

static const u8 gen8_xcs_offsets[] = {
	NOP(1),
	LRI(11, 0),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),

	NOP(9),
	LRI(9, 0),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(2, 0),
	REG16(0x200),
	REG(0x028),

	END(),
};

static const u8 gen9_xcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, POSTED),
	REG16(0x200),

	NOP(13),
	LRI(44, POSTED),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),

	END(),
};

static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END(),
};

static const u8 gen8_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0x0c8),

	END(),
};

static const u8 gen11_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(1, POSTED),
	REG(0x1b0),

	NOP(10),
	LRI(1, 0),
	REG(0x0c8),

	END(),
};

static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END(),
};

#undef END
#undef REG16
#undef REG
#undef LRI
#undef NOP

static const u8 *reg_offsets(const struct intel_engine_cs *engine)
{
	/*
	 * The gen12+ lists only have the registers we program in the basic
	 * default state. We rely on the context image using relative
	 * addressing to automatically fix up the register state between the
	 * physical engines for the virtual engine.
	 */
	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
		   !intel_engine_has_relative_mmio(engine));

	if (engine->class == RENDER_CLASS) {
		if (INTEL_GEN(engine->i915) >= 12)
			return gen12_rcs_offsets;
		else if (INTEL_GEN(engine->i915) >= 11)
			return gen11_rcs_offsets;
		else
			return gen8_rcs_offsets;
	} else {
		if (INTEL_GEN(engine->i915) >= 12)
			return gen12_xcs_offsets;
		else if (INTEL_GEN(engine->i915) >= 9)
			return gen9_xcs_offsets;
		else
			return gen8_xcs_offsets;
	}
}

static struct i915_request *
__unwind_incomplete_requests(struct intel_engine_cs *engine)
{
	struct i915_request *rq, *rn, *active = NULL;
	struct list_head *uninitialized_var(pl);
	int prio = I915_PRIORITY_INVALID;

	lockdep_assert_held(&engine->active.lock);

	list_for_each_entry_safe_reverse(rq, rn,
					 &engine->active.requests,
					 sched.link) {
		if (i915_request_completed(rq))
			continue; /* XXX */

		__i915_request_unsubmit(rq);

		/*
		 * Push the request back into the queue for later resubmission.
		 * If this request is not native to this physical engine (i.e.
		 * it came from a virtual source), push it back onto the virtual
		 * engine so that it can be moved across onto another physical
		 * engine as load dictates.
		 */
		if (likely(rq->execution_mask == engine->mask)) {
			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
			if (rq_prio(rq) != prio) {
				prio = rq_prio(rq);
				pl = i915_sched_lookup_priolist(engine, prio);
			}
			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));

			list_move(&rq->sched.link, pl);
			active = rq;
		} else {
			struct intel_engine_cs *owner = rq->hw_context->engine;

			/*
			 * Decouple the virtual breadcrumb before moving it
			 * back to the virtual engine -- we don't want the
			 * request to complete in the background and try
			 * and cancel the breadcrumb on the virtual engine
			 * (instead of the old engine where it is linked)!
			 */
			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
				     &rq->fence.flags)) {
				spin_lock_nested(&rq->lock,
						 SINGLE_DEPTH_NESTING);
				i915_request_cancel_breadcrumb(rq);
				spin_unlock(&rq->lock);
			}
			rq->engine = owner;
			owner->submit_request(rq);
			active = NULL;
		}
	}

	return active;
}

struct i915_request *
execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
{
	struct intel_engine_cs *engine =
		container_of(execlists, typeof(*engine), execlists);

	return __unwind_incomplete_requests(engine);
}

static inline void
execlists_context_status_change(struct i915_request *rq, unsigned long status)
{
	/*
	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
	 * the compiler should eliminate this function as dead-code.
	 */
	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
		return;

	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
				   status, rq);
}

static void intel_engine_context_in(struct intel_engine_cs *engine)
{
	unsigned long flags;

	if (READ_ONCE(engine->stats.enabled) == 0)
		return;

	write_seqlock_irqsave(&engine->stats.lock, flags);

	if (engine->stats.enabled > 0) {
		if (engine->stats.active++ == 0)
			engine->stats.start = ktime_get();
		GEM_BUG_ON(engine->stats.active == 0);
	}

	write_sequnlock_irqrestore(&engine->stats.lock, flags);
}

static void intel_engine_context_out(struct intel_engine_cs *engine)
{
	unsigned long flags;

	if (READ_ONCE(engine->stats.enabled) == 0)
		return;

	write_seqlock_irqsave(&engine->stats.lock, flags);

	if (engine->stats.enabled > 0) {
		ktime_t last;

		if (engine->stats.active && --engine->stats.active == 0) {
			/*
			 * Decrement the active context count and in case GPU
			 * is now idle add up to the running total.
			 */
			last = ktime_sub(ktime_get(), engine->stats.start);

			engine->stats.total = ktime_add(engine->stats.total,
							last);
		} else if (engine->stats.active == 0) {
			/*
			 * After turning on engine stats, context out might be
			 * the first event in which case we account from the
			 * time stats gathering was turned on.
			 */
			last = ktime_sub(ktime_get(), engine->stats.enabled_at);

			engine->stats.total = ktime_add(engine->stats.total,
							last);
		}
	}

	write_sequnlock_irqrestore(&engine->stats.lock, flags);
}

static void restore_default_state(struct intel_context *ce,
				  struct intel_engine_cs *engine)
{
	u32 *regs = ce->lrc_reg_state;

	if (engine->pinned_default_state)
		memcpy(regs, /* skip restoring the vanilla PPHWSP */
		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
		       engine->context_size - PAGE_SIZE);

	execlists_init_reg_state(regs, ce, engine, ce->ring, false);
}

static void reset_active(struct i915_request *rq,
			 struct intel_engine_cs *engine)
{
	struct intel_context * const ce = rq->hw_context;
	u32 head;

	/*
	 * The executing context has been cancelled. We want to prevent
	 * further execution along this context and propagate the error on
	 * to anything depending on its results.
	 *
	 * In __i915_request_submit(), we apply the -EIO and remove the
	 * requests' payloads for any banned requests. But first, we must
	 * rewind the context back to the start of the incomplete request so
	 * that we do not jump back into the middle of the batch.
	 *
	 * We preserve the breadcrumbs and semaphores of the incomplete
	 * requests so that inter-timeline dependencies (i.e. other timelines)
	 * remain correctly ordered. And we defer to __i915_request_submit()
	 * so that all asynchronous waits are correctly handled.
	 */
	GEM_TRACE("%s(%s): { rq=%llx:%lld }\n",
		  __func__, engine->name, rq->fence.context, rq->fence.seqno);

	/* On resubmission of the active request, payload will be scrubbed */
	if (i915_request_completed(rq))
		head = rq->tail;
	else
		head = active_request(ce->timeline, rq)->head;
	ce->ring->head = intel_ring_wrap(ce->ring, head);
	intel_ring_update_space(ce->ring);

	/* Scrub the context image to prevent replaying the previous batch */
	restore_default_state(ce, engine);
	__execlists_update_reg_state(ce, engine);

	/* We've switched away, so this should be a no-op, but intent matters */
	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
}

static inline struct intel_engine_cs *
__execlists_schedule_in(struct i915_request *rq)
{
	struct intel_engine_cs * const engine = rq->engine;
	struct intel_context * const ce = rq->hw_context;

	intel_context_get(ce);

	if (unlikely(i915_gem_context_is_banned(ce->gem_context)))
		reset_active(rq, engine);

	if (ce->tag) {
		/* Use a fixed tag for OA and friends */
		ce->lrc_desc |= (u64)ce->tag << 32;
	} else {
		/* We don't need a strict matching tag, just different values */
		ce->lrc_desc &= ~GENMASK_ULL(47, 37);
		ce->lrc_desc |=
			(u64)(engine->context_tag++ % NUM_CONTEXT_TAG) <<
			GEN11_SW_CTX_ID_SHIFT;
		BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
	}

	intel_gt_pm_get(engine->gt);
	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
	intel_engine_context_in(engine);

	return engine;
}

static inline struct i915_request *
execlists_schedule_in(struct i915_request *rq, int idx)
{
	struct intel_context * const ce = rq->hw_context;
	struct intel_engine_cs *old;

	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
	trace_i915_request_in(rq, idx);

	old = READ_ONCE(ce->inflight);
	do {
		if (!old) {
			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
			break;
		}
	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));

	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
	return i915_request_get(rq);
}

static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
{
	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
	struct i915_request *next = READ_ONCE(ve->request);

	if (next && next->execution_mask & ~rq->execution_mask)
		tasklet_schedule(&ve->base.execlists.tasklet);
}

static inline void
__execlists_schedule_out(struct i915_request *rq,
			 struct intel_engine_cs * const engine)
{
	struct intel_context * const ce = rq->hw_context;

	/*
	 * NB process_csb() is not under the engine->active.lock and hence
	 * schedule_out can race with schedule_in meaning that we should
	 * refrain from doing non-trivial work here.
	 */

	/*
	 * If we have just completed this context, the engine may now be
	 * idle and we want to re-enter powersaving.
	 */
	if (list_is_last(&rq->link, &ce->timeline->requests) &&
	    i915_request_completed(rq))
		intel_engine_add_retire(engine, ce->timeline);

	intel_engine_context_out(engine);
	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
	intel_gt_pm_put_async(engine->gt);

	/*
	 * If this is part of a virtual engine, its next request may
	 * have been blocked waiting for access to the active context.
	 * We have to kick all the siblings again in case we need to
	 * switch (e.g. the next request is not runnable on this
	 * engine). Hopefully, we will already have submitted the next
	 * request before the tasklet runs and do not need to rebuild
	 * each virtual tree and kick everyone again.
	 */
	if (ce->engine != engine)
		kick_siblings(rq, ce);

	intel_context_put(ce);
}

static inline void
execlists_schedule_out(struct i915_request *rq)
{
	struct intel_context * const ce = rq->hw_context;
	struct intel_engine_cs *cur, *old;

	trace_i915_request_out(rq);

	old = READ_ONCE(ce->inflight);
	do
		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
	while (!try_cmpxchg(&ce->inflight, &old, cur));
	if (!cur)
		__execlists_schedule_out(rq, old);

	i915_request_put(rq);
}

static u64 execlists_update_context(struct i915_request *rq)
{
	struct intel_context *ce = rq->hw_context;
	u64 desc = ce->lrc_desc;
	u32 tail;

	/*
	 * WaIdleLiteRestore:bdw,skl
	 *
	 * We should never submit the context with the same RING_TAIL twice
	 * just in case we submit an empty ring, which confuses the HW.
	 *
	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
	 * the normal request to be able to always advance the RING_TAIL on
	 * subsequent resubmissions (for lite restore). Should that fail us,
	 * and we try and submit the same tail again, force the context
	 * reload.
	 */
	tail = intel_ring_set_tail(rq->ring, rq->tail);
	if (unlikely(ce->lrc_reg_state[CTX_RING_TAIL] == tail))
		desc |= CTX_DESC_FORCE_RESTORE;
	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
	rq->tail = rq->wa_tail;

	/*
	 * Make sure the context image is complete before we submit it to HW.
	 *
	 * Ostensibly, writes (including the WCB) should be flushed prior to
	 * an uncached write such as our mmio register access, but the
	 * empirical evidence (esp. on Braswell) suggests that the WC write
	 * into memory may not be visible to the HW prior to the completion
	 * of the UC register write and that we may begin execution from the
	 * context before its image is complete leading to invalid PD chasing.
	 *
	 * Furthermore, Braswell, at least, wants a full mb to be sure that
	 * the writes are coherent in memory (visible to the GPU) prior to
	 * execution, and not just visible to other CPUs (as is the result of
	 * wmb).
	 */
	mb();

	/* Wa_1607138340:tgl */
	if (IS_TGL_REVID(rq->i915, TGL_REVID_A0, TGL_REVID_A0))
		desc |= CTX_DESC_FORCE_RESTORE;

	ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
	return desc;
}

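/*
 * Write one context descriptor to a submission port. On platforms with an
 * execlists control register (execlists->ctrl_reg), the descriptors land in
 * the ELSQ registers and are only acted upon by the later EL_CTRL_LOAD write
 * in execlists_submit_ports(); on older platforms they go directly to the
 * single ELSP register, upper dword first, with the ports written in reverse
 * order so that the final (lower dword of ELSP[0]) write completes the
 * submission.
 */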
static inline void
write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
{
	if (execlists->ctrl_reg) {
		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
	} else {
		writel(upper_32_bits(desc), execlists->submit_reg);
		writel(lower_32_bits(desc), execlists->submit_reg);
	}
}

static __maybe_unused void
trace_ports(const struct intel_engine_execlists *execlists,
	    const char *msg,
	    struct i915_request * const *ports)
{
	const struct intel_engine_cs *engine =
		container_of(execlists, typeof(*engine), execlists);

	if (!ports[0])
		return;

	GEM_TRACE("%s: %s { %llx:%lld%s, %llx:%lld }\n",
		  engine->name, msg,
		  ports[0]->fence.context,
		  ports[0]->fence.seqno,
		  i915_request_completed(ports[0]) ? "!" :
		  i915_request_started(ports[0]) ? "*" :
		  "",
		  ports[1] ? ports[1]->fence.context : 0,
		  ports[1] ? ports[1]->fence.seqno : 0);
}

static __maybe_unused bool
assert_pending_valid(const struct intel_engine_execlists *execlists,
		     const char *msg)
{
	struct i915_request * const *port, *rq;
	struct intel_context *ce = NULL;

	trace_ports(execlists, msg, execlists->pending);

	if (!execlists->pending[0]) {
		GEM_TRACE_ERR("Nothing pending for promotion!\n");
		return false;
	}

	if (execlists->pending[execlists_num_ports(execlists)]) {
		GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
			      execlists_num_ports(execlists));
		return false;
	}

	for (port = execlists->pending; (rq = *port); port++) {
		if (ce == rq->hw_context) {
			GEM_TRACE_ERR("Duplicate context in pending[%zd]\n",
				      port - execlists->pending);
			return false;
		}

		ce = rq->hw_context;
		if (i915_request_completed(rq))
			continue;

		if (i915_active_is_idle(&ce->active)) {
			GEM_TRACE_ERR("Inactive context in pending[%zd]\n",
				      port - execlists->pending);
			return false;
		}

		if (!i915_vma_is_pinned(ce->state)) {
			GEM_TRACE_ERR("Unpinned context in pending[%zd]\n",
				      port - execlists->pending);
			return false;
		}

		if (!i915_vma_is_pinned(ce->ring->vma)) {
			GEM_TRACE_ERR("Unpinned ringbuffer in pending[%zd]\n",
				      port - execlists->pending);
			return false;
		}
	}

	return ce;
}

static void execlists_submit_ports(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists *execlists = &engine->execlists;
	unsigned int n;

	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));

	/*
	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
	 * not be relinquished until the device is idle (see
	 * i915_gem_idle_work_handler()). As a precaution, we make sure
	 * that all ELSP are drained i.e. we have processed the CSB,
	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
	 */
	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));

	/*
	 * ELSQ note: the submit queue is not cleared after being submitted
	 * to the HW so we need to make sure we always clean it up. This is
	 * currently ensured by the fact that we always write the same number
	 * of elsq entries, keep this in mind before changing the loop below.
	 */
	for (n = execlists_num_ports(execlists); n--; ) {
		struct i915_request *rq = execlists->pending[n];

		write_desc(execlists,
			   rq ? execlists_update_context(rq) : 0,
			   n);
	}

	/* we need to manually load the submit queue */
	if (execlists->ctrl_reg)
		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
}

static bool ctx_single_port_submission(const struct intel_context *ce)
{
	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
		i915_gem_context_force_single_submission(ce->gem_context));
}

static bool can_merge_ctx(const struct intel_context *prev,
			  const struct intel_context *next)
{
	if (prev != next)
		return false;

	if (ctx_single_port_submission(prev))
		return false;

	return true;
}

static bool can_merge_rq(const struct i915_request *prev,
			 const struct i915_request *next)
{
	GEM_BUG_ON(prev == next);
	GEM_BUG_ON(!assert_priority_queue(prev, next));

	/*
	 * We do not submit known completed requests. Therefore if the next
	 * request is already completed, we can pretend to merge it in
	 * with the previous context (and we will skip updating the ELSP
	 * and tracking). Thus hopefully keeping the ELSP full with active
	 * contexts, despite the best efforts of preempt-to-busy to confuse
	 * us.
	 */
	if (i915_request_completed(next))
		return true;

	if (unlikely((prev->flags ^ next->flags) &
		     (I915_REQUEST_NOPREEMPT | I915_REQUEST_SENTINEL)))
		return false;

	if (!can_merge_ctx(prev->hw_context, next->hw_context))
		return false;

	return true;
}

static void virtual_update_register_offsets(u32 *regs,
					    struct intel_engine_cs *engine)
{
	set_offsets(regs, reg_offsets(engine), engine);
}

static bool virtual_matches(const struct virtual_engine *ve,
			    const struct i915_request *rq,
			    const struct intel_engine_cs *engine)
{
	const struct intel_engine_cs *inflight;

	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
		return false;

	/*
	 * We track when the HW has completed saving the context image
	 * (i.e. when we have seen the final CS event switching out of
	 * the context) and must not overwrite the context image before
	 * then. This restricts us to only using the active engine
	 * while the previous virtualized request is inflight (so
	 * we reuse the register offsets). This is a very small
	 * hysteresis on the greedy selection algorithm.
	 */
	inflight = intel_context_inflight(&ve->context);
	if (inflight && inflight != engine)
		return false;

	return true;
}

static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
				     struct intel_engine_cs *engine)
{
	struct intel_engine_cs *old = ve->siblings[0];

	/* All unattached (rq->engine == old) must already be completed */

	spin_lock(&old->breadcrumbs.irq_lock);
	if (!list_empty(&ve->context.signal_link)) {
		list_move_tail(&ve->context.signal_link,
			       &engine->breadcrumbs.signalers);
		intel_engine_queue_breadcrumbs(engine);
	}
	spin_unlock(&old->breadcrumbs.irq_lock);
}

static struct i915_request *
last_active(const struct intel_engine_execlists *execlists)
{
	struct i915_request * const *last = READ_ONCE(execlists->active);

	while (*last && i915_request_completed(*last))
		last++;

	return *last;
}

static void defer_request(struct i915_request *rq, struct list_head * const pl)
{
	LIST_HEAD(list);

	/*
	 * We want to move the interrupted request to the back of
	 * the round-robin list (i.e. its priority level), but
	 * in doing so, we must then move all requests that were in
	 * flight and were waiting for the interrupted request to
	 * be run after it again.
	 */
	do {
		struct i915_dependency *p;

		GEM_BUG_ON(i915_request_is_active(rq));
		list_move_tail(&rq->sched.link, pl);

		list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
			struct i915_request *w =
				container_of(p->waiter, typeof(*w), sched);

			/* Leave semaphores spinning on the other engines */
			if (w->engine != rq->engine)
				continue;

			/* No waiter should start before its signaler */
			GEM_BUG_ON(i915_request_started(w) &&
				   !i915_request_completed(rq));

			GEM_BUG_ON(i915_request_is_active(w));
			if (list_empty(&w->sched.link))
				continue; /* Not yet submitted; unready */

			if (rq_prio(w) < rq_prio(rq))
				continue;

			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
			list_move_tail(&w->sched.link, &list);
		}

		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
	} while (rq);
}

static void defer_active(struct intel_engine_cs *engine)
{
	struct i915_request *rq;

	rq = __unwind_incomplete_requests(engine);
	if (!rq)
		return;

	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
}

static bool
need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
{
	int hint;

	if (!intel_engine_has_timeslices(engine))
		return false;

	if (list_is_last(&rq->sched.link, &engine->active.requests))
		return false;

	hint = max(rq_prio(list_next_entry(rq, sched.link)),
		   engine->execlists.queue_priority_hint);

	return hint >= effective_prio(rq);
}

static int
switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
{
	if (list_is_last(&rq->sched.link, &engine->active.requests))
		return INT_MIN;

	return rq_prio(list_next_entry(rq, sched.link));
}

static inline unsigned long
timeslice(const struct intel_engine_cs *engine)
{
	return READ_ONCE(engine->props.timeslice_duration_ms);
}

static unsigned long
active_timeslice(const struct intel_engine_cs *engine)
{
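	/*
	 * Descriptive note: return 0 (disabling the timeslice timer) if the
	 * active request has already completed, or if the pending successor
	 * is lower priority than the active request and so does not warrant
	 * timeslicing; otherwise arm the timer with the engine's configured
	 * timeslice duration.
	 */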
	const struct i915_request *rq = *engine->execlists.active;

	if (i915_request_completed(rq))
		return 0;

	if (engine->execlists.switch_priority_hint < effective_prio(rq))
		return 0;

	return timeslice(engine);
}

static void set_timeslice(struct intel_engine_cs *engine)
{
	if (!intel_engine_has_timeslices(engine))
		return;

	set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
}

static void record_preemption(struct intel_engine_execlists *execlists)
{
	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
}

static unsigned long active_preempt_timeout(struct intel_engine_cs *engine)
{
	struct i915_request *rq;

	rq = last_active(&engine->execlists);
	if (!rq)
		return 0;

	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
	if (unlikely(i915_gem_context_is_banned(rq->gem_context)))
		return 1;

	return READ_ONCE(engine->props.preempt_timeout_ms);
}

static void set_preempt_timeout(struct intel_engine_cs *engine)
{
	if (!intel_engine_has_preempt_reset(engine))
		return;

	set_timer_ms(&engine->execlists.preempt,
		     active_preempt_timeout(engine));
}

static void execlists_dequeue(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct i915_request **port = execlists->pending;
	struct i915_request ** const last_port = port + execlists->port_mask;
	struct i915_request *last;
	struct rb_node *rb;
	bool submit = false;

	/*
	 * Hardware submission is through 2 ports. Conceptually each port
	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
	 * static for a context, and unique to each, so we only execute
	 * requests belonging to a single context from each ring. RING_HEAD
	 * is maintained by the CS in the context image, it marks the place
	 * where it got up to last time, and through RING_TAIL we tell the CS
	 * where we want to execute up to this time.
	 *
	 * In this list the requests are in order of execution. Consecutive
	 * requests from the same context are adjacent in the ringbuffer. We
	 * can combine these requests into a single RING_TAIL update:
	 *
	 *              RING_HEAD...req1...req2
	 *                                    ^- RING_TAIL
	 * since to execute req2 the CS must first execute req1.
	 *
	 * Our goal then is to point each port to the end of a consecutive
	 * sequence of requests as being the most optimal (fewest wake ups
	 * and context switches) submission.
	 */

	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		struct i915_request *rq = READ_ONCE(ve->request);

		if (!rq) { /* lazily cleanup after another engine handled rq */
			rb_erase_cached(rb, &execlists->virtual);
			RB_CLEAR_NODE(rb);
			rb = rb_first_cached(&execlists->virtual);
			continue;
		}

		if (!virtual_matches(ve, rq, engine)) {
			rb = rb_next(rb);
			continue;
		}

		break;
	}

	/*
	 * If the queue is higher priority than the last
	 * request in the currently active context, submit afresh.
	 * We will resubmit again afterwards in case we need to split
	 * the active context to interject the preemption request,
	 * i.e. we will retrigger preemption following the ack in case
	 * of trouble.
	 */
	last = last_active(execlists);
	if (last) {
		if (need_preempt(engine, last, rb)) {
			GEM_TRACE("%s: preempting last=%llx:%lld, prio=%d, hint=%d\n",
				  engine->name,
				  last->fence.context,
				  last->fence.seqno,
				  last->sched.attr.priority,
				  execlists->queue_priority_hint);
			record_preemption(execlists);

			/*
			 * Don't let the RING_HEAD advance past the breadcrumb
			 * as we unwind (and until we resubmit) so that we do
			 * not accidentally tell it to go backwards.
			 */
			ring_set_paused(engine, 1);

			/*
			 * Note that we have not stopped the GPU at this point,
			 * so we are unwinding the incomplete requests as they
			 * remain inflight and so by the time we do complete
			 * the preemption, some of the unwound requests may
			 * complete!
			 */
			__unwind_incomplete_requests(engine);

			/*
			 * If we need to return to the preempted context, we
			 * need to skip the lite-restore and force it to
			 * reload the RING_TAIL. Otherwise, the HW has a
			 * tendency to ignore us rewinding the TAIL to the
			 * end of an earlier request.
			 */
			last->hw_context->lrc_desc |= CTX_DESC_FORCE_RESTORE;
			last = NULL;
		} else if (need_timeslice(engine, last) &&
			   timer_expired(&engine->execlists.timer)) {
			GEM_TRACE("%s: expired last=%llx:%lld, prio=%d, hint=%d\n",
				  engine->name,
				  last->fence.context,
				  last->fence.seqno,
				  last->sched.attr.priority,
				  execlists->queue_priority_hint);

			ring_set_paused(engine, 1);
			defer_active(engine);

			/*
			 * Unlike for preemption, if we rewind and continue
			 * executing the same context as previously active,
			 * the order of execution will remain the same and
			 * the tail will only advance. We do not need to
			 * force a full context restore, as a lite-restore
			 * is sufficient to resample the monotonic TAIL.
			 *
			 * If we switch to any other context, similarly we
			 * will not rewind TAIL of current context, and
			 * normal save/restore will preserve state and allow
			 * us to later continue executing the same request.
			 */
			last = NULL;
		} else {
			/*
			 * Otherwise if we already have a request pending
			 * for execution after the current one, we can
			 * just wait until the next CS event before
			 * queuing more. In either case we will force a
			 * lite-restore preemption event, but if we wait
			 * we hopefully coalesce several updates into a single
			 * submission.
			 */
			if (!list_is_last(&last->sched.link,
					  &engine->active.requests)) {
				/*
				 * Even if ELSP[1] is occupied and not worthy
				 * of timeslices, our queue might be.
				 */
				if (!execlists->timer.expires &&
				    need_timeslice(engine, last))
					set_timer_ms(&execlists->timer,
						     timeslice(engine));

				return;
			}
		}
	}

	while (rb) { /* XXX virtual is always taking precedence */
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		struct i915_request *rq;

		spin_lock(&ve->base.active.lock);

		rq = ve->request;
		if (unlikely(!rq)) { /* lost the race to a sibling */
			spin_unlock(&ve->base.active.lock);
			rb_erase_cached(rb, &execlists->virtual);
			RB_CLEAR_NODE(rb);
			rb = rb_first_cached(&execlists->virtual);
			continue;
		}

		GEM_BUG_ON(rq != ve->request);
		GEM_BUG_ON(rq->engine != &ve->base);
		GEM_BUG_ON(rq->hw_context != &ve->context);

		if (rq_prio(rq) >= queue_prio(execlists)) {
			if (!virtual_matches(ve, rq, engine)) {
				spin_unlock(&ve->base.active.lock);
				rb = rb_next(rb);
				continue;
			}

			if (last && !can_merge_rq(last, rq)) {
				spin_unlock(&ve->base.active.lock);
				return; /* leave this for another */
			}

			GEM_TRACE("%s: virtual rq=%llx:%lld%s, new engine? %s\n",
				  engine->name,
				  rq->fence.context,
				  rq->fence.seqno,
				  i915_request_completed(rq) ? "!" :
				  i915_request_started(rq) ? "*" :
				  "",
				  yesno(engine != ve->siblings[0]));

			ve->request = NULL;
			ve->base.execlists.queue_priority_hint = INT_MIN;
			rb_erase_cached(rb, &execlists->virtual);
			RB_CLEAR_NODE(rb);

			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
			rq->engine = engine;

			if (engine != ve->siblings[0]) {
				u32 *regs = ve->context.lrc_reg_state;
				unsigned int n;

				GEM_BUG_ON(READ_ONCE(ve->context.inflight));

				if (!intel_engine_has_relative_mmio(engine))
					virtual_update_register_offsets(regs,
									engine);

				if (!list_empty(&ve->context.signals))
					virtual_xfer_breadcrumbs(ve, engine);

				/*
				 * Move the bound engine to the top of the list
				 * for future execution. We then kick this
				 * tasklet first before checking others, so that
				 * we preferentially reuse this set of bound
				 * registers.
				 */
				for (n = 1; n < ve->num_siblings; n++) {
					if (ve->siblings[n] == engine) {
						swap(ve->siblings[n],
						     ve->siblings[0]);
						break;
					}
				}

				GEM_BUG_ON(ve->siblings[0] != engine);
			}

			if (__i915_request_submit(rq)) {
				submit = true;
				last = rq;
			}
			i915_request_put(rq);

			/*
			 * Hmm, we have a bunch of virtual engine requests,
			 * but the first one was already completed (thanks
			 * preempt-to-busy!). Keep looking at the veng queue
			 * until we have no more relevant requests (i.e.
			 * the normal submit queue has higher priority).
			 */
			if (!submit) {
				spin_unlock(&ve->base.active.lock);
				rb = rb_first_cached(&execlists->virtual);
				continue;
			}
		}

		spin_unlock(&ve->base.active.lock);
		break;
	}

	while ((rb = rb_first_cached(&execlists->queue))) {
		struct i915_priolist *p = to_priolist(rb);
		struct i915_request *rq, *rn;
		int i;

		priolist_for_each_request_consume(rq, rn, p, i) {
			bool merge = true;

			/*
			 * Can we combine this request with the current port?
			 * It has to be the same context/ringbuffer and not
			 * have any exceptions (e.g. GVT saying never to
			 * combine contexts).
			 *
			 * If we can combine the requests, we can execute both
			 * by updating the RING_TAIL to point to the end of the
			 * second request, and so we never need to tell the
			 * hardware about the first.
			 */
			if (last && !can_merge_rq(last, rq)) {
				/*
				 * If we are on the second port and cannot
				 * combine this request with the last, then we
				 * are done.
				 */
				if (port == last_port)
					goto done;

				/*
				 * We must not populate both ELSP[] with the
				 * same LRCA, i.e. we must submit 2 different
				 * contexts if we submit 2 ELSP.
				 */
				if (last->hw_context == rq->hw_context)
					goto done;

				if (i915_request_has_sentinel(last))
					goto done;

				/*
				 * If GVT overrides us we only ever submit
				 * port[0], leaving port[1] empty. Note that we
				 * also have to be careful that we don't queue
				 * the same context (even though a different
				 * request) to the second port.
				 */
				if (ctx_single_port_submission(last->hw_context) ||
				    ctx_single_port_submission(rq->hw_context))
					goto done;

				merge = false;
			}

			if (__i915_request_submit(rq)) {
				if (!merge) {
					*port = execlists_schedule_in(last, port - execlists->pending);
					port++;
					last = NULL;
				}

				GEM_BUG_ON(last &&
					   !can_merge_ctx(last->hw_context,
							  rq->hw_context));

				submit = true;
				last = rq;
			}
		}

		rb_erase_cached(&p->node, &execlists->queue);
		i915_priolist_free(p);
	}

done:
	/*
	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
	 *
	 * We choose the priority hint such that if we add a request of greater
	 * priority than this, we kick the submission tasklet to decide on
	 * the right order of submitting the requests to hardware. We must
	 * also be prepared to reorder requests as they are in-flight on the
	 * HW. We derive the priority hint then as the first "hole" in
	 * the HW submission ports and if there are no available slots,
	 * the priority of the lowest executing request, i.e. last.
	 *
	 * When we do receive a higher priority request ready to run from the
	 * user, see queue_request(), the priority hint is bumped to that
	 * request triggering preemption on the next dequeue (or subsequent
	 * interrupt for secondary ports).
	 */
	execlists->queue_priority_hint = queue_prio(execlists);
	GEM_TRACE("%s: queue_priority_hint:%d, submit:%s\n",
		  engine->name, execlists->queue_priority_hint,
		  yesno(submit));

	if (submit) {
		*port = execlists_schedule_in(last, port - execlists->pending);
		execlists->switch_priority_hint =
			switch_prio(engine, *execlists->pending);

		/*
		 * Skip if we ended up with exactly the same set of requests,
trying to timeslice a pair of ordered contexts 1922 */ 1923 if (!memcmp(execlists->active, execlists->pending, 1924 (port - execlists->pending + 1) * sizeof(*port))) { 1925 do 1926 execlists_schedule_out(fetch_and_zero(port)); 1927 while (port-- != execlists->pending); 1928 1929 goto skip_submit; 1930 } 1931 1932 memset(port + 1, 0, (last_port - port) * sizeof(*port)); 1933 execlists_submit_ports(engine); 1934 1935 set_preempt_timeout(engine); 1936 } else { 1937 skip_submit: 1938 ring_set_paused(engine, 0); 1939 } 1940 } 1941 1942 static void 1943 cancel_port_requests(struct intel_engine_execlists * const execlists) 1944 { 1945 struct i915_request * const *port; 1946 1947 for (port = execlists->pending; *port; port++) 1948 execlists_schedule_out(*port); 1949 memset(execlists->pending, 0, sizeof(execlists->pending)); 1950 1951 /* Mark the end of active before we overwrite *active */ 1952 for (port = xchg(&execlists->active, execlists->pending); *port; port++) 1953 execlists_schedule_out(*port); 1954 WRITE_ONCE(execlists->active, 1955 memset(execlists->inflight, 0, sizeof(execlists->inflight))); 1956 } 1957 1958 static inline void 1959 invalidate_csb_entries(const u32 *first, const u32 *last) 1960 { 1961 clflush((void *)first); 1962 clflush((void *)last); 1963 } 1964 1965 static inline bool 1966 reset_in_progress(const struct intel_engine_execlists *execlists) 1967 { 1968 return unlikely(!__tasklet_is_enabled(&execlists->tasklet)); 1969 } 1970 1971 /* 1972 * Starting with Gen12, the status has a new format: 1973 * 1974 * bit 0: switched to new queue 1975 * bit 1: reserved 1976 * bit 2: semaphore wait mode (poll or signal), only valid when 1977 * switch detail is set to "wait on semaphore" 1978 * bits 3-5: engine class 1979 * bits 6-11: engine instance 1980 * bits 12-14: reserved 1981 * bits 15-25: sw context id of the lrc the GT switched to 1982 * bits 26-31: sw counter of the lrc the GT switched to 1983 * bits 32-35: context switch detail 1984 * - 0: ctx complete 1985 * - 1: wait on sync flip 1986 * - 2: wait on vblank 1987 * - 3: wait on scanline 1988 * - 4: wait on semaphore 1989 * - 5: context preempted (not on SEMAPHORE_WAIT or 1990 * WAIT_FOR_EVENT) 1991 * bit 36: reserved 1992 * bits 37-43: wait detail (for switch detail 1 to 4) 1993 * bits 44-46: reserved 1994 * bits 47-57: sw context id of the lrc the GT switched away from 1995 * bits 58-63: sw counter of the lrc the GT switched away from 1996 */ 1997 static inline bool 1998 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) 1999 { 2000 u32 lower_dw = csb[0]; 2001 u32 upper_dw = csb[1]; 2002 bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw); 2003 bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw); 2004 bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; 2005 2006 /* 2007 * The context switch detail is not guaranteed to be 5 when a preemption 2008 * occurs, so we can't just check for that. The check below works for 2009 * all the cases we care about, including preemptions of WAIT 2010 * instructions and lite-restore. Preempt-to-idle via the CTRL register 2011 * would require some extra handling, but we don't support that. 2012 */ 2013 if (!ctx_away_valid || new_queue) { 2014 GEM_BUG_ON(!ctx_to_valid); 2015 return true; 2016 } 2017 2018 /* 2019 * switch detail = 5 is covered by the case above and we do not expect a 2020 * context switch on an unsuccessful wait instruction since we always 2021 * use polling mode. 
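 * (The preempt busy-wait we emit ourselves, see emit_preempt_busywait()
 * and gen12_emit_preempt_busywait(), uses MI_SEMAPHORE_POLL.)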
2022 */ 2023 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw)); 2024 return false; 2025 } 2026 2027 static inline bool 2028 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) 2029 { 2030 return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED); 2031 } 2032 2033 static void process_csb(struct intel_engine_cs *engine) 2034 { 2035 struct intel_engine_execlists * const execlists = &engine->execlists; 2036 const u32 * const buf = execlists->csb_status; 2037 const u8 num_entries = execlists->csb_size; 2038 u8 head, tail; 2039 2040 /* 2041 * As we modify our execlists state tracking we require exclusive 2042 * access. Either we are inside the tasklet, or the tasklet is disabled 2043 * and we assume that is only inside the reset paths and so serialised. 2044 */ 2045 GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) && 2046 !reset_in_progress(execlists)); 2047 GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine)); 2048 2049 /* 2050 * Note that csb_write, csb_status may be either in HWSP or mmio. 2051 * When reading from the csb_write mmio register, we have to be 2052 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is 2053 * the low 4bits. As it happens we know the next 4bits are always 2054 * zero and so we can simply mask off the low u8 of the register 2055 * and treat it identically to reading from the HWSP (without having 2056 * to use explicit shifting and masking, and probably bifurcating 2057 * the code to handle the legacy mmio read). 2058 */ 2059 head = execlists->csb_head; 2060 tail = READ_ONCE(*execlists->csb_write); 2061 GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail); 2062 if (unlikely(head == tail)) 2063 return; 2064 2065 /* 2066 * Hopefully paired with a wmb() in HW! 2067 * 2068 * We must complete the read of the write pointer before any reads 2069 * from the CSB, so that we do not see stale values. Without an rmb 2070 * (lfence) the HW may speculatively perform the CSB[] reads *before* 2071 * we perform the READ_ONCE(*csb_write). 2072 */ 2073 rmb(); 2074 2075 do { 2076 bool promote; 2077 2078 if (++head == num_entries) 2079 head = 0; 2080 2081 /* 2082 * We are flying near dragons again. 2083 * 2084 * We hold a reference to the request in execlist_port[] 2085 * but no more than that. We are operating in softirq 2086 * context and so cannot hold any mutex or sleep. That 2087 * prevents us from stopping the requests we are processing 2088 * in port[] from being retired simultaneously (the 2089 * breadcrumb will be complete before we see the 2090 * context-switch). As we only hold the reference to the 2091 * request, any pointer chasing underneath the request 2092 * is subject to a potential use-after-free. Thus we 2093 * store all of the bookkeeping within port[] as 2094 * required, and avoid using unguarded pointers beneath 2095 * request itself. The same applies to the atomic 2096 * status notifier.
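 *
 * Roughly, the port tracking used below is:
 *
 *	pending[]  - the ports we last wrote to the ELSP (in
 *	             execlists_dequeue()), awaiting a "promote" ack
 *	inflight[] - the ports the hardware has acknowledged as running
 *	active     - cursor into inflight[], advanced as each context
 *	             reports completion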
2097 */ 2098 2099 GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x\n", 2100 engine->name, head, 2101 buf[2 * head + 0], buf[2 * head + 1]); 2102 2103 if (INTEL_GEN(engine->i915) >= 12) 2104 promote = gen12_csb_parse(execlists, buf + 2 * head); 2105 else 2106 promote = gen8_csb_parse(execlists, buf + 2 * head); 2107 if (promote) { 2108 struct i915_request * const *old = execlists->active; 2109 2110 /* Point active to the new ELSP; prevent overwriting */ 2111 WRITE_ONCE(execlists->active, execlists->pending); 2112 set_timeslice(engine); 2113 2114 if (!inject_preempt_hang(execlists)) 2115 ring_set_paused(engine, 0); 2116 2117 /* cancel old inflight, prepare for switch */ 2118 trace_ports(execlists, "preempted", old); 2119 while (*old) 2120 execlists_schedule_out(*old++); 2121 2122 /* switch pending to inflight */ 2123 GEM_BUG_ON(!assert_pending_valid(execlists, "promote")); 2124 WRITE_ONCE(execlists->active, 2125 memcpy(execlists->inflight, 2126 execlists->pending, 2127 execlists_num_ports(execlists) * 2128 sizeof(*execlists->pending))); 2129 2130 WRITE_ONCE(execlists->pending[0], NULL); 2131 } else { 2132 GEM_BUG_ON(!*execlists->active); 2133 2134 /* port0 completed, advanced to port1 */ 2135 trace_ports(execlists, "completed", execlists->active); 2136 2137 /* 2138 * We rely on the hardware being strongly 2139 * ordered: the breadcrumb write is coherent 2140 * (visible from the CPU) before the user 2141 * interrupt is raised and the CSB event processed. 2142 */ 2143 GEM_BUG_ON(!i915_request_completed(*execlists->active) && 2144 !reset_in_progress(execlists)); 2145 execlists_schedule_out(*execlists->active++); 2146 2147 GEM_BUG_ON(execlists->active - execlists->inflight > 2148 execlists_num_ports(execlists)); 2149 } 2150 } while (head != tail); 2151 2152 execlists->csb_head = head; 2153 2154 /* 2155 * Gen11 has proven to fail wrt the global observation point between 2156 * entry and tail update, failing on the ordering and thus 2157 * we see an old entry in the context status buffer. 2158 * 2159 * Forcibly evict the entries ahead of the next gpu csb update, 2160 * to increase the odds that we get fresh entries even with 2161 * non-working hardware. The cost of doing so comes out mostly in 2162 * the wash as the hardware, working or not, will need to do the 2163 * invalidation beforehand.
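 *
 * (The eviction is performed once per process_csb() pass, after all
 * events have been drained; reset_csb_pointers() does the same after
 * a reset.)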
2164 */ 2165 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]); 2166 } 2167 2168 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine) 2169 { 2170 lockdep_assert_held(&engine->active.lock); 2171 if (!engine->execlists.pending[0]) { 2172 rcu_read_lock(); /* protect peeking at execlists->active */ 2173 execlists_dequeue(engine); 2174 rcu_read_unlock(); 2175 } 2176 } 2177 2178 static noinline void preempt_reset(struct intel_engine_cs *engine) 2179 { 2180 const unsigned int bit = I915_RESET_ENGINE + engine->id; 2181 unsigned long *lock = &engine->gt->reset.flags; 2182 2183 if (i915_modparams.reset < 3) 2184 return; 2185 2186 if (test_and_set_bit(bit, lock)) 2187 return; 2188 2189 /* Mark this tasklet as disabled to avoid waiting for it to complete */ 2190 tasklet_disable_nosync(&engine->execlists.tasklet); 2191 2192 GEM_TRACE("%s: preempt timeout %lu+%ums\n", 2193 engine->name, 2194 READ_ONCE(engine->props.preempt_timeout_ms), 2195 jiffies_to_msecs(jiffies - engine->execlists.preempt.expires)); 2196 intel_engine_reset(engine, "preemption time out"); 2197 2198 tasklet_enable(&engine->execlists.tasklet); 2199 clear_and_wake_up_bit(bit, lock); 2200 } 2201 2202 static bool preempt_timeout(const struct intel_engine_cs *const engine) 2203 { 2204 const struct timer_list *t = &engine->execlists.preempt; 2205 2206 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT) 2207 return false; 2208 2209 if (!timer_expired(t)) 2210 return false; 2211 2212 return READ_ONCE(engine->execlists.pending[0]); 2213 } 2214 2215 /* 2216 * Check the unread Context Status Buffers and manage the submission of new 2217 * contexts to the ELSP accordingly. 2218 */ 2219 static void execlists_submission_tasklet(unsigned long data) 2220 { 2221 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 2222 bool timeout = preempt_timeout(engine); 2223 2224 process_csb(engine); 2225 if (!READ_ONCE(engine->execlists.pending[0]) || timeout) { 2226 unsigned long flags; 2227 2228 spin_lock_irqsave(&engine->active.lock, flags); 2229 __execlists_submission_tasklet(engine); 2230 spin_unlock_irqrestore(&engine->active.lock, flags); 2231 2232 /* Recheck after serialising with direct-submission */ 2233 if (timeout && preempt_timeout(engine)) 2234 preempt_reset(engine); 2235 } 2236 } 2237 2238 static void __execlists_kick(struct intel_engine_execlists *execlists) 2239 { 2240 /* Kick the tasklet for some interrupt coalescing and reset handling */ 2241 tasklet_hi_schedule(&execlists->tasklet); 2242 } 2243 2244 #define execlists_kick(t, member) \ 2245 __execlists_kick(container_of(t, struct intel_engine_execlists, member)) 2246 2247 static void execlists_timeslice(struct timer_list *timer) 2248 { 2249 execlists_kick(timer, timer); 2250 } 2251 2252 static void execlists_preempt(struct timer_list *timer) 2253 { 2254 execlists_kick(timer, preempt); 2255 } 2256 2257 static void queue_request(struct intel_engine_cs *engine, 2258 struct i915_sched_node *node, 2259 int prio) 2260 { 2261 GEM_BUG_ON(!list_empty(&node->link)); 2262 list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio)); 2263 } 2264 2265 static void __submit_queue_imm(struct intel_engine_cs *engine) 2266 { 2267 struct intel_engine_execlists * const execlists = &engine->execlists; 2268 2269 if (reset_in_progress(execlists)) 2270 return; /* defer until we restart the engine following reset */ 2271 2272 if (execlists->tasklet.func == execlists_submission_tasklet) 2273 __execlists_submission_tasklet(engine); 2274 else 2275 
tasklet_hi_schedule(&execlists->tasklet); 2276 } 2277 2278 static void submit_queue(struct intel_engine_cs *engine, 2279 const struct i915_request *rq) 2280 { 2281 struct intel_engine_execlists *execlists = &engine->execlists; 2282 2283 if (rq_prio(rq) <= execlists->queue_priority_hint) 2284 return; 2285 2286 execlists->queue_priority_hint = rq_prio(rq); 2287 __submit_queue_imm(engine); 2288 } 2289 2290 static void execlists_submit_request(struct i915_request *request) 2291 { 2292 struct intel_engine_cs *engine = request->engine; 2293 unsigned long flags; 2294 2295 /* Will be called from irq-context when using foreign fences. */ 2296 spin_lock_irqsave(&engine->active.lock, flags); 2297 2298 queue_request(engine, &request->sched, rq_prio(request)); 2299 2300 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 2301 GEM_BUG_ON(list_empty(&request->sched.link)); 2302 2303 submit_queue(engine, request); 2304 2305 spin_unlock_irqrestore(&engine->active.lock, flags); 2306 } 2307 2308 static void __execlists_context_fini(struct intel_context *ce) 2309 { 2310 intel_ring_put(ce->ring); 2311 i915_vma_put(ce->state); 2312 } 2313 2314 static void execlists_context_destroy(struct kref *kref) 2315 { 2316 struct intel_context *ce = container_of(kref, typeof(*ce), ref); 2317 2318 GEM_BUG_ON(!i915_active_is_idle(&ce->active)); 2319 GEM_BUG_ON(intel_context_is_pinned(ce)); 2320 2321 if (ce->state) 2322 __execlists_context_fini(ce); 2323 2324 intel_context_fini(ce); 2325 intel_context_free(ce); 2326 } 2327 2328 static void 2329 set_redzone(void *vaddr, const struct intel_engine_cs *engine) 2330 { 2331 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 2332 return; 2333 2334 vaddr += engine->context_size; 2335 2336 memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE); 2337 } 2338 2339 static void 2340 check_redzone(const void *vaddr, const struct intel_engine_cs *engine) 2341 { 2342 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 2343 return; 2344 2345 vaddr += engine->context_size; 2346 2347 if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE)) 2348 dev_err_once(engine->i915->drm.dev, 2349 "%s context redzone overwritten!\n", 2350 engine->name); 2351 } 2352 2353 static void execlists_context_unpin(struct intel_context *ce) 2354 { 2355 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE, 2356 ce->engine); 2357 2358 i915_gem_object_unpin_map(ce->state->obj); 2359 intel_ring_reset(ce->ring, ce->ring->tail); 2360 } 2361 2362 static void 2363 __execlists_update_reg_state(const struct intel_context *ce, 2364 const struct intel_engine_cs *engine) 2365 { 2366 struct intel_ring *ring = ce->ring; 2367 u32 *regs = ce->lrc_reg_state; 2368 2369 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head)); 2370 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); 2371 2372 regs[CTX_RING_BUFFER_START] = i915_ggtt_offset(ring->vma); 2373 regs[CTX_RING_HEAD] = ring->head; 2374 regs[CTX_RING_TAIL] = ring->tail; 2375 2376 /* RPCS */ 2377 if (engine->class == RENDER_CLASS) { 2378 regs[CTX_R_PWR_CLK_STATE] = 2379 intel_sseu_make_rpcs(engine->i915, &ce->sseu); 2380 2381 i915_oa_init_reg_state(ce, engine); 2382 } 2383 } 2384 2385 static int 2386 __execlists_context_pin(struct intel_context *ce, 2387 struct intel_engine_cs *engine) 2388 { 2389 void *vaddr; 2390 int ret; 2391 2392 GEM_BUG_ON(!ce->state); 2393 2394 ret = intel_context_active_acquire(ce); 2395 if (ret) 2396 goto err; 2397 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 2398 2399 vaddr = i915_gem_object_pin_map(ce->state->obj, 2400 
i915_coherent_map_type(engine->i915) | 2401 I915_MAP_OVERRIDE); 2402 if (IS_ERR(vaddr)) { 2403 ret = PTR_ERR(vaddr); 2404 goto unpin_active; 2405 } 2406 2407 ce->lrc_desc = lrc_descriptor(ce, engine); 2408 ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE; 2409 __execlists_update_reg_state(ce, engine); 2410 2411 return 0; 2412 2413 unpin_active: 2414 intel_context_active_release(ce); 2415 err: 2416 return ret; 2417 } 2418 2419 static int execlists_context_pin(struct intel_context *ce) 2420 { 2421 return __execlists_context_pin(ce, ce->engine); 2422 } 2423 2424 static int execlists_context_alloc(struct intel_context *ce) 2425 { 2426 return __execlists_context_alloc(ce, ce->engine); 2427 } 2428 2429 static void execlists_context_reset(struct intel_context *ce) 2430 { 2431 /* 2432 * Because we emit WA_TAIL_DWORDS there may be a disparity 2433 * between our bookkeeping in ce->ring->head and ce->ring->tail and 2434 * that stored in context. As we only write new commands from 2435 * ce->ring->tail onwards, everything before that is junk. If the GPU 2436 * starts reading from its RING_HEAD from the context, it may try to 2437 * execute that junk and die. 2438 * 2439 * The contexts that are stilled pinned on resume belong to the 2440 * kernel, and are local to each engine. All other contexts will 2441 * have their head/tail sanitized upon pinning before use, so they 2442 * will never see garbage, 2443 * 2444 * So to avoid that we reset the context images upon resume. For 2445 * simplicity, we just zero everything out. 2446 */ 2447 intel_ring_reset(ce->ring, 0); 2448 __execlists_update_reg_state(ce, ce->engine); 2449 } 2450 2451 static const struct intel_context_ops execlists_context_ops = { 2452 .alloc = execlists_context_alloc, 2453 2454 .pin = execlists_context_pin, 2455 .unpin = execlists_context_unpin, 2456 2457 .enter = intel_context_enter_engine, 2458 .exit = intel_context_exit_engine, 2459 2460 .reset = execlists_context_reset, 2461 .destroy = execlists_context_destroy, 2462 }; 2463 2464 static int gen8_emit_init_breadcrumb(struct i915_request *rq) 2465 { 2466 u32 *cs; 2467 2468 GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb); 2469 2470 cs = intel_ring_begin(rq, 6); 2471 if (IS_ERR(cs)) 2472 return PTR_ERR(cs); 2473 2474 /* 2475 * Check if we have been preempted before we even get started. 2476 * 2477 * After this point i915_request_started() reports true, even if 2478 * we get preempted and so are no longer running. 2479 */ 2480 *cs++ = MI_ARB_CHECK; 2481 *cs++ = MI_NOOP; 2482 2483 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 2484 *cs++ = i915_request_timeline(rq)->hwsp_offset; 2485 *cs++ = 0; 2486 *cs++ = rq->fence.seqno - 1; 2487 2488 intel_ring_advance(rq, cs); 2489 2490 /* Record the updated position of the request's payload */ 2491 rq->infix = intel_ring_offset(rq, cs); 2492 2493 return 0; 2494 } 2495 2496 static int execlists_request_alloc(struct i915_request *request) 2497 { 2498 int ret; 2499 2500 GEM_BUG_ON(!intel_context_is_pinned(request->hw_context)); 2501 2502 /* 2503 * Flush enough space to reduce the likelihood of waiting after 2504 * we start building the request - in which case we will just 2505 * have to repeat work. 2506 */ 2507 request->reserved_space += EXECLISTS_REQUEST_SIZE; 2508 2509 /* 2510 * Note that after this point, we have committed to using 2511 * this request as it is being used to both track the 2512 * state of engine initialisation and liveness of the 2513 * golden renderstate above. 
Think twice before you try 2514 * to cancel/unwind this request now. 2515 */ 2516 2517 /* Unconditionally invalidate GPU caches and TLBs. */ 2518 ret = request->engine->emit_flush(request, EMIT_INVALIDATE); 2519 if (ret) 2520 return ret; 2521 2522 request->reserved_space -= EXECLISTS_REQUEST_SIZE; 2523 return 0; 2524 } 2525 2526 /* 2527 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the 2528 * PIPE_CONTROL instruction. This is required for the flush to happen correctly, 2529 * but there is a slight complication as this is applied in the WA batch where the 2530 * values are only initialized once so we cannot take the register value at the 2531 * beginning and reuse it further; hence we save its value to memory, upload a 2532 * constant value with bit21 set and then restore it with the saved value. 2533 * To simplify the WA, a constant value is formed by using the default value 2534 * of this register. This shouldn't be a problem because we are only modifying 2535 * it for a short period and this batch is non-preemptible. We could of course 2536 * use additional instructions that read the actual value of the register 2537 * at that time and set our bit of interest, but that makes the WA complicated. 2538 * 2539 * This WA is also required for Gen9, so extracting it as a function avoids 2540 * code duplication. 2541 */ 2542 static u32 * 2543 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch) 2544 { 2545 /* NB no one else is allowed to scribble over scratch + 256! */ 2546 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 2547 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 2548 *batch++ = intel_gt_scratch_offset(engine->gt, 2549 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 2550 *batch++ = 0; 2551 2552 *batch++ = MI_LOAD_REGISTER_IMM(1); 2553 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 2554 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES; 2555 2556 batch = gen8_emit_pipe_control(batch, 2557 PIPE_CONTROL_CS_STALL | 2558 PIPE_CONTROL_DC_FLUSH_ENABLE, 2559 0); 2560 2561 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 2562 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 2563 *batch++ = intel_gt_scratch_offset(engine->gt, 2564 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 2565 *batch++ = 0; 2566 2567 return batch; 2568 } 2569 2570 /* 2571 * Typically we only have one indirect_ctx and per_ctx batch buffer which are 2572 * initialized at the beginning and shared across all contexts but this field 2573 * helps us to have multiple batches at different offsets and select them based 2574 * on some criteria. At the moment this batch always starts at the beginning of the page 2575 * and at this point we don't have multiple wa_ctx batch buffers. 2576 * 2577 * The number of WAs applied is not known at the beginning; we use this field 2578 * to return the number of DWORDs written. 2579 * 2580 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END, 2581 * so it adds NOOPs as padding to make it cacheline aligned. 2582 * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them together 2583 * make a complete batch buffer.
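 *
 * For illustration, the single CTX_WA_BB_OBJ_SIZE page set up by
 * intel_init_workaround_bb() ends up laid out roughly as
 *
 *	[ indirect_ctx batch | MI_NOOP pad to cacheline | per_ctx batch ]
 *
 * with wa_bb[i]->offset/size recorded for each (the per_ctx batch is
 * currently empty for the gens handled below).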
2584 */ 2585 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 2586 { 2587 /* WaDisableCtxRestoreArbitration:bdw,chv */ 2588 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 2589 2590 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ 2591 if (IS_BROADWELL(engine->i915)) 2592 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 2593 2594 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ 2595 /* Actual scratch location is at 128 bytes offset */ 2596 batch = gen8_emit_pipe_control(batch, 2597 PIPE_CONTROL_FLUSH_L3 | 2598 PIPE_CONTROL_STORE_DATA_INDEX | 2599 PIPE_CONTROL_CS_STALL | 2600 PIPE_CONTROL_QW_WRITE, 2601 LRC_PPHWSP_SCRATCH_ADDR); 2602 2603 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 2604 2605 /* Pad to end of cacheline */ 2606 while ((unsigned long)batch % CACHELINE_BYTES) 2607 *batch++ = MI_NOOP; 2608 2609 /* 2610 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because 2611 * execution depends on the length specified in terms of cache lines 2612 * in the register CTX_RCS_INDIRECT_CTX 2613 */ 2614 2615 return batch; 2616 } 2617 2618 struct lri { 2619 i915_reg_t reg; 2620 u32 value; 2621 }; 2622 2623 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count) 2624 { 2625 GEM_BUG_ON(!count || count > 63); 2626 2627 *batch++ = MI_LOAD_REGISTER_IMM(count); 2628 do { 2629 *batch++ = i915_mmio_reg_offset(lri->reg); 2630 *batch++ = lri->value; 2631 } while (lri++, --count); 2632 *batch++ = MI_NOOP; 2633 2634 return batch; 2635 } 2636 2637 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 2638 { 2639 static const struct lri lri[] = { 2640 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */ 2641 { 2642 COMMON_SLICE_CHICKEN2, 2643 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE, 2644 0), 2645 }, 2646 2647 /* BSpec: 11391 */ 2648 { 2649 FF_SLICE_CHICKEN, 2650 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, 2651 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), 2652 }, 2653 2654 /* BSpec: 11299 */ 2655 { 2656 _3D_CHICKEN3, 2657 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, 2658 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), 2659 } 2660 }; 2661 2662 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 2663 2664 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ 2665 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 2666 2667 batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); 2668 2669 /* WaMediaPoolStateCmdInWABB:bxt,glk */ 2670 if (HAS_POOLED_EU(engine->i915)) { 2671 /* 2672 * EU pool configuration is setup along with golden context 2673 * during context initialization. This value depends on 2674 * device type (2x6 or 3x6) and needs to be updated based 2675 * on which subslice is disabled especially for 2x6 2676 * devices, however it is safe to load default 2677 * configuration of 3x6 device instead of masking off 2678 * corresponding bits because HW ignores bits of a disabled 2679 * subslice and drops down to appropriate config. Please 2680 * see render_state_setup() in i915_gem_render_state.c for 2681 * possible configurations, to avoid duplication they are 2682 * not shown here again. 
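 * (The MEDIA_POOL_STATE packet emitted below loads that default 3x6
 * configuration.)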
2683 */ 2684 *batch++ = GEN9_MEDIA_POOL_STATE; 2685 *batch++ = GEN9_MEDIA_POOL_ENABLE; 2686 *batch++ = 0x00777000; 2687 *batch++ = 0; 2688 *batch++ = 0; 2689 *batch++ = 0; 2690 } 2691 2692 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 2693 2694 /* Pad to end of cacheline */ 2695 while ((unsigned long)batch % CACHELINE_BYTES) 2696 *batch++ = MI_NOOP; 2697 2698 return batch; 2699 } 2700 2701 static u32 * 2702 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 2703 { 2704 int i; 2705 2706 /* 2707 * WaPipeControlBefore3DStateSamplePattern: cnl 2708 * 2709 * Ensure the engine is idle prior to programming a 2710 * 3DSTATE_SAMPLE_PATTERN during a context restore. 2711 */ 2712 batch = gen8_emit_pipe_control(batch, 2713 PIPE_CONTROL_CS_STALL, 2714 0); 2715 /* 2716 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for 2717 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in 2718 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is 2719 * confusing. Since gen8_emit_pipe_control() already advances the 2720 * batch by 6 dwords, we advance the other 10 here, completing a 2721 * cacheline. It's not clear if the workaround requires this padding 2722 * before other commands, or if it's just the regular padding we would 2723 * already have for the workaround bb, so leave it here for now. 2724 */ 2725 for (i = 0; i < 10; i++) 2726 *batch++ = MI_NOOP; 2727 2728 /* Pad to end of cacheline */ 2729 while ((unsigned long)batch % CACHELINE_BYTES) 2730 *batch++ = MI_NOOP; 2731 2732 return batch; 2733 } 2734 2735 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE) 2736 2737 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) 2738 { 2739 struct drm_i915_gem_object *obj; 2740 struct i915_vma *vma; 2741 int err; 2742 2743 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE); 2744 if (IS_ERR(obj)) 2745 return PTR_ERR(obj); 2746 2747 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 2748 if (IS_ERR(vma)) { 2749 err = PTR_ERR(vma); 2750 goto err; 2751 } 2752 2753 err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH); 2754 if (err) 2755 goto err; 2756 2757 engine->wa_ctx.vma = vma; 2758 return 0; 2759 2760 err: 2761 i915_gem_object_put(obj); 2762 return err; 2763 } 2764 2765 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine) 2766 { 2767 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 2768 } 2769 2770 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); 2771 2772 static int intel_init_workaround_bb(struct intel_engine_cs *engine) 2773 { 2774 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 2775 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx, 2776 &wa_ctx->per_ctx }; 2777 wa_bb_func_t wa_bb_fn[2]; 2778 struct page *page; 2779 void *batch, *batch_ptr; 2780 unsigned int i; 2781 int ret; 2782 2783 if (engine->class != RENDER_CLASS) 2784 return 0; 2785 2786 switch (INTEL_GEN(engine->i915)) { 2787 case 12: 2788 case 11: 2789 return 0; 2790 case 10: 2791 wa_bb_fn[0] = gen10_init_indirectctx_bb; 2792 wa_bb_fn[1] = NULL; 2793 break; 2794 case 9: 2795 wa_bb_fn[0] = gen9_init_indirectctx_bb; 2796 wa_bb_fn[1] = NULL; 2797 break; 2798 case 8: 2799 wa_bb_fn[0] = gen8_init_indirectctx_bb; 2800 wa_bb_fn[1] = NULL; 2801 break; 2802 default: 2803 MISSING_CASE(INTEL_GEN(engine->i915)); 2804 return 0; 2805 } 2806 2807 ret = lrc_setup_wa_ctx(engine); 2808 if (ret) { 2809 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret); 2810 return ret; 2811 } 2812 2813 page = 
i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0); 2814 batch = batch_ptr = kmap_atomic(page); 2815 2816 /* 2817 * Emit the two workaround batch buffers, recording the offset from the 2818 * start of the workaround batch buffer object for each and their 2819 * respective sizes. 2820 */ 2821 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { 2822 wa_bb[i]->offset = batch_ptr - batch; 2823 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, 2824 CACHELINE_BYTES))) { 2825 ret = -EINVAL; 2826 break; 2827 } 2828 if (wa_bb_fn[i]) 2829 batch_ptr = wa_bb_fn[i](engine, batch_ptr); 2830 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); 2831 } 2832 2833 BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE); 2834 2835 kunmap_atomic(batch); 2836 if (ret) 2837 lrc_destroy_wa_ctx(engine); 2838 2839 return ret; 2840 } 2841 2842 static void enable_execlists(struct intel_engine_cs *engine) 2843 { 2844 u32 mode; 2845 2846 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL); 2847 2848 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */ 2849 2850 if (INTEL_GEN(engine->i915) >= 11) 2851 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE); 2852 else 2853 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE); 2854 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode); 2855 2856 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); 2857 2858 ENGINE_WRITE_FW(engine, 2859 RING_HWS_PGA, 2860 i915_ggtt_offset(engine->status_page.vma)); 2861 ENGINE_POSTING_READ(engine, RING_HWS_PGA); 2862 } 2863 2864 static bool unexpected_starting_state(struct intel_engine_cs *engine) 2865 { 2866 bool unexpected = false; 2867 2868 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) { 2869 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n"); 2870 unexpected = true; 2871 } 2872 2873 return unexpected; 2874 } 2875 2876 static int execlists_resume(struct intel_engine_cs *engine) 2877 { 2878 intel_engine_apply_workarounds(engine); 2879 intel_engine_apply_whitelist(engine); 2880 2881 intel_mocs_init_engine(engine); 2882 2883 intel_engine_reset_breadcrumbs(engine); 2884 2885 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) { 2886 struct drm_printer p = drm_debug_printer(__func__); 2887 2888 intel_engine_dump(engine, &p, NULL); 2889 } 2890 2891 enable_execlists(engine); 2892 2893 return 0; 2894 } 2895 2896 static void execlists_reset_prepare(struct intel_engine_cs *engine) 2897 { 2898 struct intel_engine_execlists * const execlists = &engine->execlists; 2899 unsigned long flags; 2900 2901 GEM_TRACE("%s: depth<-%d\n", engine->name, 2902 atomic_read(&execlists->tasklet.count)); 2903 2904 /* 2905 * Prevent request submission to the hardware until we have 2906 * completed the reset in i915_gem_reset_finish(). If a request 2907 * is completed by one engine, it may then queue a request 2908 * to a second via its execlists->tasklet *just* as we are 2909 * calling engine->resume() and also writing the ELSP. 2910 * Turning off the execlists->tasklet until the reset is over 2911 * prevents the race. 2912 */ 2913 __tasklet_disable_sync_once(&execlists->tasklet); 2914 GEM_BUG_ON(!reset_in_progress(execlists)); 2915 2916 /* And flush any current direct submission. */ 2917 spin_lock_irqsave(&engine->active.lock, flags); 2918 spin_unlock_irqrestore(&engine->active.lock, flags); 2919 2920 /* 2921 * We stop engines, otherwise we might get failed reset and a 2922 * dead gpu (on elk). 
Also as modern gpu as kbl can suffer 2923 * from system hang if batchbuffer is progressing when 2924 * the reset is issued, regardless of READY_TO_RESET ack. 2925 * Thus assume it is best to stop engines on all gens 2926 * where we have a gpu reset. 2927 * 2928 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) 2929 * 2930 * FIXME: Wa for more modern gens needs to be validated 2931 */ 2932 intel_engine_stop_cs(engine); 2933 } 2934 2935 static void reset_csb_pointers(struct intel_engine_cs *engine) 2936 { 2937 struct intel_engine_execlists * const execlists = &engine->execlists; 2938 const unsigned int reset_value = execlists->csb_size - 1; 2939 2940 ring_set_paused(engine, 0); 2941 2942 /* 2943 * After a reset, the HW starts writing into CSB entry [0]. We 2944 * therefore have to set our HEAD pointer back one entry so that 2945 * the *first* entry we check is entry 0. To complicate this further, 2946 * as we don't wait for the first interrupt after reset, we have to 2947 * fake the HW write to point back to the last entry so that our 2948 * inline comparison of our cached head position against the last HW 2949 * write works even before the first interrupt. 2950 */ 2951 execlists->csb_head = reset_value; 2952 WRITE_ONCE(*execlists->csb_write, reset_value); 2953 wmb(); /* Make sure this is visible to HW (paranoia?) */ 2954 2955 invalidate_csb_entries(&execlists->csb_status[0], 2956 &execlists->csb_status[reset_value]); 2957 } 2958 2959 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine) 2960 { 2961 if (INTEL_GEN(engine->i915) >= 12) 2962 return 0x60; 2963 else if (INTEL_GEN(engine->i915) >= 9) 2964 return 0x54; 2965 else if (engine->class == RENDER_CLASS) 2966 return 0x58; 2967 else 2968 return -1; 2969 } 2970 2971 static void __execlists_reset_reg_state(const struct intel_context *ce, 2972 const struct intel_engine_cs *engine) 2973 { 2974 u32 *regs = ce->lrc_reg_state; 2975 int x; 2976 2977 x = lrc_ring_mi_mode(engine); 2978 if (x != -1) { 2979 regs[x + 1] &= ~STOP_RING; 2980 regs[x + 1] |= STOP_RING << 16; 2981 } 2982 } 2983 2984 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) 2985 { 2986 struct intel_engine_execlists * const execlists = &engine->execlists; 2987 struct intel_context *ce; 2988 struct i915_request *rq; 2989 2990 mb(); /* paranoia: read the CSB pointers from after the reset */ 2991 clflush(execlists->csb_write); 2992 mb(); 2993 2994 process_csb(engine); /* drain preemption events */ 2995 2996 /* Following the reset, we need to reload the CSB read/write pointers */ 2997 reset_csb_pointers(engine); 2998 2999 /* 3000 * Save the currently executing context, even if we completed 3001 * its request, it was still running at the time of the 3002 * reset and will have been clobbered. 3003 */ 3004 rq = execlists_active(execlists); 3005 if (!rq) 3006 goto unwind; 3007 3008 /* We still have requests in-flight; the engine should be active */ 3009 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 3010 3011 ce = rq->hw_context; 3012 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 3013 3014 if (i915_request_completed(rq)) { 3015 /* Idle context; tidy up the ring so we can restart afresh */ 3016 ce->ring->head = intel_ring_wrap(ce->ring, rq->tail); 3017 goto out_replay; 3018 } 3019 3020 /* Context has requests still in-flight; it should not be idle! 
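 * Rewind the ring back to the head of the active request (found via
 * active_request()) so that it can be replayed from its start when we
 * restart the engine.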
*/ 3021 GEM_BUG_ON(i915_active_is_idle(&ce->active)); 3022 rq = active_request(ce->timeline, rq); 3023 ce->ring->head = intel_ring_wrap(ce->ring, rq->head); 3024 GEM_BUG_ON(ce->ring->head == ce->ring->tail); 3025 3026 /* 3027 * If this request hasn't started yet, e.g. it is waiting on a 3028 * semaphore, we need to avoid skipping the request or else we 3029 * break the signaling chain. However, if the context is corrupt 3030 * the request will not restart and we will be stuck with a wedged 3031 * device. It is quite often the case that if we issue a reset 3032 * while the GPU is loading the context image, that the context 3033 * image becomes corrupt. 3034 * 3035 * Otherwise, if we have not started yet, the request should replay 3036 * perfectly and we do not need to flag the result as being erroneous. 3037 */ 3038 if (!i915_request_started(rq)) 3039 goto out_replay; 3040 3041 /* 3042 * If the request was innocent, we leave the request in the ELSP 3043 * and will try to replay it on restarting. The context image may 3044 * have been corrupted by the reset, in which case we may have 3045 * to service a new GPU hang, but more likely we can continue on 3046 * without impact. 3047 * 3048 * If the request was guilty, we presume the context is corrupt 3049 * and have to at least restore the RING register in the context 3050 * image back to the expected values to skip over the guilty request. 3051 */ 3052 __i915_request_reset(rq, stalled); 3053 if (!stalled) 3054 goto out_replay; 3055 3056 /* 3057 * We want a simple context + ring to execute the breadcrumb update. 3058 * We cannot rely on the context being intact across the GPU hang, 3059 * so clear it and rebuild just what we need for the breadcrumb. 3060 * All pending requests for this context will be zapped, and any 3061 * future request will be after userspace has had the opportunity 3062 * to recreate its own state. 3063 */ 3064 GEM_BUG_ON(!intel_context_is_pinned(ce)); 3065 restore_default_state(ce, engine); 3066 3067 out_replay: 3068 GEM_TRACE("%s replay {head:%04x, tail:%04x}\n", 3069 engine->name, ce->ring->head, ce->ring->tail); 3070 intel_ring_update_space(ce->ring); 3071 __execlists_reset_reg_state(ce, engine); 3072 __execlists_update_reg_state(ce, engine); 3073 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */ 3074 3075 unwind: 3076 /* Push back any incomplete requests for replay after the reset. */ 3077 cancel_port_requests(execlists); 3078 __unwind_incomplete_requests(engine); 3079 } 3080 3081 static void execlists_reset(struct intel_engine_cs *engine, bool stalled) 3082 { 3083 unsigned long flags; 3084 3085 GEM_TRACE("%s\n", engine->name); 3086 3087 spin_lock_irqsave(&engine->active.lock, flags); 3088 3089 __execlists_reset(engine, stalled); 3090 3091 spin_unlock_irqrestore(&engine->active.lock, flags); 3092 } 3093 3094 static void nop_submission_tasklet(unsigned long data) 3095 { 3096 /* The driver is wedged; don't process any more events. */ 3097 } 3098 3099 static void execlists_cancel_requests(struct intel_engine_cs *engine) 3100 { 3101 struct intel_engine_execlists * const execlists = &engine->execlists; 3102 struct i915_request *rq, *rn; 3103 struct rb_node *rb; 3104 unsigned long flags; 3105 3106 GEM_TRACE("%s\n", engine->name); 3107 3108 /* 3109 * Before we call engine->cancel_requests(), we should have exclusive 3110 * access to the submission state. 
This is arranged for us by the 3111 * caller disabling the interrupt generation, the tasklet and other 3112 * threads that may then access the same state, giving us a free hand 3113 * to reset state. However, we still need to let lockdep be aware that 3114 * we know this state may be accessed in hardirq context, so we 3115 * disable the irq around this manipulation and we want to keep 3116 * the spinlock focused on its duties and not accidentally conflate 3117 * coverage to the submission's irq state. (Similarly, although we 3118 * shouldn't need to disable irq around the manipulation of the 3119 * submission's irq state, we also wish to remind ourselves that 3120 * it is irq state.) 3121 */ 3122 spin_lock_irqsave(&engine->active.lock, flags); 3123 3124 __execlists_reset(engine, true); 3125 3126 /* Mark all executing requests as skipped. */ 3127 list_for_each_entry(rq, &engine->active.requests, sched.link) 3128 mark_eio(rq); 3129 3130 /* Flush the queued requests to the timeline list (for retiring). */ 3131 while ((rb = rb_first_cached(&execlists->queue))) { 3132 struct i915_priolist *p = to_priolist(rb); 3133 int i; 3134 3135 priolist_for_each_request_consume(rq, rn, p, i) { 3136 mark_eio(rq); 3137 __i915_request_submit(rq); 3138 } 3139 3140 rb_erase_cached(&p->node, &execlists->queue); 3141 i915_priolist_free(p); 3142 } 3143 3144 /* Cancel all attached virtual engines */ 3145 while ((rb = rb_first_cached(&execlists->virtual))) { 3146 struct virtual_engine *ve = 3147 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 3148 3149 rb_erase_cached(rb, &execlists->virtual); 3150 RB_CLEAR_NODE(rb); 3151 3152 spin_lock(&ve->base.active.lock); 3153 rq = fetch_and_zero(&ve->request); 3154 if (rq) { 3155 mark_eio(rq); 3156 3157 rq->engine = engine; 3158 __i915_request_submit(rq); 3159 i915_request_put(rq); 3160 3161 ve->base.execlists.queue_priority_hint = INT_MIN; 3162 } 3163 spin_unlock(&ve->base.active.lock); 3164 } 3165 3166 /* Remaining _unready_ requests will be nop'ed when submitted */ 3167 3168 execlists->queue_priority_hint = INT_MIN; 3169 execlists->queue = RB_ROOT_CACHED; 3170 3171 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet)); 3172 execlists->tasklet.func = nop_submission_tasklet; 3173 3174 spin_unlock_irqrestore(&engine->active.lock, flags); 3175 } 3176 3177 static void execlists_reset_finish(struct intel_engine_cs *engine) 3178 { 3179 struct intel_engine_execlists * const execlists = &engine->execlists; 3180 3181 /* 3182 * After a GPU reset, we may have requests to replay. Do so now while 3183 * we still have the forcewake to be sure that the GPU is not allowed 3184 * to sleep before we restart and reload a context. 3185 */ 3186 GEM_BUG_ON(!reset_in_progress(execlists)); 3187 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root)) 3188 execlists->tasklet.func(execlists->tasklet.data); 3189 3190 if (__tasklet_enable(&execlists->tasklet)) 3191 /* And kick in case we missed a new request submission. 
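 * (While the reset was in progress, __submit_queue_imm() deferred any
 * direct submission, so new requests may be sitting in the queue
 * waiting for this kick.)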
*/ 3192 tasklet_hi_schedule(&execlists->tasklet); 3193 GEM_TRACE("%s: depth->%d\n", engine->name, 3194 atomic_read(&execlists->tasklet.count)); 3195 } 3196 3197 static int gen8_emit_bb_start(struct i915_request *rq, 3198 u64 offset, u32 len, 3199 const unsigned int flags) 3200 { 3201 u32 *cs; 3202 3203 cs = intel_ring_begin(rq, 4); 3204 if (IS_ERR(cs)) 3205 return PTR_ERR(cs); 3206 3207 /* 3208 * WaDisableCtxRestoreArbitration:bdw,chv 3209 * 3210 * We don't need to perform MI_ARB_ENABLE as often as we do (in 3211 * particular all the gen that do not need the w/a at all!), if we 3212 * took care to make sure that on every switch into this context 3213 * (both ordinary and for preemption) that arbitrartion was enabled 3214 * we would be fine. However, for gen8 there is another w/a that 3215 * requires us to not preempt inside GPGPU execution, so we keep 3216 * arbitration disabled for gen8 batches. Arbitration will be 3217 * re-enabled before we close the request 3218 * (engine->emit_fini_breadcrumb). 3219 */ 3220 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3221 3222 /* FIXME(BDW+): Address space and security selectors. */ 3223 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 3224 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 3225 *cs++ = lower_32_bits(offset); 3226 *cs++ = upper_32_bits(offset); 3227 3228 intel_ring_advance(rq, cs); 3229 3230 return 0; 3231 } 3232 3233 static int gen9_emit_bb_start(struct i915_request *rq, 3234 u64 offset, u32 len, 3235 const unsigned int flags) 3236 { 3237 u32 *cs; 3238 3239 cs = intel_ring_begin(rq, 6); 3240 if (IS_ERR(cs)) 3241 return PTR_ERR(cs); 3242 3243 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3244 3245 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 3246 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 3247 *cs++ = lower_32_bits(offset); 3248 *cs++ = upper_32_bits(offset); 3249 3250 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3251 *cs++ = MI_NOOP; 3252 3253 intel_ring_advance(rq, cs); 3254 3255 return 0; 3256 } 3257 3258 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine) 3259 { 3260 ENGINE_WRITE(engine, RING_IMR, 3261 ~(engine->irq_enable_mask | engine->irq_keep_mask)); 3262 ENGINE_POSTING_READ(engine, RING_IMR); 3263 } 3264 3265 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine) 3266 { 3267 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); 3268 } 3269 3270 static int gen8_emit_flush(struct i915_request *request, u32 mode) 3271 { 3272 u32 cmd, *cs; 3273 3274 cs = intel_ring_begin(request, 4); 3275 if (IS_ERR(cs)) 3276 return PTR_ERR(cs); 3277 3278 cmd = MI_FLUSH_DW + 1; 3279 3280 /* We always require a command barrier so that subsequent 3281 * commands, such as breadcrumb interrupts, are strictly ordered 3282 * wrt the contents of the write cache being flushed to memory 3283 * (and thus being coherent from the CPU). 
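 * The post-sync operation below (a dummy zero written into the
 * per-context PPHWSP scratch slot) is what requests that barrier.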
3284 */ 3285 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 3286 3287 if (mode & EMIT_INVALIDATE) { 3288 cmd |= MI_INVALIDATE_TLB; 3289 if (request->engine->class == VIDEO_DECODE_CLASS) 3290 cmd |= MI_INVALIDATE_BSD; 3291 } 3292 3293 *cs++ = cmd; 3294 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 3295 *cs++ = 0; /* upper addr */ 3296 *cs++ = 0; /* value */ 3297 intel_ring_advance(request, cs); 3298 3299 return 0; 3300 } 3301 3302 static int gen8_emit_flush_render(struct i915_request *request, 3303 u32 mode) 3304 { 3305 bool vf_flush_wa = false, dc_flush_wa = false; 3306 u32 *cs, flags = 0; 3307 int len; 3308 3309 flags |= PIPE_CONTROL_CS_STALL; 3310 3311 if (mode & EMIT_FLUSH) { 3312 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 3313 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 3314 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 3315 flags |= PIPE_CONTROL_FLUSH_ENABLE; 3316 } 3317 3318 if (mode & EMIT_INVALIDATE) { 3319 flags |= PIPE_CONTROL_TLB_INVALIDATE; 3320 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 3321 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 3322 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 3323 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 3324 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 3325 flags |= PIPE_CONTROL_QW_WRITE; 3326 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3327 3328 /* 3329 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL 3330 * pipe control. 3331 */ 3332 if (IS_GEN(request->i915, 9)) 3333 vf_flush_wa = true; 3334 3335 /* WaForGAMHang:kbl */ 3336 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0)) 3337 dc_flush_wa = true; 3338 } 3339 3340 len = 6; 3341 3342 if (vf_flush_wa) 3343 len += 6; 3344 3345 if (dc_flush_wa) 3346 len += 12; 3347 3348 cs = intel_ring_begin(request, len); 3349 if (IS_ERR(cs)) 3350 return PTR_ERR(cs); 3351 3352 if (vf_flush_wa) 3353 cs = gen8_emit_pipe_control(cs, 0, 0); 3354 3355 if (dc_flush_wa) 3356 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE, 3357 0); 3358 3359 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3360 3361 if (dc_flush_wa) 3362 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0); 3363 3364 intel_ring_advance(request, cs); 3365 3366 return 0; 3367 } 3368 3369 static int gen11_emit_flush_render(struct i915_request *request, 3370 u32 mode) 3371 { 3372 if (mode & EMIT_FLUSH) { 3373 u32 *cs; 3374 u32 flags = 0; 3375 3376 flags |= PIPE_CONTROL_CS_STALL; 3377 3378 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 3379 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 3380 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 3381 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 3382 flags |= PIPE_CONTROL_FLUSH_ENABLE; 3383 flags |= PIPE_CONTROL_QW_WRITE; 3384 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3385 3386 cs = intel_ring_begin(request, 6); 3387 if (IS_ERR(cs)) 3388 return PTR_ERR(cs); 3389 3390 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3391 intel_ring_advance(request, cs); 3392 } 3393 3394 if (mode & EMIT_INVALIDATE) { 3395 u32 *cs; 3396 u32 flags = 0; 3397 3398 flags |= PIPE_CONTROL_CS_STALL; 3399 3400 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 3401 flags |= PIPE_CONTROL_TLB_INVALIDATE; 3402 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 3403 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 3404 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 3405 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 3406 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 3407 flags |= PIPE_CONTROL_QW_WRITE; 3408 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3409 3410 cs = intel_ring_begin(request, 6); 3411 if (IS_ERR(cs)) 
3412 return PTR_ERR(cs); 3413 3414 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3415 intel_ring_advance(request, cs); 3416 } 3417 3418 return 0; 3419 } 3420 3421 static u32 preparser_disable(bool state) 3422 { 3423 return MI_ARB_CHECK | 1 << 8 | state; 3424 } 3425 3426 static int gen12_emit_flush_render(struct i915_request *request, 3427 u32 mode) 3428 { 3429 if (mode & EMIT_FLUSH) { 3430 u32 flags = 0; 3431 u32 *cs; 3432 3433 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 3434 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 3435 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 3436 /* Wa_1409600907:tgl */ 3437 flags |= PIPE_CONTROL_DEPTH_STALL; 3438 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 3439 flags |= PIPE_CONTROL_FLUSH_ENABLE; 3440 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH; 3441 3442 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3443 flags |= PIPE_CONTROL_QW_WRITE; 3444 3445 flags |= PIPE_CONTROL_CS_STALL; 3446 3447 cs = intel_ring_begin(request, 6); 3448 if (IS_ERR(cs)) 3449 return PTR_ERR(cs); 3450 3451 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3452 intel_ring_advance(request, cs); 3453 } 3454 3455 if (mode & EMIT_INVALIDATE) { 3456 u32 flags = 0; 3457 u32 *cs; 3458 3459 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 3460 flags |= PIPE_CONTROL_TLB_INVALIDATE; 3461 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 3462 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 3463 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 3464 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 3465 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 3466 flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE; 3467 3468 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3469 flags |= PIPE_CONTROL_QW_WRITE; 3470 3471 flags |= PIPE_CONTROL_CS_STALL; 3472 3473 cs = intel_ring_begin(request, 8); 3474 if (IS_ERR(cs)) 3475 return PTR_ERR(cs); 3476 3477 /* 3478 * Prevent the pre-parser from skipping past the TLB 3479 * invalidate and loading a stale page for the batch 3480 * buffer / request payload. 3481 */ 3482 *cs++ = preparser_disable(true); 3483 3484 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3485 3486 *cs++ = preparser_disable(false); 3487 intel_ring_advance(request, cs); 3488 3489 /* 3490 * Wa_1604544889:tgl 3491 */ 3492 if (IS_TGL_REVID(request->i915, TGL_REVID_A0, TGL_REVID_A0)) { 3493 flags = 0; 3494 flags |= PIPE_CONTROL_CS_STALL; 3495 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH; 3496 3497 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3498 flags |= PIPE_CONTROL_QW_WRITE; 3499 3500 cs = intel_ring_begin(request, 6); 3501 if (IS_ERR(cs)) 3502 return PTR_ERR(cs); 3503 3504 cs = gen8_emit_pipe_control(cs, flags, 3505 LRC_PPHWSP_SCRATCH_ADDR); 3506 intel_ring_advance(request, cs); 3507 } 3508 } 3509 3510 return 0; 3511 } 3512 3513 /* 3514 * Reserve space for 2 NOOPs at the end of each request to be 3515 * used as a workaround for not being allowed to do lite 3516 * restore with HEAD==TAIL (WaIdleLiteRestore). 3517 */ 3518 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) 3519 { 3520 /* Ensure there's always at least one preemption point per-request. 
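 * (The two dwords below also double as the WaIdleLiteRestore padding
 * described above.)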
*/ 3521 *cs++ = MI_ARB_CHECK; 3522 *cs++ = MI_NOOP; 3523 request->wa_tail = intel_ring_offset(request, cs); 3524 3525 return cs; 3526 } 3527 3528 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs) 3529 { 3530 *cs++ = MI_SEMAPHORE_WAIT | 3531 MI_SEMAPHORE_GLOBAL_GTT | 3532 MI_SEMAPHORE_POLL | 3533 MI_SEMAPHORE_SAD_EQ_SDD; 3534 *cs++ = 0; 3535 *cs++ = intel_hws_preempt_address(request->engine); 3536 *cs++ = 0; 3537 3538 return cs; 3539 } 3540 3541 static __always_inline u32* 3542 gen8_emit_fini_breadcrumb_footer(struct i915_request *request, 3543 u32 *cs) 3544 { 3545 *cs++ = MI_USER_INTERRUPT; 3546 3547 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3548 if (intel_engine_has_semaphores(request->engine)) 3549 cs = emit_preempt_busywait(request, cs); 3550 3551 request->tail = intel_ring_offset(request, cs); 3552 assert_ring_tail_valid(request->ring, request->tail); 3553 3554 return gen8_emit_wa_tail(request, cs); 3555 } 3556 3557 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs) 3558 { 3559 cs = gen8_emit_ggtt_write(cs, 3560 request->fence.seqno, 3561 i915_request_active_timeline(request)->hwsp_offset, 3562 0); 3563 3564 return gen8_emit_fini_breadcrumb_footer(request, cs); 3565 } 3566 3567 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 3568 { 3569 cs = gen8_emit_pipe_control(cs, 3570 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 3571 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 3572 PIPE_CONTROL_DC_FLUSH_ENABLE, 3573 0); 3574 3575 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ 3576 cs = gen8_emit_ggtt_write_rcs(cs, 3577 request->fence.seqno, 3578 i915_request_active_timeline(request)->hwsp_offset, 3579 PIPE_CONTROL_FLUSH_ENABLE | 3580 PIPE_CONTROL_CS_STALL); 3581 3582 return gen8_emit_fini_breadcrumb_footer(request, cs); 3583 } 3584 3585 static u32 * 3586 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 3587 { 3588 cs = gen8_emit_ggtt_write_rcs(cs, 3589 request->fence.seqno, 3590 i915_request_active_timeline(request)->hwsp_offset, 3591 PIPE_CONTROL_CS_STALL | 3592 PIPE_CONTROL_TILE_CACHE_FLUSH | 3593 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 3594 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 3595 PIPE_CONTROL_DC_FLUSH_ENABLE | 3596 PIPE_CONTROL_FLUSH_ENABLE); 3597 3598 return gen8_emit_fini_breadcrumb_footer(request, cs); 3599 } 3600 3601 /* 3602 * Note that the CS instruction pre-parser will not stall on the breadcrumb 3603 * flush and will continue pre-fetching the instructions after it before the 3604 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at 3605 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble 3606 * of the next request before the memory has been flushed, we're guaranteed that 3607 * we won't access the batch itself too early. 3608 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands, 3609 * so, if the current request is modifying an instruction in the next request on 3610 * the same intel_context, we might pre-fetch and then execute the pre-update 3611 * instruction. To avoid this, the users of self-modifying code should either 3612 * disable the parser around the code emitting the memory writes, via a new flag 3613 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For 3614 * the in-kernel use-cases we've opted to use a separate context, see 3615 * reloc_gpu() as an example. 3616 * All the above applies only to the instructions themselves. 
Non-inline data 3617 * used by the instructions is not pre-fetched. 3618 */ 3619 3620 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs) 3621 { 3622 *cs++ = MI_SEMAPHORE_WAIT_TOKEN | 3623 MI_SEMAPHORE_GLOBAL_GTT | 3624 MI_SEMAPHORE_POLL | 3625 MI_SEMAPHORE_SAD_EQ_SDD; 3626 *cs++ = 0; 3627 *cs++ = intel_hws_preempt_address(request->engine); 3628 *cs++ = 0; 3629 *cs++ = 0; 3630 *cs++ = MI_NOOP; 3631 3632 return cs; 3633 } 3634 3635 static __always_inline u32* 3636 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs) 3637 { 3638 *cs++ = MI_USER_INTERRUPT; 3639 3640 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3641 if (intel_engine_has_semaphores(request->engine)) 3642 cs = gen12_emit_preempt_busywait(request, cs); 3643 3644 request->tail = intel_ring_offset(request, cs); 3645 assert_ring_tail_valid(request->ring, request->tail); 3646 3647 return gen8_emit_wa_tail(request, cs); 3648 } 3649 3650 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs) 3651 { 3652 cs = gen8_emit_ggtt_write(cs, 3653 request->fence.seqno, 3654 i915_request_active_timeline(request)->hwsp_offset, 3655 0); 3656 3657 return gen12_emit_fini_breadcrumb_footer(request, cs); 3658 } 3659 3660 static u32 * 3661 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 3662 { 3663 cs = gen8_emit_ggtt_write_rcs(cs, 3664 request->fence.seqno, 3665 i915_request_active_timeline(request)->hwsp_offset, 3666 PIPE_CONTROL_CS_STALL | 3667 PIPE_CONTROL_TILE_CACHE_FLUSH | 3668 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 3669 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 3670 /* Wa_1409600907:tgl */ 3671 PIPE_CONTROL_DEPTH_STALL | 3672 PIPE_CONTROL_DC_FLUSH_ENABLE | 3673 PIPE_CONTROL_FLUSH_ENABLE | 3674 PIPE_CONTROL_HDC_PIPELINE_FLUSH); 3675 3676 return gen12_emit_fini_breadcrumb_footer(request, cs); 3677 } 3678 3679 static void execlists_park(struct intel_engine_cs *engine) 3680 { 3681 cancel_timer(&engine->execlists.timer); 3682 cancel_timer(&engine->execlists.preempt); 3683 } 3684 3685 void intel_execlists_set_default_submission(struct intel_engine_cs *engine) 3686 { 3687 engine->submit_request = execlists_submit_request; 3688 engine->cancel_requests = execlists_cancel_requests; 3689 engine->schedule = i915_schedule; 3690 engine->execlists.tasklet.func = execlists_submission_tasklet; 3691 3692 engine->reset.prepare = execlists_reset_prepare; 3693 engine->reset.reset = execlists_reset; 3694 engine->reset.finish = execlists_reset_finish; 3695 3696 engine->park = execlists_park; 3697 engine->unpark = NULL; 3698 3699 engine->flags |= I915_ENGINE_SUPPORTS_STATS; 3700 if (!intel_vgpu_active(engine->i915)) { 3701 engine->flags |= I915_ENGINE_HAS_SEMAPHORES; 3702 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) 3703 engine->flags |= I915_ENGINE_HAS_PREEMPTION; 3704 } 3705 3706 if (INTEL_GEN(engine->i915) >= 12) 3707 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO; 3708 } 3709 3710 static void execlists_destroy(struct intel_engine_cs *engine) 3711 { 3712 intel_engine_cleanup_common(engine); 3713 lrc_destroy_wa_ctx(engine); 3714 kfree(engine); 3715 } 3716 3717 static void 3718 logical_ring_default_vfuncs(struct intel_engine_cs *engine) 3719 { 3720 /* Default vfuncs which can be overriden by each engine. 
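 * (For the render class, rcs_submission_override() later replaces the
 * flush and fini-breadcrumb hooks with the RCS variants.)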
static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
{
	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		MI_SEMAPHORE_SAD_EQ_SDD;
	*cs++ = 0;
	*cs++ = intel_hws_preempt_address(request->engine);
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = MI_NOOP;

	return cs;
}

static __always_inline u32 *
gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
{
	*cs++ = MI_USER_INTERRUPT;

	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	if (intel_engine_has_semaphores(request->engine))
		cs = gen12_emit_preempt_busywait(request, cs);

	request->tail = intel_ring_offset(request, cs);
	assert_ring_tail_valid(request->ring, request->tail);

	return gen8_emit_wa_tail(request, cs);
}

static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
{
	cs = gen8_emit_ggtt_write(cs,
				  request->fence.seqno,
				  i915_request_active_timeline(request)->hwsp_offset,
				  0);

	return gen12_emit_fini_breadcrumb_footer(request, cs);
}

static u32 *
gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
{
	cs = gen8_emit_ggtt_write_rcs(cs,
				      request->fence.seqno,
				      i915_request_active_timeline(request)->hwsp_offset,
				      PIPE_CONTROL_CS_STALL |
				      PIPE_CONTROL_TILE_CACHE_FLUSH |
				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
				      /* Wa_1409600907:tgl */
				      PIPE_CONTROL_DEPTH_STALL |
				      PIPE_CONTROL_DC_FLUSH_ENABLE |
				      PIPE_CONTROL_FLUSH_ENABLE |
				      PIPE_CONTROL_HDC_PIPELINE_FLUSH);

	return gen12_emit_fini_breadcrumb_footer(request, cs);
}

static void execlists_park(struct intel_engine_cs *engine)
{
	cancel_timer(&engine->execlists.timer);
	cancel_timer(&engine->execlists.preempt);
}

void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
{
	engine->submit_request = execlists_submit_request;
	engine->cancel_requests = execlists_cancel_requests;
	engine->schedule = i915_schedule;
	engine->execlists.tasklet.func = execlists_submission_tasklet;

	engine->reset.prepare = execlists_reset_prepare;
	engine->reset.reset = execlists_reset;
	engine->reset.finish = execlists_reset_finish;

	engine->park = execlists_park;
	engine->unpark = NULL;

	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
	if (!intel_vgpu_active(engine->i915)) {
		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
	}

	if (INTEL_GEN(engine->i915) >= 12)
		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
}

static void execlists_destroy(struct intel_engine_cs *engine)
{
	intel_engine_cleanup_common(engine);
	lrc_destroy_wa_ctx(engine);
	kfree(engine);
}

static void
logical_ring_default_vfuncs(struct intel_engine_cs *engine)
{
	/* Default vfuncs which can be overridden by each engine. */

	engine->destroy = execlists_destroy;
	engine->resume = execlists_resume;

	engine->reset.prepare = execlists_reset_prepare;
	engine->reset.reset = execlists_reset;
	engine->reset.finish = execlists_reset_finish;

	engine->cops = &execlists_context_ops;
	engine->request_alloc = execlists_request_alloc;

	engine->emit_flush = gen8_emit_flush;
	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
	if (INTEL_GEN(engine->i915) >= 12)
		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;

	engine->set_default_submission = intel_execlists_set_default_submission;

	if (INTEL_GEN(engine->i915) < 11) {
		engine->irq_enable = gen8_logical_ring_enable_irq;
		engine->irq_disable = gen8_logical_ring_disable_irq;
	} else {
		/*
		 * TODO: On Gen11 interrupt masks need to be clear
		 * to allow C6 entry. Keep interrupts enabled and
		 * take the hit of generating extra interrupts
		 * until a more refined solution exists.
		 */
	}
	if (IS_GEN(engine->i915, 8))
		engine->emit_bb_start = gen8_emit_bb_start;
	else
		engine->emit_bb_start = gen9_emit_bb_start;
}
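
/*
 * Before gen11, the engines share a common set of GT interrupt
 * registers, so each engine's bits live at a fixed per-engine shift.
 * Gen11+ exposes per-engine interrupt registers and needs no shift.
 */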
static inline void
logical_ring_default_irqs(struct intel_engine_cs *engine)
{
	unsigned int shift = 0;

	if (INTEL_GEN(engine->i915) < 11) {
		const u8 irq_shifts[] = {
			[RCS0] = GEN8_RCS_IRQ_SHIFT,
			[BCS0] = GEN8_BCS_IRQ_SHIFT,
			[VCS0] = GEN8_VCS0_IRQ_SHIFT,
			[VCS1] = GEN8_VCS1_IRQ_SHIFT,
			[VECS0] = GEN8_VECS_IRQ_SHIFT,
		};

		shift = irq_shifts[engine->id];
	}

	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
}

static void rcs_submission_override(struct intel_engine_cs *engine)
{
	switch (INTEL_GEN(engine->i915)) {
	case 12:
		engine->emit_flush = gen12_emit_flush_render;
		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
		break;
	case 11:
		engine->emit_flush = gen11_emit_flush_render;
		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
		break;
	default:
		engine->emit_flush = gen8_emit_flush_render;
		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
		break;
	}
}

int intel_execlists_submission_setup(struct intel_engine_cs *engine)
{
	tasklet_init(&engine->execlists.tasklet,
		     execlists_submission_tasklet, (unsigned long)engine);
	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);

	logical_ring_default_vfuncs(engine);
	logical_ring_default_irqs(engine);

	if (engine->class == RENDER_CLASS)
		rcs_submission_override(engine);

	return 0;
}

int intel_execlists_submission_init(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct drm_i915_private *i915 = engine->i915;
	struct intel_uncore *uncore = engine->uncore;
	u32 base = engine->mmio_base;
	int ret;

	ret = intel_engine_init_common(engine);
	if (ret)
		return ret;

	if (intel_init_workaround_bb(engine))
		/*
		 * We continue even if we fail to initialize the WA batch
		 * because we only expect rare glitches but nothing
		 * critical that would prevent us from using the GPU.
		 */
		DRM_ERROR("WA batch buffer initialization failed\n");

	if (HAS_LOGICAL_RING_ELSQ(i915)) {
		execlists->submit_reg = uncore->regs +
			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
		execlists->ctrl_reg = uncore->regs +
			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
	} else {
		execlists->submit_reg = uncore->regs +
			i915_mmio_reg_offset(RING_ELSP(base));
	}

	execlists->csb_status =
		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];

	execlists->csb_write =
		&engine->status_page.addr[intel_hws_csb_write_index(i915)];

	if (INTEL_GEN(i915) < 11)
		execlists->csb_size = GEN8_CSB_ENTRIES;
	else
		execlists->csb_size = GEN11_CSB_ENTRIES;

	reset_csb_pointers(engine);

	return 0;
}

static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
{
	u32 indirect_ctx_offset;

	switch (INTEL_GEN(engine->i915)) {
	default:
		MISSING_CASE(INTEL_GEN(engine->i915));
		/* fall through */
	case 12:
		indirect_ctx_offset =
			GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
		break;
	case 11:
		indirect_ctx_offset =
			GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
		break;
	case 10:
		indirect_ctx_offset =
			GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
		break;
	case 9:
		indirect_ctx_offset =
			GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
		break;
	case 8:
		indirect_ctx_offset =
			GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
		break;
	}

	return indirect_ctx_offset;
}

static void init_common_reg_state(u32 * const regs,
				  const struct intel_engine_cs *engine,
				  const struct intel_ring *ring)
{
	regs[CTX_CONTEXT_CONTROL] =
		_MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
		_MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
	if (INTEL_GEN(engine->i915) < 11)
		regs[CTX_CONTEXT_CONTROL] |=
			_MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
					    CTX_CTRL_RS_CTX_ENABLE);

	regs[CTX_RING_BUFFER_CONTROL] = RING_CTL_SIZE(ring->size) | RING_VALID;
	regs[CTX_BB_STATE] = RING_BB_PPGTT;
}
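
/*
 * Point the per-context and indirect-context workaround batch buffer
 * pointers in the context image at the engine's WA batch, if one was
 * created during engine initialisation.
 */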
3939 */ 3940 ASSIGN_CTX_PML4(ppgtt, regs); 3941 } else { 3942 ASSIGN_CTX_PDP(ppgtt, regs, 3); 3943 ASSIGN_CTX_PDP(ppgtt, regs, 2); 3944 ASSIGN_CTX_PDP(ppgtt, regs, 1); 3945 ASSIGN_CTX_PDP(ppgtt, regs, 0); 3946 } 3947 } 3948 3949 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm) 3950 { 3951 if (i915_is_ggtt(vm)) 3952 return i915_vm_to_ggtt(vm)->alias; 3953 else 3954 return i915_vm_to_ppgtt(vm); 3955 } 3956 3957 static void execlists_init_reg_state(u32 *regs, 3958 const struct intel_context *ce, 3959 const struct intel_engine_cs *engine, 3960 const struct intel_ring *ring, 3961 bool close) 3962 { 3963 /* 3964 * A context is actually a big batch buffer with several 3965 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The 3966 * values we are setting here are only for the first context restore: 3967 * on a subsequent save, the GPU will recreate this batchbuffer with new 3968 * values (including all the missing MI_LOAD_REGISTER_IMM commands that 3969 * we are not initializing here). 3970 * 3971 * Must keep consistent with virtual_update_register_offsets(). 3972 */ 3973 u32 *bbe = set_offsets(regs, reg_offsets(engine), engine); 3974 3975 if (close) { /* Close the batch; used mainly by live_lrc_layout() */ 3976 *bbe = MI_BATCH_BUFFER_END; 3977 if (INTEL_GEN(engine->i915) >= 10) 3978 *bbe |= BIT(0); 3979 } 3980 3981 init_common_reg_state(regs, engine, ring); 3982 init_ppgtt_reg_state(regs, vm_alias(ce->vm)); 3983 3984 init_wa_bb_reg_state(regs, engine, 3985 INTEL_GEN(engine->i915) >= 12 ? 3986 GEN12_CTX_BB_PER_CTX_PTR : 3987 CTX_BB_PER_CTX_PTR); 3988 } 3989 3990 static int 3991 populate_lr_context(struct intel_context *ce, 3992 struct drm_i915_gem_object *ctx_obj, 3993 struct intel_engine_cs *engine, 3994 struct intel_ring *ring) 3995 { 3996 bool inhibit = true; 3997 void *vaddr; 3998 u32 *regs; 3999 int ret; 4000 4001 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); 4002 if (IS_ERR(vaddr)) { 4003 ret = PTR_ERR(vaddr); 4004 DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret); 4005 return ret; 4006 } 4007 4008 set_redzone(vaddr, engine); 4009 4010 if (engine->default_state) { 4011 void *defaults; 4012 4013 defaults = i915_gem_object_pin_map(engine->default_state, 4014 I915_MAP_WB); 4015 if (IS_ERR(defaults)) { 4016 ret = PTR_ERR(defaults); 4017 goto err_unpin_ctx; 4018 } 4019 4020 memcpy(vaddr, defaults, engine->context_size); 4021 i915_gem_object_unpin_map(engine->default_state); 4022 inhibit = false; 4023 } 4024 4025 /* The second page of the context object contains some fields which must 4026 * be set up prior to the first execution. 
static int __execlists_context_alloc(struct intel_context *ce,
				     struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *ctx_obj;
	struct intel_ring *ring;
	struct i915_vma *vma;
	u32 context_size;
	int ret;

	GEM_BUG_ON(ce->state);
	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);

	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		context_size += I915_GTT_PAGE_SIZE; /* for redzone */

	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
	if (IS_ERR(ctx_obj))
		return PTR_ERR(ctx_obj);

	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto error_deref_obj;
	}

	if (!ce->timeline) {
		struct intel_timeline *tl;

		tl = intel_timeline_create(engine->gt, NULL);
		if (IS_ERR(tl)) {
			ret = PTR_ERR(tl);
			goto error_deref_obj;
		}

		ce->timeline = tl;
	}

	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
	if (IS_ERR(ring)) {
		ret = PTR_ERR(ring);
		goto error_deref_obj;
	}

	ret = populate_lr_context(ce, ctx_obj, engine, ring);
	if (ret) {
		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
		goto error_ring_free;
	}

	ce->ring = ring;
	ce->state = vma;

	return 0;

error_ring_free:
	intel_ring_put(ring);
error_deref_obj:
	i915_gem_object_put(ctx_obj);
	return ret;
}

static struct list_head *virtual_queue(struct virtual_engine *ve)
{
	return &ve->base.execlists.default_priolist.requests[0];
}

static void virtual_context_destroy(struct kref *kref)
{
	struct virtual_engine *ve =
		container_of(kref, typeof(*ve), context.ref);
	unsigned int n;

	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
	GEM_BUG_ON(ve->request);
	GEM_BUG_ON(ve->context.inflight);

	for (n = 0; n < ve->num_siblings; n++) {
		struct intel_engine_cs *sibling = ve->siblings[n];
		struct rb_node *node = &ve->nodes[sibling->id].rb;
		unsigned long flags;

		if (RB_EMPTY_NODE(node))
			continue;

		spin_lock_irqsave(&sibling->active.lock, flags);

		/* Detachment is lazily performed in the execlists tasklet */
		if (!RB_EMPTY_NODE(node))
			rb_erase_cached(node, &sibling->execlists.virtual);

		spin_unlock_irqrestore(&sibling->active.lock, flags);
	}
	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));

	if (ve->context.state)
		__execlists_context_fini(&ve->context);
	intel_context_fini(&ve->context);

	kfree(ve->bonds);
	kfree(ve);
}

static void virtual_engine_initial_hint(struct virtual_engine *ve)
{
	int swp;

	/*
	 * Pick a random sibling on starting to help spread the load around.
	 *
	 * New contexts are typically created with exactly the same order
	 * of siblings, and often started in batches.
	 * Due to the way we iterate the array of siblings when submitting
	 * requests, sibling[0] is prioritised for dequeuing. If we make sure
	 * that sibling[0] is fairly randomised across the system, we also
	 * help spread the load by the first engine we inspect being different
	 * each time.
	 *
	 * NB This does not force us to execute on this engine, it will just
	 * typically be the first we inspect for submission.
	 */
	swp = prandom_u32_max(ve->num_siblings);
	if (!swp)
		return;

	swap(ve->siblings[swp], ve->siblings[0]);
	if (!intel_engine_has_relative_mmio(ve->siblings[0]))
		virtual_update_register_offsets(ve->context.lrc_reg_state,
						ve->siblings[0]);
}

static int virtual_context_pin(struct intel_context *ce)
{
	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
	int err;

	/* Note: we must use a real engine class for setting up reg state */
	err = __execlists_context_pin(ce, ve->siblings[0]);
	if (err)
		return err;

	virtual_engine_initial_hint(ve);
	return 0;
}

static void virtual_context_enter(struct intel_context *ce)
{
	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
	unsigned int n;

	for (n = 0; n < ve->num_siblings; n++)
		intel_engine_pm_get(ve->siblings[n]);

	intel_timeline_enter(ce->timeline);
}

static void virtual_context_exit(struct intel_context *ce)
{
	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
	unsigned int n;

	intel_timeline_exit(ce->timeline);

	for (n = 0; n < ve->num_siblings; n++)
		intel_engine_pm_put(ve->siblings[n]);
}

static const struct intel_context_ops virtual_context_ops = {
	.pin = virtual_context_pin,
	.unpin = execlists_context_unpin,

	.enter = virtual_context_enter,
	.exit = virtual_context_exit,

	.destroy = virtual_context_destroy,
};

static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
{
	struct i915_request *rq;
	intel_engine_mask_t mask;

	rq = READ_ONCE(ve->request);
	if (!rq)
		return 0;

	/* The rq is ready for submission; rq->execution_mask is now stable. */
	mask = rq->execution_mask;
	if (unlikely(!mask)) {
		/* Invalid selection, submit to a random engine in error */
		i915_request_skip(rq, -ENODEV);
		mask = ve->siblings[0]->mask;
	}

	GEM_TRACE("%s: rq=%llx:%lld, mask=%x, prio=%d\n",
		  ve->base.name,
		  rq->fence.context, rq->fence.seqno,
		  mask, ve->base.execlists.queue_priority_hint);

	return mask;
}
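
/*
 * Propagate the single pending virtual request to the physical engines:
 * insert this ve's node into the virtual rbtree of every sibling allowed
 * by the execution mask, ordered by priority, and kick a sibling's
 * tasklet whenever this raises its queue priority hint. The first
 * physical engine to dequeue the request claims it.
 */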
static void virtual_submission_tasklet(unsigned long data)
{
	struct virtual_engine * const ve = (struct virtual_engine *)data;
	const int prio = ve->base.execlists.queue_priority_hint;
	intel_engine_mask_t mask;
	unsigned int n;

	rcu_read_lock();
	mask = virtual_submission_mask(ve);
	rcu_read_unlock();
	if (unlikely(!mask))
		return;

	local_irq_disable();
	for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
		struct intel_engine_cs *sibling = ve->siblings[n];
		struct ve_node * const node = &ve->nodes[sibling->id];
		struct rb_node **parent, *rb;
		bool first;

		if (unlikely(!(mask & sibling->mask))) {
			if (!RB_EMPTY_NODE(&node->rb)) {
				spin_lock(&sibling->active.lock);
				rb_erase_cached(&node->rb,
						&sibling->execlists.virtual);
				RB_CLEAR_NODE(&node->rb);
				spin_unlock(&sibling->active.lock);
			}
			continue;
		}

		spin_lock(&sibling->active.lock);

		if (!RB_EMPTY_NODE(&node->rb)) {
			/*
			 * Cheat and avoid rebalancing the tree if we can
			 * reuse this node in situ.
			 */
			first = rb_first_cached(&sibling->execlists.virtual) ==
				&node->rb;
			if (prio == node->prio || (prio > node->prio && first))
				goto submit_engine;

			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
		}

		rb = NULL;
		first = true;
		parent = &sibling->execlists.virtual.rb_root.rb_node;
		while (*parent) {
			struct ve_node *other;

			rb = *parent;
			other = rb_entry(rb, typeof(*other), rb);
			if (prio > other->prio) {
				parent = &rb->rb_left;
			} else {
				parent = &rb->rb_right;
				first = false;
			}
		}

		rb_link_node(&node->rb, rb, parent);
		rb_insert_color_cached(&node->rb,
				       &sibling->execlists.virtual,
				       first);

submit_engine:
		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
		node->prio = prio;
		if (first && prio > sibling->execlists.queue_priority_hint) {
			sibling->execlists.queue_priority_hint = prio;
			tasklet_hi_schedule(&sibling->execlists.tasklet);
		}

		spin_unlock(&sibling->active.lock);
	}
	local_irq_enable();
}

static void virtual_submit_request(struct i915_request *rq)
{
	struct virtual_engine *ve = to_virtual_engine(rq->engine);
	struct i915_request *old;
	unsigned long flags;

	GEM_TRACE("%s: rq=%llx:%lld\n",
		  ve->base.name,
		  rq->fence.context,
		  rq->fence.seqno);

	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);

	spin_lock_irqsave(&ve->base.active.lock, flags);

	old = ve->request;
	if (old) { /* background completion event from preempt-to-busy */
		GEM_BUG_ON(!i915_request_completed(old));
		__i915_request_submit(old);
		i915_request_put(old);
	}

	if (i915_request_completed(rq)) {
		__i915_request_submit(rq);

		ve->base.execlists.queue_priority_hint = INT_MIN;
		ve->request = NULL;
	} else {
		ve->base.execlists.queue_priority_hint = rq_prio(rq);
		ve->request = i915_request_get(rq);

		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
		list_move_tail(&rq->sched.link, virtual_queue(ve));

		tasklet_schedule(&ve->base.execlists.tasklet);
	}

	spin_unlock_irqrestore(&ve->base.active.lock, flags);
}

static struct ve_bond *
virtual_find_bond(struct virtual_engine *ve,
		  const struct intel_engine_cs *master)
{
	int i;

	for (i = 0; i < ve->num_bonds; i++) {
		if (ve->bonds[i].master == master)
			return &ve->bonds[i];
	}

	return NULL;
}
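
/*
 * Called when the master request of a bonded pair is submitted for
 * execution: narrow the bonded request's execution_mask to the siblings
 * registered for that master (if any bond exists) and exclude the
 * master's own engine, so the two requests run on different engines.
 */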
static void
virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
{
	struct virtual_engine *ve = to_virtual_engine(rq->engine);
	intel_engine_mask_t allowed, exec;
	struct ve_bond *bond;

	allowed = ~to_request(signal)->engine->mask;

	bond = virtual_find_bond(ve, to_request(signal)->engine);
	if (bond)
		allowed &= bond->sibling_mask;

	/* Restrict the bonded request to run on only the available engines */
	exec = READ_ONCE(rq->execution_mask);
	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
		;

	/* Prevent the master from being re-run on the bonded engines */
	to_request(signal)->execution_mask &= ~allowed;
}

struct intel_context *
intel_execlists_create_virtual(struct i915_gem_context *ctx,
			       struct intel_engine_cs **siblings,
			       unsigned int count)
{
	struct virtual_engine *ve;
	unsigned int n;
	int err;

	if (count == 0)
		return ERR_PTR(-EINVAL);

	if (count == 1)
		return intel_context_create(ctx, siblings[0]);

	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
	if (!ve)
		return ERR_PTR(-ENOMEM);

	ve->base.i915 = ctx->i915;
	ve->base.gt = siblings[0]->gt;
	ve->base.uncore = siblings[0]->uncore;
	ve->base.id = -1;

	ve->base.class = OTHER_CLASS;
	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;

	/*
	 * The decision on whether to submit a request using semaphores
	 * depends on the saturated state of the engine. We only compute
	 * this during HW submission of the request, and we need this
	 * state to be globally applied to all requests being submitted
	 * to this engine. Virtual engines encompass more than one physical
	 * engine and so we cannot accurately tell in advance if one of those
	 * engines is already saturated and so cannot afford to use a semaphore
	 * and be pessimized in priority for doing so -- if we are the only
	 * context using semaphores after all other clients have stopped, we
	 * will be starved on the saturated system. Such a global switch for
	 * semaphores is less than ideal, but alas is the current compromise.
	 */
	ve->base.saturated = ALL_ENGINES;

	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");

	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
	intel_engine_init_breadcrumbs(&ve->base);

	intel_engine_init_execlists(&ve->base);

	ve->base.cops = &virtual_context_ops;
	ve->base.request_alloc = execlists_request_alloc;

	ve->base.schedule = i915_schedule;
	ve->base.submit_request = virtual_submit_request;
	ve->base.bond_execute = virtual_bond_execute;

	INIT_LIST_HEAD(virtual_queue(ve));
	ve->base.execlists.queue_priority_hint = INT_MIN;
	tasklet_init(&ve->base.execlists.tasklet,
		     virtual_submission_tasklet,
		     (unsigned long)ve);

	intel_context_init(&ve->context, ctx, &ve->base);

	for (n = 0; n < count; n++) {
		struct intel_engine_cs *sibling = siblings[n];

		GEM_BUG_ON(!is_power_of_2(sibling->mask));
		if (sibling->mask & ve->base.mask) {
			DRM_DEBUG("duplicate %s entry in load balancer\n",
				  sibling->name);
			err = -EINVAL;
			goto err_put;
		}

		/*
		 * The virtual engine implementation is tightly coupled to
		 * the execlists backend -- we push out requests directly
		 * into a tree inside each physical engine. We could support
		 * layering if we handle cloning of the requests and
		 * submitting a copy into each backend.
		 */
		if (sibling->execlists.tasklet.func !=
		    execlists_submission_tasklet) {
			err = -ENODEV;
			goto err_put;
		}

		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);

		ve->siblings[ve->num_siblings++] = sibling;
		ve->base.mask |= sibling->mask;

		/*
		 * All physical engines must be compatible for their emission
		 * functions (as we build the instructions during request
		 * construction and do not alter them before submission
		 * on the physical engine). We use the engine class as a guide
		 * here, although that could be refined.
4498 */ 4499 if (ve->base.class != OTHER_CLASS) { 4500 if (ve->base.class != sibling->class) { 4501 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n", 4502 sibling->class, ve->base.class); 4503 err = -EINVAL; 4504 goto err_put; 4505 } 4506 continue; 4507 } 4508 4509 ve->base.class = sibling->class; 4510 ve->base.uabi_class = sibling->uabi_class; 4511 snprintf(ve->base.name, sizeof(ve->base.name), 4512 "v%dx%d", ve->base.class, count); 4513 ve->base.context_size = sibling->context_size; 4514 4515 ve->base.emit_bb_start = sibling->emit_bb_start; 4516 ve->base.emit_flush = sibling->emit_flush; 4517 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb; 4518 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb; 4519 ve->base.emit_fini_breadcrumb_dw = 4520 sibling->emit_fini_breadcrumb_dw; 4521 4522 ve->base.flags = sibling->flags; 4523 } 4524 4525 ve->base.flags |= I915_ENGINE_IS_VIRTUAL; 4526 4527 err = __execlists_context_alloc(&ve->context, siblings[0]); 4528 if (err) 4529 goto err_put; 4530 4531 __set_bit(CONTEXT_ALLOC_BIT, &ve->context.flags); 4532 4533 return &ve->context; 4534 4535 err_put: 4536 intel_context_put(&ve->context); 4537 return ERR_PTR(err); 4538 } 4539 4540 struct intel_context * 4541 intel_execlists_clone_virtual(struct i915_gem_context *ctx, 4542 struct intel_engine_cs *src) 4543 { 4544 struct virtual_engine *se = to_virtual_engine(src); 4545 struct intel_context *dst; 4546 4547 dst = intel_execlists_create_virtual(ctx, 4548 se->siblings, 4549 se->num_siblings); 4550 if (IS_ERR(dst)) 4551 return dst; 4552 4553 if (se->num_bonds) { 4554 struct virtual_engine *de = to_virtual_engine(dst->engine); 4555 4556 de->bonds = kmemdup(se->bonds, 4557 sizeof(*se->bonds) * se->num_bonds, 4558 GFP_KERNEL); 4559 if (!de->bonds) { 4560 intel_context_put(dst); 4561 return ERR_PTR(-ENOMEM); 4562 } 4563 4564 de->num_bonds = se->num_bonds; 4565 } 4566 4567 return dst; 4568 } 4569 4570 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine, 4571 const struct intel_engine_cs *master, 4572 const struct intel_engine_cs *sibling) 4573 { 4574 struct virtual_engine *ve = to_virtual_engine(engine); 4575 struct ve_bond *bond; 4576 int n; 4577 4578 /* Sanity check the sibling is part of the virtual engine */ 4579 for (n = 0; n < ve->num_siblings; n++) 4580 if (sibling == ve->siblings[n]) 4581 break; 4582 if (n == ve->num_siblings) 4583 return -EINVAL; 4584 4585 bond = virtual_find_bond(ve, master); 4586 if (bond) { 4587 bond->sibling_mask |= sibling->mask; 4588 return 0; 4589 } 4590 4591 bond = krealloc(ve->bonds, 4592 sizeof(*bond) * (ve->num_bonds + 1), 4593 GFP_KERNEL); 4594 if (!bond) 4595 return -ENOMEM; 4596 4597 bond[ve->num_bonds].master = master; 4598 bond[ve->num_bonds].sibling_mask = sibling->mask; 4599 4600 ve->bonds = bond; 4601 ve->num_bonds++; 4602 4603 return 0; 4604 } 4605 4606 struct intel_engine_cs * 4607 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine, 4608 unsigned int sibling) 4609 { 4610 struct virtual_engine *ve = to_virtual_engine(engine); 4611 4612 if (sibling >= ve->num_siblings) 4613 return NULL; 4614 4615 return ve->siblings[sibling]; 4616 } 4617 4618 void intel_execlists_show_requests(struct intel_engine_cs *engine, 4619 struct drm_printer *m, 4620 void (*show_request)(struct drm_printer *m, 4621 struct i915_request *rq, 4622 const char *prefix), 4623 unsigned int max) 4624 { 4625 const struct intel_engine_execlists *execlists = &engine->execlists; 4626 struct i915_request *rq, *last; 4627 unsigned long 
void intel_execlists_show_requests(struct intel_engine_cs *engine,
				   struct drm_printer *m,
				   void (*show_request)(struct drm_printer *m,
							struct i915_request *rq,
							const char *prefix),
				   unsigned int max)
{
	const struct intel_engine_execlists *execlists = &engine->execlists;
	struct i915_request *rq, *last;
	unsigned long flags;
	unsigned int count;
	struct rb_node *rb;

	spin_lock_irqsave(&engine->active.lock, flags);

	last = NULL;
	count = 0;
	list_for_each_entry(rq, &engine->active.requests, sched.link) {
		if (count++ < max - 1)
			show_request(m, rq, "\t\tE ");
		else
			last = rq;
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d executing requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tE ");
	}

	last = NULL;
	count = 0;
	if (execlists->queue_priority_hint != INT_MIN)
		drm_printf(m, "\t\tQueue priority hint: %d\n",
			   execlists->queue_priority_hint);
	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
		int i;

		priolist_for_each_request(rq, p, i) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tQ ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d queued requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tQ ");
	}

	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		struct i915_request *rq = READ_ONCE(ve->request);

		if (rq) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tV ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d virtual requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tV ");
	}

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

void intel_lr_context_reset(struct intel_engine_cs *engine,
			    struct intel_context *ce,
			    u32 head,
			    bool scrub)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	/*
	 * We want a simple context + ring to execute the breadcrumb update.
	 * We cannot rely on the context being intact across the GPU hang,
	 * so clear it and rebuild just what we need for the breadcrumb.
	 * All pending requests for this context will be zapped, and any
	 * future request will be after userspace has had the opportunity
	 * to recreate its own state.
	 */
	if (scrub)
		restore_default_state(ce, engine);

	/* Rerun the request; its payload has been neutered (if guilty). */
	ce->ring->head = head;
	intel_ring_update_space(ce->ring);

	__execlists_update_reg_state(ce, engine);
}

bool
intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
{
	return engine->set_default_submission ==
	       intel_execlists_set_default_submission;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif