/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things to the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself,
 * instead, the GPU will do it for you on the context switch.
 *
 * But, what about the ringbuffer control registers (head, tail, etc..)?
 * shouldn't we just need a set of those per engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated.
 * The local default context for each opened fd is more complex, because we
 * don't know at creation time which engine is going to use it. To handle
 * this, we have implemented a deferred creation of LR contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one. This
 * request will then be resubmitted along with a new request for a different
 * context, which will cause the hardware to continue executing the second
 * request and queue the new request (the GPU detects the condition of a context
 * getting preempted with the same context and optimizes the context switch flow
 * by not doing preemption, but just sampling the new tail pointer).
 *
 */
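
/*
 * Illustrative sketch of the flow described above (pseudo-code only, not a
 * literal restatement of the driver's control flow; the actual work happens
 * in execlists_dequeue() and process_csb() later in this file, both driven
 * from the submission tasklet):
 *
 *	submit request:
 *		emit commands + breadcrumb into the context's ringbuffer
 *		queue the request on engine->execlists.queue
 *		kick the tasklet -> execlists_dequeue() pairs up to two
 *		runnable contexts and writes their descriptors into the ELSP
 *
 *	context-switch interrupt:
 *		kick the tasklet -> process_csb() walks the new CSB events,
 *		retiring what completed, then execlists_dequeue() refills
 *		the ELSP from the queue
 */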
#include <linux/interrupt.h>

#include "gem/i915_gem_context.h"

#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_trace.h"
#include "i915_vgpu.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_reset.h"
#include "intel_ring.h"
#include "intel_workarounds.h"

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define GEN8_CTX_STATUS_COMPLETED_MASK \
	(GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)

#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
#define GEN12_CSB_SW_CTX_ID_MASK	GENMASK(25, 15)
#define GEN12_IDLE_CTX_ID		0x7FF
#define GEN12_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */
#define WA_TAIL_DWORDS 2
#define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)

struct virtual_engine {
	struct intel_engine_cs base;
	struct intel_context context;

	/*
	 * We allow only a single request through the virtual engine at a time
	 * (each request in the timeline waits for the completion fence of
	 * the previous before being submitted). By restricting ourselves to
	 * only submitting a single request, each request is placed on to a
	 * physical engine to maximise load spreading (by virtue of the late
	 * greedy scheduling -- each real engine takes the next available
	 * request upon idling).
	 */
	struct i915_request *request;

	/*
	 * We keep a rbtree of available virtual engines inside each physical
	 * engine, sorted by priority. Here we preallocate the nodes we need
	 * for the virtual engine, indexed by physical_engine->id.
	 */
	struct ve_node {
		struct rb_node rb;
		int prio;
	} nodes[I915_NUM_ENGINES];

	/*
	 * Keep track of bonded pairs -- restrictions upon our selection
	 * of physical engines any particular request may be submitted to.
	 * If we receive a submit-fence from a master engine, we will only
	 * use one of sibling_mask physical engines.
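	 *
	 * An illustrative example (engine names here are hypothetical): if
	 * the master request was submitted to vcs0, and the bond registered
	 * for vcs0 has a sibling_mask covering only vcs1 and vcs2, then this
	 * request may execute on vcs1 or vcs2 but on no other sibling.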
	 */
	struct ve_bond {
		const struct intel_engine_cs *master;
		intel_engine_mask_t sibling_mask;
	} *bonds;
	unsigned int num_bonds;

	/* And finally, which physical engines this virtual engine maps onto. */
	unsigned int num_siblings;
	struct intel_engine_cs *siblings[0];
};

static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
{
	GEM_BUG_ON(!intel_engine_is_virtual(engine));
	return container_of(engine, struct virtual_engine, base);
}

static int __execlists_context_alloc(struct intel_context *ce,
				     struct intel_engine_cs *engine);

static void execlists_init_reg_state(u32 *reg_state,
				     const struct intel_context *ce,
				     const struct intel_engine_cs *engine,
				     const struct intel_ring *ring,
				     bool close);
static void
__execlists_update_reg_state(const struct intel_context *ce,
			     const struct intel_engine_cs *engine);

static void mark_eio(struct i915_request *rq)
{
	if (i915_request_completed(rq))
		return;

	GEM_BUG_ON(i915_request_signaled(rq));

	dma_fence_set_error(&rq->fence, -EIO);
	i915_request_mark_complete(rq);
}

static struct i915_request *
active_request(const struct intel_timeline * const tl, struct i915_request *rq)
{
	struct i915_request *active = rq;

	rcu_read_lock();
	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
		if (i915_request_completed(rq))
			break;

		active = rq;
	}
	rcu_read_unlock();

	return active;
}

static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
{
	return (i915_ggtt_offset(engine->status_page.vma) +
		I915_GEM_HWS_PREEMPT_ADDR);
}

static inline void
ring_set_paused(const struct intel_engine_cs *engine, int state)
{
	/*
	 * We inspect HWS_PREEMPT with a semaphore inside
	 * engine->emit_fini_breadcrumb. If the dword is true,
	 * the ring is paused as the semaphore will busywait
	 * until the dword is false.
	 */
	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
	if (state)
		wmb();
}

static inline struct i915_priolist *to_priolist(struct rb_node *rb)
{
	return rb_entry(rb, struct i915_priolist, node);
}

static inline int rq_prio(const struct i915_request *rq)
{
	return rq->sched.attr.priority;
}

static int effective_prio(const struct i915_request *rq)
{
	int prio = rq_prio(rq);

	/*
	 * If this request is special and must not be interrupted at any
	 * cost, so be it. Note we are only checking the most recent request
	 * in the context and so may be masking an earlier vip request. It
	 * is hoped that under the conditions where nopreempt is used, this
	 * will not matter (i.e. all requests to that context will be
	 * nopreempt for as long as desired).
	 */
	if (i915_request_has_nopreempt(rq))
		prio = I915_PRIORITY_UNPREEMPTABLE;

	/*
	 * On unwinding the active request, we give it a priority bump
	 * if it has completed waiting on any semaphore. If we know that
	 * the request has already started, we can prevent an unwanted
	 * preempt-to-idle cycle by taking that into account now.
	 */
	if (__i915_request_has_started(rq))
		prio |= I915_PRIORITY_NOSEMAPHORE;

	/* Restrict mere WAIT boosts from triggering preemption */
	BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
	return prio | __NO_PREEMPTION;
}

static int queue_prio(const struct intel_engine_execlists *execlists)
{
	struct i915_priolist *p;
	struct rb_node *rb;

	rb = rb_first_cached(&execlists->queue);
	if (!rb)
		return INT_MIN;

	/*
	 * As the priolist[] are inverted, with the highest priority in [0],
	 * we have to flip the index value to become priority.
	 */
	p = to_priolist(rb);
	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
}

static inline bool need_preempt(const struct intel_engine_cs *engine,
				const struct i915_request *rq,
				struct rb_node *rb)
{
	int last_prio;

	if (!intel_engine_has_semaphores(engine))
		return false;

	/*
	 * Check if the current priority hint merits a preemption attempt.
	 *
	 * We record the highest value priority we saw during rescheduling
	 * prior to this dequeue, therefore we know that if it is strictly
	 * less than the current tail of ELSP[0], we do not need to force
	 * a preempt-to-idle cycle.
	 *
	 * However, the priority hint is a mere hint that we may need to
	 * preempt. If that hint is stale or we may be trying to preempt
	 * ourselves, ignore the request.
	 *
	 * More naturally we would write
	 *	prio >= max(0, last);
	 * except that we wish to prevent triggering preemption at the same
	 * priority level: the task that is running should remain running
	 * to preserve FIFO ordering of dependencies.
	 */
	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
	if (engine->execlists.queue_priority_hint <= last_prio)
		return false;

	/*
	 * Check against the first request in ELSP[1], it will, thanks to the
	 * power of PI, be the highest priority of that context.
	 */
	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
		return true;

	if (rb) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		bool preempt = false;

		if (engine == ve->siblings[0]) { /* only preempt one sibling */
			struct i915_request *next;

			rcu_read_lock();
			next = READ_ONCE(ve->request);
			if (next)
				preempt = rq_prio(next) > last_prio;
			rcu_read_unlock();
		}

		if (preempt)
			return preempt;
	}

	/*
	 * If the inflight context did not trigger the preemption, then maybe
	 * it was the set of queued requests? Pick the highest priority in
	 * the queue (the first active priolist) and see if it deserves to be
	 * running instead of ELSP[0].
	 *
	 * The highest priority request in the queue can not be either
	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
	 * context, its priority would not exceed ELSP[0] aka last_prio.
	 */
	return queue_prio(&engine->execlists) > last_prio;
}
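
/*
 * An illustrative summary of the checks above (a paraphrase, not a literal
 * restatement of the code): preemption is only worth attempting when
 *
 *	queue_priority_hint > max(effective_prio(active), I915_PRIORITY_NORMAL - 1)
 *
 * and one of the concrete candidates (the next request queued behind the
 * active one, a waiting virtual-engine sibling, or the head of the priolist
 * queue) really does outrank the active request. Equal priorities never
 * trigger preemption, preserving FIFO ordering between peers.
 */
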
__maybe_unused static inline bool
assert_priority_queue(const struct i915_request *prev,
		      const struct i915_request *next)
{
	/*
	 * Without preemption, the prev may refer to the still active element
	 * which we refuse to let go.
	 *
	 * Even with preemption, there are times when we think it is better not
	 * to preempt and leave an ostensibly lower priority request in flight.
	 */
	if (i915_request_is_active(prev))
		return true;

	return rq_prio(prev) >= rq_prio(next);
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *	bits  0-11: flags, GEN8_CTX_* (cached in ctx->desc_template)
 *	bits 12-31: LRCA, GTT address of (the HWSP of) this context
 *	bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC)
 *	bits 53-54: mbz, reserved for use by hardware
 *	bits 55-63: group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *	bits 32-36: reserved
 *	bits 37-47: SW context ID
 *	bits 48-53: engine instance
 *	bit  54:    mbz, reserved for use by hardware
 *	bits 55-60: SW counter
 *	bits 61-63: engine class
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
 */
static u64
lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
{
	u64 desc;

	desc = INTEL_LEGACY_32B_CONTEXT;
	if (i915_vm_is_4lvl(ce->vm))
		desc = INTEL_LEGACY_64B_CONTEXT;
	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;

	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
	if (IS_GEN(engine->i915, 8))
		desc |= GEN8_CTX_L3LLC_COHERENT;

	desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
	/*
	 * The following 32bits are copied into the OA reports (dword 2).
	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
	 * anything below.
479 */ 480 if (INTEL_GEN(engine->i915) >= 11) { 481 desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT; 482 /* bits 48-53 */ 483 484 desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT; 485 /* bits 61-63 */ 486 } 487 488 return desc; 489 } 490 491 static u32 *set_offsets(u32 *regs, 492 const u8 *data, 493 const struct intel_engine_cs *engine) 494 #define NOP(x) (BIT(7) | (x)) 495 #define LRI(count, flags) ((flags) << 6 | (count)) 496 #define POSTED BIT(0) 497 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200)) 498 #define REG16(x) \ 499 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \ 500 (((x) >> 2) & 0x7f) 501 #define END() 0 502 { 503 const u32 base = engine->mmio_base; 504 505 while (*data) { 506 u8 count, flags; 507 508 if (*data & BIT(7)) { /* skip */ 509 regs += *data++ & ~BIT(7); 510 continue; 511 } 512 513 count = *data & 0x3f; 514 flags = *data >> 6; 515 data++; 516 517 *regs = MI_LOAD_REGISTER_IMM(count); 518 if (flags & POSTED) 519 *regs |= MI_LRI_FORCE_POSTED; 520 if (INTEL_GEN(engine->i915) >= 11) 521 *regs |= MI_LRI_CS_MMIO; 522 regs++; 523 524 GEM_BUG_ON(!count); 525 do { 526 u32 offset = 0; 527 u8 v; 528 529 do { 530 v = *data++; 531 offset <<= 7; 532 offset |= v & ~BIT(7); 533 } while (v & BIT(7)); 534 535 *regs = base + (offset << 2); 536 regs += 2; 537 } while (--count); 538 } 539 540 return regs; 541 } 542 543 static const u8 gen8_xcs_offsets[] = { 544 NOP(1), 545 LRI(11, 0), 546 REG16(0x244), 547 REG(0x034), 548 REG(0x030), 549 REG(0x038), 550 REG(0x03c), 551 REG(0x168), 552 REG(0x140), 553 REG(0x110), 554 REG(0x11c), 555 REG(0x114), 556 REG(0x118), 557 558 NOP(9), 559 LRI(9, 0), 560 REG16(0x3a8), 561 REG16(0x28c), 562 REG16(0x288), 563 REG16(0x284), 564 REG16(0x280), 565 REG16(0x27c), 566 REG16(0x278), 567 REG16(0x274), 568 REG16(0x270), 569 570 NOP(13), 571 LRI(2, 0), 572 REG16(0x200), 573 REG(0x028), 574 575 END(), 576 }; 577 578 static const u8 gen9_xcs_offsets[] = { 579 NOP(1), 580 LRI(14, POSTED), 581 REG16(0x244), 582 REG(0x034), 583 REG(0x030), 584 REG(0x038), 585 REG(0x03c), 586 REG(0x168), 587 REG(0x140), 588 REG(0x110), 589 REG(0x11c), 590 REG(0x114), 591 REG(0x118), 592 REG(0x1c0), 593 REG(0x1c4), 594 REG(0x1c8), 595 596 NOP(3), 597 LRI(9, POSTED), 598 REG16(0x3a8), 599 REG16(0x28c), 600 REG16(0x288), 601 REG16(0x284), 602 REG16(0x280), 603 REG16(0x27c), 604 REG16(0x278), 605 REG16(0x274), 606 REG16(0x270), 607 608 NOP(13), 609 LRI(1, POSTED), 610 REG16(0x200), 611 612 NOP(13), 613 LRI(44, POSTED), 614 REG(0x028), 615 REG(0x09c), 616 REG(0x0c0), 617 REG(0x178), 618 REG(0x17c), 619 REG16(0x358), 620 REG(0x170), 621 REG(0x150), 622 REG(0x154), 623 REG(0x158), 624 REG16(0x41c), 625 REG16(0x600), 626 REG16(0x604), 627 REG16(0x608), 628 REG16(0x60c), 629 REG16(0x610), 630 REG16(0x614), 631 REG16(0x618), 632 REG16(0x61c), 633 REG16(0x620), 634 REG16(0x624), 635 REG16(0x628), 636 REG16(0x62c), 637 REG16(0x630), 638 REG16(0x634), 639 REG16(0x638), 640 REG16(0x63c), 641 REG16(0x640), 642 REG16(0x644), 643 REG16(0x648), 644 REG16(0x64c), 645 REG16(0x650), 646 REG16(0x654), 647 REG16(0x658), 648 REG16(0x65c), 649 REG16(0x660), 650 REG16(0x664), 651 REG16(0x668), 652 REG16(0x66c), 653 REG16(0x670), 654 REG16(0x674), 655 REG16(0x678), 656 REG16(0x67c), 657 REG(0x068), 658 659 END(), 660 }; 661 662 static const u8 gen12_xcs_offsets[] = { 663 NOP(1), 664 LRI(13, POSTED), 665 REG16(0x244), 666 REG(0x034), 667 REG(0x030), 668 REG(0x038), 669 REG(0x03c), 670 REG(0x168), 671 REG(0x140), 672 REG(0x110), 673 REG(0x1c0), 674 REG(0x1c4), 675 
REG(0x1c8), 676 REG(0x180), 677 REG16(0x2b4), 678 679 NOP(5), 680 LRI(9, POSTED), 681 REG16(0x3a8), 682 REG16(0x28c), 683 REG16(0x288), 684 REG16(0x284), 685 REG16(0x280), 686 REG16(0x27c), 687 REG16(0x278), 688 REG16(0x274), 689 REG16(0x270), 690 691 END(), 692 }; 693 694 static const u8 gen8_rcs_offsets[] = { 695 NOP(1), 696 LRI(14, POSTED), 697 REG16(0x244), 698 REG(0x034), 699 REG(0x030), 700 REG(0x038), 701 REG(0x03c), 702 REG(0x168), 703 REG(0x140), 704 REG(0x110), 705 REG(0x11c), 706 REG(0x114), 707 REG(0x118), 708 REG(0x1c0), 709 REG(0x1c4), 710 REG(0x1c8), 711 712 NOP(3), 713 LRI(9, POSTED), 714 REG16(0x3a8), 715 REG16(0x28c), 716 REG16(0x288), 717 REG16(0x284), 718 REG16(0x280), 719 REG16(0x27c), 720 REG16(0x278), 721 REG16(0x274), 722 REG16(0x270), 723 724 NOP(13), 725 LRI(1, 0), 726 REG(0x0c8), 727 728 END(), 729 }; 730 731 static const u8 gen11_rcs_offsets[] = { 732 NOP(1), 733 LRI(15, POSTED), 734 REG16(0x244), 735 REG(0x034), 736 REG(0x030), 737 REG(0x038), 738 REG(0x03c), 739 REG(0x168), 740 REG(0x140), 741 REG(0x110), 742 REG(0x11c), 743 REG(0x114), 744 REG(0x118), 745 REG(0x1c0), 746 REG(0x1c4), 747 REG(0x1c8), 748 REG(0x180), 749 750 NOP(1), 751 LRI(9, POSTED), 752 REG16(0x3a8), 753 REG16(0x28c), 754 REG16(0x288), 755 REG16(0x284), 756 REG16(0x280), 757 REG16(0x27c), 758 REG16(0x278), 759 REG16(0x274), 760 REG16(0x270), 761 762 LRI(1, POSTED), 763 REG(0x1b0), 764 765 NOP(10), 766 LRI(1, 0), 767 REG(0x0c8), 768 769 END(), 770 }; 771 772 static const u8 gen12_rcs_offsets[] = { 773 NOP(1), 774 LRI(13, POSTED), 775 REG16(0x244), 776 REG(0x034), 777 REG(0x030), 778 REG(0x038), 779 REG(0x03c), 780 REG(0x168), 781 REG(0x140), 782 REG(0x110), 783 REG(0x1c0), 784 REG(0x1c4), 785 REG(0x1c8), 786 REG(0x180), 787 REG16(0x2b4), 788 789 NOP(5), 790 LRI(9, POSTED), 791 REG16(0x3a8), 792 REG16(0x28c), 793 REG16(0x288), 794 REG16(0x284), 795 REG16(0x280), 796 REG16(0x27c), 797 REG16(0x278), 798 REG16(0x274), 799 REG16(0x270), 800 801 LRI(3, POSTED), 802 REG(0x1b0), 803 REG16(0x5a8), 804 REG16(0x5ac), 805 806 NOP(6), 807 LRI(1, 0), 808 REG(0x0c8), 809 810 END(), 811 }; 812 813 #undef END 814 #undef REG16 815 #undef REG 816 #undef LRI 817 #undef NOP 818 819 static const u8 *reg_offsets(const struct intel_engine_cs *engine) 820 { 821 /* 822 * The gen12+ lists only have the registers we program in the basic 823 * default state. We rely on the context image using relative 824 * addressing to automatic fixup the register state between the 825 * physical engines for virtual engine. 
826 */ 827 GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 && 828 !intel_engine_has_relative_mmio(engine)); 829 830 if (engine->class == RENDER_CLASS) { 831 if (INTEL_GEN(engine->i915) >= 12) 832 return gen12_rcs_offsets; 833 else if (INTEL_GEN(engine->i915) >= 11) 834 return gen11_rcs_offsets; 835 else 836 return gen8_rcs_offsets; 837 } else { 838 if (INTEL_GEN(engine->i915) >= 12) 839 return gen12_xcs_offsets; 840 else if (INTEL_GEN(engine->i915) >= 9) 841 return gen9_xcs_offsets; 842 else 843 return gen8_xcs_offsets; 844 } 845 } 846 847 static void unwind_wa_tail(struct i915_request *rq) 848 { 849 rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES); 850 assert_ring_tail_valid(rq->ring, rq->tail); 851 } 852 853 static struct i915_request * 854 __unwind_incomplete_requests(struct intel_engine_cs *engine) 855 { 856 struct i915_request *rq, *rn, *active = NULL; 857 struct list_head *uninitialized_var(pl); 858 int prio = I915_PRIORITY_INVALID; 859 860 lockdep_assert_held(&engine->active.lock); 861 862 list_for_each_entry_safe_reverse(rq, rn, 863 &engine->active.requests, 864 sched.link) { 865 866 if (i915_request_completed(rq)) 867 continue; /* XXX */ 868 869 __i915_request_unsubmit(rq); 870 unwind_wa_tail(rq); 871 872 /* 873 * Push the request back into the queue for later resubmission. 874 * If this request is not native to this physical engine (i.e. 875 * it came from a virtual source), push it back onto the virtual 876 * engine so that it can be moved across onto another physical 877 * engine as load dictates. 878 */ 879 if (likely(rq->execution_mask == engine->mask)) { 880 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID); 881 if (rq_prio(rq) != prio) { 882 prio = rq_prio(rq); 883 pl = i915_sched_lookup_priolist(engine, prio); 884 } 885 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 886 887 list_move(&rq->sched.link, pl); 888 active = rq; 889 } else { 890 struct intel_engine_cs *owner = rq->hw_context->engine; 891 892 /* 893 * Decouple the virtual breadcrumb before moving it 894 * back to the virtual engine -- we don't want the 895 * request to complete in the background and try 896 * and cancel the breadcrumb on the virtual engine 897 * (instead of the old engine where it is linked)! 898 */ 899 if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, 900 &rq->fence.flags)) { 901 spin_lock_nested(&rq->lock, 902 SINGLE_DEPTH_NESTING); 903 i915_request_cancel_breadcrumb(rq); 904 spin_unlock(&rq->lock); 905 } 906 rq->engine = owner; 907 owner->submit_request(rq); 908 active = NULL; 909 } 910 } 911 912 return active; 913 } 914 915 struct i915_request * 916 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists) 917 { 918 struct intel_engine_cs *engine = 919 container_of(execlists, typeof(*engine), execlists); 920 921 return __unwind_incomplete_requests(engine); 922 } 923 924 static inline void 925 execlists_context_status_change(struct i915_request *rq, unsigned long status) 926 { 927 /* 928 * Only used when GVT-g is enabled now. When GVT-g is disabled, 929 * The compiler should eliminate this function as dead-code. 
930 */ 931 if (!IS_ENABLED(CONFIG_DRM_I915_GVT)) 932 return; 933 934 atomic_notifier_call_chain(&rq->engine->context_status_notifier, 935 status, rq); 936 } 937 938 static void intel_engine_context_in(struct intel_engine_cs *engine) 939 { 940 unsigned long flags; 941 942 if (READ_ONCE(engine->stats.enabled) == 0) 943 return; 944 945 write_seqlock_irqsave(&engine->stats.lock, flags); 946 947 if (engine->stats.enabled > 0) { 948 if (engine->stats.active++ == 0) 949 engine->stats.start = ktime_get(); 950 GEM_BUG_ON(engine->stats.active == 0); 951 } 952 953 write_sequnlock_irqrestore(&engine->stats.lock, flags); 954 } 955 956 static void intel_engine_context_out(struct intel_engine_cs *engine) 957 { 958 unsigned long flags; 959 960 if (READ_ONCE(engine->stats.enabled) == 0) 961 return; 962 963 write_seqlock_irqsave(&engine->stats.lock, flags); 964 965 if (engine->stats.enabled > 0) { 966 ktime_t last; 967 968 if (engine->stats.active && --engine->stats.active == 0) { 969 /* 970 * Decrement the active context count and in case GPU 971 * is now idle add up to the running total. 972 */ 973 last = ktime_sub(ktime_get(), engine->stats.start); 974 975 engine->stats.total = ktime_add(engine->stats.total, 976 last); 977 } else if (engine->stats.active == 0) { 978 /* 979 * After turning on engine stats, context out might be 980 * the first event in which case we account from the 981 * time stats gathering was turned on. 982 */ 983 last = ktime_sub(ktime_get(), engine->stats.enabled_at); 984 985 engine->stats.total = ktime_add(engine->stats.total, 986 last); 987 } 988 } 989 990 write_sequnlock_irqrestore(&engine->stats.lock, flags); 991 } 992 993 static void restore_default_state(struct intel_context *ce, 994 struct intel_engine_cs *engine) 995 { 996 u32 *regs = ce->lrc_reg_state; 997 998 if (engine->pinned_default_state) 999 memcpy(regs, /* skip restoring the vanilla PPHWSP */ 1000 engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE, 1001 engine->context_size - PAGE_SIZE); 1002 1003 execlists_init_reg_state(regs, ce, engine, ce->ring, false); 1004 } 1005 1006 static void reset_active(struct i915_request *rq, 1007 struct intel_engine_cs *engine) 1008 { 1009 struct intel_context * const ce = rq->hw_context; 1010 u32 head; 1011 1012 /* 1013 * The executing context has been cancelled. We want to prevent 1014 * further execution along this context and propagate the error on 1015 * to anything depending on its results. 1016 * 1017 * In __i915_request_submit(), we apply the -EIO and remove the 1018 * requests' payloads for any banned requests. But first, we must 1019 * rewind the context back to the start of the incomplete request so 1020 * that we do not jump back into the middle of the batch. 1021 * 1022 * We preserve the breadcrumbs and semaphores of the incomplete 1023 * requests so that inter-timeline dependencies (i.e other timelines) 1024 * remain correctly ordered. And we defer to __i915_request_submit() 1025 * so that all asynchronous waits are correctly handled. 
1026 */ 1027 GEM_TRACE("%s(%s): { rq=%llx:%lld }\n", 1028 __func__, engine->name, rq->fence.context, rq->fence.seqno); 1029 1030 /* On resubmission of the active request, payload will be scrubbed */ 1031 if (i915_request_completed(rq)) 1032 head = rq->tail; 1033 else 1034 head = active_request(ce->timeline, rq)->head; 1035 ce->ring->head = intel_ring_wrap(ce->ring, head); 1036 intel_ring_update_space(ce->ring); 1037 1038 /* Scrub the context image to prevent replaying the previous batch */ 1039 restore_default_state(ce, engine); 1040 __execlists_update_reg_state(ce, engine); 1041 1042 /* We've switched away, so this should be a no-op, but intent matters */ 1043 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; 1044 } 1045 1046 static inline struct intel_engine_cs * 1047 __execlists_schedule_in(struct i915_request *rq) 1048 { 1049 struct intel_engine_cs * const engine = rq->engine; 1050 struct intel_context * const ce = rq->hw_context; 1051 1052 intel_context_get(ce); 1053 1054 if (unlikely(i915_gem_context_is_banned(ce->gem_context))) 1055 reset_active(rq, engine); 1056 1057 if (ce->tag) { 1058 /* Use a fixed tag for OA and friends */ 1059 ce->lrc_desc |= (u64)ce->tag << 32; 1060 } else { 1061 /* We don't need a strict matching tag, just different values */ 1062 ce->lrc_desc &= ~GENMASK_ULL(47, 37); 1063 ce->lrc_desc |= 1064 (u64)(engine->context_tag++ % NUM_CONTEXT_TAG) << 1065 GEN11_SW_CTX_ID_SHIFT; 1066 BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID); 1067 } 1068 1069 intel_gt_pm_get(engine->gt); 1070 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN); 1071 intel_engine_context_in(engine); 1072 1073 return engine; 1074 } 1075 1076 static inline struct i915_request * 1077 execlists_schedule_in(struct i915_request *rq, int idx) 1078 { 1079 struct intel_context * const ce = rq->hw_context; 1080 struct intel_engine_cs *old; 1081 1082 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine)); 1083 trace_i915_request_in(rq, idx); 1084 1085 old = READ_ONCE(ce->inflight); 1086 do { 1087 if (!old) { 1088 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq)); 1089 break; 1090 } 1091 } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old))); 1092 1093 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine); 1094 return i915_request_get(rq); 1095 } 1096 1097 static void kick_siblings(struct i915_request *rq, struct intel_context *ce) 1098 { 1099 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 1100 struct i915_request *next = READ_ONCE(ve->request); 1101 1102 if (next && next->execution_mask & ~rq->execution_mask) 1103 tasklet_schedule(&ve->base.execlists.tasklet); 1104 } 1105 1106 static inline void 1107 __execlists_schedule_out(struct i915_request *rq, 1108 struct intel_engine_cs * const engine) 1109 { 1110 struct intel_context * const ce = rq->hw_context; 1111 1112 /* 1113 * NB process_csb() is not under the engine->active.lock and hence 1114 * schedule_out can race with schedule_in meaning that we should 1115 * refrain from doing non-trivial work here. 1116 */ 1117 1118 intel_engine_context_out(engine); 1119 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT); 1120 intel_gt_pm_put(engine->gt); 1121 1122 /* 1123 * If this is part of a virtual engine, its next request may 1124 * have been blocked waiting for access to the active context. 1125 * We have to kick all the siblings again in case we need to 1126 * switch (e.g. the next request is not runnable on this 1127 * engine). 
Hopefully, we will already have submitted the next 1128 * request before the tasklet runs and do not need to rebuild 1129 * each virtual tree and kick everyone again. 1130 */ 1131 if (ce->engine != engine) 1132 kick_siblings(rq, ce); 1133 1134 intel_context_put(ce); 1135 } 1136 1137 static inline void 1138 execlists_schedule_out(struct i915_request *rq) 1139 { 1140 struct intel_context * const ce = rq->hw_context; 1141 struct intel_engine_cs *cur, *old; 1142 1143 trace_i915_request_out(rq); 1144 1145 old = READ_ONCE(ce->inflight); 1146 do 1147 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL; 1148 while (!try_cmpxchg(&ce->inflight, &old, cur)); 1149 if (!cur) 1150 __execlists_schedule_out(rq, old); 1151 1152 i915_request_put(rq); 1153 } 1154 1155 static u64 execlists_update_context(const struct i915_request *rq) 1156 { 1157 struct intel_context *ce = rq->hw_context; 1158 u64 desc; 1159 1160 ce->lrc_reg_state[CTX_RING_TAIL] = 1161 intel_ring_set_tail(rq->ring, rq->tail); 1162 1163 /* 1164 * Make sure the context image is complete before we submit it to HW. 1165 * 1166 * Ostensibly, writes (including the WCB) should be flushed prior to 1167 * an uncached write such as our mmio register access, the empirical 1168 * evidence (esp. on Braswell) suggests that the WC write into memory 1169 * may not be visible to the HW prior to the completion of the UC 1170 * register write and that we may begin execution from the context 1171 * before its image is complete leading to invalid PD chasing. 1172 * 1173 * Furthermore, Braswell, at least, wants a full mb to be sure that 1174 * the writes are coherent in memory (visible to the GPU) prior to 1175 * execution, and not just visible to other CPUs (as is the result of 1176 * wmb). 1177 */ 1178 mb(); 1179 1180 desc = ce->lrc_desc; 1181 ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE; 1182 1183 /* Wa_1607138340:tgl */ 1184 if (IS_TGL_REVID(rq->i915, TGL_REVID_A0, TGL_REVID_A0)) 1185 desc |= CTX_DESC_FORCE_RESTORE; 1186 1187 return desc; 1188 } 1189 1190 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port) 1191 { 1192 if (execlists->ctrl_reg) { 1193 writel(lower_32_bits(desc), execlists->submit_reg + port * 2); 1194 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1); 1195 } else { 1196 writel(upper_32_bits(desc), execlists->submit_reg); 1197 writel(lower_32_bits(desc), execlists->submit_reg); 1198 } 1199 } 1200 1201 static __maybe_unused void 1202 trace_ports(const struct intel_engine_execlists *execlists, 1203 const char *msg, 1204 struct i915_request * const *ports) 1205 { 1206 const struct intel_engine_cs *engine = 1207 container_of(execlists, typeof(*engine), execlists); 1208 1209 if (!ports[0]) 1210 return; 1211 1212 GEM_TRACE("%s: %s { %llx:%lld%s, %llx:%lld }\n", 1213 engine->name, msg, 1214 ports[0]->fence.context, 1215 ports[0]->fence.seqno, 1216 i915_request_completed(ports[0]) ? "!" : 1217 i915_request_started(ports[0]) ? "*" : 1218 "", 1219 ports[1] ? ports[1]->fence.context : 0, 1220 ports[1] ? 
ports[1]->fence.seqno : 0); 1221 } 1222 1223 static __maybe_unused bool 1224 assert_pending_valid(const struct intel_engine_execlists *execlists, 1225 const char *msg) 1226 { 1227 struct i915_request * const *port, *rq; 1228 struct intel_context *ce = NULL; 1229 1230 trace_ports(execlists, msg, execlists->pending); 1231 1232 if (!execlists->pending[0]) { 1233 GEM_TRACE_ERR("Nothing pending for promotion!\n"); 1234 return false; 1235 } 1236 1237 if (execlists->pending[execlists_num_ports(execlists)]) { 1238 GEM_TRACE_ERR("Excess pending[%d] for promotion!\n", 1239 execlists_num_ports(execlists)); 1240 return false; 1241 } 1242 1243 for (port = execlists->pending; (rq = *port); port++) { 1244 if (ce == rq->hw_context) { 1245 GEM_TRACE_ERR("Duplicate context in pending[%zd]\n", 1246 port - execlists->pending); 1247 return false; 1248 } 1249 1250 ce = rq->hw_context; 1251 if (i915_request_completed(rq)) 1252 continue; 1253 1254 if (i915_active_is_idle(&ce->active)) { 1255 GEM_TRACE_ERR("Inactive context in pending[%zd]\n", 1256 port - execlists->pending); 1257 return false; 1258 } 1259 1260 if (!i915_vma_is_pinned(ce->state)) { 1261 GEM_TRACE_ERR("Unpinned context in pending[%zd]\n", 1262 port - execlists->pending); 1263 return false; 1264 } 1265 1266 if (!i915_vma_is_pinned(ce->ring->vma)) { 1267 GEM_TRACE_ERR("Unpinned ringbuffer in pending[%zd]\n", 1268 port - execlists->pending); 1269 return false; 1270 } 1271 } 1272 1273 return ce; 1274 } 1275 1276 static void execlists_submit_ports(struct intel_engine_cs *engine) 1277 { 1278 struct intel_engine_execlists *execlists = &engine->execlists; 1279 unsigned int n; 1280 1281 GEM_BUG_ON(!assert_pending_valid(execlists, "submit")); 1282 1283 /* 1284 * We can skip acquiring intel_runtime_pm_get() here as it was taken 1285 * on our behalf by the request (see i915_gem_mark_busy()) and it will 1286 * not be relinquished until the device is idle (see 1287 * i915_gem_idle_work_handler()). As a precaution, we make sure 1288 * that all ELSP are drained i.e. we have processed the CSB, 1289 * before allowing ourselves to idle and calling intel_runtime_pm_put(). 1290 */ 1291 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 1292 1293 /* 1294 * ELSQ note: the submit queue is not cleared after being submitted 1295 * to the HW so we need to make sure we always clean it up. This is 1296 * currently ensured by the fact that we always write the same number 1297 * of elsq entries, keep this in mind before changing the loop below. 1298 */ 1299 for (n = execlists_num_ports(execlists); n--; ) { 1300 struct i915_request *rq = execlists->pending[n]; 1301 1302 write_desc(execlists, 1303 rq ? 
execlists_update_context(rq) : 0,
			   n);
	}

	/* we need to manually load the submit queue */
	if (execlists->ctrl_reg)
		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
}

static bool ctx_single_port_submission(const struct intel_context *ce)
{
	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
		i915_gem_context_force_single_submission(ce->gem_context));
}

static bool can_merge_ctx(const struct intel_context *prev,
			  const struct intel_context *next)
{
	if (prev != next)
		return false;

	if (ctx_single_port_submission(prev))
		return false;

	return true;
}

static bool can_merge_rq(const struct i915_request *prev,
			 const struct i915_request *next)
{
	GEM_BUG_ON(prev == next);
	GEM_BUG_ON(!assert_priority_queue(prev, next));

	/*
	 * We do not submit known completed requests. Therefore if the next
	 * request is already completed, we can pretend to merge it in
	 * with the previous context (and we will skip updating the ELSP
	 * and tracking). Thus hopefully keeping the ELSP full with active
	 * contexts, despite the best efforts of preempt-to-busy to confuse
	 * us.
	 */
	if (i915_request_completed(next))
		return true;

	if (unlikely((prev->flags ^ next->flags) &
		     (I915_REQUEST_NOPREEMPT | I915_REQUEST_SENTINEL)))
		return false;

	if (!can_merge_ctx(prev->hw_context, next->hw_context))
		return false;

	return true;
}

static void virtual_update_register_offsets(u32 *regs,
					    struct intel_engine_cs *engine)
{
	set_offsets(regs, reg_offsets(engine), engine);
}

static bool virtual_matches(const struct virtual_engine *ve,
			    const struct i915_request *rq,
			    const struct intel_engine_cs *engine)
{
	const struct intel_engine_cs *inflight;

	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
		return false;

	/*
	 * We track when the HW has completed saving the context image
	 * (i.e. when we have seen the final CS event switching out of
	 * the context) and must not overwrite the context image before
	 * then. This restricts us to only using the active engine
	 * while the previous virtualized request is inflight (so
	 * we reuse the register offsets). This is a very small
	 * hysteresis on the greedy selection algorithm.
1380 */ 1381 inflight = intel_context_inflight(&ve->context); 1382 if (inflight && inflight != engine) 1383 return false; 1384 1385 return true; 1386 } 1387 1388 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve, 1389 struct intel_engine_cs *engine) 1390 { 1391 struct intel_engine_cs *old = ve->siblings[0]; 1392 1393 /* All unattached (rq->engine == old) must already be completed */ 1394 1395 spin_lock(&old->breadcrumbs.irq_lock); 1396 if (!list_empty(&ve->context.signal_link)) { 1397 list_move_tail(&ve->context.signal_link, 1398 &engine->breadcrumbs.signalers); 1399 intel_engine_queue_breadcrumbs(engine); 1400 } 1401 spin_unlock(&old->breadcrumbs.irq_lock); 1402 } 1403 1404 static struct i915_request * 1405 last_active(const struct intel_engine_execlists *execlists) 1406 { 1407 struct i915_request * const *last = READ_ONCE(execlists->active); 1408 1409 while (*last && i915_request_completed(*last)) 1410 last++; 1411 1412 return *last; 1413 } 1414 1415 static void defer_request(struct i915_request *rq, struct list_head * const pl) 1416 { 1417 LIST_HEAD(list); 1418 1419 /* 1420 * We want to move the interrupted request to the back of 1421 * the round-robin list (i.e. its priority level), but 1422 * in doing so, we must then move all requests that were in 1423 * flight and were waiting for the interrupted request to 1424 * be run after it again. 1425 */ 1426 do { 1427 struct i915_dependency *p; 1428 1429 GEM_BUG_ON(i915_request_is_active(rq)); 1430 list_move_tail(&rq->sched.link, pl); 1431 1432 list_for_each_entry(p, &rq->sched.waiters_list, wait_link) { 1433 struct i915_request *w = 1434 container_of(p->waiter, typeof(*w), sched); 1435 1436 /* Leave semaphores spinning on the other engines */ 1437 if (w->engine != rq->engine) 1438 continue; 1439 1440 /* No waiter should start before its signaler */ 1441 GEM_BUG_ON(i915_request_started(w) && 1442 !i915_request_completed(rq)); 1443 1444 GEM_BUG_ON(i915_request_is_active(w)); 1445 if (list_empty(&w->sched.link)) 1446 continue; /* Not yet submitted; unready */ 1447 1448 if (rq_prio(w) < rq_prio(rq)) 1449 continue; 1450 1451 GEM_BUG_ON(rq_prio(w) > rq_prio(rq)); 1452 list_move_tail(&w->sched.link, &list); 1453 } 1454 1455 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 1456 } while (rq); 1457 } 1458 1459 static void defer_active(struct intel_engine_cs *engine) 1460 { 1461 struct i915_request *rq; 1462 1463 rq = __unwind_incomplete_requests(engine); 1464 if (!rq) 1465 return; 1466 1467 defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq))); 1468 } 1469 1470 static bool 1471 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq) 1472 { 1473 int hint; 1474 1475 if (!intel_engine_has_timeslices(engine)) 1476 return false; 1477 1478 if (list_is_last(&rq->sched.link, &engine->active.requests)) 1479 return false; 1480 1481 hint = max(rq_prio(list_next_entry(rq, sched.link)), 1482 engine->execlists.queue_priority_hint); 1483 1484 return hint >= effective_prio(rq); 1485 } 1486 1487 static int 1488 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq) 1489 { 1490 if (list_is_last(&rq->sched.link, &engine->active.requests)) 1491 return INT_MIN; 1492 1493 return rq_prio(list_next_entry(rq, sched.link)); 1494 } 1495 1496 static inline unsigned long 1497 timeslice(const struct intel_engine_cs *engine) 1498 { 1499 return READ_ONCE(engine->props.timeslice_duration_ms); 1500 } 1501 1502 static unsigned long 1503 active_timeslice(const struct intel_engine_cs *engine) 1504 { 1505 
const struct i915_request *rq = *engine->execlists.active; 1506 1507 if (i915_request_completed(rq)) 1508 return 0; 1509 1510 if (engine->execlists.switch_priority_hint < effective_prio(rq)) 1511 return 0; 1512 1513 return timeslice(engine); 1514 } 1515 1516 static void set_timeslice(struct intel_engine_cs *engine) 1517 { 1518 if (!intel_engine_has_timeslices(engine)) 1519 return; 1520 1521 set_timer_ms(&engine->execlists.timer, active_timeslice(engine)); 1522 } 1523 1524 static void record_preemption(struct intel_engine_execlists *execlists) 1525 { 1526 (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++); 1527 } 1528 1529 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine) 1530 { 1531 struct i915_request *rq; 1532 1533 rq = last_active(&engine->execlists); 1534 if (!rq) 1535 return 0; 1536 1537 /* Force a fast reset for terminated contexts (ignoring sysfs!) */ 1538 if (unlikely(i915_gem_context_is_banned(rq->gem_context))) 1539 return 1; 1540 1541 return READ_ONCE(engine->props.preempt_timeout_ms); 1542 } 1543 1544 static void set_preempt_timeout(struct intel_engine_cs *engine) 1545 { 1546 if (!intel_engine_has_preempt_reset(engine)) 1547 return; 1548 1549 set_timer_ms(&engine->execlists.preempt, 1550 active_preempt_timeout(engine)); 1551 } 1552 1553 static void execlists_dequeue(struct intel_engine_cs *engine) 1554 { 1555 struct intel_engine_execlists * const execlists = &engine->execlists; 1556 struct i915_request **port = execlists->pending; 1557 struct i915_request ** const last_port = port + execlists->port_mask; 1558 struct i915_request *last; 1559 struct rb_node *rb; 1560 bool submit = false; 1561 1562 /* 1563 * Hardware submission is through 2 ports. Conceptually each port 1564 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is 1565 * static for a context, and unique to each, so we only execute 1566 * requests belonging to a single context from each ring. RING_HEAD 1567 * is maintained by the CS in the context image, it marks the place 1568 * where it got up to last time, and through RING_TAIL we tell the CS 1569 * where we want to execute up to this time. 1570 * 1571 * In this list the requests are in order of execution. Consecutive 1572 * requests from the same context are adjacent in the ringbuffer. We 1573 * can combine these requests into a single RING_TAIL update: 1574 * 1575 * RING_HEAD...req1...req2 1576 * ^- RING_TAIL 1577 * since to execute req2 the CS must first execute req1. 1578 * 1579 * Our goal then is to point each port to the end of a consecutive 1580 * sequence of requests as being the most optimal (fewest wake ups 1581 * and context switches) submission. 1582 */ 1583 1584 for (rb = rb_first_cached(&execlists->virtual); rb; ) { 1585 struct virtual_engine *ve = 1586 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 1587 struct i915_request *rq = READ_ONCE(ve->request); 1588 1589 if (!rq) { /* lazily cleanup after another engine handled rq */ 1590 rb_erase_cached(rb, &execlists->virtual); 1591 RB_CLEAR_NODE(rb); 1592 rb = rb_first_cached(&execlists->virtual); 1593 continue; 1594 } 1595 1596 if (!virtual_matches(ve, rq, engine)) { 1597 rb = rb_next(rb); 1598 continue; 1599 } 1600 1601 break; 1602 } 1603 1604 /* 1605 * If the queue is higher priority than the last 1606 * request in the currently active context, submit afresh. 1607 * We will resubmit again afterwards in case we need to split 1608 * the active context to interject the preemption request, 1609 * i.e. 
we will retrigger preemption following the ack in case 1610 * of trouble. 1611 */ 1612 last = last_active(execlists); 1613 if (last) { 1614 if (need_preempt(engine, last, rb)) { 1615 GEM_TRACE("%s: preempting last=%llx:%lld, prio=%d, hint=%d\n", 1616 engine->name, 1617 last->fence.context, 1618 last->fence.seqno, 1619 last->sched.attr.priority, 1620 execlists->queue_priority_hint); 1621 record_preemption(execlists); 1622 1623 /* 1624 * Don't let the RING_HEAD advance past the breadcrumb 1625 * as we unwind (and until we resubmit) so that we do 1626 * not accidentally tell it to go backwards. 1627 */ 1628 ring_set_paused(engine, 1); 1629 1630 /* 1631 * Note that we have not stopped the GPU at this point, 1632 * so we are unwinding the incomplete requests as they 1633 * remain inflight and so by the time we do complete 1634 * the preemption, some of the unwound requests may 1635 * complete! 1636 */ 1637 __unwind_incomplete_requests(engine); 1638 1639 /* 1640 * If we need to return to the preempted context, we 1641 * need to skip the lite-restore and force it to 1642 * reload the RING_TAIL. Otherwise, the HW has a 1643 * tendency to ignore us rewinding the TAIL to the 1644 * end of an earlier request. 1645 */ 1646 last->hw_context->lrc_desc |= CTX_DESC_FORCE_RESTORE; 1647 last = NULL; 1648 } else if (need_timeslice(engine, last) && 1649 timer_expired(&engine->execlists.timer)) { 1650 GEM_TRACE("%s: expired last=%llx:%lld, prio=%d, hint=%d\n", 1651 engine->name, 1652 last->fence.context, 1653 last->fence.seqno, 1654 last->sched.attr.priority, 1655 execlists->queue_priority_hint); 1656 1657 ring_set_paused(engine, 1); 1658 defer_active(engine); 1659 1660 /* 1661 * Unlike for preemption, if we rewind and continue 1662 * executing the same context as previously active, 1663 * the order of execution will remain the same and 1664 * the tail will only advance. We do not need to 1665 * force a full context restore, as a lite-restore 1666 * is sufficient to resample the monotonic TAIL. 1667 * 1668 * If we switch to any other context, similarly we 1669 * will not rewind TAIL of current context, and 1670 * normal save/restore will preserve state and allow 1671 * us to later continue executing the same request. 1672 */ 1673 last = NULL; 1674 } else { 1675 /* 1676 * Otherwise if we already have a request pending 1677 * for execution after the current one, we can 1678 * just wait until the next CS event before 1679 * queuing more. In either case we will force a 1680 * lite-restore preemption event, but if we wait 1681 * we hopefully coalesce several updates into a single 1682 * submission. 1683 */ 1684 if (!list_is_last(&last->sched.link, 1685 &engine->active.requests)) { 1686 /* 1687 * Even if ELSP[1] is occupied and not worthy 1688 * of timeslices, our queue might be. 1689 */ 1690 if (!execlists->timer.expires && 1691 need_timeslice(engine, last)) 1692 set_timer_ms(&execlists->timer, 1693 timeslice(engine)); 1694 1695 return; 1696 } 1697 1698 /* 1699 * WaIdleLiteRestore:bdw,skl 1700 * Apply the wa NOOPs to prevent 1701 * ring:HEAD == rq:TAIL as we resubmit the 1702 * request. See gen8_emit_fini_breadcrumb() for 1703 * where we prepare the padding after the 1704 * end of the request. 
1705 */ 1706 last->tail = last->wa_tail; 1707 } 1708 } 1709 1710 while (rb) { /* XXX virtual is always taking precedence */ 1711 struct virtual_engine *ve = 1712 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 1713 struct i915_request *rq; 1714 1715 spin_lock(&ve->base.active.lock); 1716 1717 rq = ve->request; 1718 if (unlikely(!rq)) { /* lost the race to a sibling */ 1719 spin_unlock(&ve->base.active.lock); 1720 rb_erase_cached(rb, &execlists->virtual); 1721 RB_CLEAR_NODE(rb); 1722 rb = rb_first_cached(&execlists->virtual); 1723 continue; 1724 } 1725 1726 GEM_BUG_ON(rq != ve->request); 1727 GEM_BUG_ON(rq->engine != &ve->base); 1728 GEM_BUG_ON(rq->hw_context != &ve->context); 1729 1730 if (rq_prio(rq) >= queue_prio(execlists)) { 1731 if (!virtual_matches(ve, rq, engine)) { 1732 spin_unlock(&ve->base.active.lock); 1733 rb = rb_next(rb); 1734 continue; 1735 } 1736 1737 if (last && !can_merge_rq(last, rq)) { 1738 spin_unlock(&ve->base.active.lock); 1739 return; /* leave this for another */ 1740 } 1741 1742 GEM_TRACE("%s: virtual rq=%llx:%lld%s, new engine? %s\n", 1743 engine->name, 1744 rq->fence.context, 1745 rq->fence.seqno, 1746 i915_request_completed(rq) ? "!" : 1747 i915_request_started(rq) ? "*" : 1748 "", 1749 yesno(engine != ve->siblings[0])); 1750 1751 ve->request = NULL; 1752 ve->base.execlists.queue_priority_hint = INT_MIN; 1753 rb_erase_cached(rb, &execlists->virtual); 1754 RB_CLEAR_NODE(rb); 1755 1756 GEM_BUG_ON(!(rq->execution_mask & engine->mask)); 1757 rq->engine = engine; 1758 1759 if (engine != ve->siblings[0]) { 1760 u32 *regs = ve->context.lrc_reg_state; 1761 unsigned int n; 1762 1763 GEM_BUG_ON(READ_ONCE(ve->context.inflight)); 1764 1765 if (!intel_engine_has_relative_mmio(engine)) 1766 virtual_update_register_offsets(regs, 1767 engine); 1768 1769 if (!list_empty(&ve->context.signals)) 1770 virtual_xfer_breadcrumbs(ve, engine); 1771 1772 /* 1773 * Move the bound engine to the top of the list 1774 * for future execution. We then kick this 1775 * tasklet first before checking others, so that 1776 * we preferentially reuse this set of bound 1777 * registers. 1778 */ 1779 for (n = 1; n < ve->num_siblings; n++) { 1780 if (ve->siblings[n] == engine) { 1781 swap(ve->siblings[n], 1782 ve->siblings[0]); 1783 break; 1784 } 1785 } 1786 1787 GEM_BUG_ON(ve->siblings[0] != engine); 1788 } 1789 1790 if (__i915_request_submit(rq)) { 1791 submit = true; 1792 last = rq; 1793 } 1794 i915_request_put(rq); 1795 1796 /* 1797 * Hmm, we have a bunch of virtual engine requests, 1798 * but the first one was already completed (thanks 1799 * preempt-to-busy!). Keep looking at the veng queue 1800 * until we have no more relevant requests (i.e. 1801 * the normal submit queue has higher priority). 1802 */ 1803 if (!submit) { 1804 spin_unlock(&ve->base.active.lock); 1805 rb = rb_first_cached(&execlists->virtual); 1806 continue; 1807 } 1808 } 1809 1810 spin_unlock(&ve->base.active.lock); 1811 break; 1812 } 1813 1814 while ((rb = rb_first_cached(&execlists->queue))) { 1815 struct i915_priolist *p = to_priolist(rb); 1816 struct i915_request *rq, *rn; 1817 int i; 1818 1819 priolist_for_each_request_consume(rq, rn, p, i) { 1820 bool merge = true; 1821 1822 /* 1823 * Can we combine this request with the current port? 1824 * It has to be the same context/ringbuffer and not 1825 * have any exceptions (e.g. GVT saying never to 1826 * combine contexts). 
1827 * 1828 * If we can combine the requests, we can execute both 1829 * by updating the RING_TAIL to point to the end of the 1830 * second request, and so we never need to tell the 1831 * hardware about the first. 1832 */ 1833 if (last && !can_merge_rq(last, rq)) { 1834 /* 1835 * If we are on the second port and cannot 1836 * combine this request with the last, then we 1837 * are done. 1838 */ 1839 if (port == last_port) 1840 goto done; 1841 1842 /* 1843 * We must not populate both ELSP[] with the 1844 * same LRCA, i.e. we must submit 2 different 1845 * contexts if we submit 2 ELSP. 1846 */ 1847 if (last->hw_context == rq->hw_context) 1848 goto done; 1849 1850 if (i915_request_has_sentinel(last)) 1851 goto done; 1852 1853 /* 1854 * If GVT overrides us we only ever submit 1855 * port[0], leaving port[1] empty. Note that we 1856 * also have to be careful that we don't queue 1857 * the same context (even though a different 1858 * request) to the second port. 1859 */ 1860 if (ctx_single_port_submission(last->hw_context) || 1861 ctx_single_port_submission(rq->hw_context)) 1862 goto done; 1863 1864 merge = false; 1865 } 1866 1867 if (__i915_request_submit(rq)) { 1868 if (!merge) { 1869 *port = execlists_schedule_in(last, port - execlists->pending); 1870 port++; 1871 last = NULL; 1872 } 1873 1874 GEM_BUG_ON(last && 1875 !can_merge_ctx(last->hw_context, 1876 rq->hw_context)); 1877 1878 submit = true; 1879 last = rq; 1880 } 1881 } 1882 1883 rb_erase_cached(&p->node, &execlists->queue); 1884 i915_priolist_free(p); 1885 } 1886 1887 done: 1888 /* 1889 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer. 1890 * 1891 * We choose the priority hint such that if we add a request of greater 1892 * priority than this, we kick the submission tasklet to decide on 1893 * the right order of submitting the requests to hardware. We must 1894 * also be prepared to reorder requests as they are in-flight on the 1895 * HW. We derive the priority hint then as the first "hole" in 1896 * the HW submission ports and if there are no available slots, 1897 * the priority of the lowest executing request, i.e. last. 1898 * 1899 * When we do receive a higher priority request ready to run from the 1900 * user, see queue_request(), the priority hint is bumped to that 1901 * request triggering preemption on the next dequeue (or subsequent 1902 * interrupt for secondary ports). 1903 */ 1904 execlists->queue_priority_hint = queue_prio(execlists); 1905 GEM_TRACE("%s: queue_priority_hint:%d, submit:%s\n", 1906 engine->name, execlists->queue_priority_hint, 1907 yesno(submit)); 1908 1909 if (submit) { 1910 *port = execlists_schedule_in(last, port - execlists->pending); 1911 execlists->switch_priority_hint = 1912 switch_prio(engine, *execlists->pending); 1913 1914 /* 1915 * Skip if we ended up with exactly the same set of requests, 1916 * e.g. 
trying to timeslice a pair of ordered contexts 1917 */ 1918 if (!memcmp(execlists->active, execlists->pending, 1919 (port - execlists->pending + 1) * sizeof(*port))) { 1920 do 1921 execlists_schedule_out(fetch_and_zero(port)); 1922 while (port-- != execlists->pending); 1923 1924 goto skip_submit; 1925 } 1926 1927 memset(port + 1, 0, (last_port - port) * sizeof(*port)); 1928 execlists_submit_ports(engine); 1929 1930 set_preempt_timeout(engine); 1931 } else { 1932 skip_submit: 1933 ring_set_paused(engine, 0); 1934 } 1935 } 1936 1937 static void 1938 cancel_port_requests(struct intel_engine_execlists * const execlists) 1939 { 1940 struct i915_request * const *port, *rq; 1941 1942 for (port = execlists->pending; (rq = *port); port++) 1943 execlists_schedule_out(rq); 1944 memset(execlists->pending, 0, sizeof(execlists->pending)); 1945 1946 for (port = execlists->active; (rq = *port); port++) 1947 execlists_schedule_out(rq); 1948 execlists->active = 1949 memset(execlists->inflight, 0, sizeof(execlists->inflight)); 1950 } 1951 1952 static inline void 1953 invalidate_csb_entries(const u32 *first, const u32 *last) 1954 { 1955 clflush((void *)first); 1956 clflush((void *)last); 1957 } 1958 1959 static inline bool 1960 reset_in_progress(const struct intel_engine_execlists *execlists) 1961 { 1962 return unlikely(!__tasklet_is_enabled(&execlists->tasklet)); 1963 } 1964 1965 /* 1966 * Starting with Gen12, the status has a new format: 1967 * 1968 * bit 0: switched to new queue 1969 * bit 1: reserved 1970 * bit 2: semaphore wait mode (poll or signal), only valid when 1971 * switch detail is set to "wait on semaphore" 1972 * bits 3-5: engine class 1973 * bits 6-11: engine instance 1974 * bits 12-14: reserved 1975 * bits 15-25: sw context id of the lrc the GT switched to 1976 * bits 26-31: sw counter of the lrc the GT switched to 1977 * bits 32-35: context switch detail 1978 * - 0: ctx complete 1979 * - 1: wait on sync flip 1980 * - 2: wait on vblank 1981 * - 3: wait on scanline 1982 * - 4: wait on semaphore 1983 * - 5: context preempted (not on SEMAPHORE_WAIT or 1984 * WAIT_FOR_EVENT) 1985 * bit 36: reserved 1986 * bits 37-43: wait detail (for switch detail 1 to 4) 1987 * bits 44-46: reserved 1988 * bits 47-57: sw context id of the lrc the GT switched away from 1989 * bits 58-63: sw counter of the lrc the GT switched away from 1990 */ 1991 static inline bool 1992 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) 1993 { 1994 u32 lower_dw = csb[0]; 1995 u32 upper_dw = csb[1]; 1996 bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw); 1997 bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw); 1998 bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; 1999 2000 /* 2001 * The context switch detail is not guaranteed to be 5 when a preemption 2002 * occurs, so we can't just check for that. The check below works for 2003 * all the cases we care about, including preemptions of WAIT 2004 * instructions and lite-restore. Preempt-to-idle via the CTRL register 2005 * would require some extra handling, but we don't support that. 2006 */ 2007 if (!ctx_away_valid || new_queue) { 2008 GEM_BUG_ON(!ctx_to_valid); 2009 return true; 2010 } 2011 2012 /* 2013 * switch detail = 5 is covered by the case above and we do not expect a 2014 * context switch on an unsuccessful wait instruction since we always 2015 * use polling mode. 
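 *
 * Returning true here tells process_csb() to promote the pending[]
 * set of ports into inflight[]; returning false means only the
 * request in the first inflight[] port has completed and the ports
 * advance by one.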
2016 */ 2017 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw)); 2018 return false; 2019 } 2020 2021 static inline bool 2022 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) 2023 { 2024 return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED); 2025 } 2026 2027 static void process_csb(struct intel_engine_cs *engine) 2028 { 2029 struct intel_engine_execlists * const execlists = &engine->execlists; 2030 const u32 * const buf = execlists->csb_status; 2031 const u8 num_entries = execlists->csb_size; 2032 u8 head, tail; 2033 2034 /* 2035 * As we modify our execlists state tracking we require exclusive 2036 * access. Either we are inside the tasklet, or the tasklet is disabled 2037 * and we assume that is only inside the reset paths and so serialised. 2038 */ 2039 GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) && 2040 !reset_in_progress(execlists)); 2041 GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine)); 2042 2043 /* 2044 * Note that csb_write, csb_status may be either in HWSP or mmio. 2045 * When reading from the csb_write mmio register, we have to be 2046 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is 2047 * the low 4bits. As it happens we know the next 4bits are always 2048 * zero and so we can simply masked off the low u8 of the register 2049 * and treat it identically to reading from the HWSP (without having 2050 * to use explicit shifting and masking, and probably bifurcating 2051 * the code to handle the legacy mmio read). 2052 */ 2053 head = execlists->csb_head; 2054 tail = READ_ONCE(*execlists->csb_write); 2055 GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail); 2056 if (unlikely(head == tail)) 2057 return; 2058 2059 /* 2060 * Hopefully paired with a wmb() in HW! 2061 * 2062 * We must complete the read of the write pointer before any reads 2063 * from the CSB, so that we do not see stale values. Without an rmb 2064 * (lfence) the HW may speculatively perform the CSB[] reads *before* 2065 * we perform the READ_ONCE(*csb_write). 2066 */ 2067 rmb(); 2068 2069 do { 2070 bool promote; 2071 2072 if (++head == num_entries) 2073 head = 0; 2074 2075 /* 2076 * We are flying near dragons again. 2077 * 2078 * We hold a reference to the request in execlist_port[] 2079 * but no more than that. We are operating in softirq 2080 * context and so cannot hold any mutex or sleep. That 2081 * prevents us stopping the requests we are processing 2082 * in port[] from being retired simultaneously (the 2083 * breadcrumb will be complete before we see the 2084 * context-switch). As we only hold the reference to the 2085 * request, any pointer chasing underneath the request 2086 * is subject to a potential use-after-free. Thus we 2087 * store all of the bookkeeping within port[] as 2088 * required, and avoid using unguarded pointers beneath 2089 * request itself. The same applies to the atomic 2090 * status notifier. 
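 *
 * Concretely: the loop below never dereferences a request again once
 * it has been handed to execlists_schedule_out(); all we keep are the
 * inflight[]/pending[] port slots themselves.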
2091 */ 2092 2093 GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x\n", 2094 engine->name, head, 2095 buf[2 * head + 0], buf[2 * head + 1]); 2096 2097 if (INTEL_GEN(engine->i915) >= 12) 2098 promote = gen12_csb_parse(execlists, buf + 2 * head); 2099 else 2100 promote = gen8_csb_parse(execlists, buf + 2 * head); 2101 if (promote) { 2102 if (!inject_preempt_hang(execlists)) 2103 ring_set_paused(engine, 0); 2104 2105 /* cancel old inflight, prepare for switch */ 2106 trace_ports(execlists, "preempted", execlists->active); 2107 while (*execlists->active) 2108 execlists_schedule_out(*execlists->active++); 2109 2110 /* switch pending to inflight */ 2111 GEM_BUG_ON(!assert_pending_valid(execlists, "promote")); 2112 execlists->active = 2113 memcpy(execlists->inflight, 2114 execlists->pending, 2115 execlists_num_ports(execlists) * 2116 sizeof(*execlists->pending)); 2117 2118 set_timeslice(engine); 2119 2120 WRITE_ONCE(execlists->pending[0], NULL); 2121 } else { 2122 GEM_BUG_ON(!*execlists->active); 2123 2124 /* port0 completed, advanced to port1 */ 2125 trace_ports(execlists, "completed", execlists->active); 2126 2127 /* 2128 * We rely on the hardware being strongly 2129 * ordered, so that the breadcrumb write is 2130 * coherent (visible from the CPU) before the 2131 * user interrupt and CSB are processed. 2132 */ 2133 GEM_BUG_ON(!i915_request_completed(*execlists->active) && 2134 !reset_in_progress(execlists)); 2135 execlists_schedule_out(*execlists->active++); 2136 2137 GEM_BUG_ON(execlists->active - execlists->inflight > 2138 execlists_num_ports(execlists)); 2139 } 2140 } while (head != tail); 2141 2142 execlists->csb_head = head; 2143 2144 /* 2145 * Gen11 has proven to fail wrt the global observation point between 2146 * entry and tail update, failing on the ordering and thus 2147 * we see an old entry in the context status buffer. 2148 * 2149 * Forcibly evict the entries before the next gpu csb update, 2150 * to increase the odds that we get fresh entries from the 2151 * misbehaving hardware. The cost of doing so comes out in the 2152 * wash, as the hardware, working or not, will need to do the 2153 * invalidation beforehand.
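 *
 * invalidate_csb_entries() above simply clflushes the cachelines
 * holding the first and the last entry, so the next read is forced
 * to fetch fresh data rather than reuse a stale line.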
2154 */ 2155 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]); 2156 } 2157 2158 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine) 2159 { 2160 lockdep_assert_held(&engine->active.lock); 2161 if (!engine->execlists.pending[0]) { 2162 rcu_read_lock(); /* protect peeking at execlists->active */ 2163 execlists_dequeue(engine); 2164 rcu_read_unlock(); 2165 } 2166 } 2167 2168 static noinline void preempt_reset(struct intel_engine_cs *engine) 2169 { 2170 const unsigned int bit = I915_RESET_ENGINE + engine->id; 2171 unsigned long *lock = &engine->gt->reset.flags; 2172 2173 if (i915_modparams.reset < 3) 2174 return; 2175 2176 if (test_and_set_bit(bit, lock)) 2177 return; 2178 2179 /* Mark this tasklet as disabled to avoid waiting for it to complete */ 2180 tasklet_disable_nosync(&engine->execlists.tasklet); 2181 2182 GEM_TRACE("%s: preempt timeout %lu+%ums\n", 2183 engine->name, 2184 READ_ONCE(engine->props.preempt_timeout_ms), 2185 jiffies_to_msecs(jiffies - engine->execlists.preempt.expires)); 2186 intel_engine_reset(engine, "preemption time out"); 2187 2188 tasklet_enable(&engine->execlists.tasklet); 2189 clear_and_wake_up_bit(bit, lock); 2190 } 2191 2192 static bool preempt_timeout(const struct intel_engine_cs *const engine) 2193 { 2194 const struct timer_list *t = &engine->execlists.preempt; 2195 2196 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT) 2197 return false; 2198 2199 if (!timer_expired(t)) 2200 return false; 2201 2202 return READ_ONCE(engine->execlists.pending[0]); 2203 } 2204 2205 /* 2206 * Check the unread Context Status Buffers and manage the submission of new 2207 * contexts to the ELSP accordingly. 2208 */ 2209 static void execlists_submission_tasklet(unsigned long data) 2210 { 2211 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 2212 bool timeout = preempt_timeout(engine); 2213 2214 process_csb(engine); 2215 if (!READ_ONCE(engine->execlists.pending[0]) || timeout) { 2216 unsigned long flags; 2217 2218 spin_lock_irqsave(&engine->active.lock, flags); 2219 __execlists_submission_tasklet(engine); 2220 spin_unlock_irqrestore(&engine->active.lock, flags); 2221 2222 /* Recheck after serialising with direct-submission */ 2223 if (timeout && preempt_timeout(engine)) 2224 preempt_reset(engine); 2225 } 2226 } 2227 2228 static void __execlists_kick(struct intel_engine_execlists *execlists) 2229 { 2230 /* Kick the tasklet for some interrupt coalescing and reset handling */ 2231 tasklet_hi_schedule(&execlists->tasklet); 2232 } 2233 2234 #define execlists_kick(t, member) \ 2235 __execlists_kick(container_of(t, struct intel_engine_execlists, member)) 2236 2237 static void execlists_timeslice(struct timer_list *timer) 2238 { 2239 execlists_kick(timer, timer); 2240 } 2241 2242 static void execlists_preempt(struct timer_list *timer) 2243 { 2244 execlists_kick(timer, preempt); 2245 } 2246 2247 static void queue_request(struct intel_engine_cs *engine, 2248 struct i915_sched_node *node, 2249 int prio) 2250 { 2251 GEM_BUG_ON(!list_empty(&node->link)); 2252 list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio)); 2253 } 2254 2255 static void __submit_queue_imm(struct intel_engine_cs *engine) 2256 { 2257 struct intel_engine_execlists * const execlists = &engine->execlists; 2258 2259 if (reset_in_progress(execlists)) 2260 return; /* defer until we restart the engine following reset */ 2261 2262 if (execlists->tasklet.func == execlists_submission_tasklet) 2263 __execlists_submission_tasklet(engine); 2264 else 2265 
tasklet_hi_schedule(&execlists->tasklet); 2266 } 2267 2268 static void submit_queue(struct intel_engine_cs *engine, 2269 const struct i915_request *rq) 2270 { 2271 struct intel_engine_execlists *execlists = &engine->execlists; 2272 2273 if (rq_prio(rq) <= execlists->queue_priority_hint) 2274 return; 2275 2276 execlists->queue_priority_hint = rq_prio(rq); 2277 __submit_queue_imm(engine); 2278 } 2279 2280 static void execlists_submit_request(struct i915_request *request) 2281 { 2282 struct intel_engine_cs *engine = request->engine; 2283 unsigned long flags; 2284 2285 /* Will be called from irq-context when using foreign fences. */ 2286 spin_lock_irqsave(&engine->active.lock, flags); 2287 2288 queue_request(engine, &request->sched, rq_prio(request)); 2289 2290 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 2291 GEM_BUG_ON(list_empty(&request->sched.link)); 2292 2293 submit_queue(engine, request); 2294 2295 spin_unlock_irqrestore(&engine->active.lock, flags); 2296 } 2297 2298 static void __execlists_context_fini(struct intel_context *ce) 2299 { 2300 intel_ring_put(ce->ring); 2301 i915_vma_put(ce->state); 2302 } 2303 2304 static void execlists_context_destroy(struct kref *kref) 2305 { 2306 struct intel_context *ce = container_of(kref, typeof(*ce), ref); 2307 2308 GEM_BUG_ON(!i915_active_is_idle(&ce->active)); 2309 GEM_BUG_ON(intel_context_is_pinned(ce)); 2310 2311 if (ce->state) 2312 __execlists_context_fini(ce); 2313 2314 intel_context_fini(ce); 2315 intel_context_free(ce); 2316 } 2317 2318 static void 2319 set_redzone(void *vaddr, const struct intel_engine_cs *engine) 2320 { 2321 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 2322 return; 2323 2324 vaddr += engine->context_size; 2325 2326 memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE); 2327 } 2328 2329 static void 2330 check_redzone(const void *vaddr, const struct intel_engine_cs *engine) 2331 { 2332 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 2333 return; 2334 2335 vaddr += engine->context_size; 2336 2337 if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE)) 2338 dev_err_once(engine->i915->drm.dev, 2339 "%s context redzone overwritten!\n", 2340 engine->name); 2341 } 2342 2343 static void execlists_context_unpin(struct intel_context *ce) 2344 { 2345 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE, 2346 ce->engine); 2347 2348 i915_gem_object_unpin_map(ce->state->obj); 2349 intel_ring_reset(ce->ring, ce->ring->tail); 2350 } 2351 2352 static void 2353 __execlists_update_reg_state(const struct intel_context *ce, 2354 const struct intel_engine_cs *engine) 2355 { 2356 struct intel_ring *ring = ce->ring; 2357 u32 *regs = ce->lrc_reg_state; 2358 2359 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head)); 2360 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); 2361 2362 regs[CTX_RING_BUFFER_START] = i915_ggtt_offset(ring->vma); 2363 regs[CTX_RING_HEAD] = ring->head; 2364 regs[CTX_RING_TAIL] = ring->tail; 2365 2366 /* RPCS */ 2367 if (engine->class == RENDER_CLASS) { 2368 regs[CTX_R_PWR_CLK_STATE] = 2369 intel_sseu_make_rpcs(engine->i915, &ce->sseu); 2370 2371 i915_oa_init_reg_state(ce, engine); 2372 } 2373 } 2374 2375 static int 2376 __execlists_context_pin(struct intel_context *ce, 2377 struct intel_engine_cs *engine) 2378 { 2379 void *vaddr; 2380 int ret; 2381 2382 GEM_BUG_ON(!ce->state); 2383 2384 ret = intel_context_active_acquire(ce); 2385 if (ret) 2386 goto err; 2387 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 2388 2389 vaddr = i915_gem_object_pin_map(ce->state->obj, 2390 
i915_coherent_map_type(engine->i915) | 2391 I915_MAP_OVERRIDE); 2392 if (IS_ERR(vaddr)) { 2393 ret = PTR_ERR(vaddr); 2394 goto unpin_active; 2395 } 2396 2397 ce->lrc_desc = lrc_descriptor(ce, engine); 2398 ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE; 2399 __execlists_update_reg_state(ce, engine); 2400 2401 return 0; 2402 2403 unpin_active: 2404 intel_context_active_release(ce); 2405 err: 2406 return ret; 2407 } 2408 2409 static int execlists_context_pin(struct intel_context *ce) 2410 { 2411 return __execlists_context_pin(ce, ce->engine); 2412 } 2413 2414 static int execlists_context_alloc(struct intel_context *ce) 2415 { 2416 return __execlists_context_alloc(ce, ce->engine); 2417 } 2418 2419 static void execlists_context_reset(struct intel_context *ce) 2420 { 2421 /* 2422 * Because we emit WA_TAIL_DWORDS there may be a disparity 2423 * between our bookkeeping in ce->ring->head and ce->ring->tail and 2424 * that stored in context. As we only write new commands from 2425 * ce->ring->tail onwards, everything before that is junk. If the GPU 2426 * starts reading from its RING_HEAD from the context, it may try to 2427 * execute that junk and die. 2428 * 2429 * The contexts that are stilled pinned on resume belong to the 2430 * kernel, and are local to each engine. All other contexts will 2431 * have their head/tail sanitized upon pinning before use, so they 2432 * will never see garbage, 2433 * 2434 * So to avoid that we reset the context images upon resume. For 2435 * simplicity, we just zero everything out. 2436 */ 2437 intel_ring_reset(ce->ring, 0); 2438 __execlists_update_reg_state(ce, ce->engine); 2439 } 2440 2441 static const struct intel_context_ops execlists_context_ops = { 2442 .alloc = execlists_context_alloc, 2443 2444 .pin = execlists_context_pin, 2445 .unpin = execlists_context_unpin, 2446 2447 .enter = intel_context_enter_engine, 2448 .exit = intel_context_exit_engine, 2449 2450 .reset = execlists_context_reset, 2451 .destroy = execlists_context_destroy, 2452 }; 2453 2454 static int gen8_emit_init_breadcrumb(struct i915_request *rq) 2455 { 2456 u32 *cs; 2457 2458 GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb); 2459 2460 cs = intel_ring_begin(rq, 6); 2461 if (IS_ERR(cs)) 2462 return PTR_ERR(cs); 2463 2464 /* 2465 * Check if we have been preempted before we even get started. 2466 * 2467 * After this point i915_request_started() reports true, even if 2468 * we get preempted and so are no longer running. 2469 */ 2470 *cs++ = MI_ARB_CHECK; 2471 *cs++ = MI_NOOP; 2472 2473 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 2474 *cs++ = i915_request_timeline(rq)->hwsp_offset; 2475 *cs++ = 0; 2476 *cs++ = rq->fence.seqno - 1; 2477 2478 intel_ring_advance(rq, cs); 2479 2480 /* Record the updated position of the request's payload */ 2481 rq->infix = intel_ring_offset(rq, cs); 2482 2483 return 0; 2484 } 2485 2486 static int execlists_request_alloc(struct i915_request *request) 2487 { 2488 int ret; 2489 2490 GEM_BUG_ON(!intel_context_is_pinned(request->hw_context)); 2491 2492 /* 2493 * Flush enough space to reduce the likelihood of waiting after 2494 * we start building the request - in which case we will just 2495 * have to repeat work. 2496 */ 2497 request->reserved_space += EXECLISTS_REQUEST_SIZE; 2498 2499 /* 2500 * Note that after this point, we have committed to using 2501 * this request as it is being used to both track the 2502 * state of engine initialisation and liveness of the 2503 * golden renderstate above. 
Think twice before you try 2504 * to cancel/unwind this request now. 2505 */ 2506 2507 /* Unconditionally invalidate GPU caches and TLBs. */ 2508 ret = request->engine->emit_flush(request, EMIT_INVALIDATE); 2509 if (ret) 2510 return ret; 2511 2512 request->reserved_space -= EXECLISTS_REQUEST_SIZE; 2513 return 0; 2514 } 2515 2516 /* 2517 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after 2518 * PIPE_CONTROL instruction. This is required for the flush to happen correctly 2519 * but there is a slight complication as this is applied in WA batch where the 2520 * values are only initialized once so we cannot take register value at the 2521 * beginning and reuse it further; hence we save its value to memory, upload a 2522 * constant value with bit21 set and then we restore it back with the saved value. 2523 * To simplify the WA, a constant value is formed by using the default value 2524 * of this register. This shouldn't be a problem because we are only modifying 2525 * it for a short period and this batch in non-premptible. We can ofcourse 2526 * use additional instructions that read the actual value of the register 2527 * at that time and set our bit of interest but it makes the WA complicated. 2528 * 2529 * This WA is also required for Gen9 so extracting as a function avoids 2530 * code duplication. 2531 */ 2532 static u32 * 2533 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch) 2534 { 2535 /* NB no one else is allowed to scribble over scratch + 256! */ 2536 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 2537 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 2538 *batch++ = intel_gt_scratch_offset(engine->gt, 2539 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 2540 *batch++ = 0; 2541 2542 *batch++ = MI_LOAD_REGISTER_IMM(1); 2543 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 2544 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES; 2545 2546 batch = gen8_emit_pipe_control(batch, 2547 PIPE_CONTROL_CS_STALL | 2548 PIPE_CONTROL_DC_FLUSH_ENABLE, 2549 0); 2550 2551 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 2552 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 2553 *batch++ = intel_gt_scratch_offset(engine->gt, 2554 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 2555 *batch++ = 0; 2556 2557 return batch; 2558 } 2559 2560 /* 2561 * Typically we only have one indirect_ctx and per_ctx batch buffer which are 2562 * initialized at the beginning and shared across all contexts but this field 2563 * helps us to have multiple batches at different offsets and select them based 2564 * on a criteria. At the moment this batch always start at the beginning of the page 2565 * and at this point we don't have multiple wa_ctx batch buffers. 2566 * 2567 * The number of WA applied are not known at the beginning; we use this field 2568 * to return the no of DWORDS written. 2569 * 2570 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END 2571 * so it adds NOOPs as padding to make it cacheline aligned. 2572 * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together 2573 * makes a complete batch buffer. 
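 */

/*
 * A minimal, illustrative sketch (not part of the driver) of the
 * cacheline-padding idiom that each per-gen emitter below open-codes;
 * the helper name here is an assumption for illustration only:
 */
static inline u32 *wa_bb_pad_to_cacheline(u32 *batch)
{
	/* Fill with NOOPs until the write pointer is cacheline aligned */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	return batch;
}

/*
 * The per-gen indirect-context and per-context batch emitters follow.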
2574 */ 2575 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 2576 { 2577 /* WaDisableCtxRestoreArbitration:bdw,chv */ 2578 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 2579 2580 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ 2581 if (IS_BROADWELL(engine->i915)) 2582 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 2583 2584 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ 2585 /* Actual scratch location is at 128 bytes offset */ 2586 batch = gen8_emit_pipe_control(batch, 2587 PIPE_CONTROL_FLUSH_L3 | 2588 PIPE_CONTROL_STORE_DATA_INDEX | 2589 PIPE_CONTROL_CS_STALL | 2590 PIPE_CONTROL_QW_WRITE, 2591 LRC_PPHWSP_SCRATCH_ADDR); 2592 2593 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 2594 2595 /* Pad to end of cacheline */ 2596 while ((unsigned long)batch % CACHELINE_BYTES) 2597 *batch++ = MI_NOOP; 2598 2599 /* 2600 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because 2601 * execution depends on the length specified in terms of cache lines 2602 * in the register CTX_RCS_INDIRECT_CTX 2603 */ 2604 2605 return batch; 2606 } 2607 2608 struct lri { 2609 i915_reg_t reg; 2610 u32 value; 2611 }; 2612 2613 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count) 2614 { 2615 GEM_BUG_ON(!count || count > 63); 2616 2617 *batch++ = MI_LOAD_REGISTER_IMM(count); 2618 do { 2619 *batch++ = i915_mmio_reg_offset(lri->reg); 2620 *batch++ = lri->value; 2621 } while (lri++, --count); 2622 *batch++ = MI_NOOP; 2623 2624 return batch; 2625 } 2626 2627 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 2628 { 2629 static const struct lri lri[] = { 2630 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */ 2631 { 2632 COMMON_SLICE_CHICKEN2, 2633 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE, 2634 0), 2635 }, 2636 2637 /* BSpec: 11391 */ 2638 { 2639 FF_SLICE_CHICKEN, 2640 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, 2641 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), 2642 }, 2643 2644 /* BSpec: 11299 */ 2645 { 2646 _3D_CHICKEN3, 2647 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, 2648 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), 2649 } 2650 }; 2651 2652 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 2653 2654 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ 2655 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 2656 2657 batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); 2658 2659 /* WaMediaPoolStateCmdInWABB:bxt,glk */ 2660 if (HAS_POOLED_EU(engine->i915)) { 2661 /* 2662 * EU pool configuration is setup along with golden context 2663 * during context initialization. This value depends on 2664 * device type (2x6 or 3x6) and needs to be updated based 2665 * on which subslice is disabled especially for 2x6 2666 * devices, however it is safe to load default 2667 * configuration of 3x6 device instead of masking off 2668 * corresponding bits because HW ignores bits of a disabled 2669 * subslice and drops down to appropriate config. Please 2670 * see render_state_setup() in i915_gem_render_state.c for 2671 * possible configurations, to avoid duplication they are 2672 * not shown here again. 
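 *
 * (The six dwords emitted below are the MEDIA_POOL_STATE command, the
 * enable dword, the 3x6 default pool configuration referred to above,
 * and three zeroed dwords.)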
2673 */ 2674 *batch++ = GEN9_MEDIA_POOL_STATE; 2675 *batch++ = GEN9_MEDIA_POOL_ENABLE; 2676 *batch++ = 0x00777000; 2677 *batch++ = 0; 2678 *batch++ = 0; 2679 *batch++ = 0; 2680 } 2681 2682 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 2683 2684 /* Pad to end of cacheline */ 2685 while ((unsigned long)batch % CACHELINE_BYTES) 2686 *batch++ = MI_NOOP; 2687 2688 return batch; 2689 } 2690 2691 static u32 * 2692 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 2693 { 2694 int i; 2695 2696 /* 2697 * WaPipeControlBefore3DStateSamplePattern: cnl 2698 * 2699 * Ensure the engine is idle prior to programming a 2700 * 3DSTATE_SAMPLE_PATTERN during a context restore. 2701 */ 2702 batch = gen8_emit_pipe_control(batch, 2703 PIPE_CONTROL_CS_STALL, 2704 0); 2705 /* 2706 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for 2707 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in 2708 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is 2709 * confusing. Since gen8_emit_pipe_control() already advances the 2710 * batch by 6 dwords, we advance the other 10 here, completing a 2711 * cacheline. It's not clear if the workaround requires this padding 2712 * before other commands, or if it's just the regular padding we would 2713 * already have for the workaround bb, so leave it here for now. 2714 */ 2715 for (i = 0; i < 10; i++) 2716 *batch++ = MI_NOOP; 2717 2718 /* Pad to end of cacheline */ 2719 while ((unsigned long)batch % CACHELINE_BYTES) 2720 *batch++ = MI_NOOP; 2721 2722 return batch; 2723 } 2724 2725 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE) 2726 2727 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) 2728 { 2729 struct drm_i915_gem_object *obj; 2730 struct i915_vma *vma; 2731 int err; 2732 2733 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE); 2734 if (IS_ERR(obj)) 2735 return PTR_ERR(obj); 2736 2737 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 2738 if (IS_ERR(vma)) { 2739 err = PTR_ERR(vma); 2740 goto err; 2741 } 2742 2743 err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH); 2744 if (err) 2745 goto err; 2746 2747 engine->wa_ctx.vma = vma; 2748 return 0; 2749 2750 err: 2751 i915_gem_object_put(obj); 2752 return err; 2753 } 2754 2755 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine) 2756 { 2757 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 2758 } 2759 2760 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); 2761 2762 static int intel_init_workaround_bb(struct intel_engine_cs *engine) 2763 { 2764 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 2765 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx, 2766 &wa_ctx->per_ctx }; 2767 wa_bb_func_t wa_bb_fn[2]; 2768 struct page *page; 2769 void *batch, *batch_ptr; 2770 unsigned int i; 2771 int ret; 2772 2773 if (engine->class != RENDER_CLASS) 2774 return 0; 2775 2776 switch (INTEL_GEN(engine->i915)) { 2777 case 12: 2778 case 11: 2779 return 0; 2780 case 10: 2781 wa_bb_fn[0] = gen10_init_indirectctx_bb; 2782 wa_bb_fn[1] = NULL; 2783 break; 2784 case 9: 2785 wa_bb_fn[0] = gen9_init_indirectctx_bb; 2786 wa_bb_fn[1] = NULL; 2787 break; 2788 case 8: 2789 wa_bb_fn[0] = gen8_init_indirectctx_bb; 2790 wa_bb_fn[1] = NULL; 2791 break; 2792 default: 2793 MISSING_CASE(INTEL_GEN(engine->i915)); 2794 return 0; 2795 } 2796 2797 ret = lrc_setup_wa_ctx(engine); 2798 if (ret) { 2799 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret); 2800 return ret; 2801 } 2802 2803 page = 
i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0); 2804 batch = batch_ptr = kmap_atomic(page); 2805 2806 /* 2807 * Emit the two workaround batch buffers, recording the offset from the 2808 * start of the workaround batch buffer object for each and their 2809 * respective sizes. 2810 */ 2811 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { 2812 wa_bb[i]->offset = batch_ptr - batch; 2813 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, 2814 CACHELINE_BYTES))) { 2815 ret = -EINVAL; 2816 break; 2817 } 2818 if (wa_bb_fn[i]) 2819 batch_ptr = wa_bb_fn[i](engine, batch_ptr); 2820 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); 2821 } 2822 2823 BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE); 2824 2825 kunmap_atomic(batch); 2826 if (ret) 2827 lrc_destroy_wa_ctx(engine); 2828 2829 return ret; 2830 } 2831 2832 static void enable_execlists(struct intel_engine_cs *engine) 2833 { 2834 u32 mode; 2835 2836 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL); 2837 2838 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */ 2839 2840 if (INTEL_GEN(engine->i915) >= 11) 2841 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE); 2842 else 2843 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE); 2844 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode); 2845 2846 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); 2847 2848 ENGINE_WRITE_FW(engine, 2849 RING_HWS_PGA, 2850 i915_ggtt_offset(engine->status_page.vma)); 2851 ENGINE_POSTING_READ(engine, RING_HWS_PGA); 2852 } 2853 2854 static bool unexpected_starting_state(struct intel_engine_cs *engine) 2855 { 2856 bool unexpected = false; 2857 2858 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) { 2859 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n"); 2860 unexpected = true; 2861 } 2862 2863 return unexpected; 2864 } 2865 2866 static int execlists_resume(struct intel_engine_cs *engine) 2867 { 2868 intel_engine_apply_workarounds(engine); 2869 intel_engine_apply_whitelist(engine); 2870 2871 intel_mocs_init_engine(engine); 2872 2873 intel_engine_reset_breadcrumbs(engine); 2874 2875 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) { 2876 struct drm_printer p = drm_debug_printer(__func__); 2877 2878 intel_engine_dump(engine, &p, NULL); 2879 } 2880 2881 enable_execlists(engine); 2882 2883 return 0; 2884 } 2885 2886 static void execlists_reset_prepare(struct intel_engine_cs *engine) 2887 { 2888 struct intel_engine_execlists * const execlists = &engine->execlists; 2889 unsigned long flags; 2890 2891 GEM_TRACE("%s: depth<-%d\n", engine->name, 2892 atomic_read(&execlists->tasklet.count)); 2893 2894 /* 2895 * Prevent request submission to the hardware until we have 2896 * completed the reset in i915_gem_reset_finish(). If a request 2897 * is completed by one engine, it may then queue a request 2898 * to a second via its execlists->tasklet *just* as we are 2899 * calling engine->resume() and also writing the ELSP. 2900 * Turning off the execlists->tasklet until the reset is over 2901 * prevents the race. 2902 */ 2903 __tasklet_disable_sync_once(&execlists->tasklet); 2904 GEM_BUG_ON(!reset_in_progress(execlists)); 2905 2906 /* And flush any current direct submission. */ 2907 spin_lock_irqsave(&engine->active.lock, flags); 2908 spin_unlock_irqrestore(&engine->active.lock, flags); 2909 2910 /* 2911 * We stop engines, otherwise we might get failed reset and a 2912 * dead gpu (on elk). 
Also as modern gpu as kbl can suffer 2913 * from system hang if batchbuffer is progressing when 2914 * the reset is issued, regardless of READY_TO_RESET ack. 2915 * Thus assume it is best to stop engines on all gens 2916 * where we have a gpu reset. 2917 * 2918 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) 2919 * 2920 * FIXME: Wa for more modern gens needs to be validated 2921 */ 2922 intel_engine_stop_cs(engine); 2923 } 2924 2925 static void reset_csb_pointers(struct intel_engine_cs *engine) 2926 { 2927 struct intel_engine_execlists * const execlists = &engine->execlists; 2928 const unsigned int reset_value = execlists->csb_size - 1; 2929 2930 ring_set_paused(engine, 0); 2931 2932 /* 2933 * After a reset, the HW starts writing into CSB entry [0]. We 2934 * therefore have to set our HEAD pointer back one entry so that 2935 * the *first* entry we check is entry 0. To complicate this further, 2936 * as we don't wait for the first interrupt after reset, we have to 2937 * fake the HW write to point back to the last entry so that our 2938 * inline comparison of our cached head position against the last HW 2939 * write works even before the first interrupt. 2940 */ 2941 execlists->csb_head = reset_value; 2942 WRITE_ONCE(*execlists->csb_write, reset_value); 2943 wmb(); /* Make sure this is visible to HW (paranoia?) */ 2944 2945 invalidate_csb_entries(&execlists->csb_status[0], 2946 &execlists->csb_status[reset_value]); 2947 } 2948 2949 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine) 2950 { 2951 if (INTEL_GEN(engine->i915) >= 12) 2952 return 0x60; 2953 else if (INTEL_GEN(engine->i915) >= 9) 2954 return 0x54; 2955 else if (engine->class == RENDER_CLASS) 2956 return 0x58; 2957 else 2958 return -1; 2959 } 2960 2961 static void __execlists_reset_reg_state(const struct intel_context *ce, 2962 const struct intel_engine_cs *engine) 2963 { 2964 u32 *regs = ce->lrc_reg_state; 2965 int x; 2966 2967 x = lrc_ring_mi_mode(engine); 2968 if (x != -1) { 2969 regs[x + 1] &= ~STOP_RING; 2970 regs[x + 1] |= STOP_RING << 16; 2971 } 2972 } 2973 2974 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) 2975 { 2976 struct intel_engine_execlists * const execlists = &engine->execlists; 2977 struct intel_context *ce; 2978 struct i915_request *rq; 2979 2980 mb(); /* paranoia: read the CSB pointers from after the reset */ 2981 clflush(execlists->csb_write); 2982 mb(); 2983 2984 process_csb(engine); /* drain preemption events */ 2985 2986 /* Following the reset, we need to reload the CSB read/write pointers */ 2987 reset_csb_pointers(engine); 2988 2989 /* 2990 * Save the currently executing context, even if we completed 2991 * its request, it was still running at the time of the 2992 * reset and will have been clobbered. 2993 */ 2994 rq = execlists_active(execlists); 2995 if (!rq) 2996 goto unwind; 2997 2998 /* We still have requests in-flight; the engine should be active */ 2999 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 3000 3001 ce = rq->hw_context; 3002 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 3003 3004 if (i915_request_completed(rq)) { 3005 /* Idle context; tidy up the ring so we can restart afresh */ 3006 ce->ring->head = intel_ring_wrap(ce->ring, rq->tail); 3007 goto out_replay; 3008 } 3009 3010 /* Context has requests still in-flight; it should not be idle! 
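 *
 * active_request() walks back along the context's timeline to the
 * oldest request that has not completed, and we rewind the ring head
 * to the start of that request so the whole incomplete sequence is
 * replayed after the reset.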
*/ 3011 GEM_BUG_ON(i915_active_is_idle(&ce->active)); 3012 rq = active_request(ce->timeline, rq); 3013 ce->ring->head = intel_ring_wrap(ce->ring, rq->head); 3014 GEM_BUG_ON(ce->ring->head == ce->ring->tail); 3015 3016 /* 3017 * If this request hasn't started yet, e.g. it is waiting on a 3018 * semaphore, we need to avoid skipping the request or else we 3019 * break the signaling chain. However, if the context is corrupt 3020 * the request will not restart and we will be stuck with a wedged 3021 * device. It is quite often the case that if we issue a reset 3022 * while the GPU is loading the context image, that the context 3023 * image becomes corrupt. 3024 * 3025 * Otherwise, if we have not started yet, the request should replay 3026 * perfectly and we do not need to flag the result as being erroneous. 3027 */ 3028 if (!i915_request_started(rq)) 3029 goto out_replay; 3030 3031 /* 3032 * If the request was innocent, we leave the request in the ELSP 3033 * and will try to replay it on restarting. The context image may 3034 * have been corrupted by the reset, in which case we may have 3035 * to service a new GPU hang, but more likely we can continue on 3036 * without impact. 3037 * 3038 * If the request was guilty, we presume the context is corrupt 3039 * and have to at least restore the RING register in the context 3040 * image back to the expected values to skip over the guilty request. 3041 */ 3042 __i915_request_reset(rq, stalled); 3043 if (!stalled) 3044 goto out_replay; 3045 3046 /* 3047 * We want a simple context + ring to execute the breadcrumb update. 3048 * We cannot rely on the context being intact across the GPU hang, 3049 * so clear it and rebuild just what we need for the breadcrumb. 3050 * All pending requests for this context will be zapped, and any 3051 * future request will be after userspace has had the opportunity 3052 * to recreate its own state. 3053 */ 3054 GEM_BUG_ON(!intel_context_is_pinned(ce)); 3055 restore_default_state(ce, engine); 3056 3057 out_replay: 3058 GEM_TRACE("%s replay {head:%04x, tail:%04x}\n", 3059 engine->name, ce->ring->head, ce->ring->tail); 3060 intel_ring_update_space(ce->ring); 3061 __execlists_reset_reg_state(ce, engine); 3062 __execlists_update_reg_state(ce, engine); 3063 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */ 3064 3065 unwind: 3066 /* Push back any incomplete requests for replay after the reset. */ 3067 cancel_port_requests(execlists); 3068 __unwind_incomplete_requests(engine); 3069 } 3070 3071 static void execlists_reset(struct intel_engine_cs *engine, bool stalled) 3072 { 3073 unsigned long flags; 3074 3075 GEM_TRACE("%s\n", engine->name); 3076 3077 spin_lock_irqsave(&engine->active.lock, flags); 3078 3079 __execlists_reset(engine, stalled); 3080 3081 spin_unlock_irqrestore(&engine->active.lock, flags); 3082 } 3083 3084 static void nop_submission_tasklet(unsigned long data) 3085 { 3086 /* The driver is wedged; don't process any more events. */ 3087 } 3088 3089 static void execlists_cancel_requests(struct intel_engine_cs *engine) 3090 { 3091 struct intel_engine_execlists * const execlists = &engine->execlists; 3092 struct i915_request *rq, *rn; 3093 struct rb_node *rb; 3094 unsigned long flags; 3095 3096 GEM_TRACE("%s\n", engine->name); 3097 3098 /* 3099 * Before we call engine->cancel_requests(), we should have exclusive 3100 * access to the submission state. 
This is arranged for us by the 3101 * caller disabling the interrupt generation, the tasklet and other 3102 * threads that may then access the same state, giving us a free hand 3103 * to reset state. However, we still need to let lockdep be aware that 3104 * we know this state may be accessed in hardirq context, so we 3105 * disable the irq around this manipulation and we want to keep 3106 * the spinlock focused on its duties and not accidentally conflate 3107 * coverage to the submission's irq state. (Similarly, although we 3108 * shouldn't need to disable irq around the manipulation of the 3109 * submission's irq state, we also wish to remind ourselves that 3110 * it is irq state.) 3111 */ 3112 spin_lock_irqsave(&engine->active.lock, flags); 3113 3114 __execlists_reset(engine, true); 3115 3116 /* Mark all executing requests as skipped. */ 3117 list_for_each_entry(rq, &engine->active.requests, sched.link) 3118 mark_eio(rq); 3119 3120 /* Flush the queued requests to the timeline list (for retiring). */ 3121 while ((rb = rb_first_cached(&execlists->queue))) { 3122 struct i915_priolist *p = to_priolist(rb); 3123 int i; 3124 3125 priolist_for_each_request_consume(rq, rn, p, i) { 3126 mark_eio(rq); 3127 __i915_request_submit(rq); 3128 } 3129 3130 rb_erase_cached(&p->node, &execlists->queue); 3131 i915_priolist_free(p); 3132 } 3133 3134 /* Cancel all attached virtual engines */ 3135 while ((rb = rb_first_cached(&execlists->virtual))) { 3136 struct virtual_engine *ve = 3137 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 3138 3139 rb_erase_cached(rb, &execlists->virtual); 3140 RB_CLEAR_NODE(rb); 3141 3142 spin_lock(&ve->base.active.lock); 3143 rq = fetch_and_zero(&ve->request); 3144 if (rq) { 3145 mark_eio(rq); 3146 3147 rq->engine = engine; 3148 __i915_request_submit(rq); 3149 i915_request_put(rq); 3150 3151 ve->base.execlists.queue_priority_hint = INT_MIN; 3152 } 3153 spin_unlock(&ve->base.active.lock); 3154 } 3155 3156 /* Remaining _unready_ requests will be nop'ed when submitted */ 3157 3158 execlists->queue_priority_hint = INT_MIN; 3159 execlists->queue = RB_ROOT_CACHED; 3160 3161 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet)); 3162 execlists->tasklet.func = nop_submission_tasklet; 3163 3164 spin_unlock_irqrestore(&engine->active.lock, flags); 3165 } 3166 3167 static void execlists_reset_finish(struct intel_engine_cs *engine) 3168 { 3169 struct intel_engine_execlists * const execlists = &engine->execlists; 3170 3171 /* 3172 * After a GPU reset, we may have requests to replay. Do so now while 3173 * we still have the forcewake to be sure that the GPU is not allowed 3174 * to sleep before we restart and reload a context. 3175 */ 3176 GEM_BUG_ON(!reset_in_progress(execlists)); 3177 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root)) 3178 execlists->tasklet.func(execlists->tasklet.data); 3179 3180 if (__tasklet_enable(&execlists->tasklet)) 3181 /* And kick in case we missed a new request submission. 
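 * __tasklet_enable() only returns true once its disable count drops
 * back to zero, so only the outermost reset_finish issues this kick.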
*/ 3182 tasklet_hi_schedule(&execlists->tasklet); 3183 GEM_TRACE("%s: depth->%d\n", engine->name, 3184 atomic_read(&execlists->tasklet.count)); 3185 } 3186 3187 static int gen8_emit_bb_start(struct i915_request *rq, 3188 u64 offset, u32 len, 3189 const unsigned int flags) 3190 { 3191 u32 *cs; 3192 3193 cs = intel_ring_begin(rq, 4); 3194 if (IS_ERR(cs)) 3195 return PTR_ERR(cs); 3196 3197 /* 3198 * WaDisableCtxRestoreArbitration:bdw,chv 3199 * 3200 * We don't need to perform MI_ARB_ENABLE as often as we do (in 3201 * particular all the gen that do not need the w/a at all!), if we 3202 * took care to make sure that on every switch into this context 3203 * (both ordinary and for preemption) that arbitrartion was enabled 3204 * we would be fine. However, for gen8 there is another w/a that 3205 * requires us to not preempt inside GPGPU execution, so we keep 3206 * arbitration disabled for gen8 batches. Arbitration will be 3207 * re-enabled before we close the request 3208 * (engine->emit_fini_breadcrumb). 3209 */ 3210 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3211 3212 /* FIXME(BDW+): Address space and security selectors. */ 3213 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 3214 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 3215 *cs++ = lower_32_bits(offset); 3216 *cs++ = upper_32_bits(offset); 3217 3218 intel_ring_advance(rq, cs); 3219 3220 return 0; 3221 } 3222 3223 static int gen9_emit_bb_start(struct i915_request *rq, 3224 u64 offset, u32 len, 3225 const unsigned int flags) 3226 { 3227 u32 *cs; 3228 3229 cs = intel_ring_begin(rq, 6); 3230 if (IS_ERR(cs)) 3231 return PTR_ERR(cs); 3232 3233 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3234 3235 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 3236 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 3237 *cs++ = lower_32_bits(offset); 3238 *cs++ = upper_32_bits(offset); 3239 3240 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3241 *cs++ = MI_NOOP; 3242 3243 intel_ring_advance(rq, cs); 3244 3245 return 0; 3246 } 3247 3248 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine) 3249 { 3250 ENGINE_WRITE(engine, RING_IMR, 3251 ~(engine->irq_enable_mask | engine->irq_keep_mask)); 3252 ENGINE_POSTING_READ(engine, RING_IMR); 3253 } 3254 3255 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine) 3256 { 3257 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); 3258 } 3259 3260 static int gen8_emit_flush(struct i915_request *request, u32 mode) 3261 { 3262 u32 cmd, *cs; 3263 3264 cs = intel_ring_begin(request, 4); 3265 if (IS_ERR(cs)) 3266 return PTR_ERR(cs); 3267 3268 cmd = MI_FLUSH_DW + 1; 3269 3270 /* We always require a command barrier so that subsequent 3271 * commands, such as breadcrumb interrupts, are strictly ordered 3272 * wrt the contents of the write cache being flushed to memory 3273 * (and thus being coherent from the CPU). 
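 *
 * The four dwords emitted below are the MI_FLUSH_DW command itself
 * (with a post-sync dword write), the destination offset (a scratch
 * slot indexed within the per-process HWSP), the upper address bits
 * and the value to write.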
3274 */ 3275 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 3276 3277 if (mode & EMIT_INVALIDATE) { 3278 cmd |= MI_INVALIDATE_TLB; 3279 if (request->engine->class == VIDEO_DECODE_CLASS) 3280 cmd |= MI_INVALIDATE_BSD; 3281 } 3282 3283 *cs++ = cmd; 3284 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 3285 *cs++ = 0; /* upper addr */ 3286 *cs++ = 0; /* value */ 3287 intel_ring_advance(request, cs); 3288 3289 return 0; 3290 } 3291 3292 static int gen8_emit_flush_render(struct i915_request *request, 3293 u32 mode) 3294 { 3295 bool vf_flush_wa = false, dc_flush_wa = false; 3296 u32 *cs, flags = 0; 3297 int len; 3298 3299 flags |= PIPE_CONTROL_CS_STALL; 3300 3301 if (mode & EMIT_FLUSH) { 3302 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 3303 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 3304 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 3305 flags |= PIPE_CONTROL_FLUSH_ENABLE; 3306 } 3307 3308 if (mode & EMIT_INVALIDATE) { 3309 flags |= PIPE_CONTROL_TLB_INVALIDATE; 3310 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 3311 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 3312 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 3313 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 3314 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 3315 flags |= PIPE_CONTROL_QW_WRITE; 3316 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3317 3318 /* 3319 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL 3320 * pipe control. 3321 */ 3322 if (IS_GEN(request->i915, 9)) 3323 vf_flush_wa = true; 3324 3325 /* WaForGAMHang:kbl */ 3326 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0)) 3327 dc_flush_wa = true; 3328 } 3329 3330 len = 6; 3331 3332 if (vf_flush_wa) 3333 len += 6; 3334 3335 if (dc_flush_wa) 3336 len += 12; 3337 3338 cs = intel_ring_begin(request, len); 3339 if (IS_ERR(cs)) 3340 return PTR_ERR(cs); 3341 3342 if (vf_flush_wa) 3343 cs = gen8_emit_pipe_control(cs, 0, 0); 3344 3345 if (dc_flush_wa) 3346 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE, 3347 0); 3348 3349 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3350 3351 if (dc_flush_wa) 3352 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0); 3353 3354 intel_ring_advance(request, cs); 3355 3356 return 0; 3357 } 3358 3359 static int gen11_emit_flush_render(struct i915_request *request, 3360 u32 mode) 3361 { 3362 if (mode & EMIT_FLUSH) { 3363 u32 *cs; 3364 u32 flags = 0; 3365 3366 flags |= PIPE_CONTROL_CS_STALL; 3367 3368 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 3369 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 3370 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 3371 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 3372 flags |= PIPE_CONTROL_FLUSH_ENABLE; 3373 flags |= PIPE_CONTROL_QW_WRITE; 3374 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3375 3376 cs = intel_ring_begin(request, 6); 3377 if (IS_ERR(cs)) 3378 return PTR_ERR(cs); 3379 3380 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3381 intel_ring_advance(request, cs); 3382 } 3383 3384 if (mode & EMIT_INVALIDATE) { 3385 u32 *cs; 3386 u32 flags = 0; 3387 3388 flags |= PIPE_CONTROL_CS_STALL; 3389 3390 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 3391 flags |= PIPE_CONTROL_TLB_INVALIDATE; 3392 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 3393 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 3394 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 3395 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 3396 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 3397 flags |= PIPE_CONTROL_QW_WRITE; 3398 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3399 3400 cs = intel_ring_begin(request, 6); 3401 if (IS_ERR(cs)) 
3402 return PTR_ERR(cs); 3403 3404 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3405 intel_ring_advance(request, cs); 3406 } 3407 3408 return 0; 3409 } 3410 3411 static u32 preparser_disable(bool state) 3412 { 3413 return MI_ARB_CHECK | 1 << 8 | state; 3414 } 3415 3416 static int gen12_emit_flush_render(struct i915_request *request, 3417 u32 mode) 3418 { 3419 if (mode & EMIT_FLUSH) { 3420 u32 flags = 0; 3421 u32 *cs; 3422 3423 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 3424 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 3425 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 3426 /* Wa_1409600907:tgl */ 3427 flags |= PIPE_CONTROL_DEPTH_STALL; 3428 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 3429 flags |= PIPE_CONTROL_FLUSH_ENABLE; 3430 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH; 3431 3432 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3433 flags |= PIPE_CONTROL_QW_WRITE; 3434 3435 flags |= PIPE_CONTROL_CS_STALL; 3436 3437 cs = intel_ring_begin(request, 6); 3438 if (IS_ERR(cs)) 3439 return PTR_ERR(cs); 3440 3441 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3442 intel_ring_advance(request, cs); 3443 } 3444 3445 if (mode & EMIT_INVALIDATE) { 3446 u32 flags = 0; 3447 u32 *cs; 3448 3449 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 3450 flags |= PIPE_CONTROL_TLB_INVALIDATE; 3451 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 3452 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 3453 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 3454 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 3455 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 3456 flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE; 3457 3458 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3459 flags |= PIPE_CONTROL_QW_WRITE; 3460 3461 flags |= PIPE_CONTROL_CS_STALL; 3462 3463 cs = intel_ring_begin(request, 8); 3464 if (IS_ERR(cs)) 3465 return PTR_ERR(cs); 3466 3467 /* 3468 * Prevent the pre-parser from skipping past the TLB 3469 * invalidate and loading a stale page for the batch 3470 * buffer / request payload. 3471 */ 3472 *cs++ = preparser_disable(true); 3473 3474 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3475 3476 *cs++ = preparser_disable(false); 3477 intel_ring_advance(request, cs); 3478 3479 /* 3480 * Wa_1604544889:tgl 3481 */ 3482 if (IS_TGL_REVID(request->i915, TGL_REVID_A0, TGL_REVID_A0)) { 3483 flags = 0; 3484 flags |= PIPE_CONTROL_CS_STALL; 3485 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH; 3486 3487 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3488 flags |= PIPE_CONTROL_QW_WRITE; 3489 3490 cs = intel_ring_begin(request, 6); 3491 if (IS_ERR(cs)) 3492 return PTR_ERR(cs); 3493 3494 cs = gen8_emit_pipe_control(cs, flags, 3495 LRC_PPHWSP_SCRATCH_ADDR); 3496 intel_ring_advance(request, cs); 3497 } 3498 } 3499 3500 return 0; 3501 } 3502 3503 /* 3504 * Reserve space for 2 NOOPs at the end of each request to be 3505 * used as a workaround for not being allowed to do lite 3506 * restore with HEAD==TAIL (WaIdleLiteRestore). 3507 */ 3508 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) 3509 { 3510 /* Ensure there's always at least one preemption point per-request. 
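 *
 * The wa_tail recorded below is what the dequeue path reuses
 * (last->tail = last->wa_tail) when resubmitting an already-emitted
 * request, so a lite restore never sees HEAD == TAIL.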
*/ 3511 *cs++ = MI_ARB_CHECK; 3512 *cs++ = MI_NOOP; 3513 request->wa_tail = intel_ring_offset(request, cs); 3514 3515 return cs; 3516 } 3517 3518 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs) 3519 { 3520 *cs++ = MI_SEMAPHORE_WAIT | 3521 MI_SEMAPHORE_GLOBAL_GTT | 3522 MI_SEMAPHORE_POLL | 3523 MI_SEMAPHORE_SAD_EQ_SDD; 3524 *cs++ = 0; 3525 *cs++ = intel_hws_preempt_address(request->engine); 3526 *cs++ = 0; 3527 3528 return cs; 3529 } 3530 3531 static __always_inline u32* 3532 gen8_emit_fini_breadcrumb_footer(struct i915_request *request, 3533 u32 *cs) 3534 { 3535 *cs++ = MI_USER_INTERRUPT; 3536 3537 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3538 if (intel_engine_has_semaphores(request->engine)) 3539 cs = emit_preempt_busywait(request, cs); 3540 3541 request->tail = intel_ring_offset(request, cs); 3542 assert_ring_tail_valid(request->ring, request->tail); 3543 3544 return gen8_emit_wa_tail(request, cs); 3545 } 3546 3547 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs) 3548 { 3549 cs = gen8_emit_ggtt_write(cs, 3550 request->fence.seqno, 3551 i915_request_active_timeline(request)->hwsp_offset, 3552 0); 3553 3554 return gen8_emit_fini_breadcrumb_footer(request, cs); 3555 } 3556 3557 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 3558 { 3559 cs = gen8_emit_pipe_control(cs, 3560 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 3561 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 3562 PIPE_CONTROL_DC_FLUSH_ENABLE, 3563 0); 3564 3565 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ 3566 cs = gen8_emit_ggtt_write_rcs(cs, 3567 request->fence.seqno, 3568 i915_request_active_timeline(request)->hwsp_offset, 3569 PIPE_CONTROL_FLUSH_ENABLE | 3570 PIPE_CONTROL_CS_STALL); 3571 3572 return gen8_emit_fini_breadcrumb_footer(request, cs); 3573 } 3574 3575 static u32 * 3576 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 3577 { 3578 cs = gen8_emit_ggtt_write_rcs(cs, 3579 request->fence.seqno, 3580 i915_request_active_timeline(request)->hwsp_offset, 3581 PIPE_CONTROL_CS_STALL | 3582 PIPE_CONTROL_TILE_CACHE_FLUSH | 3583 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 3584 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 3585 PIPE_CONTROL_DC_FLUSH_ENABLE | 3586 PIPE_CONTROL_FLUSH_ENABLE); 3587 3588 return gen8_emit_fini_breadcrumb_footer(request, cs); 3589 } 3590 3591 /* 3592 * Note that the CS instruction pre-parser will not stall on the breadcrumb 3593 * flush and will continue pre-fetching the instructions after it before the 3594 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at 3595 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble 3596 * of the next request before the memory has been flushed, we're guaranteed that 3597 * we won't access the batch itself too early. 3598 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands, 3599 * so, if the current request is modifying an instruction in the next request on 3600 * the same intel_context, we might pre-fetch and then execute the pre-update 3601 * instruction. To avoid this, the users of self-modifying code should either 3602 * disable the parser around the code emitting the memory writes, via a new flag 3603 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For 3604 * the in-kernel use-cases we've opted to use a separate context, see 3605 * reloc_gpu() as an example. 3606 * All the above applies only to the instructions themselves. 
Non-inline data 3607 * used by the instructions is not pre-fetched. 3608 */ 3609 3610 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs) 3611 { 3612 *cs++ = MI_SEMAPHORE_WAIT_TOKEN | 3613 MI_SEMAPHORE_GLOBAL_GTT | 3614 MI_SEMAPHORE_POLL | 3615 MI_SEMAPHORE_SAD_EQ_SDD; 3616 *cs++ = 0; 3617 *cs++ = intel_hws_preempt_address(request->engine); 3618 *cs++ = 0; 3619 *cs++ = 0; 3620 *cs++ = MI_NOOP; 3621 3622 return cs; 3623 } 3624 3625 static __always_inline u32* 3626 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs) 3627 { 3628 *cs++ = MI_USER_INTERRUPT; 3629 3630 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3631 if (intel_engine_has_semaphores(request->engine)) 3632 cs = gen12_emit_preempt_busywait(request, cs); 3633 3634 request->tail = intel_ring_offset(request, cs); 3635 assert_ring_tail_valid(request->ring, request->tail); 3636 3637 return gen8_emit_wa_tail(request, cs); 3638 } 3639 3640 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs) 3641 { 3642 cs = gen8_emit_ggtt_write(cs, 3643 request->fence.seqno, 3644 i915_request_active_timeline(request)->hwsp_offset, 3645 0); 3646 3647 return gen12_emit_fini_breadcrumb_footer(request, cs); 3648 } 3649 3650 static u32 * 3651 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 3652 { 3653 cs = gen8_emit_ggtt_write_rcs(cs, 3654 request->fence.seqno, 3655 i915_request_active_timeline(request)->hwsp_offset, 3656 PIPE_CONTROL_CS_STALL | 3657 PIPE_CONTROL_TILE_CACHE_FLUSH | 3658 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 3659 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 3660 /* Wa_1409600907:tgl */ 3661 PIPE_CONTROL_DEPTH_STALL | 3662 PIPE_CONTROL_DC_FLUSH_ENABLE | 3663 PIPE_CONTROL_FLUSH_ENABLE | 3664 PIPE_CONTROL_HDC_PIPELINE_FLUSH); 3665 3666 return gen12_emit_fini_breadcrumb_footer(request, cs); 3667 } 3668 3669 static void execlists_park(struct intel_engine_cs *engine) 3670 { 3671 cancel_timer(&engine->execlists.timer); 3672 cancel_timer(&engine->execlists.preempt); 3673 } 3674 3675 void intel_execlists_set_default_submission(struct intel_engine_cs *engine) 3676 { 3677 engine->submit_request = execlists_submit_request; 3678 engine->cancel_requests = execlists_cancel_requests; 3679 engine->schedule = i915_schedule; 3680 engine->execlists.tasklet.func = execlists_submission_tasklet; 3681 3682 engine->reset.prepare = execlists_reset_prepare; 3683 engine->reset.reset = execlists_reset; 3684 engine->reset.finish = execlists_reset_finish; 3685 3686 engine->park = execlists_park; 3687 engine->unpark = NULL; 3688 3689 engine->flags |= I915_ENGINE_SUPPORTS_STATS; 3690 if (!intel_vgpu_active(engine->i915)) { 3691 engine->flags |= I915_ENGINE_HAS_SEMAPHORES; 3692 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) 3693 engine->flags |= I915_ENGINE_HAS_PREEMPTION; 3694 } 3695 3696 if (INTEL_GEN(engine->i915) >= 12) 3697 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO; 3698 } 3699 3700 static void execlists_destroy(struct intel_engine_cs *engine) 3701 { 3702 intel_engine_cleanup_common(engine); 3703 lrc_destroy_wa_ctx(engine); 3704 kfree(engine); 3705 } 3706 3707 static void 3708 logical_ring_default_vfuncs(struct intel_engine_cs *engine) 3709 { 3710 /* Default vfuncs which can be overriden by each engine. 
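 * Gen12 replaces emit_fini_breadcrumb below, and render engines
 * additionally override emit_flush and emit_fini_breadcrumb via
 * rcs_submission_override().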
*/ 3711 3712 engine->destroy = execlists_destroy; 3713 engine->resume = execlists_resume; 3714 3715 engine->reset.prepare = execlists_reset_prepare; 3716 engine->reset.reset = execlists_reset; 3717 engine->reset.finish = execlists_reset_finish; 3718 3719 engine->cops = &execlists_context_ops; 3720 engine->request_alloc = execlists_request_alloc; 3721 3722 engine->emit_flush = gen8_emit_flush; 3723 engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb; 3724 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb; 3725 if (INTEL_GEN(engine->i915) >= 12) 3726 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb; 3727 3728 engine->set_default_submission = intel_execlists_set_default_submission; 3729 3730 if (INTEL_GEN(engine->i915) < 11) { 3731 engine->irq_enable = gen8_logical_ring_enable_irq; 3732 engine->irq_disable = gen8_logical_ring_disable_irq; 3733 } else { 3734 /* 3735 * TODO: On Gen11 interrupt masks need to be clear 3736 * to allow C6 entry. Keep interrupts enabled at 3737 * and take the hit of generating extra interrupts 3738 * until a more refined solution exists. 3739 */ 3740 } 3741 if (IS_GEN(engine->i915, 8)) 3742 engine->emit_bb_start = gen8_emit_bb_start; 3743 else 3744 engine->emit_bb_start = gen9_emit_bb_start; 3745 } 3746 3747 static inline void 3748 logical_ring_default_irqs(struct intel_engine_cs *engine) 3749 { 3750 unsigned int shift = 0; 3751 3752 if (INTEL_GEN(engine->i915) < 11) { 3753 const u8 irq_shifts[] = { 3754 [RCS0] = GEN8_RCS_IRQ_SHIFT, 3755 [BCS0] = GEN8_BCS_IRQ_SHIFT, 3756 [VCS0] = GEN8_VCS0_IRQ_SHIFT, 3757 [VCS1] = GEN8_VCS1_IRQ_SHIFT, 3758 [VECS0] = GEN8_VECS_IRQ_SHIFT, 3759 }; 3760 3761 shift = irq_shifts[engine->id]; 3762 } 3763 3764 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift; 3765 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift; 3766 } 3767 3768 static void rcs_submission_override(struct intel_engine_cs *engine) 3769 { 3770 switch (INTEL_GEN(engine->i915)) { 3771 case 12: 3772 engine->emit_flush = gen12_emit_flush_render; 3773 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs; 3774 break; 3775 case 11: 3776 engine->emit_flush = gen11_emit_flush_render; 3777 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs; 3778 break; 3779 default: 3780 engine->emit_flush = gen8_emit_flush_render; 3781 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs; 3782 break; 3783 } 3784 } 3785 3786 int intel_execlists_submission_setup(struct intel_engine_cs *engine) 3787 { 3788 tasklet_init(&engine->execlists.tasklet, 3789 execlists_submission_tasklet, (unsigned long)engine); 3790 timer_setup(&engine->execlists.timer, execlists_timeslice, 0); 3791 timer_setup(&engine->execlists.preempt, execlists_preempt, 0); 3792 3793 logical_ring_default_vfuncs(engine); 3794 logical_ring_default_irqs(engine); 3795 3796 if (engine->class == RENDER_CLASS) 3797 rcs_submission_override(engine); 3798 3799 return 0; 3800 } 3801 3802 int intel_execlists_submission_init(struct intel_engine_cs *engine) 3803 { 3804 struct intel_engine_execlists * const execlists = &engine->execlists; 3805 struct drm_i915_private *i915 = engine->i915; 3806 struct intel_uncore *uncore = engine->uncore; 3807 u32 base = engine->mmio_base; 3808 int ret; 3809 3810 ret = intel_engine_init_common(engine); 3811 if (ret) 3812 return ret; 3813 3814 if (intel_init_workaround_bb(engine)) 3815 /* 3816 * We continue even if we fail to initialize WA batch 3817 * because we only expect rare glitches but nothing 3818 * critical to prevent us from using GPU 3819 */ 
3820 DRM_ERROR("WA batch buffer initialization failed\n"); 3821 3822 if (HAS_LOGICAL_RING_ELSQ(i915)) { 3823 execlists->submit_reg = uncore->regs + 3824 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base)); 3825 execlists->ctrl_reg = uncore->regs + 3826 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base)); 3827 } else { 3828 execlists->submit_reg = uncore->regs + 3829 i915_mmio_reg_offset(RING_ELSP(base)); 3830 } 3831 3832 execlists->csb_status = 3833 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; 3834 3835 execlists->csb_write = 3836 &engine->status_page.addr[intel_hws_csb_write_index(i915)]; 3837 3838 if (INTEL_GEN(i915) < 11) 3839 execlists->csb_size = GEN8_CSB_ENTRIES; 3840 else 3841 execlists->csb_size = GEN11_CSB_ENTRIES; 3842 3843 reset_csb_pointers(engine); 3844 3845 return 0; 3846 } 3847 3848 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine) 3849 { 3850 u32 indirect_ctx_offset; 3851 3852 switch (INTEL_GEN(engine->i915)) { 3853 default: 3854 MISSING_CASE(INTEL_GEN(engine->i915)); 3855 /* fall through */ 3856 case 12: 3857 indirect_ctx_offset = 3858 GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 3859 break; 3860 case 11: 3861 indirect_ctx_offset = 3862 GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 3863 break; 3864 case 10: 3865 indirect_ctx_offset = 3866 GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 3867 break; 3868 case 9: 3869 indirect_ctx_offset = 3870 GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 3871 break; 3872 case 8: 3873 indirect_ctx_offset = 3874 GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 3875 break; 3876 } 3877 3878 return indirect_ctx_offset; 3879 } 3880 3881 3882 static void init_common_reg_state(u32 * const regs, 3883 const struct intel_engine_cs *engine, 3884 const struct intel_ring *ring) 3885 { 3886 regs[CTX_CONTEXT_CONTROL] = 3887 _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) | 3888 _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH); 3889 if (INTEL_GEN(engine->i915) < 11) 3890 regs[CTX_CONTEXT_CONTROL] |= 3891 _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT | 3892 CTX_CTRL_RS_CTX_ENABLE); 3893 3894 regs[CTX_RING_BUFFER_CONTROL] = RING_CTL_SIZE(ring->size) | RING_VALID; 3895 regs[CTX_BB_STATE] = RING_BB_PPGTT; 3896 } 3897 3898 static void init_wa_bb_reg_state(u32 * const regs, 3899 const struct intel_engine_cs *engine, 3900 u32 pos_bb_per_ctx) 3901 { 3902 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx; 3903 3904 if (wa_ctx->per_ctx.size) { 3905 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 3906 3907 regs[pos_bb_per_ctx] = 3908 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; 3909 } 3910 3911 if (wa_ctx->indirect_ctx.size) { 3912 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 3913 3914 regs[pos_bb_per_ctx + 2] = 3915 (ggtt_offset + wa_ctx->indirect_ctx.offset) | 3916 (wa_ctx->indirect_ctx.size / CACHELINE_BYTES); 3917 3918 regs[pos_bb_per_ctx + 4] = 3919 intel_lr_indirect_ctx_offset(engine) << 6; 3920 } 3921 } 3922 3923 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt) 3924 { 3925 if (i915_vm_is_4lvl(&ppgtt->vm)) { 3926 /* 64b PPGTT (48bit canonical) 3927 * PDP0_DESCRIPTOR contains the base address to PML4 and 3928 * other PDP Descriptors are ignored. 
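* With a 3-level PPGTT there is no PML4, so each of the four PDP descriptors is programmed individually instead, which is what the else branch below does.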
3929 */ 3930 ASSIGN_CTX_PML4(ppgtt, regs); 3931 } else { 3932 ASSIGN_CTX_PDP(ppgtt, regs, 3); 3933 ASSIGN_CTX_PDP(ppgtt, regs, 2); 3934 ASSIGN_CTX_PDP(ppgtt, regs, 1); 3935 ASSIGN_CTX_PDP(ppgtt, regs, 0); 3936 } 3937 } 3938 3939 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm) 3940 { 3941 if (i915_is_ggtt(vm)) 3942 return i915_vm_to_ggtt(vm)->alias; 3943 else 3944 return i915_vm_to_ppgtt(vm); 3945 } 3946 3947 static void execlists_init_reg_state(u32 *regs, 3948 const struct intel_context *ce, 3949 const struct intel_engine_cs *engine, 3950 const struct intel_ring *ring, 3951 bool close) 3952 { 3953 /* 3954 * A context is actually a big batch buffer with several 3955 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The 3956 * values we are setting here are only for the first context restore: 3957 * on a subsequent save, the GPU will recreate this batchbuffer with new 3958 * values (including all the missing MI_LOAD_REGISTER_IMM commands that 3959 * we are not initializing here). 3960 * 3961 * Must keep consistent with virtual_update_register_offsets(). 3962 */ 3963 u32 *bbe = set_offsets(regs, reg_offsets(engine), engine); 3964 3965 if (close) { /* Close the batch; used mainly by live_lrc_layout() */ 3966 *bbe = MI_BATCH_BUFFER_END; 3967 if (INTEL_GEN(engine->i915) >= 10) 3968 *bbe |= BIT(0); 3969 } 3970 3971 init_common_reg_state(regs, engine, ring); 3972 init_ppgtt_reg_state(regs, vm_alias(ce->vm)); 3973 3974 init_wa_bb_reg_state(regs, engine, 3975 INTEL_GEN(engine->i915) >= 12 ? 3976 GEN12_CTX_BB_PER_CTX_PTR : 3977 CTX_BB_PER_CTX_PTR); 3978 } 3979 3980 static int 3981 populate_lr_context(struct intel_context *ce, 3982 struct drm_i915_gem_object *ctx_obj, 3983 struct intel_engine_cs *engine, 3984 struct intel_ring *ring) 3985 { 3986 bool inhibit = true; 3987 void *vaddr; 3988 u32 *regs; 3989 int ret; 3990 3991 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); 3992 if (IS_ERR(vaddr)) { 3993 ret = PTR_ERR(vaddr); 3994 DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret); 3995 return ret; 3996 } 3997 3998 set_redzone(vaddr, engine); 3999 4000 if (engine->default_state) { 4001 void *defaults; 4002 4003 defaults = i915_gem_object_pin_map(engine->default_state, 4004 I915_MAP_WB); 4005 if (IS_ERR(defaults)) { 4006 ret = PTR_ERR(defaults); 4007 goto err_unpin_ctx; 4008 } 4009 4010 memcpy(vaddr, defaults, engine->context_size); 4011 i915_gem_object_unpin_map(engine->default_state); 4012 inhibit = false; 4013 } 4014 4015 /* The second page of the context object contains some fields which must 4016 * be set up prior to the first execution. 
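* That page lives at LRC_STATE_PN within the object; if no default state was copied above, CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT is also set below so that the first restore does not load an uninitialised image.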
*/ 4017 regs = vaddr + LRC_STATE_PN * PAGE_SIZE; 4018 execlists_init_reg_state(regs, ce, engine, ring, inhibit); 4019 if (inhibit) 4020 regs[CTX_CONTEXT_CONTROL] |= 4021 _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); 4022 4023 ret = 0; 4024 err_unpin_ctx: 4025 __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size); 4026 i915_gem_object_unpin_map(ctx_obj); 4027 return ret; 4028 } 4029 4030 static int __execlists_context_alloc(struct intel_context *ce, 4031 struct intel_engine_cs *engine) 4032 { 4033 struct drm_i915_gem_object *ctx_obj; 4034 struct intel_ring *ring; 4035 struct i915_vma *vma; 4036 u32 context_size; 4037 int ret; 4038 4039 GEM_BUG_ON(ce->state); 4040 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE); 4041 4042 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 4043 context_size += I915_GTT_PAGE_SIZE; /* for redzone */ 4044 4045 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size); 4046 if (IS_ERR(ctx_obj)) 4047 return PTR_ERR(ctx_obj); 4048 4049 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL); 4050 if (IS_ERR(vma)) { 4051 ret = PTR_ERR(vma); 4052 goto error_deref_obj; 4053 } 4054 4055 if (!ce->timeline) { 4056 struct intel_timeline *tl; 4057 4058 tl = intel_timeline_create(engine->gt, NULL); 4059 if (IS_ERR(tl)) { 4060 ret = PTR_ERR(tl); 4061 goto error_deref_obj; 4062 } 4063 4064 ce->timeline = tl; 4065 } 4066 4067 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring); 4068 if (IS_ERR(ring)) { 4069 ret = PTR_ERR(ring); 4070 goto error_deref_obj; 4071 } 4072 4073 ret = populate_lr_context(ce, ctx_obj, engine, ring); 4074 if (ret) { 4075 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret); 4076 goto error_ring_free; 4077 } 4078 4079 ce->ring = ring; 4080 ce->state = vma; 4081 4082 return 0; 4083 4084 error_ring_free: 4085 intel_ring_put(ring); 4086 error_deref_obj: 4087 i915_gem_object_put(ctx_obj); 4088 return ret; 4089 } 4090 4091 static struct list_head *virtual_queue(struct virtual_engine *ve) 4092 { 4093 return &ve->base.execlists.default_priolist.requests[0]; 4094 } 4095 4096 static void virtual_context_destroy(struct kref *kref) 4097 { 4098 struct virtual_engine *ve = 4099 container_of(kref, typeof(*ve), context.ref); 4100 unsigned int n; 4101 4102 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 4103 GEM_BUG_ON(ve->request); 4104 GEM_BUG_ON(ve->context.inflight); 4105 4106 for (n = 0; n < ve->num_siblings; n++) { 4107 struct intel_engine_cs *sibling = ve->siblings[n]; 4108 struct rb_node *node = &ve->nodes[sibling->id].rb; 4109 4110 if (RB_EMPTY_NODE(node)) 4111 continue; 4112 4113 spin_lock_irq(&sibling->active.lock); 4114 4115 /* Detachment is lazily performed in the execlists tasklet */ 4116 if (!RB_EMPTY_NODE(node)) 4117 rb_erase_cached(node, &sibling->execlists.virtual); 4118 4119 spin_unlock_irq(&sibling->active.lock); 4120 } 4121 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet)); 4122 4123 if (ve->context.state) 4124 __execlists_context_fini(&ve->context); 4125 intel_context_fini(&ve->context); 4126 4127 kfree(ve->bonds); 4128 kfree(ve); 4129 } 4130 4131 static void virtual_engine_initial_hint(struct virtual_engine *ve) 4132 { 4133 int swp; 4134 4135 /* 4136 * Pick a random sibling on starting to help spread the load around. 4137 * 4138 * New contexts are typically created with exactly the same order 4139 * of siblings, and often started in batches. Due to the way we iterate 4140 * the array of sibling when submitting requests, sibling[0] is 4141 * prioritised for dequeuing. 
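(virtual_submission_tasklet() walks ve->siblings[] in order, so whichever engine sits in slot 0 is always the first one considered.)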
If we make sure that sibling[0] is fairly 4142 * randomised across the system, we also help spread the load by the 4143 * first engine we inspect being different each time. 4144 * 4145 * NB This does not force us to execute on this engine, it will just 4146 * typically be the first we inspect for submission. 4147 */ 4148 swp = prandom_u32_max(ve->num_siblings); 4149 if (!swp) 4150 return; 4151 4152 swap(ve->siblings[swp], ve->siblings[0]); 4153 if (!intel_engine_has_relative_mmio(ve->siblings[0])) 4154 virtual_update_register_offsets(ve->context.lrc_reg_state, 4155 ve->siblings[0]); 4156 } 4157 4158 static int virtual_context_pin(struct intel_context *ce) 4159 { 4160 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 4161 int err; 4162 4163 /* Note: we must use a real engine class for setting up reg state */ 4164 err = __execlists_context_pin(ce, ve->siblings[0]); 4165 if (err) 4166 return err; 4167 4168 virtual_engine_initial_hint(ve); 4169 return 0; 4170 } 4171 4172 static void virtual_context_enter(struct intel_context *ce) 4173 { 4174 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 4175 unsigned int n; 4176 4177 for (n = 0; n < ve->num_siblings; n++) 4178 intel_engine_pm_get(ve->siblings[n]); 4179 4180 intel_timeline_enter(ce->timeline); 4181 } 4182 4183 static void virtual_context_exit(struct intel_context *ce) 4184 { 4185 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 4186 unsigned int n; 4187 4188 intel_timeline_exit(ce->timeline); 4189 4190 for (n = 0; n < ve->num_siblings; n++) 4191 intel_engine_pm_put(ve->siblings[n]); 4192 } 4193 4194 static const struct intel_context_ops virtual_context_ops = { 4195 .pin = virtual_context_pin, 4196 .unpin = execlists_context_unpin, 4197 4198 .enter = virtual_context_enter, 4199 .exit = virtual_context_exit, 4200 4201 .destroy = virtual_context_destroy, 4202 }; 4203 4204 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve) 4205 { 4206 struct i915_request *rq; 4207 intel_engine_mask_t mask; 4208 4209 rq = READ_ONCE(ve->request); 4210 if (!rq) 4211 return 0; 4212 4213 /* The rq is ready for submission; rq->execution_mask is now stable. 
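An empty mask means the selection has become impossible (for example, bonding constraints that exclude every sibling); such a request is skipped with -ENODEV below and pointed at siblings[0] merely so that it can still be submitted and retired.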
*/ 4214 mask = rq->execution_mask; 4215 if (unlikely(!mask)) { 4216 /* Invalid selection, submit to a random engine in error */ 4217 i915_request_skip(rq, -ENODEV); 4218 mask = ve->siblings[0]->mask; 4219 } 4220 4221 GEM_TRACE("%s: rq=%llx:%lld, mask=%x, prio=%d\n", 4222 ve->base.name, 4223 rq->fence.context, rq->fence.seqno, 4224 mask, ve->base.execlists.queue_priority_hint); 4225 4226 return mask; 4227 } 4228 4229 static void virtual_submission_tasklet(unsigned long data) 4230 { 4231 struct virtual_engine * const ve = (struct virtual_engine *)data; 4232 const int prio = ve->base.execlists.queue_priority_hint; 4233 intel_engine_mask_t mask; 4234 unsigned int n; 4235 4236 rcu_read_lock(); 4237 mask = virtual_submission_mask(ve); 4238 rcu_read_unlock(); 4239 if (unlikely(!mask)) 4240 return; 4241 4242 local_irq_disable(); 4243 for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) { 4244 struct intel_engine_cs *sibling = ve->siblings[n]; 4245 struct ve_node * const node = &ve->nodes[sibling->id]; 4246 struct rb_node **parent, *rb; 4247 bool first; 4248 4249 if (unlikely(!(mask & sibling->mask))) { 4250 if (!RB_EMPTY_NODE(&node->rb)) { 4251 spin_lock(&sibling->active.lock); 4252 rb_erase_cached(&node->rb, 4253 &sibling->execlists.virtual); 4254 RB_CLEAR_NODE(&node->rb); 4255 spin_unlock(&sibling->active.lock); 4256 } 4257 continue; 4258 } 4259 4260 spin_lock(&sibling->active.lock); 4261 4262 if (!RB_EMPTY_NODE(&node->rb)) { 4263 /* 4264 * Cheat and avoid rebalancing the tree if we can 4265 * reuse this node in situ. 4266 */ 4267 first = rb_first_cached(&sibling->execlists.virtual) == 4268 &node->rb; 4269 if (prio == node->prio || (prio > node->prio && first)) 4270 goto submit_engine; 4271 4272 rb_erase_cached(&node->rb, &sibling->execlists.virtual); 4273 } 4274 4275 rb = NULL; 4276 first = true; 4277 parent = &sibling->execlists.virtual.rb_root.rb_node; 4278 while (*parent) { 4279 struct ve_node *other; 4280 4281 rb = *parent; 4282 other = rb_entry(rb, typeof(*other), rb); 4283 if (prio > other->prio) { 4284 parent = &rb->rb_left; 4285 } else { 4286 parent = &rb->rb_right; 4287 first = false; 4288 } 4289 } 4290 4291 rb_link_node(&node->rb, rb, parent); 4292 rb_insert_color_cached(&node->rb, 4293 &sibling->execlists.virtual, 4294 first); 4295 4296 submit_engine: 4297 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb)); 4298 node->prio = prio; 4299 if (first && prio > sibling->execlists.queue_priority_hint) { 4300 sibling->execlists.queue_priority_hint = prio; 4301 tasklet_hi_schedule(&sibling->execlists.tasklet); 4302 } 4303 4304 spin_unlock(&sibling->active.lock); 4305 } 4306 local_irq_enable(); 4307 } 4308 4309 static void virtual_submit_request(struct i915_request *rq) 4310 { 4311 struct virtual_engine *ve = to_virtual_engine(rq->engine); 4312 struct i915_request *old; 4313 unsigned long flags; 4314 4315 GEM_TRACE("%s: rq=%llx:%lld\n", 4316 ve->base.name, 4317 rq->fence.context, 4318 rq->fence.seqno); 4319 4320 GEM_BUG_ON(ve->base.submit_request != virtual_submit_request); 4321 4322 spin_lock_irqsave(&ve->base.active.lock, flags); 4323 4324 old = ve->request; 4325 if (old) { /* background completion event from preempt-to-busy */ 4326 GEM_BUG_ON(!i915_request_completed(old)); 4327 __i915_request_submit(old); 4328 i915_request_put(old); 4329 } 4330 4331 if (i915_request_completed(rq)) { 4332 __i915_request_submit(rq); 4333 4334 ve->base.execlists.queue_priority_hint = INT_MIN; 4335 ve->request = NULL; 4336 } else { 4337 ve->base.execlists.queue_priority_hint = rq_prio(rq); 4338 ve->request = 
i915_request_get(rq); 4339 4340 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 4341 list_move_tail(&rq->sched.link, virtual_queue(ve)); 4342 4343 tasklet_schedule(&ve->base.execlists.tasklet); 4344 } 4345 4346 spin_unlock_irqrestore(&ve->base.active.lock, flags); 4347 } 4348 4349 static struct ve_bond * 4350 virtual_find_bond(struct virtual_engine *ve, 4351 const struct intel_engine_cs *master) 4352 { 4353 int i; 4354 4355 for (i = 0; i < ve->num_bonds; i++) { 4356 if (ve->bonds[i].master == master) 4357 return &ve->bonds[i]; 4358 } 4359 4360 return NULL; 4361 } 4362 4363 static void 4364 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal) 4365 { 4366 struct virtual_engine *ve = to_virtual_engine(rq->engine); 4367 intel_engine_mask_t allowed, exec; 4368 struct ve_bond *bond; 4369 4370 allowed = ~to_request(signal)->engine->mask; 4371 4372 bond = virtual_find_bond(ve, to_request(signal)->engine); 4373 if (bond) 4374 allowed &= bond->sibling_mask; 4375 4376 /* Restrict the bonded request to run on only the available engines */ 4377 exec = READ_ONCE(rq->execution_mask); 4378 while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed)) 4379 ; 4380 4381 /* Prevent the master from being re-run on the bonded engines */ 4382 to_request(signal)->execution_mask &= ~allowed; 4383 } 4384 4385 struct intel_context * 4386 intel_execlists_create_virtual(struct i915_gem_context *ctx, 4387 struct intel_engine_cs **siblings, 4388 unsigned int count) 4389 { 4390 struct virtual_engine *ve; 4391 unsigned int n; 4392 int err; 4393 4394 if (count == 0) 4395 return ERR_PTR(-EINVAL); 4396 4397 if (count == 1) 4398 return intel_context_create(ctx, siblings[0]); 4399 4400 ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL); 4401 if (!ve) 4402 return ERR_PTR(-ENOMEM); 4403 4404 ve->base.i915 = ctx->i915; 4405 ve->base.gt = siblings[0]->gt; 4406 ve->base.uncore = siblings[0]->uncore; 4407 ve->base.id = -1; 4408 ve->base.class = OTHER_CLASS; 4409 ve->base.uabi_class = I915_ENGINE_CLASS_INVALID; 4410 ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL; 4411 4412 /* 4413 * The decision on whether to submit a request using semaphores 4414 * depends on the saturated state of the engine. We only compute 4415 * this during HW submission of the request, and we need for this 4416 * state to be globally applied to all requests being submitted 4417 * to this engine. Virtual engines encompass more than one physical 4418 * engine and so we cannot accurately tell in advance if one of those 4419 * engines is already saturated and so cannot afford to use a semaphore 4420 * and be pessimized in priority for doing so -- if we are the only 4421 * context using semaphores after all other clients have stopped, we 4422 * will be starved on the saturated system. Such a global switch for 4423 * semaphores is less than ideal, but alas is the current compromise. 
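* Starting with saturated = ALL_ENGINES therefore errs on the side of never using the semaphore optimisation for requests submitted to the virtual engine.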
4424 */ 4425 ve->base.saturated = ALL_ENGINES; 4426 4427 snprintf(ve->base.name, sizeof(ve->base.name), "virtual"); 4428 4429 intel_engine_init_active(&ve->base, ENGINE_VIRTUAL); 4430 intel_engine_init_breadcrumbs(&ve->base); 4431 4432 intel_engine_init_execlists(&ve->base); 4433 4434 ve->base.cops = &virtual_context_ops; 4435 ve->base.request_alloc = execlists_request_alloc; 4436 4437 ve->base.schedule = i915_schedule; 4438 ve->base.submit_request = virtual_submit_request; 4439 ve->base.bond_execute = virtual_bond_execute; 4440 4441 INIT_LIST_HEAD(virtual_queue(ve)); 4442 ve->base.execlists.queue_priority_hint = INT_MIN; 4443 tasklet_init(&ve->base.execlists.tasklet, 4444 virtual_submission_tasklet, 4445 (unsigned long)ve); 4446 4447 intel_context_init(&ve->context, ctx, &ve->base); 4448 4449 for (n = 0; n < count; n++) { 4450 struct intel_engine_cs *sibling = siblings[n]; 4451 4452 GEM_BUG_ON(!is_power_of_2(sibling->mask)); 4453 if (sibling->mask & ve->base.mask) { 4454 DRM_DEBUG("duplicate %s entry in load balancer\n", 4455 sibling->name); 4456 err = -EINVAL; 4457 goto err_put; 4458 } 4459 4460 /* 4461 * The virtual engine implementation is tightly coupled to 4462 * the execlists backend -- we push requests directly 4463 * into a tree inside each physical engine. We could support 4464 * layering if we handle cloning of the requests and 4465 * submitting a copy into each backend. 4466 */ 4467 if (sibling->execlists.tasklet.func != 4468 execlists_submission_tasklet) { 4469 err = -ENODEV; 4470 goto err_put; 4471 } 4472 4473 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb)); 4474 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb); 4475 4476 ve->siblings[ve->num_siblings++] = sibling; 4477 ve->base.mask |= sibling->mask; 4478 4479 /* 4480 * All physical engines must be compatible for their emission 4481 * functions (as we build the instructions during request 4482 * construction and do not alter them before submission 4483 * on the physical engine). We use the engine class as a guide 4484 * here, although that could be refined.
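* In practice this limits a virtual engine to siblings of a single class (e.g. a set of VCS instances), which the class check below enforces.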
4485 */ 4486 if (ve->base.class != OTHER_CLASS) { 4487 if (ve->base.class != sibling->class) { 4488 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n", 4489 sibling->class, ve->base.class); 4490 err = -EINVAL; 4491 goto err_put; 4492 } 4493 continue; 4494 } 4495 4496 ve->base.class = sibling->class; 4497 ve->base.uabi_class = sibling->uabi_class; 4498 snprintf(ve->base.name, sizeof(ve->base.name), 4499 "v%dx%d", ve->base.class, count); 4500 ve->base.context_size = sibling->context_size; 4501 4502 ve->base.emit_bb_start = sibling->emit_bb_start; 4503 ve->base.emit_flush = sibling->emit_flush; 4504 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb; 4505 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb; 4506 ve->base.emit_fini_breadcrumb_dw = 4507 sibling->emit_fini_breadcrumb_dw; 4508 4509 ve->base.flags = sibling->flags; 4510 } 4511 4512 ve->base.flags |= I915_ENGINE_IS_VIRTUAL; 4513 4514 err = __execlists_context_alloc(&ve->context, siblings[0]); 4515 if (err) 4516 goto err_put; 4517 4518 __set_bit(CONTEXT_ALLOC_BIT, &ve->context.flags); 4519 4520 return &ve->context; 4521 4522 err_put: 4523 intel_context_put(&ve->context); 4524 return ERR_PTR(err); 4525 } 4526 4527 struct intel_context * 4528 intel_execlists_clone_virtual(struct i915_gem_context *ctx, 4529 struct intel_engine_cs *src) 4530 { 4531 struct virtual_engine *se = to_virtual_engine(src); 4532 struct intel_context *dst; 4533 4534 dst = intel_execlists_create_virtual(ctx, 4535 se->siblings, 4536 se->num_siblings); 4537 if (IS_ERR(dst)) 4538 return dst; 4539 4540 if (se->num_bonds) { 4541 struct virtual_engine *de = to_virtual_engine(dst->engine); 4542 4543 de->bonds = kmemdup(se->bonds, 4544 sizeof(*se->bonds) * se->num_bonds, 4545 GFP_KERNEL); 4546 if (!de->bonds) { 4547 intel_context_put(dst); 4548 return ERR_PTR(-ENOMEM); 4549 } 4550 4551 de->num_bonds = se->num_bonds; 4552 } 4553 4554 return dst; 4555 } 4556 4557 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine, 4558 const struct intel_engine_cs *master, 4559 const struct intel_engine_cs *sibling) 4560 { 4561 struct virtual_engine *ve = to_virtual_engine(engine); 4562 struct ve_bond *bond; 4563 int n; 4564 4565 /* Sanity check the sibling is part of the virtual engine */ 4566 for (n = 0; n < ve->num_siblings; n++) 4567 if (sibling == ve->siblings[n]) 4568 break; 4569 if (n == ve->num_siblings) 4570 return -EINVAL; 4571 4572 bond = virtual_find_bond(ve, master); 4573 if (bond) { 4574 bond->sibling_mask |= sibling->mask; 4575 return 0; 4576 } 4577 4578 bond = krealloc(ve->bonds, 4579 sizeof(*bond) * (ve->num_bonds + 1), 4580 GFP_KERNEL); 4581 if (!bond) 4582 return -ENOMEM; 4583 4584 bond[ve->num_bonds].master = master; 4585 bond[ve->num_bonds].sibling_mask = sibling->mask; 4586 4587 ve->bonds = bond; 4588 ve->num_bonds++; 4589 4590 return 0; 4591 } 4592 4593 struct intel_engine_cs * 4594 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine, 4595 unsigned int sibling) 4596 { 4597 struct virtual_engine *ve = to_virtual_engine(engine); 4598 4599 if (sibling >= ve->num_siblings) 4600 return NULL; 4601 4602 return ve->siblings[sibling]; 4603 } 4604 4605 void intel_execlists_show_requests(struct intel_engine_cs *engine, 4606 struct drm_printer *m, 4607 void (*show_request)(struct drm_printer *m, 4608 struct i915_request *rq, 4609 const char *prefix), 4610 unsigned int max) 4611 { 4612 const struct intel_engine_execlists *execlists = &engine->execlists; 4613 struct i915_request *rq, *last; 4614 unsigned long 
flags; 4615 unsigned int count; 4616 struct rb_node *rb; 4617 4618 spin_lock_irqsave(&engine->active.lock, flags); 4619 4620 last = NULL; 4621 count = 0; 4622 list_for_each_entry(rq, &engine->active.requests, sched.link) { 4623 if (count++ < max - 1) 4624 show_request(m, rq, "\t\tE "); 4625 else 4626 last = rq; 4627 } 4628 if (last) { 4629 if (count > max) { 4630 drm_printf(m, 4631 "\t\t...skipping %d executing requests...\n", 4632 count - max); 4633 } 4634 show_request(m, last, "\t\tE "); 4635 } 4636 4637 last = NULL; 4638 count = 0; 4639 if (execlists->queue_priority_hint != INT_MIN) 4640 drm_printf(m, "\t\tQueue priority hint: %d\n", 4641 execlists->queue_priority_hint); 4642 for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) { 4643 struct i915_priolist *p = rb_entry(rb, typeof(*p), node); 4644 int i; 4645 4646 priolist_for_each_request(rq, p, i) { 4647 if (count++ < max - 1) 4648 show_request(m, rq, "\t\tQ "); 4649 else 4650 last = rq; 4651 } 4652 } 4653 if (last) { 4654 if (count > max) { 4655 drm_printf(m, 4656 "\t\t...skipping %d queued requests...\n", 4657 count - max); 4658 } 4659 show_request(m, last, "\t\tQ "); 4660 } 4661 4662 last = NULL; 4663 count = 0; 4664 for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) { 4665 struct virtual_engine *ve = 4666 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 4667 struct i915_request *rq = READ_ONCE(ve->request); 4668 4669 if (rq) { 4670 if (count++ < max - 1) 4671 show_request(m, rq, "\t\tV "); 4672 else 4673 last = rq; 4674 } 4675 } 4676 if (last) { 4677 if (count > max) { 4678 drm_printf(m, 4679 "\t\t...skipping %d virtual requests...\n", 4680 count - max); 4681 } 4682 show_request(m, last, "\t\tV "); 4683 } 4684 4685 spin_unlock_irqrestore(&engine->active.lock, flags); 4686 } 4687 4688 void intel_lr_context_reset(struct intel_engine_cs *engine, 4689 struct intel_context *ce, 4690 u32 head, 4691 bool scrub) 4692 { 4693 GEM_BUG_ON(!intel_context_is_pinned(ce)); 4694 4695 /* 4696 * We want a simple context + ring to execute the breadcrumb update. 4697 * We cannot rely on the context being intact across the GPU hang, 4698 * so clear it and rebuild just what we need for the breadcrumb. 4699 * All pending requests for this context will be zapped, and any 4700 * future request will be after userspace has had the opportunity 4701 * to recreate its own state. 4702 */ 4703 if (scrub) 4704 restore_default_state(ce, engine); 4705 4706 /* Rerun the request; its payload has been neutered (if guilty). */ 4707 ce->ring->head = head; 4708 intel_ring_update_space(ce->ring); 4709 4710 __execlists_update_reg_state(ce, engine); 4711 } 4712 4713 bool 4714 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine) 4715 { 4716 return engine->set_default_submission == 4717 intel_execlists_set_default_submission; 4718 } 4719 4720 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 4721 #include "selftest_lrc.c" 4722 #endif 4723