/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things into the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself,
 * instead, the GPU will do it for you on the context switch.
 *
 * But, what about the ringbuffer control registers (head, tail, etc..)?
 * shouldn't we just need a set of those per engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated. The local default context for each opened fd is
 * more complex, because we don't know at creation time which engine is going
 * to use them. To handle this, we have implemented a deferred creation of LR
 * contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one. This
 * request will then be resubmitted along with a new request for a different context,
 * which will cause the hardware to continue executing the second request and queue
 * the new request (the GPU detects the condition of a context getting preempted
 * with the same context and optimizes the context switch flow by not doing
 * preemption, but just sampling the new tail pointer).
 *
 */
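/*
 * Illustrative note (not from bspec): combining same-context requests for an
 * execution list, assuming a queue ordered head -> tail as rq_A1, rq_A2, rq_B1
 * (A and B denote contexts). A context may only appear once per list, so A1 is
 * dropped in favour of A2 (whose tail already covers A1's commands in the same
 * ringbuffer), and the pair {A2, B1} is written to the two ELSP ports. With
 * only rq_A1 and rq_A2 pending, the second port would simply be left empty.
 */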
#include <linux/interrupt.h>

#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_trace.h"
#include "i915_vgpu.h"
#include "intel_context.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_gt_requests.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_reset.h"
#include "intel_ring.h"
#include "intel_workarounds.h"

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)

#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
#define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
#define GEN12_IDLE_CTX_ID		0x7FF
#define GEN12_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */

struct virtual_engine {
	struct intel_engine_cs base;
	struct intel_context context;

	/*
	 * We allow only a single request through the virtual engine at a time
	 * (each request in the timeline waits for the completion fence of
	 * the previous before being submitted). By restricting ourselves to
	 * only submitting a single request, each request is placed on to a
	 * physical engine to maximise load spreading (by virtue of the late
	 * greedy scheduling -- each real engine takes the next available
	 * request upon idling).
	 */
	struct i915_request *request;

	/*
	 * We keep a rbtree of available virtual engines inside each physical
	 * engine, sorted by priority. Here we preallocate the nodes we need
	 * for the virtual engine, indexed by physical_engine->id.
	 */
	struct ve_node {
		struct rb_node rb;
		int prio;
	} nodes[I915_NUM_ENGINES];

	/*
	 * Keep track of bonded pairs -- restrictions upon our selection of
	 * physical engines any particular request may be submitted to.
	 * If we receive a submit-fence from a master engine, we will only
	 * use one of sibling_mask physical engines.
	 */
	struct ve_bond {
		const struct intel_engine_cs *master;
		intel_engine_mask_t sibling_mask;
	} *bonds;
	unsigned int num_bonds;

	/* And finally, which physical engines this virtual engine maps onto. */
	unsigned int num_siblings;
	struct intel_engine_cs *siblings[0];
};

static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
{
	GEM_BUG_ON(!intel_engine_is_virtual(engine));
	return container_of(engine, struct virtual_engine, base);
}

static int __execlists_context_alloc(struct intel_context *ce,
				     struct intel_engine_cs *engine);

static void execlists_init_reg_state(u32 *reg_state,
				     const struct intel_context *ce,
				     const struct intel_engine_cs *engine,
				     const struct intel_ring *ring,
				     bool close);
static void
__execlists_update_reg_state(const struct intel_context *ce,
			     const struct intel_engine_cs *engine,
			     u32 head);

static void mark_eio(struct i915_request *rq)
{
	if (i915_request_completed(rq))
		return;

	GEM_BUG_ON(i915_request_signaled(rq));

	i915_request_set_error_once(rq, -EIO);
	i915_request_mark_complete(rq);
}

static struct i915_request *
active_request(const struct intel_timeline * const tl, struct i915_request *rq)
{
	struct i915_request *active = rq;

	rcu_read_lock();
	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
		if (i915_request_completed(rq))
			break;

		active = rq;
	}
	rcu_read_unlock();

	return active;
}

static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
{
	return (i915_ggtt_offset(engine->status_page.vma) +
		I915_GEM_HWS_PREEMPT_ADDR);
}

static inline void
ring_set_paused(const struct intel_engine_cs *engine, int state)
{
	/*
	 * We inspect HWS_PREEMPT with a semaphore inside
	 * engine->emit_fini_breadcrumb. If the dword is true,
	 * the ring is paused as the semaphore will busywait
	 * until the dword is false.
	 */
	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
	if (state)
		wmb();
}

static inline struct i915_priolist *to_priolist(struct rb_node *rb)
{
	return rb_entry(rb, struct i915_priolist, node);
}

static inline int rq_prio(const struct i915_request *rq)
{
	return READ_ONCE(rq->sched.attr.priority);
}

static int effective_prio(const struct i915_request *rq)
{
	int prio = rq_prio(rq);

	/*
	 * If this request is special and must not be interrupted at any
	 * cost, so be it. Note we are only checking the most recent request
	 * in the context and so may be masking an earlier vip request. It
	 * is hoped that under the conditions where nopreempt is used, this
	 * will not matter (i.e. all requests to that context will be
	 * nopreempt for as long as desired).
	 */
	if (i915_request_has_nopreempt(rq))
		prio = I915_PRIORITY_UNPREEMPTABLE;

	/*
	 * On unwinding the active request, we give it a priority bump
	 * if it has completed waiting on any semaphore. If we know that
	 * the request has already started, we can prevent an unwanted
	 * preempt-to-idle cycle by taking that into account now.
	 */
	if (__i915_request_has_started(rq))
		prio |= I915_PRIORITY_NOSEMAPHORE;

	/* Restrict mere WAIT boosts from triggering preemption */
	BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
	return prio | __NO_PREEMPTION;
}

static int queue_prio(const struct intel_engine_execlists *execlists)
{
	struct i915_priolist *p;
	struct rb_node *rb;

	rb = rb_first_cached(&execlists->queue);
	if (!rb)
		return INT_MIN;

	/*
	 * As the priolist[] are inverted, with the highest priority in [0],
	 * we have to flip the index value to become priority.
	 */
	p = to_priolist(rb);
	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
}
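/*
 * Worked example for the flip above (illustrative, not from bspec): the
 * priolist sub-levels are stored with the highest sub-priority in slot 0,
 * so the smallest set bit of p->used denotes the best pending request.
 * With base priority P and smallest used slot i, ffs(p->used) == i + 1 and
 * the expression yields (P << I915_USER_PRIORITY_SHIFT) plus the inverted
 * sub-level, i.e. slot 0 maps to the largest effective value and later
 * slots to progressively smaller ones.
 */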
static inline bool need_preempt(const struct intel_engine_cs *engine,
				const struct i915_request *rq,
				struct rb_node *rb)
{
	int last_prio;

	if (!intel_engine_has_semaphores(engine))
		return false;

	/*
	 * Check if the current priority hint merits a preemption attempt.
	 *
	 * We record the highest value priority we saw during rescheduling
	 * prior to this dequeue, therefore we know that if it is strictly
	 * less than the current tail of ELSP[0], we do not need to force
	 * a preempt-to-idle cycle.
	 *
	 * However, the priority hint is a mere hint that we may need to
	 * preempt. If that hint is stale or we may be trying to preempt
	 * ourselves, ignore the request.
	 *
	 * More naturally we would write
	 *	prio >= max(0, last);
	 * except that we wish to prevent triggering preemption at the same
	 * priority level: the task that is running should remain running
	 * to preserve FIFO ordering of dependencies.
	 */
	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
	if (engine->execlists.queue_priority_hint <= last_prio)
		return false;

	/*
	 * Check against the first request in ELSP[1], it will, thanks to the
	 * power of PI, be the highest priority of that context.
	 */
	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
		return true;

	if (rb) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		bool preempt = false;

		if (engine == ve->siblings[0]) { /* only preempt one sibling */
			struct i915_request *next;

			rcu_read_lock();
			next = READ_ONCE(ve->request);
			if (next)
				preempt = rq_prio(next) > last_prio;
			rcu_read_unlock();
		}

		if (preempt)
			return preempt;
	}

	/*
	 * If the inflight context did not trigger the preemption, then maybe
	 * it was the set of queued requests? Pick the highest priority in
	 * the queue (the first active priolist) and see if it deserves to be
	 * running instead of ELSP[0].
	 *
	 * The highest priority request in the queue can not be either
	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
	 * context, its priority would not exceed ELSP[0] aka last_prio.
	 */
	return queue_prio(&engine->execlists) > last_prio;
}

__maybe_unused static inline bool
assert_priority_queue(const struct i915_request *prev,
		      const struct i915_request *next)
{
	/*
	 * Without preemption, the prev may refer to the still active element
	 * which we refuse to let go.
	 *
	 * Even with preemption, there are times when we think it is better not
	 * to preempt and leave an ostensibly lower priority request in flight.
	 */
	if (i915_request_is_active(prev))
		return true;

	return rq_prio(prev) >= rq_prio(next);
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36:    reserved
 *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
 *      bit  54:       mbz, reserved for use by hardware
 *      bits 55-60:    SW counter
 *      bits 61-63:    engine class
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
 */
static u64
lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
{
	u64 desc;

	desc = INTEL_LEGACY_32B_CONTEXT;
	if (i915_vm_is_4lvl(ce->vm))
		desc = INTEL_LEGACY_64B_CONTEXT;
	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;

	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
	if (IS_GEN(engine->i915, 8))
		desc |= GEN8_CTX_L3LLC_COHERENT;

	desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
	/*
	 * The following 32bits are copied into the OA reports (dword 2).
	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
	 * anything below.
	 */
	if (INTEL_GEN(engine->i915) >= 11) {
		desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
								/* bits 48-53 */

		desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
								/* bits 61-63 */
	}

	return desc;
}
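/*
 * Example (illustrative only): for a context using a 4-level (64b) ppGTT on
 * a Gen11+ video engine, lrc_descriptor() ORs together the 64B addressing
 * mode shifted into the flag bits, GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE, the
 * GGTT offset of the context state in bits 12-31, and the engine instance
 * and class in the upper dword. The SW context ID (bits 37-47) is left zero
 * here and filled in per submission by __execlists_schedule_in() below.
 */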
static inline unsigned int dword_in_page(void *addr)
{
	return offset_in_page(addr) / sizeof(u32);
}

static void set_offsets(u32 *regs,
			const u8 *data,
			const struct intel_engine_cs *engine,
			bool clear)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END(x) 0, (x)
{
	const u32 base = engine->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			if (clear)
				memset32(regs, MI_NOOP, count);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		if (INTEL_GEN(engine->i915) >= 11)
			*regs |= MI_LRI_CS_MMIO;
		regs++;

		GEM_BUG_ON(!count);
		do {
			u32 offset = 0;
			u8 v;

			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			if (clear)
				regs[1] = 0;
			regs += 2;
		} while (--count);
	}

	if (clear) {
		u8 count = *++data;

		/* Clear past the tail for HW access */
		GEM_BUG_ON(dword_in_page(regs) > count);
		memset32(regs, MI_NOOP, count - dword_in_page(regs));

		/* Close the batch; used mainly by live_lrc_layout() */
		*regs = MI_BATCH_BUFFER_END;
		if (INTEL_GEN(engine->i915) >= 10)
			*regs |= BIT(0);
	}
}
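/*
 * Decoding sketch for the tables below (illustrative, under the macros
 * defined above): a fragment such as
 *
 *	LRI(2, POSTED),
 *	REG16(0x200),
 *	REG(0x028),
 *
 * expands in set_offsets() into one MI_LOAD_REGISTER_IMM(2) header (with
 * MI_LRI_FORCE_POSTED, plus MI_LRI_CS_MMIO on Gen11+) followed by the
 * register/value pairs { mmio_base + 0x200, 0 } and { mmio_base + 0x028, 0 }
 * when building the pristine (clear) image. NOP(n) skips n dwords, and END(n)
 * records the total size of the register state, used to pad the rest of
 * those n dwords with MI_NOOP and close the image with MI_BATCH_BUFFER_END.
 */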
static const u8 gen8_xcs_offsets[] = {
	NOP(1),
	LRI(11, 0),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),

	NOP(9),
	LRI(9, 0),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(2, 0),
	REG16(0x200),
	REG(0x028),

	END(80)
};

static const u8 gen9_xcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, POSTED),
	REG16(0x200),

	NOP(13),
	LRI(44, POSTED),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),

	END(176)
};

static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END(80)
};

static const u8 gen8_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0x0c8),

	END(80)
};

static const u8 gen9_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x34),
	REG(0x30),
	REG(0x38),
	REG(0x3c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0xc8),

	NOP(13),
	LRI(44, POSTED),
	REG(0x28),
	REG(0x9c),
	REG(0xc0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x68),

	END(176)
};

static const u8 gen11_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(1, POSTED),
	REG(0x1b0),

	NOP(10),
	LRI(1, 0),
	REG(0x0c8),

	END(80)
};
static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END(80)
};

#undef END
#undef REG16
#undef REG
#undef LRI
#undef NOP

static const u8 *reg_offsets(const struct intel_engine_cs *engine)
{
	/*
	 * The gen12+ lists only have the registers we program in the basic
	 * default state. We rely on the context image using relative
	 * addressing to automatically fix up the register state between the
	 * physical engines for the virtual engine.
	 */
	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
		   !intel_engine_has_relative_mmio(engine));

	if (engine->class == RENDER_CLASS) {
		if (INTEL_GEN(engine->i915) >= 12)
			return gen12_rcs_offsets;
		else if (INTEL_GEN(engine->i915) >= 11)
			return gen11_rcs_offsets;
		else if (INTEL_GEN(engine->i915) >= 9)
			return gen9_rcs_offsets;
		else
			return gen8_rcs_offsets;
	} else {
		if (INTEL_GEN(engine->i915) >= 12)
			return gen12_xcs_offsets;
		else if (INTEL_GEN(engine->i915) >= 9)
			return gen9_xcs_offsets;
		else
			return gen8_xcs_offsets;
	}
}

static struct i915_request *
__unwind_incomplete_requests(struct intel_engine_cs *engine)
{
	struct i915_request *rq, *rn, *active = NULL;
	struct list_head *uninitialized_var(pl);
	int prio = I915_PRIORITY_INVALID;

	lockdep_assert_held(&engine->active.lock);

	list_for_each_entry_safe_reverse(rq, rn,
					 &engine->active.requests,
					 sched.link) {
		if (i915_request_completed(rq))
			continue; /* XXX */

		__i915_request_unsubmit(rq);

		/*
		 * Push the request back into the queue for later resubmission.
		 * If this request is not native to this physical engine (i.e.
		 * it came from a virtual source), push it back onto the virtual
		 * engine so that it can be moved across onto another physical
		 * engine as load dictates.
		 */
		if (likely(rq->execution_mask == engine->mask)) {
			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
			if (rq_prio(rq) != prio) {
				prio = rq_prio(rq);
				pl = i915_sched_lookup_priolist(engine, prio);
			}
			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));

			list_move(&rq->sched.link, pl);
			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);

			active = rq;
		} else {
			struct intel_engine_cs *owner = rq->context->engine;

			/*
			 * Decouple the virtual breadcrumb before moving it
			 * back to the virtual engine -- we don't want the
			 * request to complete in the background and try
			 * and cancel the breadcrumb on the virtual engine
			 * (instead of the old engine where it is linked)!
			 */
			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
				     &rq->fence.flags)) {
				spin_lock_nested(&rq->lock,
						 SINGLE_DEPTH_NESTING);
				i915_request_cancel_breadcrumb(rq);
				spin_unlock(&rq->lock);
			}
			WRITE_ONCE(rq->engine, owner);
			owner->submit_request(rq);
			active = NULL;
		}
	}

	return active;
}

struct i915_request *
execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
{
	struct intel_engine_cs *engine =
		container_of(execlists, typeof(*engine), execlists);

	return __unwind_incomplete_requests(engine);
}

static inline void
execlists_context_status_change(struct i915_request *rq, unsigned long status)
{
	/*
	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
	 * the compiler should eliminate this function as dead-code.
	 */
	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
		return;

	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
				   status, rq);
}

static void intel_engine_context_in(struct intel_engine_cs *engine)
{
	unsigned long flags;

	if (READ_ONCE(engine->stats.enabled) == 0)
		return;

	write_seqlock_irqsave(&engine->stats.lock, flags);

	if (engine->stats.enabled > 0) {
		if (engine->stats.active++ == 0)
			engine->stats.start = ktime_get();
		GEM_BUG_ON(engine->stats.active == 0);
	}

	write_sequnlock_irqrestore(&engine->stats.lock, flags);
}

static void intel_engine_context_out(struct intel_engine_cs *engine)
{
	unsigned long flags;

	if (READ_ONCE(engine->stats.enabled) == 0)
		return;

	write_seqlock_irqsave(&engine->stats.lock, flags);

	if (engine->stats.enabled > 0) {
		ktime_t last;

		if (engine->stats.active && --engine->stats.active == 0) {
			/*
			 * Decrement the active context count and in case GPU
			 * is now idle add up to the running total.
			 */
			last = ktime_sub(ktime_get(), engine->stats.start);

			engine->stats.total = ktime_add(engine->stats.total,
							last);
		} else if (engine->stats.active == 0) {
			/*
			 * After turning on engine stats, context out might be
			 * the first event in which case we account from the
			 * time stats gathering was turned on.
			 */
			last = ktime_sub(ktime_get(), engine->stats.enabled_at);

			engine->stats.total = ktime_add(engine->stats.total,
							last);
		}
	}

	write_sequnlock_irqrestore(&engine->stats.lock, flags);
}

static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
{
	if (INTEL_GEN(engine->i915) >= 12)
		return 0x60;
	else if (INTEL_GEN(engine->i915) >= 9)
		return 0x54;
	else if (engine->class == RENDER_CLASS)
		return 0x58;
	else
		return -1;
}

static void
execlists_check_context(const struct intel_context *ce,
			const struct intel_engine_cs *engine)
{
	const struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;
	bool valid = true;
	int x;

	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_START],
		       i915_ggtt_offset(ring->vma));
		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
		valid = false;
	}

	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_CTL],
		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
		valid = false;
	}

	x = lrc_ring_mi_mode(engine);
	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
		       engine->name, regs[x + 1]);
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
		valid = false;
	}

	WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
}

static void restore_default_state(struct intel_context *ce,
				  struct intel_engine_cs *engine)
{
	u32 *regs = ce->lrc_reg_state;

	if (engine->pinned_default_state)
		memcpy(regs, /* skip restoring the vanilla PPHWSP */
		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
		       engine->context_size - PAGE_SIZE);

	execlists_init_reg_state(regs, ce, engine, ce->ring, false);
}

static void reset_active(struct i915_request *rq,
			 struct intel_engine_cs *engine)
{
	struct intel_context * const ce = rq->context;
	u32 head;

	/*
	 * The executing context has been cancelled. We want to prevent
	 * further execution along this context and propagate the error on
	 * to anything depending on its results.
	 *
	 * In __i915_request_submit(), we apply the -EIO and remove the
	 * requests' payloads for any banned requests. But first, we must
	 * rewind the context back to the start of the incomplete request so
	 * that we do not jump back into the middle of the batch.
	 *
	 * We preserve the breadcrumbs and semaphores of the incomplete
	 * requests so that inter-timeline dependencies (i.e other timelines)
	 * remain correctly ordered. And we defer to __i915_request_submit()
	 * so that all asynchronous waits are correctly handled.
	 */
	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
		     rq->fence.context, rq->fence.seqno);

	/* On resubmission of the active request, payload will be scrubbed */
	if (i915_request_completed(rq))
		head = rq->tail;
	else
		head = active_request(ce->timeline, rq)->head;
	head = intel_ring_wrap(ce->ring, head);

	/* Scrub the context image to prevent replaying the previous batch */
	restore_default_state(ce, engine);
	__execlists_update_reg_state(ce, engine, head);

	/* We've switched away, so this should be a no-op, but intent matters */
	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
}

static u32 intel_context_get_runtime(const struct intel_context *ce)
{
	/*
	 * We can use either ppHWSP[16] which is recorded before the context
	 * switch (and so excludes the cost of context switches) or use the
	 * value from the context image itself, which is saved/restored earlier
	 * and so includes the cost of the save.
	 */
	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
}

static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
{
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
	ce->runtime.num_underflow += dt < 0;
	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
#endif
}

static void intel_context_update_runtime(struct intel_context *ce)
{
	u32 old;
	s32 dt;

	if (intel_context_is_barrier(ce))
		return;

	old = ce->runtime.last;
	ce->runtime.last = intel_context_get_runtime(ce);
	dt = ce->runtime.last - old;

	if (unlikely(dt <= 0)) {
		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
			 old, ce->runtime.last, dt);
		st_update_runtime_underflow(ce, dt);
		return;
	}

	ewma_runtime_add(&ce->runtime.avg, dt);
	ce->runtime.total += dt;
}

static inline struct intel_engine_cs *
__execlists_schedule_in(struct i915_request *rq)
{
	struct intel_engine_cs * const engine = rq->engine;
	struct intel_context * const ce = rq->context;

	intel_context_get(ce);

	if (unlikely(intel_context_is_banned(ce)))
		reset_active(rq, engine);

	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		execlists_check_context(ce, engine);

	ce->lrc_desc &= ~GENMASK_ULL(47, 37);
	if (ce->tag) {
		/* Use a fixed tag for OA and friends */
		ce->lrc_desc |= (u64)ce->tag << 32;
	} else {
		/* We don't need a strict matching tag, just different values */
		ce->lrc_desc |=
			(u64)(++engine->context_tag % NUM_CONTEXT_TAG) <<
			GEN11_SW_CTX_ID_SHIFT;
		BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
	}

	__intel_gt_pm_get(engine->gt);
	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
	intel_engine_context_in(engine);

	return engine;
}

static inline struct i915_request *
execlists_schedule_in(struct i915_request *rq, int idx)
{
	struct intel_context * const ce = rq->context;
	struct intel_engine_cs *old;

	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
	trace_i915_request_in(rq, idx);

	old = READ_ONCE(ce->inflight);
	do {
		if (!old) {
			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
			break;
		}
	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));

	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
	return i915_request_get(rq);
}
static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
{
	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
	struct i915_request *next = READ_ONCE(ve->request);

	if (next && next->execution_mask & ~rq->execution_mask)
		tasklet_schedule(&ve->base.execlists.tasklet);
}

static inline void
__execlists_schedule_out(struct i915_request *rq,
			 struct intel_engine_cs * const engine)
{
	struct intel_context * const ce = rq->context;

	/*
	 * NB process_csb() is not under the engine->active.lock and hence
	 * schedule_out can race with schedule_in meaning that we should
	 * refrain from doing non-trivial work here.
	 */

	/*
	 * If we have just completed this context, the engine may now be
	 * idle and we want to re-enter powersaving.
	 */
	if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
	    i915_request_completed(rq))
		intel_engine_add_retire(engine, ce->timeline);

	intel_context_update_runtime(ce);
	intel_engine_context_out(engine);
	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
	intel_gt_pm_put_async(engine->gt);

	/*
	 * If this is part of a virtual engine, its next request may
	 * have been blocked waiting for access to the active context.
	 * We have to kick all the siblings again in case we need to
	 * switch (e.g. the next request is not runnable on this
	 * engine). Hopefully, we will already have submitted the next
	 * request before the tasklet runs and do not need to rebuild
	 * each virtual tree and kick everyone again.
	 */
	if (ce->engine != engine)
		kick_siblings(rq, ce);

	intel_context_put(ce);
}

static inline void
execlists_schedule_out(struct i915_request *rq)
{
	struct intel_context * const ce = rq->context;
	struct intel_engine_cs *cur, *old;

	trace_i915_request_out(rq);

	old = READ_ONCE(ce->inflight);
	do
		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
	while (!try_cmpxchg(&ce->inflight, &old, cur));
	if (!cur)
		__execlists_schedule_out(rq, old);

	i915_request_put(rq);
}
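/*
 * Note on ce->inflight (a sketch of the existing convention, not new
 * behaviour): the pointer packs a small submission count into its low bits,
 * manipulated via ptr_inc()/ptr_dec() and read back with
 * ptr_unmask_bits(ptr, 2). schedule_in installs the engine pointer on the
 * first submission and only bumps the count for back-to-back resubmissions
 * (e.g. lite-restores); schedule_out above tears the tracking down only once
 * that count returns to zero.
 */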
static u64 execlists_update_context(struct i915_request *rq)
{
	struct intel_context *ce = rq->context;
	u64 desc = ce->lrc_desc;
	u32 tail, prev;

	/*
	 * WaIdleLiteRestore:bdw,skl
	 *
	 * We should never submit the context with the same RING_TAIL twice
	 * just in case we submit an empty ring, which confuses the HW.
	 *
	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
	 * the normal request to be able to always advance the RING_TAIL on
	 * subsequent resubmissions (for lite restore). Should that fail us,
	 * and we try and submit the same tail again, force the context
	 * reload.
	 *
	 * If we need to return to a preempted context, we need to skip the
	 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
	 * HW has a tendency to ignore us rewinding the TAIL to the end of
	 * an earlier request.
	 */
	tail = intel_ring_set_tail(rq->ring, rq->tail);
	prev = ce->lrc_reg_state[CTX_RING_TAIL];
	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
		desc |= CTX_DESC_FORCE_RESTORE;
	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
	rq->tail = rq->wa_tail;

	/*
	 * Make sure the context image is complete before we submit it to HW.
	 *
	 * Ostensibly, writes (including the WCB) should be flushed prior to
	 * an uncached write such as our mmio register access, but the
	 * empirical evidence (esp. on Braswell) suggests that the WC write
	 * into memory may not be visible to the HW prior to the completion
	 * of the UC register write and that we may begin execution from the
	 * context before its image is complete leading to invalid PD chasing.
	 */
	wmb();

	ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
	return desc;
}

static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
{
	if (execlists->ctrl_reg) {
		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
	} else {
		writel(upper_32_bits(desc), execlists->submit_reg);
		writel(lower_32_bits(desc), execlists->submit_reg);
	}
}
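/*
 * A brief note on the two paths above (descriptive, based on the code as
 * written): with the ELSQ (execlists->ctrl_reg, Gen11+) each port has its
 * own pair of submit-queue registers, written lower dword then upper, and
 * nothing is loaded until execlists_submit_ports() pokes EL_CTRL_LOAD. On
 * the legacy ELSP there is a single submit register, so the caller writes
 * the ports from last to first, upper dword before lower, and the completed
 * sequence of writes forms the execution-list load.
 */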
static __maybe_unused void
trace_ports(const struct intel_engine_execlists *execlists,
	    const char *msg,
	    struct i915_request * const *ports)
{
	const struct intel_engine_cs *engine =
		container_of(execlists, typeof(*engine), execlists);

	if (!ports[0])
		return;

	ENGINE_TRACE(engine, "%s { %llx:%lld%s, %llx:%lld }\n", msg,
		     ports[0]->fence.context,
		     ports[0]->fence.seqno,
		     i915_request_completed(ports[0]) ? "!" :
		     i915_request_started(ports[0]) ? "*" :
		     "",
		     ports[1] ? ports[1]->fence.context : 0,
		     ports[1] ? ports[1]->fence.seqno : 0);
}

static inline bool
reset_in_progress(const struct intel_engine_execlists *execlists)
{
	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
}

static __maybe_unused bool
assert_pending_valid(const struct intel_engine_execlists *execlists,
		     const char *msg)
{
	struct i915_request * const *port, *rq;
	struct intel_context *ce = NULL;
	bool sentinel = false;

	trace_ports(execlists, msg, execlists->pending);

	/* We may be messing around with the lists during reset, lalala */
	if (reset_in_progress(execlists))
		return true;

	if (!execlists->pending[0]) {
		GEM_TRACE_ERR("Nothing pending for promotion!\n");
		return false;
	}

	if (execlists->pending[execlists_num_ports(execlists)]) {
		GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
			      execlists_num_ports(execlists));
		return false;
	}

	for (port = execlists->pending; (rq = *port); port++) {
		unsigned long flags;
		bool ok = true;

		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
		GEM_BUG_ON(!i915_request_is_active(rq));

		if (ce == rq->context) {
			GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n",
				      ce->timeline->fence_context,
				      port - execlists->pending);
			return false;
		}
		ce = rq->context;

		/*
		 * Sentinels are supposed to be lonely so they flush the
		 * current execution off the HW. Check that they are the
		 * only request in the pending submission.
		 */
		if (sentinel) {
			GEM_TRACE_ERR("context:%llx after sentinel in pending[%zd]\n",
				      ce->timeline->fence_context,
				      port - execlists->pending);
			return false;
		}

		sentinel = i915_request_has_sentinel(rq);
		if (sentinel && port != execlists->pending) {
			GEM_TRACE_ERR("sentinel context:%llx not in prime position[%zd]\n",
				      ce->timeline->fence_context,
				      port - execlists->pending);
			return false;
		}

		/* Hold tightly onto the lock to prevent concurrent retires! */
		if (!spin_trylock_irqsave(&rq->lock, flags))
			continue;

		if (i915_request_completed(rq))
			goto unlock;

		if (i915_active_is_idle(&ce->active) &&
		    !intel_context_is_barrier(ce)) {
			GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n",
				      ce->timeline->fence_context,
				      port - execlists->pending);
			ok = false;
			goto unlock;
		}

		if (!i915_vma_is_pinned(ce->state)) {
			GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n",
				      ce->timeline->fence_context,
				      port - execlists->pending);
			ok = false;
			goto unlock;
		}

		if (!i915_vma_is_pinned(ce->ring->vma)) {
			GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n",
				      ce->timeline->fence_context,
				      port - execlists->pending);
			ok = false;
			goto unlock;
		}

unlock:
		spin_unlock_irqrestore(&rq->lock, flags);
		if (!ok)
			return false;
	}

	return ce;
}

static void execlists_submit_ports(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists *execlists = &engine->execlists;
	unsigned int n;

	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));

	/*
	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
	 * not be relinquished until the device is idle (see
	 * i915_gem_idle_work_handler()). As a precaution, we make sure
	 * that all ELSP are drained i.e. we have processed the CSB,
	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
	 */
	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));

	/*
	 * ELSQ note: the submit queue is not cleared after being submitted
	 * to the HW so we need to make sure we always clean it up. This is
	 * currently ensured by the fact that we always write the same number
	 * of elsq entries, keep this in mind before changing the loop below.
	 */
	for (n = execlists_num_ports(execlists); n--; ) {
		struct i915_request *rq = execlists->pending[n];

		write_desc(execlists,
			   rq ? execlists_update_context(rq) : 0,
			   n);
	}

	/* we need to manually load the submit queue */
	if (execlists->ctrl_reg)
		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
}

static bool ctx_single_port_submission(const struct intel_context *ce)
{
	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
		intel_context_force_single_submission(ce));
}

static bool can_merge_ctx(const struct intel_context *prev,
			  const struct intel_context *next)
{
	if (prev != next)
		return false;

	if (ctx_single_port_submission(prev))
		return false;

	return true;
}

static unsigned long i915_request_flags(const struct i915_request *rq)
{
	return READ_ONCE(rq->fence.flags);
}

static bool can_merge_rq(const struct i915_request *prev,
			 const struct i915_request *next)
{
	GEM_BUG_ON(prev == next);
	GEM_BUG_ON(!assert_priority_queue(prev, next));

	/*
	 * We do not submit known completed requests. Therefore if the next
	 * request is already completed, we can pretend to merge it in
	 * with the previous context (and we will skip updating the ELSP
	 * and tracking). Thus hopefully keeping the ELSP full with active
	 * contexts, despite the best efforts of preempt-to-busy to confuse
	 * us.
	 */
	if (i915_request_completed(next))
		return true;

	if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
		      BIT(I915_FENCE_FLAG_SENTINEL))))
		return false;

	if (!can_merge_ctx(prev->context, next->context))
		return false;

	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
	return true;
}

static void virtual_update_register_offsets(u32 *regs,
					    struct intel_engine_cs *engine)
{
	set_offsets(regs, reg_offsets(engine), engine, false);
}

static bool virtual_matches(const struct virtual_engine *ve,
			    const struct i915_request *rq,
			    const struct intel_engine_cs *engine)
{
	const struct intel_engine_cs *inflight;

	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
		return false;

	/*
	 * We track when the HW has completed saving the context image
	 * (i.e. when we have seen the final CS event switching out of
	 * the context) and must not overwrite the context image before
	 * then. This restricts us to only using the active engine
	 * while the previous virtualized request is inflight (so
	 * we reuse the register offsets). This is a very small
	 * hysteresis on the greedy selection algorithm.
	 */
	inflight = intel_context_inflight(&ve->context);
	if (inflight && inflight != engine)
		return false;

	return true;
}

static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
				     struct i915_request *rq)
{
	struct intel_engine_cs *old = ve->siblings[0];

	/* All unattached (rq->engine == old) must already be completed */

	spin_lock(&old->breadcrumbs.irq_lock);
	if (!list_empty(&ve->context.signal_link)) {
		list_del_init(&ve->context.signal_link);

		/*
		 * We cannot acquire the new engine->breadcrumbs.irq_lock
		 * (as we are holding a breadcrumbs.irq_lock already),
		 * so attach this request to the signaler on submission.
		 * The queued irq_work will occur when we finally drop
		 * the engine->active.lock after dequeue.
		 */
		set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags);

		/* Also transfer the pending irq_work for the old breadcrumb. */
		intel_engine_signal_breadcrumbs(rq->engine);
	}
	spin_unlock(&old->breadcrumbs.irq_lock);
}

#define for_each_waiter(p__, rq__) \
	list_for_each_entry_lockless(p__, \
				     &(rq__)->sched.waiters_list, \
				     wait_link)

#define for_each_signaler(p__, rq__) \
	list_for_each_entry_rcu(p__, \
				&(rq__)->sched.signalers_list, \
				signal_link)

static void defer_request(struct i915_request *rq, struct list_head * const pl)
{
	LIST_HEAD(list);

	/*
	 * We want to move the interrupted request to the back of
	 * the round-robin list (i.e. its priority level), but
	 * in doing so, we must then move all requests that were in
	 * flight and were waiting for the interrupted request to
	 * be run after it again.
	 */
	do {
		struct i915_dependency *p;

		GEM_BUG_ON(i915_request_is_active(rq));
		list_move_tail(&rq->sched.link, pl);

		for_each_waiter(p, rq) {
			struct i915_request *w =
				container_of(p->waiter, typeof(*w), sched);

			/* Leave semaphores spinning on the other engines */
			if (w->engine != rq->engine)
				continue;

			/* No waiter should start before its signaler */
			GEM_BUG_ON(i915_request_started(w) &&
				   !i915_request_completed(rq));

			GEM_BUG_ON(i915_request_is_active(w));
			if (!i915_request_is_ready(w))
				continue;

			if (rq_prio(w) < rq_prio(rq))
				continue;

			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
			list_move_tail(&w->sched.link, &list);
		}

		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
	} while (rq);
}

static void defer_active(struct intel_engine_cs *engine)
{
	struct i915_request *rq;

	rq = __unwind_incomplete_requests(engine);
	if (!rq)
		return;

	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
}
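/*
 * Descriptive note on timeslicing (summarising the code that follows, not a
 * new mechanism): when the per-engine timeslice timer expires and another
 * context of equal or higher priority is waiting, execlists_dequeue() calls
 * defer_active() to unwind the running request and push it, together with
 * any of its ready same-priority waiters, to the back of its priority level
 * before resubmitting. Equal-priority contexts therefore round-robin on the
 * timer without ever being demoted below their own priority.
 */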
static bool
need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
{
	int hint;

	if (!intel_engine_has_timeslices(engine))
		return false;

	hint = engine->execlists.queue_priority_hint;
	if (!list_is_last(&rq->sched.link, &engine->active.requests))
		hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));

	return hint >= effective_prio(rq);
}

static int
switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
{
	if (list_is_last(&rq->sched.link, &engine->active.requests))
		return INT_MIN;

	return rq_prio(list_next_entry(rq, sched.link));
}

static inline unsigned long
timeslice(const struct intel_engine_cs *engine)
{
	return READ_ONCE(engine->props.timeslice_duration_ms);
}

static unsigned long
active_timeslice(const struct intel_engine_cs *engine)
{
	const struct intel_engine_execlists *execlists = &engine->execlists;
	const struct i915_request *rq = *execlists->active;

	if (!rq || i915_request_completed(rq))
		return 0;

	if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
		return 0;

	return timeslice(engine);
}

static void set_timeslice(struct intel_engine_cs *engine)
{
	if (!intel_engine_has_timeslices(engine))
		return;

	set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
}

static void start_timeslice(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists *execlists = &engine->execlists;
	int prio = queue_prio(execlists);

	WRITE_ONCE(execlists->switch_priority_hint, prio);
	if (prio == INT_MIN)
		return;

	if (timer_pending(&execlists->timer))
		return;

	set_timer_ms(&execlists->timer, timeslice(engine));
}

static void record_preemption(struct intel_engine_execlists *execlists)
{
	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
}

static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
					    const struct i915_request *rq)
{
	if (!rq)
		return 0;

	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
	if (unlikely(intel_context_is_banned(rq->context)))
		return 1;

	return READ_ONCE(engine->props.preempt_timeout_ms);
}

static void set_preempt_timeout(struct intel_engine_cs *engine,
				const struct i915_request *rq)
{
	if (!intel_engine_has_preempt_reset(engine))
		return;

	set_timer_ms(&engine->execlists.preempt,
		     active_preempt_timeout(engine, rq));
}

static inline void clear_ports(struct i915_request **ports, int count)
{
	memset_p((void **)ports, NULL, count);
}

static void execlists_dequeue(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct i915_request **port = execlists->pending;
	struct i915_request ** const last_port = port + execlists->port_mask;
	struct i915_request * const *active;
	struct i915_request *last;
	struct rb_node *rb;
	bool submit = false;

	/*
	 * Hardware submission is through 2 ports. Conceptually each port
	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
	 * static for a context, and unique to each, so we only execute
	 * requests belonging to a single context from each ring. RING_HEAD
	 * is maintained by the CS in the context image, it marks the place
	 * where it got up to last time, and through RING_TAIL we tell the CS
	 * where we want to execute up to this time.
	 *
	 * In this list the requests are in order of execution. Consecutive
	 * requests from the same context are adjacent in the ringbuffer. We
	 * can combine these requests into a single RING_TAIL update:
	 *
	 *              RING_HEAD...req1...req2
	 *                                    ^- RING_TAIL
	 * since to execute req2 the CS must first execute req1.
	 *
	 * Our goal then is to point each port to the end of a consecutive
	 * sequence of requests as being the most optimal (fewest wake ups
	 * and context switches) submission.
	 */

	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		struct i915_request *rq = READ_ONCE(ve->request);

		if (!rq) { /* lazily cleanup after another engine handled rq */
			rb_erase_cached(rb, &execlists->virtual);
			RB_CLEAR_NODE(rb);
			rb = rb_first_cached(&execlists->virtual);
			continue;
		}

		if (!virtual_matches(ve, rq, engine)) {
			rb = rb_next(rb);
			continue;
		}

		break;
	}

	/*
	 * If the queue is higher priority than the last
	 * request in the currently active context, submit afresh.
1912 * We will resubmit again afterwards in case we need to split 1913 * the active context to interject the preemption request, 1914 * i.e. we will retrigger preemption following the ack in case 1915 * of trouble. 1916 */ 1917 active = READ_ONCE(execlists->active); 1918 while ((last = *active) && i915_request_completed(last)) 1919 active++; 1920 1921 if (last) { 1922 if (need_preempt(engine, last, rb)) { 1923 ENGINE_TRACE(engine, 1924 "preempting last=%llx:%lld, prio=%d, hint=%d\n", 1925 last->fence.context, 1926 last->fence.seqno, 1927 last->sched.attr.priority, 1928 execlists->queue_priority_hint); 1929 record_preemption(execlists); 1930 1931 /* 1932 * Don't let the RING_HEAD advance past the breadcrumb 1933 * as we unwind (and until we resubmit) so that we do 1934 * not accidentally tell it to go backwards. 1935 */ 1936 ring_set_paused(engine, 1); 1937 1938 /* 1939 * Note that we have not stopped the GPU at this point, 1940 * so we are unwinding the incomplete requests as they 1941 * remain inflight and so by the time we do complete 1942 * the preemption, some of the unwound requests may 1943 * complete! 1944 */ 1945 __unwind_incomplete_requests(engine); 1946 1947 last = NULL; 1948 } else if (need_timeslice(engine, last) && 1949 timer_expired(&engine->execlists.timer)) { 1950 ENGINE_TRACE(engine, 1951 "expired last=%llx:%lld, prio=%d, hint=%d\n", 1952 last->fence.context, 1953 last->fence.seqno, 1954 last->sched.attr.priority, 1955 execlists->queue_priority_hint); 1956 1957 ring_set_paused(engine, 1); 1958 defer_active(engine); 1959 1960 /* 1961 * Unlike for preemption, if we rewind and continue 1962 * executing the same context as previously active, 1963 * the order of execution will remain the same and 1964 * the tail will only advance. We do not need to 1965 * force a full context restore, as a lite-restore 1966 * is sufficient to resample the monotonic TAIL. 1967 * 1968 * If we switch to any other context, similarly we 1969 * will not rewind TAIL of current context, and 1970 * normal save/restore will preserve state and allow 1971 * us to later continue executing the same request. 1972 */ 1973 last = NULL; 1974 } else { 1975 /* 1976 * Otherwise if we already have a request pending 1977 * for execution after the current one, we can 1978 * just wait until the next CS event before 1979 * queuing more. In either case we will force a 1980 * lite-restore preemption event, but if we wait 1981 * we hopefully coalesce several updates into a single 1982 * submission. 1983 */ 1984 if (!list_is_last(&last->sched.link, 1985 &engine->active.requests)) { 1986 /* 1987 * Even if ELSP[1] is occupied and not worthy 1988 * of timeslices, our queue might be. 
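 *
 * e.g. both ELSP ports may already be filled by back-to-back requests
 * from ordered contexts while an equal-priority request sits in the
 * priority queue; start_timeslice() records queue_prio() in
 * switch_priority_hint and arms the timer so the queued request is not
 * starved behind the pair currently in the ELSP.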
1989 */ 1990 start_timeslice(engine); 1991 return; 1992 } 1993 } 1994 } 1995 1996 while (rb) { /* XXX virtual is always taking precedence */ 1997 struct virtual_engine *ve = 1998 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 1999 struct i915_request *rq; 2000 2001 spin_lock(&ve->base.active.lock); 2002 2003 rq = ve->request; 2004 if (unlikely(!rq)) { /* lost the race to a sibling */ 2005 spin_unlock(&ve->base.active.lock); 2006 rb_erase_cached(rb, &execlists->virtual); 2007 RB_CLEAR_NODE(rb); 2008 rb = rb_first_cached(&execlists->virtual); 2009 continue; 2010 } 2011 2012 GEM_BUG_ON(rq != ve->request); 2013 GEM_BUG_ON(rq->engine != &ve->base); 2014 GEM_BUG_ON(rq->context != &ve->context); 2015 2016 if (rq_prio(rq) >= queue_prio(execlists)) { 2017 if (!virtual_matches(ve, rq, engine)) { 2018 spin_unlock(&ve->base.active.lock); 2019 rb = rb_next(rb); 2020 continue; 2021 } 2022 2023 if (last && !can_merge_rq(last, rq)) { 2024 spin_unlock(&ve->base.active.lock); 2025 start_timeslice(engine); 2026 return; /* leave this for another sibling */ 2027 } 2028 2029 ENGINE_TRACE(engine, 2030 "virtual rq=%llx:%lld%s, new engine? %s\n", 2031 rq->fence.context, 2032 rq->fence.seqno, 2033 i915_request_completed(rq) ? "!" : 2034 i915_request_started(rq) ? "*" : 2035 "", 2036 yesno(engine != ve->siblings[0])); 2037 2038 WRITE_ONCE(ve->request, NULL); 2039 WRITE_ONCE(ve->base.execlists.queue_priority_hint, 2040 INT_MIN); 2041 rb_erase_cached(rb, &execlists->virtual); 2042 RB_CLEAR_NODE(rb); 2043 2044 GEM_BUG_ON(!(rq->execution_mask & engine->mask)); 2045 WRITE_ONCE(rq->engine, engine); 2046 2047 if (engine != ve->siblings[0]) { 2048 u32 *regs = ve->context.lrc_reg_state; 2049 unsigned int n; 2050 2051 GEM_BUG_ON(READ_ONCE(ve->context.inflight)); 2052 2053 if (!intel_engine_has_relative_mmio(engine)) 2054 virtual_update_register_offsets(regs, 2055 engine); 2056 2057 if (!list_empty(&ve->context.signals)) 2058 virtual_xfer_breadcrumbs(ve, rq); 2059 2060 /* 2061 * Move the bound engine to the top of the list 2062 * for future execution. We then kick this 2063 * tasklet first before checking others, so that 2064 * we preferentially reuse this set of bound 2065 * registers. 2066 */ 2067 for (n = 1; n < ve->num_siblings; n++) { 2068 if (ve->siblings[n] == engine) { 2069 swap(ve->siblings[n], 2070 ve->siblings[0]); 2071 break; 2072 } 2073 } 2074 2075 GEM_BUG_ON(ve->siblings[0] != engine); 2076 } 2077 2078 if (__i915_request_submit(rq)) { 2079 submit = true; 2080 last = rq; 2081 } 2082 i915_request_put(rq); 2083 2084 /* 2085 * Hmm, we have a bunch of virtual engine requests, 2086 * but the first one was already completed (thanks 2087 * preempt-to-busy!). Keep looking at the veng queue 2088 * until we have no more relevant requests (i.e. 2089 * the normal submit queue has higher priority). 2090 */ 2091 if (!submit) { 2092 spin_unlock(&ve->base.active.lock); 2093 rb = rb_first_cached(&execlists->virtual); 2094 continue; 2095 } 2096 } 2097 2098 spin_unlock(&ve->base.active.lock); 2099 break; 2100 } 2101 2102 while ((rb = rb_first_cached(&execlists->queue))) { 2103 struct i915_priolist *p = to_priolist(rb); 2104 struct i915_request *rq, *rn; 2105 int i; 2106 2107 priolist_for_each_request_consume(rq, rn, p, i) { 2108 bool merge = true; 2109 2110 /* 2111 * Can we combine this request with the current port? 2112 * It has to be the same context/ringbuffer and not 2113 * have any exceptions (e.g. GVT saying never to 2114 * combine contexts). 
2115 * 2116 * If we can combine the requests, we can execute both 2117 * by updating the RING_TAIL to point to the end of the 2118 * second request, and so we never need to tell the 2119 * hardware about the first. 2120 */ 2121 if (last && !can_merge_rq(last, rq)) { 2122 /* 2123 * If we are on the second port and cannot 2124 * combine this request with the last, then we 2125 * are done. 2126 */ 2127 if (port == last_port) 2128 goto done; 2129 2130 /* 2131 * We must not populate both ELSP[] with the 2132 * same LRCA, i.e. we must submit 2 different 2133 * contexts if we submit 2 ELSP. 2134 */ 2135 if (last->context == rq->context) 2136 goto done; 2137 2138 if (i915_request_has_sentinel(last)) 2139 goto done; 2140 2141 /* 2142 * If GVT overrides us we only ever submit 2143 * port[0], leaving port[1] empty. Note that we 2144 * also have to be careful that we don't queue 2145 * the same context (even though a different 2146 * request) to the second port. 2147 */ 2148 if (ctx_single_port_submission(last->context) || 2149 ctx_single_port_submission(rq->context)) 2150 goto done; 2151 2152 merge = false; 2153 } 2154 2155 if (__i915_request_submit(rq)) { 2156 if (!merge) { 2157 *port = execlists_schedule_in(last, port - execlists->pending); 2158 port++; 2159 last = NULL; 2160 } 2161 2162 GEM_BUG_ON(last && 2163 !can_merge_ctx(last->context, 2164 rq->context)); 2165 GEM_BUG_ON(last && 2166 i915_seqno_passed(last->fence.seqno, 2167 rq->fence.seqno)); 2168 2169 submit = true; 2170 last = rq; 2171 } 2172 } 2173 2174 rb_erase_cached(&p->node, &execlists->queue); 2175 i915_priolist_free(p); 2176 } 2177 2178 done: 2179 /* 2180 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer. 2181 * 2182 * We choose the priority hint such that if we add a request of greater 2183 * priority than this, we kick the submission tasklet to decide on 2184 * the right order of submitting the requests to hardware. We must 2185 * also be prepared to reorder requests as they are in-flight on the 2186 * HW. We derive the priority hint then as the first "hole" in 2187 * the HW submission ports and if there are no available slots, 2188 * the priority of the lowest executing request, i.e. last. 2189 * 2190 * When we do receive a higher priority request ready to run from the 2191 * user, see queue_request(), the priority hint is bumped to that 2192 * request triggering preemption on the next dequeue (or subsequent 2193 * interrupt for secondary ports). 2194 */ 2195 execlists->queue_priority_hint = queue_prio(execlists); 2196 2197 if (submit) { 2198 *port = execlists_schedule_in(last, port - execlists->pending); 2199 execlists->switch_priority_hint = 2200 switch_prio(engine, *execlists->pending); 2201 2202 /* 2203 * Skip if we ended up with exactly the same set of requests, 2204 * e.g. 
trying to timeslice a pair of ordered contexts 2205 */ 2206 if (!memcmp(active, execlists->pending, 2207 (port - execlists->pending + 1) * sizeof(*port))) { 2208 do 2209 execlists_schedule_out(fetch_and_zero(port)); 2210 while (port-- != execlists->pending); 2211 2212 goto skip_submit; 2213 } 2214 clear_ports(port + 1, last_port - port); 2215 2216 execlists_submit_ports(engine); 2217 set_preempt_timeout(engine, *active); 2218 } else { 2219 skip_submit: 2220 ring_set_paused(engine, 0); 2221 } 2222 } 2223 2224 static void 2225 cancel_port_requests(struct intel_engine_execlists * const execlists) 2226 { 2227 struct i915_request * const *port; 2228 2229 for (port = execlists->pending; *port; port++) 2230 execlists_schedule_out(*port); 2231 clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending)); 2232 2233 /* Mark the end of active before we overwrite *active */ 2234 for (port = xchg(&execlists->active, execlists->pending); *port; port++) 2235 execlists_schedule_out(*port); 2236 clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight)); 2237 2238 smp_wmb(); /* complete the seqlock for execlists_active() */ 2239 WRITE_ONCE(execlists->active, execlists->inflight); 2240 } 2241 2242 static inline void 2243 invalidate_csb_entries(const u32 *first, const u32 *last) 2244 { 2245 clflush((void *)first); 2246 clflush((void *)last); 2247 } 2248 2249 /* 2250 * Starting with Gen12, the status has a new format: 2251 * 2252 * bit 0: switched to new queue 2253 * bit 1: reserved 2254 * bit 2: semaphore wait mode (poll or signal), only valid when 2255 * switch detail is set to "wait on semaphore" 2256 * bits 3-5: engine class 2257 * bits 6-11: engine instance 2258 * bits 12-14: reserved 2259 * bits 15-25: sw context id of the lrc the GT switched to 2260 * bits 26-31: sw counter of the lrc the GT switched to 2261 * bits 32-35: context switch detail 2262 * - 0: ctx complete 2263 * - 1: wait on sync flip 2264 * - 2: wait on vblank 2265 * - 3: wait on scanline 2266 * - 4: wait on semaphore 2267 * - 5: context preempted (not on SEMAPHORE_WAIT or 2268 * WAIT_FOR_EVENT) 2269 * bit 36: reserved 2270 * bits 37-43: wait detail (for switch detail 1 to 4) 2271 * bits 44-46: reserved 2272 * bits 47-57: sw context id of the lrc the GT switched away from 2273 * bits 58-63: sw counter of the lrc the GT switched away from 2274 */ 2275 static inline bool 2276 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) 2277 { 2278 u32 lower_dw = csb[0]; 2279 u32 upper_dw = csb[1]; 2280 bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw); 2281 bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw); 2282 bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; 2283 2284 /* 2285 * The context switch detail is not guaranteed to be 5 when a preemption 2286 * occurs, so we can't just check for that. The check below works for 2287 * all the cases we care about, including preemptions of WAIT 2288 * instructions and lite-restore. Preempt-to-idle via the CTRL register 2289 * would require some extra handling, but we don't support that. 2290 */ 2291 if (!ctx_away_valid || new_queue) { 2292 GEM_BUG_ON(!ctx_to_valid); 2293 return true; 2294 } 2295 2296 /* 2297 * switch detail = 5 is covered by the case above and we do not expect a 2298 * context switch on an unsuccessful wait instruction since we always 2299 * use polling mode. 
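 *
 * To summarise gen12_csb_parse() (illustrative):
 *
 *   ctx_away_valid  new_queue   result
 *   false           -           promote: pending[] becomes active[]
 *   true            true        promote: switched to a new queue
 *   true            false       no promotion: port[0] has merely completed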
2300 */
2301 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2302 return false;
2303 }
2304
2305 static inline bool
2306 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2307 {
2308 return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2309 }
2310
2311 static void process_csb(struct intel_engine_cs *engine)
2312 {
2313 struct intel_engine_execlists * const execlists = &engine->execlists;
2314 const u32 * const buf = execlists->csb_status;
2315 const u8 num_entries = execlists->csb_size;
2316 u8 head, tail;
2317
2318 /*
2319 * As we modify our execlists state tracking we require exclusive
2320 * access. Either we are inside the tasklet, or the tasklet is disabled
2321 * and we assume that is only inside the reset paths and so serialised.
2322 */
2323 GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2324 !reset_in_progress(execlists));
2325 GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2326
2327 /*
2328 * Note that csb_write, csb_status may be either in HWSP or mmio.
2329 * When reading from the csb_write mmio register, we have to be
2330 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2331 * the low 4 bits. As it happens we know the next 4 bits are always
2332 * zero and so we can simply use the low u8 of the register
2333 * and treat it identically to reading from the HWSP (without having
2334 * to use explicit shifting and masking, and probably bifurcating
2335 * the code to handle the legacy mmio read).
2336 */
2337 head = execlists->csb_head;
2338 tail = READ_ONCE(*execlists->csb_write);
2339 if (unlikely(head == tail))
2340 return;
2341
2342 /*
2343 * Hopefully paired with a wmb() in HW!
2344 *
2345 * We must complete the read of the write pointer before any reads
2346 * from the CSB, so that we do not see stale values. Without an rmb
2347 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2348 * we perform the READ_ONCE(*csb_write).
2349 */
2350 rmb();
2351
2352 ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2353 do {
2354 bool promote;
2355
2356 if (++head == num_entries)
2357 head = 0;
2358
2359 /*
2360 * We are flying near dragons again.
2361 *
2362 * We hold a reference to the request in execlist_port[]
2363 * but no more than that. We are operating in softirq
2364 * context and so cannot hold any mutex or sleep. That
2365 * means we cannot stop the requests we are processing
2366 * in port[] from being retired concurrently (the
2367 * breadcrumb will be complete before we see the
2368 * context-switch). As we only hold the reference to the
2369 * request, any pointer chasing underneath the request
2370 * is subject to a potential use-after-free. Thus we
2371 * store all of the bookkeeping within port[] as
2372 * required, and avoid using unguarded pointers beneath
2373 * the request itself. The same applies to the atomic
2374 * status notifier.
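 *
 * (For the head/tail arithmetic above: e.g. with num_entries == 12, a
 *  cached head of 10 and a write pointer of 1 mean that entries 11, 0
 *  and 1 are consumed on this pass before head == tail ends the loop.)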
2375 */ 2376 2377 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n", 2378 head, buf[2 * head + 0], buf[2 * head + 1]); 2379 2380 if (INTEL_GEN(engine->i915) >= 12) 2381 promote = gen12_csb_parse(execlists, buf + 2 * head); 2382 else 2383 promote = gen8_csb_parse(execlists, buf + 2 * head); 2384 if (promote) { 2385 struct i915_request * const *old = execlists->active; 2386 2387 GEM_BUG_ON(!assert_pending_valid(execlists, "promote")); 2388 2389 ring_set_paused(engine, 0); 2390 2391 /* Point active to the new ELSP; prevent overwriting */ 2392 WRITE_ONCE(execlists->active, execlists->pending); 2393 smp_wmb(); /* notify execlists_active() */ 2394 2395 /* cancel old inflight, prepare for switch */ 2396 trace_ports(execlists, "preempted", old); 2397 while (*old) 2398 execlists_schedule_out(*old++); 2399 2400 /* switch pending to inflight */ 2401 memcpy(execlists->inflight, 2402 execlists->pending, 2403 execlists_num_ports(execlists) * 2404 sizeof(*execlists->pending)); 2405 smp_wmb(); /* complete the seqlock */ 2406 WRITE_ONCE(execlists->active, execlists->inflight); 2407 2408 WRITE_ONCE(execlists->pending[0], NULL); 2409 } else { 2410 GEM_BUG_ON(!*execlists->active); 2411 2412 /* port0 completed, advanced to port1 */ 2413 trace_ports(execlists, "completed", execlists->active); 2414 2415 /* 2416 * We rely on the hardware being strongly 2417 * ordered, that the breadcrumb write is 2418 * coherent (visible from the CPU) before the 2419 * user interrupt and CSB is processed. 2420 */ 2421 if (GEM_SHOW_DEBUG() && 2422 !i915_request_completed(*execlists->active) && 2423 !reset_in_progress(execlists)) { 2424 struct i915_request *rq __maybe_unused = 2425 *execlists->active; 2426 const u32 *regs __maybe_unused = 2427 rq->context->lrc_reg_state; 2428 2429 ENGINE_TRACE(engine, 2430 "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n", 2431 ENGINE_READ(engine, RING_START), 2432 ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR, 2433 ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR, 2434 ENGINE_READ(engine, RING_CTL), 2435 ENGINE_READ(engine, RING_MI_MODE)); 2436 ENGINE_TRACE(engine, 2437 "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ", 2438 i915_ggtt_offset(rq->ring->vma), 2439 rq->head, rq->tail, 2440 rq->fence.context, 2441 lower_32_bits(rq->fence.seqno), 2442 hwsp_seqno(rq)); 2443 ENGINE_TRACE(engine, 2444 "ctx:{start:%08x, head:%04x, tail:%04x}, ", 2445 regs[CTX_RING_START], 2446 regs[CTX_RING_HEAD], 2447 regs[CTX_RING_TAIL]); 2448 2449 GEM_BUG_ON("context completed before request"); 2450 } 2451 2452 execlists_schedule_out(*execlists->active++); 2453 2454 GEM_BUG_ON(execlists->active - execlists->inflight > 2455 execlists_num_ports(execlists)); 2456 } 2457 } while (head != tail); 2458 2459 execlists->csb_head = head; 2460 set_timeslice(engine); 2461 2462 /* 2463 * Gen11 has proven to fail wrt global observation point between 2464 * entry and tail update, failing on the ordering and thus 2465 * we see an old entry in the context status buffer. 2466 * 2467 * Forcibly evict out entries for the next gpu csb update, 2468 * to increase the odds that we get a fresh entries with non 2469 * working hardware. The cost for doing so comes out mostly with 2470 * the wash as hardware, working or not, will need to do the 2471 * invalidation before. 
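 *
 * (invalidate_csb_entries() below is simply a clflush of the first and
 *  last cachelines of the CSB, forcing the next read to fetch fresh data
 *  from memory instead of reusing a stale cacheline.)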
2472 */ 2473 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]); 2474 } 2475 2476 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine) 2477 { 2478 lockdep_assert_held(&engine->active.lock); 2479 if (!READ_ONCE(engine->execlists.pending[0])) { 2480 rcu_read_lock(); /* protect peeking at execlists->active */ 2481 execlists_dequeue(engine); 2482 rcu_read_unlock(); 2483 } 2484 } 2485 2486 static void __execlists_hold(struct i915_request *rq) 2487 { 2488 LIST_HEAD(list); 2489 2490 do { 2491 struct i915_dependency *p; 2492 2493 if (i915_request_is_active(rq)) 2494 __i915_request_unsubmit(rq); 2495 2496 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2497 list_move_tail(&rq->sched.link, &rq->engine->active.hold); 2498 i915_request_set_hold(rq); 2499 RQ_TRACE(rq, "on hold\n"); 2500 2501 for_each_waiter(p, rq) { 2502 struct i915_request *w = 2503 container_of(p->waiter, typeof(*w), sched); 2504 2505 /* Leave semaphores spinning on the other engines */ 2506 if (w->engine != rq->engine) 2507 continue; 2508 2509 if (!i915_request_is_ready(w)) 2510 continue; 2511 2512 if (i915_request_completed(w)) 2513 continue; 2514 2515 if (i915_request_on_hold(w)) 2516 continue; 2517 2518 list_move_tail(&w->sched.link, &list); 2519 } 2520 2521 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2522 } while (rq); 2523 } 2524 2525 static bool execlists_hold(struct intel_engine_cs *engine, 2526 struct i915_request *rq) 2527 { 2528 spin_lock_irq(&engine->active.lock); 2529 2530 if (i915_request_completed(rq)) { /* too late! */ 2531 rq = NULL; 2532 goto unlock; 2533 } 2534 2535 if (rq->engine != engine) { /* preempted virtual engine */ 2536 struct virtual_engine *ve = to_virtual_engine(rq->engine); 2537 2538 /* 2539 * intel_context_inflight() is only protected by virtue 2540 * of process_csb() being called only by the tasklet (or 2541 * directly from inside reset while the tasklet is suspended). 2542 * Assert that neither of those are allowed to run while we 2543 * poke at the request queues. 2544 */ 2545 GEM_BUG_ON(!reset_in_progress(&engine->execlists)); 2546 2547 /* 2548 * An unsubmitted request along a virtual engine will 2549 * remain on the active (this) engine until we are able 2550 * to process the context switch away (and so mark the 2551 * context as no longer in flight). That cannot have happened 2552 * yet, otherwise we would not be hanging! 2553 */ 2554 spin_lock(&ve->base.active.lock); 2555 GEM_BUG_ON(intel_context_inflight(rq->context) != engine); 2556 GEM_BUG_ON(ve->request != rq); 2557 ve->request = NULL; 2558 spin_unlock(&ve->base.active.lock); 2559 i915_request_put(rq); 2560 2561 rq->engine = engine; 2562 } 2563 2564 /* 2565 * Transfer this request onto the hold queue to prevent it 2566 * being resumbitted to HW (and potentially completed) before we have 2567 * released it. Since we may have already submitted following 2568 * requests, we need to remove those as well. 2569 */ 2570 GEM_BUG_ON(i915_request_on_hold(rq)); 2571 GEM_BUG_ON(rq->engine != engine); 2572 __execlists_hold(rq); 2573 GEM_BUG_ON(list_empty(&engine->active.hold)); 2574 2575 unlock: 2576 spin_unlock_irq(&engine->active.lock); 2577 return rq; 2578 } 2579 2580 static bool hold_request(const struct i915_request *rq) 2581 { 2582 struct i915_dependency *p; 2583 bool result = false; 2584 2585 /* 2586 * If one of our ancestors is on hold, we must also be on hold, 2587 * otherwise we will bypass it and execute before it. 
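 *
 * e.g. if request A is on hold and request B on the same engine lists A
 * among its signalers, then B must stay off the priority queue until A
 * is released by execlists_unhold().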
2588 */ 2589 rcu_read_lock(); 2590 for_each_signaler(p, rq) { 2591 const struct i915_request *s = 2592 container_of(p->signaler, typeof(*s), sched); 2593 2594 if (s->engine != rq->engine) 2595 continue; 2596 2597 result = i915_request_on_hold(s); 2598 if (result) 2599 break; 2600 } 2601 rcu_read_unlock(); 2602 2603 return result; 2604 } 2605 2606 static void __execlists_unhold(struct i915_request *rq) 2607 { 2608 LIST_HEAD(list); 2609 2610 do { 2611 struct i915_dependency *p; 2612 2613 RQ_TRACE(rq, "hold release\n"); 2614 2615 GEM_BUG_ON(!i915_request_on_hold(rq)); 2616 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit)); 2617 2618 i915_request_clear_hold(rq); 2619 list_move_tail(&rq->sched.link, 2620 i915_sched_lookup_priolist(rq->engine, 2621 rq_prio(rq))); 2622 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2623 2624 /* Also release any children on this engine that are ready */ 2625 for_each_waiter(p, rq) { 2626 struct i915_request *w = 2627 container_of(p->waiter, typeof(*w), sched); 2628 2629 /* Propagate any change in error status */ 2630 if (rq->fence.error) 2631 i915_request_set_error_once(w, rq->fence.error); 2632 2633 if (w->engine != rq->engine) 2634 continue; 2635 2636 if (!i915_request_on_hold(w)) 2637 continue; 2638 2639 /* Check that no other parents are also on hold */ 2640 if (hold_request(w)) 2641 continue; 2642 2643 list_move_tail(&w->sched.link, &list); 2644 } 2645 2646 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2647 } while (rq); 2648 } 2649 2650 static void execlists_unhold(struct intel_engine_cs *engine, 2651 struct i915_request *rq) 2652 { 2653 spin_lock_irq(&engine->active.lock); 2654 2655 /* 2656 * Move this request back to the priority queue, and all of its 2657 * children and grandchildren that were suspended along with it. 2658 */ 2659 __execlists_unhold(rq); 2660 2661 if (rq_prio(rq) > engine->execlists.queue_priority_hint) { 2662 engine->execlists.queue_priority_hint = rq_prio(rq); 2663 tasklet_hi_schedule(&engine->execlists.tasklet); 2664 } 2665 2666 spin_unlock_irq(&engine->active.lock); 2667 } 2668 2669 struct execlists_capture { 2670 struct work_struct work; 2671 struct i915_request *rq; 2672 struct i915_gpu_coredump *error; 2673 }; 2674 2675 static void execlists_capture_work(struct work_struct *work) 2676 { 2677 struct execlists_capture *cap = container_of(work, typeof(*cap), work); 2678 const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN; 2679 struct intel_engine_cs *engine = cap->rq->engine; 2680 struct intel_gt_coredump *gt = cap->error->gt; 2681 struct intel_engine_capture_vma *vma; 2682 2683 /* Compress all the objects attached to the request, slow! 
*/ 2684 vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp); 2685 if (vma) { 2686 struct i915_vma_compress *compress = 2687 i915_vma_capture_prepare(gt); 2688 2689 intel_engine_coredump_add_vma(gt->engine, vma, compress); 2690 i915_vma_capture_finish(gt, compress); 2691 } 2692 2693 gt->simulated = gt->engine->simulated; 2694 cap->error->simulated = gt->simulated; 2695 2696 /* Publish the error state, and announce it to the world */ 2697 i915_error_state_store(cap->error); 2698 i915_gpu_coredump_put(cap->error); 2699 2700 /* Return this request and all that depend upon it for signaling */ 2701 execlists_unhold(engine, cap->rq); 2702 i915_request_put(cap->rq); 2703 2704 kfree(cap); 2705 } 2706 2707 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine) 2708 { 2709 const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; 2710 struct execlists_capture *cap; 2711 2712 cap = kmalloc(sizeof(*cap), gfp); 2713 if (!cap) 2714 return NULL; 2715 2716 cap->error = i915_gpu_coredump_alloc(engine->i915, gfp); 2717 if (!cap->error) 2718 goto err_cap; 2719 2720 cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp); 2721 if (!cap->error->gt) 2722 goto err_gpu; 2723 2724 cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp); 2725 if (!cap->error->gt->engine) 2726 goto err_gt; 2727 2728 return cap; 2729 2730 err_gt: 2731 kfree(cap->error->gt); 2732 err_gpu: 2733 kfree(cap->error); 2734 err_cap: 2735 kfree(cap); 2736 return NULL; 2737 } 2738 2739 static bool execlists_capture(struct intel_engine_cs *engine) 2740 { 2741 struct execlists_capture *cap; 2742 2743 if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)) 2744 return true; 2745 2746 /* 2747 * We need to _quickly_ capture the engine state before we reset. 2748 * We are inside an atomic section (softirq) here and we are delaying 2749 * the forced preemption event. 2750 */ 2751 cap = capture_regs(engine); 2752 if (!cap) 2753 return true; 2754 2755 spin_lock_irq(&engine->active.lock); 2756 cap->rq = execlists_active(&engine->execlists); 2757 if (cap->rq) { 2758 cap->rq = active_request(cap->rq->context->timeline, cap->rq); 2759 cap->rq = i915_request_get_rcu(cap->rq); 2760 } 2761 spin_unlock_irq(&engine->active.lock); 2762 if (!cap->rq) 2763 goto err_free; 2764 2765 /* 2766 * Remove the request from the execlists queue, and take ownership 2767 * of the request. We pass it to our worker who will _slowly_ compress 2768 * all the pages the _user_ requested for debugging their batch, after 2769 * which we return it to the queue for signaling. 2770 * 2771 * By removing them from the execlists queue, we also remove the 2772 * requests from being processed by __unwind_incomplete_requests() 2773 * during the intel_engine_reset(), and so they will *not* be replayed 2774 * afterwards. 2775 * 2776 * Note that because we have not yet reset the engine at this point, 2777 * it is possible for the request that we have identified as being 2778 * guilty, did in fact complete and we will then hit an arbitration 2779 * point allowing the outstanding preemption to succeed. The likelihood 2780 * of that is very low (as capturing of the engine registers should be 2781 * fast enough to run inside an irq-off atomic section!), so we will 2782 * simply hold that request accountable for being non-preemptible 2783 * long enough to force the reset. 
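 *
 * (In the meantime the request is parked on engine->active.hold by the
 *  execlists_hold() call below, and is only returned to the priority
 *  queue by execlists_unhold() once execlists_capture_work() has
 *  finished with it.)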
2784 */ 2785 if (!execlists_hold(engine, cap->rq)) 2786 goto err_rq; 2787 2788 INIT_WORK(&cap->work, execlists_capture_work); 2789 schedule_work(&cap->work); 2790 return true; 2791 2792 err_rq: 2793 i915_request_put(cap->rq); 2794 err_free: 2795 i915_gpu_coredump_put(cap->error); 2796 kfree(cap); 2797 return false; 2798 } 2799 2800 static void execlists_reset(struct intel_engine_cs *engine, const char *msg) 2801 { 2802 const unsigned int bit = I915_RESET_ENGINE + engine->id; 2803 unsigned long *lock = &engine->gt->reset.flags; 2804 2805 if (!intel_has_reset_engine(engine->gt)) 2806 return; 2807 2808 if (test_and_set_bit(bit, lock)) 2809 return; 2810 2811 ENGINE_TRACE(engine, "reset for %s\n", msg); 2812 2813 /* Mark this tasklet as disabled to avoid waiting for it to complete */ 2814 tasklet_disable_nosync(&engine->execlists.tasklet); 2815 2816 ring_set_paused(engine, 1); /* Freeze the current request in place */ 2817 if (execlists_capture(engine)) 2818 intel_engine_reset(engine, msg); 2819 else 2820 ring_set_paused(engine, 0); 2821 2822 tasklet_enable(&engine->execlists.tasklet); 2823 clear_and_wake_up_bit(bit, lock); 2824 } 2825 2826 static bool preempt_timeout(const struct intel_engine_cs *const engine) 2827 { 2828 const struct timer_list *t = &engine->execlists.preempt; 2829 2830 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT) 2831 return false; 2832 2833 if (!timer_expired(t)) 2834 return false; 2835 2836 return READ_ONCE(engine->execlists.pending[0]); 2837 } 2838 2839 /* 2840 * Check the unread Context Status Buffers and manage the submission of new 2841 * contexts to the ELSP accordingly. 2842 */ 2843 static void execlists_submission_tasklet(unsigned long data) 2844 { 2845 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 2846 bool timeout = preempt_timeout(engine); 2847 2848 process_csb(engine); 2849 2850 if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) { 2851 engine->execlists.error_interrupt = 0; 2852 if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */ 2853 execlists_reset(engine, "CS error"); 2854 } 2855 2856 if (!READ_ONCE(engine->execlists.pending[0]) || timeout) { 2857 unsigned long flags; 2858 2859 spin_lock_irqsave(&engine->active.lock, flags); 2860 __execlists_submission_tasklet(engine); 2861 spin_unlock_irqrestore(&engine->active.lock, flags); 2862 2863 /* Recheck after serialising with direct-submission */ 2864 if (unlikely(timeout && preempt_timeout(engine))) 2865 execlists_reset(engine, "preemption time out"); 2866 } 2867 } 2868 2869 static void __execlists_kick(struct intel_engine_execlists *execlists) 2870 { 2871 /* Kick the tasklet for some interrupt coalescing and reset handling */ 2872 tasklet_hi_schedule(&execlists->tasklet); 2873 } 2874 2875 #define execlists_kick(t, member) \ 2876 __execlists_kick(container_of(t, struct intel_engine_execlists, member)) 2877 2878 static void execlists_timeslice(struct timer_list *timer) 2879 { 2880 execlists_kick(timer, timer); 2881 } 2882 2883 static void execlists_preempt(struct timer_list *timer) 2884 { 2885 execlists_kick(timer, preempt); 2886 } 2887 2888 static void queue_request(struct intel_engine_cs *engine, 2889 struct i915_request *rq) 2890 { 2891 GEM_BUG_ON(!list_empty(&rq->sched.link)); 2892 list_add_tail(&rq->sched.link, 2893 i915_sched_lookup_priolist(engine, rq_prio(rq))); 2894 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2895 } 2896 2897 static void __submit_queue_imm(struct intel_engine_cs *engine) 2898 { 2899 struct intel_engine_execlists * const execlists = 
&engine->execlists; 2900 2901 if (reset_in_progress(execlists)) 2902 return; /* defer until we restart the engine following reset */ 2903 2904 if (execlists->tasklet.func == execlists_submission_tasklet) 2905 __execlists_submission_tasklet(engine); 2906 else 2907 tasklet_hi_schedule(&execlists->tasklet); 2908 } 2909 2910 static void submit_queue(struct intel_engine_cs *engine, 2911 const struct i915_request *rq) 2912 { 2913 struct intel_engine_execlists *execlists = &engine->execlists; 2914 2915 if (rq_prio(rq) <= execlists->queue_priority_hint) 2916 return; 2917 2918 execlists->queue_priority_hint = rq_prio(rq); 2919 __submit_queue_imm(engine); 2920 } 2921 2922 static bool ancestor_on_hold(const struct intel_engine_cs *engine, 2923 const struct i915_request *rq) 2924 { 2925 GEM_BUG_ON(i915_request_on_hold(rq)); 2926 return !list_empty(&engine->active.hold) && hold_request(rq); 2927 } 2928 2929 static void execlists_submit_request(struct i915_request *request) 2930 { 2931 struct intel_engine_cs *engine = request->engine; 2932 unsigned long flags; 2933 2934 /* Will be called from irq-context when using foreign fences. */ 2935 spin_lock_irqsave(&engine->active.lock, flags); 2936 2937 if (unlikely(ancestor_on_hold(engine, request))) { 2938 RQ_TRACE(request, "ancestor on hold\n"); 2939 list_add_tail(&request->sched.link, &engine->active.hold); 2940 i915_request_set_hold(request); 2941 } else { 2942 queue_request(engine, request); 2943 2944 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 2945 GEM_BUG_ON(list_empty(&request->sched.link)); 2946 2947 submit_queue(engine, request); 2948 } 2949 2950 spin_unlock_irqrestore(&engine->active.lock, flags); 2951 } 2952 2953 static void __execlists_context_fini(struct intel_context *ce) 2954 { 2955 intel_ring_put(ce->ring); 2956 i915_vma_put(ce->state); 2957 } 2958 2959 static void execlists_context_destroy(struct kref *kref) 2960 { 2961 struct intel_context *ce = container_of(kref, typeof(*ce), ref); 2962 2963 GEM_BUG_ON(!i915_active_is_idle(&ce->active)); 2964 GEM_BUG_ON(intel_context_is_pinned(ce)); 2965 2966 if (ce->state) 2967 __execlists_context_fini(ce); 2968 2969 intel_context_fini(ce); 2970 intel_context_free(ce); 2971 } 2972 2973 static void 2974 set_redzone(void *vaddr, const struct intel_engine_cs *engine) 2975 { 2976 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 2977 return; 2978 2979 vaddr += engine->context_size; 2980 2981 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE); 2982 } 2983 2984 static void 2985 check_redzone(const void *vaddr, const struct intel_engine_cs *engine) 2986 { 2987 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 2988 return; 2989 2990 vaddr += engine->context_size; 2991 2992 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE)) 2993 dev_err_once(engine->i915->drm.dev, 2994 "%s context redzone overwritten!\n", 2995 engine->name); 2996 } 2997 2998 static void execlists_context_unpin(struct intel_context *ce) 2999 { 3000 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE, 3001 ce->engine); 3002 3003 i915_gem_object_unpin_map(ce->state->obj); 3004 } 3005 3006 static void 3007 __execlists_update_reg_state(const struct intel_context *ce, 3008 const struct intel_engine_cs *engine, 3009 u32 head) 3010 { 3011 struct intel_ring *ring = ce->ring; 3012 u32 *regs = ce->lrc_reg_state; 3013 3014 GEM_BUG_ON(!intel_ring_offset_valid(ring, head)); 3015 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); 3016 3017 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 3018 regs[CTX_RING_HEAD] = head; 3019 
regs[CTX_RING_TAIL] = ring->tail; 3020 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 3021 3022 /* RPCS */ 3023 if (engine->class == RENDER_CLASS) { 3024 regs[CTX_R_PWR_CLK_STATE] = 3025 intel_sseu_make_rpcs(engine->i915, &ce->sseu); 3026 3027 i915_oa_init_reg_state(ce, engine); 3028 } 3029 } 3030 3031 static int 3032 __execlists_context_pin(struct intel_context *ce, 3033 struct intel_engine_cs *engine) 3034 { 3035 void *vaddr; 3036 3037 GEM_BUG_ON(!ce->state); 3038 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 3039 3040 vaddr = i915_gem_object_pin_map(ce->state->obj, 3041 i915_coherent_map_type(engine->i915) | 3042 I915_MAP_OVERRIDE); 3043 if (IS_ERR(vaddr)) 3044 return PTR_ERR(vaddr); 3045 3046 ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE; 3047 ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE; 3048 __execlists_update_reg_state(ce, engine, ce->ring->tail); 3049 3050 return 0; 3051 } 3052 3053 static int execlists_context_pin(struct intel_context *ce) 3054 { 3055 return __execlists_context_pin(ce, ce->engine); 3056 } 3057 3058 static int execlists_context_alloc(struct intel_context *ce) 3059 { 3060 return __execlists_context_alloc(ce, ce->engine); 3061 } 3062 3063 static void execlists_context_reset(struct intel_context *ce) 3064 { 3065 CE_TRACE(ce, "reset\n"); 3066 GEM_BUG_ON(!intel_context_is_pinned(ce)); 3067 3068 intel_ring_reset(ce->ring, ce->ring->emit); 3069 3070 /* Scrub away the garbage */ 3071 execlists_init_reg_state(ce->lrc_reg_state, 3072 ce, ce->engine, ce->ring, true); 3073 __execlists_update_reg_state(ce, ce->engine, ce->ring->tail); 3074 3075 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; 3076 } 3077 3078 static const struct intel_context_ops execlists_context_ops = { 3079 .alloc = execlists_context_alloc, 3080 3081 .pin = execlists_context_pin, 3082 .unpin = execlists_context_unpin, 3083 3084 .enter = intel_context_enter_engine, 3085 .exit = intel_context_exit_engine, 3086 3087 .reset = execlists_context_reset, 3088 .destroy = execlists_context_destroy, 3089 }; 3090 3091 static int gen8_emit_init_breadcrumb(struct i915_request *rq) 3092 { 3093 u32 *cs; 3094 3095 if (!i915_request_timeline(rq)->has_initial_breadcrumb) 3096 return 0; 3097 3098 cs = intel_ring_begin(rq, 6); 3099 if (IS_ERR(cs)) 3100 return PTR_ERR(cs); 3101 3102 /* 3103 * Check if we have been preempted before we even get started. 3104 * 3105 * After this point i915_request_started() reports true, even if 3106 * we get preempted and so are no longer running. 3107 */ 3108 *cs++ = MI_ARB_CHECK; 3109 *cs++ = MI_NOOP; 3110 3111 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 3112 *cs++ = i915_request_timeline(rq)->hwsp_offset; 3113 *cs++ = 0; 3114 *cs++ = rq->fence.seqno - 1; 3115 3116 intel_ring_advance(rq, cs); 3117 3118 /* Record the updated position of the request's payload */ 3119 rq->infix = intel_ring_offset(rq, cs); 3120 3121 return 0; 3122 } 3123 3124 static int execlists_request_alloc(struct i915_request *request) 3125 { 3126 int ret; 3127 3128 GEM_BUG_ON(!intel_context_is_pinned(request->context)); 3129 3130 /* 3131 * Flush enough space to reduce the likelihood of waiting after 3132 * we start building the request - in which case we will just 3133 * have to repeat work. 3134 */ 3135 request->reserved_space += EXECLISTS_REQUEST_SIZE; 3136 3137 /* 3138 * Note that after this point, we have committed to using 3139 * this request as it is being used to both track the 3140 * state of engine initialisation and liveness of the 3141 * golden renderstate above. 
Think twice before you try
3142 * to cancel/unwind this request now.
3143 */
3144
3145 /* Unconditionally invalidate GPU caches and TLBs. */
3146 ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3147 if (ret)
3148 return ret;
3149
3150 request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3151 return 0;
3152 }
3153
3154 /*
3155 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
3156 * PIPE_CONTROL instruction. This is required for the flush to happen correctly,
3157 * but there is a slight complication as this is applied in the WA batch where the
3158 * values are only initialized once, so we cannot take the register value at the
3159 * beginning and reuse it further; hence we save its value to memory, upload a
3160 * constant value with bit21 set and then restore it with the saved value.
3161 * To simplify the WA, a constant value is formed by using the default value
3162 * of this register. This shouldn't be a problem because we are only modifying
3163 * it for a short period and this batch is non-preemptible. We can of course
3164 * use additional instructions that read the actual value of the register
3165 * at that time and set our bit of interest, but that makes the WA more complicated.
3166 *
3167 * This WA is also required for Gen9, so extracting it as a function avoids
3168 * code duplication.
3169 */
3170 static u32 *
3171 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3172 {
3173 /* NB no one else is allowed to scribble over scratch + 256! */
3174 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3175 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3176 *batch++ = intel_gt_scratch_offset(engine->gt,
3177 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3178 *batch++ = 0;
3179
3180 *batch++ = MI_LOAD_REGISTER_IMM(1);
3181 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3182 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3183
3184 batch = gen8_emit_pipe_control(batch,
3185 PIPE_CONTROL_CS_STALL |
3186 PIPE_CONTROL_DC_FLUSH_ENABLE,
3187 0);
3188
3189 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3190 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3191 *batch++ = intel_gt_scratch_offset(engine->gt,
3192 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3193 *batch++ = 0;
3194
3195 return batch;
3196 }
3197
3198 /*
3199 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3200 * initialized at the beginning and shared across all contexts, but this field
3201 * helps us to have multiple batches at different offsets and select them based
3202 * on a criterion. At the moment this batch always starts at the beginning of the page
3203 * and at this point we don't have multiple wa_ctx batch buffers.
3204 *
3205 * The number of WAs applied is not known at the beginning; we use this field
3206 * to return the number of DWORDs written.
3207 *
3208 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END,
3209 * so it adds NOOPs as padding to make it cacheline aligned.
3210 * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them together
3211 * make a complete batch buffer.
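 *
 * Rough layout of the single wa_ctx page (see intel_init_workaround_bb()):
 * the indirect_ctx batch first, padded with NOOPs to a cacheline boundary,
 * followed by the per_ctx batch; wa_bb[i]->offset and wa_bb[i]->size record
 * where each batch starts and how many bytes it emitted.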
3212 */ 3213 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3214 { 3215 /* WaDisableCtxRestoreArbitration:bdw,chv */ 3216 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3217 3218 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ 3219 if (IS_BROADWELL(engine->i915)) 3220 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3221 3222 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ 3223 /* Actual scratch location is at 128 bytes offset */ 3224 batch = gen8_emit_pipe_control(batch, 3225 PIPE_CONTROL_FLUSH_L3 | 3226 PIPE_CONTROL_STORE_DATA_INDEX | 3227 PIPE_CONTROL_CS_STALL | 3228 PIPE_CONTROL_QW_WRITE, 3229 LRC_PPHWSP_SCRATCH_ADDR); 3230 3231 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3232 3233 /* Pad to end of cacheline */ 3234 while ((unsigned long)batch % CACHELINE_BYTES) 3235 *batch++ = MI_NOOP; 3236 3237 /* 3238 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because 3239 * execution depends on the length specified in terms of cache lines 3240 * in the register CTX_RCS_INDIRECT_CTX 3241 */ 3242 3243 return batch; 3244 } 3245 3246 struct lri { 3247 i915_reg_t reg; 3248 u32 value; 3249 }; 3250 3251 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count) 3252 { 3253 GEM_BUG_ON(!count || count > 63); 3254 3255 *batch++ = MI_LOAD_REGISTER_IMM(count); 3256 do { 3257 *batch++ = i915_mmio_reg_offset(lri->reg); 3258 *batch++ = lri->value; 3259 } while (lri++, --count); 3260 *batch++ = MI_NOOP; 3261 3262 return batch; 3263 } 3264 3265 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3266 { 3267 static const struct lri lri[] = { 3268 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */ 3269 { 3270 COMMON_SLICE_CHICKEN2, 3271 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE, 3272 0), 3273 }, 3274 3275 /* BSpec: 11391 */ 3276 { 3277 FF_SLICE_CHICKEN, 3278 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, 3279 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), 3280 }, 3281 3282 /* BSpec: 11299 */ 3283 { 3284 _3D_CHICKEN3, 3285 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, 3286 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), 3287 } 3288 }; 3289 3290 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3291 3292 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ 3293 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3294 3295 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */ 3296 batch = gen8_emit_pipe_control(batch, 3297 PIPE_CONTROL_FLUSH_L3 | 3298 PIPE_CONTROL_STORE_DATA_INDEX | 3299 PIPE_CONTROL_CS_STALL | 3300 PIPE_CONTROL_QW_WRITE, 3301 LRC_PPHWSP_SCRATCH_ADDR); 3302 3303 batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); 3304 3305 /* WaMediaPoolStateCmdInWABB:bxt,glk */ 3306 if (HAS_POOLED_EU(engine->i915)) { 3307 /* 3308 * EU pool configuration is setup along with golden context 3309 * during context initialization. This value depends on 3310 * device type (2x6 or 3x6) and needs to be updated based 3311 * on which subslice is disabled especially for 2x6 3312 * devices, however it is safe to load default 3313 * configuration of 3x6 device instead of masking off 3314 * corresponding bits because HW ignores bits of a disabled 3315 * subslice and drops down to appropriate config. Please 3316 * see render_state_setup() in i915_gem_render_state.c for 3317 * possible configurations, to avoid duplication they are 3318 * not shown here again. 
3319 */ 3320 *batch++ = GEN9_MEDIA_POOL_STATE; 3321 *batch++ = GEN9_MEDIA_POOL_ENABLE; 3322 *batch++ = 0x00777000; 3323 *batch++ = 0; 3324 *batch++ = 0; 3325 *batch++ = 0; 3326 } 3327 3328 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3329 3330 /* Pad to end of cacheline */ 3331 while ((unsigned long)batch % CACHELINE_BYTES) 3332 *batch++ = MI_NOOP; 3333 3334 return batch; 3335 } 3336 3337 static u32 * 3338 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3339 { 3340 int i; 3341 3342 /* 3343 * WaPipeControlBefore3DStateSamplePattern: cnl 3344 * 3345 * Ensure the engine is idle prior to programming a 3346 * 3DSTATE_SAMPLE_PATTERN during a context restore. 3347 */ 3348 batch = gen8_emit_pipe_control(batch, 3349 PIPE_CONTROL_CS_STALL, 3350 0); 3351 /* 3352 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for 3353 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in 3354 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is 3355 * confusing. Since gen8_emit_pipe_control() already advances the 3356 * batch by 6 dwords, we advance the other 10 here, completing a 3357 * cacheline. It's not clear if the workaround requires this padding 3358 * before other commands, or if it's just the regular padding we would 3359 * already have for the workaround bb, so leave it here for now. 3360 */ 3361 for (i = 0; i < 10; i++) 3362 *batch++ = MI_NOOP; 3363 3364 /* Pad to end of cacheline */ 3365 while ((unsigned long)batch % CACHELINE_BYTES) 3366 *batch++ = MI_NOOP; 3367 3368 return batch; 3369 } 3370 3371 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE) 3372 3373 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) 3374 { 3375 struct drm_i915_gem_object *obj; 3376 struct i915_vma *vma; 3377 int err; 3378 3379 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE); 3380 if (IS_ERR(obj)) 3381 return PTR_ERR(obj); 3382 3383 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 3384 if (IS_ERR(vma)) { 3385 err = PTR_ERR(vma); 3386 goto err; 3387 } 3388 3389 err = i915_ggtt_pin(vma, 0, PIN_HIGH); 3390 if (err) 3391 goto err; 3392 3393 engine->wa_ctx.vma = vma; 3394 return 0; 3395 3396 err: 3397 i915_gem_object_put(obj); 3398 return err; 3399 } 3400 3401 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine) 3402 { 3403 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 3404 } 3405 3406 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); 3407 3408 static int intel_init_workaround_bb(struct intel_engine_cs *engine) 3409 { 3410 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 3411 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx, 3412 &wa_ctx->per_ctx }; 3413 wa_bb_func_t wa_bb_fn[2]; 3414 struct page *page; 3415 void *batch, *batch_ptr; 3416 unsigned int i; 3417 int ret; 3418 3419 if (engine->class != RENDER_CLASS) 3420 return 0; 3421 3422 switch (INTEL_GEN(engine->i915)) { 3423 case 12: 3424 case 11: 3425 return 0; 3426 case 10: 3427 wa_bb_fn[0] = gen10_init_indirectctx_bb; 3428 wa_bb_fn[1] = NULL; 3429 break; 3430 case 9: 3431 wa_bb_fn[0] = gen9_init_indirectctx_bb; 3432 wa_bb_fn[1] = NULL; 3433 break; 3434 case 8: 3435 wa_bb_fn[0] = gen8_init_indirectctx_bb; 3436 wa_bb_fn[1] = NULL; 3437 break; 3438 default: 3439 MISSING_CASE(INTEL_GEN(engine->i915)); 3440 return 0; 3441 } 3442 3443 ret = lrc_setup_wa_ctx(engine); 3444 if (ret) { 3445 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret); 3446 return ret; 3447 } 3448 3449 page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0); 3450 
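/*
 * Both workaround batches are emitted into this single page: 'batch' marks
 * the start of the mapping and 'batch_ptr' advances as each wa_bb_fn[]
 * writes its commands.
 */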
batch = batch_ptr = kmap_atomic(page); 3451 3452 /* 3453 * Emit the two workaround batch buffers, recording the offset from the 3454 * start of the workaround batch buffer object for each and their 3455 * respective sizes. 3456 */ 3457 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { 3458 wa_bb[i]->offset = batch_ptr - batch; 3459 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, 3460 CACHELINE_BYTES))) { 3461 ret = -EINVAL; 3462 break; 3463 } 3464 if (wa_bb_fn[i]) 3465 batch_ptr = wa_bb_fn[i](engine, batch_ptr); 3466 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); 3467 } 3468 3469 BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE); 3470 3471 kunmap_atomic(batch); 3472 if (ret) 3473 lrc_destroy_wa_ctx(engine); 3474 3475 return ret; 3476 } 3477 3478 static void enable_error_interrupt(struct intel_engine_cs *engine) 3479 { 3480 u32 status; 3481 3482 engine->execlists.error_interrupt = 0; 3483 ENGINE_WRITE(engine, RING_EMR, ~0u); 3484 ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */ 3485 3486 status = ENGINE_READ(engine, RING_ESR); 3487 if (unlikely(status)) { 3488 dev_err(engine->i915->drm.dev, 3489 "engine '%s' resumed still in error: %08x\n", 3490 engine->name, status); 3491 __intel_gt_reset(engine->gt, engine->mask); 3492 } 3493 3494 /* 3495 * On current gen8+, we have 2 signals to play with 3496 * 3497 * - I915_ERROR_INSTUCTION (bit 0) 3498 * 3499 * Generate an error if the command parser encounters an invalid 3500 * instruction 3501 * 3502 * This is a fatal error. 3503 * 3504 * - CP_PRIV (bit 2) 3505 * 3506 * Generate an error on privilege violation (where the CP replaces 3507 * the instruction with a no-op). This also fires for writes into 3508 * read-only scratch pages. 3509 * 3510 * This is a non-fatal error, parsing continues. 3511 * 3512 * * there are a few others defined for odd HW that we do not use 3513 * 3514 * Since CP_PRIV fires for cases where we have chosen to ignore the 3515 * error (as the HW is validating and suppressing the mistakes), we 3516 * only unmask the instruction error bit. 
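 *
 * (Hence the EMR write below of ~I915_ERROR_INSTRUCTION: everything stays
 *  masked except the instruction-error bit.)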
3517 */ 3518 ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION); 3519 } 3520 3521 static void enable_execlists(struct intel_engine_cs *engine) 3522 { 3523 u32 mode; 3524 3525 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL); 3526 3527 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */ 3528 3529 if (INTEL_GEN(engine->i915) >= 11) 3530 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE); 3531 else 3532 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE); 3533 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode); 3534 3535 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); 3536 3537 ENGINE_WRITE_FW(engine, 3538 RING_HWS_PGA, 3539 i915_ggtt_offset(engine->status_page.vma)); 3540 ENGINE_POSTING_READ(engine, RING_HWS_PGA); 3541 3542 enable_error_interrupt(engine); 3543 3544 engine->context_tag = 0; 3545 } 3546 3547 static bool unexpected_starting_state(struct intel_engine_cs *engine) 3548 { 3549 bool unexpected = false; 3550 3551 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) { 3552 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n"); 3553 unexpected = true; 3554 } 3555 3556 return unexpected; 3557 } 3558 3559 static int execlists_resume(struct intel_engine_cs *engine) 3560 { 3561 intel_mocs_init_engine(engine); 3562 3563 intel_engine_reset_breadcrumbs(engine); 3564 3565 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) { 3566 struct drm_printer p = drm_debug_printer(__func__); 3567 3568 intel_engine_dump(engine, &p, NULL); 3569 } 3570 3571 enable_execlists(engine); 3572 3573 return 0; 3574 } 3575 3576 static void execlists_reset_prepare(struct intel_engine_cs *engine) 3577 { 3578 struct intel_engine_execlists * const execlists = &engine->execlists; 3579 unsigned long flags; 3580 3581 ENGINE_TRACE(engine, "depth<-%d\n", 3582 atomic_read(&execlists->tasklet.count)); 3583 3584 /* 3585 * Prevent request submission to the hardware until we have 3586 * completed the reset in i915_gem_reset_finish(). If a request 3587 * is completed by one engine, it may then queue a request 3588 * to a second via its execlists->tasklet *just* as we are 3589 * calling engine->resume() and also writing the ELSP. 3590 * Turning off the execlists->tasklet until the reset is over 3591 * prevents the race. 3592 */ 3593 __tasklet_disable_sync_once(&execlists->tasklet); 3594 GEM_BUG_ON(!reset_in_progress(execlists)); 3595 3596 /* And flush any current direct submission. */ 3597 spin_lock_irqsave(&engine->active.lock, flags); 3598 spin_unlock_irqrestore(&engine->active.lock, flags); 3599 3600 /* 3601 * We stop engines, otherwise we might get failed reset and a 3602 * dead gpu (on elk). Also as modern gpu as kbl can suffer 3603 * from system hang if batchbuffer is progressing when 3604 * the reset is issued, regardless of READY_TO_RESET ack. 3605 * Thus assume it is best to stop engines on all gens 3606 * where we have a gpu reset. 3607 * 3608 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) 3609 * 3610 * FIXME: Wa for more modern gens needs to be validated 3611 */ 3612 intel_engine_stop_cs(engine); 3613 } 3614 3615 static void reset_csb_pointers(struct intel_engine_cs *engine) 3616 { 3617 struct intel_engine_execlists * const execlists = &engine->execlists; 3618 const unsigned int reset_value = execlists->csb_size - 1; 3619 3620 ring_set_paused(engine, 0); 3621 3622 /* 3623 * After a reset, the HW starts writing into CSB entry [0]. We 3624 * therefore have to set our HEAD pointer back one entry so that 3625 * the *first* entry we check is entry 0. 
To complicate this further, 3626 * as we don't wait for the first interrupt after reset, we have to 3627 * fake the HW write to point back to the last entry so that our 3628 * inline comparison of our cached head position against the last HW 3629 * write works even before the first interrupt. 3630 */ 3631 execlists->csb_head = reset_value; 3632 WRITE_ONCE(*execlists->csb_write, reset_value); 3633 wmb(); /* Make sure this is visible to HW (paranoia?) */ 3634 3635 /* 3636 * Sometimes Icelake forgets to reset its pointers on a GPU reset. 3637 * Bludgeon them with a mmio update to be sure. 3638 */ 3639 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, 3640 reset_value << 8 | reset_value); 3641 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 3642 3643 invalidate_csb_entries(&execlists->csb_status[0], 3644 &execlists->csb_status[reset_value]); 3645 } 3646 3647 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine) 3648 { 3649 int x; 3650 3651 x = lrc_ring_mi_mode(engine); 3652 if (x != -1) { 3653 regs[x + 1] &= ~STOP_RING; 3654 regs[x + 1] |= STOP_RING << 16; 3655 } 3656 } 3657 3658 static void __execlists_reset_reg_state(const struct intel_context *ce, 3659 const struct intel_engine_cs *engine) 3660 { 3661 u32 *regs = ce->lrc_reg_state; 3662 3663 __reset_stop_ring(regs, engine); 3664 } 3665 3666 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) 3667 { 3668 struct intel_engine_execlists * const execlists = &engine->execlists; 3669 struct intel_context *ce; 3670 struct i915_request *rq; 3671 u32 head; 3672 3673 mb(); /* paranoia: read the CSB pointers from after the reset */ 3674 clflush(execlists->csb_write); 3675 mb(); 3676 3677 process_csb(engine); /* drain preemption events */ 3678 3679 /* Following the reset, we need to reload the CSB read/write pointers */ 3680 reset_csb_pointers(engine); 3681 3682 /* 3683 * Save the currently executing context, even if we completed 3684 * its request, it was still running at the time of the 3685 * reset and will have been clobbered. 3686 */ 3687 rq = execlists_active(execlists); 3688 if (!rq) 3689 goto unwind; 3690 3691 ce = rq->context; 3692 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 3693 3694 if (i915_request_completed(rq)) { 3695 /* Idle context; tidy up the ring so we can restart afresh */ 3696 head = intel_ring_wrap(ce->ring, rq->tail); 3697 goto out_replay; 3698 } 3699 3700 /* We still have requests in-flight; the engine should be active */ 3701 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 3702 3703 /* Context has requests still in-flight; it should not be idle! */ 3704 GEM_BUG_ON(i915_active_is_idle(&ce->active)); 3705 3706 rq = active_request(ce->timeline, rq); 3707 head = intel_ring_wrap(ce->ring, rq->head); 3708 GEM_BUG_ON(head == ce->ring->tail); 3709 3710 /* 3711 * If this request hasn't started yet, e.g. it is waiting on a 3712 * semaphore, we need to avoid skipping the request or else we 3713 * break the signaling chain. However, if the context is corrupt 3714 * the request will not restart and we will be stuck with a wedged 3715 * device. It is quite often the case that if we issue a reset 3716 * while the GPU is loading the context image, that the context 3717 * image becomes corrupt. 3718 * 3719 * Otherwise, if we have not started yet, the request should replay 3720 * perfectly and we do not need to flag the result as being erroneous. 
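 *
 * (Replaying an unstarted request simply leaves the ring head at the
 *  start of that request, so the context re-executes it from the
 *  beginning once the engine is restarted.)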
3721 */ 3722 if (!i915_request_started(rq)) 3723 goto out_replay; 3724 3725 /* 3726 * If the request was innocent, we leave the request in the ELSP 3727 * and will try to replay it on restarting. The context image may 3728 * have been corrupted by the reset, in which case we may have 3729 * to service a new GPU hang, but more likely we can continue on 3730 * without impact. 3731 * 3732 * If the request was guilty, we presume the context is corrupt 3733 * and have to at least restore the RING register in the context 3734 * image back to the expected values to skip over the guilty request. 3735 */ 3736 __i915_request_reset(rq, stalled); 3737 if (!stalled) 3738 goto out_replay; 3739 3740 /* 3741 * We want a simple context + ring to execute the breadcrumb update. 3742 * We cannot rely on the context being intact across the GPU hang, 3743 * so clear it and rebuild just what we need for the breadcrumb. 3744 * All pending requests for this context will be zapped, and any 3745 * future request will be after userspace has had the opportunity 3746 * to recreate its own state. 3747 */ 3748 GEM_BUG_ON(!intel_context_is_pinned(ce)); 3749 restore_default_state(ce, engine); 3750 3751 out_replay: 3752 ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n", 3753 head, ce->ring->tail); 3754 __execlists_reset_reg_state(ce, engine); 3755 __execlists_update_reg_state(ce, engine, head); 3756 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */ 3757 3758 unwind: 3759 /* Push back any incomplete requests for replay after the reset. */ 3760 cancel_port_requests(execlists); 3761 __unwind_incomplete_requests(engine); 3762 } 3763 3764 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled) 3765 { 3766 unsigned long flags; 3767 3768 ENGINE_TRACE(engine, "\n"); 3769 3770 spin_lock_irqsave(&engine->active.lock, flags); 3771 3772 __execlists_reset(engine, stalled); 3773 3774 spin_unlock_irqrestore(&engine->active.lock, flags); 3775 } 3776 3777 static void nop_submission_tasklet(unsigned long data) 3778 { 3779 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 3780 3781 /* The driver is wedged; don't process any more events. */ 3782 WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN); 3783 } 3784 3785 static void execlists_reset_cancel(struct intel_engine_cs *engine) 3786 { 3787 struct intel_engine_execlists * const execlists = &engine->execlists; 3788 struct i915_request *rq, *rn; 3789 struct rb_node *rb; 3790 unsigned long flags; 3791 3792 ENGINE_TRACE(engine, "\n"); 3793 3794 /* 3795 * Before we call engine->cancel_requests(), we should have exclusive 3796 * access to the submission state. This is arranged for us by the 3797 * caller disabling the interrupt generation, the tasklet and other 3798 * threads that may then access the same state, giving us a free hand 3799 * to reset state. However, we still need to let lockdep be aware that 3800 * we know this state may be accessed in hardirq context, so we 3801 * disable the irq around this manipulation and we want to keep 3802 * the spinlock focused on its duties and not accidentally conflate 3803 * coverage to the submission's irq state. (Similarly, although we 3804 * shouldn't need to disable irq around the manipulation of the 3805 * submission's irq state, we also wish to remind ourselves that 3806 * it is irq state.) 3807 */ 3808 spin_lock_irqsave(&engine->active.lock, flags); 3809 3810 __execlists_reset(engine, true); 3811 3812 /* Mark all executing requests as skipped. 
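 * (mark_eio() flags each such request with -EIO and completes it, so waiters see a fatal error rather than a request that will never run.)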
*/ 3813 list_for_each_entry(rq, &engine->active.requests, sched.link) 3814 mark_eio(rq); 3815 3816 /* Flush the queued requests to the timeline list (for retiring). */ 3817 while ((rb = rb_first_cached(&execlists->queue))) { 3818 struct i915_priolist *p = to_priolist(rb); 3819 int i; 3820 3821 priolist_for_each_request_consume(rq, rn, p, i) { 3822 mark_eio(rq); 3823 __i915_request_submit(rq); 3824 } 3825 3826 rb_erase_cached(&p->node, &execlists->queue); 3827 i915_priolist_free(p); 3828 } 3829 3830 /* On-hold requests will be flushed to timeline upon their release */ 3831 list_for_each_entry(rq, &engine->active.hold, sched.link) 3832 mark_eio(rq); 3833 3834 /* Cancel all attached virtual engines */ 3835 while ((rb = rb_first_cached(&execlists->virtual))) { 3836 struct virtual_engine *ve = 3837 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 3838 3839 rb_erase_cached(rb, &execlists->virtual); 3840 RB_CLEAR_NODE(rb); 3841 3842 spin_lock(&ve->base.active.lock); 3843 rq = fetch_and_zero(&ve->request); 3844 if (rq) { 3845 mark_eio(rq); 3846 3847 rq->engine = engine; 3848 __i915_request_submit(rq); 3849 i915_request_put(rq); 3850 3851 ve->base.execlists.queue_priority_hint = INT_MIN; 3852 } 3853 spin_unlock(&ve->base.active.lock); 3854 } 3855 3856 /* Remaining _unready_ requests will be nop'ed when submitted */ 3857 3858 execlists->queue_priority_hint = INT_MIN; 3859 execlists->queue = RB_ROOT_CACHED; 3860 3861 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet)); 3862 execlists->tasklet.func = nop_submission_tasklet; 3863 3864 spin_unlock_irqrestore(&engine->active.lock, flags); 3865 } 3866 3867 static void execlists_reset_finish(struct intel_engine_cs *engine) 3868 { 3869 struct intel_engine_execlists * const execlists = &engine->execlists; 3870 3871 /* 3872 * After a GPU reset, we may have requests to replay. Do so now while 3873 * we still have the forcewake to be sure that the GPU is not allowed 3874 * to sleep before we restart and reload a context. 3875 */ 3876 GEM_BUG_ON(!reset_in_progress(execlists)); 3877 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root)) 3878 execlists->tasklet.func(execlists->tasklet.data); 3879 3880 if (__tasklet_enable(&execlists->tasklet)) 3881 /* And kick in case we missed a new request submission. */ 3882 tasklet_hi_schedule(&execlists->tasklet); 3883 ENGINE_TRACE(engine, "depth->%d\n", 3884 atomic_read(&execlists->tasklet.count)); 3885 } 3886 3887 static int gen8_emit_bb_start_noarb(struct i915_request *rq, 3888 u64 offset, u32 len, 3889 const unsigned int flags) 3890 { 3891 u32 *cs; 3892 3893 cs = intel_ring_begin(rq, 4); 3894 if (IS_ERR(cs)) 3895 return PTR_ERR(cs); 3896 3897 /* 3898 * WaDisableCtxRestoreArbitration:bdw,chv 3899 * 3900 * We don't need to perform MI_ARB_ENABLE as often as we do (in 3901 * particular all the gens that do not need the w/a at all!), if we 3902 * took care to make sure that on every switch into this context 3903 * (both ordinary and for preemption) arbitration was enabled 3904 * we would be fine. However, for gen8 there is another w/a that 3905 * requires us to not preempt inside GPGPU execution, so we keep 3906 * arbitration disabled for gen8 batches. Arbitration will be 3907 * re-enabled before we close the request 3908 * (engine->emit_fini_breadcrumb). 3909 */ 3910 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3911 3912 /* FIXME(BDW+): Address space and security selectors. */ 3913 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 3914 (flags & I915_DISPATCH_SECURE ?
0 : BIT(8)); 3915 *cs++ = lower_32_bits(offset); 3916 *cs++ = upper_32_bits(offset); 3917 3918 intel_ring_advance(rq, cs); 3919 3920 return 0; 3921 } 3922 3923 static int gen8_emit_bb_start(struct i915_request *rq, 3924 u64 offset, u32 len, 3925 const unsigned int flags) 3926 { 3927 u32 *cs; 3928 3929 cs = intel_ring_begin(rq, 6); 3930 if (IS_ERR(cs)) 3931 return PTR_ERR(cs); 3932 3933 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3934 3935 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 3936 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 3937 *cs++ = lower_32_bits(offset); 3938 *cs++ = upper_32_bits(offset); 3939 3940 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3941 *cs++ = MI_NOOP; 3942 3943 intel_ring_advance(rq, cs); 3944 3945 return 0; 3946 } 3947 3948 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine) 3949 { 3950 ENGINE_WRITE(engine, RING_IMR, 3951 ~(engine->irq_enable_mask | engine->irq_keep_mask)); 3952 ENGINE_POSTING_READ(engine, RING_IMR); 3953 } 3954 3955 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine) 3956 { 3957 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); 3958 } 3959 3960 static int gen8_emit_flush(struct i915_request *request, u32 mode) 3961 { 3962 u32 cmd, *cs; 3963 3964 cs = intel_ring_begin(request, 4); 3965 if (IS_ERR(cs)) 3966 return PTR_ERR(cs); 3967 3968 cmd = MI_FLUSH_DW + 1; 3969 3970 /* We always require a command barrier so that subsequent 3971 * commands, such as breadcrumb interrupts, are strictly ordered 3972 * wrt the contents of the write cache being flushed to memory 3973 * (and thus being coherent from the CPU). 3974 */ 3975 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 3976 3977 if (mode & EMIT_INVALIDATE) { 3978 cmd |= MI_INVALIDATE_TLB; 3979 if (request->engine->class == VIDEO_DECODE_CLASS) 3980 cmd |= MI_INVALIDATE_BSD; 3981 } 3982 3983 *cs++ = cmd; 3984 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 3985 *cs++ = 0; /* upper addr */ 3986 *cs++ = 0; /* value */ 3987 intel_ring_advance(request, cs); 3988 3989 return 0; 3990 } 3991 3992 static int gen8_emit_flush_render(struct i915_request *request, 3993 u32 mode) 3994 { 3995 bool vf_flush_wa = false, dc_flush_wa = false; 3996 u32 *cs, flags = 0; 3997 int len; 3998 3999 flags |= PIPE_CONTROL_CS_STALL; 4000 4001 if (mode & EMIT_FLUSH) { 4002 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4003 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4004 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4005 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4006 } 4007 4008 if (mode & EMIT_INVALIDATE) { 4009 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4010 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4011 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4012 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4013 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4014 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4015 flags |= PIPE_CONTROL_QW_WRITE; 4016 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4017 4018 /* 4019 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL 4020 * pipe control. 
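 * (That is the vf_flush_wa case handled below by emitting an empty gen8_emit_pipe_control(cs, 0, 0) ahead of the real flush.)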
4021 */ 4022 if (IS_GEN(request->i915, 9)) 4023 vf_flush_wa = true; 4024 4025 /* WaForGAMHang:kbl */ 4026 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0)) 4027 dc_flush_wa = true; 4028 } 4029 4030 len = 6; 4031 4032 if (vf_flush_wa) 4033 len += 6; 4034 4035 if (dc_flush_wa) 4036 len += 12; 4037 4038 cs = intel_ring_begin(request, len); 4039 if (IS_ERR(cs)) 4040 return PTR_ERR(cs); 4041 4042 if (vf_flush_wa) 4043 cs = gen8_emit_pipe_control(cs, 0, 0); 4044 4045 if (dc_flush_wa) 4046 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE, 4047 0); 4048 4049 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4050 4051 if (dc_flush_wa) 4052 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0); 4053 4054 intel_ring_advance(request, cs); 4055 4056 return 0; 4057 } 4058 4059 static int gen11_emit_flush_render(struct i915_request *request, 4060 u32 mode) 4061 { 4062 if (mode & EMIT_FLUSH) { 4063 u32 *cs; 4064 u32 flags = 0; 4065 4066 flags |= PIPE_CONTROL_CS_STALL; 4067 4068 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 4069 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4070 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4071 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4072 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4073 flags |= PIPE_CONTROL_QW_WRITE; 4074 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4075 4076 cs = intel_ring_begin(request, 6); 4077 if (IS_ERR(cs)) 4078 return PTR_ERR(cs); 4079 4080 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4081 intel_ring_advance(request, cs); 4082 } 4083 4084 if (mode & EMIT_INVALIDATE) { 4085 u32 *cs; 4086 u32 flags = 0; 4087 4088 flags |= PIPE_CONTROL_CS_STALL; 4089 4090 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 4091 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4092 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4093 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4094 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4095 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4096 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4097 flags |= PIPE_CONTROL_QW_WRITE; 4098 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4099 4100 cs = intel_ring_begin(request, 6); 4101 if (IS_ERR(cs)) 4102 return PTR_ERR(cs); 4103 4104 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4105 intel_ring_advance(request, cs); 4106 } 4107 4108 return 0; 4109 } 4110 4111 static u32 preparser_disable(bool state) 4112 { 4113 return MI_ARB_CHECK | 1 << 8 | state; 4114 } 4115 4116 static int gen12_emit_flush_render(struct i915_request *request, 4117 u32 mode) 4118 { 4119 if (mode & EMIT_FLUSH) { 4120 u32 flags = 0; 4121 u32 *cs; 4122 4123 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 4124 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4125 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4126 /* Wa_1409600907:tgl */ 4127 flags |= PIPE_CONTROL_DEPTH_STALL; 4128 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4129 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4130 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH; 4131 4132 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4133 flags |= PIPE_CONTROL_QW_WRITE; 4134 4135 flags |= PIPE_CONTROL_CS_STALL; 4136 4137 cs = intel_ring_begin(request, 6); 4138 if (IS_ERR(cs)) 4139 return PTR_ERR(cs); 4140 4141 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4142 intel_ring_advance(request, cs); 4143 } 4144 4145 if (mode & EMIT_INVALIDATE) { 4146 u32 flags = 0; 4147 u32 *cs; 4148 4149 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 4150 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4151 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4152 flags |= 
PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4153 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4154 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4155 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4156 flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE; 4157 4158 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4159 flags |= PIPE_CONTROL_QW_WRITE; 4160 4161 flags |= PIPE_CONTROL_CS_STALL; 4162 4163 cs = intel_ring_begin(request, 8); 4164 if (IS_ERR(cs)) 4165 return PTR_ERR(cs); 4166 4167 /* 4168 * Prevent the pre-parser from skipping past the TLB 4169 * invalidate and loading a stale page for the batch 4170 * buffer / request payload. 4171 */ 4172 *cs++ = preparser_disable(true); 4173 4174 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4175 4176 *cs++ = preparser_disable(false); 4177 intel_ring_advance(request, cs); 4178 } 4179 4180 return 0; 4181 } 4182 4183 /* 4184 * Reserve space for 2 NOOPs at the end of each request to be 4185 * used as a workaround for not being allowed to do lite 4186 * restore with HEAD==TAIL (WaIdleLiteRestore). 4187 */ 4188 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) 4189 { 4190 /* Ensure there's always at least one preemption point per-request. */ 4191 *cs++ = MI_ARB_CHECK; 4192 *cs++ = MI_NOOP; 4193 request->wa_tail = intel_ring_offset(request, cs); 4194 4195 return cs; 4196 } 4197 4198 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs) 4199 { 4200 *cs++ = MI_SEMAPHORE_WAIT | 4201 MI_SEMAPHORE_GLOBAL_GTT | 4202 MI_SEMAPHORE_POLL | 4203 MI_SEMAPHORE_SAD_EQ_SDD; 4204 *cs++ = 0; 4205 *cs++ = intel_hws_preempt_address(request->engine); 4206 *cs++ = 0; 4207 4208 return cs; 4209 } 4210 4211 static __always_inline u32* 4212 gen8_emit_fini_breadcrumb_footer(struct i915_request *request, 4213 u32 *cs) 4214 { 4215 *cs++ = MI_USER_INTERRUPT; 4216 4217 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4218 if (intel_engine_has_semaphores(request->engine)) 4219 cs = emit_preempt_busywait(request, cs); 4220 4221 request->tail = intel_ring_offset(request, cs); 4222 assert_ring_tail_valid(request->ring, request->tail); 4223 4224 return gen8_emit_wa_tail(request, cs); 4225 } 4226 4227 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs) 4228 { 4229 cs = gen8_emit_ggtt_write(cs, 4230 request->fence.seqno, 4231 i915_request_active_timeline(request)->hwsp_offset, 4232 0); 4233 4234 return gen8_emit_fini_breadcrumb_footer(request, cs); 4235 } 4236 4237 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4238 { 4239 cs = gen8_emit_pipe_control(cs, 4240 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4241 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4242 PIPE_CONTROL_DC_FLUSH_ENABLE, 4243 0); 4244 4245 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ 4246 cs = gen8_emit_ggtt_write_rcs(cs, 4247 request->fence.seqno, 4248 i915_request_active_timeline(request)->hwsp_offset, 4249 PIPE_CONTROL_FLUSH_ENABLE | 4250 PIPE_CONTROL_CS_STALL); 4251 4252 return gen8_emit_fini_breadcrumb_footer(request, cs); 4253 } 4254 4255 static u32 * 4256 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4257 { 4258 cs = gen8_emit_ggtt_write_rcs(cs, 4259 request->fence.seqno, 4260 i915_request_active_timeline(request)->hwsp_offset, 4261 PIPE_CONTROL_CS_STALL | 4262 PIPE_CONTROL_TILE_CACHE_FLUSH | 4263 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4264 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4265 PIPE_CONTROL_DC_FLUSH_ENABLE | 4266 PIPE_CONTROL_FLUSH_ENABLE); 4267 4268 return gen8_emit_fini_breadcrumb_footer(request, 
cs); 4269 } 4270 4271 /* 4272 * Note that the CS instruction pre-parser will not stall on the breadcrumb 4273 * flush and will continue pre-fetching the instructions after it before the 4274 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at 4275 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble 4276 * of the next request before the memory has been flushed, we're guaranteed that 4277 * we won't access the batch itself too early. 4278 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands, 4279 * so, if the current request is modifying an instruction in the next request on 4280 * the same intel_context, we might pre-fetch and then execute the pre-update 4281 * instruction. To avoid this, the users of self-modifying code should either 4282 * disable the parser around the code emitting the memory writes, via a new flag 4283 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For 4284 * the in-kernel use-cases we've opted to use a separate context, see 4285 * reloc_gpu() as an example. 4286 * All the above applies only to the instructions themselves. Non-inline data 4287 * used by the instructions is not pre-fetched. 4288 */ 4289 4290 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs) 4291 { 4292 *cs++ = MI_SEMAPHORE_WAIT_TOKEN | 4293 MI_SEMAPHORE_GLOBAL_GTT | 4294 MI_SEMAPHORE_POLL | 4295 MI_SEMAPHORE_SAD_EQ_SDD; 4296 *cs++ = 0; 4297 *cs++ = intel_hws_preempt_address(request->engine); 4298 *cs++ = 0; 4299 *cs++ = 0; 4300 *cs++ = MI_NOOP; 4301 4302 return cs; 4303 } 4304 4305 static __always_inline u32* 4306 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs) 4307 { 4308 *cs++ = MI_USER_INTERRUPT; 4309 4310 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4311 if (intel_engine_has_semaphores(request->engine)) 4312 cs = gen12_emit_preempt_busywait(request, cs); 4313 4314 request->tail = intel_ring_offset(request, cs); 4315 assert_ring_tail_valid(request->ring, request->tail); 4316 4317 return gen8_emit_wa_tail(request, cs); 4318 } 4319 4320 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs) 4321 { 4322 cs = gen8_emit_ggtt_write(cs, 4323 request->fence.seqno, 4324 i915_request_active_timeline(request)->hwsp_offset, 4325 0); 4326 4327 return gen12_emit_fini_breadcrumb_footer(request, cs); 4328 } 4329 4330 static u32 * 4331 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4332 { 4333 cs = gen8_emit_ggtt_write_rcs(cs, 4334 request->fence.seqno, 4335 i915_request_active_timeline(request)->hwsp_offset, 4336 PIPE_CONTROL_CS_STALL | 4337 PIPE_CONTROL_TILE_CACHE_FLUSH | 4338 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4339 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4340 /* Wa_1409600907:tgl */ 4341 PIPE_CONTROL_DEPTH_STALL | 4342 PIPE_CONTROL_DC_FLUSH_ENABLE | 4343 PIPE_CONTROL_FLUSH_ENABLE | 4344 PIPE_CONTROL_HDC_PIPELINE_FLUSH); 4345 4346 return gen12_emit_fini_breadcrumb_footer(request, cs); 4347 } 4348 4349 static void execlists_park(struct intel_engine_cs *engine) 4350 { 4351 cancel_timer(&engine->execlists.timer); 4352 cancel_timer(&engine->execlists.preempt); 4353 } 4354 4355 void intel_execlists_set_default_submission(struct intel_engine_cs *engine) 4356 { 4357 engine->submit_request = execlists_submit_request; 4358 engine->schedule = i915_schedule; 4359 engine->execlists.tasklet.func = execlists_submission_tasklet; 4360 4361 engine->reset.prepare = execlists_reset_prepare; 4362 engine->reset.rewind = execlists_reset_rewind; 
4363 engine->reset.cancel = execlists_reset_cancel; 4364 engine->reset.finish = execlists_reset_finish; 4365 4366 engine->park = execlists_park; 4367 engine->unpark = NULL; 4368 4369 engine->flags |= I915_ENGINE_SUPPORTS_STATS; 4370 if (!intel_vgpu_active(engine->i915)) { 4371 engine->flags |= I915_ENGINE_HAS_SEMAPHORES; 4372 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) 4373 engine->flags |= I915_ENGINE_HAS_PREEMPTION; 4374 } 4375 4376 if (INTEL_GEN(engine->i915) >= 12) 4377 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO; 4378 4379 if (intel_engine_has_preemption(engine)) 4380 engine->emit_bb_start = gen8_emit_bb_start; 4381 else 4382 engine->emit_bb_start = gen8_emit_bb_start_noarb; 4383 } 4384 4385 static void execlists_shutdown(struct intel_engine_cs *engine) 4386 { 4387 /* Synchronise with residual timers and any softirq they raise */ 4388 del_timer_sync(&engine->execlists.timer); 4389 del_timer_sync(&engine->execlists.preempt); 4390 tasklet_kill(&engine->execlists.tasklet); 4391 } 4392 4393 static void execlists_release(struct intel_engine_cs *engine) 4394 { 4395 execlists_shutdown(engine); 4396 4397 intel_engine_cleanup_common(engine); 4398 lrc_destroy_wa_ctx(engine); 4399 } 4400 4401 static void 4402 logical_ring_default_vfuncs(struct intel_engine_cs *engine) 4403 { 4404 /* Default vfuncs which can be overridden by each engine. */ 4405 4406 engine->resume = execlists_resume; 4407 4408 engine->cops = &execlists_context_ops; 4409 engine->request_alloc = execlists_request_alloc; 4410 4411 engine->emit_flush = gen8_emit_flush; 4412 engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb; 4413 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb; 4414 if (INTEL_GEN(engine->i915) >= 12) 4415 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb; 4416 4417 engine->set_default_submission = intel_execlists_set_default_submission; 4418 4419 if (INTEL_GEN(engine->i915) < 11) { 4420 engine->irq_enable = gen8_logical_ring_enable_irq; 4421 engine->irq_disable = gen8_logical_ring_disable_irq; 4422 } else { 4423 /* 4424 * TODO: On Gen11 interrupt masks need to be clear 4425 * to allow C6 entry. Keep interrupts enabled at 4426 * all times and take the hit of generating extra interrupts 4427 * until a more refined solution exists.
4428 */ 4429 } 4430 } 4431 4432 static inline void 4433 logical_ring_default_irqs(struct intel_engine_cs *engine) 4434 { 4435 unsigned int shift = 0; 4436 4437 if (INTEL_GEN(engine->i915) < 11) { 4438 const u8 irq_shifts[] = { 4439 [RCS0] = GEN8_RCS_IRQ_SHIFT, 4440 [BCS0] = GEN8_BCS_IRQ_SHIFT, 4441 [VCS0] = GEN8_VCS0_IRQ_SHIFT, 4442 [VCS1] = GEN8_VCS1_IRQ_SHIFT, 4443 [VECS0] = GEN8_VECS_IRQ_SHIFT, 4444 }; 4445 4446 shift = irq_shifts[engine->id]; 4447 } 4448 4449 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift; 4450 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift; 4451 engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift; 4452 } 4453 4454 static void rcs_submission_override(struct intel_engine_cs *engine) 4455 { 4456 switch (INTEL_GEN(engine->i915)) { 4457 case 12: 4458 engine->emit_flush = gen12_emit_flush_render; 4459 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs; 4460 break; 4461 case 11: 4462 engine->emit_flush = gen11_emit_flush_render; 4463 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs; 4464 break; 4465 default: 4466 engine->emit_flush = gen8_emit_flush_render; 4467 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs; 4468 break; 4469 } 4470 } 4471 4472 int intel_execlists_submission_setup(struct intel_engine_cs *engine) 4473 { 4474 struct intel_engine_execlists * const execlists = &engine->execlists; 4475 struct drm_i915_private *i915 = engine->i915; 4476 struct intel_uncore *uncore = engine->uncore; 4477 u32 base = engine->mmio_base; 4478 4479 tasklet_init(&engine->execlists.tasklet, 4480 execlists_submission_tasklet, (unsigned long)engine); 4481 timer_setup(&engine->execlists.timer, execlists_timeslice, 0); 4482 timer_setup(&engine->execlists.preempt, execlists_preempt, 0); 4483 4484 logical_ring_default_vfuncs(engine); 4485 logical_ring_default_irqs(engine); 4486 4487 if (engine->class == RENDER_CLASS) 4488 rcs_submission_override(engine); 4489 4490 if (intel_init_workaround_bb(engine)) 4491 /* 4492 * We continue even if we fail to initialize WA batch 4493 * because we only expect rare glitches but nothing 4494 * critical to prevent us from using GPU 4495 */ 4496 DRM_ERROR("WA batch buffer initialization failed\n"); 4497 4498 if (HAS_LOGICAL_RING_ELSQ(i915)) { 4499 execlists->submit_reg = uncore->regs + 4500 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base)); 4501 execlists->ctrl_reg = uncore->regs + 4502 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base)); 4503 } else { 4504 execlists->submit_reg = uncore->regs + 4505 i915_mmio_reg_offset(RING_ELSP(base)); 4506 } 4507 4508 execlists->csb_status = 4509 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; 4510 4511 execlists->csb_write = 4512 &engine->status_page.addr[intel_hws_csb_write_index(i915)]; 4513 4514 if (INTEL_GEN(i915) < 11) 4515 execlists->csb_size = GEN8_CSB_ENTRIES; 4516 else 4517 execlists->csb_size = GEN11_CSB_ENTRIES; 4518 4519 reset_csb_pointers(engine); 4520 4521 /* Finally, take ownership and responsibility for cleanup! 
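 * (execlists_release() stops the timers and tasklet via execlists_shutdown() before tearing down the common engine state.)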
*/ 4522 engine->release = execlists_release; 4523 4524 return 0; 4525 } 4526 4527 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine) 4528 { 4529 u32 indirect_ctx_offset; 4530 4531 switch (INTEL_GEN(engine->i915)) { 4532 default: 4533 MISSING_CASE(INTEL_GEN(engine->i915)); 4534 /* fall through */ 4535 case 12: 4536 indirect_ctx_offset = 4537 GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4538 break; 4539 case 11: 4540 indirect_ctx_offset = 4541 GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4542 break; 4543 case 10: 4544 indirect_ctx_offset = 4545 GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4546 break; 4547 case 9: 4548 indirect_ctx_offset = 4549 GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4550 break; 4551 case 8: 4552 indirect_ctx_offset = 4553 GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4554 break; 4555 } 4556 4557 return indirect_ctx_offset; 4558 } 4559 4560 4561 static void init_common_reg_state(u32 * const regs, 4562 const struct intel_engine_cs *engine, 4563 const struct intel_ring *ring, 4564 bool inhibit) 4565 { 4566 u32 ctl; 4567 4568 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH); 4569 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); 4570 if (inhibit) 4571 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT; 4572 if (INTEL_GEN(engine->i915) < 11) 4573 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT | 4574 CTX_CTRL_RS_CTX_ENABLE); 4575 regs[CTX_CONTEXT_CONTROL] = ctl; 4576 4577 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 4578 } 4579 4580 static void init_wa_bb_reg_state(u32 * const regs, 4581 const struct intel_engine_cs *engine, 4582 u32 pos_bb_per_ctx) 4583 { 4584 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx; 4585 4586 if (wa_ctx->per_ctx.size) { 4587 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 4588 4589 regs[pos_bb_per_ctx] = 4590 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; 4591 } 4592 4593 if (wa_ctx->indirect_ctx.size) { 4594 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 4595 4596 regs[pos_bb_per_ctx + 2] = 4597 (ggtt_offset + wa_ctx->indirect_ctx.offset) | 4598 (wa_ctx->indirect_ctx.size / CACHELINE_BYTES); 4599 4600 regs[pos_bb_per_ctx + 4] = 4601 intel_lr_indirect_ctx_offset(engine) << 6; 4602 } 4603 } 4604 4605 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt) 4606 { 4607 if (i915_vm_is_4lvl(&ppgtt->vm)) { 4608 /* 64b PPGTT (48bit canonical) 4609 * PDP0_DESCRIPTOR contains the base address to PML4 and 4610 * other PDP Descriptors are ignored. 4611 */ 4612 ASSIGN_CTX_PML4(ppgtt, regs); 4613 } else { 4614 ASSIGN_CTX_PDP(ppgtt, regs, 3); 4615 ASSIGN_CTX_PDP(ppgtt, regs, 2); 4616 ASSIGN_CTX_PDP(ppgtt, regs, 1); 4617 ASSIGN_CTX_PDP(ppgtt, regs, 0); 4618 } 4619 } 4620 4621 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm) 4622 { 4623 if (i915_is_ggtt(vm)) 4624 return i915_vm_to_ggtt(vm)->alias; 4625 else 4626 return i915_vm_to_ppgtt(vm); 4627 } 4628 4629 static void execlists_init_reg_state(u32 *regs, 4630 const struct intel_context *ce, 4631 const struct intel_engine_cs *engine, 4632 const struct intel_ring *ring, 4633 bool inhibit) 4634 { 4635 /* 4636 * A context is actually a big batch buffer with several 4637 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. 
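 * (set_offsets() below writes the MI_LRI headers and register offsets; the init_*_reg_state() helpers then fill in the values.)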
The 4638 * values we are setting here are only for the first context restore: 4639 * on a subsequent save, the GPU will recreate this batchbuffer with new 4640 * values (including all the missing MI_LOAD_REGISTER_IMM commands that 4641 * we are not initializing here). 4642 * 4643 * Must keep consistent with virtual_update_register_offsets(). 4644 */ 4645 set_offsets(regs, reg_offsets(engine), engine, inhibit); 4646 4647 init_common_reg_state(regs, engine, ring, inhibit); 4648 init_ppgtt_reg_state(regs, vm_alias(ce->vm)); 4649 4650 init_wa_bb_reg_state(regs, engine, 4651 INTEL_GEN(engine->i915) >= 12 ? 4652 GEN12_CTX_BB_PER_CTX_PTR : 4653 CTX_BB_PER_CTX_PTR); 4654 4655 __reset_stop_ring(regs, engine); 4656 } 4657 4658 static int 4659 populate_lr_context(struct intel_context *ce, 4660 struct drm_i915_gem_object *ctx_obj, 4661 struct intel_engine_cs *engine, 4662 struct intel_ring *ring) 4663 { 4664 bool inhibit = true; 4665 void *vaddr; 4666 int ret; 4667 4668 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); 4669 if (IS_ERR(vaddr)) { 4670 ret = PTR_ERR(vaddr); 4671 DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret); 4672 return ret; 4673 } 4674 4675 set_redzone(vaddr, engine); 4676 4677 if (engine->default_state) { 4678 void *defaults; 4679 4680 defaults = i915_gem_object_pin_map(engine->default_state, 4681 I915_MAP_WB); 4682 if (IS_ERR(defaults)) { 4683 ret = PTR_ERR(defaults); 4684 goto err_unpin_ctx; 4685 } 4686 4687 memcpy(vaddr, defaults, engine->context_size); 4688 i915_gem_object_unpin_map(engine->default_state); 4689 __set_bit(CONTEXT_VALID_BIT, &ce->flags); 4690 inhibit = false; 4691 } 4692 4693 /* Clear the ppHWSP (inc. per-context counters) */ 4694 memset(vaddr, 0, PAGE_SIZE); 4695 4696 /* 4697 * The second page of the context object contains some registers which 4698 * must be set up prior to the first execution. 4699 */ 4700 execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE, 4701 ce, engine, ring, inhibit); 4702 4703 ret = 0; 4704 err_unpin_ctx: 4705 __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size); 4706 i915_gem_object_unpin_map(ctx_obj); 4707 return ret; 4708 } 4709 4710 static int __execlists_context_alloc(struct intel_context *ce, 4711 struct intel_engine_cs *engine) 4712 { 4713 struct drm_i915_gem_object *ctx_obj; 4714 struct intel_ring *ring; 4715 struct i915_vma *vma; 4716 u32 context_size; 4717 int ret; 4718 4719 GEM_BUG_ON(ce->state); 4720 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE); 4721 4722 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 4723 context_size += I915_GTT_PAGE_SIZE; /* for redzone */ 4724 4725 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size); 4726 if (IS_ERR(ctx_obj)) 4727 return PTR_ERR(ctx_obj); 4728 4729 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL); 4730 if (IS_ERR(vma)) { 4731 ret = PTR_ERR(vma); 4732 goto error_deref_obj; 4733 } 4734 4735 if (!ce->timeline) { 4736 struct intel_timeline *tl; 4737 struct i915_vma *hwsp; 4738 4739 /* 4740 * Use the static global HWSP for the kernel context, and 4741 * a dynamically allocated cacheline for everyone else. 
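 * (For the kernel/barrier context this is engine->status_page.vma, so no allocation is needed; see the hwsp selection below.)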
4742 */ 4743 hwsp = NULL; 4744 if (unlikely(intel_context_is_barrier(ce))) 4745 hwsp = engine->status_page.vma; 4746 4747 tl = intel_timeline_create(engine->gt, hwsp); 4748 if (IS_ERR(tl)) { 4749 ret = PTR_ERR(tl); 4750 goto error_deref_obj; 4751 } 4752 4753 ce->timeline = tl; 4754 } 4755 4756 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring); 4757 if (IS_ERR(ring)) { 4758 ret = PTR_ERR(ring); 4759 goto error_deref_obj; 4760 } 4761 4762 ret = populate_lr_context(ce, ctx_obj, engine, ring); 4763 if (ret) { 4764 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret); 4765 goto error_ring_free; 4766 } 4767 4768 ce->ring = ring; 4769 ce->state = vma; 4770 4771 return 0; 4772 4773 error_ring_free: 4774 intel_ring_put(ring); 4775 error_deref_obj: 4776 i915_gem_object_put(ctx_obj); 4777 return ret; 4778 } 4779 4780 static struct list_head *virtual_queue(struct virtual_engine *ve) 4781 { 4782 return &ve->base.execlists.default_priolist.requests[0]; 4783 } 4784 4785 static void virtual_context_destroy(struct kref *kref) 4786 { 4787 struct virtual_engine *ve = 4788 container_of(kref, typeof(*ve), context.ref); 4789 unsigned int n; 4790 4791 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 4792 GEM_BUG_ON(ve->request); 4793 GEM_BUG_ON(ve->context.inflight); 4794 4795 for (n = 0; n < ve->num_siblings; n++) { 4796 struct intel_engine_cs *sibling = ve->siblings[n]; 4797 struct rb_node *node = &ve->nodes[sibling->id].rb; 4798 unsigned long flags; 4799 4800 if (RB_EMPTY_NODE(node)) 4801 continue; 4802 4803 spin_lock_irqsave(&sibling->active.lock, flags); 4804 4805 /* Detachment is lazily performed in the execlists tasklet */ 4806 if (!RB_EMPTY_NODE(node)) 4807 rb_erase_cached(node, &sibling->execlists.virtual); 4808 4809 spin_unlock_irqrestore(&sibling->active.lock, flags); 4810 } 4811 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet)); 4812 4813 if (ve->context.state) 4814 __execlists_context_fini(&ve->context); 4815 intel_context_fini(&ve->context); 4816 4817 kfree(ve->bonds); 4818 kfree(ve); 4819 } 4820 4821 static void virtual_engine_initial_hint(struct virtual_engine *ve) 4822 { 4823 int swp; 4824 4825 /* 4826 * Pick a random sibling on starting to help spread the load around. 4827 * 4828 * New contexts are typically created with exactly the same order 4829 * of siblings, and often started in batches. Due to the way we iterate 4830 * the array of sibling when submitting requests, sibling[0] is 4831 * prioritised for dequeuing. If we make sure that sibling[0] is fairly 4832 * randomised across the system, we also help spread the load by the 4833 * first engine we inspect being different each time. 4834 * 4835 * NB This does not force us to execute on this engine, it will just 4836 * typically be the first we inspect for submission. 
4837 */ 4838 swp = prandom_u32_max(ve->num_siblings); 4839 if (!swp) 4840 return; 4841 4842 swap(ve->siblings[swp], ve->siblings[0]); 4843 if (!intel_engine_has_relative_mmio(ve->siblings[0])) 4844 virtual_update_register_offsets(ve->context.lrc_reg_state, 4845 ve->siblings[0]); 4846 } 4847 4848 static int virtual_context_alloc(struct intel_context *ce) 4849 { 4850 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 4851 4852 return __execlists_context_alloc(ce, ve->siblings[0]); 4853 } 4854 4855 static int virtual_context_pin(struct intel_context *ce) 4856 { 4857 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 4858 int err; 4859 4860 /* Note: we must use a real engine class for setting up reg state */ 4861 err = __execlists_context_pin(ce, ve->siblings[0]); 4862 if (err) 4863 return err; 4864 4865 virtual_engine_initial_hint(ve); 4866 return 0; 4867 } 4868 4869 static void virtual_context_enter(struct intel_context *ce) 4870 { 4871 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 4872 unsigned int n; 4873 4874 for (n = 0; n < ve->num_siblings; n++) 4875 intel_engine_pm_get(ve->siblings[n]); 4876 4877 intel_timeline_enter(ce->timeline); 4878 } 4879 4880 static void virtual_context_exit(struct intel_context *ce) 4881 { 4882 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 4883 unsigned int n; 4884 4885 intel_timeline_exit(ce->timeline); 4886 4887 for (n = 0; n < ve->num_siblings; n++) 4888 intel_engine_pm_put(ve->siblings[n]); 4889 } 4890 4891 static const struct intel_context_ops virtual_context_ops = { 4892 .alloc = virtual_context_alloc, 4893 4894 .pin = virtual_context_pin, 4895 .unpin = execlists_context_unpin, 4896 4897 .enter = virtual_context_enter, 4898 .exit = virtual_context_exit, 4899 4900 .destroy = virtual_context_destroy, 4901 }; 4902 4903 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve) 4904 { 4905 struct i915_request *rq; 4906 intel_engine_mask_t mask; 4907 4908 rq = READ_ONCE(ve->request); 4909 if (!rq) 4910 return 0; 4911 4912 /* The rq is ready for submission; rq->execution_mask is now stable. 
*/ 4913 mask = rq->execution_mask; 4914 if (unlikely(!mask)) { 4915 /* Invalid selection, submit to a random engine in error */ 4916 i915_request_set_error_once(rq, -ENODEV); 4917 mask = ve->siblings[0]->mask; 4918 } 4919 4920 ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n", 4921 rq->fence.context, rq->fence.seqno, 4922 mask, ve->base.execlists.queue_priority_hint); 4923 4924 return mask; 4925 } 4926 4927 static void virtual_submission_tasklet(unsigned long data) 4928 { 4929 struct virtual_engine * const ve = (struct virtual_engine *)data; 4930 const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint); 4931 intel_engine_mask_t mask; 4932 unsigned int n; 4933 4934 rcu_read_lock(); 4935 mask = virtual_submission_mask(ve); 4936 rcu_read_unlock(); 4937 if (unlikely(!mask)) 4938 return; 4939 4940 local_irq_disable(); 4941 for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) { 4942 struct intel_engine_cs *sibling = ve->siblings[n]; 4943 struct ve_node * const node = &ve->nodes[sibling->id]; 4944 struct rb_node **parent, *rb; 4945 bool first; 4946 4947 if (unlikely(!(mask & sibling->mask))) { 4948 if (!RB_EMPTY_NODE(&node->rb)) { 4949 spin_lock(&sibling->active.lock); 4950 rb_erase_cached(&node->rb, 4951 &sibling->execlists.virtual); 4952 RB_CLEAR_NODE(&node->rb); 4953 spin_unlock(&sibling->active.lock); 4954 } 4955 continue; 4956 } 4957 4958 spin_lock(&sibling->active.lock); 4959 4960 if (!RB_EMPTY_NODE(&node->rb)) { 4961 /* 4962 * Cheat and avoid rebalancing the tree if we can 4963 * reuse this node in situ. 4964 */ 4965 first = rb_first_cached(&sibling->execlists.virtual) == 4966 &node->rb; 4967 if (prio == node->prio || (prio > node->prio && first)) 4968 goto submit_engine; 4969 4970 rb_erase_cached(&node->rb, &sibling->execlists.virtual); 4971 } 4972 4973 rb = NULL; 4974 first = true; 4975 parent = &sibling->execlists.virtual.rb_root.rb_node; 4976 while (*parent) { 4977 struct ve_node *other; 4978 4979 rb = *parent; 4980 other = rb_entry(rb, typeof(*other), rb); 4981 if (prio > other->prio) { 4982 parent = &rb->rb_left; 4983 } else { 4984 parent = &rb->rb_right; 4985 first = false; 4986 } 4987 } 4988 4989 rb_link_node(&node->rb, rb, parent); 4990 rb_insert_color_cached(&node->rb, 4991 &sibling->execlists.virtual, 4992 first); 4993 4994 submit_engine: 4995 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb)); 4996 node->prio = prio; 4997 if (first && prio > sibling->execlists.queue_priority_hint) { 4998 sibling->execlists.queue_priority_hint = prio; 4999 tasklet_hi_schedule(&sibling->execlists.tasklet); 5000 } 5001 5002 spin_unlock(&sibling->active.lock); 5003 } 5004 local_irq_enable(); 5005 } 5006 5007 static void virtual_submit_request(struct i915_request *rq) 5008 { 5009 struct virtual_engine *ve = to_virtual_engine(rq->engine); 5010 struct i915_request *old; 5011 unsigned long flags; 5012 5013 ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n", 5014 rq->fence.context, 5015 rq->fence.seqno); 5016 5017 GEM_BUG_ON(ve->base.submit_request != virtual_submit_request); 5018 5019 spin_lock_irqsave(&ve->base.active.lock, flags); 5020 5021 old = ve->request; 5022 if (old) { /* background completion event from preempt-to-busy */ 5023 GEM_BUG_ON(!i915_request_completed(old)); 5024 __i915_request_submit(old); 5025 i915_request_put(old); 5026 } 5027 5028 if (i915_request_completed(rq)) { 5029 __i915_request_submit(rq); 5030 5031 ve->base.execlists.queue_priority_hint = INT_MIN; 5032 ve->request = NULL; 5033 } else { 5034 ve->base.execlists.queue_priority_hint = rq_prio(rq); 5035 ve->request = 
i915_request_get(rq); 5036 5037 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 5038 list_move_tail(&rq->sched.link, virtual_queue(ve)); 5039 5040 tasklet_schedule(&ve->base.execlists.tasklet); 5041 } 5042 5043 spin_unlock_irqrestore(&ve->base.active.lock, flags); 5044 } 5045 5046 static struct ve_bond * 5047 virtual_find_bond(struct virtual_engine *ve, 5048 const struct intel_engine_cs *master) 5049 { 5050 int i; 5051 5052 for (i = 0; i < ve->num_bonds; i++) { 5053 if (ve->bonds[i].master == master) 5054 return &ve->bonds[i]; 5055 } 5056 5057 return NULL; 5058 } 5059 5060 static void 5061 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal) 5062 { 5063 struct virtual_engine *ve = to_virtual_engine(rq->engine); 5064 intel_engine_mask_t allowed, exec; 5065 struct ve_bond *bond; 5066 5067 allowed = ~to_request(signal)->engine->mask; 5068 5069 bond = virtual_find_bond(ve, to_request(signal)->engine); 5070 if (bond) 5071 allowed &= bond->sibling_mask; 5072 5073 /* Restrict the bonded request to run on only the available engines */ 5074 exec = READ_ONCE(rq->execution_mask); 5075 while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed)) 5076 ; 5077 5078 /* Prevent the master from being re-run on the bonded engines */ 5079 to_request(signal)->execution_mask &= ~allowed; 5080 } 5081 5082 struct intel_context * 5083 intel_execlists_create_virtual(struct intel_engine_cs **siblings, 5084 unsigned int count) 5085 { 5086 struct virtual_engine *ve; 5087 unsigned int n; 5088 int err; 5089 5090 if (count == 0) 5091 return ERR_PTR(-EINVAL); 5092 5093 if (count == 1) 5094 return intel_context_create(siblings[0]); 5095 5096 ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL); 5097 if (!ve) 5098 return ERR_PTR(-ENOMEM); 5099 5100 ve->base.i915 = siblings[0]->i915; 5101 ve->base.gt = siblings[0]->gt; 5102 ve->base.uncore = siblings[0]->uncore; 5103 ve->base.id = -1; 5104 5105 ve->base.class = OTHER_CLASS; 5106 ve->base.uabi_class = I915_ENGINE_CLASS_INVALID; 5107 ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL; 5108 ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL; 5109 5110 /* 5111 * The decision on whether to submit a request using semaphores 5112 * depends on the saturated state of the engine. We only compute 5113 * this during HW submission of the request, and we need for this 5114 * state to be globally applied to all requests being submitted 5115 * to this engine. Virtual engines encompass more than one physical 5116 * engine and so we cannot accurately tell in advance if one of those 5117 * engines is already saturated and so cannot afford to use a semaphore 5118 * and be pessimized in priority for doing so -- if we are the only 5119 * context using semaphores after all other clients have stopped, we 5120 * will be starved on the saturated system. Such a global switch for 5121 * semaphores is less than ideal, but alas is the current compromise. 
5122 */ 5123 ve->base.saturated = ALL_ENGINES; 5124 5125 snprintf(ve->base.name, sizeof(ve->base.name), "virtual"); 5126 5127 intel_engine_init_active(&ve->base, ENGINE_VIRTUAL); 5128 intel_engine_init_breadcrumbs(&ve->base); 5129 intel_engine_init_execlists(&ve->base); 5130 5131 ve->base.cops = &virtual_context_ops; 5132 ve->base.request_alloc = execlists_request_alloc; 5133 5134 ve->base.schedule = i915_schedule; 5135 ve->base.submit_request = virtual_submit_request; 5136 ve->base.bond_execute = virtual_bond_execute; 5137 5138 INIT_LIST_HEAD(virtual_queue(ve)); 5139 ve->base.execlists.queue_priority_hint = INT_MIN; 5140 tasklet_init(&ve->base.execlists.tasklet, 5141 virtual_submission_tasklet, 5142 (unsigned long)ve); 5143 5144 intel_context_init(&ve->context, &ve->base); 5145 5146 for (n = 0; n < count; n++) { 5147 struct intel_engine_cs *sibling = siblings[n]; 5148 5149 GEM_BUG_ON(!is_power_of_2(sibling->mask)); 5150 if (sibling->mask & ve->base.mask) { 5151 DRM_DEBUG("duplicate %s entry in load balancer\n", 5152 sibling->name); 5153 err = -EINVAL; 5154 goto err_put; 5155 } 5156 5157 /* 5158 * The virtual engine implementation is tightly coupled to 5159 * the execlists backend -- we push out request directly 5160 * into a tree inside each physical engine. We could support 5161 * layering if we handle cloning of the requests and 5162 * submitting a copy into each backend. 5163 */ 5164 if (sibling->execlists.tasklet.func != 5165 execlists_submission_tasklet) { 5166 err = -ENODEV; 5167 goto err_put; 5168 } 5169 5170 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb)); 5171 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb); 5172 5173 ve->siblings[ve->num_siblings++] = sibling; 5174 ve->base.mask |= sibling->mask; 5175 5176 /* 5177 * All physical engines must be compatible for their emission 5178 * functions (as we build the instructions during request 5179 * construction and do not alter them before submission 5180 * on the physical engine). We use the engine class as a guide 5181 * here, although that could be refined. 
5182 */ 5183 if (ve->base.class != OTHER_CLASS) { 5184 if (ve->base.class != sibling->class) { 5185 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n", 5186 sibling->class, ve->base.class); 5187 err = -EINVAL; 5188 goto err_put; 5189 } 5190 continue; 5191 } 5192 5193 ve->base.class = sibling->class; 5194 ve->base.uabi_class = sibling->uabi_class; 5195 snprintf(ve->base.name, sizeof(ve->base.name), 5196 "v%dx%d", ve->base.class, count); 5197 ve->base.context_size = sibling->context_size; 5198 5199 ve->base.emit_bb_start = sibling->emit_bb_start; 5200 ve->base.emit_flush = sibling->emit_flush; 5201 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb; 5202 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb; 5203 ve->base.emit_fini_breadcrumb_dw = 5204 sibling->emit_fini_breadcrumb_dw; 5205 5206 ve->base.flags = sibling->flags; 5207 } 5208 5209 ve->base.flags |= I915_ENGINE_IS_VIRTUAL; 5210 5211 return &ve->context; 5212 5213 err_put: 5214 intel_context_put(&ve->context); 5215 return ERR_PTR(err); 5216 } 5217 5218 struct intel_context * 5219 intel_execlists_clone_virtual(struct intel_engine_cs *src) 5220 { 5221 struct virtual_engine *se = to_virtual_engine(src); 5222 struct intel_context *dst; 5223 5224 dst = intel_execlists_create_virtual(se->siblings, 5225 se->num_siblings); 5226 if (IS_ERR(dst)) 5227 return dst; 5228 5229 if (se->num_bonds) { 5230 struct virtual_engine *de = to_virtual_engine(dst->engine); 5231 5232 de->bonds = kmemdup(se->bonds, 5233 sizeof(*se->bonds) * se->num_bonds, 5234 GFP_KERNEL); 5235 if (!de->bonds) { 5236 intel_context_put(dst); 5237 return ERR_PTR(-ENOMEM); 5238 } 5239 5240 de->num_bonds = se->num_bonds; 5241 } 5242 5243 return dst; 5244 } 5245 5246 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine, 5247 const struct intel_engine_cs *master, 5248 const struct intel_engine_cs *sibling) 5249 { 5250 struct virtual_engine *ve = to_virtual_engine(engine); 5251 struct ve_bond *bond; 5252 int n; 5253 5254 /* Sanity check the sibling is part of the virtual engine */ 5255 for (n = 0; n < ve->num_siblings; n++) 5256 if (sibling == ve->siblings[n]) 5257 break; 5258 if (n == ve->num_siblings) 5259 return -EINVAL; 5260 5261 bond = virtual_find_bond(ve, master); 5262 if (bond) { 5263 bond->sibling_mask |= sibling->mask; 5264 return 0; 5265 } 5266 5267 bond = krealloc(ve->bonds, 5268 sizeof(*bond) * (ve->num_bonds + 1), 5269 GFP_KERNEL); 5270 if (!bond) 5271 return -ENOMEM; 5272 5273 bond[ve->num_bonds].master = master; 5274 bond[ve->num_bonds].sibling_mask = sibling->mask; 5275 5276 ve->bonds = bond; 5277 ve->num_bonds++; 5278 5279 return 0; 5280 } 5281 5282 struct intel_engine_cs * 5283 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine, 5284 unsigned int sibling) 5285 { 5286 struct virtual_engine *ve = to_virtual_engine(engine); 5287 5288 if (sibling >= ve->num_siblings) 5289 return NULL; 5290 5291 return ve->siblings[sibling]; 5292 } 5293 5294 void intel_execlists_show_requests(struct intel_engine_cs *engine, 5295 struct drm_printer *m, 5296 void (*show_request)(struct drm_printer *m, 5297 struct i915_request *rq, 5298 const char *prefix), 5299 unsigned int max) 5300 { 5301 const struct intel_engine_execlists *execlists = &engine->execlists; 5302 struct i915_request *rq, *last; 5303 unsigned long flags; 5304 unsigned int count; 5305 struct rb_node *rb; 5306 5307 spin_lock_irqsave(&engine->active.lock, flags); 5308 5309 last = NULL; 5310 count = 0; 5311 list_for_each_entry(rq, &engine->active.requests, 
sched.link) { 5312 if (count++ < max - 1) 5313 show_request(m, rq, "\t\tE "); 5314 else 5315 last = rq; 5316 } 5317 if (last) { 5318 if (count > max) { 5319 drm_printf(m, 5320 "\t\t...skipping %d executing requests...\n", 5321 count - max); 5322 } 5323 show_request(m, last, "\t\tE "); 5324 } 5325 5326 if (execlists->switch_priority_hint != INT_MIN) 5327 drm_printf(m, "\t\tSwitch priority hint: %d\n", 5328 READ_ONCE(execlists->switch_priority_hint)); 5329 if (execlists->queue_priority_hint != INT_MIN) 5330 drm_printf(m, "\t\tQueue priority hint: %d\n", 5331 READ_ONCE(execlists->queue_priority_hint)); 5332 5333 last = NULL; 5334 count = 0; 5335 for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) { 5336 struct i915_priolist *p = rb_entry(rb, typeof(*p), node); 5337 int i; 5338 5339 priolist_for_each_request(rq, p, i) { 5340 if (count++ < max - 1) 5341 show_request(m, rq, "\t\tQ "); 5342 else 5343 last = rq; 5344 } 5345 } 5346 if (last) { 5347 if (count > max) { 5348 drm_printf(m, 5349 "\t\t...skipping %d queued requests...\n", 5350 count - max); 5351 } 5352 show_request(m, last, "\t\tQ "); 5353 } 5354 5355 last = NULL; 5356 count = 0; 5357 for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) { 5358 struct virtual_engine *ve = 5359 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 5360 struct i915_request *rq = READ_ONCE(ve->request); 5361 5362 if (rq) { 5363 if (count++ < max - 1) 5364 show_request(m, rq, "\t\tV "); 5365 else 5366 last = rq; 5367 } 5368 } 5369 if (last) { 5370 if (count > max) { 5371 drm_printf(m, 5372 "\t\t...skipping %d virtual requests...\n", 5373 count - max); 5374 } 5375 show_request(m, last, "\t\tV "); 5376 } 5377 5378 spin_unlock_irqrestore(&engine->active.lock, flags); 5379 } 5380 5381 void intel_lr_context_reset(struct intel_engine_cs *engine, 5382 struct intel_context *ce, 5383 u32 head, 5384 bool scrub) 5385 { 5386 GEM_BUG_ON(!intel_context_is_pinned(ce)); 5387 5388 /* 5389 * We want a simple context + ring to execute the breadcrumb update. 5390 * We cannot rely on the context being intact across the GPU hang, 5391 * so clear it and rebuild just what we need for the breadcrumb. 5392 * All pending requests for this context will be zapped, and any 5393 * future request will be after userspace has had the opportunity 5394 * to recreate its own state. 5395 */ 5396 if (scrub) 5397 restore_default_state(ce, engine); 5398 5399 /* Rerun the request; its payload has been neutered (if guilty). */ 5400 __execlists_update_reg_state(ce, engine, head); 5401 } 5402 5403 bool 5404 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine) 5405 { 5406 return engine->set_default_submission == 5407 intel_execlists_set_default_submission; 5408 } 5409 5410 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 5411 #include "selftest_lrc.c" 5412 #endif 5413
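/*
 * Illustrative sketch, not part of the driver: one plausible way a caller
 * that already holds an array of physical engines could use the
 * virtual-engine entry points defined above. "siblings", "count" and
 * "master" are assumed to be supplied by that caller; error handling is
 * abbreviated.
 *
 *	struct intel_context *ce;
 *	int err;
 *
 *	ce = intel_execlists_create_virtual(siblings, count);
 *	if (IS_ERR(ce))
 *		return PTR_ERR(ce);
 *
 *	// Optionally restrict which sibling may service a bonded request
 *	// submitted alongside work on 'master'.
 *	err = intel_virtual_engine_attach_bond(ce->engine, master,
 *					       siblings[0]);
 *	if (err) {
 *		intel_context_put(ce);
 *		return err;
 *	}
 *
 *	// Requests created against 'ce' are then load-balanced across the
 *	// siblings by virtual_submission_tasklet(); drop the reference with
 *	// intel_context_put(ce) when done.
 */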