/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2008,2010 Intel Corporation
 */

#include <linux/intel-iommu.h>
#include <linux/dma-resv.h>
#include <linux/sync_file.h>
#include <linux/uaccess.h>

#include <drm/drm_syncobj.h>

#include "display/intel_frontbuffer.h"

#include "gem/i915_gem_ioctls.h"
#include "gt/intel_context.h"
#include "gt/intel_gpu_commands.h"
#include "gt/intel_gt.h"
#include "gt/intel_gt_buffer_pool.h"
#include "gt/intel_gt_pm.h"
#include "gt/intel_ring.h"

#include "i915_drv.h"
#include "i915_gem_clflush.h"
#include "i915_gem_context.h"
#include "i915_gem_ioctls.h"
#include "i915_sw_fence_work.h"
#include "i915_trace.h"
#include "i915_user_extensions.h"
#include "i915_memcpy.h"

struct eb_vma {
	struct i915_vma *vma;
	unsigned int flags;

	/** This vma's place in the execbuf reservation list */
	struct drm_i915_gem_exec_object2 *exec;
	struct list_head bind_link;
	struct list_head reloc_link;

	struct hlist_node node;
	u32 handle;
};

enum {
	FORCE_CPU_RELOC = 1,
	FORCE_GTT_RELOC,
	FORCE_GPU_RELOC,
#define DBG_FORCE_RELOC 0 /* choose one of the above! */
};

/* __EXEC_OBJECT_NO_RESERVE is BIT(31), defined in i915_vma.h */
#define __EXEC_OBJECT_HAS_PIN		BIT(30)
#define __EXEC_OBJECT_HAS_FENCE		BIT(29)
#define __EXEC_OBJECT_USERPTR_INIT	BIT(28)
#define __EXEC_OBJECT_NEEDS_MAP		BIT(27)
#define __EXEC_OBJECT_NEEDS_BIAS	BIT(26)
#define __EXEC_OBJECT_INTERNAL_FLAGS	(~0u << 26) /* all of the above + */
#define __EXEC_OBJECT_RESERVED (__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE)

#define __EXEC_HAS_RELOC	BIT(31)
#define __EXEC_ENGINE_PINNED	BIT(30)
#define __EXEC_USERPTR_USED	BIT(29)
#define __EXEC_INTERNAL_FLAGS	(~0u << 29)
#define UPDATE			PIN_OFFSET_FIXED

#define BATCH_OFFSET_BIAS (256*1024)

#define __I915_EXEC_ILLEGAL_FLAGS \
	(__I915_EXEC_UNKNOWN_FLAGS | \
	 I915_EXEC_CONSTANTS_MASK  | \
	 I915_EXEC_RESOURCE_STREAMER)

/* Catch emission of unexpected errors for CI! */
#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
#undef EINVAL
#define EINVAL ({ \
	DRM_DEBUG_DRIVER("EINVAL at %s:%d\n", __func__, __LINE__); \
	22; \
})
#endif

/**
 * DOC: User command execution
 *
 * Userspace submits commands to be executed on the GPU as an instruction
 * stream within a GEM object we call a batchbuffer. These instructions may
 * refer to other GEM objects containing auxiliary state such as kernels,
 * samplers, render targets and even secondary batchbuffers. Userspace does
 * not know where in the GPU memory these objects reside and so before the
 * batchbuffer is passed to the GPU for execution, those addresses in the
 * batchbuffer and auxiliary objects are updated. This is known as relocation,
 * or patching. To try and avoid having to relocate each object on the next
 * execution, userspace is told the location of those objects in this pass,
 * but this remains just a hint as the kernel may choose a new location for
 * any object in the future.
 *
 * At the level of talking to the hardware, submitting a batchbuffer for the
 * GPU to execute amounts to adding content to a buffer from which the HW
 * command streamer is reading.
 *
 * 1. Add a command to load the HW context. For Logical Ring Contexts, i.e.
 *    Execlists, this command is not placed on the same buffer as the
 *    remaining items.
 *
 * 2. Add a command to invalidate caches to the buffer.
 *
 * 3. Add a batchbuffer start command to the buffer; the start command is
 *    essentially a token together with the GPU address of the batchbuffer
 *    to be executed.
 *
 * 4. Add a pipeline flush to the buffer.
 *
 * 5. Add a memory write command to the buffer to record when the GPU
 *    is done executing the batchbuffer. The memory write writes the
 *    global sequence number of the request, ``i915_request::global_seqno``;
 *    the i915 driver uses the current value in the register to determine
 *    if the GPU has completed the batchbuffer.
 *
 * 6. Add a user interrupt command to the buffer. This command instructs
 *    the GPU to issue an interrupt when the command, pipeline flush and
 *    memory write are completed.
 *
 * 7. Inform the hardware of the additional commands added to the buffer
 *    (by updating the tail pointer).
 *
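 * For illustration only, on a recent engine the commands added in steps 2-6
 * conceptually resemble the sequence below. The mnemonics are real (see
 * gt/intel_gpu_commands.h), but the exact commands and flags vary by
 * generation and engine, so treat this as a sketch rather than the literal
 * emission code::
 *
 *	PIPE_CONTROL            (cache/TLB invalidation)        <- step 2
 *	MI_BATCH_BUFFER_START   (GPU address of the batch)      <- step 3
 *	PIPE_CONTROL            (pipeline flush)                <- step 4
 *	PIPE_CONTROL            (post-sync write of the seqno)  <- step 5
 *	MI_USER_INTERRUPT                                       <- step 6
 *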
 * Processing an execbuf ioctl is conceptually split up into a few phases.
 *
 * 1. Validation - Ensure all the pointers, handles and flags are valid.
 * 2. Reservation - Assign GPU address space for every object
 * 3. Relocation - Update any addresses to point to the final locations
 * 4. Serialisation - Order the request with respect to its dependencies
 * 5. Construction - Construct a request to execute the batchbuffer
 * 6. Submission (at some point in the future execution)
 *
 * Reserving resources for the execbuf is the most complicated phase. We
 * neither want to have to migrate the object in the address space, nor do
 * we want to have to update any relocations pointing to this object. Ideally,
 * we want to leave the object where it is and for all the existing relocations
 * to match. If the object is given a new address, or if userspace thinks the
 * object is elsewhere, we have to parse all the relocation entries and update
 * the addresses. Userspace can set the I915_EXEC_NO_RELOC flag to hint that
 * all the target addresses in all of its objects match the value in the
 * relocation entries and that they all match the presumed offsets given by the
 * list of execbuffer objects. Using this knowledge, we know that if we haven't
 * moved any buffers, all the relocation entries are valid and we can skip
 * the update. (If userspace is wrong, the likely outcome is an impromptu GPU
 * hang.) The requirements for using I915_EXEC_NO_RELOC are:
 *
 *      The addresses written in the objects must match the corresponding
 *      reloc.presumed_offset which in turn must match the corresponding
 *      execobject.offset.
 *
 *      Any render targets written to in the batch must be flagged with
 *      EXEC_OBJECT_WRITE.
 *
 *      To avoid stalling, execobject.offset should match the current
 *      address of that object within the active context.
 *
 * The reservation is done in multiple phases. First we try to keep any
 * object already bound at its current location - as long as it meets the
 * constraints imposed by the new execbuffer. Any object left unbound after the
 * first pass is then fitted into any available idle space. If an object does
 * not fit, all objects are removed from the reservation and the process is
 * rerun after sorting the objects into a priority order (more difficult to fit
 * objects are tried first). Failing that, the entire VM is cleared and we try
 * to fit the execbuf one last time before concluding that it simply will not
 * fit.
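 *
 * As an illustration of those requirements, a minimal userspace submission
 * honouring NO_RELOC could be built as below. The uapi structures, flags and
 * ioctl are real; the handles, addresses and relocation array are
 * hypothetical placeholders, and drmIoctl() is libdrm's ioctl wrapper::
 *
 *	struct drm_i915_gem_exec_object2 obj[2] = {
 *		{ /* a render target written by the batch */
 *			.handle = target_handle,
 *			.offset = presumed_target_addr, /* == reloc.presumed_offset */
 *			.flags  = EXEC_OBJECT_WRITE,
 *		},
 *		{ /* the batch itself, last unless I915_EXEC_BATCH_FIRST */
 *			.handle           = batch_handle,
 *			.offset           = presumed_batch_addr,
 *			.relocation_count = nreloc,
 *			.relocs_ptr       = (uintptr_t)relocs,
 *		},
 *	};
 *	struct drm_i915_gem_execbuffer2 execbuf = {
 *		.buffers_ptr  = (uintptr_t)obj,
 *		.buffer_count = 2,
 *		.flags        = I915_EXEC_RENDER | I915_EXEC_NO_RELOC,
 *	};
 *
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
 *
 * If none of the objects have moved since the offsets above were reported,
 * the kernel can skip the relocation walk entirely.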
 *
 * A small complication to all of this is that we allow userspace not only to
 * specify an alignment and a size for the object in the address space, but
 * we also allow userspace to specify the exact offset. Such objects are
 * simpler to place (the location is known a priori); all we have to do is
 * make sure the space is available.
 *
 * Once all the objects are in place, patching up the buried pointers to point
 * to the final locations is a fairly simple job of walking over the relocation
 * entry arrays, looking up the right address and rewriting the value into
 * the object. Simple! ... The relocation entries are stored in user memory
 * and so to access them we have to copy them into a local buffer. That copy
 * has to avoid taking any pagefaults as they may lead back to a GEM object
 * requiring the struct_mutex (i.e. recursive deadlock). So once again we split
 * the relocation into multiple passes. First we try to do everything within an
 * atomic context (avoid the pagefaults) which requires that we never wait. If
 * we detect that we may wait, or if we need to fault, then we have to fall
 * back to a slower path. The slowpath has to drop the mutex. (Can you hear
 * alarm bells yet?) Dropping the mutex means that we lose all the state we
 * have built up so far for the execbuf and we must reset any global data.
 * However, we do leave the objects pinned in their final locations - which is
 * a potential issue for concurrent execbufs. Once we have left the mutex, we
 * can allocate and copy all the relocation entries into a large array at our
 * leisure, reacquire the mutex, reclaim all the objects and other state and
 * then proceed to update any incorrect addresses with the objects.
 *
 * As we process the relocation entries, we maintain a record of whether the
 * object is being written to. Using NO_RELOC, we expect userspace to provide
 * this information instead. We also check whether we can skip the relocation
 * by comparing the expected value inside the relocation entry with the target's
 * final address. If they differ, we have to map the current object and rewrite
 * the 4 or 8 byte pointer within.
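 *
 * For example (with illustrative values only), a single relocation entry as
 * supplied by userspace might look like::
 *
 *	struct drm_i915_gem_relocation_entry reloc = {
 *		.target_handle   = target_handle, /* execbuf handle, or index with HANDLE_LUT */
 *		.delta           = 0x40,     /* byte offset inside the target */
 *		.offset          = 0x100,    /* where the pointer lives in this object */
 *		.presumed_offset = 0x810000, /* where userspace thinks the target is */
 *		.read_domains    = I915_GEM_DOMAIN_RENDER,
 *		.write_domain    = 0,
 *	};
 *
 * If the presumed offset turns out to be stale, relocate_entry() below writes
 * gen8_canonical_addr(target->node.start + reloc.delta) into the object at
 * byte offset 0x100, as a 4 or 8 byte value depending on the platform.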
 *
 * Serialising an execbuf is quite simple according to the rules of the GEM
 * ABI. Execution within each context is ordered by the order of submission.
 * Writes to any GEM object are in order of submission and are exclusive. Reads
 * from a GEM object are unordered with respect to other reads, but ordered by
 * writes. A write submitted after a read cannot occur before the read, and
 * similarly any read submitted after a write cannot occur before the write.
 * Writes are ordered between engines such that only one write occurs at any
 * time (completing any reads beforehand) - using semaphores where available
 * and CPU serialisation otherwise. Other GEM accesses obey the same rules;
 * any write (either via mmaps using set-domain, or via pwrite) must flush all
 * GPU reads before starting, and any read (either using set-domain or pread)
 * must flush all GPU writes before starting. (Note we only employ a barrier
 * before; we currently rely on userspace not concurrently starting a new
 * execution whilst reading or writing to an object. This may be an advantage
 * or not depending on how much you trust userspace not to shoot themselves in
 * the foot.) Serialisation may just result in the request being inserted into
 * a DAG awaiting its turn, but the simplest approach is to wait on the CPU
 * until all dependencies are resolved.
 *
 * After all of that, it is just a matter of closing the request and handing
 * it to the hardware (well, leaving it in a queue to be executed). However,
 * we also offer the ability for batchbuffers to be run with elevated
 * privileges so that they can access otherwise hidden registers. (Used to
 * adjust L3 cache etc.) Before any batch is given extra privileges we must
 * first check that it contains no nefarious instructions: each instruction
 * must be from our whitelist and all registers must also be from an allowed
 * list. We first copy the user's batchbuffer to a shadow (so that the user
 * doesn't have access to it, either by the CPU or GPU as we scan it) and then
 * parse each instruction. If everything is ok, we set a flag telling the
 * hardware to run the batchbuffer in trusted mode, otherwise the ioctl is
 * rejected.
 */

struct eb_fence {
	struct drm_syncobj *syncobj; /* Use with ptr_mask_bits() */
	struct dma_fence *dma_fence;
	u64 value;
	struct dma_fence_chain *chain_fence;
};

struct i915_execbuffer {
	struct drm_i915_private *i915; /** i915 backpointer */
	struct drm_file *file; /** per-file lookup tables and limits */
	struct drm_i915_gem_execbuffer2 *args; /** ioctl parameters */
	struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */
	struct eb_vma *vma;

	struct intel_engine_cs *engine; /** engine to queue the request to */
	struct intel_context *context; /* logical state for the request */
	struct i915_gem_context *gem_context; /** caller's context */

	struct i915_request *request; /** our request to build */
	struct eb_vma *batch; /** identity of the batch obj/vma */
	struct i915_vma *trampoline; /** trampoline used for chaining */

	/** actual size of execobj[] as we may extend it for the cmdparser */
	unsigned int buffer_count;

	/** list of vma not yet bound during reservation phase */
	struct list_head unbound;

	/** list of vma that have execobj.relocation_count */
	struct list_head relocs;

	struct i915_gem_ww_ctx ww;

	/**
	 * Track the most recently used object for relocations, as we
	 * frequently have to perform multiple relocations within the same
	 * obj/page
	 */
	struct reloc_cache {
		struct drm_mm_node node; /** temporary GTT binding */
		unsigned long vaddr; /** Current kmap address */
		unsigned long page; /** Currently mapped page index */
		unsigned int gen; /** Cached value of INTEL_GEN */
		bool use_64bit_reloc : 1;
		bool has_llc : 1;
		bool has_fence : 1;
		bool needs_unfenced : 1;

		struct i915_request *rq;
		u32 *rq_cmd;
		unsigned int rq_size;
		struct intel_gt_buffer_pool_node *pool;
	} reloc_cache;

	struct intel_gt_buffer_pool_node *reloc_pool; /** relocation pool for -EDEADLK handling */
	struct intel_context *reloc_context;

	u64 invalid_flags; /** Set of execobj.flags that are invalid */
	u32 context_flags; /** Set of execobj.flags to insert from the ctx */

	u64 batch_len; /** Length of batch within object */
	u32 batch_start_offset; /** Location within object of batch */
	u32 batch_flags; /** Flags composed for emit_bb_start() */
	struct intel_gt_buffer_pool_node *batch_pool; /** pool node for batch buffer */

	/**
	 * Indicates either the size of the hashtable used to resolve
	 * relocation handles, or if negative that we are using a direct
	 * index into the execobj[].
	 */
	int lut_size;
	struct hlist_head *buckets; /** ht for relocation handles */

	struct eb_fence *fences;
	unsigned long num_fences;
};

static int eb_parse(struct i915_execbuffer *eb);
static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb,
					  bool throttle);
static void eb_unpin_engine(struct i915_execbuffer *eb);

static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb)
{
	return intel_engine_requires_cmd_parser(eb->engine) ||
		(intel_engine_using_cmd_parser(eb->engine) &&
		 eb->args->batch_len);
}

static int eb_create(struct i915_execbuffer *eb)
{
	if (!(eb->args->flags & I915_EXEC_HANDLE_LUT)) {
		unsigned int size = 1 + ilog2(eb->buffer_count);

		/*
		 * Without a 1:1 association between relocation handles and
		 * the execobject[] index, we instead create a hashtable.
		 * We size it dynamically based on available memory, starting
		 * first with a 1:1 associative hash and scaling back until
		 * the allocation succeeds.
		 *
		 * Later on we use a positive lut_size to indicate we are
		 * using this hashtable, and a negative value to indicate a
		 * direct lookup.
		 */
		do {
			gfp_t flags;

			/* While we can still reduce the allocation size, don't
			 * raise a warning and allow the allocation to fail.
			 * On the last pass though, we want to try as hard
			 * as possible to perform the allocation and warn
			 * if it fails.
			 */
			flags = GFP_KERNEL;
			if (size > 1)
				flags |= __GFP_NORETRY | __GFP_NOWARN;

			eb->buckets = kzalloc(sizeof(struct hlist_head) << size,
					      flags);
			if (eb->buckets)
				break;
		} while (--size);

		if (unlikely(!size))
			return -ENOMEM;

		eb->lut_size = size;
	} else {
		eb->lut_size = -eb->buffer_count;
	}

	return 0;
}

static bool
eb_vma_misplaced(const struct drm_i915_gem_exec_object2 *entry,
		 const struct i915_vma *vma,
		 unsigned int flags)
{
	if (vma->node.size < entry->pad_to_size)
		return true;

	if (entry->alignment && !IS_ALIGNED(vma->node.start, entry->alignment))
		return true;

	if (flags & EXEC_OBJECT_PINNED &&
	    vma->node.start != entry->offset)
		return true;

	if (flags & __EXEC_OBJECT_NEEDS_BIAS &&
	    vma->node.start < BATCH_OFFSET_BIAS)
		return true;

	if (!(flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) &&
	    (vma->node.start + vma->node.size + 4095) >> 32)
		return true;

	if (flags & __EXEC_OBJECT_NEEDS_MAP &&
	    !i915_vma_is_map_and_fenceable(vma))
		return true;

	return false;
}

static u64 eb_pin_flags(const struct drm_i915_gem_exec_object2 *entry,
			unsigned int exec_flags)
{
	u64 pin_flags = 0;

	if (exec_flags & EXEC_OBJECT_NEEDS_GTT)
		pin_flags |= PIN_GLOBAL;

	/*
	 * Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset,
	 * limit address to the first 4GBs for unflagged objects.
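	 * Objects flagged with EXEC_OBJECT_SUPPORTS_48B_ADDRESS opt out of
	 * this workaround and may be placed anywhere in the GPU address space.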
411 */ 412 if (!(exec_flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS)) 413 pin_flags |= PIN_ZONE_4G; 414 415 if (exec_flags & __EXEC_OBJECT_NEEDS_MAP) 416 pin_flags |= PIN_MAPPABLE; 417 418 if (exec_flags & EXEC_OBJECT_PINNED) 419 pin_flags |= entry->offset | PIN_OFFSET_FIXED; 420 else if (exec_flags & __EXEC_OBJECT_NEEDS_BIAS) 421 pin_flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS; 422 423 return pin_flags; 424 } 425 426 static inline int 427 eb_pin_vma(struct i915_execbuffer *eb, 428 const struct drm_i915_gem_exec_object2 *entry, 429 struct eb_vma *ev) 430 { 431 struct i915_vma *vma = ev->vma; 432 u64 pin_flags; 433 int err; 434 435 if (vma->node.size) 436 pin_flags = vma->node.start; 437 else 438 pin_flags = entry->offset & PIN_OFFSET_MASK; 439 440 pin_flags |= PIN_USER | PIN_NOEVICT | PIN_OFFSET_FIXED; 441 if (unlikely(ev->flags & EXEC_OBJECT_NEEDS_GTT)) 442 pin_flags |= PIN_GLOBAL; 443 444 /* Attempt to reuse the current location if available */ 445 err = i915_vma_pin_ww(vma, &eb->ww, 0, 0, pin_flags); 446 if (err == -EDEADLK) 447 return err; 448 449 if (unlikely(err)) { 450 if (entry->flags & EXEC_OBJECT_PINNED) 451 return err; 452 453 /* Failing that pick any _free_ space if suitable */ 454 err = i915_vma_pin_ww(vma, &eb->ww, 455 entry->pad_to_size, 456 entry->alignment, 457 eb_pin_flags(entry, ev->flags) | 458 PIN_USER | PIN_NOEVICT); 459 if (unlikely(err)) 460 return err; 461 } 462 463 if (unlikely(ev->flags & EXEC_OBJECT_NEEDS_FENCE)) { 464 err = i915_vma_pin_fence(vma); 465 if (unlikely(err)) { 466 i915_vma_unpin(vma); 467 return err; 468 } 469 470 if (vma->fence) 471 ev->flags |= __EXEC_OBJECT_HAS_FENCE; 472 } 473 474 ev->flags |= __EXEC_OBJECT_HAS_PIN; 475 if (eb_vma_misplaced(entry, vma, ev->flags)) 476 return -EBADSLT; 477 478 return 0; 479 } 480 481 static inline void 482 eb_unreserve_vma(struct eb_vma *ev) 483 { 484 if (!(ev->flags & __EXEC_OBJECT_HAS_PIN)) 485 return; 486 487 if (unlikely(ev->flags & __EXEC_OBJECT_HAS_FENCE)) 488 __i915_vma_unpin_fence(ev->vma); 489 490 __i915_vma_unpin(ev->vma); 491 ev->flags &= ~__EXEC_OBJECT_RESERVED; 492 } 493 494 static int 495 eb_validate_vma(struct i915_execbuffer *eb, 496 struct drm_i915_gem_exec_object2 *entry, 497 struct i915_vma *vma) 498 { 499 /* Relocations are disallowed for all platforms after TGL-LP. This 500 * also covers all platforms with local memory. 501 */ 502 if (entry->relocation_count && 503 INTEL_GEN(eb->i915) >= 12 && !IS_TIGERLAKE(eb->i915)) 504 return -EINVAL; 505 506 if (unlikely(entry->flags & eb->invalid_flags)) 507 return -EINVAL; 508 509 if (unlikely(entry->alignment && 510 !is_power_of_2_u64(entry->alignment))) 511 return -EINVAL; 512 513 /* 514 * Offset can be used as input (EXEC_OBJECT_PINNED), reject 515 * any non-page-aligned or non-canonical addresses. 516 */ 517 if (unlikely(entry->flags & EXEC_OBJECT_PINNED && 518 entry->offset != gen8_canonical_addr(entry->offset & I915_GTT_PAGE_MASK))) 519 return -EINVAL; 520 521 /* pad_to_size was once a reserved field, so sanitize it */ 522 if (entry->flags & EXEC_OBJECT_PAD_TO_SIZE) { 523 if (unlikely(offset_in_page(entry->pad_to_size))) 524 return -EINVAL; 525 } else { 526 entry->pad_to_size = 0; 527 } 528 /* 529 * From drm_mm perspective address space is continuous, 530 * so from this point we're always using non-canonical 531 * form internally. 
532 */ 533 entry->offset = gen8_noncanonical_addr(entry->offset); 534 535 if (!eb->reloc_cache.has_fence) { 536 entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE; 537 } else { 538 if ((entry->flags & EXEC_OBJECT_NEEDS_FENCE || 539 eb->reloc_cache.needs_unfenced) && 540 i915_gem_object_is_tiled(vma->obj)) 541 entry->flags |= EXEC_OBJECT_NEEDS_GTT | __EXEC_OBJECT_NEEDS_MAP; 542 } 543 544 if (!(entry->flags & EXEC_OBJECT_PINNED)) 545 entry->flags |= eb->context_flags; 546 547 return 0; 548 } 549 550 static void 551 eb_add_vma(struct i915_execbuffer *eb, 552 unsigned int i, unsigned batch_idx, 553 struct i915_vma *vma) 554 { 555 struct drm_i915_gem_exec_object2 *entry = &eb->exec[i]; 556 struct eb_vma *ev = &eb->vma[i]; 557 558 ev->vma = vma; 559 ev->exec = entry; 560 ev->flags = entry->flags; 561 562 if (eb->lut_size > 0) { 563 ev->handle = entry->handle; 564 hlist_add_head(&ev->node, 565 &eb->buckets[hash_32(entry->handle, 566 eb->lut_size)]); 567 } 568 569 if (entry->relocation_count) 570 list_add_tail(&ev->reloc_link, &eb->relocs); 571 572 /* 573 * SNA is doing fancy tricks with compressing batch buffers, which leads 574 * to negative relocation deltas. Usually that works out ok since the 575 * relocate address is still positive, except when the batch is placed 576 * very low in the GTT. Ensure this doesn't happen. 577 * 578 * Note that actual hangs have only been observed on gen7, but for 579 * paranoia do it everywhere. 580 */ 581 if (i == batch_idx) { 582 if (entry->relocation_count && 583 !(ev->flags & EXEC_OBJECT_PINNED)) 584 ev->flags |= __EXEC_OBJECT_NEEDS_BIAS; 585 if (eb->reloc_cache.has_fence) 586 ev->flags |= EXEC_OBJECT_NEEDS_FENCE; 587 588 eb->batch = ev; 589 } 590 } 591 592 static inline int use_cpu_reloc(const struct reloc_cache *cache, 593 const struct drm_i915_gem_object *obj) 594 { 595 if (!i915_gem_object_has_struct_page(obj)) 596 return false; 597 598 if (DBG_FORCE_RELOC == FORCE_CPU_RELOC) 599 return true; 600 601 if (DBG_FORCE_RELOC == FORCE_GTT_RELOC) 602 return false; 603 604 return (cache->has_llc || 605 obj->cache_dirty || 606 obj->cache_level != I915_CACHE_NONE); 607 } 608 609 static int eb_reserve_vma(struct i915_execbuffer *eb, 610 struct eb_vma *ev, 611 u64 pin_flags) 612 { 613 struct drm_i915_gem_exec_object2 *entry = ev->exec; 614 struct i915_vma *vma = ev->vma; 615 int err; 616 617 if (drm_mm_node_allocated(&vma->node) && 618 eb_vma_misplaced(entry, vma, ev->flags)) { 619 err = i915_vma_unbind(vma); 620 if (err) 621 return err; 622 } 623 624 err = i915_vma_pin_ww(vma, &eb->ww, 625 entry->pad_to_size, entry->alignment, 626 eb_pin_flags(entry, ev->flags) | pin_flags); 627 if (err) 628 return err; 629 630 if (entry->offset != vma->node.start) { 631 entry->offset = vma->node.start | UPDATE; 632 eb->args->flags |= __EXEC_HAS_RELOC; 633 } 634 635 if (unlikely(ev->flags & EXEC_OBJECT_NEEDS_FENCE)) { 636 err = i915_vma_pin_fence(vma); 637 if (unlikely(err)) { 638 i915_vma_unpin(vma); 639 return err; 640 } 641 642 if (vma->fence) 643 ev->flags |= __EXEC_OBJECT_HAS_FENCE; 644 } 645 646 ev->flags |= __EXEC_OBJECT_HAS_PIN; 647 GEM_BUG_ON(eb_vma_misplaced(entry, vma, ev->flags)); 648 649 return 0; 650 } 651 652 static int eb_reserve(struct i915_execbuffer *eb) 653 { 654 const unsigned int count = eb->buffer_count; 655 unsigned int pin_flags = PIN_USER | PIN_NONBLOCK; 656 struct list_head last; 657 struct eb_vma *ev; 658 unsigned int i, pass; 659 int err = 0; 660 661 /* 662 * Attempt to pin all of the buffers into the GTT. 663 * This is done in 3 phases: 664 * 665 * 1a. 
Unbind all objects that do not match the GTT constraints for 666 * the execbuffer (fenceable, mappable, alignment etc). 667 * 1b. Increment pin count for already bound objects. 668 * 2. Bind new objects. 669 * 3. Decrement pin count. 670 * 671 * This avoid unnecessary unbinding of later objects in order to make 672 * room for the earlier objects *unless* we need to defragment. 673 */ 674 pass = 0; 675 do { 676 list_for_each_entry(ev, &eb->unbound, bind_link) { 677 err = eb_reserve_vma(eb, ev, pin_flags); 678 if (err) 679 break; 680 } 681 if (err != -ENOSPC) 682 return err; 683 684 /* Resort *all* the objects into priority order */ 685 INIT_LIST_HEAD(&eb->unbound); 686 INIT_LIST_HEAD(&last); 687 for (i = 0; i < count; i++) { 688 unsigned int flags; 689 690 ev = &eb->vma[i]; 691 flags = ev->flags; 692 if (flags & EXEC_OBJECT_PINNED && 693 flags & __EXEC_OBJECT_HAS_PIN) 694 continue; 695 696 eb_unreserve_vma(ev); 697 698 if (flags & EXEC_OBJECT_PINNED) 699 /* Pinned must have their slot */ 700 list_add(&ev->bind_link, &eb->unbound); 701 else if (flags & __EXEC_OBJECT_NEEDS_MAP) 702 /* Map require the lowest 256MiB (aperture) */ 703 list_add_tail(&ev->bind_link, &eb->unbound); 704 else if (!(flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS)) 705 /* Prioritise 4GiB region for restricted bo */ 706 list_add(&ev->bind_link, &last); 707 else 708 list_add_tail(&ev->bind_link, &last); 709 } 710 list_splice_tail(&last, &eb->unbound); 711 712 switch (pass++) { 713 case 0: 714 break; 715 716 case 1: 717 /* Too fragmented, unbind everything and retry */ 718 mutex_lock(&eb->context->vm->mutex); 719 err = i915_gem_evict_vm(eb->context->vm); 720 mutex_unlock(&eb->context->vm->mutex); 721 if (err) 722 return err; 723 break; 724 725 default: 726 return -ENOSPC; 727 } 728 729 pin_flags = PIN_USER; 730 } while (1); 731 } 732 733 static unsigned int eb_batch_index(const struct i915_execbuffer *eb) 734 { 735 if (eb->args->flags & I915_EXEC_BATCH_FIRST) 736 return 0; 737 else 738 return eb->buffer_count - 1; 739 } 740 741 static int eb_select_context(struct i915_execbuffer *eb) 742 { 743 struct i915_gem_context *ctx; 744 745 ctx = i915_gem_context_lookup(eb->file->driver_priv, eb->args->rsvd1); 746 if (unlikely(!ctx)) 747 return -ENOENT; 748 749 eb->gem_context = ctx; 750 if (rcu_access_pointer(ctx->vm)) 751 eb->invalid_flags |= EXEC_OBJECT_NEEDS_GTT; 752 753 eb->context_flags = 0; 754 if (test_bit(UCONTEXT_NO_ZEROMAP, &ctx->user_flags)) 755 eb->context_flags |= __EXEC_OBJECT_NEEDS_BIAS; 756 757 return 0; 758 } 759 760 static int __eb_add_lut(struct i915_execbuffer *eb, 761 u32 handle, struct i915_vma *vma) 762 { 763 struct i915_gem_context *ctx = eb->gem_context; 764 struct i915_lut_handle *lut; 765 int err; 766 767 lut = i915_lut_handle_alloc(); 768 if (unlikely(!lut)) 769 return -ENOMEM; 770 771 i915_vma_get(vma); 772 if (!atomic_fetch_inc(&vma->open_count)) 773 i915_vma_reopen(vma); 774 lut->handle = handle; 775 lut->ctx = ctx; 776 777 /* Check that the context hasn't been closed in the meantime */ 778 err = -EINTR; 779 if (!mutex_lock_interruptible(&ctx->lut_mutex)) { 780 struct i915_address_space *vm = rcu_access_pointer(ctx->vm); 781 782 if (unlikely(vm && vma->vm != vm)) 783 err = -EAGAIN; /* user racing with ctx set-vm */ 784 else if (likely(!i915_gem_context_is_closed(ctx))) 785 err = radix_tree_insert(&ctx->handles_vma, handle, vma); 786 else 787 err = -ENOENT; 788 if (err == 0) { /* And nor has this handle */ 789 struct drm_i915_gem_object *obj = vma->obj; 790 791 spin_lock(&obj->lut_lock); 792 if 
(idr_find(&eb->file->object_idr, handle) == obj) { 793 list_add(&lut->obj_link, &obj->lut_list); 794 } else { 795 radix_tree_delete(&ctx->handles_vma, handle); 796 err = -ENOENT; 797 } 798 spin_unlock(&obj->lut_lock); 799 } 800 mutex_unlock(&ctx->lut_mutex); 801 } 802 if (unlikely(err)) 803 goto err; 804 805 return 0; 806 807 err: 808 i915_vma_close(vma); 809 i915_vma_put(vma); 810 i915_lut_handle_free(lut); 811 return err; 812 } 813 814 static struct i915_vma *eb_lookup_vma(struct i915_execbuffer *eb, u32 handle) 815 { 816 struct i915_address_space *vm = eb->context->vm; 817 818 do { 819 struct drm_i915_gem_object *obj; 820 struct i915_vma *vma; 821 int err; 822 823 rcu_read_lock(); 824 vma = radix_tree_lookup(&eb->gem_context->handles_vma, handle); 825 if (likely(vma && vma->vm == vm)) 826 vma = i915_vma_tryget(vma); 827 rcu_read_unlock(); 828 if (likely(vma)) 829 return vma; 830 831 obj = i915_gem_object_lookup(eb->file, handle); 832 if (unlikely(!obj)) 833 return ERR_PTR(-ENOENT); 834 835 vma = i915_vma_instance(obj, vm, NULL); 836 if (IS_ERR(vma)) { 837 i915_gem_object_put(obj); 838 return vma; 839 } 840 841 err = __eb_add_lut(eb, handle, vma); 842 if (likely(!err)) 843 return vma; 844 845 i915_gem_object_put(obj); 846 if (err != -EEXIST) 847 return ERR_PTR(err); 848 } while (1); 849 } 850 851 static int eb_lookup_vmas(struct i915_execbuffer *eb) 852 { 853 struct drm_i915_private *i915 = eb->i915; 854 unsigned int batch = eb_batch_index(eb); 855 unsigned int i; 856 int err = 0; 857 858 INIT_LIST_HEAD(&eb->relocs); 859 860 for (i = 0; i < eb->buffer_count; i++) { 861 struct i915_vma *vma; 862 863 vma = eb_lookup_vma(eb, eb->exec[i].handle); 864 if (IS_ERR(vma)) { 865 err = PTR_ERR(vma); 866 goto err; 867 } 868 869 err = eb_validate_vma(eb, &eb->exec[i], vma); 870 if (unlikely(err)) { 871 i915_vma_put(vma); 872 goto err; 873 } 874 875 eb_add_vma(eb, i, batch, vma); 876 877 if (i915_gem_object_is_userptr(vma->obj)) { 878 err = i915_gem_object_userptr_submit_init(vma->obj); 879 if (err) { 880 if (i + 1 < eb->buffer_count) { 881 /* 882 * Execbuffer code expects last vma entry to be NULL, 883 * since we already initialized this entry, 884 * set the next value to NULL or we mess up 885 * cleanup handling. 886 */ 887 eb->vma[i + 1].vma = NULL; 888 } 889 890 return err; 891 } 892 893 eb->vma[i].flags |= __EXEC_OBJECT_USERPTR_INIT; 894 eb->args->flags |= __EXEC_USERPTR_USED; 895 } 896 } 897 898 if (unlikely(eb->batch->flags & EXEC_OBJECT_WRITE)) { 899 drm_dbg(&i915->drm, 900 "Attempting to use self-modifying batch buffer\n"); 901 return -EINVAL; 902 } 903 904 if (range_overflows_t(u64, 905 eb->batch_start_offset, eb->batch_len, 906 eb->batch->vma->size)) { 907 drm_dbg(&i915->drm, "Attempting to use out-of-bounds batch\n"); 908 return -EINVAL; 909 } 910 911 if (eb->batch_len == 0) 912 eb->batch_len = eb->batch->vma->size - eb->batch_start_offset; 913 if (unlikely(eb->batch_len == 0)) { /* impossible! 
*/ 914 drm_dbg(&i915->drm, "Invalid batch length\n"); 915 return -EINVAL; 916 } 917 918 return 0; 919 920 err: 921 eb->vma[i].vma = NULL; 922 return err; 923 } 924 925 static int eb_validate_vmas(struct i915_execbuffer *eb) 926 { 927 unsigned int i; 928 int err; 929 930 INIT_LIST_HEAD(&eb->unbound); 931 932 for (i = 0; i < eb->buffer_count; i++) { 933 struct drm_i915_gem_exec_object2 *entry = &eb->exec[i]; 934 struct eb_vma *ev = &eb->vma[i]; 935 struct i915_vma *vma = ev->vma; 936 937 err = i915_gem_object_lock(vma->obj, &eb->ww); 938 if (err) 939 return err; 940 941 err = eb_pin_vma(eb, entry, ev); 942 if (err == -EDEADLK) 943 return err; 944 945 if (!err) { 946 if (entry->offset != vma->node.start) { 947 entry->offset = vma->node.start | UPDATE; 948 eb->args->flags |= __EXEC_HAS_RELOC; 949 } 950 } else { 951 eb_unreserve_vma(ev); 952 953 list_add_tail(&ev->bind_link, &eb->unbound); 954 if (drm_mm_node_allocated(&vma->node)) { 955 err = i915_vma_unbind(vma); 956 if (err) 957 return err; 958 } 959 } 960 961 if (!(ev->flags & EXEC_OBJECT_WRITE)) { 962 err = dma_resv_reserve_shared(vma->resv, 1); 963 if (err) 964 return err; 965 } 966 967 GEM_BUG_ON(drm_mm_node_allocated(&vma->node) && 968 eb_vma_misplaced(&eb->exec[i], vma, ev->flags)); 969 } 970 971 if (!list_empty(&eb->unbound)) 972 return eb_reserve(eb); 973 974 return 0; 975 } 976 977 static struct eb_vma * 978 eb_get_vma(const struct i915_execbuffer *eb, unsigned long handle) 979 { 980 if (eb->lut_size < 0) { 981 if (handle >= -eb->lut_size) 982 return NULL; 983 return &eb->vma[handle]; 984 } else { 985 struct hlist_head *head; 986 struct eb_vma *ev; 987 988 head = &eb->buckets[hash_32(handle, eb->lut_size)]; 989 hlist_for_each_entry(ev, head, node) { 990 if (ev->handle == handle) 991 return ev; 992 } 993 return NULL; 994 } 995 } 996 997 static void eb_release_vmas(struct i915_execbuffer *eb, bool final, bool release_userptr) 998 { 999 const unsigned int count = eb->buffer_count; 1000 unsigned int i; 1001 1002 for (i = 0; i < count; i++) { 1003 struct eb_vma *ev = &eb->vma[i]; 1004 struct i915_vma *vma = ev->vma; 1005 1006 if (!vma) 1007 break; 1008 1009 eb_unreserve_vma(ev); 1010 1011 if (release_userptr && ev->flags & __EXEC_OBJECT_USERPTR_INIT) { 1012 ev->flags &= ~__EXEC_OBJECT_USERPTR_INIT; 1013 i915_gem_object_userptr_submit_fini(vma->obj); 1014 } 1015 1016 if (final) 1017 i915_vma_put(vma); 1018 } 1019 1020 eb_unpin_engine(eb); 1021 } 1022 1023 static void eb_destroy(const struct i915_execbuffer *eb) 1024 { 1025 GEM_BUG_ON(eb->reloc_cache.rq); 1026 1027 if (eb->lut_size > 0) 1028 kfree(eb->buckets); 1029 } 1030 1031 static inline u64 1032 relocation_target(const struct drm_i915_gem_relocation_entry *reloc, 1033 const struct i915_vma *target) 1034 { 1035 return gen8_canonical_addr((int)reloc->delta + target->node.start); 1036 } 1037 1038 static void reloc_cache_clear(struct reloc_cache *cache) 1039 { 1040 cache->rq = NULL; 1041 cache->rq_cmd = NULL; 1042 cache->pool = NULL; 1043 cache->rq_size = 0; 1044 } 1045 1046 static void reloc_cache_init(struct reloc_cache *cache, 1047 struct drm_i915_private *i915) 1048 { 1049 cache->page = -1; 1050 cache->vaddr = 0; 1051 /* Must be a variable in the struct to allow GCC to unroll. 
*/ 1052 cache->gen = INTEL_GEN(i915); 1053 cache->has_llc = HAS_LLC(i915); 1054 cache->use_64bit_reloc = HAS_64BIT_RELOC(i915); 1055 cache->has_fence = cache->gen < 4; 1056 cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment; 1057 cache->node.flags = 0; 1058 reloc_cache_clear(cache); 1059 } 1060 1061 static inline void *unmask_page(unsigned long p) 1062 { 1063 return (void *)(uintptr_t)(p & PAGE_MASK); 1064 } 1065 1066 static inline unsigned int unmask_flags(unsigned long p) 1067 { 1068 return p & ~PAGE_MASK; 1069 } 1070 1071 #define KMAP 0x4 /* after CLFLUSH_FLAGS */ 1072 1073 static inline struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache) 1074 { 1075 struct drm_i915_private *i915 = 1076 container_of(cache, struct i915_execbuffer, reloc_cache)->i915; 1077 return &i915->ggtt; 1078 } 1079 1080 static void reloc_cache_put_pool(struct i915_execbuffer *eb, struct reloc_cache *cache) 1081 { 1082 if (!cache->pool) 1083 return; 1084 1085 /* 1086 * This is a bit nasty, normally we keep objects locked until the end 1087 * of execbuffer, but we already submit this, and have to unlock before 1088 * dropping the reference. Fortunately we can only hold 1 pool node at 1089 * a time, so this should be harmless. 1090 */ 1091 i915_gem_ww_unlock_single(cache->pool->obj); 1092 intel_gt_buffer_pool_put(cache->pool); 1093 cache->pool = NULL; 1094 } 1095 1096 static void reloc_gpu_flush(struct i915_execbuffer *eb, struct reloc_cache *cache) 1097 { 1098 struct drm_i915_gem_object *obj = cache->rq->batch->obj; 1099 1100 GEM_BUG_ON(cache->rq_size >= obj->base.size / sizeof(u32)); 1101 cache->rq_cmd[cache->rq_size] = MI_BATCH_BUFFER_END; 1102 1103 i915_gem_object_flush_map(obj); 1104 i915_gem_object_unpin_map(obj); 1105 1106 intel_gt_chipset_flush(cache->rq->engine->gt); 1107 1108 i915_request_add(cache->rq); 1109 reloc_cache_put_pool(eb, cache); 1110 reloc_cache_clear(cache); 1111 1112 eb->reloc_pool = NULL; 1113 } 1114 1115 static void reloc_cache_reset(struct reloc_cache *cache, struct i915_execbuffer *eb) 1116 { 1117 void *vaddr; 1118 1119 if (cache->rq) 1120 reloc_gpu_flush(eb, cache); 1121 1122 if (!cache->vaddr) 1123 return; 1124 1125 vaddr = unmask_page(cache->vaddr); 1126 if (cache->vaddr & KMAP) { 1127 struct drm_i915_gem_object *obj = 1128 (struct drm_i915_gem_object *)cache->node.mm; 1129 if (cache->vaddr & CLFLUSH_AFTER) 1130 mb(); 1131 1132 kunmap_atomic(vaddr); 1133 i915_gem_object_finish_access(obj); 1134 } else { 1135 struct i915_ggtt *ggtt = cache_to_ggtt(cache); 1136 1137 intel_gt_flush_ggtt_writes(ggtt->vm.gt); 1138 io_mapping_unmap_atomic((void __iomem *)vaddr); 1139 1140 if (drm_mm_node_allocated(&cache->node)) { 1141 ggtt->vm.clear_range(&ggtt->vm, 1142 cache->node.start, 1143 cache->node.size); 1144 mutex_lock(&ggtt->vm.mutex); 1145 drm_mm_remove_node(&cache->node); 1146 mutex_unlock(&ggtt->vm.mutex); 1147 } else { 1148 i915_vma_unpin((struct i915_vma *)cache->node.mm); 1149 } 1150 } 1151 1152 cache->vaddr = 0; 1153 cache->page = -1; 1154 } 1155 1156 static void *reloc_kmap(struct drm_i915_gem_object *obj, 1157 struct reloc_cache *cache, 1158 unsigned long pageno) 1159 { 1160 void *vaddr; 1161 struct page *page; 1162 1163 if (cache->vaddr) { 1164 kunmap_atomic(unmask_page(cache->vaddr)); 1165 } else { 1166 unsigned int flushes; 1167 int err; 1168 1169 err = i915_gem_object_prepare_write(obj, &flushes); 1170 if (err) 1171 return ERR_PTR(err); 1172 1173 BUILD_BUG_ON(KMAP & CLFLUSH_FLAGS); 1174 BUILD_BUG_ON((KMAP | CLFLUSH_FLAGS) & PAGE_MASK); 1175 1176 cache->vaddr = 
flushes | KMAP; 1177 cache->node.mm = (void *)obj; 1178 if (flushes) 1179 mb(); 1180 } 1181 1182 page = i915_gem_object_get_page(obj, pageno); 1183 if (!obj->mm.dirty) 1184 set_page_dirty(page); 1185 1186 vaddr = kmap_atomic(page); 1187 cache->vaddr = unmask_flags(cache->vaddr) | (unsigned long)vaddr; 1188 cache->page = pageno; 1189 1190 return vaddr; 1191 } 1192 1193 static void *reloc_iomap(struct drm_i915_gem_object *obj, 1194 struct i915_execbuffer *eb, 1195 unsigned long page) 1196 { 1197 struct reloc_cache *cache = &eb->reloc_cache; 1198 struct i915_ggtt *ggtt = cache_to_ggtt(cache); 1199 unsigned long offset; 1200 void *vaddr; 1201 1202 if (cache->vaddr) { 1203 intel_gt_flush_ggtt_writes(ggtt->vm.gt); 1204 io_mapping_unmap_atomic((void __force __iomem *) unmask_page(cache->vaddr)); 1205 } else { 1206 struct i915_vma *vma; 1207 int err; 1208 1209 if (i915_gem_object_is_tiled(obj)) 1210 return ERR_PTR(-EINVAL); 1211 1212 if (use_cpu_reloc(cache, obj)) 1213 return NULL; 1214 1215 err = i915_gem_object_set_to_gtt_domain(obj, true); 1216 if (err) 1217 return ERR_PTR(err); 1218 1219 vma = i915_gem_object_ggtt_pin_ww(obj, &eb->ww, NULL, 0, 0, 1220 PIN_MAPPABLE | 1221 PIN_NONBLOCK /* NOWARN */ | 1222 PIN_NOEVICT); 1223 if (vma == ERR_PTR(-EDEADLK)) 1224 return vma; 1225 1226 if (IS_ERR(vma)) { 1227 memset(&cache->node, 0, sizeof(cache->node)); 1228 mutex_lock(&ggtt->vm.mutex); 1229 err = drm_mm_insert_node_in_range 1230 (&ggtt->vm.mm, &cache->node, 1231 PAGE_SIZE, 0, I915_COLOR_UNEVICTABLE, 1232 0, ggtt->mappable_end, 1233 DRM_MM_INSERT_LOW); 1234 mutex_unlock(&ggtt->vm.mutex); 1235 if (err) /* no inactive aperture space, use cpu reloc */ 1236 return NULL; 1237 } else { 1238 cache->node.start = vma->node.start; 1239 cache->node.mm = (void *)vma; 1240 } 1241 } 1242 1243 offset = cache->node.start; 1244 if (drm_mm_node_allocated(&cache->node)) { 1245 ggtt->vm.insert_page(&ggtt->vm, 1246 i915_gem_object_get_dma_address(obj, page), 1247 offset, I915_CACHE_NONE, 0); 1248 } else { 1249 offset += page << PAGE_SHIFT; 1250 } 1251 1252 vaddr = (void __force *)io_mapping_map_atomic_wc(&ggtt->iomap, 1253 offset); 1254 cache->page = page; 1255 cache->vaddr = (unsigned long)vaddr; 1256 1257 return vaddr; 1258 } 1259 1260 static void *reloc_vaddr(struct drm_i915_gem_object *obj, 1261 struct i915_execbuffer *eb, 1262 unsigned long page) 1263 { 1264 struct reloc_cache *cache = &eb->reloc_cache; 1265 void *vaddr; 1266 1267 if (cache->page == page) { 1268 vaddr = unmask_page(cache->vaddr); 1269 } else { 1270 vaddr = NULL; 1271 if ((cache->vaddr & KMAP) == 0) 1272 vaddr = reloc_iomap(obj, eb, page); 1273 if (!vaddr) 1274 vaddr = reloc_kmap(obj, cache, page); 1275 } 1276 1277 return vaddr; 1278 } 1279 1280 static void clflush_write32(u32 *addr, u32 value, unsigned int flushes) 1281 { 1282 if (unlikely(flushes & (CLFLUSH_BEFORE | CLFLUSH_AFTER))) { 1283 if (flushes & CLFLUSH_BEFORE) { 1284 clflushopt(addr); 1285 mb(); 1286 } 1287 1288 *addr = value; 1289 1290 /* 1291 * Writes to the same cacheline are serialised by the CPU 1292 * (including clflush). On the write path, we only require 1293 * that it hits memory in an orderly fashion and place 1294 * mb barriers at the start and end of the relocation phase 1295 * to ensure ordering of clflush wrt to the system. 
1296 */ 1297 if (flushes & CLFLUSH_AFTER) 1298 clflushopt(addr); 1299 } else 1300 *addr = value; 1301 } 1302 1303 static int reloc_move_to_gpu(struct i915_request *rq, struct i915_vma *vma) 1304 { 1305 struct drm_i915_gem_object *obj = vma->obj; 1306 int err; 1307 1308 assert_vma_held(vma); 1309 1310 if (obj->cache_dirty & ~obj->cache_coherent) 1311 i915_gem_clflush_object(obj, 0); 1312 obj->write_domain = 0; 1313 1314 err = i915_request_await_object(rq, vma->obj, true); 1315 if (err == 0) 1316 err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE); 1317 1318 return err; 1319 } 1320 1321 static int __reloc_gpu_alloc(struct i915_execbuffer *eb, 1322 struct intel_engine_cs *engine, 1323 struct i915_vma *vma, 1324 unsigned int len) 1325 { 1326 struct reloc_cache *cache = &eb->reloc_cache; 1327 struct intel_gt_buffer_pool_node *pool = eb->reloc_pool; 1328 struct i915_request *rq; 1329 struct i915_vma *batch; 1330 u32 *cmd; 1331 int err; 1332 1333 if (!pool) { 1334 pool = intel_gt_get_buffer_pool(engine->gt, PAGE_SIZE, 1335 cache->has_llc ? 1336 I915_MAP_WB : 1337 I915_MAP_WC); 1338 if (IS_ERR(pool)) 1339 return PTR_ERR(pool); 1340 } 1341 eb->reloc_pool = NULL; 1342 1343 err = i915_gem_object_lock(pool->obj, &eb->ww); 1344 if (err) 1345 goto err_pool; 1346 1347 cmd = i915_gem_object_pin_map(pool->obj, pool->type); 1348 if (IS_ERR(cmd)) { 1349 err = PTR_ERR(cmd); 1350 goto err_pool; 1351 } 1352 intel_gt_buffer_pool_mark_used(pool); 1353 1354 memset32(cmd, 0, pool->obj->base.size / sizeof(u32)); 1355 1356 batch = i915_vma_instance(pool->obj, vma->vm, NULL); 1357 if (IS_ERR(batch)) { 1358 err = PTR_ERR(batch); 1359 goto err_unmap; 1360 } 1361 1362 err = i915_vma_pin_ww(batch, &eb->ww, 0, 0, PIN_USER | PIN_NONBLOCK); 1363 if (err) 1364 goto err_unmap; 1365 1366 if (engine == eb->context->engine) { 1367 rq = i915_request_create(eb->context); 1368 } else { 1369 struct intel_context *ce = eb->reloc_context; 1370 1371 if (!ce) { 1372 ce = intel_context_create(engine); 1373 if (IS_ERR(ce)) { 1374 err = PTR_ERR(ce); 1375 goto err_unpin; 1376 } 1377 1378 i915_vm_put(ce->vm); 1379 ce->vm = i915_vm_get(eb->context->vm); 1380 eb->reloc_context = ce; 1381 } 1382 1383 err = intel_context_pin_ww(ce, &eb->ww); 1384 if (err) 1385 goto err_unpin; 1386 1387 rq = i915_request_create(ce); 1388 intel_context_unpin(ce); 1389 } 1390 if (IS_ERR(rq)) { 1391 err = PTR_ERR(rq); 1392 goto err_unpin; 1393 } 1394 1395 err = intel_gt_buffer_pool_mark_active(pool, rq); 1396 if (err) 1397 goto err_request; 1398 1399 err = reloc_move_to_gpu(rq, vma); 1400 if (err) 1401 goto err_request; 1402 1403 err = eb->engine->emit_bb_start(rq, 1404 batch->node.start, PAGE_SIZE, 1405 cache->gen > 5 ? 
0 : I915_DISPATCH_SECURE); 1406 if (err) 1407 goto skip_request; 1408 1409 assert_vma_held(batch); 1410 err = i915_request_await_object(rq, batch->obj, false); 1411 if (err == 0) 1412 err = i915_vma_move_to_active(batch, rq, 0); 1413 if (err) 1414 goto skip_request; 1415 1416 rq->batch = batch; 1417 i915_vma_unpin(batch); 1418 1419 cache->rq = rq; 1420 cache->rq_cmd = cmd; 1421 cache->rq_size = 0; 1422 cache->pool = pool; 1423 1424 /* Return with batch mapping (cmd) still pinned */ 1425 return 0; 1426 1427 skip_request: 1428 i915_request_set_error_once(rq, err); 1429 err_request: 1430 i915_request_add(rq); 1431 err_unpin: 1432 i915_vma_unpin(batch); 1433 err_unmap: 1434 i915_gem_object_unpin_map(pool->obj); 1435 err_pool: 1436 eb->reloc_pool = pool; 1437 return err; 1438 } 1439 1440 static bool reloc_can_use_engine(const struct intel_engine_cs *engine) 1441 { 1442 return engine->class != VIDEO_DECODE_CLASS || !IS_GEN(engine->i915, 6); 1443 } 1444 1445 static u32 *reloc_gpu(struct i915_execbuffer *eb, 1446 struct i915_vma *vma, 1447 unsigned int len) 1448 { 1449 struct reloc_cache *cache = &eb->reloc_cache; 1450 u32 *cmd; 1451 1452 if (cache->rq_size > PAGE_SIZE/sizeof(u32) - (len + 1)) 1453 reloc_gpu_flush(eb, cache); 1454 1455 if (unlikely(!cache->rq)) { 1456 int err; 1457 struct intel_engine_cs *engine = eb->engine; 1458 1459 if (!reloc_can_use_engine(engine)) { 1460 engine = engine->gt->engine_class[COPY_ENGINE_CLASS][0]; 1461 if (!engine) 1462 return ERR_PTR(-ENODEV); 1463 } 1464 1465 err = __reloc_gpu_alloc(eb, engine, vma, len); 1466 if (unlikely(err)) 1467 return ERR_PTR(err); 1468 } 1469 1470 cmd = cache->rq_cmd + cache->rq_size; 1471 cache->rq_size += len; 1472 1473 return cmd; 1474 } 1475 1476 static inline bool use_reloc_gpu(struct i915_vma *vma) 1477 { 1478 if (DBG_FORCE_RELOC == FORCE_GPU_RELOC) 1479 return true; 1480 1481 if (DBG_FORCE_RELOC) 1482 return false; 1483 1484 return !dma_resv_test_signaled_rcu(vma->resv, true); 1485 } 1486 1487 static unsigned long vma_phys_addr(struct i915_vma *vma, u32 offset) 1488 { 1489 struct page *page; 1490 unsigned long addr; 1491 1492 GEM_BUG_ON(vma->pages != vma->obj->mm.pages); 1493 1494 page = i915_gem_object_get_page(vma->obj, offset >> PAGE_SHIFT); 1495 addr = PFN_PHYS(page_to_pfn(page)); 1496 GEM_BUG_ON(overflows_type(addr, u32)); /* expected dma32 */ 1497 1498 return addr + offset_in_page(offset); 1499 } 1500 1501 static int __reloc_entry_gpu(struct i915_execbuffer *eb, 1502 struct i915_vma *vma, 1503 u64 offset, 1504 u64 target_addr) 1505 { 1506 const unsigned int gen = eb->reloc_cache.gen; 1507 unsigned int len; 1508 u32 *batch; 1509 u64 addr; 1510 1511 if (gen >= 8) 1512 len = offset & 7 ? 
8 : 5; 1513 else if (gen >= 4) 1514 len = 4; 1515 else 1516 len = 3; 1517 1518 batch = reloc_gpu(eb, vma, len); 1519 if (batch == ERR_PTR(-EDEADLK)) 1520 return -EDEADLK; 1521 else if (IS_ERR(batch)) 1522 return false; 1523 1524 addr = gen8_canonical_addr(vma->node.start + offset); 1525 if (gen >= 8) { 1526 if (offset & 7) { 1527 *batch++ = MI_STORE_DWORD_IMM_GEN4; 1528 *batch++ = lower_32_bits(addr); 1529 *batch++ = upper_32_bits(addr); 1530 *batch++ = lower_32_bits(target_addr); 1531 1532 addr = gen8_canonical_addr(addr + 4); 1533 1534 *batch++ = MI_STORE_DWORD_IMM_GEN4; 1535 *batch++ = lower_32_bits(addr); 1536 *batch++ = upper_32_bits(addr); 1537 *batch++ = upper_32_bits(target_addr); 1538 } else { 1539 *batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1; 1540 *batch++ = lower_32_bits(addr); 1541 *batch++ = upper_32_bits(addr); 1542 *batch++ = lower_32_bits(target_addr); 1543 *batch++ = upper_32_bits(target_addr); 1544 } 1545 } else if (gen >= 6) { 1546 *batch++ = MI_STORE_DWORD_IMM_GEN4; 1547 *batch++ = 0; 1548 *batch++ = addr; 1549 *batch++ = target_addr; 1550 } else if (IS_I965G(eb->i915)) { 1551 *batch++ = MI_STORE_DWORD_IMM_GEN4; 1552 *batch++ = 0; 1553 *batch++ = vma_phys_addr(vma, offset); 1554 *batch++ = target_addr; 1555 } else if (gen >= 4) { 1556 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 1557 *batch++ = 0; 1558 *batch++ = addr; 1559 *batch++ = target_addr; 1560 } else if (gen >= 3 && 1561 !(IS_I915G(eb->i915) || IS_I915GM(eb->i915))) { 1562 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; 1563 *batch++ = addr; 1564 *batch++ = target_addr; 1565 } else { 1566 *batch++ = MI_STORE_DWORD_IMM; 1567 *batch++ = vma_phys_addr(vma, offset); 1568 *batch++ = target_addr; 1569 } 1570 1571 return true; 1572 } 1573 1574 static int reloc_entry_gpu(struct i915_execbuffer *eb, 1575 struct i915_vma *vma, 1576 u64 offset, 1577 u64 target_addr) 1578 { 1579 if (eb->reloc_cache.vaddr) 1580 return false; 1581 1582 if (!use_reloc_gpu(vma)) 1583 return false; 1584 1585 return __reloc_entry_gpu(eb, vma, offset, target_addr); 1586 } 1587 1588 static u64 1589 relocate_entry(struct i915_vma *vma, 1590 const struct drm_i915_gem_relocation_entry *reloc, 1591 struct i915_execbuffer *eb, 1592 const struct i915_vma *target) 1593 { 1594 u64 target_addr = relocation_target(reloc, target); 1595 u64 offset = reloc->offset; 1596 int reloc_gpu = reloc_entry_gpu(eb, vma, offset, target_addr); 1597 1598 if (reloc_gpu < 0) 1599 return reloc_gpu; 1600 1601 if (!reloc_gpu) { 1602 bool wide = eb->reloc_cache.use_64bit_reloc; 1603 void *vaddr; 1604 1605 repeat: 1606 vaddr = reloc_vaddr(vma->obj, eb, 1607 offset >> PAGE_SHIFT); 1608 if (IS_ERR(vaddr)) 1609 return PTR_ERR(vaddr); 1610 1611 GEM_BUG_ON(!IS_ALIGNED(offset, sizeof(u32))); 1612 clflush_write32(vaddr + offset_in_page(offset), 1613 lower_32_bits(target_addr), 1614 eb->reloc_cache.vaddr); 1615 1616 if (wide) { 1617 offset += sizeof(u32); 1618 target_addr >>= 32; 1619 wide = false; 1620 goto repeat; 1621 } 1622 } 1623 1624 return target->node.start | UPDATE; 1625 } 1626 1627 static u64 1628 eb_relocate_entry(struct i915_execbuffer *eb, 1629 struct eb_vma *ev, 1630 const struct drm_i915_gem_relocation_entry *reloc) 1631 { 1632 struct drm_i915_private *i915 = eb->i915; 1633 struct eb_vma *target; 1634 int err; 1635 1636 /* we've already hold a reference to all valid objects */ 1637 target = eb_get_vma(eb, reloc->target_handle); 1638 if (unlikely(!target)) 1639 return -ENOENT; 1640 1641 /* Validate that the target is in a valid r/w GPU domain */ 1642 if 
(unlikely(reloc->write_domain & (reloc->write_domain - 1))) { 1643 drm_dbg(&i915->drm, "reloc with multiple write domains: " 1644 "target %d offset %d " 1645 "read %08x write %08x", 1646 reloc->target_handle, 1647 (int) reloc->offset, 1648 reloc->read_domains, 1649 reloc->write_domain); 1650 return -EINVAL; 1651 } 1652 if (unlikely((reloc->write_domain | reloc->read_domains) 1653 & ~I915_GEM_GPU_DOMAINS)) { 1654 drm_dbg(&i915->drm, "reloc with read/write non-GPU domains: " 1655 "target %d offset %d " 1656 "read %08x write %08x", 1657 reloc->target_handle, 1658 (int) reloc->offset, 1659 reloc->read_domains, 1660 reloc->write_domain); 1661 return -EINVAL; 1662 } 1663 1664 if (reloc->write_domain) { 1665 target->flags |= EXEC_OBJECT_WRITE; 1666 1667 /* 1668 * Sandybridge PPGTT errata: We need a global gtt mapping 1669 * for MI and pipe_control writes because the gpu doesn't 1670 * properly redirect them through the ppgtt for non_secure 1671 * batchbuffers. 1672 */ 1673 if (reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION && 1674 IS_GEN(eb->i915, 6)) { 1675 err = i915_vma_bind(target->vma, 1676 target->vma->obj->cache_level, 1677 PIN_GLOBAL, NULL); 1678 if (err) 1679 return err; 1680 } 1681 } 1682 1683 /* 1684 * If the relocation already has the right value in it, no 1685 * more work needs to be done. 1686 */ 1687 if (!DBG_FORCE_RELOC && 1688 gen8_canonical_addr(target->vma->node.start) == reloc->presumed_offset) 1689 return 0; 1690 1691 /* Check that the relocation address is valid... */ 1692 if (unlikely(reloc->offset > 1693 ev->vma->size - (eb->reloc_cache.use_64bit_reloc ? 8 : 4))) { 1694 drm_dbg(&i915->drm, "Relocation beyond object bounds: " 1695 "target %d offset %d size %d.\n", 1696 reloc->target_handle, 1697 (int)reloc->offset, 1698 (int)ev->vma->size); 1699 return -EINVAL; 1700 } 1701 if (unlikely(reloc->offset & 3)) { 1702 drm_dbg(&i915->drm, "Relocation not 4-byte aligned: " 1703 "target %d offset %d.\n", 1704 reloc->target_handle, 1705 (int)reloc->offset); 1706 return -EINVAL; 1707 } 1708 1709 /* 1710 * If we write into the object, we need to force the synchronisation 1711 * barrier, either with an asynchronous clflush or if we executed the 1712 * patching using the GPU (though that should be serialised by the 1713 * timeline). To be completely sure, and since we are required to 1714 * do relocations we are already stalling, disable the user's opt 1715 * out of our synchronisation. 1716 */ 1717 ev->flags &= ~EXEC_OBJECT_ASYNC; 1718 1719 /* and update the user's relocation entry */ 1720 return relocate_entry(ev->vma, reloc, eb, target->vma); 1721 } 1722 1723 static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev) 1724 { 1725 #define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry)) 1726 struct drm_i915_gem_relocation_entry stack[N_RELOC(512)]; 1727 const struct drm_i915_gem_exec_object2 *entry = ev->exec; 1728 struct drm_i915_gem_relocation_entry __user *urelocs = 1729 u64_to_user_ptr(entry->relocs_ptr); 1730 unsigned long remain = entry->relocation_count; 1731 1732 if (unlikely(remain > N_RELOC(ULONG_MAX))) 1733 return -EINVAL; 1734 1735 /* 1736 * We must check that the entire relocation array is safe 1737 * to read. However, if the array is not writable the user loses 1738 * the updated relocation values. 
1739 */ 1740 if (unlikely(!access_ok(urelocs, remain * sizeof(*urelocs)))) 1741 return -EFAULT; 1742 1743 do { 1744 struct drm_i915_gem_relocation_entry *r = stack; 1745 unsigned int count = 1746 min_t(unsigned long, remain, ARRAY_SIZE(stack)); 1747 unsigned int copied; 1748 1749 /* 1750 * This is the fast path and we cannot handle a pagefault 1751 * whilst holding the struct mutex lest the user pass in the 1752 * relocations contained within a mmaped bo. For in such a case 1753 * we, the page fault handler would call i915_gem_fault() and 1754 * we would try to acquire the struct mutex again. Obviously 1755 * this is bad and so lockdep complains vehemently. 1756 */ 1757 pagefault_disable(); 1758 copied = __copy_from_user_inatomic(r, urelocs, count * sizeof(r[0])); 1759 pagefault_enable(); 1760 if (unlikely(copied)) { 1761 remain = -EFAULT; 1762 goto out; 1763 } 1764 1765 remain -= count; 1766 do { 1767 u64 offset = eb_relocate_entry(eb, ev, r); 1768 1769 if (likely(offset == 0)) { 1770 } else if ((s64)offset < 0) { 1771 remain = (int)offset; 1772 goto out; 1773 } else { 1774 /* 1775 * Note that reporting an error now 1776 * leaves everything in an inconsistent 1777 * state as we have *already* changed 1778 * the relocation value inside the 1779 * object. As we have not changed the 1780 * reloc.presumed_offset or will not 1781 * change the execobject.offset, on the 1782 * call we may not rewrite the value 1783 * inside the object, leaving it 1784 * dangling and causing a GPU hang. Unless 1785 * userspace dynamically rebuilds the 1786 * relocations on each execbuf rather than 1787 * presume a static tree. 1788 * 1789 * We did previously check if the relocations 1790 * were writable (access_ok), an error now 1791 * would be a strange race with mprotect, 1792 * having already demonstrated that we 1793 * can read from this userspace address. 
1794 */ 1795 offset = gen8_canonical_addr(offset & ~UPDATE); 1796 __put_user(offset, 1797 &urelocs[r - stack].presumed_offset); 1798 } 1799 } while (r++, --count); 1800 urelocs += ARRAY_SIZE(stack); 1801 } while (remain); 1802 out: 1803 reloc_cache_reset(&eb->reloc_cache, eb); 1804 return remain; 1805 } 1806 1807 static int 1808 eb_relocate_vma_slow(struct i915_execbuffer *eb, struct eb_vma *ev) 1809 { 1810 const struct drm_i915_gem_exec_object2 *entry = ev->exec; 1811 struct drm_i915_gem_relocation_entry *relocs = 1812 u64_to_ptr(typeof(*relocs), entry->relocs_ptr); 1813 unsigned int i; 1814 int err; 1815 1816 for (i = 0; i < entry->relocation_count; i++) { 1817 u64 offset = eb_relocate_entry(eb, ev, &relocs[i]); 1818 1819 if ((s64)offset < 0) { 1820 err = (int)offset; 1821 goto err; 1822 } 1823 } 1824 err = 0; 1825 err: 1826 reloc_cache_reset(&eb->reloc_cache, eb); 1827 return err; 1828 } 1829 1830 static int check_relocations(const struct drm_i915_gem_exec_object2 *entry) 1831 { 1832 const char __user *addr, *end; 1833 unsigned long size; 1834 char __maybe_unused c; 1835 1836 size = entry->relocation_count; 1837 if (size == 0) 1838 return 0; 1839 1840 if (size > N_RELOC(ULONG_MAX)) 1841 return -EINVAL; 1842 1843 addr = u64_to_user_ptr(entry->relocs_ptr); 1844 size *= sizeof(struct drm_i915_gem_relocation_entry); 1845 if (!access_ok(addr, size)) 1846 return -EFAULT; 1847 1848 end = addr + size; 1849 for (; addr < end; addr += PAGE_SIZE) { 1850 int err = __get_user(c, addr); 1851 if (err) 1852 return err; 1853 } 1854 return __get_user(c, end - 1); 1855 } 1856 1857 static int eb_copy_relocations(const struct i915_execbuffer *eb) 1858 { 1859 struct drm_i915_gem_relocation_entry *relocs; 1860 const unsigned int count = eb->buffer_count; 1861 unsigned int i; 1862 int err; 1863 1864 for (i = 0; i < count; i++) { 1865 const unsigned int nreloc = eb->exec[i].relocation_count; 1866 struct drm_i915_gem_relocation_entry __user *urelocs; 1867 unsigned long size; 1868 unsigned long copied; 1869 1870 if (nreloc == 0) 1871 continue; 1872 1873 err = check_relocations(&eb->exec[i]); 1874 if (err) 1875 goto err; 1876 1877 urelocs = u64_to_user_ptr(eb->exec[i].relocs_ptr); 1878 size = nreloc * sizeof(*relocs); 1879 1880 relocs = kvmalloc_array(size, 1, GFP_KERNEL); 1881 if (!relocs) { 1882 err = -ENOMEM; 1883 goto err; 1884 } 1885 1886 /* copy_from_user is limited to < 4GiB */ 1887 copied = 0; 1888 do { 1889 unsigned int len = 1890 min_t(u64, BIT_ULL(31), size - copied); 1891 1892 if (__copy_from_user((char *)relocs + copied, 1893 (char __user *)urelocs + copied, 1894 len)) 1895 goto end; 1896 1897 copied += len; 1898 } while (copied < size); 1899 1900 /* 1901 * As we do not update the known relocation offsets after 1902 * relocating (due to the complexities in lock handling), 1903 * we need to mark them as invalid now so that we force the 1904 * relocation processing next time. Just in case the target 1905 * object is evicted and then rebound into its old 1906 * presumed_offset before the next execbuffer - if that 1907 * happened we would make the mistake of assuming that the 1908 * relocations were valid. 
1909 */ 1910 if (!user_access_begin(urelocs, size)) 1911 goto end; 1912 1913 for (copied = 0; copied < nreloc; copied++) 1914 unsafe_put_user(-1, 1915 &urelocs[copied].presumed_offset, 1916 end_user); 1917 user_access_end(); 1918 1919 eb->exec[i].relocs_ptr = (uintptr_t)relocs; 1920 } 1921 1922 return 0; 1923 1924 end_user: 1925 user_access_end(); 1926 end: 1927 kvfree(relocs); 1928 err = -EFAULT; 1929 err: 1930 while (i--) { 1931 relocs = u64_to_ptr(typeof(*relocs), eb->exec[i].relocs_ptr); 1932 if (eb->exec[i].relocation_count) 1933 kvfree(relocs); 1934 } 1935 return err; 1936 } 1937 1938 static int eb_prefault_relocations(const struct i915_execbuffer *eb) 1939 { 1940 const unsigned int count = eb->buffer_count; 1941 unsigned int i; 1942 1943 for (i = 0; i < count; i++) { 1944 int err; 1945 1946 err = check_relocations(&eb->exec[i]); 1947 if (err) 1948 return err; 1949 } 1950 1951 return 0; 1952 } 1953 1954 static int eb_reinit_userptr(struct i915_execbuffer *eb) 1955 { 1956 const unsigned int count = eb->buffer_count; 1957 unsigned int i; 1958 int ret; 1959 1960 if (likely(!(eb->args->flags & __EXEC_USERPTR_USED))) 1961 return 0; 1962 1963 for (i = 0; i < count; i++) { 1964 struct eb_vma *ev = &eb->vma[i]; 1965 1966 if (!i915_gem_object_is_userptr(ev->vma->obj)) 1967 continue; 1968 1969 ret = i915_gem_object_userptr_submit_init(ev->vma->obj); 1970 if (ret) 1971 return ret; 1972 1973 ev->flags |= __EXEC_OBJECT_USERPTR_INIT; 1974 } 1975 1976 return 0; 1977 } 1978 1979 static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb, 1980 struct i915_request *rq) 1981 { 1982 bool have_copy = false; 1983 struct eb_vma *ev; 1984 int err = 0; 1985 1986 repeat: 1987 if (signal_pending(current)) { 1988 err = -ERESTARTSYS; 1989 goto out; 1990 } 1991 1992 /* We may process another execbuffer during the unlock... */ 1993 eb_release_vmas(eb, false, true); 1994 i915_gem_ww_ctx_fini(&eb->ww); 1995 1996 if (rq) { 1997 /* nonblocking is always false */ 1998 if (i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE, 1999 MAX_SCHEDULE_TIMEOUT) < 0) { 2000 i915_request_put(rq); 2001 rq = NULL; 2002 2003 err = -EINTR; 2004 goto err_relock; 2005 } 2006 2007 i915_request_put(rq); 2008 rq = NULL; 2009 } 2010 2011 /* 2012 * We take 3 passes through the slowpatch. 2013 * 2014 * 1 - we try to just prefault all the user relocation entries and 2015 * then attempt to reuse the atomic pagefault disabled fast path again. 2016 * 2017 * 2 - we copy the user entries to a local buffer here outside of the 2018 * local and allow ourselves to wait upon any rendering before 2019 * relocations 2020 * 2021 * 3 - we already have a local copy of the relocation entries, but 2022 * were interrupted (EAGAIN) whilst waiting for the objects, try again. 
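 *
 * "Prefault" in pass 1 is nothing more than check_relocations()
 * touching one byte per page of each user relocation array while
 * faults are still allowed, so that the later copy under
 * pagefault_disable() is likely to succeed; roughly:
 *
 *	for (addr = start; addr < end; addr += PAGE_SIZE)
 *		if (__get_user(c, addr))
 *			return -EFAULT;
 *
 * (a sketch of the loop in check_relocations(), which also reads the
 * final byte in case the array does not end on a page boundary).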
2023 */ 2024 if (!err) { 2025 err = eb_prefault_relocations(eb); 2026 } else if (!have_copy) { 2027 err = eb_copy_relocations(eb); 2028 have_copy = err == 0; 2029 } else { 2030 cond_resched(); 2031 err = 0; 2032 } 2033 2034 if (!err) 2035 err = eb_reinit_userptr(eb); 2036 2037 err_relock: 2038 i915_gem_ww_ctx_init(&eb->ww, true); 2039 if (err) 2040 goto out; 2041 2042 /* reacquire the objects */ 2043 repeat_validate: 2044 rq = eb_pin_engine(eb, false); 2045 if (IS_ERR(rq)) { 2046 err = PTR_ERR(rq); 2047 rq = NULL; 2048 goto err; 2049 } 2050 2051 /* We didn't throttle, should be NULL */ 2052 GEM_WARN_ON(rq); 2053 2054 err = eb_validate_vmas(eb); 2055 if (err) 2056 goto err; 2057 2058 GEM_BUG_ON(!eb->batch); 2059 2060 list_for_each_entry(ev, &eb->relocs, reloc_link) { 2061 if (!have_copy) { 2062 pagefault_disable(); 2063 err = eb_relocate_vma(eb, ev); 2064 pagefault_enable(); 2065 if (err) 2066 break; 2067 } else { 2068 err = eb_relocate_vma_slow(eb, ev); 2069 if (err) 2070 break; 2071 } 2072 } 2073 2074 if (err == -EDEADLK) 2075 goto err; 2076 2077 if (err && !have_copy) 2078 goto repeat; 2079 2080 if (err) 2081 goto err; 2082 2083 /* as last step, parse the command buffer */ 2084 err = eb_parse(eb); 2085 if (err) 2086 goto err; 2087 2088 /* 2089 * Leave the user relocations as are, this is the painfully slow path, 2090 * and we want to avoid the complication of dropping the lock whilst 2091 * having buffers reserved in the aperture and so causing spurious 2092 * ENOSPC for random operations. 2093 */ 2094 2095 err: 2096 if (err == -EDEADLK) { 2097 eb_release_vmas(eb, false, false); 2098 err = i915_gem_ww_ctx_backoff(&eb->ww); 2099 if (!err) 2100 goto repeat_validate; 2101 } 2102 2103 if (err == -EAGAIN) 2104 goto repeat; 2105 2106 out: 2107 if (have_copy) { 2108 const unsigned int count = eb->buffer_count; 2109 unsigned int i; 2110 2111 for (i = 0; i < count; i++) { 2112 const struct drm_i915_gem_exec_object2 *entry = 2113 &eb->exec[i]; 2114 struct drm_i915_gem_relocation_entry *relocs; 2115 2116 if (!entry->relocation_count) 2117 continue; 2118 2119 relocs = u64_to_ptr(typeof(*relocs), entry->relocs_ptr); 2120 kvfree(relocs); 2121 } 2122 } 2123 2124 if (rq) 2125 i915_request_put(rq); 2126 2127 return err; 2128 } 2129 2130 static int eb_relocate_parse(struct i915_execbuffer *eb) 2131 { 2132 int err; 2133 struct i915_request *rq = NULL; 2134 bool throttle = true; 2135 2136 retry: 2137 rq = eb_pin_engine(eb, throttle); 2138 if (IS_ERR(rq)) { 2139 err = PTR_ERR(rq); 2140 rq = NULL; 2141 if (err != -EDEADLK) 2142 return err; 2143 2144 goto err; 2145 } 2146 2147 if (rq) { 2148 bool nonblock = eb->file->filp->f_flags & O_NONBLOCK; 2149 2150 /* Need to drop all locks now for throttling, take slowpath */ 2151 err = i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE, 0); 2152 if (err == -ETIME) { 2153 if (nonblock) { 2154 err = -EWOULDBLOCK; 2155 i915_request_put(rq); 2156 goto err; 2157 } 2158 goto slow; 2159 } 2160 i915_request_put(rq); 2161 rq = NULL; 2162 } 2163 2164 /* only throttle once, even if we didn't need to throttle */ 2165 throttle = false; 2166 2167 err = eb_validate_vmas(eb); 2168 if (err == -EAGAIN) 2169 goto slow; 2170 else if (err) 2171 goto err; 2172 2173 /* The objects are in their final locations, apply the relocations. 
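 *
 * This loop normally only runs when userspace did not promise
 * NO_RELOC; as a hedged userspace-side sketch of opting out (the flag
 * is existing uapi, the cached address is illustrative):
 *
 *	exec_objects[i].offset = last_known_gpu_address;
 *	execbuf.flags |= I915_EXEC_NO_RELOC;
 *
 * in which case __EXEC_HAS_RELOC is never set and the kernel trusts
 * the addresses userspace already wrote into its buffers.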
*/ 2174 if (eb->args->flags & __EXEC_HAS_RELOC) { 2175 struct eb_vma *ev; 2176 2177 list_for_each_entry(ev, &eb->relocs, reloc_link) { 2178 err = eb_relocate_vma(eb, ev); 2179 if (err) 2180 break; 2181 } 2182 2183 if (err == -EDEADLK) 2184 goto err; 2185 else if (err) 2186 goto slow; 2187 } 2188 2189 if (!err) 2190 err = eb_parse(eb); 2191 2192 err: 2193 if (err == -EDEADLK) { 2194 eb_release_vmas(eb, false, false); 2195 err = i915_gem_ww_ctx_backoff(&eb->ww); 2196 if (!err) 2197 goto retry; 2198 } 2199 2200 return err; 2201 2202 slow: 2203 err = eb_relocate_parse_slow(eb, rq); 2204 if (err) 2205 /* 2206 * If the user expects the execobject.offset and 2207 * reloc.presumed_offset to be an exact match, 2208 * as for using NO_RELOC, then we cannot update 2209 * the execobject.offset until we have completed 2210 * relocation. 2211 */ 2212 eb->args->flags &= ~__EXEC_HAS_RELOC; 2213 2214 return err; 2215 } 2216 2217 static int eb_move_to_gpu(struct i915_execbuffer *eb) 2218 { 2219 const unsigned int count = eb->buffer_count; 2220 unsigned int i = count; 2221 int err = 0; 2222 2223 while (i--) { 2224 struct eb_vma *ev = &eb->vma[i]; 2225 struct i915_vma *vma = ev->vma; 2226 unsigned int flags = ev->flags; 2227 struct drm_i915_gem_object *obj = vma->obj; 2228 2229 assert_vma_held(vma); 2230 2231 if (flags & EXEC_OBJECT_CAPTURE) { 2232 struct i915_capture_list *capture; 2233 2234 capture = kmalloc(sizeof(*capture), GFP_KERNEL); 2235 if (capture) { 2236 capture->next = eb->request->capture_list; 2237 capture->vma = vma; 2238 eb->request->capture_list = capture; 2239 } 2240 } 2241 2242 /* 2243 * If the GPU is not _reading_ through the CPU cache, we need 2244 * to make sure that any writes (both previous GPU writes from 2245 * before a change in snooping levels and normal CPU writes) 2246 * caught in that cache are flushed to main memory. 2247 * 2248 * We want to say 2249 * obj->cache_dirty && 2250 * !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ) 2251 * but gcc's optimiser doesn't handle that as well and emits 2252 * two jumps instead of one. Maybe one day... 2253 */ 2254 if (unlikely(obj->cache_dirty & ~obj->cache_coherent)) { 2255 if (i915_gem_clflush_object(obj, 0)) 2256 flags &= ~EXEC_OBJECT_ASYNC; 2257 } 2258 2259 if (err == 0 && !(flags & EXEC_OBJECT_ASYNC)) { 2260 err = i915_request_await_object 2261 (eb->request, obj, flags & EXEC_OBJECT_WRITE); 2262 } 2263 2264 if (err == 0) 2265 err = i915_vma_move_to_active(vma, eb->request, 2266 flags | __EXEC_OBJECT_NO_RESERVE); 2267 } 2268 2269 #ifdef CONFIG_MMU_NOTIFIER 2270 if (!err && (eb->args->flags & __EXEC_USERPTR_USED)) { 2271 spin_lock(&eb->i915->mm.notifier_lock); 2272 2273 /* 2274 * count is always at least 1, otherwise __EXEC_USERPTR_USED 2275 * could not have been set 2276 */ 2277 for (i = 0; i < count; i++) { 2278 struct eb_vma *ev = &eb->vma[i]; 2279 struct drm_i915_gem_object *obj = ev->vma->obj; 2280 2281 if (!i915_gem_object_is_userptr(obj)) 2282 continue; 2283 2284 err = i915_gem_object_userptr_submit_done(obj); 2285 if (err) 2286 break; 2287 } 2288 2289 spin_unlock(&eb->i915->mm.notifier_lock); 2290 } 2291 #endif 2292 2293 if (unlikely(err)) 2294 goto err_skip; 2295 2296 /* Unconditionally flush any chipset caches (for streaming writes). 
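 *
 * For reference, the per-object behaviour above is driven by flags
 * userspace sets in its exec list; a hedged sketch (the flags are
 * existing uapi, the object index is illustrative):
 *
 *	exec_objects[i].flags |= EXEC_OBJECT_WRITE;	// GPU writes it, track as a write
 *	exec_objects[i].flags |= EXEC_OBJECT_ASYNC;	// skip implicit fencing, userspace orders it
 *	exec_objects[i].flags |= EXEC_OBJECT_CAPTURE;	// include in error-state dumps
 *
 * matching the capture_list and i915_request_await_object() handling
 * earlier in this function.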
*/ 2297 intel_gt_chipset_flush(eb->engine->gt); 2298 return 0; 2299 2300 err_skip: 2301 i915_request_set_error_once(eb->request, err); 2302 return err; 2303 } 2304 2305 static int i915_gem_check_execbuffer(struct drm_i915_gem_execbuffer2 *exec) 2306 { 2307 if (exec->flags & __I915_EXEC_ILLEGAL_FLAGS) 2308 return -EINVAL; 2309 2310 /* Kernel clipping was a DRI1 misfeature */ 2311 if (!(exec->flags & (I915_EXEC_FENCE_ARRAY | 2312 I915_EXEC_USE_EXTENSIONS))) { 2313 if (exec->num_cliprects || exec->cliprects_ptr) 2314 return -EINVAL; 2315 } 2316 2317 if (exec->DR4 == 0xffffffff) { 2318 DRM_DEBUG("UXA submitting garbage DR4, fixing up\n"); 2319 exec->DR4 = 0; 2320 } 2321 if (exec->DR1 || exec->DR4) 2322 return -EINVAL; 2323 2324 if ((exec->batch_start_offset | exec->batch_len) & 0x7) 2325 return -EINVAL; 2326 2327 return 0; 2328 } 2329 2330 static int i915_reset_gen7_sol_offsets(struct i915_request *rq) 2331 { 2332 u32 *cs; 2333 int i; 2334 2335 if (!IS_GEN(rq->engine->i915, 7) || rq->engine->id != RCS0) { 2336 drm_dbg(&rq->engine->i915->drm, "sol reset is gen7/rcs only\n"); 2337 return -EINVAL; 2338 } 2339 2340 cs = intel_ring_begin(rq, 4 * 2 + 2); 2341 if (IS_ERR(cs)) 2342 return PTR_ERR(cs); 2343 2344 *cs++ = MI_LOAD_REGISTER_IMM(4); 2345 for (i = 0; i < 4; i++) { 2346 *cs++ = i915_mmio_reg_offset(GEN7_SO_WRITE_OFFSET(i)); 2347 *cs++ = 0; 2348 } 2349 *cs++ = MI_NOOP; 2350 intel_ring_advance(rq, cs); 2351 2352 return 0; 2353 } 2354 2355 static struct i915_vma * 2356 shadow_batch_pin(struct i915_execbuffer *eb, 2357 struct drm_i915_gem_object *obj, 2358 struct i915_address_space *vm, 2359 unsigned int flags) 2360 { 2361 struct i915_vma *vma; 2362 int err; 2363 2364 vma = i915_vma_instance(obj, vm, NULL); 2365 if (IS_ERR(vma)) 2366 return vma; 2367 2368 err = i915_vma_pin_ww(vma, &eb->ww, 0, 0, flags); 2369 if (err) 2370 return ERR_PTR(err); 2371 2372 return vma; 2373 } 2374 2375 struct eb_parse_work { 2376 struct dma_fence_work base; 2377 struct intel_engine_cs *engine; 2378 struct i915_vma *batch; 2379 struct i915_vma *shadow; 2380 struct i915_vma *trampoline; 2381 unsigned long batch_offset; 2382 unsigned long batch_length; 2383 unsigned long *jump_whitelist; 2384 const void *batch_map; 2385 void *shadow_map; 2386 }; 2387 2388 static int __eb_parse(struct dma_fence_work *work) 2389 { 2390 struct eb_parse_work *pw = container_of(work, typeof(*pw), base); 2391 int ret; 2392 bool cookie; 2393 2394 cookie = dma_fence_begin_signalling(); 2395 ret = intel_engine_cmd_parser(pw->engine, 2396 pw->batch, 2397 pw->batch_offset, 2398 pw->batch_length, 2399 pw->shadow, 2400 pw->jump_whitelist, 2401 pw->shadow_map, 2402 pw->batch_map); 2403 dma_fence_end_signalling(cookie); 2404 2405 return ret; 2406 } 2407 2408 static void __eb_parse_release(struct dma_fence_work *work) 2409 { 2410 struct eb_parse_work *pw = container_of(work, typeof(*pw), base); 2411 2412 if (!IS_ERR_OR_NULL(pw->jump_whitelist)) 2413 kfree(pw->jump_whitelist); 2414 2415 if (pw->batch_map) 2416 i915_gem_object_unpin_map(pw->batch->obj); 2417 else 2418 i915_gem_object_unpin_pages(pw->batch->obj); 2419 2420 i915_gem_object_unpin_map(pw->shadow->obj); 2421 2422 if (pw->trampoline) 2423 i915_active_release(&pw->trampoline->active); 2424 i915_active_release(&pw->shadow->active); 2425 i915_active_release(&pw->batch->active); 2426 } 2427 2428 static const struct dma_fence_work_ops eb_parse_ops = { 2429 .name = "eb_parse", 2430 .work = __eb_parse, 2431 .release = __eb_parse_release, 2432 }; 2433 2434 static inline int 2435 
__parser_mark_active(struct i915_vma *vma, 2436 struct intel_timeline *tl, 2437 struct dma_fence *fence) 2438 { 2439 struct intel_gt_buffer_pool_node *node = vma->private; 2440 2441 return i915_active_ref(&node->active, tl->fence_context, fence); 2442 } 2443 2444 static int 2445 parser_mark_active(struct eb_parse_work *pw, struct intel_timeline *tl) 2446 { 2447 int err; 2448 2449 mutex_lock(&tl->mutex); 2450 2451 err = __parser_mark_active(pw->shadow, tl, &pw->base.dma); 2452 if (err) 2453 goto unlock; 2454 2455 if (pw->trampoline) { 2456 err = __parser_mark_active(pw->trampoline, tl, &pw->base.dma); 2457 if (err) 2458 goto unlock; 2459 } 2460 2461 unlock: 2462 mutex_unlock(&tl->mutex); 2463 return err; 2464 } 2465 2466 static int eb_parse_pipeline(struct i915_execbuffer *eb, 2467 struct i915_vma *shadow, 2468 struct i915_vma *trampoline) 2469 { 2470 struct eb_parse_work *pw; 2471 struct drm_i915_gem_object *batch = eb->batch->vma->obj; 2472 bool needs_clflush; 2473 int err; 2474 2475 GEM_BUG_ON(overflows_type(eb->batch_start_offset, pw->batch_offset)); 2476 GEM_BUG_ON(overflows_type(eb->batch_len, pw->batch_length)); 2477 2478 pw = kzalloc(sizeof(*pw), GFP_KERNEL); 2479 if (!pw) 2480 return -ENOMEM; 2481 2482 err = i915_active_acquire(&eb->batch->vma->active); 2483 if (err) 2484 goto err_free; 2485 2486 err = i915_active_acquire(&shadow->active); 2487 if (err) 2488 goto err_batch; 2489 2490 if (trampoline) { 2491 err = i915_active_acquire(&trampoline->active); 2492 if (err) 2493 goto err_shadow; 2494 } 2495 2496 pw->shadow_map = i915_gem_object_pin_map(shadow->obj, I915_MAP_WB); 2497 if (IS_ERR(pw->shadow_map)) { 2498 err = PTR_ERR(pw->shadow_map); 2499 goto err_trampoline; 2500 } 2501 2502 needs_clflush = 2503 !(batch->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ); 2504 2505 pw->batch_map = ERR_PTR(-ENODEV); 2506 if (needs_clflush && i915_has_memcpy_from_wc()) 2507 pw->batch_map = i915_gem_object_pin_map(batch, I915_MAP_WC); 2508 2509 if (IS_ERR(pw->batch_map)) { 2510 err = i915_gem_object_pin_pages(batch); 2511 if (err) 2512 goto err_unmap_shadow; 2513 pw->batch_map = NULL; 2514 } 2515 2516 pw->jump_whitelist = 2517 intel_engine_cmd_parser_alloc_jump_whitelist(eb->batch_len, 2518 trampoline); 2519 if (IS_ERR(pw->jump_whitelist)) { 2520 err = PTR_ERR(pw->jump_whitelist); 2521 goto err_unmap_batch; 2522 } 2523 2524 dma_fence_work_init(&pw->base, &eb_parse_ops); 2525 2526 pw->engine = eb->engine; 2527 pw->batch = eb->batch->vma; 2528 pw->batch_offset = eb->batch_start_offset; 2529 pw->batch_length = eb->batch_len; 2530 pw->shadow = shadow; 2531 pw->trampoline = trampoline; 2532 2533 /* Mark active refs early for this worker, in case we get interrupted */ 2534 err = parser_mark_active(pw, eb->context->timeline); 2535 if (err) 2536 goto err_commit; 2537 2538 err = dma_resv_reserve_shared(pw->batch->resv, 1); 2539 if (err) 2540 goto err_commit; 2541 2542 err = dma_resv_reserve_shared(shadow->resv, 1); 2543 if (err) 2544 goto err_commit; 2545 2546 /* Wait for all writes (and relocs) into the batch to complete */ 2547 err = i915_sw_fence_await_reservation(&pw->base.chain, 2548 pw->batch->resv, NULL, false, 2549 0, I915_FENCE_GFP); 2550 if (err < 0) 2551 goto err_commit; 2552 2553 /* Keep the batch alive and unwritten as we parse */ 2554 dma_resv_add_shared_fence(pw->batch->resv, &pw->base.dma); 2555 2556 /* Force execution to wait for completion of the parser */ 2557 dma_resv_add_excl_fence(shadow->resv, &pw->base.dma); 2558 2559 dma_fence_work_commit_imm(&pw->base); 2560 return 0; 2561 
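/*
 * The setup above follows the usual dma_fence_work shape; a hedged
 * sketch of the skeleton with the execbuf specifics stripped out:
 *
 *	struct dma_fence_work *w = ...;
 *
 *	dma_fence_work_init(w, &ops);		// ops->work runs asynchronously
 *	err = i915_sw_fence_await_reservation(&w->chain, resv, NULL,
 *					      false, 0, I915_FENCE_GFP);
 *	dma_resv_add_excl_fence(resv, &w->dma);	// others wait on the parse
 *	dma_fence_work_commit_imm(w);
 *
 * Note that once dma_fence_work_init() has run, the work must always be
 * committed - even on error (see err_commit below) - so that the
 * embedded fence is signalled and nothing is left waiting on it forever.
 */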
2562 err_commit: 2563 i915_sw_fence_set_error_once(&pw->base.chain, err); 2564 dma_fence_work_commit_imm(&pw->base); 2565 return err; 2566 2567 err_unmap_batch: 2568 if (pw->batch_map) 2569 i915_gem_object_unpin_map(batch); 2570 else 2571 i915_gem_object_unpin_pages(batch); 2572 err_unmap_shadow: 2573 i915_gem_object_unpin_map(shadow->obj); 2574 err_trampoline: 2575 if (trampoline) 2576 i915_active_release(&trampoline->active); 2577 err_shadow: 2578 i915_active_release(&shadow->active); 2579 err_batch: 2580 i915_active_release(&eb->batch->vma->active); 2581 err_free: 2582 kfree(pw); 2583 return err; 2584 } 2585 2586 static struct i915_vma *eb_dispatch_secure(struct i915_execbuffer *eb, struct i915_vma *vma) 2587 { 2588 /* 2589 * snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure 2590 * batch" bit. Hence we need to pin secure batches into the global gtt. 2591 * hsw should have this fixed, but bdw mucks it up again. */ 2592 if (eb->batch_flags & I915_DISPATCH_SECURE) 2593 return i915_gem_object_ggtt_pin_ww(vma->obj, &eb->ww, NULL, 0, 0, 0); 2594 2595 return NULL; 2596 } 2597 2598 static int eb_parse(struct i915_execbuffer *eb) 2599 { 2600 struct drm_i915_private *i915 = eb->i915; 2601 struct intel_gt_buffer_pool_node *pool = eb->batch_pool; 2602 struct i915_vma *shadow, *trampoline, *batch; 2603 unsigned long len; 2604 int err; 2605 2606 if (!eb_use_cmdparser(eb)) { 2607 batch = eb_dispatch_secure(eb, eb->batch->vma); 2608 if (IS_ERR(batch)) 2609 return PTR_ERR(batch); 2610 2611 goto secure_batch; 2612 } 2613 2614 len = eb->batch_len; 2615 if (!CMDPARSER_USES_GGTT(eb->i915)) { 2616 /* 2617 * ppGTT backed shadow buffers must be mapped RO, to prevent 2618 * post-scan tampering 2619 */ 2620 if (!eb->context->vm->has_read_only) { 2621 drm_dbg(&i915->drm, 2622 "Cannot prevent post-scan tampering without RO capable vm\n"); 2623 return -EINVAL; 2624 } 2625 } else { 2626 len += I915_CMD_PARSER_TRAMPOLINE_SIZE; 2627 } 2628 if (unlikely(len < eb->batch_len)) /* last paranoid check of overflow */ 2629 return -EINVAL; 2630 2631 if (!pool) { 2632 pool = intel_gt_get_buffer_pool(eb->engine->gt, len, 2633 I915_MAP_WB); 2634 if (IS_ERR(pool)) 2635 return PTR_ERR(pool); 2636 eb->batch_pool = pool; 2637 } 2638 2639 err = i915_gem_object_lock(pool->obj, &eb->ww); 2640 if (err) 2641 goto err; 2642 2643 shadow = shadow_batch_pin(eb, pool->obj, eb->context->vm, PIN_USER); 2644 if (IS_ERR(shadow)) { 2645 err = PTR_ERR(shadow); 2646 goto err; 2647 } 2648 intel_gt_buffer_pool_mark_used(pool); 2649 i915_gem_object_set_readonly(shadow->obj); 2650 shadow->private = pool; 2651 2652 trampoline = NULL; 2653 if (CMDPARSER_USES_GGTT(eb->i915)) { 2654 trampoline = shadow; 2655 2656 shadow = shadow_batch_pin(eb, pool->obj, 2657 &eb->engine->gt->ggtt->vm, 2658 PIN_GLOBAL); 2659 if (IS_ERR(shadow)) { 2660 err = PTR_ERR(shadow); 2661 shadow = trampoline; 2662 goto err_shadow; 2663 } 2664 shadow->private = pool; 2665 2666 eb->batch_flags |= I915_DISPATCH_SECURE; 2667 } 2668 2669 batch = eb_dispatch_secure(eb, shadow); 2670 if (IS_ERR(batch)) { 2671 err = PTR_ERR(batch); 2672 goto err_trampoline; 2673 } 2674 2675 err = eb_parse_pipeline(eb, shadow, trampoline); 2676 if (err) 2677 goto err_unpin_batch; 2678 2679 eb->batch = &eb->vma[eb->buffer_count++]; 2680 eb->batch->vma = i915_vma_get(shadow); 2681 eb->batch->flags = __EXEC_OBJECT_HAS_PIN; 2682 2683 eb->trampoline = trampoline; 2684 eb->batch_start_offset = 0; 2685 2686 secure_batch: 2687 if (batch) { 2688 eb->batch = &eb->vma[eb->buffer_count++]; 2689 
eb->batch->flags = __EXEC_OBJECT_HAS_PIN; 2690 eb->batch->vma = i915_vma_get(batch); 2691 } 2692 return 0; 2693 2694 err_unpin_batch: 2695 if (batch) 2696 i915_vma_unpin(batch); 2697 err_trampoline: 2698 if (trampoline) 2699 i915_vma_unpin(trampoline); 2700 err_shadow: 2701 i915_vma_unpin(shadow); 2702 err: 2703 return err; 2704 } 2705 2706 static int eb_submit(struct i915_execbuffer *eb, struct i915_vma *batch) 2707 { 2708 int err; 2709 2710 if (intel_context_nopreempt(eb->context)) 2711 __set_bit(I915_FENCE_FLAG_NOPREEMPT, &eb->request->fence.flags); 2712 2713 err = eb_move_to_gpu(eb); 2714 if (err) 2715 return err; 2716 2717 if (eb->args->flags & I915_EXEC_GEN7_SOL_RESET) { 2718 err = i915_reset_gen7_sol_offsets(eb->request); 2719 if (err) 2720 return err; 2721 } 2722 2723 /* 2724 * After we completed waiting for other engines (using HW semaphores) 2725 * then we can signal that this request/batch is ready to run. This 2726 * allows us to determine if the batch is still waiting on the GPU 2727 * or actually running by checking the breadcrumb. 2728 */ 2729 if (eb->engine->emit_init_breadcrumb) { 2730 err = eb->engine->emit_init_breadcrumb(eb->request); 2731 if (err) 2732 return err; 2733 } 2734 2735 err = eb->engine->emit_bb_start(eb->request, 2736 batch->node.start + 2737 eb->batch_start_offset, 2738 eb->batch_len, 2739 eb->batch_flags); 2740 if (err) 2741 return err; 2742 2743 if (eb->trampoline) { 2744 GEM_BUG_ON(eb->batch_start_offset); 2745 err = eb->engine->emit_bb_start(eb->request, 2746 eb->trampoline->node.start + 2747 eb->batch_len, 2748 0, 0); 2749 if (err) 2750 return err; 2751 } 2752 2753 return 0; 2754 } 2755 2756 static int num_vcs_engines(const struct drm_i915_private *i915) 2757 { 2758 return hweight_long(VDBOX_MASK(&i915->gt)); 2759 } 2760 2761 /* 2762 * Find one BSD ring to dispatch the corresponding BSD command. 2763 * The engine index is returned. 2764 */ 2765 static unsigned int 2766 gen8_dispatch_bsd_engine(struct drm_i915_private *dev_priv, 2767 struct drm_file *file) 2768 { 2769 struct drm_i915_file_private *file_priv = file->driver_priv; 2770 2771 /* Check whether the file_priv has already selected one ring. */ 2772 if ((int)file_priv->bsd_engine < 0) 2773 file_priv->bsd_engine = 2774 get_random_int() % num_vcs_engines(dev_priv); 2775 2776 return file_priv->bsd_engine; 2777 } 2778 2779 static const enum intel_engine_id user_ring_map[] = { 2780 [I915_EXEC_DEFAULT] = RCS0, 2781 [I915_EXEC_RENDER] = RCS0, 2782 [I915_EXEC_BLT] = BCS0, 2783 [I915_EXEC_BSD] = VCS0, 2784 [I915_EXEC_VEBOX] = VECS0 2785 }; 2786 2787 static struct i915_request *eb_throttle(struct i915_execbuffer *eb, struct intel_context *ce) 2788 { 2789 struct intel_ring *ring = ce->ring; 2790 struct intel_timeline *tl = ce->timeline; 2791 struct i915_request *rq; 2792 2793 /* 2794 * Completely unscientific finger-in-the-air estimates for suitable 2795 * maximum user request size (to avoid blocking) and then backoff. 2796 */ 2797 if (intel_ring_update_space(ring) >= PAGE_SIZE) 2798 return NULL; 2799 2800 /* 2801 * Find a request that after waiting upon, there will be at least half 2802 * the ring available. The hysteresis allows us to compete for the 2803 * shared ring and should mean that we sleep less often prior to 2804 * claiming our resources, but not so long that the ring completely 2805 * drains before we can submit our next request. 
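 *
 * A worked example with made-up numbers: on a 16KiB ring with
 * ring->emit at 12KiB, a request whose postfix sits at 6KiB gives
 *
 *	__intel_ring_space(6K, 12K, 16K) ~= 16K - (12K - 6K) = 10K
 *
 * which is more than half the ring, so waiting for that request (and
 * everything queued before it) is sufficient and the scan stops there.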
2806 */ 2807 list_for_each_entry(rq, &tl->requests, link) { 2808 if (rq->ring != ring) 2809 continue; 2810 2811 if (__intel_ring_space(rq->postfix, 2812 ring->emit, ring->size) > ring->size / 2) 2813 break; 2814 } 2815 if (&rq->link == &tl->requests) 2816 return NULL; /* weird, we will check again later for real */ 2817 2818 return i915_request_get(rq); 2819 } 2820 2821 static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb, bool throttle) 2822 { 2823 struct intel_context *ce = eb->context; 2824 struct intel_timeline *tl; 2825 struct i915_request *rq = NULL; 2826 int err; 2827 2828 GEM_BUG_ON(eb->args->flags & __EXEC_ENGINE_PINNED); 2829 2830 if (unlikely(intel_context_is_banned(ce))) 2831 return ERR_PTR(-EIO); 2832 2833 /* 2834 * Pinning the contexts may generate requests in order to acquire 2835 * GGTT space, so do this first before we reserve a seqno for 2836 * ourselves. 2837 */ 2838 err = intel_context_pin_ww(ce, &eb->ww); 2839 if (err) 2840 return ERR_PTR(err); 2841 2842 /* 2843 * Take a local wakeref for preparing to dispatch the execbuf as 2844 * we expect to access the hardware fairly frequently in the 2845 * process, and require the engine to be kept awake between accesses. 2846 * Upon dispatch, we acquire another prolonged wakeref that we hold 2847 * until the timeline is idle, which in turn releases the wakeref 2848 * taken on the engine, and the parent device. 2849 */ 2850 tl = intel_context_timeline_lock(ce); 2851 if (IS_ERR(tl)) { 2852 intel_context_unpin(ce); 2853 return ERR_CAST(tl); 2854 } 2855 2856 intel_context_enter(ce); 2857 if (throttle) 2858 rq = eb_throttle(eb, ce); 2859 intel_context_timeline_unlock(tl); 2860 2861 eb->args->flags |= __EXEC_ENGINE_PINNED; 2862 return rq; 2863 } 2864 2865 static void eb_unpin_engine(struct i915_execbuffer *eb) 2866 { 2867 struct intel_context *ce = eb->context; 2868 struct intel_timeline *tl = ce->timeline; 2869 2870 if (!(eb->args->flags & __EXEC_ENGINE_PINNED)) 2871 return; 2872 2873 eb->args->flags &= ~__EXEC_ENGINE_PINNED; 2874 2875 mutex_lock(&tl->mutex); 2876 intel_context_exit(ce); 2877 mutex_unlock(&tl->mutex); 2878 2879 intel_context_unpin(ce); 2880 } 2881 2882 static unsigned int 2883 eb_select_legacy_ring(struct i915_execbuffer *eb) 2884 { 2885 struct drm_i915_private *i915 = eb->i915; 2886 struct drm_i915_gem_execbuffer2 *args = eb->args; 2887 unsigned int user_ring_id = args->flags & I915_EXEC_RING_MASK; 2888 2889 if (user_ring_id != I915_EXEC_BSD && 2890 (args->flags & I915_EXEC_BSD_MASK)) { 2891 drm_dbg(&i915->drm, 2892 "execbuf with non bsd ring but with invalid " 2893 "bsd dispatch flags: %d\n", (int)(args->flags)); 2894 return -1; 2895 } 2896 2897 if (user_ring_id == I915_EXEC_BSD && num_vcs_engines(i915) > 1) { 2898 unsigned int bsd_idx = args->flags & I915_EXEC_BSD_MASK; 2899 2900 if (bsd_idx == I915_EXEC_BSD_DEFAULT) { 2901 bsd_idx = gen8_dispatch_bsd_engine(i915, eb->file); 2902 } else if (bsd_idx >= I915_EXEC_BSD_RING1 && 2903 bsd_idx <= I915_EXEC_BSD_RING2) { 2904 bsd_idx >>= I915_EXEC_BSD_SHIFT; 2905 bsd_idx--; 2906 } else { 2907 drm_dbg(&i915->drm, 2908 "execbuf with unknown bsd ring: %u\n", 2909 bsd_idx); 2910 return -1; 2911 } 2912 2913 return _VCS(bsd_idx); 2914 } 2915 2916 if (user_ring_id >= ARRAY_SIZE(user_ring_map)) { 2917 drm_dbg(&i915->drm, "execbuf with unknown ring: %u\n", 2918 user_ring_id); 2919 return -1; 2920 } 2921 2922 return user_ring_map[user_ring_id]; 2923 } 2924 2925 static int 2926 eb_select_engine(struct i915_execbuffer *eb) 2927 { 2928 struct intel_context *ce; 2929 
unsigned int idx; 2930 int err; 2931 2932 if (i915_gem_context_user_engines(eb->gem_context)) 2933 idx = eb->args->flags & I915_EXEC_RING_MASK; 2934 else 2935 idx = eb_select_legacy_ring(eb); 2936 2937 ce = i915_gem_context_get_engine(eb->gem_context, idx); 2938 if (IS_ERR(ce)) 2939 return PTR_ERR(ce); 2940 2941 intel_gt_pm_get(ce->engine->gt); 2942 2943 if (!test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) { 2944 err = intel_context_alloc_state(ce); 2945 if (err) 2946 goto err; 2947 } 2948 2949 /* 2950 * ABI: Before userspace accesses the GPU (e.g. execbuffer), report 2951 * EIO if the GPU is already wedged. 2952 */ 2953 err = intel_gt_terminally_wedged(ce->engine->gt); 2954 if (err) 2955 goto err; 2956 2957 eb->context = ce; 2958 eb->engine = ce->engine; 2959 2960 /* 2961 * Make sure engine pool stays alive even if we call intel_context_put 2962 * during ww handling. The pool is destroyed when last pm reference 2963 * is dropped, which breaks our -EDEADLK handling. 2964 */ 2965 return err; 2966 2967 err: 2968 intel_gt_pm_put(ce->engine->gt); 2969 intel_context_put(ce); 2970 return err; 2971 } 2972 2973 static void 2974 eb_put_engine(struct i915_execbuffer *eb) 2975 { 2976 intel_gt_pm_put(eb->engine->gt); 2977 intel_context_put(eb->context); 2978 } 2979 2980 static void 2981 __free_fence_array(struct eb_fence *fences, unsigned int n) 2982 { 2983 while (n--) { 2984 drm_syncobj_put(ptr_mask_bits(fences[n].syncobj, 2)); 2985 dma_fence_put(fences[n].dma_fence); 2986 kfree(fences[n].chain_fence); 2987 } 2988 kvfree(fences); 2989 } 2990 2991 static int 2992 add_timeline_fence_array(struct i915_execbuffer *eb, 2993 const struct drm_i915_gem_execbuffer_ext_timeline_fences *timeline_fences) 2994 { 2995 struct drm_i915_gem_exec_fence __user *user_fences; 2996 u64 __user *user_values; 2997 struct eb_fence *f; 2998 u64 nfences; 2999 int err = 0; 3000 3001 nfences = timeline_fences->fence_count; 3002 if (!nfences) 3003 return 0; 3004 3005 /* Check multiplication overflow for access_ok() and kvmalloc_array() */ 3006 BUILD_BUG_ON(sizeof(size_t) > sizeof(unsigned long)); 3007 if (nfences > min_t(unsigned long, 3008 ULONG_MAX / sizeof(*user_fences), 3009 SIZE_MAX / sizeof(*f)) - eb->num_fences) 3010 return -EINVAL; 3011 3012 user_fences = u64_to_user_ptr(timeline_fences->handles_ptr); 3013 if (!access_ok(user_fences, nfences * sizeof(*user_fences))) 3014 return -EFAULT; 3015 3016 user_values = u64_to_user_ptr(timeline_fences->values_ptr); 3017 if (!access_ok(user_values, nfences * sizeof(*user_values))) 3018 return -EFAULT; 3019 3020 f = krealloc(eb->fences, 3021 (eb->num_fences + nfences) * sizeof(*f), 3022 __GFP_NOWARN | GFP_KERNEL); 3023 if (!f) 3024 return -ENOMEM; 3025 3026 eb->fences = f; 3027 f += eb->num_fences; 3028 3029 BUILD_BUG_ON(~(ARCH_KMALLOC_MINALIGN - 1) & 3030 ~__I915_EXEC_FENCE_UNKNOWN_FLAGS); 3031 3032 while (nfences--) { 3033 struct drm_i915_gem_exec_fence user_fence; 3034 struct drm_syncobj *syncobj; 3035 struct dma_fence *fence = NULL; 3036 u64 point; 3037 3038 if (__copy_from_user(&user_fence, 3039 user_fences++, 3040 sizeof(user_fence))) 3041 return -EFAULT; 3042 3043 if (user_fence.flags & __I915_EXEC_FENCE_UNKNOWN_FLAGS) 3044 return -EINVAL; 3045 3046 if (__get_user(point, user_values++)) 3047 return -EFAULT; 3048 3049 syncobj = drm_syncobj_find(eb->file, user_fence.handle); 3050 if (!syncobj) { 3051 DRM_DEBUG("Invalid syncobj handle provided\n"); 3052 return -ENOENT; 3053 } 3054 3055 fence = drm_syncobj_fence_get(syncobj); 3056 3057 if (!fence && user_fence.flags && 3058 
!(user_fence.flags & I915_EXEC_FENCE_SIGNAL)) { 3059 DRM_DEBUG("Syncobj handle has no fence\n"); 3060 drm_syncobj_put(syncobj); 3061 return -EINVAL; 3062 } 3063 3064 if (fence) 3065 err = dma_fence_chain_find_seqno(&fence, point); 3066 3067 if (err && !(user_fence.flags & I915_EXEC_FENCE_SIGNAL)) { 3068 DRM_DEBUG("Syncobj handle missing requested point %llu\n", point); 3069 dma_fence_put(fence); 3070 drm_syncobj_put(syncobj); 3071 return err; 3072 } 3073 3074 /* 3075 * A point might have been signaled already and 3076 * garbage collected from the timeline. In this case 3077 * just ignore the point and carry on. 3078 */ 3079 if (!fence && !(user_fence.flags & I915_EXEC_FENCE_SIGNAL)) { 3080 drm_syncobj_put(syncobj); 3081 continue; 3082 } 3083 3084 /* 3085 * For timeline syncobjs we need to preallocate chains for 3086 * later signaling. 3087 */ 3088 if (point != 0 && user_fence.flags & I915_EXEC_FENCE_SIGNAL) { 3089 /* 3090 * Waiting and signaling the same point (when point != 3091 * 0) would break the timeline. 3092 */ 3093 if (user_fence.flags & I915_EXEC_FENCE_WAIT) { 3094 DRM_DEBUG("Trying to wait & signal the same timeline point.\n"); 3095 dma_fence_put(fence); 3096 drm_syncobj_put(syncobj); 3097 return -EINVAL; 3098 } 3099 3100 f->chain_fence = 3101 kmalloc(sizeof(*f->chain_fence), 3102 GFP_KERNEL); 3103 if (!f->chain_fence) { 3104 drm_syncobj_put(syncobj); 3105 dma_fence_put(fence); 3106 return -ENOMEM; 3107 } 3108 } else { 3109 f->chain_fence = NULL; 3110 } 3111 3112 f->syncobj = ptr_pack_bits(syncobj, user_fence.flags, 2); 3113 f->dma_fence = fence; 3114 f->value = point; 3115 f++; 3116 eb->num_fences++; 3117 } 3118 3119 return 0; 3120 } 3121 3122 static int add_fence_array(struct i915_execbuffer *eb) 3123 { 3124 struct drm_i915_gem_execbuffer2 *args = eb->args; 3125 struct drm_i915_gem_exec_fence __user *user; 3126 unsigned long num_fences = args->num_cliprects; 3127 struct eb_fence *f; 3128 3129 if (!(args->flags & I915_EXEC_FENCE_ARRAY)) 3130 return 0; 3131 3132 if (!num_fences) 3133 return 0; 3134 3135 /* Check multiplication overflow for access_ok() and kvmalloc_array() */ 3136 BUILD_BUG_ON(sizeof(size_t) > sizeof(unsigned long)); 3137 if (num_fences > min_t(unsigned long, 3138 ULONG_MAX / sizeof(*user), 3139 SIZE_MAX / sizeof(*f) - eb->num_fences)) 3140 return -EINVAL; 3141 3142 user = u64_to_user_ptr(args->cliprects_ptr); 3143 if (!access_ok(user, num_fences * sizeof(*user))) 3144 return -EFAULT; 3145 3146 f = krealloc(eb->fences, 3147 (eb->num_fences + num_fences) * sizeof(*f), 3148 __GFP_NOWARN | GFP_KERNEL); 3149 if (!f) 3150 return -ENOMEM; 3151 3152 eb->fences = f; 3153 f += eb->num_fences; 3154 while (num_fences--) { 3155 struct drm_i915_gem_exec_fence user_fence; 3156 struct drm_syncobj *syncobj; 3157 struct dma_fence *fence = NULL; 3158 3159 if (__copy_from_user(&user_fence, user++, sizeof(user_fence))) 3160 return -EFAULT; 3161 3162 if (user_fence.flags & __I915_EXEC_FENCE_UNKNOWN_FLAGS) 3163 return -EINVAL; 3164 3165 syncobj = drm_syncobj_find(eb->file, user_fence.handle); 3166 if (!syncobj) { 3167 DRM_DEBUG("Invalid syncobj handle provided\n"); 3168 return -ENOENT; 3169 } 3170 3171 if (user_fence.flags & I915_EXEC_FENCE_WAIT) { 3172 fence = drm_syncobj_fence_get(syncobj); 3173 if (!fence) { 3174 DRM_DEBUG("Syncobj handle has no fence\n"); 3175 drm_syncobj_put(syncobj); 3176 return -EINVAL; 3177 } 3178 } 3179 3180 BUILD_BUG_ON(~(ARCH_KMALLOC_MINALIGN - 1) & 3181 ~__I915_EXEC_FENCE_UNKNOWN_FLAGS); 3182 3183 f->syncobj = ptr_pack_bits(syncobj, user_fence.flags, 2); 
3184 f->dma_fence = fence; 3185 f->value = 0; 3186 f->chain_fence = NULL; 3187 f++; 3188 eb->num_fences++; 3189 } 3190 3191 return 0; 3192 } 3193 3194 static void put_fence_array(struct eb_fence *fences, int num_fences) 3195 { 3196 if (fences) 3197 __free_fence_array(fences, num_fences); 3198 } 3199 3200 static int 3201 await_fence_array(struct i915_execbuffer *eb) 3202 { 3203 unsigned int n; 3204 int err; 3205 3206 for (n = 0; n < eb->num_fences; n++) { 3207 struct drm_syncobj *syncobj; 3208 unsigned int flags; 3209 3210 syncobj = ptr_unpack_bits(eb->fences[n].syncobj, &flags, 2); 3211 3212 if (!eb->fences[n].dma_fence) 3213 continue; 3214 3215 err = i915_request_await_dma_fence(eb->request, 3216 eb->fences[n].dma_fence); 3217 if (err < 0) 3218 return err; 3219 } 3220 3221 return 0; 3222 } 3223 3224 static void signal_fence_array(const struct i915_execbuffer *eb) 3225 { 3226 struct dma_fence * const fence = &eb->request->fence; 3227 unsigned int n; 3228 3229 for (n = 0; n < eb->num_fences; n++) { 3230 struct drm_syncobj *syncobj; 3231 unsigned int flags; 3232 3233 syncobj = ptr_unpack_bits(eb->fences[n].syncobj, &flags, 2); 3234 if (!(flags & I915_EXEC_FENCE_SIGNAL)) 3235 continue; 3236 3237 if (eb->fences[n].chain_fence) { 3238 drm_syncobj_add_point(syncobj, 3239 eb->fences[n].chain_fence, 3240 fence, 3241 eb->fences[n].value); 3242 /* 3243 * The chain's ownership is transferred to the 3244 * timeline. 3245 */ 3246 eb->fences[n].chain_fence = NULL; 3247 } else { 3248 drm_syncobj_replace_fence(syncobj, fence); 3249 } 3250 } 3251 } 3252 3253 static int 3254 parse_timeline_fences(struct i915_user_extension __user *ext, void *data) 3255 { 3256 struct i915_execbuffer *eb = data; 3257 struct drm_i915_gem_execbuffer_ext_timeline_fences timeline_fences; 3258 3259 if (copy_from_user(&timeline_fences, ext, sizeof(timeline_fences))) 3260 return -EFAULT; 3261 3262 return add_timeline_fence_array(eb, &timeline_fences); 3263 } 3264 3265 static void retire_requests(struct intel_timeline *tl, struct i915_request *end) 3266 { 3267 struct i915_request *rq, *rn; 3268 3269 list_for_each_entry_safe(rq, rn, &tl->requests, link) 3270 if (rq == end || !i915_request_retire(rq)) 3271 break; 3272 } 3273 3274 static int eb_request_add(struct i915_execbuffer *eb, int err) 3275 { 3276 struct i915_request *rq = eb->request; 3277 struct intel_timeline * const tl = i915_request_timeline(rq); 3278 struct i915_sched_attr attr = {}; 3279 struct i915_request *prev; 3280 3281 lockdep_assert_held(&tl->mutex); 3282 lockdep_unpin_lock(&tl->mutex, rq->cookie); 3283 3284 trace_i915_request_add(rq); 3285 3286 prev = __i915_request_commit(rq); 3287 3288 /* Check that the context wasn't destroyed before submission */ 3289 if (likely(!intel_context_is_closed(eb->context))) { 3290 attr = eb->gem_context->sched; 3291 } else { 3292 /* Serialise with context_close via the add_to_timeline */ 3293 i915_request_set_error_once(rq, -ENOENT); 3294 __i915_request_skip(rq); 3295 err = -ENOENT; /* override any transient errors */ 3296 } 3297 3298 __i915_request_queue(rq, &attr); 3299 3300 /* Try to clean up the client's timeline after submitting the request */ 3301 if (prev) 3302 retire_requests(tl, prev); 3303 3304 mutex_unlock(&tl->mutex); 3305 3306 return err; 3307 } 3308 3309 static const i915_user_extension_fn execbuf_extensions[] = { 3310 [DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES] = parse_timeline_fences, 3311 }; 3312 3313 static int 3314 parse_execbuf2_extensions(struct drm_i915_gem_execbuffer2 *args, 3315 struct i915_execbuffer *eb) 
3316 { 3317 if (!(args->flags & I915_EXEC_USE_EXTENSIONS)) 3318 return 0; 3319 3320 /* The execbuf2 extension mechanism reuses cliprects_ptr. So we cannot 3321 * have another flag also using it at the same time. 3322 */ 3323 if (eb->args->flags & I915_EXEC_FENCE_ARRAY) 3324 return -EINVAL; 3325 3326 if (args->num_cliprects != 0) 3327 return -EINVAL; 3328 3329 return i915_user_extensions(u64_to_user_ptr(args->cliprects_ptr), 3330 execbuf_extensions, 3331 ARRAY_SIZE(execbuf_extensions), 3332 eb); 3333 } 3334 3335 static int 3336 i915_gem_do_execbuffer(struct drm_device *dev, 3337 struct drm_file *file, 3338 struct drm_i915_gem_execbuffer2 *args, 3339 struct drm_i915_gem_exec_object2 *exec) 3340 { 3341 struct drm_i915_private *i915 = to_i915(dev); 3342 struct i915_execbuffer eb; 3343 struct dma_fence *in_fence = NULL; 3344 struct sync_file *out_fence = NULL; 3345 struct i915_vma *batch; 3346 int out_fence_fd = -1; 3347 int err; 3348 3349 BUILD_BUG_ON(__EXEC_INTERNAL_FLAGS & ~__I915_EXEC_ILLEGAL_FLAGS); 3350 BUILD_BUG_ON(__EXEC_OBJECT_INTERNAL_FLAGS & 3351 ~__EXEC_OBJECT_UNKNOWN_FLAGS); 3352 3353 eb.i915 = i915; 3354 eb.file = file; 3355 eb.args = args; 3356 if (DBG_FORCE_RELOC || !(args->flags & I915_EXEC_NO_RELOC)) 3357 args->flags |= __EXEC_HAS_RELOC; 3358 3359 eb.exec = exec; 3360 eb.vma = (struct eb_vma *)(exec + args->buffer_count + 1); 3361 eb.vma[0].vma = NULL; 3362 eb.reloc_pool = eb.batch_pool = NULL; 3363 eb.reloc_context = NULL; 3364 3365 eb.invalid_flags = __EXEC_OBJECT_UNKNOWN_FLAGS; 3366 reloc_cache_init(&eb.reloc_cache, eb.i915); 3367 3368 eb.buffer_count = args->buffer_count; 3369 eb.batch_start_offset = args->batch_start_offset; 3370 eb.batch_len = args->batch_len; 3371 eb.trampoline = NULL; 3372 3373 eb.fences = NULL; 3374 eb.num_fences = 0; 3375 3376 eb.batch_flags = 0; 3377 if (args->flags & I915_EXEC_SECURE) { 3378 if (INTEL_GEN(i915) >= 11) 3379 return -ENODEV; 3380 3381 /* Return -EPERM to trigger fallback code on old binaries. 
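 *
 * In other words, a hedged userspace-side sketch of the privileged
 * path (the flag is existing uapi):
 *
 *	execbuf.flags |= I915_EXEC_SECURE;
 *
 * only succeeds for the DRM master with CAP_SYS_ADMIN on hardware that
 * still supports secure dispatch; everyone else sees -EPERM and is
 * expected to fall back to an unprivileged submission.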
*/ 3382 if (!HAS_SECURE_BATCHES(i915)) 3383 return -EPERM; 3384 3385 if (!drm_is_current_master(file) || !capable(CAP_SYS_ADMIN)) 3386 return -EPERM; 3387 3388 eb.batch_flags |= I915_DISPATCH_SECURE; 3389 } 3390 if (args->flags & I915_EXEC_IS_PINNED) 3391 eb.batch_flags |= I915_DISPATCH_PINNED; 3392 3393 err = parse_execbuf2_extensions(args, &eb); 3394 if (err) 3395 goto err_ext; 3396 3397 err = add_fence_array(&eb); 3398 if (err) 3399 goto err_ext; 3400 3401 #define IN_FENCES (I915_EXEC_FENCE_IN | I915_EXEC_FENCE_SUBMIT) 3402 if (args->flags & IN_FENCES) { 3403 if ((args->flags & IN_FENCES) == IN_FENCES) 3404 return -EINVAL; 3405 3406 in_fence = sync_file_get_fence(lower_32_bits(args->rsvd2)); 3407 if (!in_fence) { 3408 err = -EINVAL; 3409 goto err_ext; 3410 } 3411 } 3412 #undef IN_FENCES 3413 3414 if (args->flags & I915_EXEC_FENCE_OUT) { 3415 out_fence_fd = get_unused_fd_flags(O_CLOEXEC); 3416 if (out_fence_fd < 0) { 3417 err = out_fence_fd; 3418 goto err_in_fence; 3419 } 3420 } 3421 3422 err = eb_create(&eb); 3423 if (err) 3424 goto err_out_fence; 3425 3426 GEM_BUG_ON(!eb.lut_size); 3427 3428 err = eb_select_context(&eb); 3429 if (unlikely(err)) 3430 goto err_destroy; 3431 3432 err = eb_select_engine(&eb); 3433 if (unlikely(err)) 3434 goto err_context; 3435 3436 err = eb_lookup_vmas(&eb); 3437 if (err) { 3438 eb_release_vmas(&eb, true, true); 3439 goto err_engine; 3440 } 3441 3442 i915_gem_ww_ctx_init(&eb.ww, true); 3443 3444 err = eb_relocate_parse(&eb); 3445 if (err) { 3446 /* 3447 * If the user expects the execobject.offset and 3448 * reloc.presumed_offset to be an exact match, 3449 * as for using NO_RELOC, then we cannot update 3450 * the execobject.offset until we have completed 3451 * relocation. 3452 */ 3453 args->flags &= ~__EXEC_HAS_RELOC; 3454 goto err_vma; 3455 } 3456 3457 ww_acquire_done(&eb.ww.ctx); 3458 3459 batch = eb.batch->vma; 3460 3461 /* All GPU relocation batches must be submitted prior to the user rq */ 3462 GEM_BUG_ON(eb.reloc_cache.rq); 3463 3464 /* Allocate a request for this batch buffer nice and early. */ 3465 eb.request = i915_request_create(eb.context); 3466 if (IS_ERR(eb.request)) { 3467 err = PTR_ERR(eb.request); 3468 goto err_vma; 3469 } 3470 3471 if (in_fence) { 3472 if (args->flags & I915_EXEC_FENCE_SUBMIT) 3473 err = i915_request_await_execution(eb.request, 3474 in_fence, 3475 eb.engine->bond_execute); 3476 else 3477 err = i915_request_await_dma_fence(eb.request, 3478 in_fence); 3479 if (err < 0) 3480 goto err_request; 3481 } 3482 3483 if (eb.fences) { 3484 err = await_fence_array(&eb); 3485 if (err) 3486 goto err_request; 3487 } 3488 3489 if (out_fence_fd != -1) { 3490 out_fence = sync_file_create(&eb.request->fence); 3491 if (!out_fence) { 3492 err = -ENOMEM; 3493 goto err_request; 3494 } 3495 } 3496 3497 /* 3498 * Whilst this request exists, batch_obj will be on the 3499 * active_list, and so will hold the active reference. Only when this 3500 * request is retired will the the batch_obj be moved onto the 3501 * inactive_list and lose its active reference. Hence we do not need 3502 * to explicitly hold another reference here. 
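 *
 * (Aside: the in/out fence plumbing set up just above corresponds to
 * userspace doing roughly the following - a hedged sketch, error
 * handling omitted:
 *
 *	execbuf.flags |= I915_EXEC_FENCE_IN | I915_EXEC_FENCE_OUT;
 *	execbuf.rsvd2 = in_fence_fd;			// lower 32 bits: fence to wait on
 *	ioctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2_WR, &execbuf);
 *	out_fence_fd = execbuf.rsvd2 >> 32;		// upper 32 bits: new sync_file
 *
 * with the _WR variant of the ioctl needed so that rsvd2 is copied
 * back out.)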
3503 */ 3504 eb.request->batch = batch; 3505 if (eb.batch_pool) 3506 intel_gt_buffer_pool_mark_active(eb.batch_pool, eb.request); 3507 3508 trace_i915_request_queue(eb.request, eb.batch_flags); 3509 err = eb_submit(&eb, batch); 3510 3511 err_request: 3512 i915_request_get(eb.request); 3513 err = eb_request_add(&eb, err); 3514 3515 if (eb.fences) 3516 signal_fence_array(&eb); 3517 3518 if (out_fence) { 3519 if (err == 0) { 3520 fd_install(out_fence_fd, out_fence->file); 3521 args->rsvd2 &= GENMASK_ULL(31, 0); /* keep in-fence */ 3522 args->rsvd2 |= (u64)out_fence_fd << 32; 3523 out_fence_fd = -1; 3524 } else { 3525 fput(out_fence->file); 3526 } 3527 } 3528 i915_request_put(eb.request); 3529 3530 err_vma: 3531 eb_release_vmas(&eb, true, true); 3532 if (eb.trampoline) 3533 i915_vma_unpin(eb.trampoline); 3534 WARN_ON(err == -EDEADLK); 3535 i915_gem_ww_ctx_fini(&eb.ww); 3536 3537 if (eb.batch_pool) 3538 intel_gt_buffer_pool_put(eb.batch_pool); 3539 if (eb.reloc_pool) 3540 intel_gt_buffer_pool_put(eb.reloc_pool); 3541 if (eb.reloc_context) 3542 intel_context_put(eb.reloc_context); 3543 err_engine: 3544 eb_put_engine(&eb); 3545 err_context: 3546 i915_gem_context_put(eb.gem_context); 3547 err_destroy: 3548 eb_destroy(&eb); 3549 err_out_fence: 3550 if (out_fence_fd != -1) 3551 put_unused_fd(out_fence_fd); 3552 err_in_fence: 3553 dma_fence_put(in_fence); 3554 err_ext: 3555 put_fence_array(eb.fences, eb.num_fences); 3556 return err; 3557 } 3558 3559 static size_t eb_element_size(void) 3560 { 3561 return sizeof(struct drm_i915_gem_exec_object2) + sizeof(struct eb_vma); 3562 } 3563 3564 static bool check_buffer_count(size_t count) 3565 { 3566 const size_t sz = eb_element_size(); 3567 3568 /* 3569 * When using LUT_HANDLE, we impose a limit of INT_MAX for the lookup 3570 * array size (see eb_create()). Otherwise, we can accept an array as 3571 * large as can be addressed (though use large arrays at your peril)! 3572 */ 3573 3574 return !(count < 1 || count > INT_MAX || count > SIZE_MAX / sz - 1); 3575 } 3576 3577 int 3578 i915_gem_execbuffer2_ioctl(struct drm_device *dev, void *data, 3579 struct drm_file *file) 3580 { 3581 struct drm_i915_private *i915 = to_i915(dev); 3582 struct drm_i915_gem_execbuffer2 *args = data; 3583 struct drm_i915_gem_exec_object2 *exec2_list; 3584 const size_t count = args->buffer_count; 3585 int err; 3586 3587 if (!check_buffer_count(count)) { 3588 drm_dbg(&i915->drm, "execbuf2 with %zd buffers\n", count); 3589 return -EINVAL; 3590 } 3591 3592 err = i915_gem_check_execbuffer(args); 3593 if (err) 3594 return err; 3595 3596 /* Allocate extra slots for use by the command parser */ 3597 exec2_list = kvmalloc_array(count + 2, eb_element_size(), 3598 __GFP_NOWARN | GFP_KERNEL); 3599 if (exec2_list == NULL) { 3600 drm_dbg(&i915->drm, "Failed to allocate exec list for %zd buffers\n", 3601 count); 3602 return -ENOMEM; 3603 } 3604 if (copy_from_user(exec2_list, 3605 u64_to_user_ptr(args->buffers_ptr), 3606 sizeof(*exec2_list) * count)) { 3607 drm_dbg(&i915->drm, "copy %zd exec entries failed\n", count); 3608 kvfree(exec2_list); 3609 return -EFAULT; 3610 } 3611 3612 err = i915_gem_do_execbuffer(dev, file, args, exec2_list); 3613 3614 /* 3615 * Now that we have begun execution of the batchbuffer, we ignore 3616 * any new error after this point. Also given that we have already 3617 * updated the associated relocations, we try to write out the current 3618 * object locations irrespective of any error. 
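 *
 * A hedged sketch of the consuming side in userspace (names are
 * illustrative): after the ioctl returns, the exec list it passed in
 * carries the addresses actually used,
 *
 *	ioctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
 *	for (i = 0; i < execbuf.buffer_count; i++)
 *		bo_cache[i].gpu_addr = exec_objects[i].offset;
 *
 * which is what allows the next submission to use I915_EXEC_NO_RELOC
 * with matching presumed offsets.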
3619 */ 3620 if (args->flags & __EXEC_HAS_RELOC) { 3621 struct drm_i915_gem_exec_object2 __user *user_exec_list = 3622 u64_to_user_ptr(args->buffers_ptr); 3623 unsigned int i; 3624 3625 /* Copy the new buffer offsets back to the user's exec list. */ 3626 /* 3627 * Note: count * sizeof(*user_exec_list) does not overflow, 3628 * because we checked 'count' in check_buffer_count(). 3629 * 3630 * And this range already got effectively checked earlier 3631 * when we did the "copy_from_user()" above. 3632 */ 3633 if (!user_write_access_begin(user_exec_list, 3634 count * sizeof(*user_exec_list))) 3635 goto end; 3636 3637 for (i = 0; i < args->buffer_count; i++) { 3638 if (!(exec2_list[i].offset & UPDATE)) 3639 continue; 3640 3641 exec2_list[i].offset = 3642 gen8_canonical_addr(exec2_list[i].offset & PIN_OFFSET_MASK); 3643 unsafe_put_user(exec2_list[i].offset, 3644 &user_exec_list[i].offset, 3645 end_user); 3646 } 3647 end_user: 3648 user_write_access_end(); 3649 end:; 3650 } 3651 3652 args->flags &= ~__I915_EXEC_UNKNOWN_FLAGS; 3653 kvfree(exec2_list); 3654 return err; 3655 } 3656 3657 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 3658 #include "selftests/i915_gem_execbuffer.c" 3659 #endif 3660
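/*
 * Hedged userspace-side sketch of the timeline-fence extension handled by
 * parse_timeline_fences() and add_timeline_fence_array() above; the handle
 * and point values are illustrative and error handling is omitted:
 *
 *	struct drm_i915_gem_exec_fence fence = {
 *		.handle = syncobj_handle,
 *		.flags = I915_EXEC_FENCE_SIGNAL,
 *	};
 *	__u64 point = 3;	// timeline point to signal
 *	struct drm_i915_gem_execbuffer_ext_timeline_fences ext = {
 *		.base.name = DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES,
 *		.fence_count = 1,
 *		.handles_ptr = (uintptr_t)&fence,
 *		.values_ptr = (uintptr_t)&point,
 *	};
 *
 *	execbuf.flags |= I915_EXEC_USE_EXTENSIONS;
 *	execbuf.num_cliprects = 0;
 *	execbuf.cliprects_ptr = (uintptr_t)&ext;
 *	ioctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
 */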