/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2008,2010 Intel Corporation
 */

#include <linux/intel-iommu.h>
#include <linux/dma-resv.h>
#include <linux/sync_file.h>
#include <linux/uaccess.h>

#include <drm/drm_syncobj.h>
#include <drm/i915_drm.h>

#include "display/intel_frontbuffer.h"

#include "gem/i915_gem_ioctls.h"
#include "gt/intel_context.h"
#include "gt/intel_engine_pool.h"
#include "gt/intel_gt.h"
#include "gt/intel_gt_pm.h"
#include "gt/intel_ring.h"

#include "i915_drv.h"
#include "i915_gem_clflush.h"
#include "i915_gem_context.h"
#include "i915_gem_ioctls.h"
#include "i915_sw_fence_work.h"
#include "i915_trace.h"

enum {
	FORCE_CPU_RELOC = 1,
	FORCE_GTT_RELOC,
	FORCE_GPU_RELOC,
#define DBG_FORCE_RELOC 0 /* choose one of the above! */
};

#define __EXEC_OBJECT_HAS_REF BIT(31)
#define __EXEC_OBJECT_HAS_PIN BIT(30)
#define __EXEC_OBJECT_HAS_FENCE BIT(29)
#define __EXEC_OBJECT_NEEDS_MAP BIT(28)
#define __EXEC_OBJECT_NEEDS_BIAS BIT(27)
#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 27) /* all of the above */
#define __EXEC_OBJECT_RESERVED (__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE)

#define __EXEC_HAS_RELOC BIT(31)
#define __EXEC_VALIDATED BIT(30)
#define __EXEC_INTERNAL_FLAGS (~0u << 30)
#define UPDATE PIN_OFFSET_FIXED

#define BATCH_OFFSET_BIAS (256*1024)

#define __I915_EXEC_ILLEGAL_FLAGS \
	(__I915_EXEC_UNKNOWN_FLAGS | \
	 I915_EXEC_CONSTANTS_MASK | \
	 I915_EXEC_RESOURCE_STREAMER)

/* Catch emission of unexpected errors for CI! */
#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
#undef EINVAL
#define EINVAL ({ \
	DRM_DEBUG_DRIVER("EINVAL at %s:%d\n", __func__, __LINE__); \
	22; \
})
#endif

/**
 * DOC: User command execution
 *
 * Userspace submits commands to be executed on the GPU as an instruction
 * stream within a GEM object we call a batchbuffer. These instructions may
 * refer to other GEM objects containing auxiliary state such as kernels,
 * samplers, render targets and even secondary batchbuffers. Userspace does
 * not know where in the GPU memory these objects reside and so before the
 * batchbuffer is passed to the GPU for execution, those addresses in the
 * batchbuffer and auxiliary objects are updated. This is known as relocation,
 * or patching. To try and avoid having to relocate each object on the next
 * execution, userspace is told the location of those objects in this pass,
 * but this remains just a hint as the kernel may choose a new location for
 * any object in the future.
 *
 * At the level of talking to the hardware, submitting a batchbuffer for the
 * GPU to execute amounts to adding content to a buffer from which the HW
 * command streamer is reading.
 *
 * 1. Add a command to load the HW context. For Logical Ring Contexts, i.e.
 *    Execlists, this command is not placed on the same buffer as the
 *    remaining items.
 *
 * 2. Add a command to invalidate caches to the buffer.
 *
 * 3. Add a batchbuffer start command to the buffer; the start command is
 *    essentially a token together with the GPU address of the batchbuffer
 *    to be executed.
 *
 * 4. Add a pipeline flush to the buffer.
 *
 * 5. Add a memory write command to the buffer to record when the GPU
 *    is done executing the batchbuffer.
 *    The memory write writes the global sequence number of the request,
 *    ``i915_request::global_seqno``; the i915 driver uses the current value
 *    in the register to determine if the GPU has completed the batchbuffer.
 *
 * 6. Add a user interrupt command to the buffer. This command instructs
 *    the GPU to issue an interrupt when the command, pipeline flush and
 *    memory write are completed.
 *
 * 7. Inform the hardware of the additional commands added to the buffer
 *    (by updating the tail pointer).
 *
 * Processing an execbuf ioctl is conceptually split up into a few phases.
 *
 * 1. Validation - Ensure all the pointers, handles and flags are valid.
 * 2. Reservation - Assign GPU address space for every object
 * 3. Relocation - Update any addresses to point to the final locations
 * 4. Serialisation - Order the request with respect to its dependencies
 * 5. Construction - Construct a request to execute the batchbuffer
 * 6. Submission (at some point in the future execution)
 *
 * Reserving resources for the execbuf is the most complicated phase. We
 * neither want to have to migrate the object in the address space, nor do
 * we want to have to update any relocations pointing to this object. Ideally,
 * we want to leave the object where it is and for all the existing relocations
 * to match. If the object is given a new address, or if userspace thinks the
 * object is elsewhere, we have to parse all the relocation entries and update
 * the addresses. Userspace can set the I915_EXEC_NO_RELOC flag to hint that
 * all the target addresses in all of its objects match the value in the
 * relocation entries and that they all match the presumed offsets given by the
 * list of execbuffer objects. Using this knowledge, we know that if we haven't
 * moved any buffers, all the relocation entries are valid and we can skip
 * the update. (If userspace is wrong, the likely outcome is an impromptu GPU
 * hang.) The requirements for using I915_EXEC_NO_RELOC are:
 *
 *      The addresses written in the objects must match the corresponding
 *      reloc.presumed_offset which in turn must match the corresponding
 *      execobject.offset.
 *
 *      Any render targets written to in the batch must be flagged with
 *      EXEC_OBJECT_WRITE.
 *
 *      To avoid stalling, execobject.offset should match the current
 *      address of that object within the active context.
 *
 * The reservation is done in multiple phases. First we try and keep any
 * object already bound in its current location - so as long as it meets the
 * constraints imposed by the new execbuffer. Any object left unbound after the
 * first pass is then fitted into any available idle space. If an object does
 * not fit, all objects are removed from the reservation and the process rerun
 * after sorting the objects into a priority order (more difficult to fit
 * objects are tried first). Failing that, the entire VM is cleared and we try
 * to fit the execbuf one last time before concluding that it simply will not
 * fit.
 *
 * A small complication to all of this is that we allow userspace not only to
 * specify an alignment and a size for the object in the address space, but
 * we also allow userspace to specify the exact offset. Such objects are
 * simpler to place (the location is known a priori); all we have to do is
 * make sure the space is available.
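 *
 * As a purely illustrative userspace sketch (not part of this driver; the
 * names ``target_bo`` and ``target_gpu_addr`` are assumed, and the reloc
 * array would be attached to the batch's execobject via relocs_ptr), an
 * entry pair satisfying the I915_EXEC_NO_RELOC rules above could look like:
 *
 *	struct drm_i915_gem_relocation_entry reloc = {
 *		.target_handle = target_bo,
 *		.offset = 0x40,				/* where in the batch to patch */
 *		.delta = 0,
 *		.presumed_offset = target_gpu_addr,	/* where userspace last saw it */
 *		.read_domains = I915_GEM_DOMAIN_RENDER,
 *	};
 *	struct drm_i915_gem_exec_object2 obj = {
 *		.handle = target_bo,
 *		.offset = target_gpu_addr,	/* must equal reloc.presumed_offset */
 *		.flags = EXEC_OBJECT_WRITE,	/* the batch writes this target */
 *	};
 *
 * If every object keeps the address recorded this way, the kernel can skip
 * the relocation walk entirely.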
 *
 * Once all the objects are in place, patching up the buried pointers to point
 * to the final locations is a fairly simple job of walking over the relocation
 * entry arrays, looking up the right address and rewriting the value into
 * the object. Simple! ... The relocation entries are stored in user memory
 * and so to access them we have to copy them into a local buffer. That copy
 * has to avoid taking any pagefaults as they may lead back to a GEM object
 * requiring the struct_mutex (i.e. recursive deadlock). So once again we split
 * the relocation into multiple passes. First we try to do everything within an
 * atomic context (avoid the pagefaults) which requires that we never wait. If
 * we detect that we may wait, or if we need to fault, then we have to fallback
 * to a slower path. The slowpath has to drop the mutex. (Can you hear alarm
 * bells yet?) Dropping the mutex means that we lose all the state we have
 * built up so far for the execbuf and we must reset any global data. However,
 * we do leave the objects pinned in their final locations - which is a
 * potential issue for concurrent execbufs. Once we have left the mutex, we can
 * allocate and copy all the relocation entries into a large array at our
 * leisure, reacquire the mutex, reclaim all the objects and other state and
 * then proceed to update any incorrect addresses with the objects.
 *
 * As we process the relocation entries, we maintain a record of whether the
 * object is being written to. Using NO_RELOC, we expect userspace to provide
 * this information instead. We also check whether we can skip the relocation
 * by comparing the expected value inside the relocation entry with the target's
 * final address. If they differ, we have to map the current object and rewrite
 * the 4 or 8 byte pointer within.
 *
 * Serialising an execbuf is quite simple according to the rules of the GEM
 * ABI. Execution within each context is ordered by the order of submission.
 * Writes to any GEM object are in order of submission and are exclusive. Reads
 * from a GEM object are unordered with respect to other reads, but ordered by
 * writes. A write submitted after a read cannot occur before the read, and
 * similarly any read submitted after a write cannot occur before the write.
 * Writes are ordered between engines such that only one write occurs at any
 * time (completing any reads beforehand) - using semaphores where available
 * and CPU serialisation otherwise. Other GEM accesses obey the same rules: any
 * write (either via mmaps using set-domain, or via pwrite) must flush all GPU
 * reads before starting, and any read (either using set-domain or pread) must
 * flush all GPU writes before starting. (Note we only employ a barrier before,
 * we currently rely on userspace not concurrently starting a new execution
 * whilst reading or writing to an object. This may be an advantage or not
 * depending on how much you trust userspace not to shoot themselves in the
 * foot.) Serialisation may just result in the request being inserted into
 * a DAG awaiting its turn, but the simplest approach is to wait on the CPU
 * until all dependencies are resolved.
 *
 * After all of that, it is just a matter of closing the request and handing
 * it to the hardware (well, leaving it in a queue to be executed).
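 *
 * As a minimal sketch of that hand-off from userspace (again illustrative
 * only; ``fd``, ``objects``, ``num_objects``, ``batch_bytes`` and ``ctx_id``
 * are assumed to exist, and drmIoctl() comes from libdrm):
 *
 *	struct drm_i915_gem_execbuffer2 execbuf = {
 *		.buffers_ptr = (uintptr_t)objects,	/* execobj[] array, batch last */
 *		.buffer_count = num_objects,
 *		.batch_len = batch_bytes,
 *		.flags = I915_EXEC_RENDER | I915_EXEC_NO_RELOC | I915_EXEC_HANDLE_LUT,
 *		.rsvd1 = ctx_id,			/* context to execute within */
 *	};
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
 *
 * All of the phases described in this section are carried out while
 * servicing that single ioctl.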
 *
 * However, we also offer the ability for batchbuffers to be run with elevated
 * privileges so that they can access otherwise hidden registers. (Used to
 * adjust L3 cache etc.) Before any batch is given extra privileges we first
 * must check that it contains no nefarious instructions: we check that each
 * instruction is from our whitelist and all registers are also from an allowed
 * list. We first copy the user's batchbuffer to a shadow (so that the user
 * doesn't have access to it, either by the CPU or GPU as we scan it) and then
 * parse each instruction. If everything is ok, we set a flag telling the
 * hardware to run the batchbuffer in trusted mode, otherwise the ioctl is
 * rejected.
 */

struct i915_execbuffer {
	struct drm_i915_private *i915; /** i915 backpointer */
	struct drm_file *file; /** per-file lookup tables and limits */
	struct drm_i915_gem_execbuffer2 *args; /** ioctl parameters */
	struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */
	struct i915_vma **vma;
	unsigned int *flags;

	struct intel_engine_cs *engine; /** engine to queue the request to */
	struct intel_context *context; /* logical state for the request */
	struct i915_gem_context *gem_context; /** caller's context */

	struct i915_request *request; /** our request to build */
	struct i915_vma *batch; /** identity of the batch obj/vma */
	struct i915_vma *trampoline; /** trampoline used for chaining */

	/** actual size of execobj[] as we may extend it for the cmdparser */
	unsigned int buffer_count;

	/** list of vma not yet bound during reservation phase */
	struct list_head unbound;

	/** list of vma that have execobj.relocation_count */
	struct list_head relocs;

	/**
	 * Track the most recently used object for relocations, as we
	 * frequently have to perform multiple relocations within the same
	 * obj/page
	 */
	struct reloc_cache {
		struct drm_mm_node node; /** temporary GTT binding */
		unsigned long vaddr; /** Current kmap address */
		unsigned long page; /** Currently mapped page index */
		unsigned int gen; /** Cached value of INTEL_GEN */
		bool use_64bit_reloc : 1;
		bool has_llc : 1;
		bool has_fence : 1;
		bool needs_unfenced : 1;

		struct i915_request *rq;
		u32 *rq_cmd;
		unsigned int rq_size;
	} reloc_cache;

	u64 invalid_flags; /** Set of execobj.flags that are invalid */
	u32 context_flags; /** Set of execobj.flags to insert from the ctx */

	u32 batch_start_offset; /** Location within object of batch */
	u32 batch_len; /** Length of batch within object */
	u32 batch_flags; /** Flags composed for emit_bb_start() */

	/**
	 * Indicate either the size of the hashtable used to resolve
	 * relocation handles, or if negative that we are using a direct
	 * index into the execobj[].
274 */ 275 int lut_size; 276 struct hlist_head *buckets; /** ht for relocation handles */ 277 }; 278 279 #define exec_entry(EB, VMA) (&(EB)->exec[(VMA)->exec_flags - (EB)->flags]) 280 281 static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb) 282 { 283 return intel_engine_requires_cmd_parser(eb->engine) || 284 (intel_engine_using_cmd_parser(eb->engine) && 285 eb->args->batch_len); 286 } 287 288 static int eb_create(struct i915_execbuffer *eb) 289 { 290 if (!(eb->args->flags & I915_EXEC_HANDLE_LUT)) { 291 unsigned int size = 1 + ilog2(eb->buffer_count); 292 293 /* 294 * Without a 1:1 association between relocation handles and 295 * the execobject[] index, we instead create a hashtable. 296 * We size it dynamically based on available memory, starting 297 * first with 1:1 assocative hash and scaling back until 298 * the allocation succeeds. 299 * 300 * Later on we use a positive lut_size to indicate we are 301 * using this hashtable, and a negative value to indicate a 302 * direct lookup. 303 */ 304 do { 305 gfp_t flags; 306 307 /* While we can still reduce the allocation size, don't 308 * raise a warning and allow the allocation to fail. 309 * On the last pass though, we want to try as hard 310 * as possible to perform the allocation and warn 311 * if it fails. 312 */ 313 flags = GFP_KERNEL; 314 if (size > 1) 315 flags |= __GFP_NORETRY | __GFP_NOWARN; 316 317 eb->buckets = kzalloc(sizeof(struct hlist_head) << size, 318 flags); 319 if (eb->buckets) 320 break; 321 } while (--size); 322 323 if (unlikely(!size)) 324 return -ENOMEM; 325 326 eb->lut_size = size; 327 } else { 328 eb->lut_size = -eb->buffer_count; 329 } 330 331 return 0; 332 } 333 334 static bool 335 eb_vma_misplaced(const struct drm_i915_gem_exec_object2 *entry, 336 const struct i915_vma *vma, 337 unsigned int flags) 338 { 339 if (vma->node.size < entry->pad_to_size) 340 return true; 341 342 if (entry->alignment && !IS_ALIGNED(vma->node.start, entry->alignment)) 343 return true; 344 345 if (flags & EXEC_OBJECT_PINNED && 346 vma->node.start != entry->offset) 347 return true; 348 349 if (flags & __EXEC_OBJECT_NEEDS_BIAS && 350 vma->node.start < BATCH_OFFSET_BIAS) 351 return true; 352 353 if (!(flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) && 354 (vma->node.start + vma->node.size - 1) >> 32) 355 return true; 356 357 if (flags & __EXEC_OBJECT_NEEDS_MAP && 358 !i915_vma_is_map_and_fenceable(vma)) 359 return true; 360 361 return false; 362 } 363 364 static inline bool 365 eb_pin_vma(struct i915_execbuffer *eb, 366 const struct drm_i915_gem_exec_object2 *entry, 367 struct i915_vma *vma) 368 { 369 unsigned int exec_flags = *vma->exec_flags; 370 u64 pin_flags; 371 372 if (vma->node.size) 373 pin_flags = vma->node.start; 374 else 375 pin_flags = entry->offset & PIN_OFFSET_MASK; 376 377 pin_flags |= PIN_USER | PIN_NOEVICT | PIN_OFFSET_FIXED; 378 if (unlikely(exec_flags & EXEC_OBJECT_NEEDS_GTT)) 379 pin_flags |= PIN_GLOBAL; 380 381 if (unlikely(i915_vma_pin(vma, 0, 0, pin_flags))) 382 return false; 383 384 if (unlikely(exec_flags & EXEC_OBJECT_NEEDS_FENCE)) { 385 if (unlikely(i915_vma_pin_fence(vma))) { 386 i915_vma_unpin(vma); 387 return false; 388 } 389 390 if (vma->fence) 391 exec_flags |= __EXEC_OBJECT_HAS_FENCE; 392 } 393 394 *vma->exec_flags = exec_flags | __EXEC_OBJECT_HAS_PIN; 395 return !eb_vma_misplaced(entry, vma, exec_flags); 396 } 397 398 static inline void __eb_unreserve_vma(struct i915_vma *vma, unsigned int flags) 399 { 400 GEM_BUG_ON(!(flags & __EXEC_OBJECT_HAS_PIN)); 401 402 if (unlikely(flags & 
__EXEC_OBJECT_HAS_FENCE)) 403 __i915_vma_unpin_fence(vma); 404 405 __i915_vma_unpin(vma); 406 } 407 408 static inline void 409 eb_unreserve_vma(struct i915_vma *vma, unsigned int *flags) 410 { 411 if (!(*flags & __EXEC_OBJECT_HAS_PIN)) 412 return; 413 414 __eb_unreserve_vma(vma, *flags); 415 *flags &= ~__EXEC_OBJECT_RESERVED; 416 } 417 418 static int 419 eb_validate_vma(struct i915_execbuffer *eb, 420 struct drm_i915_gem_exec_object2 *entry, 421 struct i915_vma *vma) 422 { 423 if (unlikely(entry->flags & eb->invalid_flags)) 424 return -EINVAL; 425 426 if (unlikely(entry->alignment && !is_power_of_2(entry->alignment))) 427 return -EINVAL; 428 429 /* 430 * Offset can be used as input (EXEC_OBJECT_PINNED), reject 431 * any non-page-aligned or non-canonical addresses. 432 */ 433 if (unlikely(entry->flags & EXEC_OBJECT_PINNED && 434 entry->offset != gen8_canonical_addr(entry->offset & I915_GTT_PAGE_MASK))) 435 return -EINVAL; 436 437 /* pad_to_size was once a reserved field, so sanitize it */ 438 if (entry->flags & EXEC_OBJECT_PAD_TO_SIZE) { 439 if (unlikely(offset_in_page(entry->pad_to_size))) 440 return -EINVAL; 441 } else { 442 entry->pad_to_size = 0; 443 } 444 445 if (unlikely(vma->exec_flags)) { 446 DRM_DEBUG("Object [handle %d, index %d] appears more than once in object list\n", 447 entry->handle, (int)(entry - eb->exec)); 448 return -EINVAL; 449 } 450 451 /* 452 * From drm_mm perspective address space is continuous, 453 * so from this point we're always using non-canonical 454 * form internally. 455 */ 456 entry->offset = gen8_noncanonical_addr(entry->offset); 457 458 if (!eb->reloc_cache.has_fence) { 459 entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE; 460 } else { 461 if ((entry->flags & EXEC_OBJECT_NEEDS_FENCE || 462 eb->reloc_cache.needs_unfenced) && 463 i915_gem_object_is_tiled(vma->obj)) 464 entry->flags |= EXEC_OBJECT_NEEDS_GTT | __EXEC_OBJECT_NEEDS_MAP; 465 } 466 467 if (!(entry->flags & EXEC_OBJECT_PINNED)) 468 entry->flags |= eb->context_flags; 469 470 return 0; 471 } 472 473 static int 474 eb_add_vma(struct i915_execbuffer *eb, 475 unsigned int i, unsigned batch_idx, 476 struct i915_vma *vma) 477 { 478 struct drm_i915_gem_exec_object2 *entry = &eb->exec[i]; 479 int err; 480 481 GEM_BUG_ON(i915_vma_is_closed(vma)); 482 483 if (!(eb->args->flags & __EXEC_VALIDATED)) { 484 err = eb_validate_vma(eb, entry, vma); 485 if (unlikely(err)) 486 return err; 487 } 488 489 if (eb->lut_size > 0) { 490 vma->exec_handle = entry->handle; 491 hlist_add_head(&vma->exec_node, 492 &eb->buckets[hash_32(entry->handle, 493 eb->lut_size)]); 494 } 495 496 if (entry->relocation_count) 497 list_add_tail(&vma->reloc_link, &eb->relocs); 498 499 /* 500 * Stash a pointer from the vma to execobj, so we can query its flags, 501 * size, alignment etc as provided by the user. Also we stash a pointer 502 * to the vma inside the execobj so that we can use a direct lookup 503 * to find the right target VMA when doing relocations. 504 */ 505 eb->vma[i] = vma; 506 eb->flags[i] = entry->flags; 507 vma->exec_flags = &eb->flags[i]; 508 509 /* 510 * SNA is doing fancy tricks with compressing batch buffers, which leads 511 * to negative relocation deltas. Usually that works out ok since the 512 * relocate address is still positive, except when the batch is placed 513 * very low in the GTT. Ensure this doesn't happen. 514 * 515 * Note that actual hangs have only been observed on gen7, but for 516 * paranoia do it everywhere. 
517 */ 518 if (i == batch_idx) { 519 if (entry->relocation_count && 520 !(eb->flags[i] & EXEC_OBJECT_PINNED)) 521 eb->flags[i] |= __EXEC_OBJECT_NEEDS_BIAS; 522 if (eb->reloc_cache.has_fence) 523 eb->flags[i] |= EXEC_OBJECT_NEEDS_FENCE; 524 525 eb->batch = vma; 526 } 527 528 err = 0; 529 if (eb_pin_vma(eb, entry, vma)) { 530 if (entry->offset != vma->node.start) { 531 entry->offset = vma->node.start | UPDATE; 532 eb->args->flags |= __EXEC_HAS_RELOC; 533 } 534 } else { 535 eb_unreserve_vma(vma, vma->exec_flags); 536 537 list_add_tail(&vma->exec_link, &eb->unbound); 538 if (drm_mm_node_allocated(&vma->node)) 539 err = i915_vma_unbind(vma); 540 if (unlikely(err)) 541 vma->exec_flags = NULL; 542 } 543 return err; 544 } 545 546 static inline int use_cpu_reloc(const struct reloc_cache *cache, 547 const struct drm_i915_gem_object *obj) 548 { 549 if (!i915_gem_object_has_struct_page(obj)) 550 return false; 551 552 if (DBG_FORCE_RELOC == FORCE_CPU_RELOC) 553 return true; 554 555 if (DBG_FORCE_RELOC == FORCE_GTT_RELOC) 556 return false; 557 558 return (cache->has_llc || 559 obj->cache_dirty || 560 obj->cache_level != I915_CACHE_NONE); 561 } 562 563 static int eb_reserve_vma(const struct i915_execbuffer *eb, 564 struct i915_vma *vma) 565 { 566 struct drm_i915_gem_exec_object2 *entry = exec_entry(eb, vma); 567 unsigned int exec_flags = *vma->exec_flags; 568 u64 pin_flags; 569 int err; 570 571 pin_flags = PIN_USER | PIN_NONBLOCK; 572 if (exec_flags & EXEC_OBJECT_NEEDS_GTT) 573 pin_flags |= PIN_GLOBAL; 574 575 /* 576 * Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset, 577 * limit address to the first 4GBs for unflagged objects. 578 */ 579 if (!(exec_flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS)) 580 pin_flags |= PIN_ZONE_4G; 581 582 if (exec_flags & __EXEC_OBJECT_NEEDS_MAP) 583 pin_flags |= PIN_MAPPABLE; 584 585 if (exec_flags & EXEC_OBJECT_PINNED) { 586 pin_flags |= entry->offset | PIN_OFFSET_FIXED; 587 pin_flags &= ~PIN_NONBLOCK; /* force overlapping checks */ 588 } else if (exec_flags & __EXEC_OBJECT_NEEDS_BIAS) { 589 pin_flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS; 590 } 591 592 err = i915_vma_pin(vma, 593 entry->pad_to_size, entry->alignment, 594 pin_flags); 595 if (err) 596 return err; 597 598 if (entry->offset != vma->node.start) { 599 entry->offset = vma->node.start | UPDATE; 600 eb->args->flags |= __EXEC_HAS_RELOC; 601 } 602 603 if (unlikely(exec_flags & EXEC_OBJECT_NEEDS_FENCE)) { 604 err = i915_vma_pin_fence(vma); 605 if (unlikely(err)) { 606 i915_vma_unpin(vma); 607 return err; 608 } 609 610 if (vma->fence) 611 exec_flags |= __EXEC_OBJECT_HAS_FENCE; 612 } 613 614 *vma->exec_flags = exec_flags | __EXEC_OBJECT_HAS_PIN; 615 GEM_BUG_ON(eb_vma_misplaced(entry, vma, exec_flags)); 616 617 return 0; 618 } 619 620 static int eb_reserve(struct i915_execbuffer *eb) 621 { 622 const unsigned int count = eb->buffer_count; 623 struct list_head last; 624 struct i915_vma *vma; 625 unsigned int i, pass; 626 int err; 627 628 /* 629 * Attempt to pin all of the buffers into the GTT. 630 * This is done in 3 phases: 631 * 632 * 1a. Unbind all objects that do not match the GTT constraints for 633 * the execbuffer (fenceable, mappable, alignment etc). 634 * 1b. Increment pin count for already bound objects. 635 * 2. Bind new objects. 636 * 3. Decrement pin count. 637 * 638 * This avoid unnecessary unbinding of later objects in order to make 639 * room for the earlier objects *unless* we need to defragment. 
640 */ 641 642 pass = 0; 643 err = 0; 644 do { 645 list_for_each_entry(vma, &eb->unbound, exec_link) { 646 err = eb_reserve_vma(eb, vma); 647 if (err) 648 break; 649 } 650 if (err != -ENOSPC) 651 return err; 652 653 /* Resort *all* the objects into priority order */ 654 INIT_LIST_HEAD(&eb->unbound); 655 INIT_LIST_HEAD(&last); 656 for (i = 0; i < count; i++) { 657 unsigned int flags = eb->flags[i]; 658 struct i915_vma *vma = eb->vma[i]; 659 660 if (flags & EXEC_OBJECT_PINNED && 661 flags & __EXEC_OBJECT_HAS_PIN) 662 continue; 663 664 eb_unreserve_vma(vma, &eb->flags[i]); 665 666 if (flags & EXEC_OBJECT_PINNED) 667 /* Pinned must have their slot */ 668 list_add(&vma->exec_link, &eb->unbound); 669 else if (flags & __EXEC_OBJECT_NEEDS_MAP) 670 /* Map require the lowest 256MiB (aperture) */ 671 list_add_tail(&vma->exec_link, &eb->unbound); 672 else if (!(flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS)) 673 /* Prioritise 4GiB region for restricted bo */ 674 list_add(&vma->exec_link, &last); 675 else 676 list_add_tail(&vma->exec_link, &last); 677 } 678 list_splice_tail(&last, &eb->unbound); 679 680 switch (pass++) { 681 case 0: 682 break; 683 684 case 1: 685 /* Too fragmented, unbind everything and retry */ 686 mutex_lock(&eb->context->vm->mutex); 687 err = i915_gem_evict_vm(eb->context->vm); 688 mutex_unlock(&eb->context->vm->mutex); 689 if (err) 690 return err; 691 break; 692 693 default: 694 return -ENOSPC; 695 } 696 } while (1); 697 } 698 699 static unsigned int eb_batch_index(const struct i915_execbuffer *eb) 700 { 701 if (eb->args->flags & I915_EXEC_BATCH_FIRST) 702 return 0; 703 else 704 return eb->buffer_count - 1; 705 } 706 707 static int eb_select_context(struct i915_execbuffer *eb) 708 { 709 struct i915_gem_context *ctx; 710 711 ctx = i915_gem_context_lookup(eb->file->driver_priv, eb->args->rsvd1); 712 if (unlikely(!ctx)) 713 return -ENOENT; 714 715 eb->gem_context = ctx; 716 if (rcu_access_pointer(ctx->vm)) 717 eb->invalid_flags |= EXEC_OBJECT_NEEDS_GTT; 718 719 eb->context_flags = 0; 720 if (test_bit(UCONTEXT_NO_ZEROMAP, &ctx->user_flags)) 721 eb->context_flags |= __EXEC_OBJECT_NEEDS_BIAS; 722 723 return 0; 724 } 725 726 static int eb_lookup_vmas(struct i915_execbuffer *eb) 727 { 728 struct radix_tree_root *handles_vma = &eb->gem_context->handles_vma; 729 struct drm_i915_gem_object *obj; 730 unsigned int i, batch; 731 int err; 732 733 INIT_LIST_HEAD(&eb->relocs); 734 INIT_LIST_HEAD(&eb->unbound); 735 736 batch = eb_batch_index(eb); 737 738 mutex_lock(&eb->gem_context->mutex); 739 if (unlikely(i915_gem_context_is_closed(eb->gem_context))) { 740 err = -ENOENT; 741 goto err_ctx; 742 } 743 744 for (i = 0; i < eb->buffer_count; i++) { 745 u32 handle = eb->exec[i].handle; 746 struct i915_lut_handle *lut; 747 struct i915_vma *vma; 748 749 vma = radix_tree_lookup(handles_vma, handle); 750 if (likely(vma)) 751 goto add_vma; 752 753 obj = i915_gem_object_lookup(eb->file, handle); 754 if (unlikely(!obj)) { 755 err = -ENOENT; 756 goto err_vma; 757 } 758 759 vma = i915_vma_instance(obj, eb->context->vm, NULL); 760 if (IS_ERR(vma)) { 761 err = PTR_ERR(vma); 762 goto err_obj; 763 } 764 765 lut = i915_lut_handle_alloc(); 766 if (unlikely(!lut)) { 767 err = -ENOMEM; 768 goto err_obj; 769 } 770 771 err = radix_tree_insert(handles_vma, handle, vma); 772 if (unlikely(err)) { 773 i915_lut_handle_free(lut); 774 goto err_obj; 775 } 776 777 /* transfer ref to lut */ 778 if (!atomic_fetch_inc(&vma->open_count)) 779 i915_vma_reopen(vma); 780 lut->handle = handle; 781 lut->ctx = eb->gem_context; 782 783 
i915_gem_object_lock(obj); 784 list_add(&lut->obj_link, &obj->lut_list); 785 i915_gem_object_unlock(obj); 786 787 add_vma: 788 err = eb_add_vma(eb, i, batch, vma); 789 if (unlikely(err)) 790 goto err_vma; 791 792 GEM_BUG_ON(vma != eb->vma[i]); 793 GEM_BUG_ON(vma->exec_flags != &eb->flags[i]); 794 GEM_BUG_ON(drm_mm_node_allocated(&vma->node) && 795 eb_vma_misplaced(&eb->exec[i], vma, eb->flags[i])); 796 } 797 798 mutex_unlock(&eb->gem_context->mutex); 799 800 eb->args->flags |= __EXEC_VALIDATED; 801 return eb_reserve(eb); 802 803 err_obj: 804 i915_gem_object_put(obj); 805 err_vma: 806 eb->vma[i] = NULL; 807 err_ctx: 808 mutex_unlock(&eb->gem_context->mutex); 809 return err; 810 } 811 812 static struct i915_vma * 813 eb_get_vma(const struct i915_execbuffer *eb, unsigned long handle) 814 { 815 if (eb->lut_size < 0) { 816 if (handle >= -eb->lut_size) 817 return NULL; 818 return eb->vma[handle]; 819 } else { 820 struct hlist_head *head; 821 struct i915_vma *vma; 822 823 head = &eb->buckets[hash_32(handle, eb->lut_size)]; 824 hlist_for_each_entry(vma, head, exec_node) { 825 if (vma->exec_handle == handle) 826 return vma; 827 } 828 return NULL; 829 } 830 } 831 832 static void eb_release_vmas(const struct i915_execbuffer *eb) 833 { 834 const unsigned int count = eb->buffer_count; 835 unsigned int i; 836 837 for (i = 0; i < count; i++) { 838 struct i915_vma *vma = eb->vma[i]; 839 unsigned int flags = eb->flags[i]; 840 841 if (!vma) 842 break; 843 844 GEM_BUG_ON(vma->exec_flags != &eb->flags[i]); 845 vma->exec_flags = NULL; 846 eb->vma[i] = NULL; 847 848 if (flags & __EXEC_OBJECT_HAS_PIN) 849 __eb_unreserve_vma(vma, flags); 850 851 if (flags & __EXEC_OBJECT_HAS_REF) 852 i915_vma_put(vma); 853 } 854 } 855 856 static void eb_reset_vmas(const struct i915_execbuffer *eb) 857 { 858 eb_release_vmas(eb); 859 if (eb->lut_size > 0) 860 memset(eb->buckets, 0, 861 sizeof(struct hlist_head) << eb->lut_size); 862 } 863 864 static void eb_destroy(const struct i915_execbuffer *eb) 865 { 866 GEM_BUG_ON(eb->reloc_cache.rq); 867 868 if (eb->lut_size > 0) 869 kfree(eb->buckets); 870 } 871 872 static inline u64 873 relocation_target(const struct drm_i915_gem_relocation_entry *reloc, 874 const struct i915_vma *target) 875 { 876 return gen8_canonical_addr((int)reloc->delta + target->node.start); 877 } 878 879 static void reloc_cache_init(struct reloc_cache *cache, 880 struct drm_i915_private *i915) 881 { 882 cache->page = -1; 883 cache->vaddr = 0; 884 /* Must be a variable in the struct to allow GCC to unroll. 
*/ 885 cache->gen = INTEL_GEN(i915); 886 cache->has_llc = HAS_LLC(i915); 887 cache->use_64bit_reloc = HAS_64BIT_RELOC(i915); 888 cache->has_fence = cache->gen < 4; 889 cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment; 890 cache->node.flags = 0; 891 cache->rq = NULL; 892 cache->rq_size = 0; 893 } 894 895 static inline void *unmask_page(unsigned long p) 896 { 897 return (void *)(uintptr_t)(p & PAGE_MASK); 898 } 899 900 static inline unsigned int unmask_flags(unsigned long p) 901 { 902 return p & ~PAGE_MASK; 903 } 904 905 #define KMAP 0x4 /* after CLFLUSH_FLAGS */ 906 907 static inline struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache) 908 { 909 struct drm_i915_private *i915 = 910 container_of(cache, struct i915_execbuffer, reloc_cache)->i915; 911 return &i915->ggtt; 912 } 913 914 static void reloc_gpu_flush(struct reloc_cache *cache) 915 { 916 GEM_BUG_ON(cache->rq_size >= cache->rq->batch->obj->base.size / sizeof(u32)); 917 cache->rq_cmd[cache->rq_size] = MI_BATCH_BUFFER_END; 918 919 __i915_gem_object_flush_map(cache->rq->batch->obj, 0, cache->rq_size); 920 i915_gem_object_unpin_map(cache->rq->batch->obj); 921 922 intel_gt_chipset_flush(cache->rq->engine->gt); 923 924 i915_request_add(cache->rq); 925 cache->rq = NULL; 926 } 927 928 static void reloc_cache_reset(struct reloc_cache *cache) 929 { 930 void *vaddr; 931 932 if (cache->rq) 933 reloc_gpu_flush(cache); 934 935 if (!cache->vaddr) 936 return; 937 938 vaddr = unmask_page(cache->vaddr); 939 if (cache->vaddr & KMAP) { 940 if (cache->vaddr & CLFLUSH_AFTER) 941 mb(); 942 943 kunmap_atomic(vaddr); 944 i915_gem_object_finish_access((struct drm_i915_gem_object *)cache->node.mm); 945 } else { 946 struct i915_ggtt *ggtt = cache_to_ggtt(cache); 947 948 intel_gt_flush_ggtt_writes(ggtt->vm.gt); 949 io_mapping_unmap_atomic((void __iomem *)vaddr); 950 951 if (drm_mm_node_allocated(&cache->node)) { 952 ggtt->vm.clear_range(&ggtt->vm, 953 cache->node.start, 954 cache->node.size); 955 mutex_lock(&ggtt->vm.mutex); 956 drm_mm_remove_node(&cache->node); 957 mutex_unlock(&ggtt->vm.mutex); 958 } else { 959 i915_vma_unpin((struct i915_vma *)cache->node.mm); 960 } 961 } 962 963 cache->vaddr = 0; 964 cache->page = -1; 965 } 966 967 static void *reloc_kmap(struct drm_i915_gem_object *obj, 968 struct reloc_cache *cache, 969 unsigned long page) 970 { 971 void *vaddr; 972 973 if (cache->vaddr) { 974 kunmap_atomic(unmask_page(cache->vaddr)); 975 } else { 976 unsigned int flushes; 977 int err; 978 979 err = i915_gem_object_prepare_write(obj, &flushes); 980 if (err) 981 return ERR_PTR(err); 982 983 BUILD_BUG_ON(KMAP & CLFLUSH_FLAGS); 984 BUILD_BUG_ON((KMAP | CLFLUSH_FLAGS) & PAGE_MASK); 985 986 cache->vaddr = flushes | KMAP; 987 cache->node.mm = (void *)obj; 988 if (flushes) 989 mb(); 990 } 991 992 vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, page)); 993 cache->vaddr = unmask_flags(cache->vaddr) | (unsigned long)vaddr; 994 cache->page = page; 995 996 return vaddr; 997 } 998 999 static void *reloc_iomap(struct drm_i915_gem_object *obj, 1000 struct reloc_cache *cache, 1001 unsigned long page) 1002 { 1003 struct i915_ggtt *ggtt = cache_to_ggtt(cache); 1004 unsigned long offset; 1005 void *vaddr; 1006 1007 if (cache->vaddr) { 1008 intel_gt_flush_ggtt_writes(ggtt->vm.gt); 1009 io_mapping_unmap_atomic((void __force __iomem *) unmask_page(cache->vaddr)); 1010 } else { 1011 struct i915_vma *vma; 1012 int err; 1013 1014 if (i915_gem_object_is_tiled(obj)) 1015 return ERR_PTR(-EINVAL); 1016 1017 if (use_cpu_reloc(cache, obj)) 1018 return 
NULL; 1019 1020 i915_gem_object_lock(obj); 1021 err = i915_gem_object_set_to_gtt_domain(obj, true); 1022 i915_gem_object_unlock(obj); 1023 if (err) 1024 return ERR_PTR(err); 1025 1026 vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 1027 PIN_MAPPABLE | 1028 PIN_NONBLOCK /* NOWARN */ | 1029 PIN_NOEVICT); 1030 if (IS_ERR(vma)) { 1031 memset(&cache->node, 0, sizeof(cache->node)); 1032 mutex_lock(&ggtt->vm.mutex); 1033 err = drm_mm_insert_node_in_range 1034 (&ggtt->vm.mm, &cache->node, 1035 PAGE_SIZE, 0, I915_COLOR_UNEVICTABLE, 1036 0, ggtt->mappable_end, 1037 DRM_MM_INSERT_LOW); 1038 mutex_unlock(&ggtt->vm.mutex); 1039 if (err) /* no inactive aperture space, use cpu reloc */ 1040 return NULL; 1041 } else { 1042 cache->node.start = vma->node.start; 1043 cache->node.mm = (void *)vma; 1044 } 1045 } 1046 1047 offset = cache->node.start; 1048 if (drm_mm_node_allocated(&cache->node)) { 1049 ggtt->vm.insert_page(&ggtt->vm, 1050 i915_gem_object_get_dma_address(obj, page), 1051 offset, I915_CACHE_NONE, 0); 1052 } else { 1053 offset += page << PAGE_SHIFT; 1054 } 1055 1056 vaddr = (void __force *)io_mapping_map_atomic_wc(&ggtt->iomap, 1057 offset); 1058 cache->page = page; 1059 cache->vaddr = (unsigned long)vaddr; 1060 1061 return vaddr; 1062 } 1063 1064 static void *reloc_vaddr(struct drm_i915_gem_object *obj, 1065 struct reloc_cache *cache, 1066 unsigned long page) 1067 { 1068 void *vaddr; 1069 1070 if (cache->page == page) { 1071 vaddr = unmask_page(cache->vaddr); 1072 } else { 1073 vaddr = NULL; 1074 if ((cache->vaddr & KMAP) == 0) 1075 vaddr = reloc_iomap(obj, cache, page); 1076 if (!vaddr) 1077 vaddr = reloc_kmap(obj, cache, page); 1078 } 1079 1080 return vaddr; 1081 } 1082 1083 static void clflush_write32(u32 *addr, u32 value, unsigned int flushes) 1084 { 1085 if (unlikely(flushes & (CLFLUSH_BEFORE | CLFLUSH_AFTER))) { 1086 if (flushes & CLFLUSH_BEFORE) { 1087 clflushopt(addr); 1088 mb(); 1089 } 1090 1091 *addr = value; 1092 1093 /* 1094 * Writes to the same cacheline are serialised by the CPU 1095 * (including clflush). On the write path, we only require 1096 * that it hits memory in an orderly fashion and place 1097 * mb barriers at the start and end of the relocation phase 1098 * to ensure ordering of clflush wrt to the system. 1099 */ 1100 if (flushes & CLFLUSH_AFTER) 1101 clflushopt(addr); 1102 } else 1103 *addr = value; 1104 } 1105 1106 static int reloc_move_to_gpu(struct i915_request *rq, struct i915_vma *vma) 1107 { 1108 struct drm_i915_gem_object *obj = vma->obj; 1109 int err; 1110 1111 i915_vma_lock(vma); 1112 1113 if (obj->cache_dirty & ~obj->cache_coherent) 1114 i915_gem_clflush_object(obj, 0); 1115 obj->write_domain = 0; 1116 1117 err = i915_request_await_object(rq, vma->obj, true); 1118 if (err == 0) 1119 err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE); 1120 1121 i915_vma_unlock(vma); 1122 1123 return err; 1124 } 1125 1126 static int __reloc_gpu_alloc(struct i915_execbuffer *eb, 1127 struct i915_vma *vma, 1128 unsigned int len) 1129 { 1130 struct reloc_cache *cache = &eb->reloc_cache; 1131 struct intel_engine_pool_node *pool; 1132 struct i915_request *rq; 1133 struct i915_vma *batch; 1134 u32 *cmd; 1135 int err; 1136 1137 pool = intel_engine_get_pool(eb->engine, PAGE_SIZE); 1138 if (IS_ERR(pool)) 1139 return PTR_ERR(pool); 1140 1141 cmd = i915_gem_object_pin_map(pool->obj, 1142 cache->has_llc ? 
1143 I915_MAP_FORCE_WB : 1144 I915_MAP_FORCE_WC); 1145 if (IS_ERR(cmd)) { 1146 err = PTR_ERR(cmd); 1147 goto out_pool; 1148 } 1149 1150 batch = i915_vma_instance(pool->obj, vma->vm, NULL); 1151 if (IS_ERR(batch)) { 1152 err = PTR_ERR(batch); 1153 goto err_unmap; 1154 } 1155 1156 err = i915_vma_pin(batch, 0, 0, PIN_USER | PIN_NONBLOCK); 1157 if (err) 1158 goto err_unmap; 1159 1160 rq = i915_request_create(eb->context); 1161 if (IS_ERR(rq)) { 1162 err = PTR_ERR(rq); 1163 goto err_unpin; 1164 } 1165 1166 err = intel_engine_pool_mark_active(pool, rq); 1167 if (err) 1168 goto err_request; 1169 1170 err = reloc_move_to_gpu(rq, vma); 1171 if (err) 1172 goto err_request; 1173 1174 err = eb->engine->emit_bb_start(rq, 1175 batch->node.start, PAGE_SIZE, 1176 cache->gen > 5 ? 0 : I915_DISPATCH_SECURE); 1177 if (err) 1178 goto skip_request; 1179 1180 i915_vma_lock(batch); 1181 err = i915_request_await_object(rq, batch->obj, false); 1182 if (err == 0) 1183 err = i915_vma_move_to_active(batch, rq, 0); 1184 i915_vma_unlock(batch); 1185 if (err) 1186 goto skip_request; 1187 1188 rq->batch = batch; 1189 i915_vma_unpin(batch); 1190 1191 cache->rq = rq; 1192 cache->rq_cmd = cmd; 1193 cache->rq_size = 0; 1194 1195 /* Return with batch mapping (cmd) still pinned */ 1196 goto out_pool; 1197 1198 skip_request: 1199 i915_request_skip(rq, err); 1200 err_request: 1201 i915_request_add(rq); 1202 err_unpin: 1203 i915_vma_unpin(batch); 1204 err_unmap: 1205 i915_gem_object_unpin_map(pool->obj); 1206 out_pool: 1207 intel_engine_pool_put(pool); 1208 return err; 1209 } 1210 1211 static u32 *reloc_gpu(struct i915_execbuffer *eb, 1212 struct i915_vma *vma, 1213 unsigned int len) 1214 { 1215 struct reloc_cache *cache = &eb->reloc_cache; 1216 u32 *cmd; 1217 1218 if (cache->rq_size > PAGE_SIZE/sizeof(u32) - (len + 1)) 1219 reloc_gpu_flush(cache); 1220 1221 if (unlikely(!cache->rq)) { 1222 int err; 1223 1224 if (!intel_engine_can_store_dword(eb->engine)) 1225 return ERR_PTR(-ENODEV); 1226 1227 err = __reloc_gpu_alloc(eb, vma, len); 1228 if (unlikely(err)) 1229 return ERR_PTR(err); 1230 } 1231 1232 cmd = cache->rq_cmd + cache->rq_size; 1233 cache->rq_size += len; 1234 1235 return cmd; 1236 } 1237 1238 static u64 1239 relocate_entry(struct i915_vma *vma, 1240 const struct drm_i915_gem_relocation_entry *reloc, 1241 struct i915_execbuffer *eb, 1242 const struct i915_vma *target) 1243 { 1244 u64 offset = reloc->offset; 1245 u64 target_offset = relocation_target(reloc, target); 1246 bool wide = eb->reloc_cache.use_64bit_reloc; 1247 void *vaddr; 1248 1249 if (!eb->reloc_cache.vaddr && 1250 (DBG_FORCE_RELOC == FORCE_GPU_RELOC || 1251 !dma_resv_test_signaled_rcu(vma->resv, true))) { 1252 const unsigned int gen = eb->reloc_cache.gen; 1253 unsigned int len; 1254 u32 *batch; 1255 u64 addr; 1256 1257 if (wide) 1258 len = offset & 7 ? 
8 : 5; 1259 else if (gen >= 4) 1260 len = 4; 1261 else 1262 len = 3; 1263 1264 batch = reloc_gpu(eb, vma, len); 1265 if (IS_ERR(batch)) 1266 goto repeat; 1267 1268 addr = gen8_canonical_addr(vma->node.start + offset); 1269 if (wide) { 1270 if (offset & 7) { 1271 *batch++ = MI_STORE_DWORD_IMM_GEN4; 1272 *batch++ = lower_32_bits(addr); 1273 *batch++ = upper_32_bits(addr); 1274 *batch++ = lower_32_bits(target_offset); 1275 1276 addr = gen8_canonical_addr(addr + 4); 1277 1278 *batch++ = MI_STORE_DWORD_IMM_GEN4; 1279 *batch++ = lower_32_bits(addr); 1280 *batch++ = upper_32_bits(addr); 1281 *batch++ = upper_32_bits(target_offset); 1282 } else { 1283 *batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1; 1284 *batch++ = lower_32_bits(addr); 1285 *batch++ = upper_32_bits(addr); 1286 *batch++ = lower_32_bits(target_offset); 1287 *batch++ = upper_32_bits(target_offset); 1288 } 1289 } else if (gen >= 6) { 1290 *batch++ = MI_STORE_DWORD_IMM_GEN4; 1291 *batch++ = 0; 1292 *batch++ = addr; 1293 *batch++ = target_offset; 1294 } else if (gen >= 4) { 1295 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 1296 *batch++ = 0; 1297 *batch++ = addr; 1298 *batch++ = target_offset; 1299 } else { 1300 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; 1301 *batch++ = addr; 1302 *batch++ = target_offset; 1303 } 1304 1305 goto out; 1306 } 1307 1308 repeat: 1309 vaddr = reloc_vaddr(vma->obj, &eb->reloc_cache, offset >> PAGE_SHIFT); 1310 if (IS_ERR(vaddr)) 1311 return PTR_ERR(vaddr); 1312 1313 clflush_write32(vaddr + offset_in_page(offset), 1314 lower_32_bits(target_offset), 1315 eb->reloc_cache.vaddr); 1316 1317 if (wide) { 1318 offset += sizeof(u32); 1319 target_offset >>= 32; 1320 wide = false; 1321 goto repeat; 1322 } 1323 1324 out: 1325 return target->node.start | UPDATE; 1326 } 1327 1328 static u64 1329 eb_relocate_entry(struct i915_execbuffer *eb, 1330 struct i915_vma *vma, 1331 const struct drm_i915_gem_relocation_entry *reloc) 1332 { 1333 struct i915_vma *target; 1334 int err; 1335 1336 /* we've already hold a reference to all valid objects */ 1337 target = eb_get_vma(eb, reloc->target_handle); 1338 if (unlikely(!target)) 1339 return -ENOENT; 1340 1341 /* Validate that the target is in a valid r/w GPU domain */ 1342 if (unlikely(reloc->write_domain & (reloc->write_domain - 1))) { 1343 DRM_DEBUG("reloc with multiple write domains: " 1344 "target %d offset %d " 1345 "read %08x write %08x", 1346 reloc->target_handle, 1347 (int) reloc->offset, 1348 reloc->read_domains, 1349 reloc->write_domain); 1350 return -EINVAL; 1351 } 1352 if (unlikely((reloc->write_domain | reloc->read_domains) 1353 & ~I915_GEM_GPU_DOMAINS)) { 1354 DRM_DEBUG("reloc with read/write non-GPU domains: " 1355 "target %d offset %d " 1356 "read %08x write %08x", 1357 reloc->target_handle, 1358 (int) reloc->offset, 1359 reloc->read_domains, 1360 reloc->write_domain); 1361 return -EINVAL; 1362 } 1363 1364 if (reloc->write_domain) { 1365 *target->exec_flags |= EXEC_OBJECT_WRITE; 1366 1367 /* 1368 * Sandybridge PPGTT errata: We need a global gtt mapping 1369 * for MI and pipe_control writes because the gpu doesn't 1370 * properly redirect them through the ppgtt for non_secure 1371 * batchbuffers. 
1372 */ 1373 if (reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION && 1374 IS_GEN(eb->i915, 6)) { 1375 err = i915_vma_bind(target, target->obj->cache_level, 1376 PIN_GLOBAL, NULL); 1377 if (WARN_ONCE(err, 1378 "Unexpected failure to bind target VMA!")) 1379 return err; 1380 } 1381 } 1382 1383 /* 1384 * If the relocation already has the right value in it, no 1385 * more work needs to be done. 1386 */ 1387 if (!DBG_FORCE_RELOC && 1388 gen8_canonical_addr(target->node.start) == reloc->presumed_offset) 1389 return 0; 1390 1391 /* Check that the relocation address is valid... */ 1392 if (unlikely(reloc->offset > 1393 vma->size - (eb->reloc_cache.use_64bit_reloc ? 8 : 4))) { 1394 DRM_DEBUG("Relocation beyond object bounds: " 1395 "target %d offset %d size %d.\n", 1396 reloc->target_handle, 1397 (int)reloc->offset, 1398 (int)vma->size); 1399 return -EINVAL; 1400 } 1401 if (unlikely(reloc->offset & 3)) { 1402 DRM_DEBUG("Relocation not 4-byte aligned: " 1403 "target %d offset %d.\n", 1404 reloc->target_handle, 1405 (int)reloc->offset); 1406 return -EINVAL; 1407 } 1408 1409 /* 1410 * If we write into the object, we need to force the synchronisation 1411 * barrier, either with an asynchronous clflush or if we executed the 1412 * patching using the GPU (though that should be serialised by the 1413 * timeline). To be completely sure, and since we are required to 1414 * do relocations we are already stalling, disable the user's opt 1415 * out of our synchronisation. 1416 */ 1417 *vma->exec_flags &= ~EXEC_OBJECT_ASYNC; 1418 1419 /* and update the user's relocation entry */ 1420 return relocate_entry(vma, reloc, eb, target); 1421 } 1422 1423 static int eb_relocate_vma(struct i915_execbuffer *eb, struct i915_vma *vma) 1424 { 1425 #define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry)) 1426 struct drm_i915_gem_relocation_entry stack[N_RELOC(512)]; 1427 struct drm_i915_gem_relocation_entry __user *urelocs; 1428 const struct drm_i915_gem_exec_object2 *entry = exec_entry(eb, vma); 1429 unsigned int remain; 1430 1431 urelocs = u64_to_user_ptr(entry->relocs_ptr); 1432 remain = entry->relocation_count; 1433 if (unlikely(remain > N_RELOC(ULONG_MAX))) 1434 return -EINVAL; 1435 1436 /* 1437 * We must check that the entire relocation array is safe 1438 * to read. However, if the array is not writable the user loses 1439 * the updated relocation values. 1440 */ 1441 if (unlikely(!access_ok(urelocs, remain*sizeof(*urelocs)))) 1442 return -EFAULT; 1443 1444 do { 1445 struct drm_i915_gem_relocation_entry *r = stack; 1446 unsigned int count = 1447 min_t(unsigned int, remain, ARRAY_SIZE(stack)); 1448 unsigned int copied; 1449 1450 /* 1451 * This is the fast path and we cannot handle a pagefault 1452 * whilst holding the struct mutex lest the user pass in the 1453 * relocations contained within a mmaped bo. For in such a case 1454 * we, the page fault handler would call i915_gem_fault() and 1455 * we would try to acquire the struct mutex again. Obviously 1456 * this is bad and so lockdep complains vehemently. 
1457 */ 1458 pagefault_disable(); 1459 copied = __copy_from_user_inatomic(r, urelocs, count * sizeof(r[0])); 1460 pagefault_enable(); 1461 if (unlikely(copied)) { 1462 remain = -EFAULT; 1463 goto out; 1464 } 1465 1466 remain -= count; 1467 do { 1468 u64 offset = eb_relocate_entry(eb, vma, r); 1469 1470 if (likely(offset == 0)) { 1471 } else if ((s64)offset < 0) { 1472 remain = (int)offset; 1473 goto out; 1474 } else { 1475 /* 1476 * Note that reporting an error now 1477 * leaves everything in an inconsistent 1478 * state as we have *already* changed 1479 * the relocation value inside the 1480 * object. As we have not changed the 1481 * reloc.presumed_offset or will not 1482 * change the execobject.offset, on the 1483 * call we may not rewrite the value 1484 * inside the object, leaving it 1485 * dangling and causing a GPU hang. Unless 1486 * userspace dynamically rebuilds the 1487 * relocations on each execbuf rather than 1488 * presume a static tree. 1489 * 1490 * We did previously check if the relocations 1491 * were writable (access_ok), an error now 1492 * would be a strange race with mprotect, 1493 * having already demonstrated that we 1494 * can read from this userspace address. 1495 */ 1496 offset = gen8_canonical_addr(offset & ~UPDATE); 1497 if (unlikely(__put_user(offset, &urelocs[r-stack].presumed_offset))) { 1498 remain = -EFAULT; 1499 goto out; 1500 } 1501 } 1502 } while (r++, --count); 1503 urelocs += ARRAY_SIZE(stack); 1504 } while (remain); 1505 out: 1506 reloc_cache_reset(&eb->reloc_cache); 1507 return remain; 1508 } 1509 1510 static int 1511 eb_relocate_vma_slow(struct i915_execbuffer *eb, struct i915_vma *vma) 1512 { 1513 const struct drm_i915_gem_exec_object2 *entry = exec_entry(eb, vma); 1514 struct drm_i915_gem_relocation_entry *relocs = 1515 u64_to_ptr(typeof(*relocs), entry->relocs_ptr); 1516 unsigned int i; 1517 int err; 1518 1519 for (i = 0; i < entry->relocation_count; i++) { 1520 u64 offset = eb_relocate_entry(eb, vma, &relocs[i]); 1521 1522 if ((s64)offset < 0) { 1523 err = (int)offset; 1524 goto err; 1525 } 1526 } 1527 err = 0; 1528 err: 1529 reloc_cache_reset(&eb->reloc_cache); 1530 return err; 1531 } 1532 1533 static int check_relocations(const struct drm_i915_gem_exec_object2 *entry) 1534 { 1535 const char __user *addr, *end; 1536 unsigned long size; 1537 char __maybe_unused c; 1538 1539 size = entry->relocation_count; 1540 if (size == 0) 1541 return 0; 1542 1543 if (size > N_RELOC(ULONG_MAX)) 1544 return -EINVAL; 1545 1546 addr = u64_to_user_ptr(entry->relocs_ptr); 1547 size *= sizeof(struct drm_i915_gem_relocation_entry); 1548 if (!access_ok(addr, size)) 1549 return -EFAULT; 1550 1551 end = addr + size; 1552 for (; addr < end; addr += PAGE_SIZE) { 1553 int err = __get_user(c, addr); 1554 if (err) 1555 return err; 1556 } 1557 return __get_user(c, end - 1); 1558 } 1559 1560 static int eb_copy_relocations(const struct i915_execbuffer *eb) 1561 { 1562 struct drm_i915_gem_relocation_entry *relocs; 1563 const unsigned int count = eb->buffer_count; 1564 unsigned int i; 1565 int err; 1566 1567 for (i = 0; i < count; i++) { 1568 const unsigned int nreloc = eb->exec[i].relocation_count; 1569 struct drm_i915_gem_relocation_entry __user *urelocs; 1570 unsigned long size; 1571 unsigned long copied; 1572 1573 if (nreloc == 0) 1574 continue; 1575 1576 err = check_relocations(&eb->exec[i]); 1577 if (err) 1578 goto err; 1579 1580 urelocs = u64_to_user_ptr(eb->exec[i].relocs_ptr); 1581 size = nreloc * sizeof(*relocs); 1582 1583 relocs = kvmalloc_array(size, 1, GFP_KERNEL); 
1584 if (!relocs) { 1585 err = -ENOMEM; 1586 goto err; 1587 } 1588 1589 /* copy_from_user is limited to < 4GiB */ 1590 copied = 0; 1591 do { 1592 unsigned int len = 1593 min_t(u64, BIT_ULL(31), size - copied); 1594 1595 if (__copy_from_user((char *)relocs + copied, 1596 (char __user *)urelocs + copied, 1597 len)) 1598 goto end; 1599 1600 copied += len; 1601 } while (copied < size); 1602 1603 /* 1604 * As we do not update the known relocation offsets after 1605 * relocating (due to the complexities in lock handling), 1606 * we need to mark them as invalid now so that we force the 1607 * relocation processing next time. Just in case the target 1608 * object is evicted and then rebound into its old 1609 * presumed_offset before the next execbuffer - if that 1610 * happened we would make the mistake of assuming that the 1611 * relocations were valid. 1612 */ 1613 if (!user_access_begin(urelocs, size)) 1614 goto end; 1615 1616 for (copied = 0; copied < nreloc; copied++) 1617 unsafe_put_user(-1, 1618 &urelocs[copied].presumed_offset, 1619 end_user); 1620 user_access_end(); 1621 1622 eb->exec[i].relocs_ptr = (uintptr_t)relocs; 1623 } 1624 1625 return 0; 1626 1627 end_user: 1628 user_access_end(); 1629 end: 1630 kvfree(relocs); 1631 err = -EFAULT; 1632 err: 1633 while (i--) { 1634 relocs = u64_to_ptr(typeof(*relocs), eb->exec[i].relocs_ptr); 1635 if (eb->exec[i].relocation_count) 1636 kvfree(relocs); 1637 } 1638 return err; 1639 } 1640 1641 static int eb_prefault_relocations(const struct i915_execbuffer *eb) 1642 { 1643 const unsigned int count = eb->buffer_count; 1644 unsigned int i; 1645 1646 if (unlikely(i915_modparams.prefault_disable)) 1647 return 0; 1648 1649 for (i = 0; i < count; i++) { 1650 int err; 1651 1652 err = check_relocations(&eb->exec[i]); 1653 if (err) 1654 return err; 1655 } 1656 1657 return 0; 1658 } 1659 1660 static noinline int eb_relocate_slow(struct i915_execbuffer *eb) 1661 { 1662 struct drm_device *dev = &eb->i915->drm; 1663 bool have_copy = false; 1664 struct i915_vma *vma; 1665 int err = 0; 1666 1667 repeat: 1668 if (signal_pending(current)) { 1669 err = -ERESTARTSYS; 1670 goto out; 1671 } 1672 1673 /* We may process another execbuffer during the unlock... */ 1674 eb_reset_vmas(eb); 1675 mutex_unlock(&dev->struct_mutex); 1676 1677 /* 1678 * We take 3 passes through the slowpatch. 1679 * 1680 * 1 - we try to just prefault all the user relocation entries and 1681 * then attempt to reuse the atomic pagefault disabled fast path again. 1682 * 1683 * 2 - we copy the user entries to a local buffer here outside of the 1684 * local and allow ourselves to wait upon any rendering before 1685 * relocations 1686 * 1687 * 3 - we already have a local copy of the relocation entries, but 1688 * were interrupted (EAGAIN) whilst waiting for the objects, try again. 
1689 */ 1690 if (!err) { 1691 err = eb_prefault_relocations(eb); 1692 } else if (!have_copy) { 1693 err = eb_copy_relocations(eb); 1694 have_copy = err == 0; 1695 } else { 1696 cond_resched(); 1697 err = 0; 1698 } 1699 if (err) { 1700 mutex_lock(&dev->struct_mutex); 1701 goto out; 1702 } 1703 1704 /* A frequent cause for EAGAIN are currently unavailable client pages */ 1705 flush_workqueue(eb->i915->mm.userptr_wq); 1706 1707 err = i915_mutex_lock_interruptible(dev); 1708 if (err) { 1709 mutex_lock(&dev->struct_mutex); 1710 goto out; 1711 } 1712 1713 /* reacquire the objects */ 1714 err = eb_lookup_vmas(eb); 1715 if (err) 1716 goto err; 1717 1718 GEM_BUG_ON(!eb->batch); 1719 1720 list_for_each_entry(vma, &eb->relocs, reloc_link) { 1721 if (!have_copy) { 1722 pagefault_disable(); 1723 err = eb_relocate_vma(eb, vma); 1724 pagefault_enable(); 1725 if (err) 1726 goto repeat; 1727 } else { 1728 err = eb_relocate_vma_slow(eb, vma); 1729 if (err) 1730 goto err; 1731 } 1732 } 1733 1734 /* 1735 * Leave the user relocations as are, this is the painfully slow path, 1736 * and we want to avoid the complication of dropping the lock whilst 1737 * having buffers reserved in the aperture and so causing spurious 1738 * ENOSPC for random operations. 1739 */ 1740 1741 err: 1742 if (err == -EAGAIN) 1743 goto repeat; 1744 1745 out: 1746 if (have_copy) { 1747 const unsigned int count = eb->buffer_count; 1748 unsigned int i; 1749 1750 for (i = 0; i < count; i++) { 1751 const struct drm_i915_gem_exec_object2 *entry = 1752 &eb->exec[i]; 1753 struct drm_i915_gem_relocation_entry *relocs; 1754 1755 if (!entry->relocation_count) 1756 continue; 1757 1758 relocs = u64_to_ptr(typeof(*relocs), entry->relocs_ptr); 1759 kvfree(relocs); 1760 } 1761 } 1762 1763 return err; 1764 } 1765 1766 static int eb_relocate(struct i915_execbuffer *eb) 1767 { 1768 if (eb_lookup_vmas(eb)) 1769 goto slow; 1770 1771 /* The objects are in their final locations, apply the relocations. 
*/ 1772 if (eb->args->flags & __EXEC_HAS_RELOC) { 1773 struct i915_vma *vma; 1774 1775 list_for_each_entry(vma, &eb->relocs, reloc_link) { 1776 if (eb_relocate_vma(eb, vma)) 1777 goto slow; 1778 } 1779 } 1780 1781 return 0; 1782 1783 slow: 1784 return eb_relocate_slow(eb); 1785 } 1786 1787 static int eb_move_to_gpu(struct i915_execbuffer *eb) 1788 { 1789 const unsigned int count = eb->buffer_count; 1790 struct ww_acquire_ctx acquire; 1791 unsigned int i; 1792 int err = 0; 1793 1794 ww_acquire_init(&acquire, &reservation_ww_class); 1795 1796 for (i = 0; i < count; i++) { 1797 struct i915_vma *vma = eb->vma[i]; 1798 1799 err = ww_mutex_lock_interruptible(&vma->resv->lock, &acquire); 1800 if (!err) 1801 continue; 1802 1803 GEM_BUG_ON(err == -EALREADY); /* No duplicate vma */ 1804 1805 if (err == -EDEADLK) { 1806 GEM_BUG_ON(i == 0); 1807 do { 1808 int j = i - 1; 1809 1810 ww_mutex_unlock(&eb->vma[j]->resv->lock); 1811 1812 swap(eb->flags[i], eb->flags[j]); 1813 swap(eb->vma[i], eb->vma[j]); 1814 eb->vma[i]->exec_flags = &eb->flags[i]; 1815 } while (--i); 1816 GEM_BUG_ON(vma != eb->vma[0]); 1817 vma->exec_flags = &eb->flags[0]; 1818 1819 err = ww_mutex_lock_slow_interruptible(&vma->resv->lock, 1820 &acquire); 1821 } 1822 if (err) 1823 break; 1824 } 1825 ww_acquire_done(&acquire); 1826 1827 while (i--) { 1828 unsigned int flags = eb->flags[i]; 1829 struct i915_vma *vma = eb->vma[i]; 1830 struct drm_i915_gem_object *obj = vma->obj; 1831 1832 assert_vma_held(vma); 1833 1834 if (flags & EXEC_OBJECT_CAPTURE) { 1835 struct i915_capture_list *capture; 1836 1837 capture = kmalloc(sizeof(*capture), GFP_KERNEL); 1838 if (capture) { 1839 capture->next = eb->request->capture_list; 1840 capture->vma = vma; 1841 eb->request->capture_list = capture; 1842 } 1843 } 1844 1845 /* 1846 * If the GPU is not _reading_ through the CPU cache, we need 1847 * to make sure that any writes (both previous GPU writes from 1848 * before a change in snooping levels and normal CPU writes) 1849 * caught in that cache are flushed to main memory. 1850 * 1851 * We want to say 1852 * obj->cache_dirty && 1853 * !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ) 1854 * but gcc's optimiser doesn't handle that as well and emits 1855 * two jumps instead of one. Maybe one day... 1856 */ 1857 if (unlikely(obj->cache_dirty & ~obj->cache_coherent)) { 1858 if (i915_gem_clflush_object(obj, 0)) 1859 flags &= ~EXEC_OBJECT_ASYNC; 1860 } 1861 1862 if (err == 0 && !(flags & EXEC_OBJECT_ASYNC)) { 1863 err = i915_request_await_object 1864 (eb->request, obj, flags & EXEC_OBJECT_WRITE); 1865 } 1866 1867 if (err == 0) 1868 err = i915_vma_move_to_active(vma, eb->request, flags); 1869 1870 i915_vma_unlock(vma); 1871 1872 __eb_unreserve_vma(vma, flags); 1873 vma->exec_flags = NULL; 1874 1875 if (unlikely(flags & __EXEC_OBJECT_HAS_REF)) 1876 i915_vma_put(vma); 1877 } 1878 ww_acquire_fini(&acquire); 1879 1880 if (unlikely(err)) 1881 goto err_skip; 1882 1883 eb->exec = NULL; 1884 1885 /* Unconditionally flush any chipset caches (for streaming writes). 
*/ 1886 intel_gt_chipset_flush(eb->engine->gt); 1887 return 0; 1888 1889 err_skip: 1890 i915_request_skip(eb->request, err); 1891 return err; 1892 } 1893 1894 static int i915_gem_check_execbuffer(struct drm_i915_gem_execbuffer2 *exec) 1895 { 1896 if (exec->flags & __I915_EXEC_ILLEGAL_FLAGS) 1897 return -EINVAL; 1898 1899 /* Kernel clipping was a DRI1 misfeature */ 1900 if (!(exec->flags & I915_EXEC_FENCE_ARRAY)) { 1901 if (exec->num_cliprects || exec->cliprects_ptr) 1902 return -EINVAL; 1903 } 1904 1905 if (exec->DR4 == 0xffffffff) { 1906 DRM_DEBUG("UXA submitting garbage DR4, fixing up\n"); 1907 exec->DR4 = 0; 1908 } 1909 if (exec->DR1 || exec->DR4) 1910 return -EINVAL; 1911 1912 if ((exec->batch_start_offset | exec->batch_len) & 0x7) 1913 return -EINVAL; 1914 1915 return 0; 1916 } 1917 1918 static int i915_reset_gen7_sol_offsets(struct i915_request *rq) 1919 { 1920 u32 *cs; 1921 int i; 1922 1923 if (!IS_GEN(rq->i915, 7) || rq->engine->id != RCS0) { 1924 DRM_DEBUG("sol reset is gen7/rcs only\n"); 1925 return -EINVAL; 1926 } 1927 1928 cs = intel_ring_begin(rq, 4 * 2 + 2); 1929 if (IS_ERR(cs)) 1930 return PTR_ERR(cs); 1931 1932 *cs++ = MI_LOAD_REGISTER_IMM(4); 1933 for (i = 0; i < 4; i++) { 1934 *cs++ = i915_mmio_reg_offset(GEN7_SO_WRITE_OFFSET(i)); 1935 *cs++ = 0; 1936 } 1937 *cs++ = MI_NOOP; 1938 intel_ring_advance(rq, cs); 1939 1940 return 0; 1941 } 1942 1943 static struct i915_vma * 1944 shadow_batch_pin(struct drm_i915_gem_object *obj, 1945 struct i915_address_space *vm, 1946 unsigned int flags) 1947 { 1948 struct i915_vma *vma; 1949 int err; 1950 1951 vma = i915_vma_instance(obj, vm, NULL); 1952 if (IS_ERR(vma)) 1953 return vma; 1954 1955 err = i915_vma_pin(vma, 0, 0, flags); 1956 if (err) 1957 return ERR_PTR(err); 1958 1959 return vma; 1960 } 1961 1962 struct eb_parse_work { 1963 struct dma_fence_work base; 1964 struct intel_engine_cs *engine; 1965 struct i915_vma *batch; 1966 struct i915_vma *shadow; 1967 struct i915_vma *trampoline; 1968 unsigned int batch_offset; 1969 unsigned int batch_length; 1970 }; 1971 1972 static int __eb_parse(struct dma_fence_work *work) 1973 { 1974 struct eb_parse_work *pw = container_of(work, typeof(*pw), base); 1975 1976 return intel_engine_cmd_parser(pw->engine, 1977 pw->batch, 1978 pw->batch_offset, 1979 pw->batch_length, 1980 pw->shadow, 1981 pw->trampoline); 1982 } 1983 1984 static void __eb_parse_release(struct dma_fence_work *work) 1985 { 1986 struct eb_parse_work *pw = container_of(work, typeof(*pw), base); 1987 1988 if (pw->trampoline) 1989 i915_active_release(&pw->trampoline->active); 1990 i915_active_release(&pw->shadow->active); 1991 i915_active_release(&pw->batch->active); 1992 } 1993 1994 static const struct dma_fence_work_ops eb_parse_ops = { 1995 .name = "eb_parse", 1996 .work = __eb_parse, 1997 .release = __eb_parse_release, 1998 }; 1999 2000 static int eb_parse_pipeline(struct i915_execbuffer *eb, 2001 struct i915_vma *shadow, 2002 struct i915_vma *trampoline) 2003 { 2004 struct eb_parse_work *pw; 2005 int err; 2006 2007 pw = kzalloc(sizeof(*pw), GFP_KERNEL); 2008 if (!pw) 2009 return -ENOMEM; 2010 2011 err = i915_active_acquire(&eb->batch->active); 2012 if (err) 2013 goto err_free; 2014 2015 err = i915_active_acquire(&shadow->active); 2016 if (err) 2017 goto err_batch; 2018 2019 if (trampoline) { 2020 err = i915_active_acquire(&trampoline->active); 2021 if (err) 2022 goto err_shadow; 2023 } 2024 2025 dma_fence_work_init(&pw->base, &eb_parse_ops); 2026 2027 pw->engine = eb->engine; 2028 pw->batch = eb->batch; 2029 pw->batch_offset = 
eb->batch_start_offset; 2030 pw->batch_length = eb->batch_len; 2031 pw->shadow = shadow; 2032 pw->trampoline = trampoline; 2033 2034 err = dma_resv_lock_interruptible(pw->batch->resv, NULL); 2035 if (err) 2036 goto err_trampoline; 2037 2038 err = dma_resv_reserve_shared(pw->batch->resv, 1); 2039 if (err) 2040 goto err_batch_unlock; 2041 2042 /* Wait for all writes (and relocs) into the batch to complete */ 2043 err = i915_sw_fence_await_reservation(&pw->base.chain, 2044 pw->batch->resv, NULL, false, 2045 0, I915_FENCE_GFP); 2046 if (err < 0) 2047 goto err_batch_unlock; 2048 2049 /* Keep the batch alive and unwritten as we parse */ 2050 dma_resv_add_shared_fence(pw->batch->resv, &pw->base.dma); 2051 2052 dma_resv_unlock(pw->batch->resv); 2053 2054 /* Force execution to wait for completion of the parser */ 2055 dma_resv_lock(shadow->resv, NULL); 2056 dma_resv_add_excl_fence(shadow->resv, &pw->base.dma); 2057 dma_resv_unlock(shadow->resv); 2058 2059 dma_fence_work_commit(&pw->base); 2060 return 0; 2061 2062 err_batch_unlock: 2063 dma_resv_unlock(pw->batch->resv); 2064 err_trampoline: 2065 if (trampoline) 2066 i915_active_release(&trampoline->active); 2067 err_shadow: 2068 i915_active_release(&shadow->active); 2069 err_batch: 2070 i915_active_release(&eb->batch->active); 2071 err_free: 2072 kfree(pw); 2073 return err; 2074 } 2075 2076 static int eb_parse(struct i915_execbuffer *eb) 2077 { 2078 struct intel_engine_pool_node *pool; 2079 struct i915_vma *shadow, *trampoline; 2080 unsigned int len; 2081 int err; 2082 2083 if (!eb_use_cmdparser(eb)) 2084 return 0; 2085 2086 len = eb->batch_len; 2087 if (!CMDPARSER_USES_GGTT(eb->i915)) { 2088 /* 2089 * ppGTT backed shadow buffers must be mapped RO, to prevent 2090 * post-scan tampering 2091 */ 2092 if (!eb->context->vm->has_read_only) { 2093 DRM_DEBUG("Cannot prevent post-scan tampering without RO capable vm\n"); 2094 return -EINVAL; 2095 } 2096 } else { 2097 len += I915_CMD_PARSER_TRAMPOLINE_SIZE; 2098 } 2099 2100 pool = intel_engine_get_pool(eb->engine, len); 2101 if (IS_ERR(pool)) 2102 return PTR_ERR(pool); 2103 2104 shadow = shadow_batch_pin(pool->obj, eb->context->vm, PIN_USER); 2105 if (IS_ERR(shadow)) { 2106 err = PTR_ERR(shadow); 2107 goto err; 2108 } 2109 i915_gem_object_set_readonly(shadow->obj); 2110 2111 trampoline = NULL; 2112 if (CMDPARSER_USES_GGTT(eb->i915)) { 2113 trampoline = shadow; 2114 2115 shadow = shadow_batch_pin(pool->obj, 2116 &eb->engine->gt->ggtt->vm, 2117 PIN_GLOBAL); 2118 if (IS_ERR(shadow)) { 2119 err = PTR_ERR(shadow); 2120 shadow = trampoline; 2121 goto err_shadow; 2122 } 2123 2124 eb->batch_flags |= I915_DISPATCH_SECURE; 2125 } 2126 2127 err = eb_parse_pipeline(eb, shadow, trampoline); 2128 if (err) 2129 goto err_trampoline; 2130 2131 eb->vma[eb->buffer_count] = i915_vma_get(shadow); 2132 eb->flags[eb->buffer_count] = 2133 __EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_REF; 2134 shadow->exec_flags = &eb->flags[eb->buffer_count]; 2135 eb->buffer_count++; 2136 2137 eb->trampoline = trampoline; 2138 eb->batch_start_offset = 0; 2139 eb->batch = shadow; 2140 2141 shadow->private = pool; 2142 return 0; 2143 2144 err_trampoline: 2145 if (trampoline) 2146 i915_vma_unpin(trampoline); 2147 err_shadow: 2148 i915_vma_unpin(shadow); 2149 err: 2150 intel_engine_pool_put(pool); 2151 return err; 2152 } 2153 2154 static void 2155 add_to_client(struct i915_request *rq, struct drm_file *file) 2156 { 2157 struct drm_i915_file_private *file_priv = file->driver_priv; 2158 2159 rq->file_priv = file_priv; 2160 2161 
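	/*
	 * Publish the request on the per-file client list so that other paths
	 * (e.g. the throttle ioctl and file-close cleanup) can find the
	 * requests belonging to this client.
	 */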
spin_lock(&file_priv->mm.lock); 2162 list_add_tail(&rq->client_link, &file_priv->mm.request_list); 2163 spin_unlock(&file_priv->mm.lock); 2164 } 2165 2166 static int eb_submit(struct i915_execbuffer *eb) 2167 { 2168 int err; 2169 2170 err = eb_move_to_gpu(eb); 2171 if (err) 2172 return err; 2173 2174 if (eb->args->flags & I915_EXEC_GEN7_SOL_RESET) { 2175 err = i915_reset_gen7_sol_offsets(eb->request); 2176 if (err) 2177 return err; 2178 } 2179 2180 /* 2181 * Once we have completed waiting for other engines (using HW semaphores), 2182 * we can signal that this request/batch is ready to run. This 2183 * allows us to determine if the batch is still waiting on the GPU 2184 * or actually running by checking the breadcrumb. 2185 */ 2186 if (eb->engine->emit_init_breadcrumb) { 2187 err = eb->engine->emit_init_breadcrumb(eb->request); 2188 if (err) 2189 return err; 2190 } 2191 2192 err = eb->engine->emit_bb_start(eb->request, 2193 eb->batch->node.start + 2194 eb->batch_start_offset, 2195 eb->batch_len, 2196 eb->batch_flags); 2197 if (err) 2198 return err; 2199 2200 if (eb->trampoline) { 2201 GEM_BUG_ON(eb->batch_start_offset); 2202 err = eb->engine->emit_bb_start(eb->request, 2203 eb->trampoline->node.start + 2204 eb->batch_len, 2205 0, 0); 2206 if (err) 2207 return err; 2208 } 2209 2210 if (intel_context_nopreempt(eb->context)) 2211 __set_bit(I915_FENCE_FLAG_NOPREEMPT, &eb->request->fence.flags); 2212 2213 return 0; 2214 } 2215 2216 static int num_vcs_engines(const struct drm_i915_private *i915) 2217 { 2218 return hweight64(INTEL_INFO(i915)->engine_mask & 2219 GENMASK_ULL(VCS0 + I915_MAX_VCS - 1, VCS0)); 2220 } 2221 2222 /* 2223 * Find one BSD ring to dispatch the corresponding BSD command. 2224 * The engine index is returned. 2225 */ 2226 static unsigned int 2227 gen8_dispatch_bsd_engine(struct drm_i915_private *dev_priv, 2228 struct drm_file *file) 2229 { 2230 struct drm_i915_file_private *file_priv = file->driver_priv; 2231 2232 /* Check whether the file_priv has already selected one ring. */ 2233 if ((int)file_priv->bsd_engine < 0) 2234 file_priv->bsd_engine = 2235 get_random_int() % num_vcs_engines(dev_priv); 2236 2237 return file_priv->bsd_engine; 2238 } 2239 2240 static const enum intel_engine_id user_ring_map[] = { 2241 [I915_EXEC_DEFAULT] = RCS0, 2242 [I915_EXEC_RENDER] = RCS0, 2243 [I915_EXEC_BLT] = BCS0, 2244 [I915_EXEC_BSD] = VCS0, 2245 [I915_EXEC_VEBOX] = VECS0 2246 }; 2247 2248 static struct i915_request *eb_throttle(struct intel_context *ce) 2249 { 2250 struct intel_ring *ring = ce->ring; 2251 struct intel_timeline *tl = ce->timeline; 2252 struct i915_request *rq; 2253 2254 /* 2255 * Completely unscientific finger-in-the-air estimates for suitable 2256 * maximum user request size (to avoid blocking) and then backoff. 2257 */ 2258 if (intel_ring_update_space(ring) >= PAGE_SIZE) 2259 return NULL; 2260 2261 /* 2262 * Find a request such that, after waiting upon it, at least half of 2263 * the ring will be available. The hysteresis allows us to compete for the 2264 * shared ring and should mean that we sleep less often prior to 2265 * claiming our resources, but not so long that the ring completely 2266 * drains before we can submit our next request.
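 * Concretely: we pick the oldest request on this timeline whose completion
 * would leave more than ring->size / 2 bytes free, and the caller then
 * waits upon it before constructing its own request.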
2267 */ 2268 list_for_each_entry(rq, &tl->requests, link) { 2269 if (rq->ring != ring) 2270 continue; 2271 2272 if (__intel_ring_space(rq->postfix, 2273 ring->emit, ring->size) > ring->size / 2) 2274 break; 2275 } 2276 if (&rq->link == &tl->requests) 2277 return NULL; /* weird, we will check again later for real */ 2278 2279 return i915_request_get(rq); 2280 } 2281 2282 static int __eb_pin_engine(struct i915_execbuffer *eb, struct intel_context *ce) 2283 { 2284 struct intel_timeline *tl; 2285 struct i915_request *rq; 2286 int err; 2287 2288 /* 2289 * ABI: Before userspace accesses the GPU (e.g. execbuffer), report 2290 * EIO if the GPU is already wedged. 2291 */ 2292 err = intel_gt_terminally_wedged(ce->engine->gt); 2293 if (err) 2294 return err; 2295 2296 if (unlikely(intel_context_is_banned(ce))) 2297 return -EIO; 2298 2299 /* 2300 * Pinning the contexts may generate requests in order to acquire 2301 * GGTT space, so do this first before we reserve a seqno for 2302 * ourselves. 2303 */ 2304 err = intel_context_pin(ce); 2305 if (err) 2306 return err; 2307 2308 /* 2309 * Take a local wakeref for preparing to dispatch the execbuf as 2310 * we expect to access the hardware fairly frequently in the 2311 * process, and require the engine to be kept awake between accesses. 2312 * Upon dispatch, we acquire another prolonged wakeref that we hold 2313 * until the timeline is idle, which in turn releases the wakeref 2314 * taken on the engine, and the parent device. 2315 */ 2316 tl = intel_context_timeline_lock(ce); 2317 if (IS_ERR(tl)) { 2318 err = PTR_ERR(tl); 2319 goto err_unpin; 2320 } 2321 2322 intel_context_enter(ce); 2323 rq = eb_throttle(ce); 2324 2325 intel_context_timeline_unlock(tl); 2326 2327 if (rq) { 2328 if (i915_request_wait(rq, 2329 I915_WAIT_INTERRUPTIBLE, 2330 MAX_SCHEDULE_TIMEOUT) < 0) { 2331 i915_request_put(rq); 2332 err = -EINTR; 2333 goto err_exit; 2334 } 2335 2336 i915_request_put(rq); 2337 } 2338 2339 eb->engine = ce->engine; 2340 eb->context = ce; 2341 return 0; 2342 2343 err_exit: 2344 mutex_lock(&tl->mutex); 2345 intel_context_exit(ce); 2346 intel_context_timeline_unlock(tl); 2347 err_unpin: 2348 intel_context_unpin(ce); 2349 return err; 2350 } 2351 2352 static void eb_unpin_engine(struct i915_execbuffer *eb) 2353 { 2354 struct intel_context *ce = eb->context; 2355 struct intel_timeline *tl = ce->timeline; 2356 2357 mutex_lock(&tl->mutex); 2358 intel_context_exit(ce); 2359 mutex_unlock(&tl->mutex); 2360 2361 intel_context_unpin(ce); 2362 } 2363 2364 static unsigned int 2365 eb_select_legacy_ring(struct i915_execbuffer *eb, 2366 struct drm_file *file, 2367 struct drm_i915_gem_execbuffer2 *args) 2368 { 2369 struct drm_i915_private *i915 = eb->i915; 2370 unsigned int user_ring_id = args->flags & I915_EXEC_RING_MASK; 2371 2372 if (user_ring_id != I915_EXEC_BSD && 2373 (args->flags & I915_EXEC_BSD_MASK)) { 2374 DRM_DEBUG("execbuf with non bsd ring but with invalid " 2375 "bsd dispatch flags: %d\n", (int)(args->flags)); 2376 return -1; 2377 } 2378 2379 if (user_ring_id == I915_EXEC_BSD && num_vcs_engines(i915) > 1) { 2380 unsigned int bsd_idx = args->flags & I915_EXEC_BSD_MASK; 2381 2382 if (bsd_idx == I915_EXEC_BSD_DEFAULT) { 2383 bsd_idx = gen8_dispatch_bsd_engine(i915, file); 2384 } else if (bsd_idx >= I915_EXEC_BSD_RING1 && 2385 bsd_idx <= I915_EXEC_BSD_RING2) { 2386 bsd_idx >>= I915_EXEC_BSD_SHIFT; 2387 bsd_idx--; 2388 } else { 2389 DRM_DEBUG("execbuf with unknown bsd ring: %u\n", 2390 bsd_idx); 2391 return -1; 2392 } 2393 2394 return _VCS(bsd_idx); 2395 } 2396 2397 if 
(user_ring_id >= ARRAY_SIZE(user_ring_map)) { 2398 DRM_DEBUG("execbuf with unknown ring: %u\n", user_ring_id); 2399 return -1; 2400 } 2401 2402 return user_ring_map[user_ring_id]; 2403 } 2404 2405 static int 2406 eb_pin_engine(struct i915_execbuffer *eb, 2407 struct drm_file *file, 2408 struct drm_i915_gem_execbuffer2 *args) 2409 { 2410 struct intel_context *ce; 2411 unsigned int idx; 2412 int err; 2413 2414 if (i915_gem_context_user_engines(eb->gem_context)) 2415 idx = args->flags & I915_EXEC_RING_MASK; 2416 else 2417 idx = eb_select_legacy_ring(eb, file, args); 2418 2419 ce = i915_gem_context_get_engine(eb->gem_context, idx); 2420 if (IS_ERR(ce)) 2421 return PTR_ERR(ce); 2422 2423 err = __eb_pin_engine(eb, ce); 2424 intel_context_put(ce); 2425 2426 return err; 2427 } 2428 2429 static void 2430 __free_fence_array(struct drm_syncobj **fences, unsigned int n) 2431 { 2432 while (n--) 2433 drm_syncobj_put(ptr_mask_bits(fences[n], 2)); 2434 kvfree(fences); 2435 } 2436 2437 static struct drm_syncobj ** 2438 get_fence_array(struct drm_i915_gem_execbuffer2 *args, 2439 struct drm_file *file) 2440 { 2441 const unsigned long nfences = args->num_cliprects; 2442 struct drm_i915_gem_exec_fence __user *user; 2443 struct drm_syncobj **fences; 2444 unsigned long n; 2445 int err; 2446 2447 if (!(args->flags & I915_EXEC_FENCE_ARRAY)) 2448 return NULL; 2449 2450 /* Check multiplication overflow for access_ok() and kvmalloc_array() */ 2451 BUILD_BUG_ON(sizeof(size_t) > sizeof(unsigned long)); 2452 if (nfences > min_t(unsigned long, 2453 ULONG_MAX / sizeof(*user), 2454 SIZE_MAX / sizeof(*fences))) 2455 return ERR_PTR(-EINVAL); 2456 2457 user = u64_to_user_ptr(args->cliprects_ptr); 2458 if (!access_ok(user, nfences * sizeof(*user))) 2459 return ERR_PTR(-EFAULT); 2460 2461 fences = kvmalloc_array(nfences, sizeof(*fences), 2462 __GFP_NOWARN | GFP_KERNEL); 2463 if (!fences) 2464 return ERR_PTR(-ENOMEM); 2465 2466 for (n = 0; n < nfences; n++) { 2467 struct drm_i915_gem_exec_fence fence; 2468 struct drm_syncobj *syncobj; 2469 2470 if (__copy_from_user(&fence, user++, sizeof(fence))) { 2471 err = -EFAULT; 2472 goto err; 2473 } 2474 2475 if (fence.flags & __I915_EXEC_FENCE_UNKNOWN_FLAGS) { 2476 err = -EINVAL; 2477 goto err; 2478 } 2479 2480 syncobj = drm_syncobj_find(file, fence.handle); 2481 if (!syncobj) { 2482 DRM_DEBUG("Invalid syncobj handle provided\n"); 2483 err = -ENOENT; 2484 goto err; 2485 } 2486 2487 BUILD_BUG_ON(~(ARCH_KMALLOC_MINALIGN - 1) & 2488 ~__I915_EXEC_FENCE_UNKNOWN_FLAGS); 2489 2490 fences[n] = ptr_pack_bits(syncobj, fence.flags, 2); 2491 } 2492 2493 return fences; 2494 2495 err: 2496 __free_fence_array(fences, n); 2497 return ERR_PTR(err); 2498 } 2499 2500 static void 2501 put_fence_array(struct drm_i915_gem_execbuffer2 *args, 2502 struct drm_syncobj **fences) 2503 { 2504 if (fences) 2505 __free_fence_array(fences, args->num_cliprects); 2506 } 2507 2508 static int 2509 await_fence_array(struct i915_execbuffer *eb, 2510 struct drm_syncobj **fences) 2511 { 2512 const unsigned int nfences = eb->args->num_cliprects; 2513 unsigned int n; 2514 int err; 2515 2516 for (n = 0; n < nfences; n++) { 2517 struct drm_syncobj *syncobj; 2518 struct dma_fence *fence; 2519 unsigned int flags; 2520 2521 syncobj = ptr_unpack_bits(fences[n], &flags, 2); 2522 if (!(flags & I915_EXEC_FENCE_WAIT)) 2523 continue; 2524 2525 fence = drm_syncobj_fence_get(syncobj); 2526 if (!fence) 2527 return -EINVAL; 2528 2529 err = i915_request_await_dma_fence(eb->request, fence); 2530 dma_fence_put(fence); 2531 if (err < 0) 2532 return 
err; 2533 } 2534 2535 return 0; 2536 } 2537 2538 static void 2539 signal_fence_array(struct i915_execbuffer *eb, 2540 struct drm_syncobj **fences) 2541 { 2542 const unsigned int nfences = eb->args->num_cliprects; 2543 struct dma_fence * const fence = &eb->request->fence; 2544 unsigned int n; 2545 2546 for (n = 0; n < nfences; n++) { 2547 struct drm_syncobj *syncobj; 2548 unsigned int flags; 2549 2550 syncobj = ptr_unpack_bits(fences[n], &flags, 2); 2551 if (!(flags & I915_EXEC_FENCE_SIGNAL)) 2552 continue; 2553 2554 drm_syncobj_replace_fence(syncobj, fence); 2555 } 2556 } 2557 2558 static int 2559 i915_gem_do_execbuffer(struct drm_device *dev, 2560 struct drm_file *file, 2561 struct drm_i915_gem_execbuffer2 *args, 2562 struct drm_i915_gem_exec_object2 *exec, 2563 struct drm_syncobj **fences) 2564 { 2565 struct drm_i915_private *i915 = to_i915(dev); 2566 struct i915_execbuffer eb; 2567 struct dma_fence *in_fence = NULL; 2568 struct dma_fence *exec_fence = NULL; 2569 struct sync_file *out_fence = NULL; 2570 int out_fence_fd = -1; 2571 int err; 2572 2573 BUILD_BUG_ON(__EXEC_INTERNAL_FLAGS & ~__I915_EXEC_ILLEGAL_FLAGS); 2574 BUILD_BUG_ON(__EXEC_OBJECT_INTERNAL_FLAGS & 2575 ~__EXEC_OBJECT_UNKNOWN_FLAGS); 2576 2577 eb.i915 = i915; 2578 eb.file = file; 2579 eb.args = args; 2580 if (DBG_FORCE_RELOC || !(args->flags & I915_EXEC_NO_RELOC)) 2581 args->flags |= __EXEC_HAS_RELOC; 2582 2583 eb.exec = exec; 2584 eb.vma = (struct i915_vma **)(exec + args->buffer_count + 1); 2585 eb.vma[0] = NULL; 2586 eb.flags = (unsigned int *)(eb.vma + args->buffer_count + 1); 2587 2588 eb.invalid_flags = __EXEC_OBJECT_UNKNOWN_FLAGS; 2589 reloc_cache_init(&eb.reloc_cache, eb.i915); 2590 2591 eb.buffer_count = args->buffer_count; 2592 eb.batch_start_offset = args->batch_start_offset; 2593 eb.batch_len = args->batch_len; 2594 eb.trampoline = NULL; 2595 2596 eb.batch_flags = 0; 2597 if (args->flags & I915_EXEC_SECURE) { 2598 if (INTEL_GEN(i915) >= 11) 2599 return -ENODEV; 2600 2601 /* Return -EPERM to trigger fallback code on old binaries. 
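 * I915_EXEC_SECURE is only honoured where HAS_SECURE_BATCHES() is true;
 * refusing it outright lets old userspace detect the EPERM and fall back
 * to an unprivileged batch.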
*/ 2602 if (!HAS_SECURE_BATCHES(i915)) 2603 return -EPERM; 2604 2605 if (!drm_is_current_master(file) || !capable(CAP_SYS_ADMIN)) 2606 return -EPERM; 2607 2608 eb.batch_flags |= I915_DISPATCH_SECURE; 2609 } 2610 if (args->flags & I915_EXEC_IS_PINNED) 2611 eb.batch_flags |= I915_DISPATCH_PINNED; 2612 2613 if (args->flags & I915_EXEC_FENCE_IN) { 2614 in_fence = sync_file_get_fence(lower_32_bits(args->rsvd2)); 2615 if (!in_fence) 2616 return -EINVAL; 2617 } 2618 2619 if (args->flags & I915_EXEC_FENCE_SUBMIT) { 2620 if (in_fence) { 2621 err = -EINVAL; 2622 goto err_in_fence; 2623 } 2624 2625 exec_fence = sync_file_get_fence(lower_32_bits(args->rsvd2)); 2626 if (!exec_fence) { 2627 err = -EINVAL; 2628 goto err_in_fence; 2629 } 2630 } 2631 2632 if (args->flags & I915_EXEC_FENCE_OUT) { 2633 out_fence_fd = get_unused_fd_flags(O_CLOEXEC); 2634 if (out_fence_fd < 0) { 2635 err = out_fence_fd; 2636 goto err_exec_fence; 2637 } 2638 } 2639 2640 err = eb_create(&eb); 2641 if (err) 2642 goto err_out_fence; 2643 2644 GEM_BUG_ON(!eb.lut_size); 2645 2646 err = eb_select_context(&eb); 2647 if (unlikely(err)) 2648 goto err_destroy; 2649 2650 err = eb_pin_engine(&eb, file, args); 2651 if (unlikely(err)) 2652 goto err_context; 2653 2654 err = i915_mutex_lock_interruptible(dev); 2655 if (err) 2656 goto err_engine; 2657 2658 err = eb_relocate(&eb); 2659 if (err) { 2660 /* 2661 * If the user expects the execobject.offset and 2662 * reloc.presumed_offset to be an exact match, 2663 * as for using NO_RELOC, then we cannot update 2664 * the execobject.offset until we have completed 2665 * relocation. 2666 */ 2667 args->flags &= ~__EXEC_HAS_RELOC; 2668 goto err_vma; 2669 } 2670 2671 if (unlikely(*eb.batch->exec_flags & EXEC_OBJECT_WRITE)) { 2672 DRM_DEBUG("Attempting to use self-modifying batch buffer\n"); 2673 err = -EINVAL; 2674 goto err_vma; 2675 } 2676 if (eb.batch_start_offset > eb.batch->size || 2677 eb.batch_len > eb.batch->size - eb.batch_start_offset) { 2678 DRM_DEBUG("Attempting to use out-of-bounds batch\n"); 2679 err = -EINVAL; 2680 goto err_vma; 2681 } 2682 2683 if (eb.batch_len == 0) 2684 eb.batch_len = eb.batch->size - eb.batch_start_offset; 2685 2686 err = eb_parse(&eb); 2687 if (err) 2688 goto err_vma; 2689 2690 /* 2691 * snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure 2692 * batch" bit. Hence we need to pin secure batches into the global gtt. 2693 * hsw should have this fixed, but bdw mucks it up again. */ 2694 if (eb.batch_flags & I915_DISPATCH_SECURE) { 2695 struct i915_vma *vma; 2696 2697 /* 2698 * So on first glance it looks freaky that we pin the batch here 2699 * outside of the reservation loop. But: 2700 * - The batch is already pinned into the relevant ppgtt, so we 2701 * already have the backing storage fully allocated. 2702 * - No other BO uses the global gtt (well contexts, but meh), 2703 * so we don't really have issues with multiple objects not 2704 * fitting due to fragmentation. 2705 * So this is actually safe. 2706 */ 2707 vma = i915_gem_object_ggtt_pin(eb.batch->obj, NULL, 0, 0, 0); 2708 if (IS_ERR(vma)) { 2709 err = PTR_ERR(vma); 2710 goto err_vma; 2711 } 2712 2713 eb.batch = vma; 2714 } 2715 2716 /* All GPU relocation batches must be submitted prior to the user rq */ 2717 GEM_BUG_ON(eb.reloc_cache.rq); 2718 2719 /* Allocate a request for this batch buffer nice and early. 
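 * All of the execution dependencies below (in_fence, exec_fence and the
 * syncobj wait array) are attached to this request before it is committed
 * via i915_request_add().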
*/ 2720 eb.request = i915_request_create(eb.context); 2721 if (IS_ERR(eb.request)) { 2722 err = PTR_ERR(eb.request); 2723 goto err_batch_unpin; 2724 } 2725 2726 if (in_fence) { 2727 err = i915_request_await_dma_fence(eb.request, in_fence); 2728 if (err < 0) 2729 goto err_request; 2730 } 2731 2732 if (exec_fence) { 2733 err = i915_request_await_execution(eb.request, exec_fence, 2734 eb.engine->bond_execute); 2735 if (err < 0) 2736 goto err_request; 2737 } 2738 2739 if (fences) { 2740 err = await_fence_array(&eb, fences); 2741 if (err) 2742 goto err_request; 2743 } 2744 2745 if (out_fence_fd != -1) { 2746 out_fence = sync_file_create(&eb.request->fence); 2747 if (!out_fence) { 2748 err = -ENOMEM; 2749 goto err_request; 2750 } 2751 } 2752 2753 /* 2754 * Whilst this request exists, batch_obj will be on the 2755 * active_list, and so will hold the active reference. Only when this 2756 * request is retired will the batch_obj be moved onto the 2757 * inactive_list and lose its active reference. Hence we do not need 2758 * to explicitly hold another reference here. 2759 */ 2760 eb.request->batch = eb.batch; 2761 if (eb.batch->private) 2762 intel_engine_pool_mark_active(eb.batch->private, eb.request); 2763 2764 trace_i915_request_queue(eb.request, eb.batch_flags); 2765 err = eb_submit(&eb); 2766 err_request: 2767 add_to_client(eb.request, file); 2768 i915_request_get(eb.request); 2769 i915_request_add(eb.request); 2770 2771 if (fences) 2772 signal_fence_array(&eb, fences); 2773 2774 if (out_fence) { 2775 if (err == 0) { 2776 fd_install(out_fence_fd, out_fence->file); 2777 args->rsvd2 &= GENMASK_ULL(31, 0); /* keep in-fence */ 2778 args->rsvd2 |= (u64)out_fence_fd << 32; 2779 out_fence_fd = -1; 2780 } else { 2781 fput(out_fence->file); 2782 } 2783 } 2784 i915_request_put(eb.request); 2785 2786 err_batch_unpin: 2787 if (eb.batch_flags & I915_DISPATCH_SECURE) 2788 i915_vma_unpin(eb.batch); 2789 if (eb.batch->private) 2790 intel_engine_pool_put(eb.batch->private); 2791 err_vma: 2792 if (eb.exec) 2793 eb_release_vmas(&eb); 2794 if (eb.trampoline) 2795 i915_vma_unpin(eb.trampoline); 2796 mutex_unlock(&dev->struct_mutex); 2797 err_engine: 2798 eb_unpin_engine(&eb); 2799 err_context: 2800 i915_gem_context_put(eb.gem_context); 2801 err_destroy: 2802 eb_destroy(&eb); 2803 err_out_fence: 2804 if (out_fence_fd != -1) 2805 put_unused_fd(out_fence_fd); 2806 err_exec_fence: 2807 dma_fence_put(exec_fence); 2808 err_in_fence: 2809 dma_fence_put(in_fence); 2810 return err; 2811 } 2812 2813 static size_t eb_element_size(void) 2814 { 2815 return (sizeof(struct drm_i915_gem_exec_object2) + 2816 sizeof(struct i915_vma *) + 2817 sizeof(unsigned int)); 2818 } 2819 2820 static bool check_buffer_count(size_t count) 2821 { 2822 const size_t sz = eb_element_size(); 2823 2824 /* 2825 * When using LUT_HANDLE, we impose a limit of INT_MAX for the lookup 2826 * array size (see eb_create()). Otherwise, we can accept an array as 2827 * large as can be addressed (though use large arrays at your peril)! 2828 */ 2829 2830 return !(count < 1 || count > INT_MAX || count > SIZE_MAX / sz - 1); 2831 } 2832 2833 /* 2834 * Legacy execbuffer just creates an exec2 list from the original exec object 2835 * list array and passes it to the real function.
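 * (struct drm_i915_gem_exec_object lacks the flags and newer fields of the
 * exec2 variant, so each entry is widened by hand in the loop below.)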
2836 */ 2837 int 2838 i915_gem_execbuffer_ioctl(struct drm_device *dev, void *data, 2839 struct drm_file *file) 2840 { 2841 struct drm_i915_gem_execbuffer *args = data; 2842 struct drm_i915_gem_execbuffer2 exec2; 2843 struct drm_i915_gem_exec_object *exec_list = NULL; 2844 struct drm_i915_gem_exec_object2 *exec2_list = NULL; 2845 const size_t count = args->buffer_count; 2846 unsigned int i; 2847 int err; 2848 2849 if (!check_buffer_count(count)) { 2850 DRM_DEBUG("execbuf2 with %zd buffers\n", count); 2851 return -EINVAL; 2852 } 2853 2854 exec2.buffers_ptr = args->buffers_ptr; 2855 exec2.buffer_count = args->buffer_count; 2856 exec2.batch_start_offset = args->batch_start_offset; 2857 exec2.batch_len = args->batch_len; 2858 exec2.DR1 = args->DR1; 2859 exec2.DR4 = args->DR4; 2860 exec2.num_cliprects = args->num_cliprects; 2861 exec2.cliprects_ptr = args->cliprects_ptr; 2862 exec2.flags = I915_EXEC_RENDER; 2863 i915_execbuffer2_set_context_id(exec2, 0); 2864 2865 err = i915_gem_check_execbuffer(&exec2); 2866 if (err) 2867 return err; 2868 2869 /* Copy in the exec list from userland */ 2870 exec_list = kvmalloc_array(count, sizeof(*exec_list), 2871 __GFP_NOWARN | GFP_KERNEL); 2872 exec2_list = kvmalloc_array(count + 1, eb_element_size(), 2873 __GFP_NOWARN | GFP_KERNEL); 2874 if (exec_list == NULL || exec2_list == NULL) { 2875 DRM_DEBUG("Failed to allocate exec list for %d buffers\n", 2876 args->buffer_count); 2877 kvfree(exec_list); 2878 kvfree(exec2_list); 2879 return -ENOMEM; 2880 } 2881 err = copy_from_user(exec_list, 2882 u64_to_user_ptr(args->buffers_ptr), 2883 sizeof(*exec_list) * count); 2884 if (err) { 2885 DRM_DEBUG("copy %d exec entries failed %d\n", 2886 args->buffer_count, err); 2887 kvfree(exec_list); 2888 kvfree(exec2_list); 2889 return -EFAULT; 2890 } 2891 2892 for (i = 0; i < args->buffer_count; i++) { 2893 exec2_list[i].handle = exec_list[i].handle; 2894 exec2_list[i].relocation_count = exec_list[i].relocation_count; 2895 exec2_list[i].relocs_ptr = exec_list[i].relocs_ptr; 2896 exec2_list[i].alignment = exec_list[i].alignment; 2897 exec2_list[i].offset = exec_list[i].offset; 2898 if (INTEL_GEN(to_i915(dev)) < 4) 2899 exec2_list[i].flags = EXEC_OBJECT_NEEDS_FENCE; 2900 else 2901 exec2_list[i].flags = 0; 2902 } 2903 2904 err = i915_gem_do_execbuffer(dev, file, &exec2, exec2_list, NULL); 2905 if (exec2.flags & __EXEC_HAS_RELOC) { 2906 struct drm_i915_gem_exec_object __user *user_exec_list = 2907 u64_to_user_ptr(args->buffers_ptr); 2908 2909 /* Copy the new buffer offsets back to the user's exec list. 
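 * Only entries whose offset was actually updated (tagged with the UPDATE
 * bit, an alias of PIN_OFFSET_FIXED) are copied back, after conversion to
 * the canonical (sign-extended) address form.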
*/ 2910 for (i = 0; i < args->buffer_count; i++) { 2911 if (!(exec2_list[i].offset & UPDATE)) 2912 continue; 2913 2914 exec2_list[i].offset = 2915 gen8_canonical_addr(exec2_list[i].offset & PIN_OFFSET_MASK); 2916 exec2_list[i].offset &= PIN_OFFSET_MASK; 2917 if (__copy_to_user(&user_exec_list[i].offset, 2918 &exec2_list[i].offset, 2919 sizeof(user_exec_list[i].offset))) 2920 break; 2921 } 2922 } 2923 2924 kvfree(exec_list); 2925 kvfree(exec2_list); 2926 return err; 2927 } 2928 2929 int 2930 i915_gem_execbuffer2_ioctl(struct drm_device *dev, void *data, 2931 struct drm_file *file) 2932 { 2933 struct drm_i915_gem_execbuffer2 *args = data; 2934 struct drm_i915_gem_exec_object2 *exec2_list; 2935 struct drm_syncobj **fences = NULL; 2936 const size_t count = args->buffer_count; 2937 int err; 2938 2939 if (!check_buffer_count(count)) { 2940 DRM_DEBUG("execbuf2 with %zd buffers\n", count); 2941 return -EINVAL; 2942 } 2943 2944 err = i915_gem_check_execbuffer(args); 2945 if (err) 2946 return err; 2947 2948 /* Allocate an extra slot for use by the command parser */ 2949 exec2_list = kvmalloc_array(count + 1, eb_element_size(), 2950 __GFP_NOWARN | GFP_KERNEL); 2951 if (exec2_list == NULL) { 2952 DRM_DEBUG("Failed to allocate exec list for %zd buffers\n", 2953 count); 2954 return -ENOMEM; 2955 } 2956 if (copy_from_user(exec2_list, 2957 u64_to_user_ptr(args->buffers_ptr), 2958 sizeof(*exec2_list) * count)) { 2959 DRM_DEBUG("copy %zd exec entries failed\n", count); 2960 kvfree(exec2_list); 2961 return -EFAULT; 2962 } 2963 2964 if (args->flags & I915_EXEC_FENCE_ARRAY) { 2965 fences = get_fence_array(args, file); 2966 if (IS_ERR(fences)) { 2967 kvfree(exec2_list); 2968 return PTR_ERR(fences); 2969 } 2970 } 2971 2972 err = i915_gem_do_execbuffer(dev, file, args, exec2_list, fences); 2973 2974 /* 2975 * Now that we have begun execution of the batchbuffer, we ignore 2976 * any new error after this point. Also given that we have already 2977 * updated the associated relocations, we try to write out the current 2978 * object locations irrespective of any error. 2979 */ 2980 if (args->flags & __EXEC_HAS_RELOC) { 2981 struct drm_i915_gem_exec_object2 __user *user_exec_list = 2982 u64_to_user_ptr(args->buffers_ptr); 2983 unsigned int i; 2984 2985 /* Copy the new buffer offsets back to the user's exec list. */ 2986 /* 2987 * Note: count * sizeof(*user_exec_list) does not overflow, 2988 * because we checked 'count' in check_buffer_count(). 2989 * 2990 * And this range already got effectively checked earlier 2991 * when we did the "copy_from_user()" above. 2992 */ 2993 if (!user_access_begin(user_exec_list, count * sizeof(*user_exec_list))) 2994 goto end; 2995 2996 for (i = 0; i < args->buffer_count; i++) { 2997 if (!(exec2_list[i].offset & UPDATE)) 2998 continue; 2999 3000 exec2_list[i].offset = 3001 gen8_canonical_addr(exec2_list[i].offset & PIN_OFFSET_MASK); 3002 unsafe_put_user(exec2_list[i].offset, 3003 &user_exec_list[i].offset, 3004 end_user); 3005 } 3006 end_user: 3007 user_access_end(); 3008 end:; 3009 } 3010 3011 args->flags &= ~__I915_EXEC_UNKNOWN_FLAGS; 3012 put_fence_array(args, fences); 3013 kvfree(exec2_list); 3014 return err; 3015 } 3016
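/*
 * For reference, a minimal userspace submission that reaches
 * i915_gem_execbuffer2_ioctl() looks roughly like the sketch below (not
 * compiled here; the structure, flag and macro names are the uAPI from
 * include/uapi/drm/i915_drm.h, everything else is a placeholder). Note
 * that the batch object is the last entry in the array unless
 * I915_EXEC_BATCH_FIRST is given.
 *
 *	struct drm_i915_gem_exec_object2 obj[2] = {
 *		{ .handle = target_handle },
 *		{ .handle = batch_handle },
 *	};
 *	struct drm_i915_gem_execbuffer2 execbuf = {
 *		.buffers_ptr = (uintptr_t)obj,
 *		.buffer_count = 2,
 *		.batch_len = batch_size,
 *		.flags = I915_EXEC_RENDER | I915_EXEC_NO_RELOC,
 *	};
 *
 *	i915_execbuffer2_set_context_id(execbuf, ctx_id);
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
 */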