/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2008,2010 Intel Corporation
 */

#include <linux/intel-iommu.h>
#include <linux/dma-resv.h>
#include <linux/sync_file.h>
#include <linux/uaccess.h>

#include <drm/drm_syncobj.h>
#include <drm/i915_drm.h>

#include "display/intel_frontbuffer.h"

#include "gem/i915_gem_ioctls.h"
#include "gt/intel_context.h"
#include "gt/intel_engine_pool.h"
#include "gt/intel_gt.h"
#include "gt/intel_gt_pm.h"
#include "gt/intel_ring.h"

#include "i915_drv.h"
#include "i915_gem_clflush.h"
#include "i915_gem_context.h"
#include "i915_gem_ioctls.h"
#include "i915_sw_fence_work.h"
#include "i915_trace.h"

enum {
	FORCE_CPU_RELOC = 1,
	FORCE_GTT_RELOC,
	FORCE_GPU_RELOC,
#define DBG_FORCE_RELOC 0 /* choose one of the above! */
};

#define __EXEC_OBJECT_HAS_REF		BIT(31)
#define __EXEC_OBJECT_HAS_PIN		BIT(30)
#define __EXEC_OBJECT_HAS_FENCE		BIT(29)
#define __EXEC_OBJECT_NEEDS_MAP		BIT(28)
#define __EXEC_OBJECT_NEEDS_BIAS	BIT(27)
#define __EXEC_OBJECT_INTERNAL_FLAGS	(~0u << 27) /* all of the above */
#define __EXEC_OBJECT_RESERVED (__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE)

#define __EXEC_HAS_RELOC	BIT(31)
#define __EXEC_VALIDATED	BIT(30)
#define __EXEC_INTERNAL_FLAGS	(~0u << 30)
#define UPDATE			PIN_OFFSET_FIXED

#define BATCH_OFFSET_BIAS (256*1024)

#define __I915_EXEC_ILLEGAL_FLAGS \
	(__I915_EXEC_UNKNOWN_FLAGS | \
	 I915_EXEC_CONSTANTS_MASK | \
	 I915_EXEC_RESOURCE_STREAMER)

/* Catch emission of unexpected errors for CI! */
#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
#undef EINVAL
#define EINVAL ({ \
	DRM_DEBUG_DRIVER("EINVAL at %s:%d\n", __func__, __LINE__); \
	22; \
})
#endif

/**
 * DOC: User command execution
 *
 * Userspace submits commands to be executed on the GPU as an instruction
 * stream within a GEM object we call a batchbuffer. These instructions may
 * refer to other GEM objects containing auxiliary state such as kernels,
 * samplers, render targets and even secondary batchbuffers. Userspace does
 * not know where in the GPU memory these objects reside and so before the
 * batchbuffer is passed to the GPU for execution, those addresses in the
 * batchbuffer and auxiliary objects are updated. This is known as relocation,
 * or patching. To try and avoid having to relocate each object on the next
 * execution, userspace is told the location of those objects in this pass,
 * but this remains just a hint as the kernel may choose a new location for
 * any object in the future.
 *
 * At the level of talking to the hardware, submitting a batchbuffer for the
 * GPU to execute is to add content to a buffer from which the HW
 * command streamer is reading.
 *
 * 1. Add a command to load the HW context. For Logical Ring Contexts, i.e.
 *    Execlists, this command is not placed on the same buffer as the
 *    remaining items.
 *
 * 2. Add a command to invalidate caches to the buffer.
 *
 * 3. Add a batchbuffer start command to the buffer; the start command is
 *    essentially a token together with the GPU address of the batchbuffer
 *    to be executed.
 *
 * 4. Add a pipeline flush to the buffer.
 *
 * 5. Add a memory write command to the buffer to record when the GPU
 *    is done executing the batchbuffer.
 *    The memory write writes the global sequence number of the request,
 *    ``i915_request::global_seqno``; the i915 driver uses the current value
 *    in the register to determine if the GPU has completed the batchbuffer.
 *
 * 6. Add a user interrupt command to the buffer. This command instructs
 *    the GPU to issue an interrupt when the command, pipeline flush and
 *    memory write are completed.
 *
 * 7. Inform the hardware of the additional commands added to the buffer
 *    (by updating the tail pointer).
 *
 * Processing an execbuf ioctl is conceptually split up into a few phases.
 *
 * 1. Validation - Ensure all the pointers, handles and flags are valid.
 * 2. Reservation - Assign GPU address space for every object
 * 3. Relocation - Update any addresses to point to the final locations
 * 4. Serialisation - Order the request with respect to its dependencies
 * 5. Construction - Construct a request to execute the batchbuffer
 * 6. Submission (at some point in the future execution)
 *
 * Reserving resources for the execbuf is the most complicated phase. We
 * neither want to have to migrate the object in the address space, nor do
 * we want to have to update any relocations pointing to this object. Ideally,
 * we want to leave the object where it is and for all the existing relocations
 * to match. If the object is given a new address, or if userspace thinks the
 * object is elsewhere, we have to parse all the relocation entries and update
 * the addresses. Userspace can set the I915_EXEC_NO_RELOC flag to hint that
 * all the target addresses in all of its objects match the value in the
 * relocation entries and that they all match the presumed offsets given by the
 * list of execbuffer objects. Using this knowledge, we know that if we haven't
 * moved any buffers, all the relocation entries are valid and we can skip
 * the update. (If userspace is wrong, the likely outcome is an impromptu GPU
 * hang.) The requirements for using I915_EXEC_NO_RELOC are:
 *
 *      The addresses written in the objects must match the corresponding
 *      reloc.presumed_offset which in turn must match the corresponding
 *      execobject.offset.
 *
 *      Any render targets written to in the batch must be flagged with
 *      EXEC_OBJECT_WRITE.
 *
 *      To avoid stalling, execobject.offset should match the current
 *      address of that object within the active context.
 *
 * The reservation is done in multiple phases. First we try to keep any
 * object already bound at its current location, so long as it meets the
 * constraints imposed by the new execbuffer. Any object left unbound after the
 * first pass is then fitted into any available idle space. If an object does
 * not fit, all objects are removed from the reservation and the process rerun
 * after sorting the objects into a priority order (more difficult to fit
 * objects are tried first). Failing that, the entire VM is cleared and we try
 * to fit the execbuf one last time before concluding that it simply will not
 * fit.
 *
 * A small complication to all of this is that we allow userspace not only to
 * specify an alignment and a size for the object in the address space, but
 * we also allow userspace to specify the exact offset. Such objects are
 * simpler to place (the location is known a priori); all we have to do is make
 * sure the space is available.
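 *
 * As an illustration only (the struct and flag names below come from the
 * uAPI headers; the handles, offsets and sizes are invented placeholders),
 * a userspace driver relying on I915_EXEC_NO_RELOC echoes back the offsets
 * it was given on the previous execbuf and marks its render target as
 * written::
 *
 *	struct drm_i915_gem_exec_object2 obj[2] = {
 *		{ .handle = rt_handle, .offset = last_rt_offset,
 *		  .flags = EXEC_OBJECT_WRITE },
 *		{ .handle = batch_handle, .offset = last_batch_offset },
 *	};
 *	struct drm_i915_gem_execbuffer2 execbuf = {
 *		.buffers_ptr = (uintptr_t)obj,
 *		.buffer_count = 2,
 *		.batch_len = batch_bytes,
 *		.flags = I915_EXEC_RENDER | I915_EXEC_NO_RELOC |
 *			 I915_EXEC_HANDLE_LUT,
 *	};
 *
 * Should any of those objects have moved, the kernel writes the new offset
 * back into the array and falls back to processing the relocation entries.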
 *
 * Once all the objects are in place, patching up the buried pointers to point
 * to the final locations is a fairly simple job of walking over the relocation
 * entry arrays, looking up the right address and rewriting the value into
 * the object. Simple! ... The relocation entries are stored in user memory
 * and so to access them we have to copy them into a local buffer. That copy
 * has to avoid taking any pagefaults as they may lead back to a GEM object
 * requiring the struct_mutex (i.e. recursive deadlock). So once again we split
 * the relocation into multiple passes. First we try to do everything within an
 * atomic context (avoid the pagefaults) which requires that we never wait. If
 * we detect that we may wait, or if we need to fault, then we have to fall
 * back to a slower path. The slowpath has to drop the mutex. (Can you hear
 * alarm bells yet?) Dropping the mutex means that we lose all the state we
 * have built up so far for the execbuf and we must reset any global data.
 * However, we do leave the objects pinned in their final locations - which is
 * a potential issue for concurrent execbufs. Once we have left the mutex, we
 * can allocate and copy all the relocation entries into a large array at our
 * leisure, reacquire the mutex, reclaim all the objects and other state and
 * then proceed to update any incorrect addresses within the objects.
 *
 * As we process the relocation entries, we maintain a record of whether the
 * object is being written to. Using NO_RELOC, we expect userspace to provide
 * this information instead. We also check whether we can skip the relocation
 * by comparing the expected value inside the relocation entry with the target's
 * final address. If they differ, we have to map the current object and rewrite
 * the 4 or 8 byte pointer within.
 *
 * Serialising an execbuf is quite simple according to the rules of the GEM
 * ABI. Execution within each context is ordered by the order of submission.
 * Writes to any GEM object are in order of submission and are exclusive. Reads
 * from a GEM object are unordered with respect to other reads, but ordered by
 * writes. A write submitted after a read cannot occur before the read, and
 * similarly any read submitted after a write cannot occur before the write.
 * Writes are ordered between engines such that only one write occurs at any
 * time (completing any reads beforehand) - using semaphores where available
 * and CPU serialisation otherwise. Other GEM accesses obey the same rules; any
 * write (either via mmaps using set-domain, or via pwrite) must flush all GPU
 * reads before starting, and any read (either using set-domain or pread) must
 * flush all GPU writes before starting. (Note we only employ a barrier before;
 * we currently rely on userspace not concurrently starting a new execution
 * whilst reading or writing to an object. This may be an advantage or not
 * depending on how much you trust userspace not to shoot themselves in the
 * foot.) Serialisation may just result in the request being inserted into
 * a DAG awaiting its turn, but the simplest approach is to wait on the CPU
 * until all dependencies are resolved.
 *
 * After all of that, it is just a matter of closing the request and handing
 * it to the hardware (well, leaving it in a queue to be executed).
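 *
 * From userspace's point of view all of the phases above are hidden behind a
 * single ioctl; an illustrative call site (file descriptor setup, error and
 * fence handling omitted) is simply::
 *
 *	ret = ioctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
 *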
 * However, we also offer the ability for batchbuffers to be run with elevated
 * privileges so that they can access otherwise hidden registers. (Used to
 * adjust L3 cache etc.) Before any batch is given extra privileges we first
 * must check that it contains no nefarious instructions: we check that each
 * instruction is from our whitelist and that all registers are also from an
 * allowed list. We first copy the user's batchbuffer to a shadow (so that the
 * user doesn't have access to it, either by the CPU or GPU as we scan it) and
 * then parse each instruction. If everything is ok, we set a flag telling the
 * hardware to run the batchbuffer in trusted mode, otherwise the ioctl is
 * rejected.
 */

struct i915_execbuffer {
	struct drm_i915_private *i915; /** i915 backpointer */
	struct drm_file *file; /** per-file lookup tables and limits */
	struct drm_i915_gem_execbuffer2 *args; /** ioctl parameters */
	struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */
	struct i915_vma **vma;
	unsigned int *flags;

	struct intel_engine_cs *engine; /** engine to queue the request to */
	struct intel_context *context; /* logical state for the request */
	struct i915_gem_context *gem_context; /** caller's context */

	struct i915_request *request; /** our request to build */
	struct i915_vma *batch; /** identity of the batch obj/vma */
	struct i915_vma *trampoline; /** trampoline used for chaining */

	/** actual size of execobj[] as we may extend it for the cmdparser */
	unsigned int buffer_count;

	/** list of vma not yet bound during reservation phase */
	struct list_head unbound;

	/** list of vma that have execobj.relocation_count */
	struct list_head relocs;

	/**
	 * Track the most recently used object for relocations, as we
	 * frequently have to perform multiple relocations within the same
	 * obj/page
	 */
	struct reloc_cache {
		struct drm_mm_node node; /** temporary GTT binding */
		unsigned long vaddr; /** Current kmap address */
		unsigned long page; /** Currently mapped page index */
		unsigned int gen; /** Cached value of INTEL_GEN */
		bool use_64bit_reloc : 1;
		bool has_llc : 1;
		bool has_fence : 1;
		bool needs_unfenced : 1;

		struct i915_request *rq;
		u32 *rq_cmd;
		unsigned int rq_size;
	} reloc_cache;

	u64 invalid_flags; /** Set of execobj.flags that are invalid */
	u32 context_flags; /** Set of execobj.flags to insert from the ctx */

	u32 batch_start_offset; /** Location within object of batch */
	u32 batch_len; /** Length of batch within object */
	u32 batch_flags; /** Flags composed for emit_bb_start() */

	/**
	 * Indicate either the size of the hashtable used to resolve
	 * relocation handles, or if negative that we are using a direct
	 * index into the execobj[].
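	 * (A positive lut_size means relocation handles are hashed into
	 * eb->buckets[]; a negative value means the execbuffer index is
	 * used directly, see eb_get_vma().)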
274 */ 275 int lut_size; 276 struct hlist_head *buckets; /** ht for relocation handles */ 277 }; 278 279 #define exec_entry(EB, VMA) (&(EB)->exec[(VMA)->exec_flags - (EB)->flags]) 280 281 static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb) 282 { 283 return intel_engine_requires_cmd_parser(eb->engine) || 284 (intel_engine_using_cmd_parser(eb->engine) && 285 eb->args->batch_len); 286 } 287 288 static int eb_create(struct i915_execbuffer *eb) 289 { 290 if (!(eb->args->flags & I915_EXEC_HANDLE_LUT)) { 291 unsigned int size = 1 + ilog2(eb->buffer_count); 292 293 /* 294 * Without a 1:1 association between relocation handles and 295 * the execobject[] index, we instead create a hashtable. 296 * We size it dynamically based on available memory, starting 297 * first with 1:1 assocative hash and scaling back until 298 * the allocation succeeds. 299 * 300 * Later on we use a positive lut_size to indicate we are 301 * using this hashtable, and a negative value to indicate a 302 * direct lookup. 303 */ 304 do { 305 gfp_t flags; 306 307 /* While we can still reduce the allocation size, don't 308 * raise a warning and allow the allocation to fail. 309 * On the last pass though, we want to try as hard 310 * as possible to perform the allocation and warn 311 * if it fails. 312 */ 313 flags = GFP_KERNEL; 314 if (size > 1) 315 flags |= __GFP_NORETRY | __GFP_NOWARN; 316 317 eb->buckets = kzalloc(sizeof(struct hlist_head) << size, 318 flags); 319 if (eb->buckets) 320 break; 321 } while (--size); 322 323 if (unlikely(!size)) 324 return -ENOMEM; 325 326 eb->lut_size = size; 327 } else { 328 eb->lut_size = -eb->buffer_count; 329 } 330 331 return 0; 332 } 333 334 static bool 335 eb_vma_misplaced(const struct drm_i915_gem_exec_object2 *entry, 336 const struct i915_vma *vma, 337 unsigned int flags) 338 { 339 if (vma->node.size < entry->pad_to_size) 340 return true; 341 342 if (entry->alignment && !IS_ALIGNED(vma->node.start, entry->alignment)) 343 return true; 344 345 if (flags & EXEC_OBJECT_PINNED && 346 vma->node.start != entry->offset) 347 return true; 348 349 if (flags & __EXEC_OBJECT_NEEDS_BIAS && 350 vma->node.start < BATCH_OFFSET_BIAS) 351 return true; 352 353 if (!(flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) && 354 (vma->node.start + vma->node.size - 1) >> 32) 355 return true; 356 357 if (flags & __EXEC_OBJECT_NEEDS_MAP && 358 !i915_vma_is_map_and_fenceable(vma)) 359 return true; 360 361 return false; 362 } 363 364 static inline bool 365 eb_pin_vma(struct i915_execbuffer *eb, 366 const struct drm_i915_gem_exec_object2 *entry, 367 struct i915_vma *vma) 368 { 369 unsigned int exec_flags = *vma->exec_flags; 370 u64 pin_flags; 371 372 if (vma->node.size) 373 pin_flags = vma->node.start; 374 else 375 pin_flags = entry->offset & PIN_OFFSET_MASK; 376 377 pin_flags |= PIN_USER | PIN_NOEVICT | PIN_OFFSET_FIXED; 378 if (unlikely(exec_flags & EXEC_OBJECT_NEEDS_GTT)) 379 pin_flags |= PIN_GLOBAL; 380 381 if (unlikely(i915_vma_pin(vma, 0, 0, pin_flags))) 382 return false; 383 384 if (unlikely(exec_flags & EXEC_OBJECT_NEEDS_FENCE)) { 385 if (unlikely(i915_vma_pin_fence(vma))) { 386 i915_vma_unpin(vma); 387 return false; 388 } 389 390 if (vma->fence) 391 exec_flags |= __EXEC_OBJECT_HAS_FENCE; 392 } 393 394 *vma->exec_flags = exec_flags | __EXEC_OBJECT_HAS_PIN; 395 return !eb_vma_misplaced(entry, vma, exec_flags); 396 } 397 398 static inline void __eb_unreserve_vma(struct i915_vma *vma, unsigned int flags) 399 { 400 GEM_BUG_ON(!(flags & __EXEC_OBJECT_HAS_PIN)); 401 402 if (unlikely(flags & 
__EXEC_OBJECT_HAS_FENCE)) 403 __i915_vma_unpin_fence(vma); 404 405 __i915_vma_unpin(vma); 406 } 407 408 static inline void 409 eb_unreserve_vma(struct i915_vma *vma, unsigned int *flags) 410 { 411 if (!(*flags & __EXEC_OBJECT_HAS_PIN)) 412 return; 413 414 __eb_unreserve_vma(vma, *flags); 415 *flags &= ~__EXEC_OBJECT_RESERVED; 416 } 417 418 static int 419 eb_validate_vma(struct i915_execbuffer *eb, 420 struct drm_i915_gem_exec_object2 *entry, 421 struct i915_vma *vma) 422 { 423 if (unlikely(entry->flags & eb->invalid_flags)) 424 return -EINVAL; 425 426 if (unlikely(entry->alignment && !is_power_of_2(entry->alignment))) 427 return -EINVAL; 428 429 /* 430 * Offset can be used as input (EXEC_OBJECT_PINNED), reject 431 * any non-page-aligned or non-canonical addresses. 432 */ 433 if (unlikely(entry->flags & EXEC_OBJECT_PINNED && 434 entry->offset != gen8_canonical_addr(entry->offset & I915_GTT_PAGE_MASK))) 435 return -EINVAL; 436 437 /* pad_to_size was once a reserved field, so sanitize it */ 438 if (entry->flags & EXEC_OBJECT_PAD_TO_SIZE) { 439 if (unlikely(offset_in_page(entry->pad_to_size))) 440 return -EINVAL; 441 } else { 442 entry->pad_to_size = 0; 443 } 444 445 if (unlikely(vma->exec_flags)) { 446 DRM_DEBUG("Object [handle %d, index %d] appears more than once in object list\n", 447 entry->handle, (int)(entry - eb->exec)); 448 return -EINVAL; 449 } 450 451 /* 452 * From drm_mm perspective address space is continuous, 453 * so from this point we're always using non-canonical 454 * form internally. 455 */ 456 entry->offset = gen8_noncanonical_addr(entry->offset); 457 458 if (!eb->reloc_cache.has_fence) { 459 entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE; 460 } else { 461 if ((entry->flags & EXEC_OBJECT_NEEDS_FENCE || 462 eb->reloc_cache.needs_unfenced) && 463 i915_gem_object_is_tiled(vma->obj)) 464 entry->flags |= EXEC_OBJECT_NEEDS_GTT | __EXEC_OBJECT_NEEDS_MAP; 465 } 466 467 if (!(entry->flags & EXEC_OBJECT_PINNED)) 468 entry->flags |= eb->context_flags; 469 470 return 0; 471 } 472 473 static int 474 eb_add_vma(struct i915_execbuffer *eb, 475 unsigned int i, unsigned batch_idx, 476 struct i915_vma *vma) 477 { 478 struct drm_i915_gem_exec_object2 *entry = &eb->exec[i]; 479 int err; 480 481 GEM_BUG_ON(i915_vma_is_closed(vma)); 482 483 if (!(eb->args->flags & __EXEC_VALIDATED)) { 484 err = eb_validate_vma(eb, entry, vma); 485 if (unlikely(err)) 486 return err; 487 } 488 489 if (eb->lut_size > 0) { 490 vma->exec_handle = entry->handle; 491 hlist_add_head(&vma->exec_node, 492 &eb->buckets[hash_32(entry->handle, 493 eb->lut_size)]); 494 } 495 496 if (entry->relocation_count) 497 list_add_tail(&vma->reloc_link, &eb->relocs); 498 499 /* 500 * Stash a pointer from the vma to execobj, so we can query its flags, 501 * size, alignment etc as provided by the user. Also we stash a pointer 502 * to the vma inside the execobj so that we can use a direct lookup 503 * to find the right target VMA when doing relocations. 504 */ 505 eb->vma[i] = vma; 506 eb->flags[i] = entry->flags; 507 vma->exec_flags = &eb->flags[i]; 508 509 /* 510 * SNA is doing fancy tricks with compressing batch buffers, which leads 511 * to negative relocation deltas. Usually that works out ok since the 512 * relocate address is still positive, except when the batch is placed 513 * very low in the GTT. Ensure this doesn't happen. 514 * 515 * Note that actual hangs have only been observed on gen7, but for 516 * paranoia do it everywhere. 
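	 * (The relocated value is the batch's GTT address plus a possibly
	 * negative delta, so keeping the batch at or above BATCH_OFFSET_BIAS
	 * (256KiB) ensures the sum cannot wrap below zero.)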
517 */ 518 if (i == batch_idx) { 519 if (entry->relocation_count && 520 !(eb->flags[i] & EXEC_OBJECT_PINNED)) 521 eb->flags[i] |= __EXEC_OBJECT_NEEDS_BIAS; 522 if (eb->reloc_cache.has_fence) 523 eb->flags[i] |= EXEC_OBJECT_NEEDS_FENCE; 524 525 eb->batch = vma; 526 } 527 528 err = 0; 529 if (eb_pin_vma(eb, entry, vma)) { 530 if (entry->offset != vma->node.start) { 531 entry->offset = vma->node.start | UPDATE; 532 eb->args->flags |= __EXEC_HAS_RELOC; 533 } 534 } else { 535 eb_unreserve_vma(vma, vma->exec_flags); 536 537 list_add_tail(&vma->exec_link, &eb->unbound); 538 if (drm_mm_node_allocated(&vma->node)) 539 err = i915_vma_unbind(vma); 540 if (unlikely(err)) 541 vma->exec_flags = NULL; 542 } 543 return err; 544 } 545 546 static inline int use_cpu_reloc(const struct reloc_cache *cache, 547 const struct drm_i915_gem_object *obj) 548 { 549 if (!i915_gem_object_has_struct_page(obj)) 550 return false; 551 552 if (DBG_FORCE_RELOC == FORCE_CPU_RELOC) 553 return true; 554 555 if (DBG_FORCE_RELOC == FORCE_GTT_RELOC) 556 return false; 557 558 return (cache->has_llc || 559 obj->cache_dirty || 560 obj->cache_level != I915_CACHE_NONE); 561 } 562 563 static int eb_reserve_vma(const struct i915_execbuffer *eb, 564 struct i915_vma *vma) 565 { 566 struct drm_i915_gem_exec_object2 *entry = exec_entry(eb, vma); 567 unsigned int exec_flags = *vma->exec_flags; 568 u64 pin_flags; 569 int err; 570 571 pin_flags = PIN_USER | PIN_NONBLOCK; 572 if (exec_flags & EXEC_OBJECT_NEEDS_GTT) 573 pin_flags |= PIN_GLOBAL; 574 575 /* 576 * Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset, 577 * limit address to the first 4GBs for unflagged objects. 578 */ 579 if (!(exec_flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS)) 580 pin_flags |= PIN_ZONE_4G; 581 582 if (exec_flags & __EXEC_OBJECT_NEEDS_MAP) 583 pin_flags |= PIN_MAPPABLE; 584 585 if (exec_flags & EXEC_OBJECT_PINNED) { 586 pin_flags |= entry->offset | PIN_OFFSET_FIXED; 587 pin_flags &= ~PIN_NONBLOCK; /* force overlapping checks */ 588 } else if (exec_flags & __EXEC_OBJECT_NEEDS_BIAS) { 589 pin_flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS; 590 } 591 592 err = i915_vma_pin(vma, 593 entry->pad_to_size, entry->alignment, 594 pin_flags); 595 if (err) 596 return err; 597 598 if (entry->offset != vma->node.start) { 599 entry->offset = vma->node.start | UPDATE; 600 eb->args->flags |= __EXEC_HAS_RELOC; 601 } 602 603 if (unlikely(exec_flags & EXEC_OBJECT_NEEDS_FENCE)) { 604 err = i915_vma_pin_fence(vma); 605 if (unlikely(err)) { 606 i915_vma_unpin(vma); 607 return err; 608 } 609 610 if (vma->fence) 611 exec_flags |= __EXEC_OBJECT_HAS_FENCE; 612 } 613 614 *vma->exec_flags = exec_flags | __EXEC_OBJECT_HAS_PIN; 615 GEM_BUG_ON(eb_vma_misplaced(entry, vma, exec_flags)); 616 617 return 0; 618 } 619 620 static int eb_reserve(struct i915_execbuffer *eb) 621 { 622 const unsigned int count = eb->buffer_count; 623 struct list_head last; 624 struct i915_vma *vma; 625 unsigned int i, pass; 626 int err; 627 628 /* 629 * Attempt to pin all of the buffers into the GTT. 630 * This is done in 3 phases: 631 * 632 * 1a. Unbind all objects that do not match the GTT constraints for 633 * the execbuffer (fenceable, mappable, alignment etc). 634 * 1b. Increment pin count for already bound objects. 635 * 2. Bind new objects. 636 * 3. Decrement pin count. 637 * 638 * This avoid unnecessary unbinding of later objects in order to make 639 * room for the earlier objects *unless* we need to defragment. 
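	 * (In terms of the loop below: pass 0 retries after re-sorting the
	 * unbound list, pass 1 evicts the entire VM before retrying, and any
	 * further pass gives up with -ENOSPC.)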
640 */ 641 642 pass = 0; 643 err = 0; 644 do { 645 list_for_each_entry(vma, &eb->unbound, exec_link) { 646 err = eb_reserve_vma(eb, vma); 647 if (err) 648 break; 649 } 650 if (err != -ENOSPC) 651 return err; 652 653 /* Resort *all* the objects into priority order */ 654 INIT_LIST_HEAD(&eb->unbound); 655 INIT_LIST_HEAD(&last); 656 for (i = 0; i < count; i++) { 657 unsigned int flags = eb->flags[i]; 658 struct i915_vma *vma = eb->vma[i]; 659 660 if (flags & EXEC_OBJECT_PINNED && 661 flags & __EXEC_OBJECT_HAS_PIN) 662 continue; 663 664 eb_unreserve_vma(vma, &eb->flags[i]); 665 666 if (flags & EXEC_OBJECT_PINNED) 667 /* Pinned must have their slot */ 668 list_add(&vma->exec_link, &eb->unbound); 669 else if (flags & __EXEC_OBJECT_NEEDS_MAP) 670 /* Map require the lowest 256MiB (aperture) */ 671 list_add_tail(&vma->exec_link, &eb->unbound); 672 else if (!(flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS)) 673 /* Prioritise 4GiB region for restricted bo */ 674 list_add(&vma->exec_link, &last); 675 else 676 list_add_tail(&vma->exec_link, &last); 677 } 678 list_splice_tail(&last, &eb->unbound); 679 680 switch (pass++) { 681 case 0: 682 break; 683 684 case 1: 685 /* Too fragmented, unbind everything and retry */ 686 mutex_lock(&eb->context->vm->mutex); 687 err = i915_gem_evict_vm(eb->context->vm); 688 mutex_unlock(&eb->context->vm->mutex); 689 if (err) 690 return err; 691 break; 692 693 default: 694 return -ENOSPC; 695 } 696 } while (1); 697 } 698 699 static unsigned int eb_batch_index(const struct i915_execbuffer *eb) 700 { 701 if (eb->args->flags & I915_EXEC_BATCH_FIRST) 702 return 0; 703 else 704 return eb->buffer_count - 1; 705 } 706 707 static int eb_select_context(struct i915_execbuffer *eb) 708 { 709 struct i915_gem_context *ctx; 710 711 ctx = i915_gem_context_lookup(eb->file->driver_priv, eb->args->rsvd1); 712 if (unlikely(!ctx)) 713 return -ENOENT; 714 715 eb->gem_context = ctx; 716 if (rcu_access_pointer(ctx->vm)) 717 eb->invalid_flags |= EXEC_OBJECT_NEEDS_GTT; 718 719 eb->context_flags = 0; 720 if (test_bit(UCONTEXT_NO_ZEROMAP, &ctx->user_flags)) 721 eb->context_flags |= __EXEC_OBJECT_NEEDS_BIAS; 722 723 return 0; 724 } 725 726 static int eb_lookup_vmas(struct i915_execbuffer *eb) 727 { 728 struct radix_tree_root *handles_vma = &eb->gem_context->handles_vma; 729 struct drm_i915_gem_object *obj; 730 unsigned int i, batch; 731 int err; 732 733 INIT_LIST_HEAD(&eb->relocs); 734 INIT_LIST_HEAD(&eb->unbound); 735 736 batch = eb_batch_index(eb); 737 738 mutex_lock(&eb->gem_context->mutex); 739 if (unlikely(i915_gem_context_is_closed(eb->gem_context))) { 740 err = -ENOENT; 741 goto err_ctx; 742 } 743 744 for (i = 0; i < eb->buffer_count; i++) { 745 u32 handle = eb->exec[i].handle; 746 struct i915_lut_handle *lut; 747 struct i915_vma *vma; 748 749 vma = radix_tree_lookup(handles_vma, handle); 750 if (likely(vma)) 751 goto add_vma; 752 753 obj = i915_gem_object_lookup(eb->file, handle); 754 if (unlikely(!obj)) { 755 err = -ENOENT; 756 goto err_vma; 757 } 758 759 vma = i915_vma_instance(obj, eb->context->vm, NULL); 760 if (IS_ERR(vma)) { 761 err = PTR_ERR(vma); 762 goto err_obj; 763 } 764 765 lut = i915_lut_handle_alloc(); 766 if (unlikely(!lut)) { 767 err = -ENOMEM; 768 goto err_obj; 769 } 770 771 err = radix_tree_insert(handles_vma, handle, vma); 772 if (unlikely(err)) { 773 i915_lut_handle_free(lut); 774 goto err_obj; 775 } 776 777 /* transfer ref to lut */ 778 if (!atomic_fetch_inc(&vma->open_count)) 779 i915_vma_reopen(vma); 780 lut->handle = handle; 781 lut->ctx = eb->gem_context; 782 783 
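		/*
		 * Attach the lookup entry to the object under its lock so
		 * that the handle->vma association can be unwound again when
		 * the object or the context is closed.
		 */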
i915_gem_object_lock(obj); 784 list_add(&lut->obj_link, &obj->lut_list); 785 i915_gem_object_unlock(obj); 786 787 add_vma: 788 err = eb_add_vma(eb, i, batch, vma); 789 if (unlikely(err)) 790 goto err_vma; 791 792 GEM_BUG_ON(vma != eb->vma[i]); 793 GEM_BUG_ON(vma->exec_flags != &eb->flags[i]); 794 GEM_BUG_ON(drm_mm_node_allocated(&vma->node) && 795 eb_vma_misplaced(&eb->exec[i], vma, eb->flags[i])); 796 } 797 798 mutex_unlock(&eb->gem_context->mutex); 799 800 eb->args->flags |= __EXEC_VALIDATED; 801 return eb_reserve(eb); 802 803 err_obj: 804 i915_gem_object_put(obj); 805 err_vma: 806 eb->vma[i] = NULL; 807 err_ctx: 808 mutex_unlock(&eb->gem_context->mutex); 809 return err; 810 } 811 812 static struct i915_vma * 813 eb_get_vma(const struct i915_execbuffer *eb, unsigned long handle) 814 { 815 if (eb->lut_size < 0) { 816 if (handle >= -eb->lut_size) 817 return NULL; 818 return eb->vma[handle]; 819 } else { 820 struct hlist_head *head; 821 struct i915_vma *vma; 822 823 head = &eb->buckets[hash_32(handle, eb->lut_size)]; 824 hlist_for_each_entry(vma, head, exec_node) { 825 if (vma->exec_handle == handle) 826 return vma; 827 } 828 return NULL; 829 } 830 } 831 832 static void eb_release_vmas(const struct i915_execbuffer *eb) 833 { 834 const unsigned int count = eb->buffer_count; 835 unsigned int i; 836 837 for (i = 0; i < count; i++) { 838 struct i915_vma *vma = eb->vma[i]; 839 unsigned int flags = eb->flags[i]; 840 841 if (!vma) 842 break; 843 844 GEM_BUG_ON(vma->exec_flags != &eb->flags[i]); 845 vma->exec_flags = NULL; 846 eb->vma[i] = NULL; 847 848 if (flags & __EXEC_OBJECT_HAS_PIN) 849 __eb_unreserve_vma(vma, flags); 850 851 if (flags & __EXEC_OBJECT_HAS_REF) 852 i915_vma_put(vma); 853 } 854 } 855 856 static void eb_reset_vmas(const struct i915_execbuffer *eb) 857 { 858 eb_release_vmas(eb); 859 if (eb->lut_size > 0) 860 memset(eb->buckets, 0, 861 sizeof(struct hlist_head) << eb->lut_size); 862 } 863 864 static void eb_destroy(const struct i915_execbuffer *eb) 865 { 866 GEM_BUG_ON(eb->reloc_cache.rq); 867 868 if (eb->lut_size > 0) 869 kfree(eb->buckets); 870 } 871 872 static inline u64 873 relocation_target(const struct drm_i915_gem_relocation_entry *reloc, 874 const struct i915_vma *target) 875 { 876 return gen8_canonical_addr((int)reloc->delta + target->node.start); 877 } 878 879 static void reloc_cache_init(struct reloc_cache *cache, 880 struct drm_i915_private *i915) 881 { 882 cache->page = -1; 883 cache->vaddr = 0; 884 /* Must be a variable in the struct to allow GCC to unroll. 
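	 * (Caching gen and the capability bits here, rather than re-deriving
	 * them from the device info on every relocation, lets the compiler
	 * treat them as constants inside the relocation loops.)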
*/ 885 cache->gen = INTEL_GEN(i915); 886 cache->has_llc = HAS_LLC(i915); 887 cache->use_64bit_reloc = HAS_64BIT_RELOC(i915); 888 cache->has_fence = cache->gen < 4; 889 cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment; 890 cache->node.flags = 0; 891 cache->rq = NULL; 892 cache->rq_size = 0; 893 } 894 895 static inline void *unmask_page(unsigned long p) 896 { 897 return (void *)(uintptr_t)(p & PAGE_MASK); 898 } 899 900 static inline unsigned int unmask_flags(unsigned long p) 901 { 902 return p & ~PAGE_MASK; 903 } 904 905 #define KMAP 0x4 /* after CLFLUSH_FLAGS */ 906 907 static inline struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache) 908 { 909 struct drm_i915_private *i915 = 910 container_of(cache, struct i915_execbuffer, reloc_cache)->i915; 911 return &i915->ggtt; 912 } 913 914 static void reloc_gpu_flush(struct reloc_cache *cache) 915 { 916 GEM_BUG_ON(cache->rq_size >= cache->rq->batch->obj->base.size / sizeof(u32)); 917 cache->rq_cmd[cache->rq_size] = MI_BATCH_BUFFER_END; 918 919 __i915_gem_object_flush_map(cache->rq->batch->obj, 0, cache->rq_size); 920 i915_gem_object_unpin_map(cache->rq->batch->obj); 921 922 intel_gt_chipset_flush(cache->rq->engine->gt); 923 924 i915_request_add(cache->rq); 925 cache->rq = NULL; 926 } 927 928 static void reloc_cache_reset(struct reloc_cache *cache) 929 { 930 void *vaddr; 931 932 if (cache->rq) 933 reloc_gpu_flush(cache); 934 935 if (!cache->vaddr) 936 return; 937 938 vaddr = unmask_page(cache->vaddr); 939 if (cache->vaddr & KMAP) { 940 if (cache->vaddr & CLFLUSH_AFTER) 941 mb(); 942 943 kunmap_atomic(vaddr); 944 i915_gem_object_finish_access((struct drm_i915_gem_object *)cache->node.mm); 945 } else { 946 struct i915_ggtt *ggtt = cache_to_ggtt(cache); 947 948 intel_gt_flush_ggtt_writes(ggtt->vm.gt); 949 io_mapping_unmap_atomic((void __iomem *)vaddr); 950 951 if (drm_mm_node_allocated(&cache->node)) { 952 ggtt->vm.clear_range(&ggtt->vm, 953 cache->node.start, 954 cache->node.size); 955 mutex_lock(&ggtt->vm.mutex); 956 drm_mm_remove_node(&cache->node); 957 mutex_unlock(&ggtt->vm.mutex); 958 } else { 959 i915_vma_unpin((struct i915_vma *)cache->node.mm); 960 } 961 } 962 963 cache->vaddr = 0; 964 cache->page = -1; 965 } 966 967 static void *reloc_kmap(struct drm_i915_gem_object *obj, 968 struct reloc_cache *cache, 969 unsigned long page) 970 { 971 void *vaddr; 972 973 if (cache->vaddr) { 974 kunmap_atomic(unmask_page(cache->vaddr)); 975 } else { 976 unsigned int flushes; 977 int err; 978 979 err = i915_gem_object_prepare_write(obj, &flushes); 980 if (err) 981 return ERR_PTR(err); 982 983 BUILD_BUG_ON(KMAP & CLFLUSH_FLAGS); 984 BUILD_BUG_ON((KMAP | CLFLUSH_FLAGS) & PAGE_MASK); 985 986 cache->vaddr = flushes | KMAP; 987 cache->node.mm = (void *)obj; 988 if (flushes) 989 mb(); 990 } 991 992 vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, page)); 993 cache->vaddr = unmask_flags(cache->vaddr) | (unsigned long)vaddr; 994 cache->page = page; 995 996 return vaddr; 997 } 998 999 static void *reloc_iomap(struct drm_i915_gem_object *obj, 1000 struct reloc_cache *cache, 1001 unsigned long page) 1002 { 1003 struct i915_ggtt *ggtt = cache_to_ggtt(cache); 1004 unsigned long offset; 1005 void *vaddr; 1006 1007 if (cache->vaddr) { 1008 intel_gt_flush_ggtt_writes(ggtt->vm.gt); 1009 io_mapping_unmap_atomic((void __force __iomem *) unmask_page(cache->vaddr)); 1010 } else { 1011 struct i915_vma *vma; 1012 int err; 1013 1014 if (i915_gem_object_is_tiled(obj)) 1015 return ERR_PTR(-EINVAL); 1016 1017 if (use_cpu_reloc(cache, obj)) 1018 return 
NULL; 1019 1020 i915_gem_object_lock(obj); 1021 err = i915_gem_object_set_to_gtt_domain(obj, true); 1022 i915_gem_object_unlock(obj); 1023 if (err) 1024 return ERR_PTR(err); 1025 1026 vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 1027 PIN_MAPPABLE | 1028 PIN_NONBLOCK /* NOWARN */ | 1029 PIN_NOEVICT); 1030 if (IS_ERR(vma)) { 1031 memset(&cache->node, 0, sizeof(cache->node)); 1032 mutex_lock(&ggtt->vm.mutex); 1033 err = drm_mm_insert_node_in_range 1034 (&ggtt->vm.mm, &cache->node, 1035 PAGE_SIZE, 0, I915_COLOR_UNEVICTABLE, 1036 0, ggtt->mappable_end, 1037 DRM_MM_INSERT_LOW); 1038 mutex_unlock(&ggtt->vm.mutex); 1039 if (err) /* no inactive aperture space, use cpu reloc */ 1040 return NULL; 1041 } else { 1042 cache->node.start = vma->node.start; 1043 cache->node.mm = (void *)vma; 1044 } 1045 } 1046 1047 offset = cache->node.start; 1048 if (drm_mm_node_allocated(&cache->node)) { 1049 ggtt->vm.insert_page(&ggtt->vm, 1050 i915_gem_object_get_dma_address(obj, page), 1051 offset, I915_CACHE_NONE, 0); 1052 } else { 1053 offset += page << PAGE_SHIFT; 1054 } 1055 1056 vaddr = (void __force *)io_mapping_map_atomic_wc(&ggtt->iomap, 1057 offset); 1058 cache->page = page; 1059 cache->vaddr = (unsigned long)vaddr; 1060 1061 return vaddr; 1062 } 1063 1064 static void *reloc_vaddr(struct drm_i915_gem_object *obj, 1065 struct reloc_cache *cache, 1066 unsigned long page) 1067 { 1068 void *vaddr; 1069 1070 if (cache->page == page) { 1071 vaddr = unmask_page(cache->vaddr); 1072 } else { 1073 vaddr = NULL; 1074 if ((cache->vaddr & KMAP) == 0) 1075 vaddr = reloc_iomap(obj, cache, page); 1076 if (!vaddr) 1077 vaddr = reloc_kmap(obj, cache, page); 1078 } 1079 1080 return vaddr; 1081 } 1082 1083 static void clflush_write32(u32 *addr, u32 value, unsigned int flushes) 1084 { 1085 if (unlikely(flushes & (CLFLUSH_BEFORE | CLFLUSH_AFTER))) { 1086 if (flushes & CLFLUSH_BEFORE) { 1087 clflushopt(addr); 1088 mb(); 1089 } 1090 1091 *addr = value; 1092 1093 /* 1094 * Writes to the same cacheline are serialised by the CPU 1095 * (including clflush). On the write path, we only require 1096 * that it hits memory in an orderly fashion and place 1097 * mb barriers at the start and end of the relocation phase 1098 * to ensure ordering of clflush wrt to the system. 1099 */ 1100 if (flushes & CLFLUSH_AFTER) 1101 clflushopt(addr); 1102 } else 1103 *addr = value; 1104 } 1105 1106 static int reloc_move_to_gpu(struct i915_request *rq, struct i915_vma *vma) 1107 { 1108 struct drm_i915_gem_object *obj = vma->obj; 1109 int err; 1110 1111 i915_vma_lock(vma); 1112 1113 if (obj->cache_dirty & ~obj->cache_coherent) 1114 i915_gem_clflush_object(obj, 0); 1115 obj->write_domain = 0; 1116 1117 err = i915_request_await_object(rq, vma->obj, true); 1118 if (err == 0) 1119 err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE); 1120 1121 i915_vma_unlock(vma); 1122 1123 return err; 1124 } 1125 1126 static int __reloc_gpu_alloc(struct i915_execbuffer *eb, 1127 struct i915_vma *vma, 1128 unsigned int len) 1129 { 1130 struct reloc_cache *cache = &eb->reloc_cache; 1131 struct intel_engine_pool_node *pool; 1132 struct i915_request *rq; 1133 struct i915_vma *batch; 1134 u32 *cmd; 1135 int err; 1136 1137 pool = intel_engine_get_pool(eb->engine, PAGE_SIZE); 1138 if (IS_ERR(pool)) 1139 return PTR_ERR(pool); 1140 1141 cmd = i915_gem_object_pin_map(pool->obj, 1142 cache->has_llc ? 
1143 I915_MAP_FORCE_WB : 1144 I915_MAP_FORCE_WC); 1145 if (IS_ERR(cmd)) { 1146 err = PTR_ERR(cmd); 1147 goto out_pool; 1148 } 1149 1150 batch = i915_vma_instance(pool->obj, vma->vm, NULL); 1151 if (IS_ERR(batch)) { 1152 err = PTR_ERR(batch); 1153 goto err_unmap; 1154 } 1155 1156 err = i915_vma_pin(batch, 0, 0, PIN_USER | PIN_NONBLOCK); 1157 if (err) 1158 goto err_unmap; 1159 1160 rq = i915_request_create(eb->context); 1161 if (IS_ERR(rq)) { 1162 err = PTR_ERR(rq); 1163 goto err_unpin; 1164 } 1165 1166 err = intel_engine_pool_mark_active(pool, rq); 1167 if (err) 1168 goto err_request; 1169 1170 err = reloc_move_to_gpu(rq, vma); 1171 if (err) 1172 goto err_request; 1173 1174 err = eb->engine->emit_bb_start(rq, 1175 batch->node.start, PAGE_SIZE, 1176 cache->gen > 5 ? 0 : I915_DISPATCH_SECURE); 1177 if (err) 1178 goto skip_request; 1179 1180 i915_vma_lock(batch); 1181 err = i915_request_await_object(rq, batch->obj, false); 1182 if (err == 0) 1183 err = i915_vma_move_to_active(batch, rq, 0); 1184 i915_vma_unlock(batch); 1185 if (err) 1186 goto skip_request; 1187 1188 rq->batch = batch; 1189 i915_vma_unpin(batch); 1190 1191 cache->rq = rq; 1192 cache->rq_cmd = cmd; 1193 cache->rq_size = 0; 1194 1195 /* Return with batch mapping (cmd) still pinned */ 1196 goto out_pool; 1197 1198 skip_request: 1199 i915_request_skip(rq, err); 1200 err_request: 1201 i915_request_add(rq); 1202 err_unpin: 1203 i915_vma_unpin(batch); 1204 err_unmap: 1205 i915_gem_object_unpin_map(pool->obj); 1206 out_pool: 1207 intel_engine_pool_put(pool); 1208 return err; 1209 } 1210 1211 static u32 *reloc_gpu(struct i915_execbuffer *eb, 1212 struct i915_vma *vma, 1213 unsigned int len) 1214 { 1215 struct reloc_cache *cache = &eb->reloc_cache; 1216 u32 *cmd; 1217 1218 if (cache->rq_size > PAGE_SIZE/sizeof(u32) - (len + 1)) 1219 reloc_gpu_flush(cache); 1220 1221 if (unlikely(!cache->rq)) { 1222 int err; 1223 1224 if (!intel_engine_can_store_dword(eb->engine)) 1225 return ERR_PTR(-ENODEV); 1226 1227 err = __reloc_gpu_alloc(eb, vma, len); 1228 if (unlikely(err)) 1229 return ERR_PTR(err); 1230 } 1231 1232 cmd = cache->rq_cmd + cache->rq_size; 1233 cache->rq_size += len; 1234 1235 return cmd; 1236 } 1237 1238 static u64 1239 relocate_entry(struct i915_vma *vma, 1240 const struct drm_i915_gem_relocation_entry *reloc, 1241 struct i915_execbuffer *eb, 1242 const struct i915_vma *target) 1243 { 1244 u64 offset = reloc->offset; 1245 u64 target_offset = relocation_target(reloc, target); 1246 bool wide = eb->reloc_cache.use_64bit_reloc; 1247 void *vaddr; 1248 1249 if (!eb->reloc_cache.vaddr && 1250 (DBG_FORCE_RELOC == FORCE_GPU_RELOC || 1251 !dma_resv_test_signaled_rcu(vma->resv, true))) { 1252 const unsigned int gen = eb->reloc_cache.gen; 1253 unsigned int len; 1254 u32 *batch; 1255 u64 addr; 1256 1257 if (wide) 1258 len = offset & 7 ? 
8 : 5; 1259 else if (gen >= 4) 1260 len = 4; 1261 else 1262 len = 3; 1263 1264 batch = reloc_gpu(eb, vma, len); 1265 if (IS_ERR(batch)) 1266 goto repeat; 1267 1268 addr = gen8_canonical_addr(vma->node.start + offset); 1269 if (wide) { 1270 if (offset & 7) { 1271 *batch++ = MI_STORE_DWORD_IMM_GEN4; 1272 *batch++ = lower_32_bits(addr); 1273 *batch++ = upper_32_bits(addr); 1274 *batch++ = lower_32_bits(target_offset); 1275 1276 addr = gen8_canonical_addr(addr + 4); 1277 1278 *batch++ = MI_STORE_DWORD_IMM_GEN4; 1279 *batch++ = lower_32_bits(addr); 1280 *batch++ = upper_32_bits(addr); 1281 *batch++ = upper_32_bits(target_offset); 1282 } else { 1283 *batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1; 1284 *batch++ = lower_32_bits(addr); 1285 *batch++ = upper_32_bits(addr); 1286 *batch++ = lower_32_bits(target_offset); 1287 *batch++ = upper_32_bits(target_offset); 1288 } 1289 } else if (gen >= 6) { 1290 *batch++ = MI_STORE_DWORD_IMM_GEN4; 1291 *batch++ = 0; 1292 *batch++ = addr; 1293 *batch++ = target_offset; 1294 } else if (gen >= 4) { 1295 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 1296 *batch++ = 0; 1297 *batch++ = addr; 1298 *batch++ = target_offset; 1299 } else { 1300 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; 1301 *batch++ = addr; 1302 *batch++ = target_offset; 1303 } 1304 1305 goto out; 1306 } 1307 1308 repeat: 1309 vaddr = reloc_vaddr(vma->obj, &eb->reloc_cache, offset >> PAGE_SHIFT); 1310 if (IS_ERR(vaddr)) 1311 return PTR_ERR(vaddr); 1312 1313 clflush_write32(vaddr + offset_in_page(offset), 1314 lower_32_bits(target_offset), 1315 eb->reloc_cache.vaddr); 1316 1317 if (wide) { 1318 offset += sizeof(u32); 1319 target_offset >>= 32; 1320 wide = false; 1321 goto repeat; 1322 } 1323 1324 out: 1325 return target->node.start | UPDATE; 1326 } 1327 1328 static u64 1329 eb_relocate_entry(struct i915_execbuffer *eb, 1330 struct i915_vma *vma, 1331 const struct drm_i915_gem_relocation_entry *reloc) 1332 { 1333 struct i915_vma *target; 1334 int err; 1335 1336 /* we've already hold a reference to all valid objects */ 1337 target = eb_get_vma(eb, reloc->target_handle); 1338 if (unlikely(!target)) 1339 return -ENOENT; 1340 1341 /* Validate that the target is in a valid r/w GPU domain */ 1342 if (unlikely(reloc->write_domain & (reloc->write_domain - 1))) { 1343 DRM_DEBUG("reloc with multiple write domains: " 1344 "target %d offset %d " 1345 "read %08x write %08x", 1346 reloc->target_handle, 1347 (int) reloc->offset, 1348 reloc->read_domains, 1349 reloc->write_domain); 1350 return -EINVAL; 1351 } 1352 if (unlikely((reloc->write_domain | reloc->read_domains) 1353 & ~I915_GEM_GPU_DOMAINS)) { 1354 DRM_DEBUG("reloc with read/write non-GPU domains: " 1355 "target %d offset %d " 1356 "read %08x write %08x", 1357 reloc->target_handle, 1358 (int) reloc->offset, 1359 reloc->read_domains, 1360 reloc->write_domain); 1361 return -EINVAL; 1362 } 1363 1364 if (reloc->write_domain) { 1365 *target->exec_flags |= EXEC_OBJECT_WRITE; 1366 1367 /* 1368 * Sandybridge PPGTT errata: We need a global gtt mapping 1369 * for MI and pipe_control writes because the gpu doesn't 1370 * properly redirect them through the ppgtt for non_secure 1371 * batchbuffers. 
1372 */ 1373 if (reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION && 1374 IS_GEN(eb->i915, 6)) { 1375 err = i915_vma_bind(target, target->obj->cache_level, 1376 PIN_GLOBAL, NULL); 1377 if (WARN_ONCE(err, 1378 "Unexpected failure to bind target VMA!")) 1379 return err; 1380 } 1381 } 1382 1383 /* 1384 * If the relocation already has the right value in it, no 1385 * more work needs to be done. 1386 */ 1387 if (!DBG_FORCE_RELOC && 1388 gen8_canonical_addr(target->node.start) == reloc->presumed_offset) 1389 return 0; 1390 1391 /* Check that the relocation address is valid... */ 1392 if (unlikely(reloc->offset > 1393 vma->size - (eb->reloc_cache.use_64bit_reloc ? 8 : 4))) { 1394 DRM_DEBUG("Relocation beyond object bounds: " 1395 "target %d offset %d size %d.\n", 1396 reloc->target_handle, 1397 (int)reloc->offset, 1398 (int)vma->size); 1399 return -EINVAL; 1400 } 1401 if (unlikely(reloc->offset & 3)) { 1402 DRM_DEBUG("Relocation not 4-byte aligned: " 1403 "target %d offset %d.\n", 1404 reloc->target_handle, 1405 (int)reloc->offset); 1406 return -EINVAL; 1407 } 1408 1409 /* 1410 * If we write into the object, we need to force the synchronisation 1411 * barrier, either with an asynchronous clflush or if we executed the 1412 * patching using the GPU (though that should be serialised by the 1413 * timeline). To be completely sure, and since we are required to 1414 * do relocations we are already stalling, disable the user's opt 1415 * out of our synchronisation. 1416 */ 1417 *vma->exec_flags &= ~EXEC_OBJECT_ASYNC; 1418 1419 /* and update the user's relocation entry */ 1420 return relocate_entry(vma, reloc, eb, target); 1421 } 1422 1423 static int eb_relocate_vma(struct i915_execbuffer *eb, struct i915_vma *vma) 1424 { 1425 #define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry)) 1426 struct drm_i915_gem_relocation_entry stack[N_RELOC(512)]; 1427 struct drm_i915_gem_relocation_entry __user *urelocs; 1428 const struct drm_i915_gem_exec_object2 *entry = exec_entry(eb, vma); 1429 unsigned int remain; 1430 1431 urelocs = u64_to_user_ptr(entry->relocs_ptr); 1432 remain = entry->relocation_count; 1433 if (unlikely(remain > N_RELOC(ULONG_MAX))) 1434 return -EINVAL; 1435 1436 /* 1437 * We must check that the entire relocation array is safe 1438 * to read. However, if the array is not writable the user loses 1439 * the updated relocation values. 1440 */ 1441 if (unlikely(!access_ok(urelocs, remain*sizeof(*urelocs)))) 1442 return -EFAULT; 1443 1444 do { 1445 struct drm_i915_gem_relocation_entry *r = stack; 1446 unsigned int count = 1447 min_t(unsigned int, remain, ARRAY_SIZE(stack)); 1448 unsigned int copied; 1449 1450 /* 1451 * This is the fast path and we cannot handle a pagefault 1452 * whilst holding the struct mutex lest the user pass in the 1453 * relocations contained within a mmaped bo. For in such a case 1454 * we, the page fault handler would call i915_gem_fault() and 1455 * we would try to acquire the struct mutex again. Obviously 1456 * this is bad and so lockdep complains vehemently. 
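		 * (If the atomic copy below does fault, we bail out with
		 * -EFAULT and the caller falls back to eb_relocate_slow(),
		 * which drops the mutex and copies the entries with
		 * pagefaults enabled.)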
1457 */ 1458 pagefault_disable(); 1459 copied = __copy_from_user_inatomic(r, urelocs, count * sizeof(r[0])); 1460 pagefault_enable(); 1461 if (unlikely(copied)) { 1462 remain = -EFAULT; 1463 goto out; 1464 } 1465 1466 remain -= count; 1467 do { 1468 u64 offset = eb_relocate_entry(eb, vma, r); 1469 1470 if (likely(offset == 0)) { 1471 } else if ((s64)offset < 0) { 1472 remain = (int)offset; 1473 goto out; 1474 } else { 1475 /* 1476 * Note that reporting an error now 1477 * leaves everything in an inconsistent 1478 * state as we have *already* changed 1479 * the relocation value inside the 1480 * object. As we have not changed the 1481 * reloc.presumed_offset or will not 1482 * change the execobject.offset, on the 1483 * call we may not rewrite the value 1484 * inside the object, leaving it 1485 * dangling and causing a GPU hang. Unless 1486 * userspace dynamically rebuilds the 1487 * relocations on each execbuf rather than 1488 * presume a static tree. 1489 * 1490 * We did previously check if the relocations 1491 * were writable (access_ok), an error now 1492 * would be a strange race with mprotect, 1493 * having already demonstrated that we 1494 * can read from this userspace address. 1495 */ 1496 offset = gen8_canonical_addr(offset & ~UPDATE); 1497 if (unlikely(__put_user(offset, &urelocs[r-stack].presumed_offset))) { 1498 remain = -EFAULT; 1499 goto out; 1500 } 1501 } 1502 } while (r++, --count); 1503 urelocs += ARRAY_SIZE(stack); 1504 } while (remain); 1505 out: 1506 reloc_cache_reset(&eb->reloc_cache); 1507 return remain; 1508 } 1509 1510 static int 1511 eb_relocate_vma_slow(struct i915_execbuffer *eb, struct i915_vma *vma) 1512 { 1513 const struct drm_i915_gem_exec_object2 *entry = exec_entry(eb, vma); 1514 struct drm_i915_gem_relocation_entry *relocs = 1515 u64_to_ptr(typeof(*relocs), entry->relocs_ptr); 1516 unsigned int i; 1517 int err; 1518 1519 for (i = 0; i < entry->relocation_count; i++) { 1520 u64 offset = eb_relocate_entry(eb, vma, &relocs[i]); 1521 1522 if ((s64)offset < 0) { 1523 err = (int)offset; 1524 goto err; 1525 } 1526 } 1527 err = 0; 1528 err: 1529 reloc_cache_reset(&eb->reloc_cache); 1530 return err; 1531 } 1532 1533 static int check_relocations(const struct drm_i915_gem_exec_object2 *entry) 1534 { 1535 const char __user *addr, *end; 1536 unsigned long size; 1537 char __maybe_unused c; 1538 1539 size = entry->relocation_count; 1540 if (size == 0) 1541 return 0; 1542 1543 if (size > N_RELOC(ULONG_MAX)) 1544 return -EINVAL; 1545 1546 addr = u64_to_user_ptr(entry->relocs_ptr); 1547 size *= sizeof(struct drm_i915_gem_relocation_entry); 1548 if (!access_ok(addr, size)) 1549 return -EFAULT; 1550 1551 end = addr + size; 1552 for (; addr < end; addr += PAGE_SIZE) { 1553 int err = __get_user(c, addr); 1554 if (err) 1555 return err; 1556 } 1557 return __get_user(c, end - 1); 1558 } 1559 1560 static int eb_copy_relocations(const struct i915_execbuffer *eb) 1561 { 1562 struct drm_i915_gem_relocation_entry *relocs; 1563 const unsigned int count = eb->buffer_count; 1564 unsigned int i; 1565 int err; 1566 1567 for (i = 0; i < count; i++) { 1568 const unsigned int nreloc = eb->exec[i].relocation_count; 1569 struct drm_i915_gem_relocation_entry __user *urelocs; 1570 unsigned long size; 1571 unsigned long copied; 1572 1573 if (nreloc == 0) 1574 continue; 1575 1576 err = check_relocations(&eb->exec[i]); 1577 if (err) 1578 goto err; 1579 1580 urelocs = u64_to_user_ptr(eb->exec[i].relocs_ptr); 1581 size = nreloc * sizeof(*relocs); 1582 1583 relocs = kvmalloc_array(size, 1, GFP_KERNEL); 
1584 if (!relocs) { 1585 err = -ENOMEM; 1586 goto err; 1587 } 1588 1589 /* copy_from_user is limited to < 4GiB */ 1590 copied = 0; 1591 do { 1592 unsigned int len = 1593 min_t(u64, BIT_ULL(31), size - copied); 1594 1595 if (__copy_from_user((char *)relocs + copied, 1596 (char __user *)urelocs + copied, 1597 len)) 1598 goto end; 1599 1600 copied += len; 1601 } while (copied < size); 1602 1603 /* 1604 * As we do not update the known relocation offsets after 1605 * relocating (due to the complexities in lock handling), 1606 * we need to mark them as invalid now so that we force the 1607 * relocation processing next time. Just in case the target 1608 * object is evicted and then rebound into its old 1609 * presumed_offset before the next execbuffer - if that 1610 * happened we would make the mistake of assuming that the 1611 * relocations were valid. 1612 */ 1613 if (!user_access_begin(urelocs, size)) 1614 goto end; 1615 1616 for (copied = 0; copied < nreloc; copied++) 1617 unsafe_put_user(-1, 1618 &urelocs[copied].presumed_offset, 1619 end_user); 1620 user_access_end(); 1621 1622 eb->exec[i].relocs_ptr = (uintptr_t)relocs; 1623 } 1624 1625 return 0; 1626 1627 end_user: 1628 user_access_end(); 1629 end: 1630 kvfree(relocs); 1631 err = -EFAULT; 1632 err: 1633 while (i--) { 1634 relocs = u64_to_ptr(typeof(*relocs), eb->exec[i].relocs_ptr); 1635 if (eb->exec[i].relocation_count) 1636 kvfree(relocs); 1637 } 1638 return err; 1639 } 1640 1641 static int eb_prefault_relocations(const struct i915_execbuffer *eb) 1642 { 1643 const unsigned int count = eb->buffer_count; 1644 unsigned int i; 1645 1646 if (unlikely(i915_modparams.prefault_disable)) 1647 return 0; 1648 1649 for (i = 0; i < count; i++) { 1650 int err; 1651 1652 err = check_relocations(&eb->exec[i]); 1653 if (err) 1654 return err; 1655 } 1656 1657 return 0; 1658 } 1659 1660 static noinline int eb_relocate_slow(struct i915_execbuffer *eb) 1661 { 1662 struct drm_device *dev = &eb->i915->drm; 1663 bool have_copy = false; 1664 struct i915_vma *vma; 1665 int err = 0; 1666 1667 repeat: 1668 if (signal_pending(current)) { 1669 err = -ERESTARTSYS; 1670 goto out; 1671 } 1672 1673 /* We may process another execbuffer during the unlock... */ 1674 eb_reset_vmas(eb); 1675 mutex_unlock(&dev->struct_mutex); 1676 1677 /* 1678 * We take 3 passes through the slowpatch. 1679 * 1680 * 1 - we try to just prefault all the user relocation entries and 1681 * then attempt to reuse the atomic pagefault disabled fast path again. 1682 * 1683 * 2 - we copy the user entries to a local buffer here outside of the 1684 * local and allow ourselves to wait upon any rendering before 1685 * relocations 1686 * 1687 * 3 - we already have a local copy of the relocation entries, but 1688 * were interrupted (EAGAIN) whilst waiting for the objects, try again. 
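	 * (have_copy below records whether pass 2 has already run, so a
	 * subsequent -EAGAIN retries with the copied entries instead of
	 * copying them again.)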
1689 */ 1690 if (!err) { 1691 err = eb_prefault_relocations(eb); 1692 } else if (!have_copy) { 1693 err = eb_copy_relocations(eb); 1694 have_copy = err == 0; 1695 } else { 1696 cond_resched(); 1697 err = 0; 1698 } 1699 if (err) { 1700 mutex_lock(&dev->struct_mutex); 1701 goto out; 1702 } 1703 1704 /* A frequent cause for EAGAIN are currently unavailable client pages */ 1705 flush_workqueue(eb->i915->mm.userptr_wq); 1706 1707 err = i915_mutex_lock_interruptible(dev); 1708 if (err) { 1709 mutex_lock(&dev->struct_mutex); 1710 goto out; 1711 } 1712 1713 /* reacquire the objects */ 1714 err = eb_lookup_vmas(eb); 1715 if (err) 1716 goto err; 1717 1718 GEM_BUG_ON(!eb->batch); 1719 1720 list_for_each_entry(vma, &eb->relocs, reloc_link) { 1721 if (!have_copy) { 1722 pagefault_disable(); 1723 err = eb_relocate_vma(eb, vma); 1724 pagefault_enable(); 1725 if (err) 1726 goto repeat; 1727 } else { 1728 err = eb_relocate_vma_slow(eb, vma); 1729 if (err) 1730 goto err; 1731 } 1732 } 1733 1734 /* 1735 * Leave the user relocations as are, this is the painfully slow path, 1736 * and we want to avoid the complication of dropping the lock whilst 1737 * having buffers reserved in the aperture and so causing spurious 1738 * ENOSPC for random operations. 1739 */ 1740 1741 err: 1742 if (err == -EAGAIN) 1743 goto repeat; 1744 1745 out: 1746 if (have_copy) { 1747 const unsigned int count = eb->buffer_count; 1748 unsigned int i; 1749 1750 for (i = 0; i < count; i++) { 1751 const struct drm_i915_gem_exec_object2 *entry = 1752 &eb->exec[i]; 1753 struct drm_i915_gem_relocation_entry *relocs; 1754 1755 if (!entry->relocation_count) 1756 continue; 1757 1758 relocs = u64_to_ptr(typeof(*relocs), entry->relocs_ptr); 1759 kvfree(relocs); 1760 } 1761 } 1762 1763 return err; 1764 } 1765 1766 static int eb_relocate(struct i915_execbuffer *eb) 1767 { 1768 if (eb_lookup_vmas(eb)) 1769 goto slow; 1770 1771 /* The objects are in their final locations, apply the relocations. 
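	 * (__EXEC_HAS_RELOC is only set if an object was assigned a new
	 * address or if userspace did not pass I915_EXEC_NO_RELOC, so a
	 * fully valid NO_RELOC submission skips this loop entirely.)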
*/ 1772 if (eb->args->flags & __EXEC_HAS_RELOC) { 1773 struct i915_vma *vma; 1774 1775 list_for_each_entry(vma, &eb->relocs, reloc_link) { 1776 if (eb_relocate_vma(eb, vma)) 1777 goto slow; 1778 } 1779 } 1780 1781 return 0; 1782 1783 slow: 1784 return eb_relocate_slow(eb); 1785 } 1786 1787 static int eb_move_to_gpu(struct i915_execbuffer *eb) 1788 { 1789 const unsigned int count = eb->buffer_count; 1790 struct ww_acquire_ctx acquire; 1791 unsigned int i; 1792 int err = 0; 1793 1794 ww_acquire_init(&acquire, &reservation_ww_class); 1795 1796 for (i = 0; i < count; i++) { 1797 struct i915_vma *vma = eb->vma[i]; 1798 1799 err = ww_mutex_lock_interruptible(&vma->resv->lock, &acquire); 1800 if (!err) 1801 continue; 1802 1803 GEM_BUG_ON(err == -EALREADY); /* No duplicate vma */ 1804 1805 if (err == -EDEADLK) { 1806 GEM_BUG_ON(i == 0); 1807 do { 1808 int j = i - 1; 1809 1810 ww_mutex_unlock(&eb->vma[j]->resv->lock); 1811 1812 swap(eb->flags[i], eb->flags[j]); 1813 swap(eb->vma[i], eb->vma[j]); 1814 eb->vma[i]->exec_flags = &eb->flags[i]; 1815 } while (--i); 1816 GEM_BUG_ON(vma != eb->vma[0]); 1817 vma->exec_flags = &eb->flags[0]; 1818 1819 err = ww_mutex_lock_slow_interruptible(&vma->resv->lock, 1820 &acquire); 1821 } 1822 if (err) 1823 break; 1824 } 1825 ww_acquire_done(&acquire); 1826 1827 while (i--) { 1828 unsigned int flags = eb->flags[i]; 1829 struct i915_vma *vma = eb->vma[i]; 1830 struct drm_i915_gem_object *obj = vma->obj; 1831 1832 assert_vma_held(vma); 1833 1834 if (flags & EXEC_OBJECT_CAPTURE) { 1835 struct i915_capture_list *capture; 1836 1837 capture = kmalloc(sizeof(*capture), GFP_KERNEL); 1838 if (capture) { 1839 capture->next = eb->request->capture_list; 1840 capture->vma = vma; 1841 eb->request->capture_list = capture; 1842 } 1843 } 1844 1845 /* 1846 * If the GPU is not _reading_ through the CPU cache, we need 1847 * to make sure that any writes (both previous GPU writes from 1848 * before a change in snooping levels and normal CPU writes) 1849 * caught in that cache are flushed to main memory. 1850 * 1851 * We want to say 1852 * obj->cache_dirty && 1853 * !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ) 1854 * but gcc's optimiser doesn't handle that as well and emits 1855 * two jumps instead of one. Maybe one day... 1856 */ 1857 if (unlikely(obj->cache_dirty & ~obj->cache_coherent)) { 1858 if (i915_gem_clflush_object(obj, 0)) 1859 flags &= ~EXEC_OBJECT_ASYNC; 1860 } 1861 1862 if (err == 0 && !(flags & EXEC_OBJECT_ASYNC)) { 1863 err = i915_request_await_object 1864 (eb->request, obj, flags & EXEC_OBJECT_WRITE); 1865 } 1866 1867 if (err == 0) 1868 err = i915_vma_move_to_active(vma, eb->request, flags); 1869 1870 i915_vma_unlock(vma); 1871 1872 __eb_unreserve_vma(vma, flags); 1873 vma->exec_flags = NULL; 1874 1875 if (unlikely(flags & __EXEC_OBJECT_HAS_REF)) 1876 i915_vma_put(vma); 1877 } 1878 ww_acquire_fini(&acquire); 1879 1880 if (unlikely(err)) 1881 goto err_skip; 1882 1883 eb->exec = NULL; 1884 1885 /* Unconditionally flush any chipset caches (for streaming writes). 
*/ 1886 intel_gt_chipset_flush(eb->engine->gt); 1887 return 0; 1888 1889 err_skip: 1890 i915_request_skip(eb->request, err); 1891 return err; 1892 } 1893 1894 static int i915_gem_check_execbuffer(struct drm_i915_gem_execbuffer2 *exec) 1895 { 1896 if (exec->flags & __I915_EXEC_ILLEGAL_FLAGS) 1897 return -EINVAL; 1898 1899 /* Kernel clipping was a DRI1 misfeature */ 1900 if (!(exec->flags & I915_EXEC_FENCE_ARRAY)) { 1901 if (exec->num_cliprects || exec->cliprects_ptr) 1902 return -EINVAL; 1903 } 1904 1905 if (exec->DR4 == 0xffffffff) { 1906 DRM_DEBUG("UXA submitting garbage DR4, fixing up\n"); 1907 exec->DR4 = 0; 1908 } 1909 if (exec->DR1 || exec->DR4) 1910 return -EINVAL; 1911 1912 if ((exec->batch_start_offset | exec->batch_len) & 0x7) 1913 return -EINVAL; 1914 1915 return 0; 1916 } 1917 1918 static int i915_reset_gen7_sol_offsets(struct i915_request *rq) 1919 { 1920 u32 *cs; 1921 int i; 1922 1923 if (!IS_GEN(rq->i915, 7) || rq->engine->id != RCS0) { 1924 DRM_DEBUG("sol reset is gen7/rcs only\n"); 1925 return -EINVAL; 1926 } 1927 1928 cs = intel_ring_begin(rq, 4 * 2 + 2); 1929 if (IS_ERR(cs)) 1930 return PTR_ERR(cs); 1931 1932 *cs++ = MI_LOAD_REGISTER_IMM(4); 1933 for (i = 0; i < 4; i++) { 1934 *cs++ = i915_mmio_reg_offset(GEN7_SO_WRITE_OFFSET(i)); 1935 *cs++ = 0; 1936 } 1937 *cs++ = MI_NOOP; 1938 intel_ring_advance(rq, cs); 1939 1940 return 0; 1941 } 1942 1943 static struct i915_vma * 1944 shadow_batch_pin(struct drm_i915_gem_object *obj, 1945 struct i915_address_space *vm, 1946 unsigned int flags) 1947 { 1948 struct i915_vma *vma; 1949 int err; 1950 1951 vma = i915_vma_instance(obj, vm, NULL); 1952 if (IS_ERR(vma)) 1953 return vma; 1954 1955 err = i915_vma_pin(vma, 0, 0, flags); 1956 if (err) 1957 return ERR_PTR(err); 1958 1959 return vma; 1960 } 1961 1962 struct eb_parse_work { 1963 struct dma_fence_work base; 1964 struct intel_engine_cs *engine; 1965 struct i915_vma *batch; 1966 struct i915_vma *shadow; 1967 struct i915_vma *trampoline; 1968 unsigned int batch_offset; 1969 unsigned int batch_length; 1970 }; 1971 1972 static int __eb_parse(struct dma_fence_work *work) 1973 { 1974 struct eb_parse_work *pw = container_of(work, typeof(*pw), base); 1975 1976 return intel_engine_cmd_parser(pw->engine, 1977 pw->batch, 1978 pw->batch_offset, 1979 pw->batch_length, 1980 pw->shadow, 1981 pw->trampoline); 1982 } 1983 1984 static const struct dma_fence_work_ops eb_parse_ops = { 1985 .name = "eb_parse", 1986 .work = __eb_parse, 1987 }; 1988 1989 static int eb_parse_pipeline(struct i915_execbuffer *eb, 1990 struct i915_vma *shadow, 1991 struct i915_vma *trampoline) 1992 { 1993 struct eb_parse_work *pw; 1994 int err; 1995 1996 pw = kzalloc(sizeof(*pw), GFP_KERNEL); 1997 if (!pw) 1998 return -ENOMEM; 1999 2000 dma_fence_work_init(&pw->base, &eb_parse_ops); 2001 2002 pw->engine = eb->engine; 2003 pw->batch = eb->batch; 2004 pw->batch_offset = eb->batch_start_offset; 2005 pw->batch_length = eb->batch_len; 2006 pw->shadow = shadow; 2007 pw->trampoline = trampoline; 2008 2009 dma_resv_lock(pw->batch->resv, NULL); 2010 2011 err = dma_resv_reserve_shared(pw->batch->resv, 1); 2012 if (err) 2013 goto err_batch_unlock; 2014 2015 /* Wait for all writes (and relocs) into the batch to complete */ 2016 err = i915_sw_fence_await_reservation(&pw->base.chain, 2017 pw->batch->resv, NULL, false, 2018 0, I915_FENCE_GFP); 2019 if (err < 0) 2020 goto err_batch_unlock; 2021 2022 /* Keep the batch alive and unwritten as we parse */ 2023 dma_resv_add_shared_fence(pw->batch->resv, &pw->base.dma); 2024 2025 
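	/*
	 * Note that the batch's reservation lock is dropped before the
	 * shadow's is taken below, so only one resv lock is held at a time
	 * and no ww acquire context is needed for this ordering.
	 */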
dma_resv_unlock(pw->batch->resv); 2026 2027 /* Force execution to wait for completion of the parser */ 2028 dma_resv_lock(shadow->resv, NULL); 2029 dma_resv_add_excl_fence(shadow->resv, &pw->base.dma); 2030 dma_resv_unlock(shadow->resv); 2031 2032 dma_fence_work_commit(&pw->base); 2033 return 0; 2034 2035 err_batch_unlock: 2036 dma_resv_unlock(pw->batch->resv); 2037 kfree(pw); 2038 return err; 2039 } 2040 2041 static int eb_parse(struct i915_execbuffer *eb) 2042 { 2043 struct intel_engine_pool_node *pool; 2044 struct i915_vma *shadow, *trampoline; 2045 unsigned int len; 2046 int err; 2047 2048 if (!eb_use_cmdparser(eb)) 2049 return 0; 2050 2051 len = eb->batch_len; 2052 if (!CMDPARSER_USES_GGTT(eb->i915)) { 2053 /* 2054 * ppGTT backed shadow buffers must be mapped RO, to prevent 2055 * post-scan tampering 2056 */ 2057 if (!eb->context->vm->has_read_only) { 2058 DRM_DEBUG("Cannot prevent post-scan tampering without RO capable vm\n"); 2059 return -EINVAL; 2060 } 2061 } else { 2062 len += I915_CMD_PARSER_TRAMPOLINE_SIZE; 2063 } 2064 2065 pool = intel_engine_get_pool(eb->engine, len); 2066 if (IS_ERR(pool)) 2067 return PTR_ERR(pool); 2068 2069 shadow = shadow_batch_pin(pool->obj, eb->context->vm, PIN_USER); 2070 if (IS_ERR(shadow)) { 2071 err = PTR_ERR(shadow); 2072 goto err; 2073 } 2074 i915_gem_object_set_readonly(shadow->obj); 2075 2076 trampoline = NULL; 2077 if (CMDPARSER_USES_GGTT(eb->i915)) { 2078 trampoline = shadow; 2079 2080 shadow = shadow_batch_pin(pool->obj, 2081 &eb->engine->gt->ggtt->vm, 2082 PIN_GLOBAL); 2083 if (IS_ERR(shadow)) { 2084 err = PTR_ERR(shadow); 2085 shadow = trampoline; 2086 goto err_shadow; 2087 } 2088 2089 eb->batch_flags |= I915_DISPATCH_SECURE; 2090 } 2091 2092 err = eb_parse_pipeline(eb, shadow, trampoline); 2093 if (err) 2094 goto err_trampoline; 2095 2096 eb->vma[eb->buffer_count] = i915_vma_get(shadow); 2097 eb->flags[eb->buffer_count] = 2098 __EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_REF; 2099 shadow->exec_flags = &eb->flags[eb->buffer_count]; 2100 eb->buffer_count++; 2101 2102 eb->trampoline = trampoline; 2103 eb->batch_start_offset = 0; 2104 eb->batch = shadow; 2105 2106 shadow->private = pool; 2107 return 0; 2108 2109 err_trampoline: 2110 if (trampoline) 2111 i915_vma_unpin(trampoline); 2112 err_shadow: 2113 i915_vma_unpin(shadow); 2114 err: 2115 intel_engine_pool_put(pool); 2116 return err; 2117 } 2118 2119 static void 2120 add_to_client(struct i915_request *rq, struct drm_file *file) 2121 { 2122 struct drm_i915_file_private *file_priv = file->driver_priv; 2123 2124 rq->file_priv = file_priv; 2125 2126 spin_lock(&file_priv->mm.lock); 2127 list_add_tail(&rq->client_link, &file_priv->mm.request_list); 2128 spin_unlock(&file_priv->mm.lock); 2129 } 2130 2131 static int eb_submit(struct i915_execbuffer *eb) 2132 { 2133 int err; 2134 2135 err = eb_move_to_gpu(eb); 2136 if (err) 2137 return err; 2138 2139 if (eb->args->flags & I915_EXEC_GEN7_SOL_RESET) { 2140 err = i915_reset_gen7_sol_offsets(eb->request); 2141 if (err) 2142 return err; 2143 } 2144 2145 /* 2146 * After we completed waiting for other engines (using HW semaphores) 2147 * then we can signal that this request/batch is ready to run. This 2148 * allows us to determine if the batch is still waiting on the GPU 2149 * or actually running by checking the breadcrumb. 
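 *
 * As a rough sketch, the emission below amounts to
 *
 *	emit_init_breadcrumb(rq);
 *	emit_bb_start(rq, batch_addr + batch_start_offset, batch_len, flags);
 *	emit_bb_start(rq, trampoline_addr + batch_len, 0, 0);
 *
 * where the trampoline jump is only added when the command parser
 * installed one, and the exact commands are generated by the engine
 * backend.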
2150 */ 2151 if (eb->engine->emit_init_breadcrumb) { 2152 err = eb->engine->emit_init_breadcrumb(eb->request); 2153 if (err) 2154 return err; 2155 } 2156 2157 err = eb->engine->emit_bb_start(eb->request, 2158 eb->batch->node.start + 2159 eb->batch_start_offset, 2160 eb->batch_len, 2161 eb->batch_flags); 2162 if (err) 2163 return err; 2164 2165 if (eb->trampoline) { 2166 GEM_BUG_ON(eb->batch_start_offset); 2167 err = eb->engine->emit_bb_start(eb->request, 2168 eb->trampoline->node.start + 2169 eb->batch_len, 2170 0, 0); 2171 if (err) 2172 return err; 2173 } 2174 2175 if (intel_context_nopreempt(eb->context)) 2176 __set_bit(I915_FENCE_FLAG_NOPREEMPT, &eb->request->fence.flags); 2177 2178 return 0; 2179 } 2180 2181 static int num_vcs_engines(const struct drm_i915_private *i915) 2182 { 2183 return hweight64(INTEL_INFO(i915)->engine_mask & 2184 GENMASK_ULL(VCS0 + I915_MAX_VCS - 1, VCS0)); 2185 } 2186 2187 /* 2188 * Find one BSD ring to dispatch the corresponding BSD command. 2189 * The engine index is returned. 2190 */ 2191 static unsigned int 2192 gen8_dispatch_bsd_engine(struct drm_i915_private *dev_priv, 2193 struct drm_file *file) 2194 { 2195 struct drm_i915_file_private *file_priv = file->driver_priv; 2196 2197 /* Check whether the file_priv has already selected one ring. */ 2198 if ((int)file_priv->bsd_engine < 0) 2199 file_priv->bsd_engine = 2200 get_random_int() % num_vcs_engines(dev_priv); 2201 2202 return file_priv->bsd_engine; 2203 } 2204 2205 static const enum intel_engine_id user_ring_map[] = { 2206 [I915_EXEC_DEFAULT] = RCS0, 2207 [I915_EXEC_RENDER] = RCS0, 2208 [I915_EXEC_BLT] = BCS0, 2209 [I915_EXEC_BSD] = VCS0, 2210 [I915_EXEC_VEBOX] = VECS0 2211 }; 2212 2213 static struct i915_request *eb_throttle(struct intel_context *ce) 2214 { 2215 struct intel_ring *ring = ce->ring; 2216 struct intel_timeline *tl = ce->timeline; 2217 struct i915_request *rq; 2218 2219 /* 2220 * Completely unscientific finger-in-the-air estimates for suitable 2221 * maximum user request size (to avoid blocking) and then backoff. 2222 */ 2223 if (intel_ring_update_space(ring) >= PAGE_SIZE) 2224 return NULL; 2225 2226 /* 2227 * Find a request that after waiting upon, there will be at least half 2228 * the ring available. The hysteresis allows us to compete for the 2229 * shared ring and should mean that we sleep less often prior to 2230 * claiming our resources, but not so long that the ring completely 2231 * drains before we can submit our next request. 2232 */ 2233 list_for_each_entry(rq, &tl->requests, link) { 2234 if (rq->ring != ring) 2235 continue; 2236 2237 if (__intel_ring_space(rq->postfix, 2238 ring->emit, ring->size) > ring->size / 2) 2239 break; 2240 } 2241 if (&rq->link == &tl->requests) 2242 return NULL; /* weird, we will check again later for real */ 2243 2244 return i915_request_get(rq); 2245 } 2246 2247 static int __eb_pin_engine(struct i915_execbuffer *eb, struct intel_context *ce) 2248 { 2249 struct intel_timeline *tl; 2250 struct i915_request *rq; 2251 int err; 2252 2253 /* 2254 * ABI: Before userspace accesses the GPU (e.g. execbuffer), report 2255 * EIO if the GPU is already wedged. 2256 */ 2257 err = intel_gt_terminally_wedged(ce->engine->gt); 2258 if (err) 2259 return err; 2260 2261 if (unlikely(intel_context_is_banned(ce))) 2262 return -EIO; 2263 2264 /* 2265 * Pinning the contexts may generate requests in order to acquire 2266 * GGTT space, so do this first before we reserve a seqno for 2267 * ourselves. 
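 * The matching unpin is performed by eb_unpin_engine(), or by the
 * error unwind at the bottom of this function if setup fails.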
2268 */ 2269 err = intel_context_pin(ce); 2270 if (err) 2271 return err; 2272 2273 /* 2274 * Take a local wakeref for preparing to dispatch the execbuf as 2275 * we expect to access the hardware fairly frequently in the 2276 * process, and require the engine to be kept awake between accesses. 2277 * Upon dispatch, we acquire another prolonged wakeref that we hold 2278 * until the timeline is idle, which in turn releases the wakeref 2279 * taken on the engine, and the parent device. 2280 */ 2281 tl = intel_context_timeline_lock(ce); 2282 if (IS_ERR(tl)) { 2283 err = PTR_ERR(tl); 2284 goto err_unpin; 2285 } 2286 2287 intel_context_enter(ce); 2288 rq = eb_throttle(ce); 2289 2290 intel_context_timeline_unlock(tl); 2291 2292 if (rq) { 2293 if (i915_request_wait(rq, 2294 I915_WAIT_INTERRUPTIBLE, 2295 MAX_SCHEDULE_TIMEOUT) < 0) { 2296 i915_request_put(rq); 2297 err = -EINTR; 2298 goto err_exit; 2299 } 2300 2301 i915_request_put(rq); 2302 } 2303 2304 eb->engine = ce->engine; 2305 eb->context = ce; 2306 return 0; 2307 2308 err_exit: 2309 mutex_lock(&tl->mutex); 2310 intel_context_exit(ce); 2311 intel_context_timeline_unlock(tl); 2312 err_unpin: 2313 intel_context_unpin(ce); 2314 return err; 2315 } 2316 2317 static void eb_unpin_engine(struct i915_execbuffer *eb) 2318 { 2319 struct intel_context *ce = eb->context; 2320 struct intel_timeline *tl = ce->timeline; 2321 2322 mutex_lock(&tl->mutex); 2323 intel_context_exit(ce); 2324 mutex_unlock(&tl->mutex); 2325 2326 intel_context_unpin(ce); 2327 } 2328 2329 static unsigned int 2330 eb_select_legacy_ring(struct i915_execbuffer *eb, 2331 struct drm_file *file, 2332 struct drm_i915_gem_execbuffer2 *args) 2333 { 2334 struct drm_i915_private *i915 = eb->i915; 2335 unsigned int user_ring_id = args->flags & I915_EXEC_RING_MASK; 2336 2337 if (user_ring_id != I915_EXEC_BSD && 2338 (args->flags & I915_EXEC_BSD_MASK)) { 2339 DRM_DEBUG("execbuf with non bsd ring but with invalid " 2340 "bsd dispatch flags: %d\n", (int)(args->flags)); 2341 return -1; 2342 } 2343 2344 if (user_ring_id == I915_EXEC_BSD && num_vcs_engines(i915) > 1) { 2345 unsigned int bsd_idx = args->flags & I915_EXEC_BSD_MASK; 2346 2347 if (bsd_idx == I915_EXEC_BSD_DEFAULT) { 2348 bsd_idx = gen8_dispatch_bsd_engine(i915, file); 2349 } else if (bsd_idx >= I915_EXEC_BSD_RING1 && 2350 bsd_idx <= I915_EXEC_BSD_RING2) { 2351 bsd_idx >>= I915_EXEC_BSD_SHIFT; 2352 bsd_idx--; 2353 } else { 2354 DRM_DEBUG("execbuf with unknown bsd ring: %u\n", 2355 bsd_idx); 2356 return -1; 2357 } 2358 2359 return _VCS(bsd_idx); 2360 } 2361 2362 if (user_ring_id >= ARRAY_SIZE(user_ring_map)) { 2363 DRM_DEBUG("execbuf with unknown ring: %u\n", user_ring_id); 2364 return -1; 2365 } 2366 2367 return user_ring_map[user_ring_id]; 2368 } 2369 2370 static int 2371 eb_pin_engine(struct i915_execbuffer *eb, 2372 struct drm_file *file, 2373 struct drm_i915_gem_execbuffer2 *args) 2374 { 2375 struct intel_context *ce; 2376 unsigned int idx; 2377 int err; 2378 2379 if (i915_gem_context_user_engines(eb->gem_context)) 2380 idx = args->flags & I915_EXEC_RING_MASK; 2381 else 2382 idx = eb_select_legacy_ring(eb, file, args); 2383 2384 ce = i915_gem_context_get_engine(eb->gem_context, idx); 2385 if (IS_ERR(ce)) 2386 return PTR_ERR(ce); 2387 2388 err = __eb_pin_engine(eb, ce); 2389 intel_context_put(ce); 2390 2391 return err; 2392 } 2393 2394 static void 2395 __free_fence_array(struct drm_syncobj **fences, unsigned int n) 2396 { 2397 while (n--) 2398 drm_syncobj_put(ptr_mask_bits(fences[n], 2)); 2399 kvfree(fences); 2400 } 2401 2402 static 
struct drm_syncobj ** 2403 get_fence_array(struct drm_i915_gem_execbuffer2 *args, 2404 struct drm_file *file) 2405 { 2406 const unsigned long nfences = args->num_cliprects; 2407 struct drm_i915_gem_exec_fence __user *user; 2408 struct drm_syncobj **fences; 2409 unsigned long n; 2410 int err; 2411 2412 if (!(args->flags & I915_EXEC_FENCE_ARRAY)) 2413 return NULL; 2414 2415 /* Check multiplication overflow for access_ok() and kvmalloc_array() */ 2416 BUILD_BUG_ON(sizeof(size_t) > sizeof(unsigned long)); 2417 if (nfences > min_t(unsigned long, 2418 ULONG_MAX / sizeof(*user), 2419 SIZE_MAX / sizeof(*fences))) 2420 return ERR_PTR(-EINVAL); 2421 2422 user = u64_to_user_ptr(args->cliprects_ptr); 2423 if (!access_ok(user, nfences * sizeof(*user))) 2424 return ERR_PTR(-EFAULT); 2425 2426 fences = kvmalloc_array(nfences, sizeof(*fences), 2427 __GFP_NOWARN | GFP_KERNEL); 2428 if (!fences) 2429 return ERR_PTR(-ENOMEM); 2430 2431 for (n = 0; n < nfences; n++) { 2432 struct drm_i915_gem_exec_fence fence; 2433 struct drm_syncobj *syncobj; 2434 2435 if (__copy_from_user(&fence, user++, sizeof(fence))) { 2436 err = -EFAULT; 2437 goto err; 2438 } 2439 2440 if (fence.flags & __I915_EXEC_FENCE_UNKNOWN_FLAGS) { 2441 err = -EINVAL; 2442 goto err; 2443 } 2444 2445 syncobj = drm_syncobj_find(file, fence.handle); 2446 if (!syncobj) { 2447 DRM_DEBUG("Invalid syncobj handle provided\n"); 2448 err = -ENOENT; 2449 goto err; 2450 } 2451 2452 BUILD_BUG_ON(~(ARCH_KMALLOC_MINALIGN - 1) & 2453 ~__I915_EXEC_FENCE_UNKNOWN_FLAGS); 2454 2455 fences[n] = ptr_pack_bits(syncobj, fence.flags, 2); 2456 } 2457 2458 return fences; 2459 2460 err: 2461 __free_fence_array(fences, n); 2462 return ERR_PTR(err); 2463 } 2464 2465 static void 2466 put_fence_array(struct drm_i915_gem_execbuffer2 *args, 2467 struct drm_syncobj **fences) 2468 { 2469 if (fences) 2470 __free_fence_array(fences, args->num_cliprects); 2471 } 2472 2473 static int 2474 await_fence_array(struct i915_execbuffer *eb, 2475 struct drm_syncobj **fences) 2476 { 2477 const unsigned int nfences = eb->args->num_cliprects; 2478 unsigned int n; 2479 int err; 2480 2481 for (n = 0; n < nfences; n++) { 2482 struct drm_syncobj *syncobj; 2483 struct dma_fence *fence; 2484 unsigned int flags; 2485 2486 syncobj = ptr_unpack_bits(fences[n], &flags, 2); 2487 if (!(flags & I915_EXEC_FENCE_WAIT)) 2488 continue; 2489 2490 fence = drm_syncobj_fence_get(syncobj); 2491 if (!fence) 2492 return -EINVAL; 2493 2494 err = i915_request_await_dma_fence(eb->request, fence); 2495 dma_fence_put(fence); 2496 if (err < 0) 2497 return err; 2498 } 2499 2500 return 0; 2501 } 2502 2503 static void 2504 signal_fence_array(struct i915_execbuffer *eb, 2505 struct drm_syncobj **fences) 2506 { 2507 const unsigned int nfences = eb->args->num_cliprects; 2508 struct dma_fence * const fence = &eb->request->fence; 2509 unsigned int n; 2510 2511 for (n = 0; n < nfences; n++) { 2512 struct drm_syncobj *syncobj; 2513 unsigned int flags; 2514 2515 syncobj = ptr_unpack_bits(fences[n], &flags, 2); 2516 if (!(flags & I915_EXEC_FENCE_SIGNAL)) 2517 continue; 2518 2519 drm_syncobj_replace_fence(syncobj, fence); 2520 } 2521 } 2522 2523 static int 2524 i915_gem_do_execbuffer(struct drm_device *dev, 2525 struct drm_file *file, 2526 struct drm_i915_gem_execbuffer2 *args, 2527 struct drm_i915_gem_exec_object2 *exec, 2528 struct drm_syncobj **fences) 2529 { 2530 struct drm_i915_private *i915 = to_i915(dev); 2531 struct i915_execbuffer eb; 2532 struct dma_fence *in_fence = NULL; 2533 struct dma_fence *exec_fence = NULL; 2534 struct 
sync_file *out_fence = NULL; 2535 int out_fence_fd = -1; 2536 int err; 2537 2538 BUILD_BUG_ON(__EXEC_INTERNAL_FLAGS & ~__I915_EXEC_ILLEGAL_FLAGS); 2539 BUILD_BUG_ON(__EXEC_OBJECT_INTERNAL_FLAGS & 2540 ~__EXEC_OBJECT_UNKNOWN_FLAGS); 2541 2542 eb.i915 = i915; 2543 eb.file = file; 2544 eb.args = args; 2545 if (DBG_FORCE_RELOC || !(args->flags & I915_EXEC_NO_RELOC)) 2546 args->flags |= __EXEC_HAS_RELOC; 2547 2548 eb.exec = exec; 2549 eb.vma = (struct i915_vma **)(exec + args->buffer_count + 1); 2550 eb.vma[0] = NULL; 2551 eb.flags = (unsigned int *)(eb.vma + args->buffer_count + 1); 2552 2553 eb.invalid_flags = __EXEC_OBJECT_UNKNOWN_FLAGS; 2554 reloc_cache_init(&eb.reloc_cache, eb.i915); 2555 2556 eb.buffer_count = args->buffer_count; 2557 eb.batch_start_offset = args->batch_start_offset; 2558 eb.batch_len = args->batch_len; 2559 eb.trampoline = NULL; 2560 2561 eb.batch_flags = 0; 2562 if (args->flags & I915_EXEC_SECURE) { 2563 if (INTEL_GEN(i915) >= 11) 2564 return -ENODEV; 2565 2566 /* Return -EPERM to trigger fallback code on old binaries. */ 2567 if (!HAS_SECURE_BATCHES(i915)) 2568 return -EPERM; 2569 2570 if (!drm_is_current_master(file) || !capable(CAP_SYS_ADMIN)) 2571 return -EPERM; 2572 2573 eb.batch_flags |= I915_DISPATCH_SECURE; 2574 } 2575 if (args->flags & I915_EXEC_IS_PINNED) 2576 eb.batch_flags |= I915_DISPATCH_PINNED; 2577 2578 if (args->flags & I915_EXEC_FENCE_IN) { 2579 in_fence = sync_file_get_fence(lower_32_bits(args->rsvd2)); 2580 if (!in_fence) 2581 return -EINVAL; 2582 } 2583 2584 if (args->flags & I915_EXEC_FENCE_SUBMIT) { 2585 if (in_fence) { 2586 err = -EINVAL; 2587 goto err_in_fence; 2588 } 2589 2590 exec_fence = sync_file_get_fence(lower_32_bits(args->rsvd2)); 2591 if (!exec_fence) { 2592 err = -EINVAL; 2593 goto err_in_fence; 2594 } 2595 } 2596 2597 if (args->flags & I915_EXEC_FENCE_OUT) { 2598 out_fence_fd = get_unused_fd_flags(O_CLOEXEC); 2599 if (out_fence_fd < 0) { 2600 err = out_fence_fd; 2601 goto err_exec_fence; 2602 } 2603 } 2604 2605 err = eb_create(&eb); 2606 if (err) 2607 goto err_out_fence; 2608 2609 GEM_BUG_ON(!eb.lut_size); 2610 2611 err = eb_select_context(&eb); 2612 if (unlikely(err)) 2613 goto err_destroy; 2614 2615 err = eb_pin_engine(&eb, file, args); 2616 if (unlikely(err)) 2617 goto err_context; 2618 2619 err = i915_mutex_lock_interruptible(dev); 2620 if (err) 2621 goto err_engine; 2622 2623 err = eb_relocate(&eb); 2624 if (err) { 2625 /* 2626 * If the user expects the execobject.offset and 2627 * reloc.presumed_offset to be an exact match, 2628 * as for using NO_RELOC, then we cannot update 2629 * the execobject.offset until we have completed 2630 * relocation. 2631 */ 2632 args->flags &= ~__EXEC_HAS_RELOC; 2633 goto err_vma; 2634 } 2635 2636 if (unlikely(*eb.batch->exec_flags & EXEC_OBJECT_WRITE)) { 2637 DRM_DEBUG("Attempting to use self-modifying batch buffer\n"); 2638 err = -EINVAL; 2639 goto err_vma; 2640 } 2641 if (eb.batch_start_offset > eb.batch->size || 2642 eb.batch_len > eb.batch->size - eb.batch_start_offset) { 2643 DRM_DEBUG("Attempting to use out-of-bounds batch\n"); 2644 err = -EINVAL; 2645 goto err_vma; 2646 } 2647 2648 if (eb.batch_len == 0) 2649 eb.batch_len = eb.batch->size - eb.batch_start_offset; 2650 2651 err = eb_parse(&eb); 2652 if (err) 2653 goto err_vma; 2654 2655 /* 2656 * snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure 2657 * batch" bit. Hence we need to pin secure batches into the global gtt. 2658 * hsw should have this fixed, but bdw mucks it up again. 
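 * The extra GGTT binding taken here is released again under the
 * err_batch_unpin label, which both the success and error paths
 * fall through to.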
*/ 2659 if (eb.batch_flags & I915_DISPATCH_SECURE) { 2660 struct i915_vma *vma; 2661 2662 /* 2663 * So on first glance it looks freaky that we pin the batch here 2664 * outside of the reservation loop. But: 2665 * - The batch is already pinned into the relevant ppgtt, so we 2666 * already have the backing storage fully allocated. 2667 * - No other BO uses the global gtt (well contexts, but meh), 2668 * so we don't really have issues with multiple objects not 2669 * fitting due to fragmentation. 2670 * So this is actually safe. 2671 */ 2672 vma = i915_gem_object_ggtt_pin(eb.batch->obj, NULL, 0, 0, 0); 2673 if (IS_ERR(vma)) { 2674 err = PTR_ERR(vma); 2675 goto err_vma; 2676 } 2677 2678 eb.batch = vma; 2679 } 2680 2681 /* All GPU relocation batches must be submitted prior to the user rq */ 2682 GEM_BUG_ON(eb.reloc_cache.rq); 2683 2684 /* Allocate a request for this batch buffer nice and early. */ 2685 eb.request = i915_request_create(eb.context); 2686 if (IS_ERR(eb.request)) { 2687 err = PTR_ERR(eb.request); 2688 goto err_batch_unpin; 2689 } 2690 2691 if (in_fence) { 2692 err = i915_request_await_dma_fence(eb.request, in_fence); 2693 if (err < 0) 2694 goto err_request; 2695 } 2696 2697 if (exec_fence) { 2698 err = i915_request_await_execution(eb.request, exec_fence, 2699 eb.engine->bond_execute); 2700 if (err < 0) 2701 goto err_request; 2702 } 2703 2704 if (fences) { 2705 err = await_fence_array(&eb, fences); 2706 if (err) 2707 goto err_request; 2708 } 2709 2710 if (out_fence_fd != -1) { 2711 out_fence = sync_file_create(&eb.request->fence); 2712 if (!out_fence) { 2713 err = -ENOMEM; 2714 goto err_request; 2715 } 2716 } 2717 2718 /* 2719 * Whilst this request exists, batch_obj will be on the 2720 * active_list, and so will hold the active reference. Only when this 2721 * request is retired will the batch_obj be moved onto the 2722 * inactive_list and lose its active reference. Hence we do not need 2723 * to explicitly hold another reference here.
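 * (For a parsed batch, the shadow's pool node is likewise kept alive
 * for the lifetime of the request via intel_engine_pool_mark_active()
 * below.)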
2724 */ 2725 eb.request->batch = eb.batch; 2726 if (eb.batch->private) 2727 intel_engine_pool_mark_active(eb.batch->private, eb.request); 2728 2729 trace_i915_request_queue(eb.request, eb.batch_flags); 2730 err = eb_submit(&eb); 2731 err_request: 2732 add_to_client(eb.request, file); 2733 i915_request_get(eb.request); 2734 i915_request_add(eb.request); 2735 2736 if (fences) 2737 signal_fence_array(&eb, fences); 2738 2739 if (out_fence) { 2740 if (err == 0) { 2741 fd_install(out_fence_fd, out_fence->file); 2742 args->rsvd2 &= GENMASK_ULL(31, 0); /* keep in-fence */ 2743 args->rsvd2 |= (u64)out_fence_fd << 32; 2744 out_fence_fd = -1; 2745 } else { 2746 fput(out_fence->file); 2747 } 2748 } 2749 i915_request_put(eb.request); 2750 2751 err_batch_unpin: 2752 if (eb.batch_flags & I915_DISPATCH_SECURE) 2753 i915_vma_unpin(eb.batch); 2754 if (eb.batch->private) 2755 intel_engine_pool_put(eb.batch->private); 2756 err_vma: 2757 if (eb.exec) 2758 eb_release_vmas(&eb); 2759 if (eb.trampoline) 2760 i915_vma_unpin(eb.trampoline); 2761 mutex_unlock(&dev->struct_mutex); 2762 err_engine: 2763 eb_unpin_engine(&eb); 2764 err_context: 2765 i915_gem_context_put(eb.gem_context); 2766 err_destroy: 2767 eb_destroy(&eb); 2768 err_out_fence: 2769 if (out_fence_fd != -1) 2770 put_unused_fd(out_fence_fd); 2771 err_exec_fence: 2772 dma_fence_put(exec_fence); 2773 err_in_fence: 2774 dma_fence_put(in_fence); 2775 return err; 2776 } 2777 2778 static size_t eb_element_size(void) 2779 { 2780 return (sizeof(struct drm_i915_gem_exec_object2) + 2781 sizeof(struct i915_vma *) + 2782 sizeof(unsigned int)); 2783 } 2784 2785 static bool check_buffer_count(size_t count) 2786 { 2787 const size_t sz = eb_element_size(); 2788 2789 /* 2790 * When using LUT_HANDLE, we impose a limit of INT_MAX for the lookup 2791 * array size (see eb_create()). Otherwise, we can accept an array as 2792 * large as can be addressed (though use large arrays at your peril)! 2793 */ 2794 2795 return !(count < 1 || count > INT_MAX || count > SIZE_MAX / sz - 1); 2796 } 2797 2798 /* 2799 * Legacy execbuffer just creates an exec2 list from the original exec object 2800 * list array and passes it to the real function. 
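 * The legacy path always targets the render ring and, on gen2/3,
 * forces EXEC_OBJECT_NEEDS_FENCE on every object; see the translation
 * loop below.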
2801 */ 2802 int 2803 i915_gem_execbuffer_ioctl(struct drm_device *dev, void *data, 2804 struct drm_file *file) 2805 { 2806 struct drm_i915_gem_execbuffer *args = data; 2807 struct drm_i915_gem_execbuffer2 exec2; 2808 struct drm_i915_gem_exec_object *exec_list = NULL; 2809 struct drm_i915_gem_exec_object2 *exec2_list = NULL; 2810 const size_t count = args->buffer_count; 2811 unsigned int i; 2812 int err; 2813 2814 if (!check_buffer_count(count)) { 2815 DRM_DEBUG("execbuf2 with %zd buffers\n", count); 2816 return -EINVAL; 2817 } 2818 2819 exec2.buffers_ptr = args->buffers_ptr; 2820 exec2.buffer_count = args->buffer_count; 2821 exec2.batch_start_offset = args->batch_start_offset; 2822 exec2.batch_len = args->batch_len; 2823 exec2.DR1 = args->DR1; 2824 exec2.DR4 = args->DR4; 2825 exec2.num_cliprects = args->num_cliprects; 2826 exec2.cliprects_ptr = args->cliprects_ptr; 2827 exec2.flags = I915_EXEC_RENDER; 2828 i915_execbuffer2_set_context_id(exec2, 0); 2829 2830 err = i915_gem_check_execbuffer(&exec2); 2831 if (err) 2832 return err; 2833 2834 /* Copy in the exec list from userland */ 2835 exec_list = kvmalloc_array(count, sizeof(*exec_list), 2836 __GFP_NOWARN | GFP_KERNEL); 2837 exec2_list = kvmalloc_array(count + 1, eb_element_size(), 2838 __GFP_NOWARN | GFP_KERNEL); 2839 if (exec_list == NULL || exec2_list == NULL) { 2840 DRM_DEBUG("Failed to allocate exec list for %d buffers\n", 2841 args->buffer_count); 2842 kvfree(exec_list); 2843 kvfree(exec2_list); 2844 return -ENOMEM; 2845 } 2846 err = copy_from_user(exec_list, 2847 u64_to_user_ptr(args->buffers_ptr), 2848 sizeof(*exec_list) * count); 2849 if (err) { 2850 DRM_DEBUG("copy %d exec entries failed %d\n", 2851 args->buffer_count, err); 2852 kvfree(exec_list); 2853 kvfree(exec2_list); 2854 return -EFAULT; 2855 } 2856 2857 for (i = 0; i < args->buffer_count; i++) { 2858 exec2_list[i].handle = exec_list[i].handle; 2859 exec2_list[i].relocation_count = exec_list[i].relocation_count; 2860 exec2_list[i].relocs_ptr = exec_list[i].relocs_ptr; 2861 exec2_list[i].alignment = exec_list[i].alignment; 2862 exec2_list[i].offset = exec_list[i].offset; 2863 if (INTEL_GEN(to_i915(dev)) < 4) 2864 exec2_list[i].flags = EXEC_OBJECT_NEEDS_FENCE; 2865 else 2866 exec2_list[i].flags = 0; 2867 } 2868 2869 err = i915_gem_do_execbuffer(dev, file, &exec2, exec2_list, NULL); 2870 if (exec2.flags & __EXEC_HAS_RELOC) { 2871 struct drm_i915_gem_exec_object __user *user_exec_list = 2872 u64_to_user_ptr(args->buffers_ptr); 2873 2874 /* Copy the new buffer offsets back to the user's exec list. 
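 * Only entries flagged with UPDATE (the PIN_OFFSET_FIXED bit) carry a
 * new offset to report; the flag bits are masked off before the
 * write-back.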
*/ 2875 for (i = 0; i < args->buffer_count; i++) { 2876 if (!(exec2_list[i].offset & UPDATE)) 2877 continue; 2878 2879 exec2_list[i].offset = 2880 gen8_canonical_addr(exec2_list[i].offset & PIN_OFFSET_MASK); 2881 exec2_list[i].offset &= PIN_OFFSET_MASK; 2882 if (__copy_to_user(&user_exec_list[i].offset, 2883 &exec2_list[i].offset, 2884 sizeof(user_exec_list[i].offset))) 2885 break; 2886 } 2887 } 2888 2889 kvfree(exec_list); 2890 kvfree(exec2_list); 2891 return err; 2892 } 2893 2894 int 2895 i915_gem_execbuffer2_ioctl(struct drm_device *dev, void *data, 2896 struct drm_file *file) 2897 { 2898 struct drm_i915_gem_execbuffer2 *args = data; 2899 struct drm_i915_gem_exec_object2 *exec2_list; 2900 struct drm_syncobj **fences = NULL; 2901 const size_t count = args->buffer_count; 2902 int err; 2903 2904 if (!check_buffer_count(count)) { 2905 DRM_DEBUG("execbuf2 with %zd buffers\n", count); 2906 return -EINVAL; 2907 } 2908 2909 err = i915_gem_check_execbuffer(args); 2910 if (err) 2911 return err; 2912 2913 /* Allocate an extra slot for use by the command parser */ 2914 exec2_list = kvmalloc_array(count + 1, eb_element_size(), 2915 __GFP_NOWARN | GFP_KERNEL); 2916 if (exec2_list == NULL) { 2917 DRM_DEBUG("Failed to allocate exec list for %zd buffers\n", 2918 count); 2919 return -ENOMEM; 2920 } 2921 if (copy_from_user(exec2_list, 2922 u64_to_user_ptr(args->buffers_ptr), 2923 sizeof(*exec2_list) * count)) { 2924 DRM_DEBUG("copy %zd exec entries failed\n", count); 2925 kvfree(exec2_list); 2926 return -EFAULT; 2927 } 2928 2929 if (args->flags & I915_EXEC_FENCE_ARRAY) { 2930 fences = get_fence_array(args, file); 2931 if (IS_ERR(fences)) { 2932 kvfree(exec2_list); 2933 return PTR_ERR(fences); 2934 } 2935 } 2936 2937 err = i915_gem_do_execbuffer(dev, file, args, exec2_list, fences); 2938 2939 /* 2940 * Now that we have begun execution of the batchbuffer, we ignore 2941 * any new error after this point. Also given that we have already 2942 * updated the associated relocations, we try to write out the current 2943 * object locations irrespective of any error. 2944 */ 2945 if (args->flags & __EXEC_HAS_RELOC) { 2946 struct drm_i915_gem_exec_object2 __user *user_exec_list = 2947 u64_to_user_ptr(args->buffers_ptr); 2948 unsigned int i; 2949 2950 /* Copy the new buffer offsets back to the user's exec list. */ 2951 /* 2952 * Note: count * sizeof(*user_exec_list) does not overflow, 2953 * because we checked 'count' in check_buffer_count(). 2954 * 2955 * And this range already got effectively checked earlier 2956 * when we did the "copy_from_user()" above. 2957 */ 2958 if (!user_access_begin(user_exec_list, count * sizeof(*user_exec_list))) 2959 goto end; 2960 2961 for (i = 0; i < args->buffer_count; i++) { 2962 if (!(exec2_list[i].offset & UPDATE)) 2963 continue; 2964 2965 exec2_list[i].offset = 2966 gen8_canonical_addr(exec2_list[i].offset & PIN_OFFSET_MASK); 2967 unsafe_put_user(exec2_list[i].offset, 2968 &user_exec_list[i].offset, 2969 end_user); 2970 } 2971 end_user: 2972 user_access_end(); 2973 end:; 2974 } 2975 2976 args->flags &= ~__I915_EXEC_UNKNOWN_FLAGS; 2977 put_fence_array(args, fences); 2978 kvfree(exec2_list); 2979 return err; 2980 } 2981
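
/*
 * Illustrative userspace usage (a minimal sketch, not part of the driver;
 * the handles and presumed offsets are assumed to come from the caller):
 *
 *	struct drm_i915_gem_exec_object2 obj[2] = {
 *		{ .handle = target_handle, .offset = presumed_target_offset },
 *		{ .handle = batch_handle, .offset = presumed_batch_offset },
 *	};
 *	struct drm_i915_gem_execbuffer2 execbuf = {
 *		.buffers_ptr = (uintptr_t)obj,
 *		.buffer_count = 2,
 *		.flags = I915_EXEC_RENDER | I915_EXEC_NO_RELOC |
 *			 I915_EXEC_FENCE_OUT,
 *	};
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2_WR, &execbuf) == 0)
 *		out_fence_fd = execbuf.rsvd2 >> 32;
 *
 * The batch is the last object in the list by default, and FENCE_OUT
 * hands back a sync_file fd in the upper half of rsvd2, hence the _WR
 * flavour of the ioctl so that rsvd2 can be written back.
 */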