xref: /openbmc/linux/drivers/gpu/drm/i915/i915_gem.c (revision 1fa0a7dc)
1 /*
2  * Copyright © 2008-2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Eric Anholt <eric@anholt.net>
25  *
26  */
27 
28 #include <drm/drm_vma_manager.h>
29 #include <drm/drm_pci.h>
30 #include <drm/i915_drm.h>
31 #include <linux/dma-fence-array.h>
32 #include <linux/kthread.h>
33 #include <linux/reservation.h>
34 #include <linux/shmem_fs.h>
35 #include <linux/slab.h>
36 #include <linux/stop_machine.h>
37 #include <linux/swap.h>
38 #include <linux/pci.h>
39 #include <linux/dma-buf.h>
40 #include <linux/mman.h>
41 
42 #include "i915_drv.h"
43 #include "i915_gem_clflush.h"
44 #include "i915_gemfs.h"
45 #include "i915_globals.h"
46 #include "i915_reset.h"
47 #include "i915_trace.h"
48 #include "i915_vgpu.h"
49 
50 #include "intel_drv.h"
51 #include "intel_frontbuffer.h"
52 #include "intel_mocs.h"
53 #include "intel_pm.h"
54 #include "intel_workarounds.h"
55 
56 static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
57 
58 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
59 {
60 	if (obj->cache_dirty)
61 		return false;
62 
63 	if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
64 		return true;
65 
66 	return obj->pin_global; /* currently in use by HW, keep flushed */
67 }
68 
69 static int
70 insert_mappable_node(struct i915_ggtt *ggtt,
71                      struct drm_mm_node *node, u32 size)
72 {
73 	memset(node, 0, sizeof(*node));
74 	return drm_mm_insert_node_in_range(&ggtt->vm.mm, node,
75 					   size, 0, I915_COLOR_UNEVICTABLE,
76 					   0, ggtt->mappable_end,
77 					   DRM_MM_INSERT_LOW);
78 }
79 
80 static void
81 remove_mappable_node(struct drm_mm_node *node)
82 {
83 	drm_mm_remove_node(node);
84 }
85 
86 /* some bookkeeping */
87 static void i915_gem_info_add_obj(struct drm_i915_private *dev_priv,
88 				  u64 size)
89 {
90 	spin_lock(&dev_priv->mm.object_stat_lock);
91 	dev_priv->mm.object_count++;
92 	dev_priv->mm.object_memory += size;
93 	spin_unlock(&dev_priv->mm.object_stat_lock);
94 }
95 
96 static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
97 				     u64 size)
98 {
99 	spin_lock(&dev_priv->mm.object_stat_lock);
100 	dev_priv->mm.object_count--;
101 	dev_priv->mm.object_memory -= size;
102 	spin_unlock(&dev_priv->mm.object_stat_lock);
103 }
104 
105 static void __i915_gem_park(struct drm_i915_private *i915)
106 {
107 	intel_wakeref_t wakeref;
108 
109 	GEM_TRACE("\n");
110 
111 	lockdep_assert_held(&i915->drm.struct_mutex);
112 	GEM_BUG_ON(i915->gt.active_requests);
113 	GEM_BUG_ON(!list_empty(&i915->gt.active_rings));
114 
115 	if (!i915->gt.awake)
116 		return;
117 
118 	/*
119 	 * Be paranoid and flush a concurrent interrupt to make sure
120 	 * we don't reactivate any irq tasklets after parking.
121 	 *
122 	 * FIXME: Even though we have waited for the execlists to be idle,
123 	 * an interrupt may still be in flight after the CSB has been
124 	 * emptied. synchronize_irq() makes sure that a residual interrupt
125 	 * is completed before we continue, but it doesn't prevent the HW from
126 	 * raising a spurious interrupt later. To complete the shield we should
127 	 * coordinate disabling the CS irq with flushing the interrupts.
128 	 */
129 	synchronize_irq(i915->drm.irq);
130 
131 	intel_engines_park(i915);
132 	i915_timelines_park(i915);
133 
134 	i915_pmu_gt_parked(i915);
135 	i915_vma_parked(i915);
136 
137 	wakeref = fetch_and_zero(&i915->gt.awake);
138 	GEM_BUG_ON(!wakeref);
139 
140 	if (INTEL_GEN(i915) >= 6)
141 		gen6_rps_idle(i915);
142 
143 	intel_display_power_put(i915, POWER_DOMAIN_GT_IRQ, wakeref);
144 
145 	i915_globals_park();
146 }
147 
148 void i915_gem_park(struct drm_i915_private *i915)
149 {
150 	GEM_TRACE("\n");
151 
152 	lockdep_assert_held(&i915->drm.struct_mutex);
153 	GEM_BUG_ON(i915->gt.active_requests);
154 
155 	if (!i915->gt.awake)
156 		return;
157 
158 	/* Defer the actual call to __i915_gem_park() to prevent ping-pongs */
159 	mod_delayed_work(i915->wq, &i915->gt.idle_work, msecs_to_jiffies(100));
160 }
161 
162 void i915_gem_unpark(struct drm_i915_private *i915)
163 {
164 	GEM_TRACE("\n");
165 
166 	lockdep_assert_held(&i915->drm.struct_mutex);
167 	GEM_BUG_ON(!i915->gt.active_requests);
168 	assert_rpm_wakelock_held(i915);
169 
170 	if (i915->gt.awake)
171 		return;
172 
173 	/*
174 	 * It seems that the DMC likes to transition between the DC states a lot
175 	 * when there are no connected displays (no active power domains) during
176 	 * command submission.
177 	 *
178 	 * This activity has a negative impact on the performance of the chip with
179 	 * huge latencies observed in the interrupt handler and elsewhere.
180 	 *
181 	 * Work around it by grabbing a GT IRQ power domain whilst there is any
182 	 * GT activity, preventing any DC state transitions.
183 	 */
184 	i915->gt.awake = intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);
185 	GEM_BUG_ON(!i915->gt.awake);
186 
187 	i915_globals_unpark();
188 
189 	intel_enable_gt_powersave(i915);
190 	i915_update_gfx_val(i915);
191 	if (INTEL_GEN(i915) >= 6)
192 		gen6_rps_busy(i915);
193 	i915_pmu_gt_unparked(i915);
194 
195 	intel_engines_unpark(i915);
196 
197 	i915_queue_hangcheck(i915);
198 
199 	queue_delayed_work(i915->wq,
200 			   &i915->gt.retire_work,
201 			   round_jiffies_up_relative(HZ));
202 }
203 
204 int
205 i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
206 			    struct drm_file *file)
207 {
208 	struct i915_ggtt *ggtt = &to_i915(dev)->ggtt;
209 	struct drm_i915_gem_get_aperture *args = data;
210 	struct i915_vma *vma;
211 	u64 pinned;
212 
213 	mutex_lock(&ggtt->vm.mutex);
214 
215 	pinned = ggtt->vm.reserved;
216 	list_for_each_entry(vma, &ggtt->vm.bound_list, vm_link)
217 		if (i915_vma_is_pinned(vma))
218 			pinned += vma->node.size;
219 
220 	mutex_unlock(&ggtt->vm.mutex);
221 
222 	args->aper_size = ggtt->vm.total;
223 	args->aper_available_size = args->aper_size - pinned;
224 
225 	return 0;
226 }
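/*
 * A minimal userspace sketch of driving this ioctl, assuming an open DRM fd
 * and libdrm's drmIoctl() wrapper; the struct comes from the uapi
 * <drm/i915_drm.h>:
 *
 *	struct drm_i915_gem_get_aperture aperture = {};
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture) == 0)
 *		printf("GGTT %llu bytes total, %llu available\n",
 *		       (unsigned long long)aperture.aper_size,
 *		       (unsigned long long)aperture.aper_available_size);
 */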
227 
228 static int i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
229 {
230 	struct address_space *mapping = obj->base.filp->f_mapping;
231 	drm_dma_handle_t *phys;
232 	struct sg_table *st;
233 	struct scatterlist *sg;
234 	char *vaddr;
235 	int i;
236 	int err;
237 
238 	if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj)))
239 		return -EINVAL;
240 
241 	/* Always aligning to the object size allows a single allocation
242 	 * to handle all possible callers, and given typical object sizes,
243 	 * the alignment of the buddy allocation will naturally match.
244 	 */
245 	phys = drm_pci_alloc(obj->base.dev,
246 			     roundup_pow_of_two(obj->base.size),
247 			     roundup_pow_of_two(obj->base.size));
248 	if (!phys)
249 		return -ENOMEM;
250 
251 	vaddr = phys->vaddr;
252 	for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
253 		struct page *page;
254 		char *src;
255 
256 		page = shmem_read_mapping_page(mapping, i);
257 		if (IS_ERR(page)) {
258 			err = PTR_ERR(page);
259 			goto err_phys;
260 		}
261 
262 		src = kmap_atomic(page);
263 		memcpy(vaddr, src, PAGE_SIZE);
264 		drm_clflush_virt_range(vaddr, PAGE_SIZE);
265 		kunmap_atomic(src);
266 
267 		put_page(page);
268 		vaddr += PAGE_SIZE;
269 	}
270 
271 	i915_gem_chipset_flush(to_i915(obj->base.dev));
272 
273 	st = kmalloc(sizeof(*st), GFP_KERNEL);
274 	if (!st) {
275 		err = -ENOMEM;
276 		goto err_phys;
277 	}
278 
279 	if (sg_alloc_table(st, 1, GFP_KERNEL)) {
280 		kfree(st);
281 		err = -ENOMEM;
282 		goto err_phys;
283 	}
284 
285 	sg = st->sgl;
286 	sg->offset = 0;
287 	sg->length = obj->base.size;
288 
289 	sg_dma_address(sg) = phys->busaddr;
290 	sg_dma_len(sg) = obj->base.size;
291 
292 	obj->phys_handle = phys;
293 
294 	__i915_gem_object_set_pages(obj, st, sg->length);
295 
296 	return 0;
297 
298 err_phys:
299 	drm_pci_free(obj->base.dev, phys);
300 
301 	return err;
302 }
303 
304 static void __start_cpu_write(struct drm_i915_gem_object *obj)
305 {
306 	obj->read_domains = I915_GEM_DOMAIN_CPU;
307 	obj->write_domain = I915_GEM_DOMAIN_CPU;
308 	if (cpu_write_needs_clflush(obj))
309 		obj->cache_dirty = true;
310 }
311 
312 void
313 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
314 				struct sg_table *pages,
315 				bool needs_clflush)
316 {
317 	GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED);
318 
319 	if (obj->mm.madv == I915_MADV_DONTNEED)
320 		obj->mm.dirty = false;
321 
322 	if (needs_clflush &&
323 	    (obj->read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
324 	    !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
325 		drm_clflush_sg(pages);
326 
327 	__start_cpu_write(obj);
328 }
329 
330 static void
331 i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj,
332 			       struct sg_table *pages)
333 {
334 	__i915_gem_object_release_shmem(obj, pages, false);
335 
336 	if (obj->mm.dirty) {
337 		struct address_space *mapping = obj->base.filp->f_mapping;
338 		char *vaddr = obj->phys_handle->vaddr;
339 		int i;
340 
341 		for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
342 			struct page *page;
343 			char *dst;
344 
345 			page = shmem_read_mapping_page(mapping, i);
346 			if (IS_ERR(page))
347 				continue;
348 
349 			dst = kmap_atomic(page);
350 			drm_clflush_virt_range(vaddr, PAGE_SIZE);
351 			memcpy(dst, vaddr, PAGE_SIZE);
352 			kunmap_atomic(dst);
353 
354 			set_page_dirty(page);
355 			if (obj->mm.madv == I915_MADV_WILLNEED)
356 				mark_page_accessed(page);
357 			put_page(page);
358 			vaddr += PAGE_SIZE;
359 		}
360 		obj->mm.dirty = false;
361 	}
362 
363 	sg_free_table(pages);
364 	kfree(pages);
365 
366 	drm_pci_free(obj->base.dev, obj->phys_handle);
367 }
368 
369 static void
370 i915_gem_object_release_phys(struct drm_i915_gem_object *obj)
371 {
372 	i915_gem_object_unpin_pages(obj);
373 }
374 
375 static const struct drm_i915_gem_object_ops i915_gem_phys_ops = {
376 	.get_pages = i915_gem_object_get_pages_phys,
377 	.put_pages = i915_gem_object_put_pages_phys,
378 	.release = i915_gem_object_release_phys,
379 };
380 
381 static const struct drm_i915_gem_object_ops i915_gem_object_ops;
382 
383 int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
384 {
385 	struct i915_vma *vma;
386 	LIST_HEAD(still_in_list);
387 	int ret;
388 
389 	lockdep_assert_held(&obj->base.dev->struct_mutex);
390 
391 	/* Closed vma are removed from the obj->vma.list - but they may
392 	 * still have an active binding on the object. To remove those we
393 	 * must wait for all rendering to complete to the object (as unbinding
394 	 * must anyway), and retire the requests.
395 	 */
396 	ret = i915_gem_object_set_to_cpu_domain(obj, false);
397 	if (ret)
398 		return ret;
399 
400 	spin_lock(&obj->vma.lock);
401 	while (!ret && (vma = list_first_entry_or_null(&obj->vma.list,
402 						       struct i915_vma,
403 						       obj_link))) {
404 		list_move_tail(&vma->obj_link, &still_in_list);
405 		spin_unlock(&obj->vma.lock);
406 
407 		ret = i915_vma_unbind(vma);
408 
409 		spin_lock(&obj->vma.lock);
410 	}
411 	list_splice(&still_in_list, &obj->vma.list);
412 	spin_unlock(&obj->vma.lock);
413 
414 	return ret;
415 }
416 
417 static long
418 i915_gem_object_wait_fence(struct dma_fence *fence,
419 			   unsigned int flags,
420 			   long timeout)
421 {
422 	struct i915_request *rq;
423 
424 	BUILD_BUG_ON(I915_WAIT_INTERRUPTIBLE != 0x1);
425 
426 	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
427 		return timeout;
428 
429 	if (!dma_fence_is_i915(fence))
430 		return dma_fence_wait_timeout(fence,
431 					      flags & I915_WAIT_INTERRUPTIBLE,
432 					      timeout);
433 
434 	rq = to_request(fence);
435 	if (i915_request_completed(rq))
436 		goto out;
437 
438 	timeout = i915_request_wait(rq, flags, timeout);
439 
440 out:
441 	if (flags & I915_WAIT_LOCKED && i915_request_completed(rq))
442 		i915_request_retire_upto(rq);
443 
444 	return timeout;
445 }
446 
447 static long
448 i915_gem_object_wait_reservation(struct reservation_object *resv,
449 				 unsigned int flags,
450 				 long timeout)
451 {
452 	unsigned int seq = __read_seqcount_begin(&resv->seq);
453 	struct dma_fence *excl;
454 	bool prune_fences = false;
455 
456 	if (flags & I915_WAIT_ALL) {
457 		struct dma_fence **shared;
458 		unsigned int count, i;
459 		int ret;
460 
461 		ret = reservation_object_get_fences_rcu(resv,
462 							&excl, &count, &shared);
463 		if (ret)
464 			return ret;
465 
466 		for (i = 0; i < count; i++) {
467 			timeout = i915_gem_object_wait_fence(shared[i],
468 							     flags, timeout);
469 			if (timeout < 0)
470 				break;
471 
472 			dma_fence_put(shared[i]);
473 		}
474 
475 		for (; i < count; i++)
476 			dma_fence_put(shared[i]);
477 		kfree(shared);
478 
479 		/*
480 		 * If both shared fences and an exclusive fence exist,
481 		 * then by construction the shared fences must be later
482 		 * than the exclusive fence. If we successfully wait for
483 		 * all the shared fences, we know that the exclusive fence
484 		 * must all be signaled. If all the shared fences are
485 		 * must also be signaled. If all the shared fences are
486 		 * floating references on the fences/requests.
487 		 */
488 		prune_fences = count && timeout >= 0;
489 	} else {
490 		excl = reservation_object_get_excl_rcu(resv);
491 	}
492 
493 	if (excl && timeout >= 0)
494 		timeout = i915_gem_object_wait_fence(excl, flags, timeout);
495 
496 	dma_fence_put(excl);
497 
498 	/*
499 	 * Opportunistically prune the fences iff we know they have *all* been
500 	 * signaled and that the reservation object has not been changed (i.e.
501 	 * no new fences have been added).
502 	 */
503 	if (prune_fences && !__read_seqcount_retry(&resv->seq, seq)) {
504 		if (reservation_object_trylock(resv)) {
505 			if (!__read_seqcount_retry(&resv->seq, seq))
506 				reservation_object_add_excl_fence(resv, NULL);
507 			reservation_object_unlock(resv);
508 		}
509 	}
510 
511 	return timeout;
512 }
513 
514 static void __fence_set_priority(struct dma_fence *fence,
515 				 const struct i915_sched_attr *attr)
516 {
517 	struct i915_request *rq;
518 	struct intel_engine_cs *engine;
519 
520 	if (dma_fence_is_signaled(fence) || !dma_fence_is_i915(fence))
521 		return;
522 
523 	rq = to_request(fence);
524 	engine = rq->engine;
525 
526 	local_bh_disable();
527 	rcu_read_lock(); /* RCU serialisation for set-wedged protection */
528 	if (engine->schedule)
529 		engine->schedule(rq, attr);
530 	rcu_read_unlock();
531 	local_bh_enable(); /* kick the tasklets if queues were reprioritised */
532 }
533 
534 static void fence_set_priority(struct dma_fence *fence,
535 			       const struct i915_sched_attr *attr)
536 {
537 	/* Recurse once into a fence-array */
538 	if (dma_fence_is_array(fence)) {
539 		struct dma_fence_array *array = to_dma_fence_array(fence);
540 		int i;
541 
542 		for (i = 0; i < array->num_fences; i++)
543 			__fence_set_priority(array->fences[i], attr);
544 	} else {
545 		__fence_set_priority(fence, attr);
546 	}
547 }
548 
549 int
550 i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
551 			      unsigned int flags,
552 			      const struct i915_sched_attr *attr)
553 {
554 	struct dma_fence *excl;
555 
556 	if (flags & I915_WAIT_ALL) {
557 		struct dma_fence **shared;
558 		unsigned int count, i;
559 		int ret;
560 
561 		ret = reservation_object_get_fences_rcu(obj->resv,
562 							&excl, &count, &shared);
563 		if (ret)
564 			return ret;
565 
566 		for (i = 0; i < count; i++) {
567 			fence_set_priority(shared[i], attr);
568 			dma_fence_put(shared[i]);
569 		}
570 
571 		kfree(shared);
572 	} else {
573 		excl = reservation_object_get_excl_rcu(obj->resv);
574 	}
575 
576 	if (excl) {
577 		fence_set_priority(excl, attr);
578 		dma_fence_put(excl);
579 	}
580 	return 0;
581 }
582 
583 /**
584  * Waits for rendering to the object to be completed
585  * @obj: i915 gem object
586  * @flags: how to wait (under a lock, for all rendering or just for writes etc)
587  * @timeout: how long to wait
588  */
589 int
590 i915_gem_object_wait(struct drm_i915_gem_object *obj,
591 		     unsigned int flags,
592 		     long timeout)
593 {
594 	might_sleep();
595 	GEM_BUG_ON(timeout < 0);
596 
597 	timeout = i915_gem_object_wait_reservation(obj->resv, flags, timeout);
598 	return timeout < 0 ? timeout : 0;
599 }
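/*
 * For example, the pwrite ioctl path later in this file waits for all
 * outstanding rendering (reads and writes), interruptibly and without a
 * deadline, before touching the pages:
 *
 *	ret = i915_gem_object_wait(obj,
 *				   I915_WAIT_INTERRUPTIBLE |
 *				   I915_WAIT_ALL,
 *				   MAX_SCHEDULE_TIMEOUT);
 *	if (ret)
 *		goto err;
 */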
600 
601 static int
602 i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
603 		     struct drm_i915_gem_pwrite *args,
604 		     struct drm_file *file)
605 {
606 	void *vaddr = obj->phys_handle->vaddr + args->offset;
607 	char __user *user_data = u64_to_user_ptr(args->data_ptr);
608 
609 	/* We manually control the domain here and pretend that it
610 	 * remains coherent i.e. in the GTT domain, like shmem_pwrite.
611 	 */
612 	intel_fb_obj_invalidate(obj, ORIGIN_CPU);
613 	if (copy_from_user(vaddr, user_data, args->size))
614 		return -EFAULT;
615 
616 	drm_clflush_virt_range(vaddr, args->size);
617 	i915_gem_chipset_flush(to_i915(obj->base.dev));
618 
619 	intel_fb_obj_flush(obj, ORIGIN_CPU);
620 	return 0;
621 }
622 
623 static int
624 i915_gem_create(struct drm_file *file,
625 		struct drm_i915_private *dev_priv,
626 		u64 *size_p,
627 		u32 *handle_p)
628 {
629 	struct drm_i915_gem_object *obj;
630 	u32 handle;
631 	u64 size;
632 	int ret;
633 
634 	size = round_up(*size_p, PAGE_SIZE);
635 	if (size == 0)
636 		return -EINVAL;
637 
638 	/* Allocate the new object */
639 	obj = i915_gem_object_create(dev_priv, size);
640 	if (IS_ERR(obj))
641 		return PTR_ERR(obj);
642 
643 	ret = drm_gem_handle_create(file, &obj->base, &handle);
644 	/* drop reference from allocate - handle holds it now */
645 	i915_gem_object_put(obj);
646 	if (ret)
647 		return ret;
648 
649 	*handle_p = handle;
650 	*size_p = size;
651 	return 0;
652 }
653 
654 int
655 i915_gem_dumb_create(struct drm_file *file,
656 		     struct drm_device *dev,
657 		     struct drm_mode_create_dumb *args)
658 {
659 	/* have to work out size/pitch and return them */
660 	args->pitch = ALIGN(args->width * DIV_ROUND_UP(args->bpp, 8), 64);
661 	args->size = args->pitch * args->height;
662 	return i915_gem_create(file, to_i915(dev),
663 			       &args->size, &args->handle);
664 }
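/*
 * Worked example: a 1920x1080 dumb buffer at 32 bpp gives
 * pitch = ALIGN(1920 * 4, 64) = 7680 bytes (already 64-byte aligned) and
 * size = 7680 * 1080 = 8294400 bytes, which is already a whole number of
 * 4 KiB pages, so i915_gem_create() passes it through unchanged.
 */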
665 
666 static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
667 {
668 	return !(obj->cache_level == I915_CACHE_NONE ||
669 		 obj->cache_level == I915_CACHE_WT);
670 }
671 
672 /**
673  * Creates a new mm object and returns a handle to it.
674  * @dev: drm device pointer
675  * @data: ioctl data blob
676  * @file: drm file pointer
677  */
678 int
679 i915_gem_create_ioctl(struct drm_device *dev, void *data,
680 		      struct drm_file *file)
681 {
682 	struct drm_i915_private *dev_priv = to_i915(dev);
683 	struct drm_i915_gem_create *args = data;
684 
685 	i915_gem_flush_free_objects(dev_priv);
686 
687 	return i915_gem_create(file, dev_priv,
688 			       &args->size, &args->handle);
689 }
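/*
 * A minimal userspace sketch of the corresponding call, assuming an open
 * DRM fd and libdrm's drmIoctl(); the kernel may round the requested size
 * up to a page multiple and writes the handle back into the struct:
 *
 *	struct drm_i915_gem_create create = { .size = 4096 };
 *	__u32 handle = 0;
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CREATE, &create) == 0)
 *		handle = create.handle;
 */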
690 
691 static inline enum fb_op_origin
692 fb_write_origin(struct drm_i915_gem_object *obj, unsigned int domain)
693 {
694 	return (domain == I915_GEM_DOMAIN_GTT ?
695 		obj->frontbuffer_ggtt_origin : ORIGIN_CPU);
696 }
697 
698 void i915_gem_flush_ggtt_writes(struct drm_i915_private *dev_priv)
699 {
700 	intel_wakeref_t wakeref;
701 
702 	/*
703 	 * No actual flushing is required for the GTT write domain for reads
704 	 * from the GTT domain. Writes to it "immediately" go to main memory
705 	 * as far as we know, so there's no chipset flush. It also doesn't
706 	 * land in the GPU render cache.
707 	 *
708 	 * However, we do have to enforce the order so that all writes through
709 	 * the GTT land before any writes to the device, such as updates to
710 	 * the GATT itself.
711 	 *
712 	 * We also have to wait a bit for the writes to land from the GTT.
713 	 * An uncached read (i.e. mmio) seems to be ideal for the round-trip
714 	 * timing. This issue has only been observed when switching quickly
715 	 * between GTT writes and CPU reads from inside the kernel on recent hw,
716 	 * and it appears to only affect discrete GTT blocks (i.e. on LLC
717 	 * system agents we could not reproduce this behaviour, at least not
718 	 * until Cannonlake came along!).
719 	 */
720 
721 	wmb();
722 
723 	if (INTEL_INFO(dev_priv)->has_coherent_ggtt)
724 		return;
725 
726 	i915_gem_chipset_flush(dev_priv);
727 
728 	with_intel_runtime_pm(dev_priv, wakeref) {
729 		spin_lock_irq(&dev_priv->uncore.lock);
730 
731 		POSTING_READ_FW(RING_HEAD(RENDER_RING_BASE));
732 
733 		spin_unlock_irq(&dev_priv->uncore.lock);
734 	}
735 }
736 
737 static void
738 flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
739 {
740 	struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
741 	struct i915_vma *vma;
742 
743 	if (!(obj->write_domain & flush_domains))
744 		return;
745 
746 	switch (obj->write_domain) {
747 	case I915_GEM_DOMAIN_GTT:
748 		i915_gem_flush_ggtt_writes(dev_priv);
749 
750 		intel_fb_obj_flush(obj,
751 				   fb_write_origin(obj, I915_GEM_DOMAIN_GTT));
752 
753 		for_each_ggtt_vma(vma, obj) {
754 			if (vma->iomap)
755 				continue;
756 
757 			i915_vma_unset_ggtt_write(vma);
758 		}
759 		break;
760 
761 	case I915_GEM_DOMAIN_WC:
762 		wmb();
763 		break;
764 
765 	case I915_GEM_DOMAIN_CPU:
766 		i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
767 		break;
768 
769 	case I915_GEM_DOMAIN_RENDER:
770 		if (gpu_write_needs_clflush(obj))
771 			obj->cache_dirty = true;
772 		break;
773 	}
774 
775 	obj->write_domain = 0;
776 }
777 
778 /*
779  * Pins the specified object's pages and synchronizes the object with
780  * GPU accesses. Sets needs_clflush to non-zero if the caller should
781  * flush the object from the CPU cache.
782  */
783 int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
784 				    unsigned int *needs_clflush)
785 {
786 	int ret;
787 
788 	lockdep_assert_held(&obj->base.dev->struct_mutex);
789 
790 	*needs_clflush = 0;
791 	if (!i915_gem_object_has_struct_page(obj))
792 		return -ENODEV;
793 
794 	ret = i915_gem_object_wait(obj,
795 				   I915_WAIT_INTERRUPTIBLE |
796 				   I915_WAIT_LOCKED,
797 				   MAX_SCHEDULE_TIMEOUT);
798 	if (ret)
799 		return ret;
800 
801 	ret = i915_gem_object_pin_pages(obj);
802 	if (ret)
803 		return ret;
804 
805 	if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
806 	    !static_cpu_has(X86_FEATURE_CLFLUSH)) {
807 		ret = i915_gem_object_set_to_cpu_domain(obj, false);
808 		if (ret)
809 			goto err_unpin;
810 		else
811 			goto out;
812 	}
813 
814 	flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
815 
816 	/* If we're not in the cpu read domain, set ourself into the gtt
817 	 * read domain and manually flush cachelines (if required). This
818 	 * optimizes for the case when the gpu will dirty the data
819 	 * again anyway before the next pread happens.
820 	 */
821 	if (!obj->cache_dirty &&
822 	    !(obj->read_domains & I915_GEM_DOMAIN_CPU))
823 		*needs_clflush = CLFLUSH_BEFORE;
824 
825 out:
826 	/* return with the pages pinned */
827 	return 0;
828 
829 err_unpin:
830 	i915_gem_object_unpin_pages(obj);
831 	return ret;
832 }
833 
834 int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
835 				     unsigned int *needs_clflush)
836 {
837 	int ret;
838 
839 	lockdep_assert_held(&obj->base.dev->struct_mutex);
840 
841 	*needs_clflush = 0;
842 	if (!i915_gem_object_has_struct_page(obj))
843 		return -ENODEV;
844 
845 	ret = i915_gem_object_wait(obj,
846 				   I915_WAIT_INTERRUPTIBLE |
847 				   I915_WAIT_LOCKED |
848 				   I915_WAIT_ALL,
849 				   MAX_SCHEDULE_TIMEOUT);
850 	if (ret)
851 		return ret;
852 
853 	ret = i915_gem_object_pin_pages(obj);
854 	if (ret)
855 		return ret;
856 
857 	if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
858 	    !static_cpu_has(X86_FEATURE_CLFLUSH)) {
859 		ret = i915_gem_object_set_to_cpu_domain(obj, true);
860 		if (ret)
861 			goto err_unpin;
862 		else
863 			goto out;
864 	}
865 
866 	flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
867 
868 	/* If we're not in the cpu write domain, set ourself into the
869 	 * gtt write domain and manually flush cachelines (as required).
870 	 * This optimizes for the case when the gpu will use the data
871 	 * right away and we therefore have to clflush anyway.
872 	 */
873 	if (!obj->cache_dirty) {
874 		*needs_clflush |= CLFLUSH_AFTER;
875 
876 		/*
877 		 * Same trick applies to invalidate partially written
878 		 * cachelines read before writing.
879 		 */
880 		if (!(obj->read_domains & I915_GEM_DOMAIN_CPU))
881 			*needs_clflush |= CLFLUSH_BEFORE;
882 	}
883 
884 out:
885 	intel_fb_obj_invalidate(obj, ORIGIN_CPU);
886 	obj->mm.dirty = true;
887 	/* return with the pages pinned */
888 	return 0;
889 
890 err_unpin:
891 	i915_gem_object_unpin_pages(obj);
892 	return ret;
893 }
894 
895 static int
896 shmem_pread(struct page *page, int offset, int len, char __user *user_data,
897 	    bool needs_clflush)
898 {
899 	char *vaddr;
900 	int ret;
901 
902 	vaddr = kmap(page);
903 
904 	if (needs_clflush)
905 		drm_clflush_virt_range(vaddr + offset, len);
906 
907 	ret = __copy_to_user(user_data, vaddr + offset, len);
908 
909 	kunmap(page);
910 
911 	return ret ? -EFAULT : 0;
912 }
913 
914 static int
915 i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
916 		     struct drm_i915_gem_pread *args)
917 {
918 	char __user *user_data;
919 	u64 remain;
920 	unsigned int needs_clflush;
921 	unsigned int idx, offset;
922 	int ret;
923 
924 	ret = mutex_lock_interruptible(&obj->base.dev->struct_mutex);
925 	if (ret)
926 		return ret;
927 
928 	ret = i915_gem_obj_prepare_shmem_read(obj, &needs_clflush);
929 	mutex_unlock(&obj->base.dev->struct_mutex);
930 	if (ret)
931 		return ret;
932 
933 	remain = args->size;
934 	user_data = u64_to_user_ptr(args->data_ptr);
935 	offset = offset_in_page(args->offset);
936 	for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
937 		struct page *page = i915_gem_object_get_page(obj, idx);
938 		unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
939 
940 		ret = shmem_pread(page, offset, length, user_data,
941 				  needs_clflush);
942 		if (ret)
943 			break;
944 
945 		remain -= length;
946 		user_data += length;
947 		offset = 0;
948 	}
949 
950 	i915_gem_obj_finish_shmem_access(obj);
951 	return ret;
952 }
953 
954 static inline bool
955 gtt_user_read(struct io_mapping *mapping,
956 	      loff_t base, int offset,
957 	      char __user *user_data, int length)
958 {
959 	void __iomem *vaddr;
960 	unsigned long unwritten;
961 
962 	/* We can use the cpu mem copy function because this is X86. */
963 	vaddr = io_mapping_map_atomic_wc(mapping, base);
964 	unwritten = __copy_to_user_inatomic(user_data,
965 					    (void __force *)vaddr + offset,
966 					    length);
967 	io_mapping_unmap_atomic(vaddr);
968 	if (unwritten) {
969 		vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
970 		unwritten = copy_to_user(user_data,
971 					 (void __force *)vaddr + offset,
972 					 length);
973 		io_mapping_unmap(vaddr);
974 	}
975 	return unwritten;
976 }
977 
978 static int
979 i915_gem_gtt_pread(struct drm_i915_gem_object *obj,
980 		   const struct drm_i915_gem_pread *args)
981 {
982 	struct drm_i915_private *i915 = to_i915(obj->base.dev);
983 	struct i915_ggtt *ggtt = &i915->ggtt;
984 	intel_wakeref_t wakeref;
985 	struct drm_mm_node node;
986 	struct i915_vma *vma;
987 	void __user *user_data;
988 	u64 remain, offset;
989 	int ret;
990 
991 	ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
992 	if (ret)
993 		return ret;
994 
995 	wakeref = intel_runtime_pm_get(i915);
996 	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
997 				       PIN_MAPPABLE |
998 				       PIN_NONFAULT |
999 				       PIN_NONBLOCK);
1000 	if (!IS_ERR(vma)) {
1001 		node.start = i915_ggtt_offset(vma);
1002 		node.allocated = false;
1003 		ret = i915_vma_put_fence(vma);
1004 		if (ret) {
1005 			i915_vma_unpin(vma);
1006 			vma = ERR_PTR(ret);
1007 		}
1008 	}
1009 	if (IS_ERR(vma)) {
1010 		ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1011 		if (ret)
1012 			goto out_unlock;
1013 		GEM_BUG_ON(!node.allocated);
1014 	}
1015 
1016 	ret = i915_gem_object_set_to_gtt_domain(obj, false);
1017 	if (ret)
1018 		goto out_unpin;
1019 
1020 	mutex_unlock(&i915->drm.struct_mutex);
1021 
1022 	user_data = u64_to_user_ptr(args->data_ptr);
1023 	remain = args->size;
1024 	offset = args->offset;
1025 
1026 	while (remain > 0) {
1027 		/* Operation in this page
1028 		 *
1029 		 * page_base = page offset within aperture
1030 		 * page_offset = offset within page
1031 		 * page_length = bytes to copy for this page
1032 		 */
1033 		u32 page_base = node.start;
1034 		unsigned page_offset = offset_in_page(offset);
1035 		unsigned page_length = PAGE_SIZE - page_offset;
1036 		page_length = remain < page_length ? remain : page_length;
1037 		if (node.allocated) {
1038 			wmb();
1039 			ggtt->vm.insert_page(&ggtt->vm,
1040 					     i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1041 					     node.start, I915_CACHE_NONE, 0);
1042 			wmb();
1043 		} else {
1044 			page_base += offset & PAGE_MASK;
1045 		}
1046 
1047 		if (gtt_user_read(&ggtt->iomap, page_base, page_offset,
1048 				  user_data, page_length)) {
1049 			ret = -EFAULT;
1050 			break;
1051 		}
1052 
1053 		remain -= page_length;
1054 		user_data += page_length;
1055 		offset += page_length;
1056 	}
1057 
1058 	mutex_lock(&i915->drm.struct_mutex);
1059 out_unpin:
1060 	if (node.allocated) {
1061 		wmb();
1062 		ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1063 		remove_mappable_node(&node);
1064 	} else {
1065 		i915_vma_unpin(vma);
1066 	}
1067 out_unlock:
1068 	intel_runtime_pm_put(i915, wakeref);
1069 	mutex_unlock(&i915->drm.struct_mutex);
1070 
1071 	return ret;
1072 }
1073 
1074 /**
1075  * Reads data from the object referenced by handle.
1076  * @dev: drm device pointer
1077  * @data: ioctl data blob
1078  * @file: drm file pointer
1079  *
1080  * On error, the contents of *data are undefined.
1081  */
1082 int
1083 i915_gem_pread_ioctl(struct drm_device *dev, void *data,
1084 		     struct drm_file *file)
1085 {
1086 	struct drm_i915_gem_pread *args = data;
1087 	struct drm_i915_gem_object *obj;
1088 	int ret;
1089 
1090 	if (args->size == 0)
1091 		return 0;
1092 
1093 	if (!access_ok(u64_to_user_ptr(args->data_ptr),
1094 		       args->size))
1095 		return -EFAULT;
1096 
1097 	obj = i915_gem_object_lookup(file, args->handle);
1098 	if (!obj)
1099 		return -ENOENT;
1100 
1101 	/* Bounds check source.  */
1102 	if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1103 		ret = -EINVAL;
1104 		goto out;
1105 	}
1106 
1107 	trace_i915_gem_object_pread(obj, args->offset, args->size);
1108 
1109 	ret = i915_gem_object_wait(obj,
1110 				   I915_WAIT_INTERRUPTIBLE,
1111 				   MAX_SCHEDULE_TIMEOUT);
1112 	if (ret)
1113 		goto out;
1114 
1115 	ret = i915_gem_object_pin_pages(obj);
1116 	if (ret)
1117 		goto out;
1118 
1119 	ret = i915_gem_shmem_pread(obj, args);
1120 	if (ret == -EFAULT || ret == -ENODEV)
1121 		ret = i915_gem_gtt_pread(obj, args);
1122 
1123 	i915_gem_object_unpin_pages(obj);
1124 out:
1125 	i915_gem_object_put(obj);
1126 	return ret;
1127 }
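/*
 * A minimal userspace sketch of reading an object back, assuming an open
 * DRM fd, a valid GEM handle and libdrm's drmIoctl():
 *
 *	char buf[4096];
 *	struct drm_i915_gem_pread pread = {
 *		.handle = handle,
 *		.offset = 0,
 *		.size = sizeof(buf),
 *		.data_ptr = (uintptr_t)buf,
 *	};
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_PREAD, &pread))
 *		perror("DRM_IOCTL_I915_GEM_PREAD");
 */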
1128 
1129 /* This is the fast write path which cannot handle
1130  * page faults in the source data
1131  */
1132 
1133 static inline bool
1134 ggtt_write(struct io_mapping *mapping,
1135 	   loff_t base, int offset,
1136 	   char __user *user_data, int length)
1137 {
1138 	void __iomem *vaddr;
1139 	unsigned long unwritten;
1140 
1141 	/* We can use the cpu mem copy function because this is X86. */
1142 	vaddr = io_mapping_map_atomic_wc(mapping, base);
1143 	unwritten = __copy_from_user_inatomic_nocache((void __force *)vaddr + offset,
1144 						      user_data, length);
1145 	io_mapping_unmap_atomic(vaddr);
1146 	if (unwritten) {
1147 		vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1148 		unwritten = copy_from_user((void __force *)vaddr + offset,
1149 					   user_data, length);
1150 		io_mapping_unmap(vaddr);
1151 	}
1152 
1153 	return unwritten;
1154 }
1155 
1156 /**
1157  * This is the fast pwrite path, where we copy the data directly from the
1158  * user into the GTT, uncached.
1159  * @obj: i915 GEM object
1160  * @args: pwrite arguments structure
1161  */
1162 static int
1163 i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj,
1164 			 const struct drm_i915_gem_pwrite *args)
1165 {
1166 	struct drm_i915_private *i915 = to_i915(obj->base.dev);
1167 	struct i915_ggtt *ggtt = &i915->ggtt;
1168 	intel_wakeref_t wakeref;
1169 	struct drm_mm_node node;
1170 	struct i915_vma *vma;
1171 	u64 remain, offset;
1172 	void __user *user_data;
1173 	int ret;
1174 
1175 	ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1176 	if (ret)
1177 		return ret;
1178 
1179 	if (i915_gem_object_has_struct_page(obj)) {
1180 		/*
1181 		 * Avoid waking the device up if we can fall back, as
1182 		 * waking/resuming is very slow (worst-case 10-100 ms
1183 		 * depending on PCI sleeps and our own resume time).
1184 		 * This easily dwarfs any performance advantage from
1185 		 * using the cache bypass of indirect GGTT access.
1186 		 */
1187 		wakeref = intel_runtime_pm_get_if_in_use(i915);
1188 		if (!wakeref) {
1189 			ret = -EFAULT;
1190 			goto out_unlock;
1191 		}
1192 	} else {
1193 		/* No backing pages, no fallback, we must force GGTT access */
1194 		wakeref = intel_runtime_pm_get(i915);
1195 	}
1196 
1197 	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1198 				       PIN_MAPPABLE |
1199 				       PIN_NONFAULT |
1200 				       PIN_NONBLOCK);
1201 	if (!IS_ERR(vma)) {
1202 		node.start = i915_ggtt_offset(vma);
1203 		node.allocated = false;
1204 		ret = i915_vma_put_fence(vma);
1205 		if (ret) {
1206 			i915_vma_unpin(vma);
1207 			vma = ERR_PTR(ret);
1208 		}
1209 	}
1210 	if (IS_ERR(vma)) {
1211 		ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1212 		if (ret)
1213 			goto out_rpm;
1214 		GEM_BUG_ON(!node.allocated);
1215 	}
1216 
1217 	ret = i915_gem_object_set_to_gtt_domain(obj, true);
1218 	if (ret)
1219 		goto out_unpin;
1220 
1221 	mutex_unlock(&i915->drm.struct_mutex);
1222 
1223 	intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1224 
1225 	user_data = u64_to_user_ptr(args->data_ptr);
1226 	offset = args->offset;
1227 	remain = args->size;
1228 	while (remain) {
1229 		/* Operation in this page
1230 		 *
1231 		 * page_base = page offset within aperture
1232 		 * page_offset = offset within page
1233 		 * page_length = bytes to copy for this page
1234 		 */
1235 		u32 page_base = node.start;
1236 		unsigned int page_offset = offset_in_page(offset);
1237 		unsigned int page_length = PAGE_SIZE - page_offset;
1238 		page_length = remain < page_length ? remain : page_length;
1239 		if (node.allocated) {
1240 			wmb(); /* flush the write before we modify the GGTT */
1241 			ggtt->vm.insert_page(&ggtt->vm,
1242 					     i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1243 					     node.start, I915_CACHE_NONE, 0);
1244 			wmb(); /* flush modifications to the GGTT (insert_page) */
1245 		} else {
1246 			page_base += offset & PAGE_MASK;
1247 		}
1248 		/* If we get a fault while copying data, then (presumably) our
1249 		 * source page isn't available.  Return the error and we'll
1250 		 * retry in the slow path.
1251 		 * If the object is non-shmem backed, we retry with the
1252 		 * path that handles page faults.
1253 		 */
1254 		if (ggtt_write(&ggtt->iomap, page_base, page_offset,
1255 			       user_data, page_length)) {
1256 			ret = -EFAULT;
1257 			break;
1258 		}
1259 
1260 		remain -= page_length;
1261 		user_data += page_length;
1262 		offset += page_length;
1263 	}
1264 	intel_fb_obj_flush(obj, ORIGIN_CPU);
1265 
1266 	mutex_lock(&i915->drm.struct_mutex);
1267 out_unpin:
1268 	if (node.allocated) {
1269 		wmb();
1270 		ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1271 		remove_mappable_node(&node);
1272 	} else {
1273 		i915_vma_unpin(vma);
1274 	}
1275 out_rpm:
1276 	intel_runtime_pm_put(i915, wakeref);
1277 out_unlock:
1278 	mutex_unlock(&i915->drm.struct_mutex);
1279 	return ret;
1280 }
1281 
1282 /* Per-page copy function for the shmem pwrite fastpath.
1283  * Flushes invalid cachelines before writing to the target if
1284  * needs_clflush_before is set and flushes out any written cachelines after
1285  * writing if needs_clflush is set.
1286  */
1287 static int
1288 shmem_pwrite(struct page *page, int offset, int len, char __user *user_data,
1289 	     bool needs_clflush_before,
1290 	     bool needs_clflush_after)
1291 {
1292 	char *vaddr;
1293 	int ret;
1294 
1295 	vaddr = kmap(page);
1296 
1297 	if (needs_clflush_before)
1298 		drm_clflush_virt_range(vaddr + offset, len);
1299 
1300 	ret = __copy_from_user(vaddr + offset, user_data, len);
1301 	if (!ret && needs_clflush_after)
1302 		drm_clflush_virt_range(vaddr + offset, len);
1303 
1304 	kunmap(page);
1305 
1306 	return ret ? -EFAULT : 0;
1307 }
1308 
1309 static int
1310 i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
1311 		      const struct drm_i915_gem_pwrite *args)
1312 {
1313 	struct drm_i915_private *i915 = to_i915(obj->base.dev);
1314 	void __user *user_data;
1315 	u64 remain;
1316 	unsigned int partial_cacheline_write;
1317 	unsigned int needs_clflush;
1318 	unsigned int offset, idx;
1319 	int ret;
1320 
1321 	ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1322 	if (ret)
1323 		return ret;
1324 
1325 	ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
1326 	mutex_unlock(&i915->drm.struct_mutex);
1327 	if (ret)
1328 		return ret;
1329 
1330 	/* If we don't overwrite a cacheline completely we need to be
1331 	 * careful to have up-to-date data by first clflushing. Don't
1332 	 * overcomplicate things and flush the entire page.
1333 	 */
1334 	partial_cacheline_write = 0;
1335 	if (needs_clflush & CLFLUSH_BEFORE)
1336 		partial_cacheline_write = boot_cpu_data.x86_clflush_size - 1;
1337 
1338 	user_data = u64_to_user_ptr(args->data_ptr);
1339 	remain = args->size;
1340 	offset = offset_in_page(args->offset);
1341 	for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1342 		struct page *page = i915_gem_object_get_page(obj, idx);
1343 		unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
1344 
1345 		ret = shmem_pwrite(page, offset, length, user_data,
1346 				   (offset | length) & partial_cacheline_write,
1347 				   needs_clflush & CLFLUSH_AFTER);
1348 		if (ret)
1349 			break;
1350 
1351 		remain -= length;
1352 		user_data += length;
1353 		offset = 0;
1354 	}
1355 
1356 	intel_fb_obj_flush(obj, ORIGIN_CPU);
1357 	i915_gem_obj_finish_shmem_access(obj);
1358 	return ret;
1359 }
1360 
1361 /**
1362  * Writes data to the object referenced by handle.
1363  * @dev: drm device
1364  * @data: ioctl data blob
1365  * @file: drm file
1366  *
1367  * On error, the contents of the buffer that were to be modified are undefined.
1368  */
1369 int
1370 i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
1371 		      struct drm_file *file)
1372 {
1373 	struct drm_i915_gem_pwrite *args = data;
1374 	struct drm_i915_gem_object *obj;
1375 	int ret;
1376 
1377 	if (args->size == 0)
1378 		return 0;
1379 
1380 	if (!access_ok(u64_to_user_ptr(args->data_ptr), args->size))
1381 		return -EFAULT;
1382 
1383 	obj = i915_gem_object_lookup(file, args->handle);
1384 	if (!obj)
1385 		return -ENOENT;
1386 
1387 	/* Bounds check destination. */
1388 	if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1389 		ret = -EINVAL;
1390 		goto err;
1391 	}
1392 
1393 	/* Writes not allowed into this read-only object */
1394 	if (i915_gem_object_is_readonly(obj)) {
1395 		ret = -EINVAL;
1396 		goto err;
1397 	}
1398 
1399 	trace_i915_gem_object_pwrite(obj, args->offset, args->size);
1400 
1401 	ret = -ENODEV;
1402 	if (obj->ops->pwrite)
1403 		ret = obj->ops->pwrite(obj, args);
1404 	if (ret != -ENODEV)
1405 		goto err;
1406 
1407 	ret = i915_gem_object_wait(obj,
1408 				   I915_WAIT_INTERRUPTIBLE |
1409 				   I915_WAIT_ALL,
1410 				   MAX_SCHEDULE_TIMEOUT);
1411 	if (ret)
1412 		goto err;
1413 
1414 	ret = i915_gem_object_pin_pages(obj);
1415 	if (ret)
1416 		goto err;
1417 
1418 	ret = -EFAULT;
1419 	/* We can only do the GTT pwrite on untiled buffers, as otherwise
1420 	 * it would end up going through the fenced access, and we'll get
1421 	 * different detiling behavior between reading and writing.
1422 	 * pread/pwrite currently are reading and writing from the CPU
1423 	 * perspective, requiring manual detiling by the client.
1424 	 */
1425 	if (!i915_gem_object_has_struct_page(obj) ||
1426 	    cpu_write_needs_clflush(obj))
1427 		/* Note that the gtt paths might fail with non-page-backed user
1428 		 * pointers (e.g. gtt mappings when moving data between
1429 		 * textures). Fallback to the shmem path in that case.
1430 		 */
1431 		ret = i915_gem_gtt_pwrite_fast(obj, args);
1432 
1433 	if (ret == -EFAULT || ret == -ENOSPC) {
1434 		if (obj->phys_handle)
1435 			ret = i915_gem_phys_pwrite(obj, args, file);
1436 		else
1437 			ret = i915_gem_shmem_pwrite(obj, args);
1438 	}
1439 
1440 	i915_gem_object_unpin_pages(obj);
1441 err:
1442 	i915_gem_object_put(obj);
1443 	return ret;
1444 }
1445 
1446 static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
1447 {
1448 	struct drm_i915_private *i915 = to_i915(obj->base.dev);
1449 	struct list_head *list;
1450 	struct i915_vma *vma;
1451 
1452 	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
1453 
1454 	mutex_lock(&i915->ggtt.vm.mutex);
1455 	for_each_ggtt_vma(vma, obj) {
1456 		if (!drm_mm_node_allocated(&vma->node))
1457 			continue;
1458 
1459 		list_move_tail(&vma->vm_link, &vma->vm->bound_list);
1460 	}
1461 	mutex_unlock(&i915->ggtt.vm.mutex);
1462 
1463 	spin_lock(&i915->mm.obj_lock);
1464 	list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list;
1465 	list_move_tail(&obj->mm.link, list);
1466 	spin_unlock(&i915->mm.obj_lock);
1467 }
1468 
1469 /**
1470  * Called when user space prepares to use an object with the CPU, either
1471  * through the mmap ioctl's mapping or a GTT mapping.
1472  * @dev: drm device
1473  * @data: ioctl data blob
1474  * @file: drm file
1475  */
1476 int
1477 i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
1478 			  struct drm_file *file)
1479 {
1480 	struct drm_i915_gem_set_domain *args = data;
1481 	struct drm_i915_gem_object *obj;
1482 	u32 read_domains = args->read_domains;
1483 	u32 write_domain = args->write_domain;
1484 	int err;
1485 
1486 	/* Only handle setting domains to types used by the CPU. */
1487 	if ((write_domain | read_domains) & I915_GEM_GPU_DOMAINS)
1488 		return -EINVAL;
1489 
1490 	/*
1491 	 * Having something in the write domain implies it's in the read
1492 	 * domain, and only that read domain.  Enforce that in the request.
1493 	 */
1494 	if (write_domain && read_domains != write_domain)
1495 		return -EINVAL;
1496 
1497 	if (!read_domains)
1498 		return 0;
1499 
1500 	obj = i915_gem_object_lookup(file, args->handle);
1501 	if (!obj)
1502 		return -ENOENT;
1503 
1504 	/*
1505 	 * Already in the desired write domain? Nothing for us to do!
1506 	 *
1507 	 * We apply a little bit of cunning here to catch a broader set of
1508 	 * no-ops. If obj->write_domain is set, we must be in the same
1509 	 * obj->read_domains, and only that domain. Therefore, if that
1510 	 * obj->write_domain matches the request read_domains, we are
1511 	 * already in the same read/write domain and can skip the operation,
1512 	 * without having to further check the requested write_domain.
1513 	 */
1514 	if (READ_ONCE(obj->write_domain) == read_domains) {
1515 		err = 0;
1516 		goto out;
1517 	}
1518 
1519 	/*
1520 	 * Try to flush the object off the GPU without holding the lock.
1521 	 * We will repeat the flush holding the lock in the normal manner
1522 	 * to catch cases where we are gazumped.
1523 	 */
1524 	err = i915_gem_object_wait(obj,
1525 				   I915_WAIT_INTERRUPTIBLE |
1526 				   I915_WAIT_PRIORITY |
1527 				   (write_domain ? I915_WAIT_ALL : 0),
1528 				   MAX_SCHEDULE_TIMEOUT);
1529 	if (err)
1530 		goto out;
1531 
1532 	/*
1533 	 * Proxy objects do not control access to the backing storage, ergo
1534 	 * they cannot be used as a means to manipulate the cache domain
1535 	 * tracking for that backing storage. The proxy object is always
1536 	 * considered to be outside of any cache domain.
1537 	 */
1538 	if (i915_gem_object_is_proxy(obj)) {
1539 		err = -ENXIO;
1540 		goto out;
1541 	}
1542 
1543 	/*
1544 	 * Flush and acquire obj->pages so that we are coherent through
1545 	 * direct access in memory with previous cached writes through
1546 	 * shmemfs and that our cache domain tracking remains valid.
1547 	 * For example, if the obj->filp was moved to swap without us
1548 	 * being notified and releasing the pages, we would mistakenly
1549 	 * continue to assume that the obj remained out of the CPU cached
1550 	 * domain.
1551 	 */
1552 	err = i915_gem_object_pin_pages(obj);
1553 	if (err)
1554 		goto out;
1555 
1556 	err = i915_mutex_lock_interruptible(dev);
1557 	if (err)
1558 		goto out_unpin;
1559 
1560 	if (read_domains & I915_GEM_DOMAIN_WC)
1561 		err = i915_gem_object_set_to_wc_domain(obj, write_domain);
1562 	else if (read_domains & I915_GEM_DOMAIN_GTT)
1563 		err = i915_gem_object_set_to_gtt_domain(obj, write_domain);
1564 	else
1565 		err = i915_gem_object_set_to_cpu_domain(obj, write_domain);
1566 
1567 	/* And bump the LRU for this access */
1568 	i915_gem_object_bump_inactive_ggtt(obj);
1569 
1570 	mutex_unlock(&dev->struct_mutex);
1571 
1572 	if (write_domain != 0)
1573 		intel_fb_obj_invalidate(obj,
1574 					fb_write_origin(obj, write_domain));
1575 
1576 out_unpin:
1577 	i915_gem_object_unpin_pages(obj);
1578 out:
1579 	i915_gem_object_put(obj);
1580 	return err;
1581 }
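/*
 * A minimal userspace sketch, assuming an open DRM fd and libdrm's
 * drmIoctl(): before CPU access through an mmap, move the object into the
 * CPU domain for both reads and writes.
 *
 *	struct drm_i915_gem_set_domain sd = {
 *		.handle = handle,
 *		.read_domains = I915_GEM_DOMAIN_CPU,
 *		.write_domain = I915_GEM_DOMAIN_CPU,
 *	};
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd))
 *		perror("DRM_IOCTL_I915_GEM_SET_DOMAIN");
 */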
1582 
1583 /**
1584  * Called when user space has done writes to this buffer
1585  * @dev: drm device
1586  * @data: ioctl data blob
1587  * @file: drm file
1588  */
1589 int
1590 i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
1591 			 struct drm_file *file)
1592 {
1593 	struct drm_i915_gem_sw_finish *args = data;
1594 	struct drm_i915_gem_object *obj;
1595 
1596 	obj = i915_gem_object_lookup(file, args->handle);
1597 	if (!obj)
1598 		return -ENOENT;
1599 
1600 	/*
1601 	 * Proxy objects are barred from CPU access, so there is no
1602 	 * need to ban sw_finish as it is a nop.
1603 	 */
1604 
1605 	/* Pinned buffers may be scanout, so flush the cache */
1606 	i915_gem_object_flush_if_display(obj);
1607 	i915_gem_object_put(obj);
1608 
1609 	return 0;
1610 }
1611 
1612 static inline bool
1613 __vma_matches(struct vm_area_struct *vma, struct file *filp,
1614 	      unsigned long addr, unsigned long size)
1615 {
1616 	if (vma->vm_file != filp)
1617 		return false;
1618 
1619 	return vma->vm_start == addr &&
1620 	       (vma->vm_end - vma->vm_start) == PAGE_ALIGN(size);
1621 }
1622 
1623 /**
1624  * i915_gem_mmap_ioctl - Maps the contents of an object, returning the address
1625  *			 it is mapped to.
1626  * @dev: drm device
1627  * @data: ioctl data blob
1628  * @file: drm file
1629  *
1630  * While the mapping holds a reference on the contents of the object, it doesn't
1631  * imply a ref on the object itself.
1632  *
1633  * IMPORTANT:
1634  *
1635  * DRM driver writers who look at this function as an example for how to do GEM
1636  * mmap support, please don't implement mmap support like this. The modern way
1637  * to implement DRM mmap support is with an mmap offset ioctl (like
1638  * i915_gem_mmap_gtt) and then using the mmap syscall on the DRM fd directly.
1639  * That way debug tooling like valgrind will understand what's going on; hiding
1640  * the mmap call in a driver private ioctl will break that. The i915 driver only
1641  * does cpu mmaps this way because we didn't know better.
1642  */
1643 int
1644 i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
1645 		    struct drm_file *file)
1646 {
1647 	struct drm_i915_gem_mmap *args = data;
1648 	struct drm_i915_gem_object *obj;
1649 	unsigned long addr;
1650 
1651 	if (args->flags & ~(I915_MMAP_WC))
1652 		return -EINVAL;
1653 
1654 	if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT))
1655 		return -ENODEV;
1656 
1657 	obj = i915_gem_object_lookup(file, args->handle);
1658 	if (!obj)
1659 		return -ENOENT;
1660 
1661 	/* prime objects have no backing filp to GEM mmap
1662 	 * pages from.
1663 	 */
1664 	if (!obj->base.filp) {
1665 		addr = -ENXIO;
1666 		goto err;
1667 	}
1668 
1669 	if (range_overflows(args->offset, args->size, (u64)obj->base.size)) {
1670 		addr = -EINVAL;
1671 		goto err;
1672 	}
1673 
1674 	addr = vm_mmap(obj->base.filp, 0, args->size,
1675 		       PROT_READ | PROT_WRITE, MAP_SHARED,
1676 		       args->offset);
1677 	if (IS_ERR_VALUE(addr))
1678 		goto err;
1679 
1680 	if (args->flags & I915_MMAP_WC) {
1681 		struct mm_struct *mm = current->mm;
1682 		struct vm_area_struct *vma;
1683 
1684 		if (down_write_killable(&mm->mmap_sem)) {
1685 			addr = -EINTR;
1686 			goto err;
1687 		}
1688 		vma = find_vma(mm, addr);
1689 		if (vma && __vma_matches(vma, obj->base.filp, addr, args->size))
1690 			vma->vm_page_prot =
1691 				pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
1692 		else
1693 			addr = -ENOMEM;
1694 		up_write(&mm->mmap_sem);
1695 		if (IS_ERR_VALUE(addr))
1696 			goto err;
1697 
1698 		/* This may race, but that's ok, it only gets set */
1699 		WRITE_ONCE(obj->frontbuffer_ggtt_origin, ORIGIN_CPU);
1700 	}
1701 	i915_gem_object_put(obj);
1702 
1703 	args->addr_ptr = (u64)addr;
1704 	return 0;
1705 
1706 err:
1707 	i915_gem_object_put(obj);
1708 	return addr;
1709 }
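/*
 * A minimal userspace sketch of requesting a write-combined CPU mapping
 * through this ioctl, assuming an open DRM fd, a valid GEM handle of
 * obj_size bytes, and libdrm's drmIoctl():
 *
 *	struct drm_i915_gem_mmap mmap_arg = {
 *		.handle = handle,
 *		.size = obj_size,
 *		.flags = I915_MMAP_WC,
 *	};
 *	void *ptr = NULL;
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg) == 0)
 *		ptr = (void *)(uintptr_t)mmap_arg.addr_ptr;
 */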
1710 
1711 static unsigned int tile_row_pages(const struct drm_i915_gem_object *obj)
1712 {
1713 	return i915_gem_object_get_tile_row_size(obj) >> PAGE_SHIFT;
1714 }
1715 
1716 /**
1717  * i915_gem_mmap_gtt_version - report the current feature set for GTT mmaps
1718  *
1719  * A history of the GTT mmap interface:
1720  *
1721  * 0 - Everything had to fit into the GTT. Both parties of a memcpy had to
1722  *     be aligned and suitable for fencing, and still fit into the available
1723  *     mappable space left by the pinned display objects. A classic problem
1724  *     we called the page-fault-of-doom where we would ping-pong between
1725  *     two objects that could not fit inside the GTT and so the memcpy
1726  *     would page one object in at the expense of the other between every
1727  *     single byte.
1728  *
1729  * 1 - Objects can be any size, and have any compatible fencing (X, Y, or none
1730  *     as set via i915_gem_set_tiling() [DRM_I915_GEM_SET_TILING]). If the
1731  *     object is too large for the available space (or simply too large
1732  *     for the mappable aperture!), a view is created instead and faulted
1733  *     into userspace. (This view is aligned and sized appropriately for
1734  *     fenced access.)
1735  *
1736  * 2 - Recognise WC as a separate cache domain so that we can flush the
1737  *     delayed writes via GTT before performing direct access via WC.
1738  *
1739  * 3 - Remove implicit set-domain(GTT) and synchronisation on initial
1740  *     pagefault; swapin remains transparent.
1741  *
1742  * Restrictions:
1743  *
1744  *  * snoopable objects cannot be accessed via the GTT. It can cause machine
1745  *  * snoopable objects cannot be accessed via the GTT. Doing so can cause machine
1746  *    a GTT page fault from a snoopable object will generate a SIGBUS.
1747  *
1748  *  * the object must be able to fit into RAM (physical memory, though not
1749  *    limited to the mappable aperture).
1750  *
1751  *
1752  * Caveats:
1753  *
1754  *  * a new GTT page fault will synchronize rendering from the GPU and flush
1755  *    all data to system memory. Subsequent access will not be synchronized.
1756  *
1757  *  * all mappings are revoked on runtime device suspend.
1758  *
1759  *  * there are only 8, 16 or 32 fence registers to share between all users
1760  *    (older machines require a fence register for display and blitter access
1761  *    as well). Contention of the fence registers will cause the previous users
1762  *    to be unmapped and any new access will generate new page faults.
1763  *
1764  *  * running out of memory while servicing a fault may generate a SIGBUS,
1765  *    rather than the expected SIGSEGV.
1766  */
1767 int i915_gem_mmap_gtt_version(void)
1768 {
1769 	return 3;
1770 }
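/*
 * Userspace discovers this feature level through GETPARAM; a minimal
 * sketch, assuming an open DRM fd and libdrm's drmIoctl(), falling back to
 * version 0 (the original behaviour) if the parameter is unknown:
 *
 *	int gtt_version = 0;
 *	struct drm_i915_getparam gp = {
 *		.param = I915_PARAM_MMAP_GTT_VERSION,
 *		.value = &gtt_version,
 *	};
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
 *		gtt_version = 0;
 */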
1771 
1772 static inline struct i915_ggtt_view
1773 compute_partial_view(const struct drm_i915_gem_object *obj,
1774 		     pgoff_t page_offset,
1775 		     unsigned int chunk)
1776 {
1777 	struct i915_ggtt_view view;
1778 
1779 	if (i915_gem_object_is_tiled(obj))
1780 		chunk = roundup(chunk, tile_row_pages(obj));
1781 
1782 	view.type = I915_GGTT_VIEW_PARTIAL;
1783 	view.partial.offset = rounddown(page_offset, chunk);
1784 	view.partial.size =
1785 		min_t(unsigned int, chunk,
1786 		      (obj->base.size >> PAGE_SHIFT) - view.partial.offset);
1787 
1788 	/* If the partial covers the entire object, just create a normal VMA. */
1789 	if (chunk >= obj->base.size >> PAGE_SHIFT)
1790 		view.type = I915_GGTT_VIEW_NORMAL;
1791 
1792 	return view;
1793 }
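/*
 * Worked example: with 4 KiB pages, the MIN_CHUNK_PAGES chunk used by
 * i915_gem_fault() below is 1 MiB / 4 KiB = 256 pages. An untiled fault at
 * page_offset 1000 of a large object therefore yields
 * view.partial.offset = rounddown(1000, 256) = 768 and
 * view.partial.size = min(256, remaining pages); a tiled object first
 * rounds the chunk up to a whole number of tile rows.
 */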
1794 
1795 /**
1796  * i915_gem_fault - fault a page into the GTT
1797  * @vmf: fault info
1798  *
1799  * The fault handler is set up by drm_gem_mmap() when an object is GTT mapped
1800  * from userspace.  The fault handler takes care of binding the object to
1801  * the GTT (if needed), allocating and programming a fence register (again,
1802  * only if needed based on whether the old reg is still valid or the object
1803  * is tiled) and inserting a new PTE into the faulting process.
1804  *
1805  * Note that the faulting process may involve evicting existing objects
1806  * from the GTT and/or fence registers to make room.  So performance may
1807  * suffer if the GTT working set is large or there are few fence registers
1808  * left.
1809  *
1810  * The current feature set supported by i915_gem_fault() and thus GTT mmaps
1811  * is exposed via I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version).
1812  */
1813 vm_fault_t i915_gem_fault(struct vm_fault *vmf)
1814 {
1815 #define MIN_CHUNK_PAGES (SZ_1M >> PAGE_SHIFT)
1816 	struct vm_area_struct *area = vmf->vma;
1817 	struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data);
1818 	struct drm_device *dev = obj->base.dev;
1819 	struct drm_i915_private *dev_priv = to_i915(dev);
1820 	struct i915_ggtt *ggtt = &dev_priv->ggtt;
1821 	bool write = area->vm_flags & VM_WRITE;
1822 	intel_wakeref_t wakeref;
1823 	struct i915_vma *vma;
1824 	pgoff_t page_offset;
1825 	int srcu;
1826 	int ret;
1827 
1828 	/* Sanity check that we allow writing into this object */
1829 	if (i915_gem_object_is_readonly(obj) && write)
1830 		return VM_FAULT_SIGBUS;
1831 
1832 	/* We don't use vmf->pgoff since that has the fake offset */
1833 	page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
1834 
1835 	trace_i915_gem_object_fault(obj, page_offset, true, write);
1836 
1837 	ret = i915_gem_object_pin_pages(obj);
1838 	if (ret)
1839 		goto err;
1840 
1841 	wakeref = intel_runtime_pm_get(dev_priv);
1842 
1843 	srcu = i915_reset_trylock(dev_priv);
1844 	if (srcu < 0) {
1845 		ret = srcu;
1846 		goto err_rpm;
1847 	}
1848 
1849 	ret = i915_mutex_lock_interruptible(dev);
1850 	if (ret)
1851 		goto err_reset;
1852 
1853 	/* Access to snoopable pages through the GTT is incoherent. */
1854 	if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(dev_priv)) {
1855 		ret = -EFAULT;
1856 		goto err_unlock;
1857 	}
1858 
1859 	/* Now pin it into the GTT as needed */
1860 	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1861 				       PIN_MAPPABLE |
1862 				       PIN_NONBLOCK |
1863 				       PIN_NONFAULT);
1864 	if (IS_ERR(vma)) {
1865 		/* Use a partial view if the object is bigger than the available space */
1866 		struct i915_ggtt_view view =
1867 			compute_partial_view(obj, page_offset, MIN_CHUNK_PAGES);
1868 		unsigned int flags;
1869 
1870 		flags = PIN_MAPPABLE;
1871 		if (view.type == I915_GGTT_VIEW_NORMAL)
1872 			flags |= PIN_NONBLOCK; /* avoid warnings for pinned */
1873 
1874 		/*
1875 		 * Userspace is now writing through an untracked VMA, abandon
1876 		 * all hope that the hardware is able to track future writes.
1877 		 */
1878 		obj->frontbuffer_ggtt_origin = ORIGIN_CPU;
1879 
1880 		vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
1881 		if (IS_ERR(vma) && !view.type) {
1882 			flags = PIN_MAPPABLE;
1883 			view.type = I915_GGTT_VIEW_PARTIAL;
1884 			vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
1885 		}
1886 	}
1887 	if (IS_ERR(vma)) {
1888 		ret = PTR_ERR(vma);
1889 		goto err_unlock;
1890 	}
1891 
1892 	ret = i915_vma_pin_fence(vma);
1893 	if (ret)
1894 		goto err_unpin;
1895 
1896 	/* Finally, remap it using the new GTT offset */
1897 	ret = remap_io_mapping(area,
1898 			       area->vm_start + (vma->ggtt_view.partial.offset << PAGE_SHIFT),
1899 			       (ggtt->gmadr.start + vma->node.start) >> PAGE_SHIFT,
1900 			       min_t(u64, vma->size, area->vm_end - area->vm_start),
1901 			       &ggtt->iomap);
1902 	if (ret)
1903 		goto err_fence;
1904 
1905 	/* Mark as being mmapped into userspace for later revocation */
1906 	assert_rpm_wakelock_held(dev_priv);
1907 	if (!i915_vma_set_userfault(vma) && !obj->userfault_count++)
1908 		list_add(&obj->userfault_link, &dev_priv->mm.userfault_list);
1909 	GEM_BUG_ON(!obj->userfault_count);
1910 
1911 	i915_vma_set_ggtt_write(vma);
1912 
1913 err_fence:
1914 	i915_vma_unpin_fence(vma);
1915 err_unpin:
1916 	__i915_vma_unpin(vma);
1917 err_unlock:
1918 	mutex_unlock(&dev->struct_mutex);
1919 err_reset:
1920 	i915_reset_unlock(dev_priv, srcu);
1921 err_rpm:
1922 	intel_runtime_pm_put(dev_priv, wakeref);
1923 	i915_gem_object_unpin_pages(obj);
1924 err:
1925 	switch (ret) {
1926 	case -EIO:
1927 		/*
1928 		 * We eat errors when the gpu is terminally wedged to avoid
1929 		 * userspace unduly crashing (gl has no provisions for mmaps to
1930 		 * fail). But any other -EIO isn't ours (e.g. swap in failure)
1931 		 * and so needs to be reported.
1932 		 */
1933 		if (!i915_terminally_wedged(dev_priv))
1934 			return VM_FAULT_SIGBUS;
1935 		/* else: fall through */
1936 	case -EAGAIN:
1937 		/*
1938 		 * EAGAIN means the gpu is hung and we'll wait for the error
1939 		 * handler to reset everything when re-faulting in
1940 		 * i915_mutex_lock_interruptible.
1941 		 */
1942 	case 0:
1943 	case -ERESTARTSYS:
1944 	case -EINTR:
1945 	case -EBUSY:
1946 		/*
1947 		 * EBUSY is ok: this just means that another thread
1948 		 * already did the job.
1949 		 */
1950 		return VM_FAULT_NOPAGE;
1951 	case -ENOMEM:
1952 		return VM_FAULT_OOM;
1953 	case -ENOSPC:
1954 	case -EFAULT:
1955 		return VM_FAULT_SIGBUS;
1956 	default:
1957 		WARN_ONCE(ret, "unhandled error in i915_gem_fault: %i\n", ret);
1958 		return VM_FAULT_SIGBUS;
1959 	}
1960 }
1961 
1962 static void __i915_gem_object_release_mmap(struct drm_i915_gem_object *obj)
1963 {
1964 	struct i915_vma *vma;
1965 
1966 	GEM_BUG_ON(!obj->userfault_count);
1967 
1968 	obj->userfault_count = 0;
1969 	list_del(&obj->userfault_link);
1970 	drm_vma_node_unmap(&obj->base.vma_node,
1971 			   obj->base.dev->anon_inode->i_mapping);
1972 
1973 	for_each_ggtt_vma(vma, obj)
1974 		i915_vma_unset_userfault(vma);
1975 }
1976 
1977 /**
1978  * i915_gem_release_mmap - remove physical page mappings
1979  * @obj: obj in question
1980  *
1981  * Preserve the reservation of the mmapping with the DRM core code, but
1982  * relinquish ownership of the pages back to the system.
1983  *
1984  * It is vital that we remove the page mapping if we have mapped a tiled
1985  * object through the GTT and then lose the fence register due to
1986  * resource pressure. Similarly if the object has been moved out of the
1987  * aperture, then pages mapped into userspace must be revoked. Removing the
1988  * mapping will then trigger a page fault on the next user access, allowing
1989  * fixup by i915_gem_fault().
1990  */
1991 void
1992 i915_gem_release_mmap(struct drm_i915_gem_object *obj)
1993 {
1994 	struct drm_i915_private *i915 = to_i915(obj->base.dev);
1995 	intel_wakeref_t wakeref;
1996 
1997 	/* Serialisation between user GTT access and our code depends upon
1998 	 * revoking the CPU's PTE whilst the mutex is held. The next user
1999 	 * pagefault then has to wait until we release the mutex.
2000 	 *
2001 	 * Note that RPM complicates somewhat by adding an additional
2002 	 * requirement that operations to the GGTT be made holding the RPM
2003 	 * wakeref.
2004 	 */
2005 	lockdep_assert_held(&i915->drm.struct_mutex);
2006 	wakeref = intel_runtime_pm_get(i915);
2007 
2008 	if (!obj->userfault_count)
2009 		goto out;
2010 
2011 	__i915_gem_object_release_mmap(obj);
2012 
2013 	/* Ensure that the CPU's PTEs are revoked and there are no outstanding
2014 	 * memory transactions from userspace before we return. The TLB
2015 	 * flushing implied by changing the PTEs above *should* be
2016 	 * sufficient, an extra barrier here just provides us with a bit
2017 	 * of paranoid documentation about our requirement to serialise
2018 	 * memory writes before touching registers / GSM.
2019 	 */
2020 	wmb();
2021 
2022 out:
2023 	intel_runtime_pm_put(i915, wakeref);
2024 }
2025 
2026 void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
2027 {
2028 	struct drm_i915_gem_object *obj, *on;
2029 	int i;
2030 
2031 	/*
2032 	 * Only called during RPM suspend. All users of the userfault_list
2033 	 * must be holding an RPM wakeref to ensure that this can not
2034 	 * run concurrently with themselves (and use the struct_mutex for
2035 	 * protection between themselves).
2036 	 */
2037 
2038 	list_for_each_entry_safe(obj, on,
2039 				 &dev_priv->mm.userfault_list, userfault_link)
2040 		__i915_gem_object_release_mmap(obj);
2041 
2042 	/* The fence will be lost when the device powers down. If any were
2043 	 * in use by hardware (i.e. they are pinned), we should not be powering
2044 	 * down! All other fences will be reacquired by the user upon waking.
2045 	 */
2046 	for (i = 0; i < dev_priv->num_fence_regs; i++) {
2047 		struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
2048 
2049 		/* Ideally we want to assert that the fence register is not
2050 		 * live at this point (i.e. that no piece of code will be
2051 		 * trying to write through fence + GTT, as that both violates
2052 		 * our tracking of activity and associated locking/barriers,
2053 		 * but also is illegal given that the hw is powered down).
2054 		 *
2055 		 * Previously we used reg->pin_count as a "liveness" indicator.
2056 		 * That is not sufficient, and we need a more fine-grained
2057 		 * tool if we want to have a sanity check here.
2058 		 */
2059 
2060 		if (!reg->vma)
2061 			continue;
2062 
2063 		GEM_BUG_ON(i915_vma_has_userfault(reg->vma));
2064 		reg->dirty = true;
2065 	}
2066 }
2067 
2068 static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
2069 {
2070 	struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2071 	int err;
2072 
2073 	err = drm_gem_create_mmap_offset(&obj->base);
2074 	if (likely(!err))
2075 		return 0;
2076 
2077 	/* Attempt to reap some mmap space from dead objects */
2078 	do {
2079 		err = i915_gem_wait_for_idle(dev_priv,
2080 					     I915_WAIT_INTERRUPTIBLE,
2081 					     MAX_SCHEDULE_TIMEOUT);
2082 		if (err)
2083 			break;
2084 
2085 		i915_gem_drain_freed_objects(dev_priv);
2086 		err = drm_gem_create_mmap_offset(&obj->base);
2087 		if (!err)
2088 			break;
2089 
2090 	} while (flush_delayed_work(&dev_priv->gt.retire_work));
2091 
2092 	return err;
2093 }
2094 
2095 static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
2096 {
2097 	drm_gem_free_mmap_offset(&obj->base);
2098 }
2099 
2100 int
2101 i915_gem_mmap_gtt(struct drm_file *file,
2102 		  struct drm_device *dev,
2103 		  u32 handle,
2104 		  u64 *offset)
2105 {
2106 	struct drm_i915_gem_object *obj;
2107 	int ret;
2108 
2109 	obj = i915_gem_object_lookup(file, handle);
2110 	if (!obj)
2111 		return -ENOENT;
2112 
2113 	ret = i915_gem_object_create_mmap_offset(obj);
2114 	if (ret == 0)
2115 		*offset = drm_vma_node_offset_addr(&obj->base.vma_node);
2116 
2117 	i915_gem_object_put(obj);
2118 	return ret;
2119 }
2120 
2121 /**
2122  * i915_gem_mmap_gtt_ioctl - prepare an object for GTT mmap'ing
2123  * @dev: DRM device
2124  * @data: GTT mapping ioctl data
2125  * @file: drm file pointer
2126  *
2127  * Simply returns the fake offset to userspace so it can mmap it.
2128  * The mmap call will end up in drm_gem_mmap(), which will set things
2129  * up so we can get faults in the handler above.
2130  *
2131  * The fault handler will take care of binding the object into the GTT
2132  * (since it may have been evicted to make room for something), allocating
2133  * a fence register, and mapping the appropriate aperture address into
2134  * userspace.
2135  */
2136 int
2137 i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
2138 			struct drm_file *file)
2139 {
2140 	struct drm_i915_gem_mmap_gtt *args = data;
2141 
2142 	return i915_gem_mmap_gtt(file, dev, args->handle, &args->offset);
2143 }
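
/*
 * Illustrative sketch (not part of the driver): userspace feeds the fake
 * offset returned above straight into mmap() on the DRM fd. The drm_fd,
 * bo_handle and bo_size names are assumptions for the example only.
 *
 *	struct drm_i915_gem_mmap_gtt arg = { .handle = bo_handle };
 *
 *	if (ioctl(drm_fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &arg) == 0) {
 *		void *ptr = mmap(NULL, bo_size, PROT_READ | PROT_WRITE,
 *				 MAP_SHARED, drm_fd, arg.offset);
 *		... writes through ptr now fault into i915_gem_fault() ...
 *	}
 */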
2144 
2145 /* Immediately discard the backing storage */
2146 static void
2147 i915_gem_object_truncate(struct drm_i915_gem_object *obj)
2148 {
2149 	i915_gem_object_free_mmap_offset(obj);
2150 
2151 	if (obj->base.filp == NULL)
2152 		return;
2153 
2154 	/* Our goal here is to return as much of the memory as
2155 	 * possible back to the system, as we are called from OOM.
2156 	 * To do this we must instruct the shmfs to drop all of its
2157 	 * backing pages, *now*.
2158 	 */
2159 	shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1);
2160 	obj->mm.madv = __I915_MADV_PURGED;
2161 	obj->mm.pages = ERR_PTR(-EFAULT);
2162 }
2163 
2164 /* Try to discard unwanted pages */
2165 void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj)
2166 {
2167 	struct address_space *mapping;
2168 
2169 	lockdep_assert_held(&obj->mm.lock);
2170 	GEM_BUG_ON(i915_gem_object_has_pages(obj));
2171 
2172 	switch (obj->mm.madv) {
2173 	case I915_MADV_DONTNEED:
2174 		i915_gem_object_truncate(obj);	/* fall through */
2175 	case __I915_MADV_PURGED:
2176 		return;
2177 	}
2178 
2179 	if (obj->base.filp == NULL)
2180 		return;
2181 
2182 	mapping = obj->base.filp->f_mapping;
2183 	invalidate_mapping_pages(mapping, 0, (loff_t)-1);
2184 }
2185 
2186 /*
2187  * Move pages to appropriate lru and release the pagevec, decrementing the
2188  * ref count of those pages.
2189  */
2190 static void check_release_pagevec(struct pagevec *pvec)
2191 {
2192 	check_move_unevictable_pages(pvec);
2193 	__pagevec_release(pvec);
2194 	cond_resched();
2195 }
2196 
2197 static void
2198 i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj,
2199 			      struct sg_table *pages)
2200 {
2201 	struct sgt_iter sgt_iter;
2202 	struct pagevec pvec;
2203 	struct page *page;
2204 
2205 	__i915_gem_object_release_shmem(obj, pages, true);
2206 	i915_gem_gtt_finish_pages(obj, pages);
2207 
2208 	if (i915_gem_object_needs_bit17_swizzle(obj))
2209 		i915_gem_object_save_bit_17_swizzle(obj, pages);
2210 
2211 	mapping_clear_unevictable(file_inode(obj->base.filp)->i_mapping);
2212 
2213 	pagevec_init(&pvec);
2214 	for_each_sgt_page(page, sgt_iter, pages) {
2215 		if (obj->mm.dirty)
2216 			set_page_dirty(page);
2217 
2218 		if (obj->mm.madv == I915_MADV_WILLNEED)
2219 			mark_page_accessed(page);
2220 
2221 		if (!pagevec_add(&pvec, page))
2222 			check_release_pagevec(&pvec);
2223 	}
2224 	if (pagevec_count(&pvec))
2225 		check_release_pagevec(&pvec);
2226 	obj->mm.dirty = false;
2227 
2228 	sg_free_table(pages);
2229 	kfree(pages);
2230 }
2231 
2232 static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj)
2233 {
2234 	struct radix_tree_iter iter;
2235 	void __rcu **slot;
2236 
2237 	rcu_read_lock();
2238 	radix_tree_for_each_slot(slot, &obj->mm.get_page.radix, &iter, 0)
2239 		radix_tree_delete(&obj->mm.get_page.radix, iter.index);
2240 	rcu_read_unlock();
2241 }
2242 
2243 static struct sg_table *
2244 __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
2245 {
2246 	struct drm_i915_private *i915 = to_i915(obj->base.dev);
2247 	struct sg_table *pages;
2248 
2249 	pages = fetch_and_zero(&obj->mm.pages);
2250 	if (IS_ERR_OR_NULL(pages))
2251 		return pages;
2252 
2253 	spin_lock(&i915->mm.obj_lock);
2254 	list_del(&obj->mm.link);
2255 	spin_unlock(&i915->mm.obj_lock);
2256 
2257 	if (obj->mm.mapping) {
2258 		void *ptr;
2259 
2260 		ptr = page_mask_bits(obj->mm.mapping);
2261 		if (is_vmalloc_addr(ptr))
2262 			vunmap(ptr);
2263 		else
2264 			kunmap(kmap_to_page(ptr));
2265 
2266 		obj->mm.mapping = NULL;
2267 	}
2268 
2269 	__i915_gem_object_reset_page_iter(obj);
2270 	obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
2271 
2272 	return pages;
2273 }
2274 
2275 int __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
2276 				enum i915_mm_subclass subclass)
2277 {
2278 	struct sg_table *pages;
2279 	int ret;
2280 
2281 	if (i915_gem_object_has_pinned_pages(obj))
2282 		return -EBUSY;
2283 
2284 	GEM_BUG_ON(obj->bind_count);
2285 
2286 	/* May be called by shrinker from within get_pages() (on another bo) */
2287 	mutex_lock_nested(&obj->mm.lock, subclass);
2288 	if (unlikely(atomic_read(&obj->mm.pages_pin_count))) {
2289 		ret = -EBUSY;
2290 		goto unlock;
2291 	}
2292 
2293 	/*
2294 	 * ->put_pages might need to allocate memory for the bit17 swizzle
2295 	 * array, hence protect them from being reaped by removing them from gtt
2296 	 * lists early.
2297 	 */
2298 	pages = __i915_gem_object_unset_pages(obj);
2299 
2300 	/*
2301 	 * XXX Temporary hijinx to avoid updating all backends to handle
2302 	 * NULL pages. In the future, when we have more asynchronous
2303 	 * get_pages backends we should be better able to handle the
2304 	 * cancellation of the async task in a more uniform manner.
2305 	 */
2306 	if (!pages && !i915_gem_object_needs_async_cancel(obj))
2307 		pages = ERR_PTR(-EINVAL);
2308 
2309 	if (!IS_ERR(pages))
2310 		obj->ops->put_pages(obj, pages);
2311 
2312 	ret = 0;
2313 unlock:
2314 	mutex_unlock(&obj->mm.lock);
2315 
2316 	return ret;
2317 }
2318 
2319 bool i915_sg_trim(struct sg_table *orig_st)
2320 {
2321 	struct sg_table new_st;
2322 	struct scatterlist *sg, *new_sg;
2323 	unsigned int i;
2324 
2325 	if (orig_st->nents == orig_st->orig_nents)
2326 		return false;
2327 
2328 	if (sg_alloc_table(&new_st, orig_st->nents, GFP_KERNEL | __GFP_NOWARN))
2329 		return false;
2330 
2331 	new_sg = new_st.sgl;
2332 	for_each_sg(orig_st->sgl, sg, orig_st->nents, i) {
2333 		sg_set_page(new_sg, sg_page(sg), sg->length, 0);
2334 		sg_dma_address(new_sg) = sg_dma_address(sg);
2335 		sg_dma_len(new_sg) = sg_dma_len(sg);
2336 
2337 		new_sg = sg_next(new_sg);
2338 	}
2339 	GEM_BUG_ON(new_sg); /* Should walk exactly nents and hit the end */
2340 
2341 	sg_free_table(orig_st);
2342 
2343 	*orig_st = new_st;
2344 	return true;
2345 }
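
/*
 * Illustrative example of why the trim above matters: the shmem backend
 * below allocates one scatterlist entry per page (orig_nents == page count)
 * but coalesces physically contiguous pages while filling the table, so a
 * 2048-page object backed by a few large contiguous runs may end up using
 * only a handful of entries; the reallocation here hands the unused tail
 * back to the allocator. The numbers are assumptions for illustration only.
 */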
2346 
2347 static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
2348 {
2349 	struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2350 	const unsigned long page_count = obj->base.size / PAGE_SIZE;
2351 	unsigned long i;
2352 	struct address_space *mapping;
2353 	struct sg_table *st;
2354 	struct scatterlist *sg;
2355 	struct sgt_iter sgt_iter;
2356 	struct page *page;
2357 	unsigned long last_pfn = 0;	/* suppress gcc warning */
2358 	unsigned int max_segment = i915_sg_segment_size();
2359 	unsigned int sg_page_sizes;
2360 	struct pagevec pvec;
2361 	gfp_t noreclaim;
2362 	int ret;
2363 
2364 	/*
2365 	 * Assert that the object is not currently in any GPU domain. As it
2366 	 * wasn't in the GTT, there shouldn't be any way it could have been in
2367 	 * a GPU cache
2368 	 */
2369 	GEM_BUG_ON(obj->read_domains & I915_GEM_GPU_DOMAINS);
2370 	GEM_BUG_ON(obj->write_domain & I915_GEM_GPU_DOMAINS);
2371 
2372 	/*
2373 	 * If there's no chance of allocating enough pages for the whole
2374 	 * object, bail early.
2375 	 */
2376 	if (page_count > totalram_pages())
2377 		return -ENOMEM;
2378 
2379 	st = kmalloc(sizeof(*st), GFP_KERNEL);
2380 	if (st == NULL)
2381 		return -ENOMEM;
2382 
2383 rebuild_st:
2384 	if (sg_alloc_table(st, page_count, GFP_KERNEL)) {
2385 		kfree(st);
2386 		return -ENOMEM;
2387 	}
2388 
2389 	/*
2390 	 * Get the list of pages out of our struct file.  They'll be pinned
2391 	 * at this point until we release them.
2392 	 *
2393 	 * Fail silently without starting the shrinker
2394 	 */
2395 	mapping = obj->base.filp->f_mapping;
2396 	mapping_set_unevictable(mapping);
2397 	noreclaim = mapping_gfp_constraint(mapping, ~__GFP_RECLAIM);
2398 	noreclaim |= __GFP_NORETRY | __GFP_NOWARN;
2399 
2400 	sg = st->sgl;
2401 	st->nents = 0;
2402 	sg_page_sizes = 0;
2403 	for (i = 0; i < page_count; i++) {
2404 		const unsigned int shrink[] = {
2405 			I915_SHRINK_BOUND | I915_SHRINK_UNBOUND | I915_SHRINK_PURGEABLE,
2406 			0,
2407 		}, *s = shrink;
2408 		gfp_t gfp = noreclaim;
2409 
2410 		do {
2411 			cond_resched();
2412 			page = shmem_read_mapping_page_gfp(mapping, i, gfp);
2413 			if (!IS_ERR(page))
2414 				break;
2415 
2416 			if (!*s) {
2417 				ret = PTR_ERR(page);
2418 				goto err_sg;
2419 			}
2420 
2421 			i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
2422 
2423 			/*
2424 			 * We've tried hard to allocate the memory by reaping
2425 			 * our own buffer, now let the real VM do its job and
2426 			 * go down in flames if truly OOM.
2427 			 *
2428 			 * However, since graphics tend to be disposable,
2429 			 * defer the oom here by reporting the ENOMEM back
2430 			 * to userspace.
2431 			 */
2432 			if (!*s) {
2433 				/* reclaim and warn, but no oom */
2434 				gfp = mapping_gfp_mask(mapping);
2435 
2436 				/*
2437 				 * Our bo are always dirty and so we require
2438 				 * kswapd to reclaim our pages (direct reclaim
2439 				 * does not effectively begin pageout of our
2440 				 * buffers on its own). However, direct reclaim
2441 				 * only waits for kswapd when under allocation
2442 				 * congestion. So as a result __GFP_RECLAIM is
2443 				 * unreliable and fails to actually reclaim our
2444 				 * dirty pages -- unless you try over and over
2445 				 * again with !__GFP_NORETRY. However, we still
2446 				 * want to fail this allocation rather than
2447 				 * trigger the out-of-memory killer and for
2448 				 * this we want __GFP_RETRY_MAYFAIL.
2449 				 */
2450 				gfp |= __GFP_RETRY_MAYFAIL;
2451 			}
2452 		} while (1);
2453 
2454 		if (!i ||
2455 		    sg->length >= max_segment ||
2456 		    page_to_pfn(page) != last_pfn + 1) {
2457 			if (i) {
2458 				sg_page_sizes |= sg->length;
2459 				sg = sg_next(sg);
2460 			}
2461 			st->nents++;
2462 			sg_set_page(sg, page, PAGE_SIZE, 0);
2463 		} else {
2464 			sg->length += PAGE_SIZE;
2465 		}
2466 		last_pfn = page_to_pfn(page);
2467 
2468 		/* Check that the i965g/gm workaround works. */
2469 		WARN_ON((gfp & __GFP_DMA32) && (last_pfn >= 0x00100000UL));
2470 	}
2471 	if (sg) { /* loop terminated early; short sg table */
2472 		sg_page_sizes |= sg->length;
2473 		sg_mark_end(sg);
2474 	}
2475 
2476 	/* Trim unused sg entries to avoid wasting memory. */
2477 	i915_sg_trim(st);
2478 
2479 	ret = i915_gem_gtt_prepare_pages(obj, st);
2480 	if (ret) {
2481 		/*
2482 		 * DMA remapping failed? One possible cause is that
2483 		 * it could not reserve enough large entries; asking
2484 		 * for PAGE_SIZE chunks instead may be helpful.
2485 		 */
2486 		if (max_segment > PAGE_SIZE) {
2487 			for_each_sgt_page(page, sgt_iter, st)
2488 				put_page(page);
2489 			sg_free_table(st);
2490 
2491 			max_segment = PAGE_SIZE;
2492 			goto rebuild_st;
2493 		} else {
2494 			dev_warn(&dev_priv->drm.pdev->dev,
2495 				 "Failed to DMA remap %lu pages\n",
2496 				 page_count);
2497 			goto err_pages;
2498 		}
2499 	}
2500 
2501 	if (i915_gem_object_needs_bit17_swizzle(obj))
2502 		i915_gem_object_do_bit_17_swizzle(obj, st);
2503 
2504 	__i915_gem_object_set_pages(obj, st, sg_page_sizes);
2505 
2506 	return 0;
2507 
2508 err_sg:
2509 	sg_mark_end(sg);
2510 err_pages:
2511 	mapping_clear_unevictable(mapping);
2512 	pagevec_init(&pvec);
2513 	for_each_sgt_page(page, sgt_iter, st) {
2514 		if (!pagevec_add(&pvec, page))
2515 			check_release_pagevec(&pvec);
2516 	}
2517 	if (pagevec_count(&pvec))
2518 		check_release_pagevec(&pvec);
2519 	sg_free_table(st);
2520 	kfree(st);
2521 
2522 	/*
2523 	 * shmemfs first checks if there is enough memory to allocate the page
2524 	 * and reports ENOSPC should that be insufficient, along with the usual
2525 	 * ENOMEM for a genuine allocation failure.
2526 	 *
2527 	 * We use ENOSPC in our driver to mean that we have run out of aperture
2528 	 * space and so want to translate the error from shmemfs back to our
2529 	 * usual understanding of ENOMEM.
2530 	 */
2531 	if (ret == -ENOSPC)
2532 		ret = -ENOMEM;
2533 
2534 	return ret;
2535 }
2536 
2537 void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
2538 				 struct sg_table *pages,
2539 				 unsigned int sg_page_sizes)
2540 {
2541 	struct drm_i915_private *i915 = to_i915(obj->base.dev);
2542 	unsigned long supported = INTEL_INFO(i915)->page_sizes;
2543 	int i;
2544 
2545 	lockdep_assert_held(&obj->mm.lock);
2546 
2547 	/* Make the pages coherent with the GPU (flushing any swapin). */
2548 	if (obj->cache_dirty) {
2549 		obj->write_domain = 0;
2550 		if (i915_gem_object_has_struct_page(obj))
2551 			drm_clflush_sg(pages);
2552 		obj->cache_dirty = false;
2553 	}
2554 
2555 	obj->mm.get_page.sg_pos = pages->sgl;
2556 	obj->mm.get_page.sg_idx = 0;
2557 
2558 	obj->mm.pages = pages;
2559 
2560 	if (i915_gem_object_is_tiled(obj) &&
2561 	    i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
2562 		GEM_BUG_ON(obj->mm.quirked);
2563 		__i915_gem_object_pin_pages(obj);
2564 		obj->mm.quirked = true;
2565 	}
2566 
2567 	GEM_BUG_ON(!sg_page_sizes);
2568 	obj->mm.page_sizes.phys = sg_page_sizes;
2569 
2570 	/*
2571 	 * Calculate the supported page-sizes which fit into the given
2572 	 * sg_page_sizes. This will give us the page-sizes which we may be able
2573 	 * to use opportunistically when later inserting into the GTT. For
2574 	 * example if phys=2G, then in theory we should be able to use 1G, 2M,
2575 	 * 64K or 4K pages, although in practice this will depend on a number of
2576 	 * other factors.
2577 	 */
2578 	obj->mm.page_sizes.sg = 0;
2579 	for_each_set_bit(i, &supported, ilog2(I915_GTT_MAX_PAGE_SIZE) + 1) {
2580 		if (obj->mm.page_sizes.phys & ~0u << i)
2581 			obj->mm.page_sizes.sg |= BIT(i);
2582 	}
2583 	GEM_BUG_ON(!HAS_PAGE_SIZES(i915, obj->mm.page_sizes.sg));
2584 
2585 	spin_lock(&i915->mm.obj_lock);
2586 	list_add(&obj->mm.link, &i915->mm.unbound_list);
2587 	spin_unlock(&i915->mm.obj_lock);
2588 }
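
/*
 * Worked example for the page-size mask above (illustrative numbers only):
 * assuming the platform supports 4K, 64K and 2M pages and the object's sg
 * chunk lengths OR together to phys = 2M | 4K (one 2M block plus a 4K
 * tail), the loop sets sg = 4K | 64K | 2M, i.e. every supported page size
 * no larger than the biggest contiguous chunk remains a candidate when the
 * object is later inserted into the GTT.
 */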
2589 
2590 static int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2591 {
2592 	int err;
2593 
2594 	if (unlikely(obj->mm.madv != I915_MADV_WILLNEED)) {
2595 		DRM_DEBUG("Attempting to obtain a purgeable object\n");
2596 		return -EFAULT;
2597 	}
2598 
2599 	err = obj->ops->get_pages(obj);
2600 	GEM_BUG_ON(!err && !i915_gem_object_has_pages(obj));
2601 
2602 	return err;
2603 }
2604 
2605 /* Ensure that the associated pages are gathered from the backing storage
2606  * and pinned into our object. i915_gem_object_pin_pages() may be called
2607  * multiple times before they are released by a single call to
2608  * i915_gem_object_unpin_pages() - once the pages are no longer referenced
2609  * either as a result of memory pressure (reaping pages under the shrinker)
2610  * or as the object is itself released.
2611  */
2612 int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2613 {
2614 	int err;
2615 
2616 	err = mutex_lock_interruptible(&obj->mm.lock);
2617 	if (err)
2618 		return err;
2619 
2620 	if (unlikely(!i915_gem_object_has_pages(obj))) {
2621 		GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2622 
2623 		err = ____i915_gem_object_get_pages(obj);
2624 		if (err)
2625 			goto unlock;
2626 
2627 		smp_mb__before_atomic();
2628 	}
2629 	atomic_inc(&obj->mm.pages_pin_count);
2630 
2631 unlock:
2632 	mutex_unlock(&obj->mm.lock);
2633 	return err;
2634 }
2635 
2636 /* The 'mapping' part of i915_gem_object_pin_map() below */
2637 static void *i915_gem_object_map(const struct drm_i915_gem_object *obj,
2638 				 enum i915_map_type type)
2639 {
2640 	unsigned long n_pages = obj->base.size >> PAGE_SHIFT;
2641 	struct sg_table *sgt = obj->mm.pages;
2642 	struct sgt_iter sgt_iter;
2643 	struct page *page;
2644 	struct page *stack_pages[32];
2645 	struct page **pages = stack_pages;
2646 	unsigned long i = 0;
2647 	pgprot_t pgprot;
2648 	void *addr;
2649 
2650 	/* A single page can always be kmapped */
2651 	if (n_pages == 1 && type == I915_MAP_WB)
2652 		return kmap(sg_page(sgt->sgl));
2653 
2654 	if (n_pages > ARRAY_SIZE(stack_pages)) {
2655 		/* Too big for stack -- allocate temporary array instead */
2656 		pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
2657 		if (!pages)
2658 			return NULL;
2659 	}
2660 
2661 	for_each_sgt_page(page, sgt_iter, sgt)
2662 		pages[i++] = page;
2663 
2664 	/* Check that we have the expected number of pages */
2665 	GEM_BUG_ON(i != n_pages);
2666 
2667 	switch (type) {
2668 	default:
2669 		MISSING_CASE(type);
2670 		/* fallthrough to use PAGE_KERNEL anyway */
2671 	case I915_MAP_WB:
2672 		pgprot = PAGE_KERNEL;
2673 		break;
2674 	case I915_MAP_WC:
2675 		pgprot = pgprot_writecombine(PAGE_KERNEL_IO);
2676 		break;
2677 	}
2678 	addr = vmap(pages, n_pages, 0, pgprot);
2679 
2680 	if (pages != stack_pages)
2681 		kvfree(pages);
2682 
2683 	return addr;
2684 }
2685 
2686 /* get, pin, and map the pages of the object into kernel space */
2687 void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
2688 			      enum i915_map_type type)
2689 {
2690 	enum i915_map_type has_type;
2691 	bool pinned;
2692 	void *ptr;
2693 	int ret;
2694 
2695 	if (unlikely(!i915_gem_object_has_struct_page(obj)))
2696 		return ERR_PTR(-ENXIO);
2697 
2698 	ret = mutex_lock_interruptible(&obj->mm.lock);
2699 	if (ret)
2700 		return ERR_PTR(ret);
2701 
2702 	pinned = !(type & I915_MAP_OVERRIDE);
2703 	type &= ~I915_MAP_OVERRIDE;
2704 
2705 	if (!atomic_inc_not_zero(&obj->mm.pages_pin_count)) {
2706 		if (unlikely(!i915_gem_object_has_pages(obj))) {
2707 			GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2708 
2709 			ret = ____i915_gem_object_get_pages(obj);
2710 			if (ret)
2711 				goto err_unlock;
2712 
2713 			smp_mb__before_atomic();
2714 		}
2715 		atomic_inc(&obj->mm.pages_pin_count);
2716 		pinned = false;
2717 	}
2718 	GEM_BUG_ON(!i915_gem_object_has_pages(obj));
2719 
2720 	ptr = page_unpack_bits(obj->mm.mapping, &has_type);
2721 	if (ptr && has_type != type) {
2722 		if (pinned) {
2723 			ret = -EBUSY;
2724 			goto err_unpin;
2725 		}
2726 
2727 		if (is_vmalloc_addr(ptr))
2728 			vunmap(ptr);
2729 		else
2730 			kunmap(kmap_to_page(ptr));
2731 
2732 		ptr = obj->mm.mapping = NULL;
2733 	}
2734 
2735 	if (!ptr) {
2736 		ptr = i915_gem_object_map(obj, type);
2737 		if (!ptr) {
2738 			ret = -ENOMEM;
2739 			goto err_unpin;
2740 		}
2741 
2742 		obj->mm.mapping = page_pack_bits(ptr, type);
2743 	}
2744 
2745 out_unlock:
2746 	mutex_unlock(&obj->mm.lock);
2747 	return ptr;
2748 
2749 err_unpin:
2750 	atomic_dec(&obj->mm.pages_pin_count);
2751 err_unlock:
2752 	ptr = ERR_PTR(ret);
2753 	goto out_unlock;
2754 }
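
/*
 * Typical in-kernel usage of the helper above (sketch only, with error
 * handling trimmed): pin and map the whole object, write through the
 * returned pointer, then drop the pin again.
 *
 *	void *vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
 *
 *	if (!IS_ERR(vaddr)) {
 *		memset(vaddr, 0, obj->base.size);
 *		i915_gem_object_unpin_map(obj);
 *	}
 */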
2755 
2756 void __i915_gem_object_flush_map(struct drm_i915_gem_object *obj,
2757 				 unsigned long offset,
2758 				 unsigned long size)
2759 {
2760 	enum i915_map_type has_type;
2761 	void *ptr;
2762 
2763 	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
2764 	GEM_BUG_ON(range_overflows_t(typeof(obj->base.size),
2765 				     offset, size, obj->base.size));
2766 
2767 	obj->mm.dirty = true;
2768 
2769 	if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE)
2770 		return;
2771 
2772 	ptr = page_unpack_bits(obj->mm.mapping, &has_type);
2773 	if (has_type == I915_MAP_WC)
2774 		return;
2775 
2776 	drm_clflush_virt_range(ptr + offset, size);
2777 	if (size == obj->base.size) {
2778 		obj->write_domain &= ~I915_GEM_DOMAIN_CPU;
2779 		obj->cache_dirty = false;
2780 	}
2781 }
2782 
2783 static int
2784 i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
2785 			   const struct drm_i915_gem_pwrite *arg)
2786 {
2787 	struct address_space *mapping = obj->base.filp->f_mapping;
2788 	char __user *user_data = u64_to_user_ptr(arg->data_ptr);
2789 	u64 remain, offset;
2790 	unsigned int pg;
2791 
2792 	/* Caller already validated user args */
2793 	GEM_BUG_ON(!access_ok(user_data, arg->size));
2794 
2795 	/*
2796 	 * Before we instantiate/pin the backing store for our use, we
2797 	 * can prepopulate the shmemfs filp efficiently using a write into
2798 	 * the pagecache. We avoid the penalty of instantiating all the
2799 	 * pages, important if the user is just writing to a few and never
2800 	 * uses the object on the GPU, and using a direct write into shmemfs
2801 	 * allows it to avoid the cost of retrieving a page (either swapin
2802 	 * or clearing-before-use) before it is overwritten.
2803 	 */
2804 	if (i915_gem_object_has_pages(obj))
2805 		return -ENODEV;
2806 
2807 	if (obj->mm.madv != I915_MADV_WILLNEED)
2808 		return -EFAULT;
2809 
2810 	/*
2811 	 * Before the pages are instantiated the object is treated as being
2812 	 * in the CPU domain. The pages will be clflushed as required before
2813 	 * use, and we can freely write into the pages directly. If userspace
2814 	 * races pwrite with any other operation, corruption will ensue -
2815 	 * that is userspace's prerogative!
2816 	 */
2817 
2818 	remain = arg->size;
2819 	offset = arg->offset;
2820 	pg = offset_in_page(offset);
2821 
2822 	do {
2823 		unsigned int len, unwritten;
2824 		struct page *page;
2825 		void *data, *vaddr;
2826 		int err;
2827 		char c;
2828 
2829 		len = PAGE_SIZE - pg;
2830 		if (len > remain)
2831 			len = remain;
2832 
2833 		/* Prefault the user page to reduce potential recursion */
2834 		err = __get_user(c, user_data);
2835 		if (err)
2836 			return err;
2837 
2838 		err = __get_user(c, user_data + len - 1);
2839 		if (err)
2840 			return err;
2841 
2842 		err = pagecache_write_begin(obj->base.filp, mapping,
2843 					    offset, len, 0,
2844 					    &page, &data);
2845 		if (err < 0)
2846 			return err;
2847 
2848 		vaddr = kmap_atomic(page);
2849 		unwritten = __copy_from_user_inatomic(vaddr + pg,
2850 						      user_data,
2851 						      len);
2852 		kunmap_atomic(vaddr);
2853 
2854 		err = pagecache_write_end(obj->base.filp, mapping,
2855 					  offset, len, len - unwritten,
2856 					  page, data);
2857 		if (err < 0)
2858 			return err;
2859 
2860 		/* We don't handle -EFAULT, leave it to the caller to check */
2861 		if (unwritten)
2862 			return -ENODEV;
2863 
2864 		remain -= len;
2865 		user_data += len;
2866 		offset += len;
2867 		pg = 0;
2868 	} while (remain);
2869 
2870 	return 0;
2871 }
2872 
2873 static void
2874 i915_gem_retire_work_handler(struct work_struct *work)
2875 {
2876 	struct drm_i915_private *dev_priv =
2877 		container_of(work, typeof(*dev_priv), gt.retire_work.work);
2878 	struct drm_device *dev = &dev_priv->drm;
2879 
2880 	/* Come back later if the device is busy... */
2881 	if (mutex_trylock(&dev->struct_mutex)) {
2882 		i915_retire_requests(dev_priv);
2883 		mutex_unlock(&dev->struct_mutex);
2884 	}
2885 
2886 	/*
2887 	 * Keep the retire handler running until we are finally idle.
2888 	 * We do not need to do this test under locking as in the worst-case
2889 	 * we queue the retire worker once too often.
2890 	 */
2891 	if (READ_ONCE(dev_priv->gt.awake))
2892 		queue_delayed_work(dev_priv->wq,
2893 				   &dev_priv->gt.retire_work,
2894 				   round_jiffies_up_relative(HZ));
2895 }
2896 
2897 static bool switch_to_kernel_context_sync(struct drm_i915_private *i915,
2898 					  unsigned long mask)
2899 {
2900 	bool result = true;
2901 
2902 	/*
2903 	 * Even if we fail to switch, give whatever is running a small chance
2904 	 * to save itself before we report the failure. Yes, this may be a
2905 	 * false positive due to e.g. ENOMEM, caveat emptor!
2906 	 */
2907 	if (i915_gem_switch_to_kernel_context(i915, mask))
2908 		result = false;
2909 
2910 	if (i915_gem_wait_for_idle(i915,
2911 				   I915_WAIT_LOCKED |
2912 				   I915_WAIT_FOR_IDLE_BOOST,
2913 				   I915_GEM_IDLE_TIMEOUT))
2914 		result = false;
2915 
2916 	if (!result) {
2917 		if (i915_modparams.reset) { /* XXX hide warning from gem_eio */
2918 			dev_err(i915->drm.dev,
2919 				"Failed to idle engines, declaring wedged!\n");
2920 			GEM_TRACE_DUMP();
2921 		}
2922 
2923 		/* Forcibly cancel outstanding work and leave the gpu quiet. */
2924 		i915_gem_set_wedged(i915);
2925 	}
2926 
2927 	i915_retire_requests(i915); /* ensure we flush after wedging */
2928 	return result;
2929 }
2930 
2931 static bool load_power_context(struct drm_i915_private *i915)
2932 {
2933 	/* Force loading the kernel context on all engines */
2934 	if (!switch_to_kernel_context_sync(i915, ALL_ENGINES))
2935 		return false;
2936 
2937 	/*
2938 	 * Immediately park the GPU so that we enable powersaving and
2939 	 * treat it as idle. The next time we issue a request, we will
2940 	 * unpark and start using the engine->pinned_default_state, otherwise
2941 	 * it is in limbo and an early reset may fail.
2942 	 */
2943 	__i915_gem_park(i915);
2944 
2945 	return true;
2946 }
2947 
2948 static void
2949 i915_gem_idle_work_handler(struct work_struct *work)
2950 {
2951 	struct drm_i915_private *i915 =
2952 		container_of(work, typeof(*i915), gt.idle_work.work);
2953 	bool rearm_hangcheck;
2954 
2955 	if (!READ_ONCE(i915->gt.awake))
2956 		return;
2957 
2958 	if (READ_ONCE(i915->gt.active_requests))
2959 		return;
2960 
2961 	rearm_hangcheck =
2962 		cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
2963 
2964 	if (!mutex_trylock(&i915->drm.struct_mutex)) {
2965 		/* Currently busy, come back later */
2966 		mod_delayed_work(i915->wq,
2967 				 &i915->gt.idle_work,
2968 				 msecs_to_jiffies(50));
2969 		goto out_rearm;
2970 	}
2971 
2972 	/*
2973 	 * Flush out the last user context, leaving only the pinned
2974 	 * kernel context resident. Should anything unfortunate happen
2975 	 * while we are idle (such as the GPU being power cycled), no users
2976 	 * will be harmed.
2977 	 */
2978 	if (!work_pending(&i915->gt.idle_work.work) &&
2979 	    !i915->gt.active_requests) {
2980 		++i915->gt.active_requests; /* don't requeue idle */
2981 
2982 		switch_to_kernel_context_sync(i915, i915->gt.active_engines);
2983 
2984 		if (!--i915->gt.active_requests) {
2985 			__i915_gem_park(i915);
2986 			rearm_hangcheck = false;
2987 		}
2988 	}
2989 
2990 	mutex_unlock(&i915->drm.struct_mutex);
2991 
2992 out_rearm:
2993 	if (rearm_hangcheck) {
2994 		GEM_BUG_ON(!i915->gt.awake);
2995 		i915_queue_hangcheck(i915);
2996 	}
2997 }
2998 
2999 void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
3000 {
3001 	struct drm_i915_private *i915 = to_i915(gem->dev);
3002 	struct drm_i915_gem_object *obj = to_intel_bo(gem);
3003 	struct drm_i915_file_private *fpriv = file->driver_priv;
3004 	struct i915_lut_handle *lut, *ln;
3005 
3006 	mutex_lock(&i915->drm.struct_mutex);
3007 
3008 	list_for_each_entry_safe(lut, ln, &obj->lut_list, obj_link) {
3009 		struct i915_gem_context *ctx = lut->ctx;
3010 		struct i915_vma *vma;
3011 
3012 		GEM_BUG_ON(ctx->file_priv == ERR_PTR(-EBADF));
3013 		if (ctx->file_priv != fpriv)
3014 			continue;
3015 
3016 		vma = radix_tree_delete(&ctx->handles_vma, lut->handle);
3017 		GEM_BUG_ON(vma->obj != obj);
3018 
3019 		/* We allow the process to have multiple handles to the same
3020 		 * vma, in the same fd namespace, by virtue of flink/open.
3021 		 */
3022 		GEM_BUG_ON(!vma->open_count);
3023 		if (!--vma->open_count && !i915_vma_is_ggtt(vma))
3024 			i915_vma_close(vma);
3025 
3026 		list_del(&lut->obj_link);
3027 		list_del(&lut->ctx_link);
3028 
3029 		i915_lut_handle_free(lut);
3030 		__i915_gem_object_release_unless_active(obj);
3031 	}
3032 
3033 	mutex_unlock(&i915->drm.struct_mutex);
3034 }
3035 
3036 static unsigned long to_wait_timeout(s64 timeout_ns)
3037 {
3038 	if (timeout_ns < 0)
3039 		return MAX_SCHEDULE_TIMEOUT;
3040 
3041 	if (timeout_ns == 0)
3042 		return 0;
3043 
3044 	return nsecs_to_jiffies_timeout(timeout_ns);
3045 }
3046 
3047 /**
3048  * i915_gem_wait_ioctl - implements DRM_IOCTL_I915_GEM_WAIT
3049  * @dev: drm device pointer
3050  * @data: ioctl data blob
3051  * @file: drm file pointer
3052  *
3053  * Returns 0 if successful, else an error is returned with the remaining time in
3054  * the timeout parameter.
3055  *  -ETIME: object is still busy after timeout
3056  *  -ERESTARTSYS: signal interrupted the wait
3057  *  -ENOENT: object doesn't exist
3058  * Also possible, but rare:
3059  *  -EAGAIN: incomplete, restart syscall
3060  *  -ENOMEM: out of memory
3061  *  -ENODEV: Internal IRQ fail
3062  *  -E?: The add request failed
3063  *
3064  * The wait ioctl with a timeout of 0 reimplements the busy ioctl. With any
3065  * non-zero timeout parameter the wait ioctl will wait for the given number of
3066  * nanoseconds on an object becoming unbusy. Since the wait itself does so
3067  * without holding struct_mutex the object may become re-busied before this
3068  * function completes. A similar but shorter race condition exists in the busy
3069  * ioctl.
3070  */
3071 int
3072 i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3073 {
3074 	struct drm_i915_gem_wait *args = data;
3075 	struct drm_i915_gem_object *obj;
3076 	ktime_t start;
3077 	long ret;
3078 
3079 	if (args->flags != 0)
3080 		return -EINVAL;
3081 
3082 	obj = i915_gem_object_lookup(file, args->bo_handle);
3083 	if (!obj)
3084 		return -ENOENT;
3085 
3086 	start = ktime_get();
3087 
3088 	ret = i915_gem_object_wait(obj,
3089 				   I915_WAIT_INTERRUPTIBLE |
3090 				   I915_WAIT_PRIORITY |
3091 				   I915_WAIT_ALL,
3092 				   to_wait_timeout(args->timeout_ns));
3093 
3094 	if (args->timeout_ns > 0) {
3095 		args->timeout_ns -= ktime_to_ns(ktime_sub(ktime_get(), start));
3096 		if (args->timeout_ns < 0)
3097 			args->timeout_ns = 0;
3098 
3099 		/*
3100 		 * Apparently ktime isn't accurate enough and occasionally has a
3101 		 * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch
3102 		 * things up to make the test happy. We allow up to 1 jiffy.
3103 		 *
3104 		 * This is a regression from the timespec->ktime conversion.
3105 		 */
3106 		if (ret == -ETIME && !nsecs_to_jiffies(args->timeout_ns))
3107 			args->timeout_ns = 0;
3108 
3109 		/* Asked to wait beyond the jiffie/scheduler precision? */
3110 		if (ret == -ETIME && args->timeout_ns)
3111 			ret = -EAGAIN;
3112 	}
3113 
3114 	i915_gem_object_put(obj);
3115 	return ret;
3116 }
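
/*
 * Illustrative sketch (not part of the driver): the matching userspace call
 * only fills in the handle and timeout. A timeout of 0 gives the busy-ioctl
 * style poll described above, a negative timeout waits indefinitely. The
 * drm_fd and bo_handle names are assumptions for the example only.
 *
 *	struct drm_i915_gem_wait wait = {
 *		.bo_handle = bo_handle,
 *		.timeout_ns = -1,
 *	};
 *
 *	ret = ioctl(drm_fd, DRM_IOCTL_I915_GEM_WAIT, &wait);
 */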
3117 
3118 static int wait_for_engines(struct drm_i915_private *i915)
3119 {
3120 	if (wait_for(intel_engines_are_idle(i915), I915_IDLE_ENGINES_TIMEOUT)) {
3121 		dev_err(i915->drm.dev,
3122 			"Failed to idle engines, declaring wedged!\n");
3123 		GEM_TRACE_DUMP();
3124 		i915_gem_set_wedged(i915);
3125 		return -EIO;
3126 	}
3127 
3128 	return 0;
3129 }
3130 
3131 static long
3132 wait_for_timelines(struct drm_i915_private *i915,
3133 		   unsigned int flags, long timeout)
3134 {
3135 	struct i915_gt_timelines *gt = &i915->gt.timelines;
3136 	struct i915_timeline *tl;
3137 
3138 	if (!READ_ONCE(i915->gt.active_requests))
3139 		return timeout;
3140 
3141 	mutex_lock(&gt->mutex);
3142 	list_for_each_entry(tl, &gt->active_list, link) {
3143 		struct i915_request *rq;
3144 
3145 		rq = i915_active_request_get_unlocked(&tl->last_request);
3146 		if (!rq)
3147 			continue;
3148 
3149 		mutex_unlock(&gt->mutex);
3150 
3151 		/*
3152 		 * "Race-to-idle".
3153 		 *
3154 		 * Switching to the kernel context is often used as a synchronous
3155 		 * step prior to idling, e.g. in suspend for flushing all
3156 		 * current operations to memory before sleeping. These we
3157 		 * want to complete as quickly as possible to avoid prolonged
3158 		 * stalls, so allow the gpu to boost to maximum clocks.
3159 		 */
3160 		if (flags & I915_WAIT_FOR_IDLE_BOOST)
3161 			gen6_rps_boost(rq);
3162 
3163 		timeout = i915_request_wait(rq, flags, timeout);
3164 		i915_request_put(rq);
3165 		if (timeout < 0)
3166 			return timeout;
3167 
3168 		/* restart after reacquiring the lock */
3169 		mutex_lock(&gt->mutex);
3170 		tl = list_entry(&gt->active_list, typeof(*tl), link);
3171 	}
3172 	mutex_unlock(&gt->mutex);
3173 
3174 	return timeout;
3175 }
3176 
3177 int i915_gem_wait_for_idle(struct drm_i915_private *i915,
3178 			   unsigned int flags, long timeout)
3179 {
3180 	GEM_TRACE("flags=%x (%s), timeout=%ld%s\n",
3181 		  flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked",
3182 		  timeout, timeout == MAX_SCHEDULE_TIMEOUT ? " (forever)" : "");
3183 
3184 	/* If the device is asleep, we have no requests outstanding */
3185 	if (!READ_ONCE(i915->gt.awake))
3186 		return 0;
3187 
3188 	timeout = wait_for_timelines(i915, flags, timeout);
3189 	if (timeout < 0)
3190 		return timeout;
3191 
3192 	if (flags & I915_WAIT_LOCKED) {
3193 		int err;
3194 
3195 		lockdep_assert_held(&i915->drm.struct_mutex);
3196 
3197 		err = wait_for_engines(i915);
3198 		if (err)
3199 			return err;
3200 
3201 		i915_retire_requests(i915);
3202 	}
3203 
3204 	return 0;
3205 }
3206 
3207 static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
3208 {
3209 	/*
3210 	 * We manually flush the CPU domain so that we can override and
3211 	 * force the flush for the display, and perform it asynchronously.
3212 	 */
3213 	flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3214 	if (obj->cache_dirty)
3215 		i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
3216 	obj->write_domain = 0;
3217 }
3218 
3219 void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj)
3220 {
3221 	if (!READ_ONCE(obj->pin_global))
3222 		return;
3223 
3224 	mutex_lock(&obj->base.dev->struct_mutex);
3225 	__i915_gem_object_flush_for_display(obj);
3226 	mutex_unlock(&obj->base.dev->struct_mutex);
3227 }
3228 
3229 /**
3230  * Moves a single object to the WC read, and possibly write domain.
3231  * @obj: object to act on
3232  * @write: ask for write access or read only
3233  *
3234  * This function returns when the move is complete, including waiting on
3235  * flushes to occur.
3236  */
3237 int
3238 i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
3239 {
3240 	int ret;
3241 
3242 	lockdep_assert_held(&obj->base.dev->struct_mutex);
3243 
3244 	ret = i915_gem_object_wait(obj,
3245 				   I915_WAIT_INTERRUPTIBLE |
3246 				   I915_WAIT_LOCKED |
3247 				   (write ? I915_WAIT_ALL : 0),
3248 				   MAX_SCHEDULE_TIMEOUT);
3249 	if (ret)
3250 		return ret;
3251 
3252 	if (obj->write_domain == I915_GEM_DOMAIN_WC)
3253 		return 0;
3254 
3255 	/* Flush and acquire obj->pages so that we are coherent through
3256 	 * direct access in memory with previous cached writes through
3257 	 * shmemfs and that our cache domain tracking remains valid.
3258 	 * For example, if the obj->filp was moved to swap without us
3259 	 * being notified and releasing the pages, we would mistakenly
3260 	 * continue to assume that the obj remained out of the CPU cached
3261 	 * domain.
3262 	 */
3263 	ret = i915_gem_object_pin_pages(obj);
3264 	if (ret)
3265 		return ret;
3266 
3267 	flush_write_domain(obj, ~I915_GEM_DOMAIN_WC);
3268 
3269 	/* Serialise direct access to this object with the barriers for
3270 	 * coherent writes from the GPU, by effectively invalidating the
3271 	 * WC domain upon first access.
3272 	 */
3273 	if ((obj->read_domains & I915_GEM_DOMAIN_WC) == 0)
3274 		mb();
3275 
3276 	/* It should now be out of any other write domains, and we can update
3277 	 * the domain values for our changes.
3278 	 */
3279 	GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_WC) != 0);
3280 	obj->read_domains |= I915_GEM_DOMAIN_WC;
3281 	if (write) {
3282 		obj->read_domains = I915_GEM_DOMAIN_WC;
3283 		obj->write_domain = I915_GEM_DOMAIN_WC;
3284 		obj->mm.dirty = true;
3285 	}
3286 
3287 	i915_gem_object_unpin_pages(obj);
3288 	return 0;
3289 }
3290 
3291 /**
3292  * Moves a single object to the GTT read, and possibly write domain.
3293  * @obj: object to act on
3294  * @write: ask for write access or read only
3295  *
3296  * This function returns when the move is complete, including waiting on
3297  * flushes to occur.
3298  */
3299 int
3300 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
3301 {
3302 	int ret;
3303 
3304 	lockdep_assert_held(&obj->base.dev->struct_mutex);
3305 
3306 	ret = i915_gem_object_wait(obj,
3307 				   I915_WAIT_INTERRUPTIBLE |
3308 				   I915_WAIT_LOCKED |
3309 				   (write ? I915_WAIT_ALL : 0),
3310 				   MAX_SCHEDULE_TIMEOUT);
3311 	if (ret)
3312 		return ret;
3313 
3314 	if (obj->write_domain == I915_GEM_DOMAIN_GTT)
3315 		return 0;
3316 
3317 	/* Flush and acquire obj->pages so that we are coherent through
3318 	 * direct access in memory with previous cached writes through
3319 	 * shmemfs and that our cache domain tracking remains valid.
3320 	 * For example, if the obj->filp was moved to swap without us
3321 	 * being notified and releasing the pages, we would mistakenly
3322 	 * continue to assume that the obj remained out of the CPU cached
3323 	 * domain.
3324 	 */
3325 	ret = i915_gem_object_pin_pages(obj);
3326 	if (ret)
3327 		return ret;
3328 
3329 	flush_write_domain(obj, ~I915_GEM_DOMAIN_GTT);
3330 
3331 	/* Serialise direct access to this object with the barriers for
3332 	 * coherent writes from the GPU, by effectively invalidating the
3333 	 * GTT domain upon first access.
3334 	 */
3335 	if ((obj->read_domains & I915_GEM_DOMAIN_GTT) == 0)
3336 		mb();
3337 
3338 	/* It should now be out of any other write domains, and we can update
3339 	 * the domain values for our changes.
3340 	 */
3341 	GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_GTT) != 0);
3342 	obj->read_domains |= I915_GEM_DOMAIN_GTT;
3343 	if (write) {
3344 		obj->read_domains = I915_GEM_DOMAIN_GTT;
3345 		obj->write_domain = I915_GEM_DOMAIN_GTT;
3346 		obj->mm.dirty = true;
3347 	}
3348 
3349 	i915_gem_object_unpin_pages(obj);
3350 	return 0;
3351 }
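
/*
 * Sketch of a typical caller of the helper above (assumptions: struct_mutex
 * already held and the object already pinned by the caller): pull the
 * object into the GTT domain before writing through a GTT/WC mapping so
 * that stale CPU cachelines are flushed first.
 *
 *	ret = i915_gem_object_set_to_gtt_domain(obj, true);
 *	if (ret)
 *		return ret;
 *	... write through the GTT mapping ...
 */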
3352 
3353 /**
3354  * Changes the cache-level of an object across all VMA.
3355  * @obj: object to act on
3356  * @cache_level: new cache level to set for the object
3357  *
3358  * After this function returns, the object will be in the new cache-level
3359  * across all GTT and the contents of the backing storage will be coherent,
3360  * with respect to the new cache-level. In order to keep the backing storage
3361  * coherent for all users, we only allow a single cache level to be set
3362  * globally on the object and prevent it from being changed whilst the
3363  * hardware is reading from the object. That is, if the object is currently
3364  * on the scanout it will be set to uncached (or equivalent display
3365  * cache coherency) and all non-MOCS GPU access will also be uncached so
3366  * that all direct access to the scanout remains coherent.
3367  */
3368 int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
3369 				    enum i915_cache_level cache_level)
3370 {
3371 	struct i915_vma *vma;
3372 	int ret;
3373 
3374 	lockdep_assert_held(&obj->base.dev->struct_mutex);
3375 
3376 	if (obj->cache_level == cache_level)
3377 		return 0;
3378 
3379 	/* Inspect the list of currently bound VMA and unbind any that would
3380 	 * be invalid given the new cache-level. This is principally to
3381 	 * catch the issue of the CS prefetch crossing page boundaries and
3382 	 * reading an invalid PTE on older architectures.
3383 	 */
3384 restart:
3385 	list_for_each_entry(vma, &obj->vma.list, obj_link) {
3386 		if (!drm_mm_node_allocated(&vma->node))
3387 			continue;
3388 
3389 		if (i915_vma_is_pinned(vma)) {
3390 			DRM_DEBUG("can not change the cache level of pinned objects\n");
3391 			return -EBUSY;
3392 		}
3393 
3394 		if (!i915_vma_is_closed(vma) &&
3395 		    i915_gem_valid_gtt_space(vma, cache_level))
3396 			continue;
3397 
3398 		ret = i915_vma_unbind(vma);
3399 		if (ret)
3400 			return ret;
3401 
3402 		/* As unbinding may affect other elements in the
3403 		 * obj->vma_list (due to side-effects from retiring
3404 		 * an active vma), play safe and restart the iterator.
3405 		 */
3406 		goto restart;
3407 	}
3408 
3409 	/* We can reuse the existing drm_mm nodes but need to change the
3410 	 * cache-level on the PTE. We could simply unbind them all and
3411 	 * rebind with the correct cache-level on next use. However since
3412 	 * we already have a valid slot, dma mapping, pages etc, we may as well
3413 	 * rewrite the PTE in the belief that doing so tramples upon less
3414 	 * state and so involves less work.
3415 	 */
3416 	if (obj->bind_count) {
3417 		/* Before we change the PTE, the GPU must not be accessing it.
3418 		 * If we wait upon the object, we know that all the bound
3419 		 * VMA are no longer active.
3420 		 */
3421 		ret = i915_gem_object_wait(obj,
3422 					   I915_WAIT_INTERRUPTIBLE |
3423 					   I915_WAIT_LOCKED |
3424 					   I915_WAIT_ALL,
3425 					   MAX_SCHEDULE_TIMEOUT);
3426 		if (ret)
3427 			return ret;
3428 
3429 		if (!HAS_LLC(to_i915(obj->base.dev)) &&
3430 		    cache_level != I915_CACHE_NONE) {
3431 			/* Access to snoopable pages through the GTT is
3432 			 * incoherent and on some machines causes a hard
3433 			 * lockup. Relinquish the CPU mmapping to force
3434 			 * userspace to refault in the pages and we can
3435 			 * then double check if the GTT mapping is still
3436 			 * valid for that pointer access.
3437 			 */
3438 			i915_gem_release_mmap(obj);
3439 
3440 			/* As we no longer need a fence for GTT access,
3441 			 * we can relinquish it now (and so prevent having
3442 			 * to steal a fence from someone else on the next
3443 			 * fence request). Note GPU activity would have
3444 			 * dropped the fence as all snoopable access is
3445 			 * supposed to be linear.
3446 			 */
3447 			for_each_ggtt_vma(vma, obj) {
3448 				ret = i915_vma_put_fence(vma);
3449 				if (ret)
3450 					return ret;
3451 			}
3452 		} else {
3453 			/* We either have incoherent backing store and
3454 			 * so no GTT access or the architecture is fully
3455 			 * coherent. In such cases, existing GTT mmaps
3456 			 * ignore the cache bit in the PTE and we can
3457 			 * rewrite it without confusing the GPU or having
3458 			 * to force userspace to fault back in its mmaps.
3459 			 */
3460 		}
3461 
3462 		list_for_each_entry(vma, &obj->vma.list, obj_link) {
3463 			if (!drm_mm_node_allocated(&vma->node))
3464 				continue;
3465 
3466 			ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
3467 			if (ret)
3468 				return ret;
3469 		}
3470 	}
3471 
3472 	list_for_each_entry(vma, &obj->vma.list, obj_link)
3473 		vma->node.color = cache_level;
3474 	i915_gem_object_set_cache_coherency(obj, cache_level);
3475 	obj->cache_dirty = true; /* Always invalidate stale cachelines */
3476 
3477 	return 0;
3478 }
3479 
3480 int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
3481 			       struct drm_file *file)
3482 {
3483 	struct drm_i915_gem_caching *args = data;
3484 	struct drm_i915_gem_object *obj;
3485 	int err = 0;
3486 
3487 	rcu_read_lock();
3488 	obj = i915_gem_object_lookup_rcu(file, args->handle);
3489 	if (!obj) {
3490 		err = -ENOENT;
3491 		goto out;
3492 	}
3493 
3494 	switch (obj->cache_level) {
3495 	case I915_CACHE_LLC:
3496 	case I915_CACHE_L3_LLC:
3497 		args->caching = I915_CACHING_CACHED;
3498 		break;
3499 
3500 	case I915_CACHE_WT:
3501 		args->caching = I915_CACHING_DISPLAY;
3502 		break;
3503 
3504 	default:
3505 		args->caching = I915_CACHING_NONE;
3506 		break;
3507 	}
3508 out:
3509 	rcu_read_unlock();
3510 	return err;
3511 }
3512 
3513 int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
3514 			       struct drm_file *file)
3515 {
3516 	struct drm_i915_private *i915 = to_i915(dev);
3517 	struct drm_i915_gem_caching *args = data;
3518 	struct drm_i915_gem_object *obj;
3519 	enum i915_cache_level level;
3520 	int ret = 0;
3521 
3522 	switch (args->caching) {
3523 	case I915_CACHING_NONE:
3524 		level = I915_CACHE_NONE;
3525 		break;
3526 	case I915_CACHING_CACHED:
3527 		/*
3528 		 * Due to a HW issue on BXT A stepping, GPU stores via a
3529 		 * snooped mapping may leave stale data in a corresponding CPU
3530 		 * cacheline, whereas normally such cachelines would get
3531 		 * invalidated.
3532 		 */
3533 		if (!HAS_LLC(i915) && !HAS_SNOOP(i915))
3534 			return -ENODEV;
3535 
3536 		level = I915_CACHE_LLC;
3537 		break;
3538 	case I915_CACHING_DISPLAY:
3539 		level = HAS_WT(i915) ? I915_CACHE_WT : I915_CACHE_NONE;
3540 		break;
3541 	default:
3542 		return -EINVAL;
3543 	}
3544 
3545 	obj = i915_gem_object_lookup(file, args->handle);
3546 	if (!obj)
3547 		return -ENOENT;
3548 
3549 	/*
3550 	 * The caching mode of a proxy object is handled by its generator, and
3551 	 * not allowed to be changed by userspace.
3552 	 */
3553 	if (i915_gem_object_is_proxy(obj)) {
3554 		ret = -ENXIO;
3555 		goto out;
3556 	}
3557 
3558 	if (obj->cache_level == level)
3559 		goto out;
3560 
3561 	ret = i915_gem_object_wait(obj,
3562 				   I915_WAIT_INTERRUPTIBLE,
3563 				   MAX_SCHEDULE_TIMEOUT);
3564 	if (ret)
3565 		goto out;
3566 
3567 	ret = i915_mutex_lock_interruptible(dev);
3568 	if (ret)
3569 		goto out;
3570 
3571 	ret = i915_gem_object_set_cache_level(obj, level);
3572 	mutex_unlock(&dev->struct_mutex);
3573 
3574 out:
3575 	i915_gem_object_put(obj);
3576 	return ret;
3577 }
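
/*
 * Illustrative sketch (not part of the driver): userspace selects one of
 * the I915_CACHING_* modes handled above via the set_caching ioctl. The
 * drm_fd and bo_handle names are assumptions for the example only.
 *
 *	struct drm_i915_gem_caching arg = {
 *		.handle = bo_handle,
 *		.caching = I915_CACHING_CACHED,
 *	};
 *
 *	ret = ioctl(drm_fd, DRM_IOCTL_I915_GEM_SET_CACHING, &arg);
 */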
3578 
3579 /*
3580  * Prepare buffer for display plane (scanout, cursors, etc). Can be called from
3581  * an uninterruptible phase (modesetting) and allows any flushes to be pipelined
3582  * (for pageflips). We only flush the caches while preparing the buffer for
3583  * display, the callers are responsible for frontbuffer flush.
3584  */
3585 struct i915_vma *
3586 i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
3587 				     u32 alignment,
3588 				     const struct i915_ggtt_view *view,
3589 				     unsigned int flags)
3590 {
3591 	struct i915_vma *vma;
3592 	int ret;
3593 
3594 	lockdep_assert_held(&obj->base.dev->struct_mutex);
3595 
3596 	/* Mark the global pin early so that we account for the
3597 	 * display coherency whilst setting up the cache domains.
3598 	 */
3599 	obj->pin_global++;
3600 
3601 	/* The display engine is not coherent with the LLC cache on gen6.  As
3602 	 * a result, we make sure that the pinning that is about to occur is
3603 	 * done with uncached PTEs. This is lowest common denominator for all
3604 	 * chipsets.
3605 	 *
3606 	 * However for gen6+, we could do better by using the GFDT bit instead
3607 	 * of uncaching, which would allow us to flush all the LLC-cached data
3608 	 * with that bit in the PTE to main memory with just one PIPE_CONTROL.
3609 	 */
3610 	ret = i915_gem_object_set_cache_level(obj,
3611 					      HAS_WT(to_i915(obj->base.dev)) ?
3612 					      I915_CACHE_WT : I915_CACHE_NONE);
3613 	if (ret) {
3614 		vma = ERR_PTR(ret);
3615 		goto err_unpin_global;
3616 	}
3617 
3618 	/* As the user may map the buffer once pinned in the display plane
3619 	 * (e.g. libkms for the bootup splash), we have to ensure that we
3620 	 * always use map_and_fenceable for all scanout buffers. However,
3621 	 * it may simply be too big to fit into mappable, in which case
3622 	 * put it anyway and hope that userspace can cope (but always first
3623 	 * try to preserve the existing ABI).
3624 	 */
3625 	vma = ERR_PTR(-ENOSPC);
3626 	if ((flags & PIN_MAPPABLE) == 0 &&
3627 	    (!view || view->type == I915_GGTT_VIEW_NORMAL))
3628 		vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment,
3629 					       flags |
3630 					       PIN_MAPPABLE |
3631 					       PIN_NONBLOCK);
3632 	if (IS_ERR(vma))
3633 		vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags);
3634 	if (IS_ERR(vma))
3635 		goto err_unpin_global;
3636 
3637 	vma->display_alignment = max_t(u64, vma->display_alignment, alignment);
3638 
3639 	__i915_gem_object_flush_for_display(obj);
3640 
3641 	/* It should now be out of any other write domains, and we can update
3642 	 * the domain values for our changes.
3643 	 */
3644 	obj->read_domains |= I915_GEM_DOMAIN_GTT;
3645 
3646 	return vma;
3647 
3648 err_unpin_global:
3649 	obj->pin_global--;
3650 	return vma;
3651 }
3652 
3653 void
3654 i915_gem_object_unpin_from_display_plane(struct i915_vma *vma)
3655 {
3656 	lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
3657 
3658 	if (WARN_ON(vma->obj->pin_global == 0))
3659 		return;
3660 
3661 	if (--vma->obj->pin_global == 0)
3662 		vma->display_alignment = I915_GTT_MIN_ALIGNMENT;
3663 
3664 	/* Bump the LRU to try to avoid premature eviction whilst flipping */
3665 	i915_gem_object_bump_inactive_ggtt(vma->obj);
3666 
3667 	i915_vma_unpin(vma);
3668 }
3669 
3670 /**
3671  * Moves a single object to the CPU read, and possibly write, domain.
3672  * @obj: object to act on
3673  * @write: requesting write or read-only access
3674  *
3675  * This function returns when the move is complete, including waiting on
3676  * flushes to occur.
3677  */
3678 int
3679 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
3680 {
3681 	int ret;
3682 
3683 	lockdep_assert_held(&obj->base.dev->struct_mutex);
3684 
3685 	ret = i915_gem_object_wait(obj,
3686 				   I915_WAIT_INTERRUPTIBLE |
3687 				   I915_WAIT_LOCKED |
3688 				   (write ? I915_WAIT_ALL : 0),
3689 				   MAX_SCHEDULE_TIMEOUT);
3690 	if (ret)
3691 		return ret;
3692 
3693 	flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3694 
3695 	/* Flush the CPU cache if it's still invalid. */
3696 	if ((obj->read_domains & I915_GEM_DOMAIN_CPU) == 0) {
3697 		i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
3698 		obj->read_domains |= I915_GEM_DOMAIN_CPU;
3699 	}
3700 
3701 	/* It should now be out of any other write domains, and we can update
3702 	 * the domain values for our changes.
3703 	 */
3704 	GEM_BUG_ON(obj->write_domain & ~I915_GEM_DOMAIN_CPU);
3705 
3706 	/* If we're writing through the CPU, then the GPU read domains will
3707 	 * need to be invalidated at next use.
3708 	 */
3709 	if (write)
3710 		__start_cpu_write(obj);
3711 
3712 	return 0;
3713 }
3714 
3715 /* Throttle our rendering by waiting until the ring has completed our requests
3716  * emitted over 20 msec ago.
3717  *
3718  * Note that if we were to use the current jiffies each time around the loop,
3719  * we wouldn't escape the function with any frames outstanding if the time to
3720  * render a frame was over 20ms.
3721  *
3722  * This should get us reasonable parallelism between CPU and GPU but also
3723  * relatively low latency when blocking on a particular request to finish.
3724  */
3725 static int
3726 i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
3727 {
3728 	struct drm_i915_private *dev_priv = to_i915(dev);
3729 	struct drm_i915_file_private *file_priv = file->driver_priv;
3730 	unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES;
3731 	struct i915_request *request, *target = NULL;
3732 	long ret;
3733 
3734 	/* ABI: return -EIO if already wedged */
3735 	ret = i915_terminally_wedged(dev_priv);
3736 	if (ret)
3737 		return ret;
3738 
3739 	spin_lock(&file_priv->mm.lock);
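	/*
	 * Find the most recent request from this client that was emitted
	 * before the throttle window opened; older requests we walk past are
	 * of no further interest and are unlinked from the client.
	 */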
3740 	list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
3741 		if (time_after_eq(request->emitted_jiffies, recent_enough))
3742 			break;
3743 
3744 		if (target) {
3745 			list_del(&target->client_link);
3746 			target->file_priv = NULL;
3747 		}
3748 
3749 		target = request;
3750 	}
3751 	if (target)
3752 		i915_request_get(target);
3753 	spin_unlock(&file_priv->mm.lock);
3754 
3755 	if (target == NULL)
3756 		return 0;
3757 
3758 	ret = i915_request_wait(target,
3759 				I915_WAIT_INTERRUPTIBLE,
3760 				MAX_SCHEDULE_TIMEOUT);
3761 	i915_request_put(target);
3762 
3763 	return ret < 0 ? ret : 0;
3764 }
3765 
3766 struct i915_vma *
3767 i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
3768 			 const struct i915_ggtt_view *view,
3769 			 u64 size,
3770 			 u64 alignment,
3771 			 u64 flags)
3772 {
3773 	struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
3774 	struct i915_address_space *vm = &dev_priv->ggtt.vm;
3775 	struct i915_vma *vma;
3776 	int ret;
3777 
3778 	lockdep_assert_held(&obj->base.dev->struct_mutex);
3779 
3780 	if (flags & PIN_MAPPABLE &&
3781 	    (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
3782 		/* If the required space is larger than the available
3783 		 * aperture, we will not be able to find a slot for the
3784 		 * object and unbinding the object now will be in
3785 		 * vain. Worse, doing so may cause us to ping-pong
3786 		 * the object in and out of the Global GTT and
3787 		 * waste a lot of cycles under the mutex.
3788 		 */
3789 		if (obj->base.size > dev_priv->ggtt.mappable_end)
3790 			return ERR_PTR(-E2BIG);
3791 
3792 		/* If NONBLOCK is set the caller is optimistically
3793 		 * trying to cache the full object within the mappable
3794 		 * aperture, and *must* have a fallback in place for
3795 		 * situations where we cannot bind the object. We
3796 		 * can be a little more lax here and use the fallback
3797 		 * more often to avoid costly migrations of ourselves
3798 		 * and other objects within the aperture.
3799 		 *
3800 		 * Half-the-aperture is used as a simple heuristic.
3801 		 * More interesting would to do search for a free
3802 		 * More interesting would be to search for a free
3803 		 * That caters for the self-harm case, and with a
3804 		 * little more heuristics (e.g. NOFAULT, NOEVICT)
3805 		 * we could try to minimise harm to others.
3806 		 */
3807 		if (flags & PIN_NONBLOCK &&
3808 		    obj->base.size > dev_priv->ggtt.mappable_end / 2)
3809 			return ERR_PTR(-ENOSPC);
3810 	}
3811 
3812 	vma = i915_vma_instance(obj, vm, view);
3813 	if (IS_ERR(vma))
3814 		return vma;
3815 
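	/*
	 * If the vma already exists but does not satisfy the requested
	 * size/alignment/flags, unbind it first so that i915_vma_pin() can
	 * find it a suitable slot - unless PIN_NONBLOCK forbids us from
	 * disturbing a busy (pinned or active) binding.
	 */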
3816 	if (i915_vma_misplaced(vma, size, alignment, flags)) {
3817 		if (flags & PIN_NONBLOCK) {
3818 			if (i915_vma_is_pinned(vma) || i915_vma_is_active(vma))
3819 				return ERR_PTR(-ENOSPC);
3820 
3821 			if (flags & PIN_MAPPABLE &&
3822 			    vma->fence_size > dev_priv->ggtt.mappable_end / 2)
3823 				return ERR_PTR(-ENOSPC);
3824 		}
3825 
3826 		WARN(i915_vma_is_pinned(vma),
3827 		     "bo is already pinned in ggtt with incorrect alignment:"
3828 		     " offset=%08x, req.alignment=%llx,"
3829 		     " req.map_and_fenceable=%d, vma->map_and_fenceable=%d\n",
3830 		     i915_ggtt_offset(vma), alignment,
3831 		     !!(flags & PIN_MAPPABLE),
3832 		     i915_vma_is_map_and_fenceable(vma));
3833 		ret = i915_vma_unbind(vma);
3834 		if (ret)
3835 			return ERR_PTR(ret);
3836 	}
3837 
3838 	ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL);
3839 	if (ret)
3840 		return ERR_PTR(ret);
3841 
3842 	return vma;
3843 }
3844 
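/*
 * Busy-ioctl result encoding (see i915_gem_busy_ioctl() below): the upper
 * 16 bits of args->busy form a bitmask of uabi engine classes with
 * outstanding reads, while the lower 16 bits carry the uabi class of the
 * last writer, biased by one so that zero means "no writer". The writer is
 * always reported amongst the readers as well. As an illustrative (not
 * authoritative - see the uapi header) decode on the userspace side:
 *
 *	readers = busy >> 16;
 *	writer  = (busy & 0xffff) - 1;	(valid only if the low word is non-zero)
 */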
3845 static __always_inline u32 __busy_read_flag(u8 id)
3846 {
3847 	if (id == (u8)I915_ENGINE_CLASS_INVALID)
3848 		return 0xffff0000u;
3849 
3850 	GEM_BUG_ON(id >= 16);
3851 	return 0x10000u << id;
3852 }
3853 
3854 static __always_inline u32 __busy_write_id(u8 id)
3855 {
3856 	/*
3857 	 * The uABI guarantees an active writer is also amongst the read
3858 	 * engines. This would be true if we accessed the activity tracking
3859 	 * under the lock, but as we perform the lookup of the object and
3860 	 * its activity locklessly we can not guarantee that the last_write
3861 	 * being active implies that we have set the same engine flag from
3862 	 * last_read - hence we always set both read and write busy for
3863 	 * last_write.
3864 	 */
3865 	if (id == (u8)I915_ENGINE_CLASS_INVALID)
3866 		return 0xffffffffu;
3867 
3868 	return (id + 1) | __busy_read_flag(id);
3869 }
3870 
3871 static __always_inline unsigned int
3872 __busy_set_if_active(const struct dma_fence *fence, u32 (*flag)(u8 id))
3873 {
3874 	const struct i915_request *rq;
3875 
3876 	/*
3877 	 * We have to check the current hw status of the fence as the uABI
3878 	 * guarantees forward progress. We could rely on the idle worker
3879 	 * to eventually flush us, but to minimise latency just ask the
3880 	 * hardware.
3881 	 *
3882 	 * Note we only report on the status of native fences.
3883 	 */
3884 	if (!dma_fence_is_i915(fence))
3885 		return 0;
3886 
3887 	/* opencode to_request() in order to avoid const warnings */
3888 	rq = container_of(fence, const struct i915_request, fence);
3889 	if (i915_request_completed(rq))
3890 		return 0;
3891 
3892 	/* Beware type-expansion follies! */
3893 	BUILD_BUG_ON(!typecheck(u8, rq->engine->uabi_class));
3894 	return flag(rq->engine->uabi_class);
3895 }
3896 
3897 static __always_inline unsigned int
3898 busy_check_reader(const struct dma_fence *fence)
3899 {
3900 	return __busy_set_if_active(fence, __busy_read_flag);
3901 }
3902 
3903 static __always_inline unsigned int
3904 busy_check_writer(const struct dma_fence *fence)
3905 {
3906 	if (!fence)
3907 		return 0;
3908 
3909 	return __busy_set_if_active(fence, __busy_write_id);
3910 }
3911 
3912 int
3913 i915_gem_busy_ioctl(struct drm_device *dev, void *data,
3914 		    struct drm_file *file)
3915 {
3916 	struct drm_i915_gem_busy *args = data;
3917 	struct drm_i915_gem_object *obj;
3918 	struct reservation_object_list *list;
3919 	unsigned int seq;
3920 	int err;
3921 
3922 	err = -ENOENT;
3923 	rcu_read_lock();
3924 	obj = i915_gem_object_lookup_rcu(file, args->handle);
3925 	if (!obj)
3926 		goto out;
3927 
3928 	/*
3929 	 * A discrepancy here is that we do not report the status of
3930 	 * non-i915 fences, i.e. even though we may report the object as idle,
3931 	 * a call to set-domain may still stall waiting for foreign rendering.
3932 	 * This also means that wait-ioctl may report an object as busy,
3933 	 * where busy-ioctl considers it idle.
3934 	 *
3935 	 * We trade the ability to warn of foreign fences to report on which
3936 	 * i915 engines are active for the object.
3937 	 *
3938 	 * Alternatively, we can trade that extra information on read/write
3939 	 * activity with
3940 	 *	args->busy =
3941 	 *		!reservation_object_test_signaled_rcu(obj->resv, true);
3942 	 * to report the overall busyness. This is what the wait-ioctl does.
3943 	 *
3944 	 */
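	/*
	 * Lockless snapshot: sample the reservation seqcount, read the
	 * exclusive and shared fences under RCU, and start over if the
	 * fences changed while we were building a non-zero busy mask (an
	 * object seen as idle is reported as idle without retrying).
	 */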
3945 retry:
3946 	seq = raw_read_seqcount(&obj->resv->seq);
3947 
3948 	/* Translate the exclusive fence to the READ *and* WRITE engine */
3949 	args->busy = busy_check_writer(rcu_dereference(obj->resv->fence_excl));
3950 
3951 	/* Translate shared fences to READ set of engines */
3952 	list = rcu_dereference(obj->resv->fence);
3953 	if (list) {
3954 		unsigned int shared_count = list->shared_count, i;
3955 
3956 		for (i = 0; i < shared_count; ++i) {
3957 			struct dma_fence *fence =
3958 				rcu_dereference(list->shared[i]);
3959 
3960 			args->busy |= busy_check_reader(fence);
3961 		}
3962 	}
3963 
3964 	if (args->busy && read_seqcount_retry(&obj->resv->seq, seq))
3965 		goto retry;
3966 
3967 	err = 0;
3968 out:
3969 	rcu_read_unlock();
3970 	return err;
3971 }
3972 
3973 int
3974 i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
3975 			struct drm_file *file_priv)
3976 {
3977 	return i915_gem_ring_throttle(dev, file_priv);
3978 }
3979 
3980 int
3981 i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
3982 		       struct drm_file *file_priv)
3983 {
3984 	struct drm_i915_private *dev_priv = to_i915(dev);
3985 	struct drm_i915_gem_madvise *args = data;
3986 	struct drm_i915_gem_object *obj;
3987 	int err;
3988 
3989 	switch (args->madv) {
3990 	case I915_MADV_DONTNEED:
3991 	case I915_MADV_WILLNEED:
3992 	    break;
3993 	default:
3994 	    return -EINVAL;
3995 	}
3996 
3997 	obj = i915_gem_object_lookup(file_priv, args->handle);
3998 	if (!obj)
3999 		return -ENOENT;
4000 
4001 	err = mutex_lock_interruptible(&obj->mm.lock);
4002 	if (err)
4003 		goto out;
4004 
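	/*
	 * Tiled objects on platforms with QUIRK_PIN_SWIZZLED_PAGES carry an
	 * extra pages pin (obj->mm.quirked) while they are WILLNEED; move
	 * that pin over to match the new advice before recording it.
	 */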
4005 	if (i915_gem_object_has_pages(obj) &&
4006 	    i915_gem_object_is_tiled(obj) &&
4007 	    dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
4008 		if (obj->mm.madv == I915_MADV_WILLNEED) {
4009 			GEM_BUG_ON(!obj->mm.quirked);
4010 			__i915_gem_object_unpin_pages(obj);
4011 			obj->mm.quirked = false;
4012 		}
4013 		if (args->madv == I915_MADV_WILLNEED) {
4014 			GEM_BUG_ON(obj->mm.quirked);
4015 			__i915_gem_object_pin_pages(obj);
4016 			obj->mm.quirked = true;
4017 		}
4018 	}
4019 
4020 	if (obj->mm.madv != __I915_MADV_PURGED)
4021 		obj->mm.madv = args->madv;
4022 
4023 	/* if the object is no longer attached, discard its backing storage */
4024 	if (obj->mm.madv == I915_MADV_DONTNEED &&
4025 	    !i915_gem_object_has_pages(obj))
4026 		i915_gem_object_truncate(obj);
4027 
4028 	args->retained = obj->mm.madv != __I915_MADV_PURGED;
4029 	mutex_unlock(&obj->mm.lock);
4030 
4031 out:
4032 	i915_gem_object_put(obj);
4033 	return err;
4034 }
4035 
4036 static void
4037 frontbuffer_retire(struct i915_active_request *active,
4038 		   struct i915_request *request)
4039 {
4040 	struct drm_i915_gem_object *obj =
4041 		container_of(active, typeof(*obj), frontbuffer_write);
4042 
4043 	intel_fb_obj_flush(obj, ORIGIN_CS);
4044 }
4045 
4046 void i915_gem_object_init(struct drm_i915_gem_object *obj,
4047 			  const struct drm_i915_gem_object_ops *ops)
4048 {
4049 	mutex_init(&obj->mm.lock);
4050 
4051 	spin_lock_init(&obj->vma.lock);
4052 	INIT_LIST_HEAD(&obj->vma.list);
4053 
4054 	INIT_LIST_HEAD(&obj->lut_list);
4055 	INIT_LIST_HEAD(&obj->batch_pool_link);
4056 
4057 	init_rcu_head(&obj->rcu);
4058 
4059 	obj->ops = ops;
4060 
4061 	reservation_object_init(&obj->__builtin_resv);
4062 	obj->resv = &obj->__builtin_resv;
4063 
4064 	obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
4065 	i915_active_request_init(&obj->frontbuffer_write,
4066 				 NULL, frontbuffer_retire);
4067 
4068 	obj->mm.madv = I915_MADV_WILLNEED;
4069 	INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
4070 	mutex_init(&obj->mm.get_page.lock);
4071 
4072 	i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size);
4073 }
4074 
4075 static const struct drm_i915_gem_object_ops i915_gem_object_ops = {
4076 	.flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
4077 		 I915_GEM_OBJECT_IS_SHRINKABLE,
4078 
4079 	.get_pages = i915_gem_object_get_pages_gtt,
4080 	.put_pages = i915_gem_object_put_pages_gtt,
4081 
4082 	.pwrite = i915_gem_object_pwrite_gtt,
4083 };
4084 
4085 static int i915_gem_object_create_shmem(struct drm_device *dev,
4086 					struct drm_gem_object *obj,
4087 					size_t size)
4088 {
4089 	struct drm_i915_private *i915 = to_i915(dev);
4090 	unsigned long flags = VM_NORESERVE;
4091 	struct file *filp;
4092 
4093 	drm_gem_private_object_init(dev, obj, size);
4094 
4095 	if (i915->mm.gemfs)
4096 		filp = shmem_file_setup_with_mnt(i915->mm.gemfs, "i915", size,
4097 						 flags);
4098 	else
4099 		filp = shmem_file_setup("i915", size, flags);
4100 
4101 	if (IS_ERR(filp))
4102 		return PTR_ERR(filp);
4103 
4104 	obj->filp = filp;
4105 
4106 	return 0;
4107 }
4108 
4109 struct drm_i915_gem_object *
4110 i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
4111 {
4112 	struct drm_i915_gem_object *obj;
4113 	struct address_space *mapping;
4114 	unsigned int cache_level;
4115 	gfp_t mask;
4116 	int ret;
4117 
4118 	/* There is a prevalence of the assumption that we fit the object's
4119 	 * page count inside a 32bit _signed_ variable. Let's document this and
4120 	 * catch it if we ever need to fix it. In the meantime, if you do spot
4121 	 * such a local variable, please consider fixing!
4122 	 */
4123 	if (size >> PAGE_SHIFT > INT_MAX)
4124 		return ERR_PTR(-E2BIG);
4125 
4126 	if (overflows_type(size, obj->base.size))
4127 		return ERR_PTR(-E2BIG);
4128 
4129 	obj = i915_gem_object_alloc();
4130 	if (obj == NULL)
4131 		return ERR_PTR(-ENOMEM);
4132 
4133 	ret = i915_gem_object_create_shmem(&dev_priv->drm, &obj->base, size);
4134 	if (ret)
4135 		goto fail;
4136 
4137 	mask = GFP_HIGHUSER | __GFP_RECLAIMABLE;
4138 	if (IS_I965GM(dev_priv) || IS_I965G(dev_priv)) {
4139 		/* 965gm cannot relocate objects above 4GiB. */
4140 		mask &= ~__GFP_HIGHMEM;
4141 		mask |= __GFP_DMA32;
4142 	}
4143 
4144 	mapping = obj->base.filp->f_mapping;
4145 	mapping_set_gfp_mask(mapping, mask);
4146 	GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM));
4147 
4148 	i915_gem_object_init(obj, &i915_gem_object_ops);
4149 
4150 	obj->write_domain = I915_GEM_DOMAIN_CPU;
4151 	obj->read_domains = I915_GEM_DOMAIN_CPU;
4152 
4153 	if (HAS_LLC(dev_priv))
4154 		/* On some devices, we can have the GPU use the LLC (the CPU
4155 		 * cache) for about a 10% performance improvement
4156 		 * compared to uncached.  Graphics requests other than
4157 		 * display scanout are coherent with the CPU in
4158 		 * accessing this cache.  This means in this mode we
4159 		 * don't need to clflush on the CPU side, and on the
4160 		 * GPU side we only need to flush internal caches to
4161 		 * get data visible to the CPU.
4162 		 *
4163 		 * However, we maintain the display planes as UC, and so
4164 		 * need to rebind when first used as such.
4165 		 */
4166 		cache_level = I915_CACHE_LLC;
4167 	else
4168 		cache_level = I915_CACHE_NONE;
4169 
4170 	i915_gem_object_set_cache_coherency(obj, cache_level);
4171 
4172 	trace_i915_gem_object_create(obj);
4173 
4174 	return obj;
4175 
4176 fail:
4177 	i915_gem_object_free(obj);
4178 	return ERR_PTR(ret);
4179 }
4180 
4181 static bool discard_backing_storage(struct drm_i915_gem_object *obj)
4182 {
4183 	/* If we are the last user of the backing storage (be it shmemfs
4184 	 * pages or stolen etc), we know that the pages are going to be
4185 	 * immediately released. In this case, we can then skip copying
4186 	 * back the contents from the GPU.
4187 	 */
4188 
4189 	if (obj->mm.madv != I915_MADV_WILLNEED)
4190 		return false;
4191 
4192 	if (obj->base.filp == NULL)
4193 		return true;
4194 
4195 	/* At first glance, this looks racy, but then again so would be
4196 	 * userspace racing mmap against close. However, the first external
4197 	 * reference to the filp can only be obtained through the
4198 	 * i915_gem_mmap_ioctl() which safeguards us against the user
4199 	 * acquiring such a reference whilst we are in the middle of
4200 	 * freeing the object.
4201 	 */
4202 	return file_count(obj->base.filp) == 1;
4203 }
4204 
4205 static void __i915_gem_free_objects(struct drm_i915_private *i915,
4206 				    struct llist_node *freed)
4207 {
4208 	struct drm_i915_gem_object *obj, *on;
4209 	intel_wakeref_t wakeref;
4210 
4211 	wakeref = intel_runtime_pm_get(i915);
4212 	llist_for_each_entry_safe(obj, on, freed, freed) {
4213 		struct i915_vma *vma, *vn;
4214 
4215 		trace_i915_gem_object_destroy(obj);
4216 
4217 		mutex_lock(&i915->drm.struct_mutex);
4218 
4219 		GEM_BUG_ON(i915_gem_object_is_active(obj));
4220 		list_for_each_entry_safe(vma, vn, &obj->vma.list, obj_link) {
4221 			GEM_BUG_ON(i915_vma_is_active(vma));
4222 			vma->flags &= ~I915_VMA_PIN_MASK;
4223 			i915_vma_destroy(vma);
4224 		}
4225 		GEM_BUG_ON(!list_empty(&obj->vma.list));
4226 		GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma.tree));
4227 
4228 		/* This serializes freeing with the shrinker. Since the free
4229 		 * is delayed, first by RCU then by the workqueue, we want the
4230 		 * shrinker to be able to free pages of unreferenced objects,
4231 		 * or else we may oom whilst there are plenty of deferred
4232 		 * freed objects.
4233 		 */
4234 		if (i915_gem_object_has_pages(obj)) {
4235 			spin_lock(&i915->mm.obj_lock);
4236 			list_del_init(&obj->mm.link);
4237 			spin_unlock(&i915->mm.obj_lock);
4238 		}
4239 
4240 		mutex_unlock(&i915->drm.struct_mutex);
4241 
4242 		GEM_BUG_ON(obj->bind_count);
4243 		GEM_BUG_ON(obj->userfault_count);
4244 		GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
4245 		GEM_BUG_ON(!list_empty(&obj->lut_list));
4246 
4247 		if (obj->ops->release)
4248 			obj->ops->release(obj);
4249 
4250 		if (WARN_ON(i915_gem_object_has_pinned_pages(obj)))
4251 			atomic_set(&obj->mm.pages_pin_count, 0);
4252 		__i915_gem_object_put_pages(obj, I915_MM_NORMAL);
4253 		GEM_BUG_ON(i915_gem_object_has_pages(obj));
4254 
4255 		if (obj->base.import_attach)
4256 			drm_prime_gem_destroy(&obj->base, NULL);
4257 
4258 		reservation_object_fini(&obj->__builtin_resv);
4259 		drm_gem_object_release(&obj->base);
4260 		i915_gem_info_remove_obj(i915, obj->base.size);
4261 
4262 		bitmap_free(obj->bit_17);
4263 		i915_gem_object_free(obj);
4264 
4265 		GEM_BUG_ON(!atomic_read(&i915->mm.free_count));
4266 		atomic_dec(&i915->mm.free_count);
4267 
4268 		if (on)
4269 			cond_resched();
4270 	}
4271 	intel_runtime_pm_put(i915, wakeref);
4272 }
4273 
4274 static void i915_gem_flush_free_objects(struct drm_i915_private *i915)
4275 {
4276 	struct llist_node *freed;
4277 
4278 	/* Free the oldest, most stale object to keep the free_list short */
4279 	freed = NULL;
4280 	if (!llist_empty(&i915->mm.free_list)) { /* quick test for hotpath */
4281 		/* Only one consumer of llist_del_first() allowed */
4282 		spin_lock(&i915->mm.free_lock);
4283 		freed = llist_del_first(&i915->mm.free_list);
4284 		spin_unlock(&i915->mm.free_lock);
4285 	}
4286 	if (unlikely(freed)) {
4287 		freed->next = NULL;
4288 		__i915_gem_free_objects(i915, freed);
4289 	}
4290 }
4291 
4292 static void __i915_gem_free_work(struct work_struct *work)
4293 {
4294 	struct drm_i915_private *i915 =
4295 		container_of(work, struct drm_i915_private, mm.free_work);
4296 	struct llist_node *freed;
4297 
4298 	/*
4299 	 * All file-owned VMA should have been released by this point through
4300 	 * i915_gem_close_object(), or earlier by i915_gem_context_close().
4301 	 * However, the object may also be bound into the global GTT (e.g.
4302 	 * older GPUs without per-process support, or for direct access through
4303 	 * the GTT either for the user or for scanout). Those VMA still need to
4304 	 * the GTT either for the user or for scanout). Those VMA still need to
4305 	 * be unbound now.
4306 
4307 	spin_lock(&i915->mm.free_lock);
4308 	while ((freed = llist_del_all(&i915->mm.free_list))) {
4309 		spin_unlock(&i915->mm.free_lock);
4310 
4311 		__i915_gem_free_objects(i915, freed);
4312 		if (need_resched())
4313 			return;
4314 
4315 		spin_lock(&i915->mm.free_lock);
4316 	}
4317 	spin_unlock(&i915->mm.free_lock);
4318 }
4319 
4320 static void __i915_gem_free_object_rcu(struct rcu_head *head)
4321 {
4322 	struct drm_i915_gem_object *obj =
4323 		container_of(head, typeof(*obj), rcu);
4324 	struct drm_i915_private *i915 = to_i915(obj->base.dev);
4325 
4326 	/*
4327 	 * We reuse obj->rcu for the freed list, so we had better not treat
4328 	 * it like an rcu_head from this point forwards. And we expect all
4329 	 * objects to be freed via this path.
4330 	 */
4331 	destroy_rcu_head(&obj->rcu);
4332 
4333 	/*
4334 	 * Since we require blocking on struct_mutex to unbind the freed
4335 	 * object from the GPU before releasing resources back to the
4336 	 * system, we cannot do that directly from the RCU callback (which may
4337 	 * be a softirq context), but must instead defer that work onto a
4338 	 * kthread. We use the RCU callback rather than move the freed object
4339 	 * directly onto the work queue so that we can mix between using the
4340 	 * worker and performing frees directly from subsequent allocations for
4341 	 * crude but effective memory throttling.
4342 	 */
4343 	if (llist_add(&obj->freed, &i915->mm.free_list))
4344 		queue_work(i915->wq, &i915->mm.free_work);
4345 }
4346 
4347 void i915_gem_free_object(struct drm_gem_object *gem_obj)
4348 {
4349 	struct drm_i915_gem_object *obj = to_intel_bo(gem_obj);
4350 
4351 	if (obj->mm.quirked)
4352 		__i915_gem_object_unpin_pages(obj);
4353 
4354 	if (discard_backing_storage(obj))
4355 		obj->mm.madv = I915_MADV_DONTNEED;
4356 
4357 	/*
4358 	 * Before we free the object, make sure any pure RCU-only
4359 	 * read-side critical sections are complete, e.g.
4360 	 * i915_gem_busy_ioctl(). For the corresponding synchronized
4361 	 * lookup see i915_gem_object_lookup_rcu().
4362 	 */
4363 	atomic_inc(&to_i915(obj->base.dev)->mm.free_count);
4364 	call_rcu(&obj->rcu, __i915_gem_free_object_rcu);
4365 }
4366 
4367 void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
4368 {
4369 	lockdep_assert_held(&obj->base.dev->struct_mutex);
4370 
4371 	if (!i915_gem_object_has_active_reference(obj) &&
4372 	    i915_gem_object_is_active(obj))
4373 		i915_gem_object_set_active_reference(obj);
4374 	else
4375 		i915_gem_object_put(obj);
4376 }
4377 
4378 void i915_gem_sanitize(struct drm_i915_private *i915)
4379 {
4380 	intel_wakeref_t wakeref;
4381 
4382 	GEM_TRACE("\n");
4383 
4384 	wakeref = intel_runtime_pm_get(i915);
4385 	intel_uncore_forcewake_get(&i915->uncore, FORCEWAKE_ALL);
4386 
4387 	/*
4388 	 * As we have just resumed the machine and woken the device up from
4389 	 * deep PCI sleep (presumably D3_cold), assume the HW has been reset
4390 	 * back to defaults, recovering from whatever wedged state we left it
4391 	 * in and so worth trying to use the device once more.
4392 	 */
4393 	if (i915_terminally_wedged(i915))
4394 		i915_gem_unset_wedged(i915);
4395 
4396 	/*
4397 	 * If we inherit context state from the BIOS or earlier occupants
4398 	 * of the GPU, the GPU may be in an inconsistent state when we
4399 	 * try to take over. The only way to remove the earlier state
4400 	 * is by resetting. However, resetting on earlier gen is tricky as
4401 	 * it may impact the display and we are uncertain about the stability
4402 	 * of the reset, so this could be applied to even earlier gen.
4403 	 */
4404 	intel_engines_sanitize(i915, false);
4405 
4406 	intel_uncore_forcewake_put(&i915->uncore, FORCEWAKE_ALL);
4407 	intel_runtime_pm_put(i915, wakeref);
4408 
4409 	mutex_lock(&i915->drm.struct_mutex);
4410 	i915_gem_contexts_lost(i915);
4411 	mutex_unlock(&i915->drm.struct_mutex);
4412 }
4413 
4414 void i915_gem_suspend(struct drm_i915_private *i915)
4415 {
4416 	intel_wakeref_t wakeref;
4417 
4418 	GEM_TRACE("\n");
4419 
4420 	wakeref = intel_runtime_pm_get(i915);
4421 
4422 	flush_workqueue(i915->wq);
4423 
4424 	mutex_lock(&i915->drm.struct_mutex);
4425 
4426 	/*
4427 	 * We have to flush all the executing contexts to main memory so
4428 	 * that they can be saved in the hibernation image. To ensure the last
4429 	 * context image is coherent, we have to switch away from it. That
4430 	 * leaves the i915->kernel_context still active when
4431 	 * we actually suspend, and its image in memory may not match the GPU
4432 	 * state. Fortunately, the kernel_context is disposable and we do
4433 	 * not rely on its state.
4434 	 */
4435 	switch_to_kernel_context_sync(i915, i915->gt.active_engines);
4436 
4437 	mutex_unlock(&i915->drm.struct_mutex);
4438 	i915_reset_flush(i915);
4439 
4440 	drain_delayed_work(&i915->gt.retire_work);
4441 
4442 	/*
4443 	 * As the idle_work is rearming if it detects a race, play safe and
4444 	 * repeat the flush until it is definitely idle.
4445 	 */
4446 	drain_delayed_work(&i915->gt.idle_work);
4447 
4448 	/*
4449 	 * Assert that we successfully flushed all the work and
4450 	 * reset the GPU back to its idle, low power state.
4451 	 */
4452 	GEM_BUG_ON(i915->gt.awake);
4453 
4454 	intel_uc_suspend(i915);
4455 
4456 	intel_runtime_pm_put(i915, wakeref);
4457 }
4458 
4459 void i915_gem_suspend_late(struct drm_i915_private *i915)
4460 {
4461 	struct drm_i915_gem_object *obj;
4462 	struct list_head *phases[] = {
4463 		&i915->mm.unbound_list,
4464 		&i915->mm.bound_list,
4465 		NULL
4466 	}, **phase;
4467 
4468 	/*
4469 	 * Neither the BIOS, ourselves, nor any other kernel
4470 	 * expects the system to be in execlists mode on startup,
4471 	 * so we need to reset the GPU back to legacy mode. And the only
4472 	 * known way to disable logical contexts is through a GPU reset.
4473 	 *
4474 	 * So in order to leave the system in a known default configuration,
4475 	 * always reset the GPU upon unload and suspend. Afterwards we then
4476 	 * clean up the GEM state tracking, flushing off the requests and
4477 	 * leaving the system in a known idle state.
4478 	 *
4479 	 * Note that it is of the utmost importance that the GPU is idle and
4480 	 * all stray writes are flushed *before* we dismantle the backing
4481 	 * storage for the pinned objects.
4482 	 *
4483 	 * However, since we are uncertain that resetting the GPU on older
4484 	 * machines is a good idea, we don't - just in case it leaves the
4485 	 * machine in an unusable condition.
4486 	 */
4487 
4488 	mutex_lock(&i915->drm.struct_mutex);
4489 	for (phase = phases; *phase; phase++) {
4490 		list_for_each_entry(obj, *phase, mm.link)
4491 			WARN_ON(i915_gem_object_set_to_gtt_domain(obj, false));
4492 	}
4493 	mutex_unlock(&i915->drm.struct_mutex);
4494 
4495 	intel_uc_sanitize(i915);
4496 	i915_gem_sanitize(i915);
4497 }
4498 
4499 void i915_gem_resume(struct drm_i915_private *i915)
4500 {
4501 	GEM_TRACE("\n");
4502 
4503 	WARN_ON(i915->gt.awake);
4504 
4505 	mutex_lock(&i915->drm.struct_mutex);
4506 	intel_uncore_forcewake_get(&i915->uncore, FORCEWAKE_ALL);
4507 
4508 	i915_gem_restore_gtt_mappings(i915);
4509 	i915_gem_restore_fences(i915);
4510 
4511 	/*
4512 	 * As we didn't flush the kernel context before suspend, we cannot
4513 	 * guarantee that the context image is complete. So let's just reset
4514 	 * it and start again.
4515 	 */
4516 	intel_gt_resume(i915);
4517 
4518 	if (i915_gem_init_hw(i915))
4519 		goto err_wedged;
4520 
4521 	intel_uc_resume(i915);
4522 
4523 	/* Always reload a context for powersaving. */
4524 	if (!load_power_context(i915))
4525 		goto err_wedged;
4526 
4527 out_unlock:
4528 	intel_uncore_forcewake_put(&i915->uncore, FORCEWAKE_ALL);
4529 	mutex_unlock(&i915->drm.struct_mutex);
4530 	return;
4531 
4532 err_wedged:
4533 	if (!i915_reset_failed(i915)) {
4534 		dev_err(i915->drm.dev,
4535 			"Failed to re-initialize GPU, declaring it wedged!\n");
4536 		i915_gem_set_wedged(i915);
4537 	}
4538 	goto out_unlock;
4539 }
4540 
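/*
 * Program the hardware to swizzle tiled surface accesses to match the bit 6
 * swizzle pattern detected for system memory; nothing to do before gen5 or
 * when no swizzling is in use.
 */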
4541 void i915_gem_init_swizzling(struct drm_i915_private *dev_priv)
4542 {
4543 	if (INTEL_GEN(dev_priv) < 5 ||
4544 	    dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
4545 		return;
4546 
4547 	I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) |
4548 				 DISP_TILE_SURFACE_SWIZZLING);
4549 
4550 	if (IS_GEN(dev_priv, 5))
4551 		return;
4552 
4553 	I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL);
4554 	if (IS_GEN(dev_priv, 6))
4555 		I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
4556 	else if (IS_GEN(dev_priv, 7))
4557 		I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
4558 	else if (IS_GEN(dev_priv, 8))
4559 		I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
4560 	else
4561 		BUG();
4562 }
4563 
4564 static void init_unused_ring(struct drm_i915_private *dev_priv, u32 base)
4565 {
4566 	I915_WRITE(RING_CTL(base), 0);
4567 	I915_WRITE(RING_HEAD(base), 0);
4568 	I915_WRITE(RING_TAIL(base), 0);
4569 	I915_WRITE(RING_START(base), 0);
4570 }
4571 
4572 static void init_unused_rings(struct drm_i915_private *dev_priv)
4573 {
4574 	if (IS_I830(dev_priv)) {
4575 		init_unused_ring(dev_priv, PRB1_BASE);
4576 		init_unused_ring(dev_priv, SRB0_BASE);
4577 		init_unused_ring(dev_priv, SRB1_BASE);
4578 		init_unused_ring(dev_priv, SRB2_BASE);
4579 		init_unused_ring(dev_priv, SRB3_BASE);
4580 	} else if (IS_GEN(dev_priv, 2)) {
4581 		init_unused_ring(dev_priv, SRB0_BASE);
4582 		init_unused_ring(dev_priv, SRB1_BASE);
4583 	} else if (IS_GEN(dev_priv, 3)) {
4584 		init_unused_ring(dev_priv, PRB1_BASE);
4585 		init_unused_ring(dev_priv, PRB2_BASE);
4586 	}
4587 }
4588 
4589 static int __i915_gem_restart_engines(void *data)
4590 {
4591 	struct drm_i915_private *i915 = data;
4592 	struct intel_engine_cs *engine;
4593 	enum intel_engine_id id;
4594 	int err;
4595 
4596 	for_each_engine(engine, i915, id) {
4597 		err = engine->init_hw(engine);
4598 		if (err) {
4599 			DRM_ERROR("Failed to restart %s (%d)\n",
4600 				  engine->name, err);
4601 			return err;
4602 		}
4603 	}
4604 
4605 	intel_engines_set_scheduler_caps(i915);
4606 
4607 	return 0;
4608 }
4609 
4610 int i915_gem_init_hw(struct drm_i915_private *dev_priv)
4611 {
4612 	int ret;
4613 
4614 	dev_priv->gt.last_init_time = ktime_get();
4615 
4616 	/* Double layer security blanket, see i915_gem_init() */
4617 	intel_uncore_forcewake_get(&dev_priv->uncore, FORCEWAKE_ALL);
4618 
4619 	if (HAS_EDRAM(dev_priv) && INTEL_GEN(dev_priv) < 9)
4620 		I915_WRITE(HSW_IDICR, I915_READ(HSW_IDICR) | IDIHASHMSK(0xf));
4621 
4622 	if (IS_HASWELL(dev_priv))
4623 		I915_WRITE(MI_PREDICATE_RESULT_2, IS_HSW_GT3(dev_priv) ?
4624 			   LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
4625 
4626 	/* Apply the GT workarounds... */
4627 	intel_gt_apply_workarounds(dev_priv);
4628 	/* ...and determine whether they are sticking. */
4629 	intel_gt_verify_workarounds(dev_priv, "init");
4630 
4631 	i915_gem_init_swizzling(dev_priv);
4632 
4633 	/*
4634 	 * At least 830 can leave some of the unused rings
4635 	 * "active" (i.e. head != tail) after resume, which
4636 	 * will prevent c3 entry. Make sure all unused rings
4637 	 * are totally idle.
4638 	 */
4639 	init_unused_rings(dev_priv);
4640 
4641 	BUG_ON(!dev_priv->kernel_context);
4642 	ret = i915_terminally_wedged(dev_priv);
4643 	if (ret)
4644 		goto out;
4645 
4646 	ret = i915_ppgtt_init_hw(dev_priv);
4647 	if (ret) {
4648 		DRM_ERROR("Enabling PPGTT failed (%d)\n", ret);
4649 		goto out;
4650 	}
4651 
4652 	ret = intel_wopcm_init_hw(&dev_priv->wopcm);
4653 	if (ret) {
4654 		DRM_ERROR("Enabling WOPCM failed (%d)\n", ret);
4655 		goto out;
4656 	}
4657 
4658 	/* We can't enable contexts until all firmware is loaded */
4659 	ret = intel_uc_init_hw(dev_priv);
4660 	if (ret) {
4661 		DRM_ERROR("Enabling uc failed (%d)\n", ret);
4662 		goto out;
4663 	}
4664 
4665 	intel_mocs_init_l3cc_table(dev_priv);
4666 
4667 	/* Only when the HW is re-initialised, can we replay the requests */
4668 	ret = __i915_gem_restart_engines(dev_priv);
4669 	if (ret)
4670 		goto cleanup_uc;
4671 
4672 	intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
4673 
4674 	return 0;
4675 
4676 cleanup_uc:
4677 	intel_uc_fini_hw(dev_priv);
4678 out:
4679 	intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
4680 
4681 	return ret;
4682 }
4683 
4684 static int __intel_engines_record_defaults(struct drm_i915_private *i915)
4685 {
4686 	struct i915_gem_context *ctx;
4687 	struct intel_engine_cs *engine;
4688 	enum intel_engine_id id;
4689 	int err = 0;
4690 
4691 	/*
4692 	 * As we reset the gpu during very early sanitisation, the current
4693 	 * register state on the GPU should reflect its default values.
4694 	 * We load a context onto the hw (with restore-inhibit), then switch
4695 	 * over to a second context to save that default register state. We
4696 	 * can then prime every new context with that state so they all start
4697 	 * from the same default HW values.
4698 	 */
4699 
4700 	ctx = i915_gem_context_create_kernel(i915, 0);
4701 	if (IS_ERR(ctx))
4702 		return PTR_ERR(ctx);
4703 
4704 	for_each_engine(engine, i915, id) {
4705 		struct i915_request *rq;
4706 
4707 		rq = i915_request_alloc(engine, ctx);
4708 		if (IS_ERR(rq)) {
4709 			err = PTR_ERR(rq);
4710 			goto out_ctx;
4711 		}
4712 
4713 		err = 0;
4714 		if (engine->init_context)
4715 			err = engine->init_context(rq);
4716 
4717 		i915_request_add(rq);
4718 		if (err)
4719 			goto err_active;
4720 	}
4721 
4722 	/* Flush the default context image to memory, and enable powersaving. */
4723 	if (!load_power_context(i915)) {
4724 		err = -EIO;
4725 		goto err_active;
4726 	}
4727 
4728 	for_each_engine(engine, i915, id) {
4729 		struct intel_context *ce;
4730 		struct i915_vma *state;
4731 		void *vaddr;
4732 
4733 		ce = intel_context_lookup(ctx, engine);
4734 		if (!ce)
4735 			continue;
4736 
4737 		state = ce->state;
4738 		if (!state)
4739 			continue;
4740 
4741 		GEM_BUG_ON(intel_context_is_pinned(ce));
4742 
4743 		/*
4744 		 * As we will hold a reference to the logical state, it will
4745 		 * not be torn down with the context, and importantly the
4746 		 * object will hold onto its vma (making it possible for a
4747 		 * stray GTT write to corrupt our defaults). Unmap the vma
4748 		 * from the GTT to prevent such accidents and reclaim the
4749 		 * space.
4750 		 */
4751 		err = i915_vma_unbind(state);
4752 		if (err)
4753 			goto err_active;
4754 
4755 		err = i915_gem_object_set_to_cpu_domain(state->obj, false);
4756 		if (err)
4757 			goto err_active;
4758 
4759 		engine->default_state = i915_gem_object_get(state->obj);
4760 		i915_gem_object_set_cache_coherency(engine->default_state,
4761 						    I915_CACHE_LLC);
4762 
4763 		/* Check we can acquire the image of the context state */
4764 		vaddr = i915_gem_object_pin_map(engine->default_state,
4765 						I915_MAP_FORCE_WB);
4766 		if (IS_ERR(vaddr)) {
4767 			err = PTR_ERR(vaddr);
4768 			goto err_active;
4769 		}
4770 
4771 		i915_gem_object_unpin_map(engine->default_state);
4772 	}
4773 
4774 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
4775 		unsigned int found = intel_engines_has_context_isolation(i915);
4776 
4777 		/*
4778 		 * Make sure that classes with multiple engine instances all
4779 		 * share the same basic configuration.
4780 		 */
4781 		for_each_engine(engine, i915, id) {
4782 			unsigned int bit = BIT(engine->uabi_class);
4783 			unsigned int expected = engine->default_state ? bit : 0;
4784 
4785 			if ((found & bit) != expected) {
4786 				DRM_ERROR("mismatching default context state for class %d on engine %s\n",
4787 					  engine->uabi_class, engine->name);
4788 			}
4789 		}
4790 	}
4791 
4792 out_ctx:
4793 	i915_gem_context_set_closed(ctx);
4794 	i915_gem_context_put(ctx);
4795 	return err;
4796 
4797 err_active:
4798 	/*
4799 	 * If we have to abandon now, we expect the engines to be idle
4800 	 * and ready to be torn-down. The quickest way we can accomplish
4801 	 * this is by declaring ourselves wedged.
4802 	 */
4803 	i915_gem_set_wedged(i915);
4804 	goto out_ctx;
4805 }
4806 
4807 static int
4808 i915_gem_init_scratch(struct drm_i915_private *i915, unsigned int size)
4809 {
4810 	struct drm_i915_gem_object *obj;
4811 	struct i915_vma *vma;
4812 	int ret;
4813 
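	/* Prefer stolen memory for the scratch page, falling back to an
	 * internal object if no stolen space is available.
	 */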
4814 	obj = i915_gem_object_create_stolen(i915, size);
4815 	if (!obj)
4816 		obj = i915_gem_object_create_internal(i915, size);
4817 	if (IS_ERR(obj)) {
4818 		DRM_ERROR("Failed to allocate scratch page\n");
4819 		return PTR_ERR(obj);
4820 	}
4821 
4822 	vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
4823 	if (IS_ERR(vma)) {
4824 		ret = PTR_ERR(vma);
4825 		goto err_unref;
4826 	}
4827 
4828 	ret = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
4829 	if (ret)
4830 		goto err_unref;
4831 
4832 	i915->gt.scratch = vma;
4833 	return 0;
4834 
4835 err_unref:
4836 	i915_gem_object_put(obj);
4837 	return ret;
4838 }
4839 
4840 static void i915_gem_fini_scratch(struct drm_i915_private *i915)
4841 {
4842 	i915_vma_unpin_and_release(&i915->gt.scratch, 0);
4843 }
4844 
4845 int i915_gem_init(struct drm_i915_private *dev_priv)
4846 {
4847 	int ret;
4848 
4849 	/* We need to fallback to 4K pages if host doesn't support huge gtt. */
4850 	if (intel_vgpu_active(dev_priv) && !intel_vgpu_has_huge_gtt(dev_priv))
4851 		mkwrite_device_info(dev_priv)->page_sizes =
4852 			I915_GTT_PAGE_SIZE_4K;
4853 
4854 	dev_priv->mm.unordered_timeline = dma_fence_context_alloc(1);
4855 
4856 	if (HAS_LOGICAL_RING_CONTEXTS(dev_priv))
4857 		dev_priv->gt.cleanup_engine = intel_logical_ring_cleanup;
4858 	else
4859 		dev_priv->gt.cleanup_engine = intel_engine_cleanup;
4860 
4861 	i915_timelines_init(dev_priv);
4862 
4863 	ret = i915_gem_init_userptr(dev_priv);
4864 	if (ret)
4865 		return ret;
4866 
4867 	ret = intel_uc_init_misc(dev_priv);
4868 	if (ret)
4869 		return ret;
4870 
4871 	ret = intel_wopcm_init(&dev_priv->wopcm);
4872 	if (ret)
4873 		goto err_uc_misc;
4874 
4875 	/* This is just a security blanket to placate dragons.
4876 	 * On some systems, we very sporadically observe that the first TLBs
4877 	 * used by the CS may be stale, despite us poking the TLB reset. If
4878 	 * we hold the forcewake during initialisation these problems
4879 	 * just magically go away.
4880 	 */
4881 	mutex_lock(&dev_priv->drm.struct_mutex);
4882 	intel_uncore_forcewake_get(&dev_priv->uncore, FORCEWAKE_ALL);
4883 
4884 	ret = i915_gem_init_ggtt(dev_priv);
4885 	if (ret) {
4886 		GEM_BUG_ON(ret == -EIO);
4887 		goto err_unlock;
4888 	}
4889 
4890 	ret = i915_gem_init_scratch(dev_priv,
4891 				    IS_GEN(dev_priv, 2) ? SZ_256K : PAGE_SIZE);
4892 	if (ret) {
4893 		GEM_BUG_ON(ret == -EIO);
4894 		goto err_ggtt;
4895 	}
4896 
4897 	ret = i915_gem_contexts_init(dev_priv);
4898 	if (ret) {
4899 		GEM_BUG_ON(ret == -EIO);
4900 		goto err_scratch;
4901 	}
4902 
4903 	ret = intel_engines_init(dev_priv);
4904 	if (ret) {
4905 		GEM_BUG_ON(ret == -EIO);
4906 		goto err_context;
4907 	}
4908 
4909 	intel_init_gt_powersave(dev_priv);
4910 
4911 	ret = intel_uc_init(dev_priv);
4912 	if (ret)
4913 		goto err_pm;
4914 
4915 	ret = i915_gem_init_hw(dev_priv);
4916 	if (ret)
4917 		goto err_uc_init;
4918 
4919 	/*
4920 	 * Despite its name intel_init_clock_gating applies both display
4921 	 * clock gating workarounds; GT mmio workarounds and the occasional
4922 	 * GT power context workaround. Worse, sometimes it includes a context
4923 	 * register workaround which we need to apply before we record the
4924 	 * default HW state for all contexts.
4925 	 *
4926 	 * FIXME: break up the workarounds and apply them at the right time!
4927 	 */
4928 	intel_init_clock_gating(dev_priv);
4929 
4930 	ret = __intel_engines_record_defaults(dev_priv);
4931 	if (ret)
4932 		goto err_init_hw;
4933 
4934 	if (i915_inject_load_failure()) {
4935 		ret = -ENODEV;
4936 		goto err_init_hw;
4937 	}
4938 
4939 	if (i915_inject_load_failure()) {
4940 		ret = -EIO;
4941 		goto err_init_hw;
4942 	}
4943 
4944 	intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
4945 	mutex_unlock(&dev_priv->drm.struct_mutex);
4946 
4947 	return 0;
4948 
4949 	/*
4950 	 * Unwinding is complicated by the fact that we want to handle -EIO to
4951 	 * mean disabling GPU submission but keeping KMS alive. We want to mark
4952 	 * the HW as irreversibly wedged, but keep enough state around that the
4953 	 * driver doesn't explode during runtime.
4954 	 */
4955 err_init_hw:
4956 	mutex_unlock(&dev_priv->drm.struct_mutex);
4957 
4958 	i915_gem_suspend(dev_priv);
4959 	i915_gem_suspend_late(dev_priv);
4960 
4961 	i915_gem_drain_workqueue(dev_priv);
4962 
4963 	mutex_lock(&dev_priv->drm.struct_mutex);
4964 	intel_uc_fini_hw(dev_priv);
4965 err_uc_init:
4966 	intel_uc_fini(dev_priv);
4967 err_pm:
4968 	if (ret != -EIO) {
4969 		intel_cleanup_gt_powersave(dev_priv);
4970 		i915_gem_cleanup_engines(dev_priv);
4971 	}
4972 err_context:
4973 	if (ret != -EIO)
4974 		i915_gem_contexts_fini(dev_priv);
4975 err_scratch:
4976 	i915_gem_fini_scratch(dev_priv);
4977 err_ggtt:
4978 err_unlock:
4979 	intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
4980 	mutex_unlock(&dev_priv->drm.struct_mutex);
4981 
4982 err_uc_misc:
4983 	intel_uc_fini_misc(dev_priv);
4984 
4985 	if (ret != -EIO) {
4986 		i915_gem_cleanup_userptr(dev_priv);
4987 		i915_timelines_fini(dev_priv);
4988 	}
4989 
4990 	if (ret == -EIO) {
4991 		mutex_lock(&dev_priv->drm.struct_mutex);
4992 
4993 		/*
4994 		 * Allow engine initialisation to fail by marking the GPU as
4995 		 * wedged. But we only want to do this where the GPU is angry;
4996 		 * for all other failures, such as an allocation failure, bail.
4997 		 */
4998 		if (!i915_reset_failed(dev_priv)) {
4999 			i915_load_error(dev_priv,
5000 					"Failed to initialize GPU, declaring it wedged!\n");
5001 			i915_gem_set_wedged(dev_priv);
5002 		}
5003 
5004 		/* Minimal basic recovery for KMS */
5005 		ret = i915_ggtt_enable_hw(dev_priv);
5006 		i915_gem_restore_gtt_mappings(dev_priv);
5007 		i915_gem_restore_fences(dev_priv);
5008 		intel_init_clock_gating(dev_priv);
5009 
5010 		mutex_unlock(&dev_priv->drm.struct_mutex);
5011 	}
5012 
5013 	i915_gem_drain_freed_objects(dev_priv);
5014 	return ret;
5015 }
5016 
5017 void i915_gem_fini(struct drm_i915_private *dev_priv)
5018 {
5019 	i915_gem_suspend_late(dev_priv);
5020 	intel_disable_gt_powersave(dev_priv);
5021 
5022 	/* Flush any outstanding unpin_work. */
5023 	i915_gem_drain_workqueue(dev_priv);
5024 
5025 	mutex_lock(&dev_priv->drm.struct_mutex);
5026 	intel_uc_fini_hw(dev_priv);
5027 	intel_uc_fini(dev_priv);
5028 	i915_gem_cleanup_engines(dev_priv);
5029 	i915_gem_contexts_fini(dev_priv);
5030 	i915_gem_fini_scratch(dev_priv);
5031 	mutex_unlock(&dev_priv->drm.struct_mutex);
5032 
5033 	intel_wa_list_free(&dev_priv->gt_wa_list);
5034 
5035 	intel_cleanup_gt_powersave(dev_priv);
5036 
5037 	intel_uc_fini_misc(dev_priv);
5038 	i915_gem_cleanup_userptr(dev_priv);
5039 	i915_timelines_fini(dev_priv);
5040 
5041 	i915_gem_drain_freed_objects(dev_priv);
5042 
5043 	WARN_ON(!list_empty(&dev_priv->contexts.list));
5044 }
5045 
5046 void i915_gem_init_mmio(struct drm_i915_private *i915)
5047 {
5048 	i915_gem_sanitize(i915);
5049 }
5050 
5051 void
5052 i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
5053 {
5054 	struct intel_engine_cs *engine;
5055 	enum intel_engine_id id;
5056 
5057 	for_each_engine(engine, dev_priv, id)
5058 		dev_priv->gt.cleanup_engine(engine);
5059 }
5060 
5061 void
5062 i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
5063 {
5064 	int i;
5065 
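	/*
	 * The number of fence registers depends on the platform: 32 on gen7+
	 * (except Valleyview/Cherryview), 16 on gen4+ and the later gen3
	 * parts, and 8 on everything older. When running under a vGPU, use
	 * however many the host has set aside for us.
	 */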
5066 	if (INTEL_GEN(dev_priv) >= 7 && !IS_VALLEYVIEW(dev_priv) &&
5067 	    !IS_CHERRYVIEW(dev_priv))
5068 		dev_priv->num_fence_regs = 32;
5069 	else if (INTEL_GEN(dev_priv) >= 4 ||
5070 		 IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
5071 		 IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
5072 		dev_priv->num_fence_regs = 16;
5073 	else
5074 		dev_priv->num_fence_regs = 8;
5075 
5076 	if (intel_vgpu_active(dev_priv))
5077 		dev_priv->num_fence_regs =
5078 				I915_READ(vgtif_reg(avail_rs.fence_num));
5079 
5080 	/* Initialize fence registers to zero */
5081 	for (i = 0; i < dev_priv->num_fence_regs; i++) {
5082 		struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
5083 
5084 		fence->i915 = dev_priv;
5085 		fence->id = i;
5086 		list_add_tail(&fence->link, &dev_priv->mm.fence_list);
5087 	}
5088 	i915_gem_restore_fences(dev_priv);
5089 
5090 	i915_gem_detect_bit_6_swizzle(dev_priv);
5091 }
5092 
5093 static void i915_gem_init__mm(struct drm_i915_private *i915)
5094 {
5095 	spin_lock_init(&i915->mm.object_stat_lock);
5096 	spin_lock_init(&i915->mm.obj_lock);
5097 	spin_lock_init(&i915->mm.free_lock);
5098 
5099 	init_llist_head(&i915->mm.free_list);
5100 
5101 	INIT_LIST_HEAD(&i915->mm.unbound_list);
5102 	INIT_LIST_HEAD(&i915->mm.bound_list);
5103 	INIT_LIST_HEAD(&i915->mm.fence_list);
5104 	INIT_LIST_HEAD(&i915->mm.userfault_list);
5105 
5106 	INIT_WORK(&i915->mm.free_work, __i915_gem_free_work);
5107 }
5108 
5109 int i915_gem_init_early(struct drm_i915_private *dev_priv)
5110 {
5111 	int err;
5112 
5113 	INIT_LIST_HEAD(&dev_priv->gt.active_rings);
5114 	INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
5115 
5116 	i915_gem_init__mm(dev_priv);
5117 
5118 	INIT_DELAYED_WORK(&dev_priv->gt.retire_work,
5119 			  i915_gem_retire_work_handler);
5120 	INIT_DELAYED_WORK(&dev_priv->gt.idle_work,
5121 			  i915_gem_idle_work_handler);
5122 	init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
5123 	init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
5124 	mutex_init(&dev_priv->gpu_error.wedge_mutex);
5125 	init_srcu_struct(&dev_priv->gpu_error.reset_backoff_srcu);
5126 
5127 	atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
5128 
5129 	spin_lock_init(&dev_priv->fb_tracking.lock);
5130 
5131 	err = i915_gemfs_init(dev_priv);
5132 	if (err)
5133 		DRM_NOTE("Unable to create a private tmpfs mount, hugepage support will be disabled(%d).\n", err);
5134 
5135 	return 0;
5136 }
5137 
5138 void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
5139 {
5140 	i915_gem_drain_freed_objects(dev_priv);
5141 	GEM_BUG_ON(!llist_empty(&dev_priv->mm.free_list));
5142 	GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
5143 	WARN_ON(dev_priv->mm.object_count);
5144 
5145 	cleanup_srcu_struct(&dev_priv->gpu_error.reset_backoff_srcu);
5146 
5147 	i915_gemfs_fini(dev_priv);
5148 }
5149 
5150 int i915_gem_freeze(struct drm_i915_private *dev_priv)
5151 {
5152 	/* Discard all purgeable objects, let userspace recover those as
5153 	/* Discard all purgeable objects; let userspace recover those as
5154 	 */
5155 	i915_gem_shrink_all(dev_priv);
5156 
5157 	return 0;
5158 }
5159 
5160 int i915_gem_freeze_late(struct drm_i915_private *i915)
5161 {
5162 	struct drm_i915_gem_object *obj;
5163 	struct list_head *phases[] = {
5164 		&i915->mm.unbound_list,
5165 		&i915->mm.bound_list,
5166 		NULL
5167 	}, **phase;
5168 
5169 	/*
5170 	 * Called just before we write the hibernation image.
5171 	 *
5172 	 * We need to update the domain tracking to reflect that the CPU
5173 	 * will be accessing all the pages to create and restore from the
5174 	 * hibernation, and so upon restoration those pages will be in the
5175 	 * CPU domain.
5176 	 *
5177 	 * To make sure the hibernation image contains the latest state,
5178 	 * we update that state just before writing out the image.
5179 	 *
5180 	 * To try and reduce the hibernation image, we manually shrink
5181 	 * the objects as well, see i915_gem_freeze()
5182 	 */
5183 
5184 	i915_gem_shrink(i915, -1UL, NULL, I915_SHRINK_UNBOUND);
5185 	i915_gem_drain_freed_objects(i915);
5186 
5187 	mutex_lock(&i915->drm.struct_mutex);
5188 	for (phase = phases; *phase; phase++) {
5189 		list_for_each_entry(obj, *phase, mm.link)
5190 			WARN_ON(i915_gem_object_set_to_cpu_domain(obj, true));
5191 	}
5192 	mutex_unlock(&i915->drm.struct_mutex);
5193 
5194 	return 0;
5195 }
5196 
5197 void i915_gem_release(struct drm_device *dev, struct drm_file *file)
5198 {
5199 	struct drm_i915_file_private *file_priv = file->driver_priv;
5200 	struct i915_request *request;
5201 
5202 	/* Clean up our request list when the client is going away, so that
5203 	 * later retire_requests won't dereference our soon-to-be-gone
5204 	 * file_priv.
5205 	 */
5206 	spin_lock(&file_priv->mm.lock);
5207 	list_for_each_entry(request, &file_priv->mm.request_list, client_link)
5208 		request->file_priv = NULL;
5209 	spin_unlock(&file_priv->mm.lock);
5210 }
5211 
5212 int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file)
5213 {
5214 	struct drm_i915_file_private *file_priv;
5215 	int ret;
5216 
5217 	DRM_DEBUG("\n");
5218 
5219 	file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
5220 	if (!file_priv)
5221 		return -ENOMEM;
5222 
5223 	file->driver_priv = file_priv;
5224 	file_priv->dev_priv = i915;
5225 	file_priv->file = file;
5226 
5227 	spin_lock_init(&file_priv->mm.lock);
5228 	INIT_LIST_HEAD(&file_priv->mm.request_list);
5229 
5230 	file_priv->bsd_engine = -1;
5231 	file_priv->hang_timestamp = jiffies;
5232 
5233 	ret = i915_gem_context_open(i915, file);
5234 	if (ret)
5235 		kfree(file_priv);
5236 
5237 	return ret;
5238 }
5239 
5240 /**
5241  * i915_gem_track_fb - update frontbuffer tracking
5242  * @old: current GEM buffer for the frontbuffer slots
5243  * @new: new GEM buffer for the frontbuffer slots
5244  * @frontbuffer_bits: bitmask of frontbuffer slots
5245  *
5246  * This updates the frontbuffer tracking bits @frontbuffer_bits by clearing them
5247  * from @old and setting them in @new. Both @old and @new can be NULL.
5248  */
5249 void i915_gem_track_fb(struct drm_i915_gem_object *old,
5250 		       struct drm_i915_gem_object *new,
5251 		       unsigned frontbuffer_bits)
5252 {
5253 	/* Control of individual bits within the mask are guarded by
5254 	/* Control of individual bits within the mask is guarded by
5255 	 * manipulation of individual bits. But since the bitfield as a whole
5256 	 * is updated using RMW, we need to use atomics in order to update
5257 	 * the bits.
5258 	 */
5259 	BUILD_BUG_ON(INTEL_FRONTBUFFER_BITS_PER_PIPE * I915_MAX_PIPES >
5260 		     BITS_PER_TYPE(atomic_t));
5261 
5262 	if (old) {
5263 		WARN_ON(!(atomic_read(&old->frontbuffer_bits) & frontbuffer_bits));
5264 		atomic_andnot(frontbuffer_bits, &old->frontbuffer_bits);
5265 	}
5266 
5267 	if (new) {
5268 		WARN_ON(atomic_read(&new->frontbuffer_bits) & frontbuffer_bits);
5269 		atomic_or(frontbuffer_bits, &new->frontbuffer_bits);
5270 	}
5271 }
5272 
5273 /* Allocate a new GEM object and fill it with the supplied data */
5274 struct drm_i915_gem_object *
5275 i915_gem_object_create_from_data(struct drm_i915_private *dev_priv,
5276 			         const void *data, size_t size)
5277 {
5278 	struct drm_i915_gem_object *obj;
5279 	struct file *file;
5280 	size_t offset;
5281 	int err;
5282 
5283 	obj = i915_gem_object_create(dev_priv, round_up(size, PAGE_SIZE));
5284 	if (IS_ERR(obj))
5285 		return obj;
5286 
5287 	GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU);
5288 
5289 	file = obj->base.filp;
5290 	offset = 0;
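	/*
	 * Copy the supplied data into the object a page at a time through the
	 * shmem pagecache, so that the new object starts life fully populated
	 * and in the CPU write domain.
	 */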
5291 	do {
5292 		unsigned int len = min_t(typeof(size), size, PAGE_SIZE);
5293 		struct page *page;
5294 		void *pgdata, *vaddr;
5295 
5296 		err = pagecache_write_begin(file, file->f_mapping,
5297 					    offset, len, 0,
5298 					    &page, &pgdata);
5299 		if (err < 0)
5300 			goto fail;
5301 
5302 		vaddr = kmap(page);
5303 		memcpy(vaddr, data, len);
5304 		kunmap(page);
5305 
5306 		err = pagecache_write_end(file, file->f_mapping,
5307 					  offset, len, len,
5308 					  page, pgdata);
5309 		if (err < 0)
5310 			goto fail;
5311 
5312 		size -= len;
5313 		data += len;
5314 		offset += len;
5315 	} while (size);
5316 
5317 	return obj;
5318 
5319 fail:
5320 	i915_gem_object_put(obj);
5321 	return ERR_PTR(err);
5322 }
5323 
5324 struct scatterlist *
5325 i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
5326 		       unsigned int n,
5327 		       unsigned int *offset)
5328 {
5329 	struct i915_gem_object_page_iter *iter = &obj->mm.get_page;
5330 	struct scatterlist *sg;
5331 	unsigned int idx, count;
5332 
5333 	might_sleep();
5334 	GEM_BUG_ON(n >= obj->base.size >> PAGE_SHIFT);
5335 	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
5336 
5337 	/* As we iterate forward through the sg, we record each entry in a
5338 	 * radixtree for quick repeated (backwards) lookups. If we have seen
5339 	 * this index previously, we will have an entry for it.
5340 	 *
5341 	 * Initial lookup is O(N), but this is amortized to O(1) for
5342 	 * sequential page access (where each new request is consecutive
5343 	 * to the previous one). Repeated lookups are O(lg(obj->base.size)),
5344 	 * i.e. O(1) with a large constant!
5345 	 */
5346 	if (n < READ_ONCE(iter->sg_idx))
5347 		goto lookup;
5348 
5349 	mutex_lock(&iter->lock);
5350 
5351 	/* We prefer to reuse the last sg so that repeated lookups of this
5352 	 * (or the subsequent) sg are fast - comparing against the last
5353 	 * sg is faster than going through the radixtree.
5354 	 */
5355 
5356 	sg = iter->sg_pos;
5357 	idx = iter->sg_idx;
5358 	count = __sg_page_count(sg);
5359 
5360 	while (idx + count <= n) {
5361 		void *entry;
5362 		unsigned long i;
5363 		int ret;
5364 
5365 		/* If we cannot allocate and insert this entry, or the
5366 		 * individual pages from this range, cancel updating the
5367 		 * sg_idx so that on this lookup we are forced to linearly
5368 		 * scan onwards, but on future lookups we will try the
5369 		 * insertion again (in which case we need to be careful of
5370 		 * the error return reporting that we have already inserted
5371 		 * this index).
5372 		 */
5373 		ret = radix_tree_insert(&iter->radix, idx, sg);
5374 		if (ret && ret != -EEXIST)
5375 			goto scan;
5376 
5377 		entry = xa_mk_value(idx);
5378 		for (i = 1; i < count; i++) {
5379 			ret = radix_tree_insert(&iter->radix, idx + i, entry);
5380 			if (ret && ret != -EEXIST)
5381 				goto scan;
5382 		}
5383 
5384 		idx += count;
5385 		sg = ____sg_next(sg);
5386 		count = __sg_page_count(sg);
5387 	}
5388 
5389 scan:
5390 	iter->sg_pos = sg;
5391 	iter->sg_idx = idx;
5392 
5393 	mutex_unlock(&iter->lock);
5394 
5395 	if (unlikely(n < idx)) /* insertion completed by another thread */
5396 		goto lookup;
5397 
5398 	/* In case we failed to insert the entry into the radixtree, we need
5399 	 * to look beyond the current sg.
5400 	 */
5401 	while (idx + count <= n) {
5402 		idx += count;
5403 		sg = ____sg_next(sg);
5404 		count = __sg_page_count(sg);
5405 	}
5406 
5407 	*offset = n - idx;
5408 	return sg;
5409 
5410 lookup:
5411 	rcu_read_lock();
5412 
5413 	sg = radix_tree_lookup(&iter->radix, n);
5414 	GEM_BUG_ON(!sg);
5415 
5416 	/* If this index is in the middle of multi-page sg entry,
5417 	/* If this index is in the middle of a multi-page sg entry,
5418 	 * to the start of that range. We will return the pointer to
5419 	 * the base page and the offset of this page within the
5420 	 * sg entry's range.
5421 	 */
5422 	*offset = 0;
5423 	if (unlikely(xa_is_value(sg))) {
5424 		unsigned long base = xa_to_value(sg);
5425 
5426 		sg = radix_tree_lookup(&iter->radix, base);
5427 		GEM_BUG_ON(!sg);
5428 
5429 		*offset = n - base;
5430 	}
5431 
5432 	rcu_read_unlock();
5433 
5434 	return sg;
5435 }
5436 
5437 struct page *
5438 i915_gem_object_get_page(struct drm_i915_gem_object *obj, unsigned int n)
5439 {
5440 	struct scatterlist *sg;
5441 	unsigned int offset;
5442 
5443 	GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));
5444 
5445 	sg = i915_gem_object_get_sg(obj, n, &offset);
5446 	return nth_page(sg_page(sg), offset);
5447 }
5448 
5449 /* Like i915_gem_object_get_page(), but mark the returned page dirty */
5450 struct page *
5451 i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
5452 			       unsigned int n)
5453 {
5454 	struct page *page;
5455 
5456 	page = i915_gem_object_get_page(obj, n);
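	/*
	 * If the object is already tracked as dirty, every page will be
	 * flagged dirty when the pages are released; only mark the individual
	 * page while the object as a whole is still clean.
	 */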
5457 	if (!obj->mm.dirty)
5458 		set_page_dirty(page);
5459 
5460 	return page;
5461 }
5462 
5463 dma_addr_t
5464 i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
5465 				unsigned long n)
5466 {
5467 	struct scatterlist *sg;
5468 	unsigned int offset;
5469 
5470 	sg = i915_gem_object_get_sg(obj, n, &offset);
5471 	return sg_dma_address(sg) + (offset << PAGE_SHIFT);
5472 }
5473 
5474 int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align)
5475 {
5476 	struct sg_table *pages;
5477 	int err;
5478 
5479 	if (align > obj->base.size)
5480 		return -EINVAL;
5481 
5482 	if (obj->ops == &i915_gem_phys_ops)
5483 		return 0;
5484 
5485 	if (obj->ops != &i915_gem_object_ops)
5486 		return -EINVAL;
5487 
5488 	err = i915_gem_object_unbind(obj);
5489 	if (err)
5490 		return err;
5491 
5492 	mutex_lock(&obj->mm.lock);
5493 
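	/*
	 * Switching the backing storage over to a phys object requires
	 * exclusive ownership of the current pages: bail if they have been
	 * marked purgeable, are pinned by the swizzle quirk, or are currently
	 * vmapped.
	 */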
5494 	if (obj->mm.madv != I915_MADV_WILLNEED) {
5495 		err = -EFAULT;
5496 		goto err_unlock;
5497 	}
5498 
5499 	if (obj->mm.quirked) {
5500 		err = -EFAULT;
5501 		goto err_unlock;
5502 	}
5503 
5504 	if (obj->mm.mapping) {
5505 		err = -EBUSY;
5506 		goto err_unlock;
5507 	}
5508 
5509 	pages = __i915_gem_object_unset_pages(obj);
5510 
5511 	obj->ops = &i915_gem_phys_ops;
5512 
5513 	err = ____i915_gem_object_get_pages(obj);
5514 	if (err)
5515 		goto err_xfer;
5516 
5517 	/* Perma-pin (until release) the physical set of pages */
5518 	__i915_gem_object_pin_pages(obj);
5519 
5520 	if (!IS_ERR_OR_NULL(pages))
5521 		i915_gem_object_ops.put_pages(obj, pages);
5522 	mutex_unlock(&obj->mm.lock);
5523 	return 0;
5524 
5525 err_xfer:
5526 	obj->ops = &i915_gem_object_ops;
5527 	if (!IS_ERR_OR_NULL(pages)) {
5528 		unsigned int sg_page_sizes = i915_sg_page_sizes(pages->sgl);
5529 
5530 		__i915_gem_object_set_pages(obj, pages, sg_page_sizes);
5531 	}
5532 err_unlock:
5533 	mutex_unlock(&obj->mm.lock);
5534 	return err;
5535 }
5536 
5537 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5538 #include "selftests/scatterlist.c"
5539 #include "selftests/mock_gem_device.c"
5540 #include "selftests/huge_gem_object.c"
5541 #include "selftests/huge_pages.c"
5542 #include "selftests/i915_gem_object.c"
5543 #include "selftests/i915_gem_coherency.c"
5544 #include "selftests/i915_gem.c"
5545 #endif
5546