1 /*
2  * Copyright 2014 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include <linux/amd-iommu.h>
24 #include <linux/bsearch.h>
25 #include <linux/pci.h>
26 #include <linux/slab.h>
27 #include "kfd_priv.h"
28 #include "kfd_device_queue_manager.h"
29 #include "kfd_pm4_headers.h"
30 
/*
 * Value stored in the .mqd_size_aligned field of the device-info entries in
 * this file. kgd2kfd_device_init() multiplies it by
 * max_num_of_queues_per_device when it sizes the GTT buffer, so it is a
 * byte count.
 */
#define MQD_SIZE_ALIGNED 768
32 
33 static const struct kfd_device_info kaveri_device_info = {
34 	.asic_family = CHIP_KAVERI,
35 	.max_pasid_bits = 16,
36 	.ih_ring_entry_size = 4 * sizeof(uint32_t),
37 	.mqd_size_aligned = MQD_SIZE_ALIGNED
38 };
39 
40 static const struct kfd_device_info carrizo_device_info = {
41 	.asic_family = CHIP_CARRIZO,
42 	.max_pasid_bits = 16,
43 	.ih_ring_entry_size = 4 * sizeof(uint32_t),
44 	.num_of_watch_points = 4,
45 	.mqd_size_aligned = MQD_SIZE_ALIGNED
46 };
47 
/*
 * One row of the supported-device table: maps a PCI device id to the
 * static description of that device.
 *
 * The field order must not change: the table is filled in with positional
 * initializers.
 */
struct kfd_deviceid {
	/* PCI device id, compared against pci_dev->device in kgd2kfd_probe() */
	unsigned short did;
	/* static description used for that id; must not be NULL */
	const struct kfd_device_info *device_info;
};
52 
53 /* Please keep this sorted by increasing device id. */
54 static const struct kfd_deviceid supported_devices[] = {
55 	{ 0x1304, &kaveri_device_info },	/* Kaveri */
56 	{ 0x1305, &kaveri_device_info },	/* Kaveri */
57 	{ 0x1306, &kaveri_device_info },	/* Kaveri */
58 	{ 0x1307, &kaveri_device_info },	/* Kaveri */
59 	{ 0x1309, &kaveri_device_info },	/* Kaveri */
60 	{ 0x130A, &kaveri_device_info },	/* Kaveri */
61 	{ 0x130B, &kaveri_device_info },	/* Kaveri */
62 	{ 0x130C, &kaveri_device_info },	/* Kaveri */
63 	{ 0x130D, &kaveri_device_info },	/* Kaveri */
64 	{ 0x130E, &kaveri_device_info },	/* Kaveri */
65 	{ 0x130F, &kaveri_device_info },	/* Kaveri */
66 	{ 0x1310, &kaveri_device_info },	/* Kaveri */
67 	{ 0x1311, &kaveri_device_info },	/* Kaveri */
68 	{ 0x1312, &kaveri_device_info },	/* Kaveri */
69 	{ 0x1313, &kaveri_device_info },	/* Kaveri */
70 	{ 0x1315, &kaveri_device_info },	/* Kaveri */
71 	{ 0x1316, &kaveri_device_info },	/* Kaveri */
72 	{ 0x1317, &kaveri_device_info },	/* Kaveri */
73 	{ 0x1318, &kaveri_device_info },	/* Kaveri */
74 	{ 0x131B, &kaveri_device_info },	/* Kaveri */
75 	{ 0x131C, &kaveri_device_info },	/* Kaveri */
76 	{ 0x131D, &kaveri_device_info }		/* Kaveri */
77 };
78 
/*
 * Forward declarations: the GTT sub-allocator setup and teardown routines
 * are defined further down but are called from kgd2kfd_device_init() and
 * kgd2kfd_device_exit().
 */
static void kfd_gtt_sa_fini(struct kfd_dev *kfd);
static int kfd_gtt_sa_init(struct kfd_dev *kfd,
			   unsigned int buf_size, unsigned int chunk_size);
82 
83 static const struct kfd_device_info *lookup_device_info(unsigned short did)
84 {
85 	size_t i;
86 
87 	for (i = 0; i < ARRAY_SIZE(supported_devices); i++) {
88 		if (supported_devices[i].did == did) {
89 			BUG_ON(supported_devices[i].device_info == NULL);
90 			return supported_devices[i].device_info;
91 		}
92 	}
93 
94 	return NULL;
95 }
96 
97 struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
98 	struct pci_dev *pdev, const struct kfd2kgd_calls *f2g)
99 {
100 	struct kfd_dev *kfd;
101 
102 	const struct kfd_device_info *device_info =
103 					lookup_device_info(pdev->device);
104 
105 	if (!device_info)
106 		return NULL;
107 
108 	kfd = kzalloc(sizeof(*kfd), GFP_KERNEL);
109 	if (!kfd)
110 		return NULL;
111 
112 	kfd->kgd = kgd;
113 	kfd->device_info = device_info;
114 	kfd->pdev = pdev;
115 	kfd->init_complete = false;
116 	kfd->kfd2kgd = f2g;
117 
118 	mutex_init(&kfd->doorbell_mutex);
119 	memset(&kfd->doorbell_available_index, 0,
120 		sizeof(kfd->doorbell_available_index));
121 
122 	return kfd;
123 }
124 
125 static bool device_iommu_pasid_init(struct kfd_dev *kfd)
126 {
127 	const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP |
128 					AMD_IOMMU_DEVICE_FLAG_PRI_SUP |
129 					AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
130 
131 	struct amd_iommu_device_info iommu_info;
132 	unsigned int pasid_limit;
133 	int err;
134 
135 	err = amd_iommu_device_info(kfd->pdev, &iommu_info);
136 	if (err < 0) {
137 		dev_err(kfd_device,
138 			"error getting iommu info. is the iommu enabled?\n");
139 		return false;
140 	}
141 
142 	if ((iommu_info.flags & required_iommu_flags) != required_iommu_flags) {
143 		dev_err(kfd_device, "error required iommu flags ats(%i), pri(%i), pasid(%i)\n",
144 		       (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP) != 0,
145 		       (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) != 0,
146 		       (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) != 0);
147 		return false;
148 	}
149 
150 	pasid_limit = min_t(unsigned int,
151 			(unsigned int)1 << kfd->device_info->max_pasid_bits,
152 			iommu_info.max_pasids);
153 	/*
154 	 * last pasid is used for kernel queues doorbells
155 	 * in the future the last pasid might be used for a kernel thread.
156 	 */
157 	pasid_limit = min_t(unsigned int,
158 				pasid_limit,
159 				kfd->doorbell_process_limit - 1);
160 
161 	err = amd_iommu_init_device(kfd->pdev, pasid_limit);
162 	if (err < 0) {
163 		dev_err(kfd_device, "error initializing iommu device\n");
164 		return false;
165 	}
166 
167 	if (!kfd_set_pasid_limit(pasid_limit)) {
168 		dev_err(kfd_device, "error setting pasid limit\n");
169 		amd_iommu_free_device(kfd->pdev);
170 		return false;
171 	}
172 
173 	return true;
174 }
175 
176 static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid)
177 {
178 	struct kfd_dev *dev = kfd_device_by_pci_dev(pdev);
179 
180 	if (dev)
181 		kfd_unbind_process_from_device(dev, pasid);
182 }
183 
184 bool kgd2kfd_device_init(struct kfd_dev *kfd,
185 			 const struct kgd2kfd_shared_resources *gpu_resources)
186 {
187 	unsigned int size;
188 
189 	kfd->shared_resources = *gpu_resources;
190 
191 	/* calculate max size of mqds needed for queues */
192 	size = max_num_of_queues_per_device *
193 			kfd->device_info->mqd_size_aligned;
194 
195 	/*
196 	 * calculate max size of runlist packet.
197 	 * There can be only 2 packets at once
198 	 */
199 	size += (KFD_MAX_NUM_OF_PROCESSES * sizeof(struct pm4_map_process) +
200 		max_num_of_queues_per_device *
201 		sizeof(struct pm4_map_queues) + sizeof(struct pm4_runlist)) * 2;
202 
203 	/* Add size of HIQ & DIQ */
204 	size += KFD_KERNEL_QUEUE_SIZE * 2;
205 
206 	/* add another 512KB for all other allocations on gart (HPD, fences) */
207 	size += 512 * 1024;
208 
209 	if (kfd->kfd2kgd->init_gtt_mem_allocation(
210 			kfd->kgd, size, &kfd->gtt_mem,
211 			&kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr)){
212 		dev_err(kfd_device,
213 			"Could not allocate %d bytes for device (%x:%x)\n",
214 			size, kfd->pdev->vendor, kfd->pdev->device);
215 		goto out;
216 	}
217 
218 	dev_info(kfd_device,
219 		"Allocated %d bytes on gart for device(%x:%x)\n",
220 		size, kfd->pdev->vendor, kfd->pdev->device);
221 
222 	/* Initialize GTT sa with 512 byte chunk size */
223 	if (kfd_gtt_sa_init(kfd, size, 512) != 0) {
224 		dev_err(kfd_device,
225 			"Error initializing gtt sub-allocator\n");
226 		goto kfd_gtt_sa_init_error;
227 	}
228 
229 	kfd_doorbell_init(kfd);
230 
231 	if (kfd_topology_add_device(kfd) != 0) {
232 		dev_err(kfd_device,
233 			"Error adding device (%x:%x) to topology\n",
234 			kfd->pdev->vendor, kfd->pdev->device);
235 		goto kfd_topology_add_device_error;
236 	}
237 
238 	if (!device_iommu_pasid_init(kfd)) {
239 		dev_err(kfd_device,
240 			"Error initializing iommuv2 for device (%x:%x)\n",
241 			kfd->pdev->vendor, kfd->pdev->device);
242 		goto device_iommu_pasid_error;
243 	}
244 	amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
245 						iommu_pasid_shutdown_callback);
246 
247 	kfd->dqm = device_queue_manager_init(kfd);
248 	if (!kfd->dqm) {
249 		dev_err(kfd_device,
250 			"Error initializing queue manager for device (%x:%x)\n",
251 			kfd->pdev->vendor, kfd->pdev->device);
252 		goto device_queue_manager_error;
253 	}
254 
255 	if (kfd->dqm->ops.start(kfd->dqm) != 0) {
256 		dev_err(kfd_device,
257 			"Error starting queuen manager for device (%x:%x)\n",
258 			kfd->pdev->vendor, kfd->pdev->device);
259 		goto dqm_start_error;
260 	}
261 
262 	kfd->init_complete = true;
263 	dev_info(kfd_device, "added device (%x:%x)\n", kfd->pdev->vendor,
264 		 kfd->pdev->device);
265 
266 	pr_debug("kfd: Starting kfd with the following scheduling policy %d\n",
267 		sched_policy);
268 
269 	goto out;
270 
271 dqm_start_error:
272 	device_queue_manager_uninit(kfd->dqm);
273 device_queue_manager_error:
274 	amd_iommu_free_device(kfd->pdev);
275 device_iommu_pasid_error:
276 	kfd_topology_remove_device(kfd);
277 kfd_topology_add_device_error:
278 	kfd_gtt_sa_fini(kfd);
279 kfd_gtt_sa_init_error:
280 	kfd->kfd2kgd->free_gtt_mem(kfd->kgd, kfd->gtt_mem);
281 	dev_err(kfd_device,
282 		"device (%x:%x) NOT added due to errors\n",
283 		kfd->pdev->vendor, kfd->pdev->device);
284 out:
285 	return kfd->init_complete;
286 }
287 
288 void kgd2kfd_device_exit(struct kfd_dev *kfd)
289 {
290 	if (kfd->init_complete) {
291 		device_queue_manager_uninit(kfd->dqm);
292 		amd_iommu_free_device(kfd->pdev);
293 		kfd_topology_remove_device(kfd);
294 		kfd_gtt_sa_fini(kfd);
295 		kfd->kfd2kgd->free_gtt_mem(kfd->kgd, kfd->gtt_mem);
296 	}
297 
298 	kfree(kfd);
299 }
300 
301 void kgd2kfd_suspend(struct kfd_dev *kfd)
302 {
303 	BUG_ON(kfd == NULL);
304 
305 	if (kfd->init_complete) {
306 		kfd->dqm->ops.stop(kfd->dqm);
307 		amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
308 		amd_iommu_free_device(kfd->pdev);
309 	}
310 }
311 
312 int kgd2kfd_resume(struct kfd_dev *kfd)
313 {
314 	unsigned int pasid_limit;
315 	int err;
316 
317 	BUG_ON(kfd == NULL);
318 
319 	pasid_limit = kfd_get_pasid_limit();
320 
321 	if (kfd->init_complete) {
322 		err = amd_iommu_init_device(kfd->pdev, pasid_limit);
323 		if (err < 0)
324 			return -ENXIO;
325 		amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
326 						iommu_pasid_shutdown_callback);
327 		kfd->dqm->ops.start(kfd->dqm);
328 	}
329 
330 	return 0;
331 }
332 
/*
 * Interrupt entry point. This is called directly from KGD at ISR.
 *
 * Currently a stub: neither @kfd nor @ih_ring_entry is used.
 */
void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
{
	/* Process interrupts / schedule work as necessary */
}
338 
339 static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
340 				unsigned int chunk_size)
341 {
342 	unsigned int num_of_bits;
343 
344 	BUG_ON(!kfd);
345 	BUG_ON(!kfd->gtt_mem);
346 	BUG_ON(buf_size < chunk_size);
347 	BUG_ON(buf_size == 0);
348 	BUG_ON(chunk_size == 0);
349 
350 	kfd->gtt_sa_chunk_size = chunk_size;
351 	kfd->gtt_sa_num_of_chunks = buf_size / chunk_size;
352 
353 	num_of_bits = kfd->gtt_sa_num_of_chunks / BITS_PER_BYTE;
354 	BUG_ON(num_of_bits == 0);
355 
356 	kfd->gtt_sa_bitmap = kzalloc(num_of_bits, GFP_KERNEL);
357 
358 	if (!kfd->gtt_sa_bitmap)
359 		return -ENOMEM;
360 
361 	pr_debug("kfd: gtt_sa_num_of_chunks = %d, gtt_sa_bitmap = %p\n",
362 			kfd->gtt_sa_num_of_chunks, kfd->gtt_sa_bitmap);
363 
364 	mutex_init(&kfd->gtt_sa_lock);
365 
366 	return 0;
367 
368 }
369 
370 static void kfd_gtt_sa_fini(struct kfd_dev *kfd)
371 {
372 	mutex_destroy(&kfd->gtt_sa_lock);
373 	kfree(kfd->gtt_sa_bitmap);
374 }
375 
/*
 * Address of chunk number @bit_num in a region that starts at @start_addr
 * and is divided into chunks of @chunk_size bytes.
 *
 * The offset is computed in 64 bits so that bit_num * chunk_size cannot be
 * truncated to 32 bits before it is added to the address.
 */
static inline uint64_t kfd_gtt_sa_calc_gpu_addr(uint64_t start_addr,
						unsigned int bit_num,
						unsigned int chunk_size)
{
	return start_addr + (uint64_t)bit_num * chunk_size;
}
382 
/*
 * CPU pointer to chunk number @bit_num in a mapping that starts at
 * @start_addr and is divided into chunks of @chunk_size bytes.
 *
 * The arithmetic is done in a pointer-sized integer: this avoids casting a
 * pointer to a fixed 64-bit type and keeps bit_num * chunk_size from being
 * truncated to 32 bits where pointers are wider than that.
 */
static inline uint32_t *kfd_gtt_sa_calc_cpu_addr(void *start_addr,
						unsigned int bit_num,
						unsigned int chunk_size)
{
	uintptr_t offset = (uintptr_t)bit_num * chunk_size;

	return (uint32_t *)((uintptr_t)start_addr + offset);
}
389 
390 int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size,
391 			struct kfd_mem_obj **mem_obj)
392 {
393 	unsigned int found, start_search, cur_size;
394 
395 	BUG_ON(!kfd);
396 
397 	if (size == 0)
398 		return -EINVAL;
399 
400 	if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size)
401 		return -ENOMEM;
402 
403 	*mem_obj = kmalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
404 	if ((*mem_obj) == NULL)
405 		return -ENOMEM;
406 
407 	pr_debug("kfd: allocated mem_obj = %p for size = %d\n", *mem_obj, size);
408 
409 	start_search = 0;
410 
411 	mutex_lock(&kfd->gtt_sa_lock);
412 
413 kfd_gtt_restart_search:
414 	/* Find the first chunk that is free */
415 	found = find_next_zero_bit(kfd->gtt_sa_bitmap,
416 					kfd->gtt_sa_num_of_chunks,
417 					start_search);
418 
419 	pr_debug("kfd: found = %d\n", found);
420 
421 	/* If there wasn't any free chunk, bail out */
422 	if (found == kfd->gtt_sa_num_of_chunks)
423 		goto kfd_gtt_no_free_chunk;
424 
425 	/* Update fields of mem_obj */
426 	(*mem_obj)->range_start = found;
427 	(*mem_obj)->range_end = found;
428 	(*mem_obj)->gpu_addr = kfd_gtt_sa_calc_gpu_addr(
429 					kfd->gtt_start_gpu_addr,
430 					found,
431 					kfd->gtt_sa_chunk_size);
432 	(*mem_obj)->cpu_ptr = kfd_gtt_sa_calc_cpu_addr(
433 					kfd->gtt_start_cpu_ptr,
434 					found,
435 					kfd->gtt_sa_chunk_size);
436 
437 	pr_debug("kfd: gpu_addr = %p, cpu_addr = %p\n",
438 			(uint64_t *) (*mem_obj)->gpu_addr, (*mem_obj)->cpu_ptr);
439 
440 	/* If we need only one chunk, mark it as allocated and get out */
441 	if (size <= kfd->gtt_sa_chunk_size) {
442 		pr_debug("kfd: single bit\n");
443 		set_bit(found, kfd->gtt_sa_bitmap);
444 		goto kfd_gtt_out;
445 	}
446 
447 	/* Otherwise, try to see if we have enough contiguous chunks */
448 	cur_size = size - kfd->gtt_sa_chunk_size;
449 	do {
450 		(*mem_obj)->range_end =
451 			find_next_zero_bit(kfd->gtt_sa_bitmap,
452 					kfd->gtt_sa_num_of_chunks, ++found);
453 		/*
454 		 * If next free chunk is not contiguous than we need to
455 		 * restart our search from the last free chunk we found (which
456 		 * wasn't contiguous to the previous ones
457 		 */
458 		if ((*mem_obj)->range_end != found) {
459 			start_search = found;
460 			goto kfd_gtt_restart_search;
461 		}
462 
463 		/*
464 		 * If we reached end of buffer, bail out with error
465 		 */
466 		if (found == kfd->gtt_sa_num_of_chunks)
467 			goto kfd_gtt_no_free_chunk;
468 
469 		/* Check if we don't need another chunk */
470 		if (cur_size <= kfd->gtt_sa_chunk_size)
471 			cur_size = 0;
472 		else
473 			cur_size -= kfd->gtt_sa_chunk_size;
474 
475 	} while (cur_size > 0);
476 
477 	pr_debug("kfd: range_start = %d, range_end = %d\n",
478 		(*mem_obj)->range_start, (*mem_obj)->range_end);
479 
480 	/* Mark the chunks as allocated */
481 	for (found = (*mem_obj)->range_start;
482 		found <= (*mem_obj)->range_end;
483 		found++)
484 		set_bit(found, kfd->gtt_sa_bitmap);
485 
486 kfd_gtt_out:
487 	mutex_unlock(&kfd->gtt_sa_lock);
488 	return 0;
489 
490 kfd_gtt_no_free_chunk:
491 	pr_debug("kfd: allocation failed with mem_obj = %p\n", mem_obj);
492 	mutex_unlock(&kfd->gtt_sa_lock);
493 	kfree(mem_obj);
494 	return -ENOMEM;
495 }
496 
497 int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj)
498 {
499 	unsigned int bit;
500 
501 	BUG_ON(!kfd);
502 
503 	/* Act like kfree when trying to free a NULL object */
504 	if (!mem_obj)
505 		return 0;
506 
507 	pr_debug("kfd: free mem_obj = %p, range_start = %d, range_end = %d\n",
508 			mem_obj, mem_obj->range_start, mem_obj->range_end);
509 
510 	mutex_lock(&kfd->gtt_sa_lock);
511 
512 	/* Mark the chunks as free */
513 	for (bit = mem_obj->range_start;
514 		bit <= mem_obj->range_end;
515 		bit++)
516 		clear_bit(bit, kfd->gtt_sa_bitmap);
517 
518 	mutex_unlock(&kfd->gtt_sa_lock);
519 
520 	kfree(mem_obj);
521 	return 0;
522 }
523