1 /*
2  * Copyright 2014 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include <linux/amd-iommu.h>
24 #include <linux/bsearch.h>
25 #include <linux/pci.h>
26 #include <linux/slab.h>
27 #include "kfd_priv.h"
28 #include "kfd_device_queue_manager.h"
29 #include "kfd_pm4_headers.h"
30 
/*
 * Value stored in the .mqd_size_aligned field of the device info tables
 * below; kgd2kfd_device_init() multiplies that field by
 * max_num_of_queues_per_device when sizing the GTT buffer.
 */
#define MQD_SIZE_ALIGNED 768
32 
/*
 * Static parameters for CHIP_KAVERI devices; every Kaveri entry of
 * supported_devices[] points here.
 *
 * NOTE(review): unlike carrizo_device_info, .num_of_watch_points is not
 * set here and is therefore zero-initialized - confirm this is intended.
 */
static const struct kfd_device_info kaveri_device_info = {
	.asic_family = CHIP_KAVERI,
	.max_pasid_bits = 16,
	.ih_ring_entry_size = 4 * sizeof(uint32_t),
	.mqd_size_aligned = MQD_SIZE_ALIGNED
};
39 
/*
 * Static parameters for CHIP_CARRIZO devices.
 *
 * NOTE(review): no entry of supported_devices[] visible in this file
 * refers to this structure - confirm whether Carrizo device ids are
 * still to be added.
 */
static const struct kfd_device_info carrizo_device_info = {
	.asic_family = CHIP_CARRIZO,
	.max_pasid_bits = 16,
	.ih_ring_entry_size = 4 * sizeof(uint32_t),
	.num_of_watch_points = 4,
	.mqd_size_aligned = MQD_SIZE_ALIGNED
};
47 
/* One row of the supported-device table: a device id and its parameters. */
struct kfd_deviceid {
	/* Device id; lookup_device_info() compares it with pci_dev->device */
	unsigned short did;
	/* Parameters for that device; must not be NULL (BUG_ON in lookup) */
	const struct kfd_device_info *device_info;
};
52 
/*
 * Device ids accepted by kgd2kfd_probe(), each mapped to the
 * kfd_device_info describing it.
 *
 * Please keep this sorted by increasing device id.
 */
static const struct kfd_deviceid supported_devices[] = {
	{ 0x1304, &kaveri_device_info },	/* Kaveri */
	{ 0x1305, &kaveri_device_info },	/* Kaveri */
	{ 0x1306, &kaveri_device_info },	/* Kaveri */
	{ 0x1307, &kaveri_device_info },	/* Kaveri */
	{ 0x1309, &kaveri_device_info },	/* Kaveri */
	{ 0x130A, &kaveri_device_info },	/* Kaveri */
	{ 0x130B, &kaveri_device_info },	/* Kaveri */
	{ 0x130C, &kaveri_device_info },	/* Kaveri */
	{ 0x130D, &kaveri_device_info },	/* Kaveri */
	{ 0x130E, &kaveri_device_info },	/* Kaveri */
	{ 0x130F, &kaveri_device_info },	/* Kaveri */
	{ 0x1310, &kaveri_device_info },	/* Kaveri */
	{ 0x1311, &kaveri_device_info },	/* Kaveri */
	{ 0x1312, &kaveri_device_info },	/* Kaveri */
	{ 0x1313, &kaveri_device_info },	/* Kaveri */
	{ 0x1315, &kaveri_device_info },	/* Kaveri */
	{ 0x1316, &kaveri_device_info },	/* Kaveri */
	{ 0x1317, &kaveri_device_info },	/* Kaveri */
	{ 0x1318, &kaveri_device_info },	/* Kaveri */
	{ 0x131B, &kaveri_device_info },	/* Kaveri */
	{ 0x131C, &kaveri_device_info },	/* Kaveri */
	{ 0x131D, &kaveri_device_info }		/* Kaveri */
};
78 
/*
 * Forward declarations: the GTT sub-allocator setup/teardown helpers are
 * defined further down in this file but are called from
 * kgd2kfd_device_init() and kgd2kfd_device_exit().
 */
static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
				unsigned int chunk_size);
static void kfd_gtt_sa_fini(struct kfd_dev *kfd);
82 
83 static const struct kfd_device_info *lookup_device_info(unsigned short did)
84 {
85 	size_t i;
86 
87 	for (i = 0; i < ARRAY_SIZE(supported_devices); i++) {
88 		if (supported_devices[i].did == did) {
89 			BUG_ON(supported_devices[i].device_info == NULL);
90 			return supported_devices[i].device_info;
91 		}
92 	}
93 
94 	return NULL;
95 }
96 
97 struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, struct pci_dev *pdev)
98 {
99 	struct kfd_dev *kfd;
100 
101 	const struct kfd_device_info *device_info =
102 					lookup_device_info(pdev->device);
103 
104 	if (!device_info)
105 		return NULL;
106 
107 	kfd = kzalloc(sizeof(*kfd), GFP_KERNEL);
108 	if (!kfd)
109 		return NULL;
110 
111 	kfd->kgd = kgd;
112 	kfd->device_info = device_info;
113 	kfd->pdev = pdev;
114 	kfd->init_complete = false;
115 
116 	return kfd;
117 }
118 
119 static bool device_iommu_pasid_init(struct kfd_dev *kfd)
120 {
121 	const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP |
122 					AMD_IOMMU_DEVICE_FLAG_PRI_SUP |
123 					AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
124 
125 	struct amd_iommu_device_info iommu_info;
126 	unsigned int pasid_limit;
127 	int err;
128 
129 	err = amd_iommu_device_info(kfd->pdev, &iommu_info);
130 	if (err < 0) {
131 		dev_err(kfd_device,
132 			"error getting iommu info. is the iommu enabled?\n");
133 		return false;
134 	}
135 
136 	if ((iommu_info.flags & required_iommu_flags) != required_iommu_flags) {
137 		dev_err(kfd_device, "error required iommu flags ats(%i), pri(%i), pasid(%i)\n",
138 		       (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP) != 0,
139 		       (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) != 0,
140 		       (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) != 0);
141 		return false;
142 	}
143 
144 	pasid_limit = min_t(unsigned int,
145 			(unsigned int)1 << kfd->device_info->max_pasid_bits,
146 			iommu_info.max_pasids);
147 	/*
148 	 * last pasid is used for kernel queues doorbells
149 	 * in the future the last pasid might be used for a kernel thread.
150 	 */
151 	pasid_limit = min_t(unsigned int,
152 				pasid_limit,
153 				kfd->doorbell_process_limit - 1);
154 
155 	err = amd_iommu_init_device(kfd->pdev, pasid_limit);
156 	if (err < 0) {
157 		dev_err(kfd_device, "error initializing iommu device\n");
158 		return false;
159 	}
160 
161 	if (!kfd_set_pasid_limit(pasid_limit)) {
162 		dev_err(kfd_device, "error setting pasid limit\n");
163 		amd_iommu_free_device(kfd->pdev);
164 		return false;
165 	}
166 
167 	return true;
168 }
169 
170 static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid)
171 {
172 	struct kfd_dev *dev = kfd_device_by_pci_dev(pdev);
173 
174 	if (dev)
175 		kfd_unbind_process_from_device(dev, pasid);
176 }
177 
178 bool kgd2kfd_device_init(struct kfd_dev *kfd,
179 			 const struct kgd2kfd_shared_resources *gpu_resources)
180 {
181 	unsigned int size;
182 
183 	kfd->shared_resources = *gpu_resources;
184 
185 	/* calculate max size of mqds needed for queues */
186 	size = max_num_of_queues_per_device *
187 			kfd->device_info->mqd_size_aligned;
188 
189 	/*
190 	 * calculate max size of runlist packet.
191 	 * There can be only 2 packets at once
192 	 */
193 	size += (KFD_MAX_NUM_OF_PROCESSES * sizeof(struct pm4_map_process) +
194 		max_num_of_queues_per_device *
195 		sizeof(struct pm4_map_queues) + sizeof(struct pm4_runlist)) * 2;
196 
197 	/* Add size of HIQ & DIQ */
198 	size += KFD_KERNEL_QUEUE_SIZE * 2;
199 
200 	/* add another 512KB for all other allocations on gart (HPD, fences) */
201 	size += 512 * 1024;
202 
203 	if (kfd2kgd->init_gtt_mem_allocation(kfd->kgd, size, &kfd->gtt_mem,
204 			&kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr)) {
205 		dev_err(kfd_device,
206 			"Could not allocate %d bytes for device (%x:%x)\n",
207 			size, kfd->pdev->vendor, kfd->pdev->device);
208 		goto out;
209 	}
210 
211 	dev_info(kfd_device,
212 		"Allocated %d bytes on gart for device(%x:%x)\n",
213 		size, kfd->pdev->vendor, kfd->pdev->device);
214 
215 	/* Initialize GTT sa with 512 byte chunk size */
216 	if (kfd_gtt_sa_init(kfd, size, 512) != 0) {
217 		dev_err(kfd_device,
218 			"Error initializing gtt sub-allocator\n");
219 		goto kfd_gtt_sa_init_error;
220 	}
221 
222 	kfd_doorbell_init(kfd);
223 
224 	if (kfd_topology_add_device(kfd) != 0) {
225 		dev_err(kfd_device,
226 			"Error adding device (%x:%x) to topology\n",
227 			kfd->pdev->vendor, kfd->pdev->device);
228 		goto kfd_topology_add_device_error;
229 	}
230 
231 	if (!device_iommu_pasid_init(kfd)) {
232 		dev_err(kfd_device,
233 			"Error initializing iommuv2 for device (%x:%x)\n",
234 			kfd->pdev->vendor, kfd->pdev->device);
235 		goto device_iommu_pasid_error;
236 	}
237 	amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
238 						iommu_pasid_shutdown_callback);
239 
240 	kfd->dqm = device_queue_manager_init(kfd);
241 	if (!kfd->dqm) {
242 		dev_err(kfd_device,
243 			"Error initializing queue manager for device (%x:%x)\n",
244 			kfd->pdev->vendor, kfd->pdev->device);
245 		goto device_queue_manager_error;
246 	}
247 
248 	if (kfd->dqm->ops.start(kfd->dqm) != 0) {
249 		dev_err(kfd_device,
250 			"Error starting queuen manager for device (%x:%x)\n",
251 			kfd->pdev->vendor, kfd->pdev->device);
252 		goto dqm_start_error;
253 	}
254 
255 	kfd->init_complete = true;
256 	dev_info(kfd_device, "added device (%x:%x)\n", kfd->pdev->vendor,
257 		 kfd->pdev->device);
258 
259 	pr_debug("kfd: Starting kfd with the following scheduling policy %d\n",
260 		sched_policy);
261 
262 	goto out;
263 
264 dqm_start_error:
265 	device_queue_manager_uninit(kfd->dqm);
266 device_queue_manager_error:
267 	amd_iommu_free_device(kfd->pdev);
268 device_iommu_pasid_error:
269 	kfd_topology_remove_device(kfd);
270 kfd_topology_add_device_error:
271 	kfd_gtt_sa_fini(kfd);
272 kfd_gtt_sa_init_error:
273 	kfd2kgd->free_gtt_mem(kfd->kgd, kfd->gtt_mem);
274 	dev_err(kfd_device,
275 		"device (%x:%x) NOT added due to errors\n",
276 		kfd->pdev->vendor, kfd->pdev->device);
277 out:
278 	return kfd->init_complete;
279 }
280 
281 void kgd2kfd_device_exit(struct kfd_dev *kfd)
282 {
283 	if (kfd->init_complete) {
284 		device_queue_manager_uninit(kfd->dqm);
285 		amd_iommu_free_device(kfd->pdev);
286 		kfd_topology_remove_device(kfd);
287 		kfd_gtt_sa_fini(kfd);
288 		kfd2kgd->free_gtt_mem(kfd->kgd, kfd->gtt_mem);
289 	}
290 
291 	kfree(kfd);
292 }
293 
294 void kgd2kfd_suspend(struct kfd_dev *kfd)
295 {
296 	BUG_ON(kfd == NULL);
297 
298 	if (kfd->init_complete) {
299 		kfd->dqm->ops.stop(kfd->dqm);
300 		amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
301 		amd_iommu_free_device(kfd->pdev);
302 	}
303 }
304 
305 int kgd2kfd_resume(struct kfd_dev *kfd)
306 {
307 	unsigned int pasid_limit;
308 	int err;
309 
310 	BUG_ON(kfd == NULL);
311 
312 	pasid_limit = kfd_get_pasid_limit();
313 
314 	if (kfd->init_complete) {
315 		err = amd_iommu_init_device(kfd->pdev, pasid_limit);
316 		if (err < 0)
317 			return -ENXIO;
318 		amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
319 						iommu_pasid_shutdown_callback);
320 		kfd->dqm->ops.start(kfd->dqm);
321 	}
322 
323 	return 0;
324 }
325 
/*
 * This is called directly from KGD at ISR.
 *
 * Currently a stub: neither @kfd nor @ih_ring_entry is examined and no
 * work is performed.
 */
void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
{
	/* Process interrupts / schedule work as necessary */
}
331 
332 static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
333 				unsigned int chunk_size)
334 {
335 	unsigned int num_of_bits;
336 
337 	BUG_ON(!kfd);
338 	BUG_ON(!kfd->gtt_mem);
339 	BUG_ON(buf_size < chunk_size);
340 	BUG_ON(buf_size == 0);
341 	BUG_ON(chunk_size == 0);
342 
343 	kfd->gtt_sa_chunk_size = chunk_size;
344 	kfd->gtt_sa_num_of_chunks = buf_size / chunk_size;
345 
346 	num_of_bits = kfd->gtt_sa_num_of_chunks / BITS_PER_BYTE;
347 	BUG_ON(num_of_bits == 0);
348 
349 	kfd->gtt_sa_bitmap = kzalloc(num_of_bits, GFP_KERNEL);
350 
351 	if (!kfd->gtt_sa_bitmap)
352 		return -ENOMEM;
353 
354 	pr_debug("kfd: gtt_sa_num_of_chunks = %d, gtt_sa_bitmap = %p\n",
355 			kfd->gtt_sa_num_of_chunks, kfd->gtt_sa_bitmap);
356 
357 	mutex_init(&kfd->gtt_sa_lock);
358 
359 	return 0;
360 
361 }
362 
363 static void kfd_gtt_sa_fini(struct kfd_dev *kfd)
364 {
365 	mutex_destroy(&kfd->gtt_sa_lock);
366 	kfree(kfd->gtt_sa_bitmap);
367 }
368 
/*
 * Compute the GPU address of a chunk.
 *
 * @start_addr: GPU address of chunk 0
 * @bit_num:    index of the chunk (its bit number in the bitmap)
 * @chunk_size: size of one chunk in bytes
 *
 * Returns start_addr + bit_num * chunk_size. The offset is computed in
 * 64 bits so it cannot wrap at 4 GiB before being added to the base.
 */
static inline uint64_t kfd_gtt_sa_calc_gpu_addr(uint64_t start_addr,
						unsigned int bit_num,
						unsigned int chunk_size)
{
	return start_addr + (uint64_t)bit_num * chunk_size;
}
375 
/*
 * Compute the CPU pointer of a chunk.
 *
 * @start_addr: CPU address of chunk 0
 * @bit_num:    index of the chunk (its bit number in the bitmap)
 * @chunk_size: size of one chunk in bytes
 *
 * Returns start_addr advanced by bit_num * chunk_size bytes. Byte-pointer
 * arithmetic is used instead of casting the pointer to uint64_t, which is
 * not pointer-sized on 32-bit builds; the offset is computed in size_t
 * rather than unsigned int.
 */
static inline uint32_t *kfd_gtt_sa_calc_cpu_addr(void *start_addr,
						unsigned int bit_num,
						unsigned int chunk_size)
{
	return (uint32_t *) ((char *) start_addr +
				(size_t) bit_num * chunk_size);
}
382 
383 int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size,
384 			struct kfd_mem_obj **mem_obj)
385 {
386 	unsigned int found, start_search, cur_size;
387 
388 	BUG_ON(!kfd);
389 
390 	if (size == 0)
391 		return -EINVAL;
392 
393 	if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size)
394 		return -ENOMEM;
395 
396 	*mem_obj = kmalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
397 	if ((*mem_obj) == NULL)
398 		return -ENOMEM;
399 
400 	pr_debug("kfd: allocated mem_obj = %p for size = %d\n", *mem_obj, size);
401 
402 	start_search = 0;
403 
404 	mutex_lock(&kfd->gtt_sa_lock);
405 
406 kfd_gtt_restart_search:
407 	/* Find the first chunk that is free */
408 	found = find_next_zero_bit(kfd->gtt_sa_bitmap,
409 					kfd->gtt_sa_num_of_chunks,
410 					start_search);
411 
412 	pr_debug("kfd: found = %d\n", found);
413 
414 	/* If there wasn't any free chunk, bail out */
415 	if (found == kfd->gtt_sa_num_of_chunks)
416 		goto kfd_gtt_no_free_chunk;
417 
418 	/* Update fields of mem_obj */
419 	(*mem_obj)->range_start = found;
420 	(*mem_obj)->range_end = found;
421 	(*mem_obj)->gpu_addr = kfd_gtt_sa_calc_gpu_addr(
422 					kfd->gtt_start_gpu_addr,
423 					found,
424 					kfd->gtt_sa_chunk_size);
425 	(*mem_obj)->cpu_ptr = kfd_gtt_sa_calc_cpu_addr(
426 					kfd->gtt_start_cpu_ptr,
427 					found,
428 					kfd->gtt_sa_chunk_size);
429 
430 	pr_debug("kfd: gpu_addr = %p, cpu_addr = %p\n",
431 			(uint64_t *) (*mem_obj)->gpu_addr, (*mem_obj)->cpu_ptr);
432 
433 	/* If we need only one chunk, mark it as allocated and get out */
434 	if (size <= kfd->gtt_sa_chunk_size) {
435 		pr_debug("kfd: single bit\n");
436 		set_bit(found, kfd->gtt_sa_bitmap);
437 		goto kfd_gtt_out;
438 	}
439 
440 	/* Otherwise, try to see if we have enough contiguous chunks */
441 	cur_size = size - kfd->gtt_sa_chunk_size;
442 	do {
443 		(*mem_obj)->range_end =
444 			find_next_zero_bit(kfd->gtt_sa_bitmap,
445 					kfd->gtt_sa_num_of_chunks, ++found);
446 		/*
447 		 * If next free chunk is not contiguous than we need to
448 		 * restart our search from the last free chunk we found (which
449 		 * wasn't contiguous to the previous ones
450 		 */
451 		if ((*mem_obj)->range_end != found) {
452 			start_search = found;
453 			goto kfd_gtt_restart_search;
454 		}
455 
456 		/*
457 		 * If we reached end of buffer, bail out with error
458 		 */
459 		if (found == kfd->gtt_sa_num_of_chunks)
460 			goto kfd_gtt_no_free_chunk;
461 
462 		/* Check if we don't need another chunk */
463 		if (cur_size <= kfd->gtt_sa_chunk_size)
464 			cur_size = 0;
465 		else
466 			cur_size -= kfd->gtt_sa_chunk_size;
467 
468 	} while (cur_size > 0);
469 
470 	pr_debug("kfd: range_start = %d, range_end = %d\n",
471 		(*mem_obj)->range_start, (*mem_obj)->range_end);
472 
473 	/* Mark the chunks as allocated */
474 	for (found = (*mem_obj)->range_start;
475 		found <= (*mem_obj)->range_end;
476 		found++)
477 		set_bit(found, kfd->gtt_sa_bitmap);
478 
479 kfd_gtt_out:
480 	mutex_unlock(&kfd->gtt_sa_lock);
481 	return 0;
482 
483 kfd_gtt_no_free_chunk:
484 	pr_debug("kfd: allocation failed with mem_obj = %p\n", mem_obj);
485 	mutex_unlock(&kfd->gtt_sa_lock);
486 	kfree(mem_obj);
487 	return -ENOMEM;
488 }
489 
490 int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj)
491 {
492 	unsigned int bit;
493 
494 	BUG_ON(!kfd);
495 
496 	/* Act like kfree when trying to free a NULL object */
497 	if (!mem_obj)
498 		return 0;
499 
500 	pr_debug("kfd: free mem_obj = %p, range_start = %d, range_end = %d\n",
501 			mem_obj, mem_obj->range_start, mem_obj->range_end);
502 
503 	mutex_lock(&kfd->gtt_sa_lock);
504 
505 	/* Mark the chunks as free */
506 	for (bit = mem_obj->range_start;
507 		bit <= mem_obj->range_end;
508 		bit++)
509 		clear_bit(bit, kfd->gtt_sa_bitmap);
510 
511 	mutex_unlock(&kfd->gtt_sa_lock);
512 
513 	kfree(mem_obj);
514 	return 0;
515 }
516