xref: /openbmc/linux/arch/x86/kernel/cpu/sgx/main.c (revision c6acb1e7)
// SPDX-License-Identifier: GPL-2.0
/*  Copyright(c) 2016-20 Intel Corporation. */

#include <linux/file.h>
#include <linux/freezer.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/miscdevice.h>
#include <linux/pagemap.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
#include "encls.h"

struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
static int sgx_nr_epc_sections;
static struct task_struct *ksgxd_tsk;
static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
static DEFINE_XARRAY(sgx_epc_address_space);

/*
 * These variables are part of the state of the reclaimer, and must be accessed
 * with sgx_reclaimer_lock acquired.
 */
static LIST_HEAD(sgx_active_page_list);
static DEFINE_SPINLOCK(sgx_reclaimer_lock);

/*
 * Number of free EPC pages across all nodes, updated under the owning
 * node's lock.
 */
static unsigned long sgx_nr_free_pages;

/* Nodes with one or more EPC sections. */
static nodemask_t sgx_numa_mask;

/*
 * Array with one list_head for each possible NUMA node.  Each
 * list contains all the sgx_epc_section's which are on that
 * node.
 */
static struct sgx_numa_node *sgx_numa_nodes;

static LIST_HEAD(sgx_dirty_page_list);

/*
 * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
 * from the input list and made available to the page allocator. SECS pages
 * that still precede their child pages in the input list cannot be removed
 * yet and are left on the list for a later pass.
 */
static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
{
	struct sgx_epc_page *page;
	LIST_HEAD(dirty);
	int ret;

	/* dirty_page_list is thread-local, no need for a lock: */
	while (!list_empty(dirty_page_list)) {
		if (kthread_should_stop())
			return;

		page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);

		/*
		 * Checking page->poison without holding the node->lock
		 * is racy, but losing the race (i.e. poison is set just
		 * after the check) just means __eremove() will be uselessly
		 * called for a page that sgx_free_epc_page() will put onto
		 * the node->sgx_poison_page_list later.
		 */
		if (page->poison) {
			struct sgx_epc_section *section = &sgx_epc_sections[page->section];
			struct sgx_numa_node *node = section->node;

			spin_lock(&node->lock);
			list_move(&page->list, &node->sgx_poison_page_list);
			spin_unlock(&node->lock);

			continue;
		}

		ret = __eremove(sgx_get_epc_virt_addr(page));
		if (!ret) {
			/*
			 * page is now sanitized.  Make it available via the SGX
			 * page allocator:
			 */
			list_del(&page->list);
			sgx_free_epc_page(page);
		} else {
			/* The page is not yet clean - move to the dirty list. */
			list_move_tail(&page->list, &dirty);
		}

		cond_resched();
	}

	list_splice(&dirty, dirty_page_list);
}

static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
{
	struct sgx_encl_page *page = epc_page->owner;
	struct sgx_encl *encl = page->encl;
	struct sgx_encl_mm *encl_mm;
	bool ret = true;
	int idx;

	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
		if (!mmget_not_zero(encl_mm->mm))
			continue;

		mmap_read_lock(encl_mm->mm);
		ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page);
		mmap_read_unlock(encl_mm->mm);

		mmput_async(encl_mm->mm);

		if (!ret)
			break;
	}

	srcu_read_unlock(&encl->srcu, idx);

	if (!ret)
		return false;

	return true;
}

static void sgx_reclaimer_block(struct sgx_epc_page *epc_page)
{
	struct sgx_encl_page *page = epc_page->owner;
	unsigned long addr = page->desc & PAGE_MASK;
	struct sgx_encl *encl = page->encl;
	unsigned long mm_list_version;
	struct sgx_encl_mm *encl_mm;
	struct vm_area_struct *vma;
	int idx, ret;

	do {
		mm_list_version = encl->mm_list_version;

		/* Pairs with smp_rmb() in sgx_encl_mm_add(). */
		smp_rmb();

		idx = srcu_read_lock(&encl->srcu);

		list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
			if (!mmget_not_zero(encl_mm->mm))
				continue;

			mmap_read_lock(encl_mm->mm);

			ret = sgx_encl_find(encl_mm->mm, addr, &vma);
			if (!ret && encl == vma->vm_private_data)
				zap_vma_ptes(vma, addr, PAGE_SIZE);

			mmap_read_unlock(encl_mm->mm);

			mmput_async(encl_mm->mm);
		}

		srcu_read_unlock(&encl->srcu, idx);
	} while (unlikely(encl->mm_list_version != mm_list_version));

	mutex_lock(&encl->lock);

	ret = __eblock(sgx_get_epc_virt_addr(epc_page));
	if (encls_failed(ret))
		ENCLS_WARN(ret, "EBLOCK");

	mutex_unlock(&encl->lock);
}

static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
			  struct sgx_backing *backing)
{
	struct sgx_pageinfo pginfo;
	int ret;

	pginfo.addr = 0;
	pginfo.secs = 0;

	pginfo.contents = (unsigned long)kmap_atomic(backing->contents);
	pginfo.metadata = (unsigned long)kmap_atomic(backing->pcmd) +
			  backing->pcmd_offset;

	ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot);

	kunmap_atomic((void *)(unsigned long)(pginfo.metadata -
					      backing->pcmd_offset));
	kunmap_atomic((void *)(unsigned long)pginfo.contents);

	return ret;
}

static void sgx_ipi_cb(void *info)
{
}

static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl)
{
	cpumask_t *cpumask = &encl->cpumask;
	struct sgx_encl_mm *encl_mm;
	int idx;

	/*
	 * Can race with sgx_encl_mm_add(), but ETRACK has already been
	 * executed, which means that the CPUs running in the new mm will enter
	 * into the enclave with a fresh epoch.
	 */
	cpumask_clear(cpumask);

	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
		if (!mmget_not_zero(encl_mm->mm))
			continue;

		cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm));

		mmput_async(encl_mm->mm);
	}

	srcu_read_unlock(&encl->srcu, idx);

	return cpumask;
}

/*
 * Swap a page out to regular memory after it has been transitioned to the
 * blocked state with EBLOCK, which ensures that no new TLB entries can be
 * created for it.
 *
 * The first attempt issues EWB directly, assuming that some other thread has
 * already reset the tracking counter for the enclave with ETRACK and the
 * previous thread count has drained to zero. The second attempt issues ETRACK
 * before EWB. If that also fails, all hardware threads are kicked out of the
 * enclave with IPIs and EWB is retried, which is then guaranteed to succeed.
 */
static void sgx_encl_ewb(struct sgx_epc_page *epc_page,
			 struct sgx_backing *backing)
{
	struct sgx_encl_page *encl_page = epc_page->owner;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_va_page *va_page;
	unsigned int va_offset;
	void *va_slot;
	int ret;

	encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED;

	va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
				   list);
	va_offset = sgx_alloc_va_slot(va_page);
	va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset;
	if (sgx_va_page_full(va_page))
		list_move_tail(&va_page->list, &encl->va_pages);

	ret = __sgx_encl_ewb(epc_page, va_slot, backing);
	if (ret == SGX_NOT_TRACKED) {
		ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page));
		if (ret) {
			if (encls_failed(ret))
				ENCLS_WARN(ret, "ETRACK");
		}

		ret = __sgx_encl_ewb(epc_page, va_slot, backing);
		if (ret == SGX_NOT_TRACKED) {
			/*
			 * Slow path, send IPIs to kick cpus out of the
			 * enclave.  Note, it's imperative that the cpu
			 * mask is generated *after* ETRACK, else we'll
			 * miss cpus that entered the enclave between
			 * generating the mask and incrementing epoch.
			 */
			on_each_cpu_mask(sgx_encl_ewb_cpumask(encl),
					 sgx_ipi_cb, NULL, 1);
			ret = __sgx_encl_ewb(epc_page, va_slot, backing);
		}
	}

	if (ret) {
		if (encls_failed(ret))
			ENCLS_WARN(ret, "EWB");

		sgx_free_va_slot(va_page, va_offset);
	} else {
		encl_page->desc |= va_offset;
		encl_page->va_page = va_page;
	}
}

static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
				struct sgx_backing *backing)
{
	struct sgx_encl_page *encl_page = epc_page->owner;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_backing secs_backing;
	int ret;

	mutex_lock(&encl->lock);

	sgx_encl_ewb(epc_page, backing);
	encl_page->epc_page = NULL;
	encl->secs_child_cnt--;

	if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
		ret = sgx_encl_get_backing(encl, PFN_DOWN(encl->size),
					   &secs_backing);
		if (ret)
			goto out;

		sgx_encl_ewb(encl->secs.epc_page, &secs_backing);

		sgx_encl_free_epc_page(encl->secs.epc_page);
		encl->secs.epc_page = NULL;

		sgx_encl_put_backing(&secs_backing, true);
	}

out:
	mutex_unlock(&encl->lock);
}

/*
 * Take a fixed number of pages from the head of the active page pool and
 * reclaim them to the enclave's private shmem files. Skip pages that have been
 * accessed since the last scan, and move them to the tail of the active page
 * pool so that pages get scanned in an LRU-like fashion.
 *
 * Batch-process a chunk of pages (currently 16) in order to reduce the number
 * of IPIs and ETRACKs potentially required. sgx_encl_ewb() spreads the cost
 * somewhat across the HW threads with its three-stage EWB pipeline (EWB,
 * ETRACK + EWB and IPI + EWB), but not sufficiently. Reclaiming one page at a
 * time would also be problematic, as it would increase lock contention too
 * much and halt forward progress.
 */
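/*
 * Overview of the flow below: isolate up to SGX_NR_TO_SCAN pages from the
 * head of the active list, age them (recently accessed pages go back to the
 * tail), EBLOCK the survivors, and finally EWB each survivor and return its
 * EPC page to the owning node's free list.
 */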
static void sgx_reclaim_pages(void)
{
	struct sgx_epc_page *chunk[SGX_NR_TO_SCAN];
	struct sgx_backing backing[SGX_NR_TO_SCAN];
	struct sgx_epc_section *section;
	struct sgx_encl_page *encl_page;
	struct sgx_epc_page *epc_page;
	struct sgx_numa_node *node;
	pgoff_t page_index;
	int cnt = 0;
	int ret;
	int i;

	spin_lock(&sgx_reclaimer_lock);
	for (i = 0; i < SGX_NR_TO_SCAN; i++) {
		if (list_empty(&sgx_active_page_list))
			break;

		epc_page = list_first_entry(&sgx_active_page_list,
					    struct sgx_epc_page, list);
		list_del_init(&epc_page->list);
		encl_page = epc_page->owner;

		if (kref_get_unless_zero(&encl_page->encl->refcount) != 0)
			chunk[cnt++] = epc_page;
		else
			/* The owner is freeing the page. No need to add the
			 * page back to the list of reclaimable pages.
			 */
			epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
	}
	spin_unlock(&sgx_reclaimer_lock);

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		encl_page = epc_page->owner;

		if (!sgx_reclaimer_age(epc_page))
			goto skip;

		page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
		ret = sgx_encl_get_backing(encl_page->encl, page_index, &backing[i]);
		if (ret)
			goto skip;

		mutex_lock(&encl_page->encl->lock);
		encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED;
		mutex_unlock(&encl_page->encl->lock);
		continue;

skip:
		spin_lock(&sgx_reclaimer_lock);
		list_add_tail(&epc_page->list, &sgx_active_page_list);
		spin_unlock(&sgx_reclaimer_lock);

		kref_put(&encl_page->encl->refcount, sgx_encl_release);

		chunk[i] = NULL;
	}

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		if (epc_page)
			sgx_reclaimer_block(epc_page);
	}

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		if (!epc_page)
			continue;

		encl_page = epc_page->owner;
		sgx_reclaimer_write(epc_page, &backing[i]);
		sgx_encl_put_backing(&backing[i], true);

		kref_put(&encl_page->encl->refcount, sgx_encl_release);
		epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;

		section = &sgx_epc_sections[epc_page->section];
		node = section->node;

		spin_lock(&node->lock);
		list_add_tail(&epc_page->list, &node->free_page_list);
		sgx_nr_free_pages++;
		spin_unlock(&node->lock);
	}
}

static bool sgx_should_reclaim(unsigned long watermark)
{
	return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list);
}

static int ksgxd(void *p)
{
	set_freezable();

	/*
	 * Sanitize pages in order to recover from kexec(). The 2nd pass is
	 * required for SECS pages, whose child pages blocked EREMOVE.
	 */
	__sgx_sanitize_pages(&sgx_dirty_page_list);
	__sgx_sanitize_pages(&sgx_dirty_page_list);

	/* sanity check: */
	WARN_ON(!list_empty(&sgx_dirty_page_list));

	while (!kthread_should_stop()) {
		if (try_to_freeze())
			continue;

		wait_event_freezable(ksgxd_waitq,
				     kthread_should_stop() ||
				     sgx_should_reclaim(SGX_NR_HIGH_PAGES));

		if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
			sgx_reclaim_pages();

		cond_resched();
	}

	return 0;
}

static bool __init sgx_page_reclaimer_init(void)
{
	struct task_struct *tsk;

	tsk = kthread_run(ksgxd, NULL, "ksgxd");
	if (IS_ERR(tsk))
		return false;

	ksgxd_tsk = tsk;

	return true;
}

static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
{
	struct sgx_numa_node *node = &sgx_numa_nodes[nid];
	struct sgx_epc_page *page = NULL;

	spin_lock(&node->lock);

	if (list_empty(&node->free_page_list)) {
		spin_unlock(&node->lock);
		return NULL;
	}

	page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
	list_del_init(&page->list);
	sgx_nr_free_pages--;
	page->flags = 0;

	spin_unlock(&node->lock);

	return page;
}

/**
 * __sgx_alloc_epc_page() - Allocate an EPC page
 *
 * Iterate through the NUMA nodes and reserve a free EPC page for the caller.
 * Start from the NUMA node where the caller is executing.
 *
 * Return:
 * - an EPC page:	A free EPC page was available.
 * - ERR_PTR(-ENOMEM):	Out of EPC pages.
 */
struct sgx_epc_page *__sgx_alloc_epc_page(void)
{
	struct sgx_epc_page *page;
	int nid_of_current = numa_node_id();
	int nid = nid_of_current;

	if (node_isset(nid_of_current, sgx_numa_mask)) {
		page = __sgx_alloc_epc_page_from_node(nid_of_current);
		if (page)
			return page;
	}

	/* Fall back to the non-local NUMA nodes: */
	while (true) {
		nid = next_node_in(nid, sgx_numa_mask);
		if (nid == nid_of_current)
			break;

		page = __sgx_alloc_epc_page_from_node(nid);
		if (page)
			return page;
	}

	return ERR_PTR(-ENOMEM);
}
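/*
 * Worked example of the fallback order above (illustrative, not from the
 * code): with EPC on nodes 0, 1 and 3 and a caller running on node 1, node 1
 * is tried first, next_node_in() then walks the mask to node 3 and node 0,
 * and the loop stops once it wraps back to node 1, at which point
 * ERR_PTR(-ENOMEM) is returned.
 */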

/**
 * sgx_mark_page_reclaimable() - Mark a page as reclaimable
 * @page:	EPC page
 *
 * Mark a page as reclaimable and add it to the active page list. Pages
 * are automatically removed from the active list when freed.
 */
void sgx_mark_page_reclaimable(struct sgx_epc_page *page)
{
	spin_lock(&sgx_reclaimer_lock);
	page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED;
	list_add_tail(&page->list, &sgx_active_page_list);
	spin_unlock(&sgx_reclaimer_lock);
}

/**
 * sgx_unmark_page_reclaimable() - Remove a page from the reclaim list
 * @page:	EPC page
 *
 * Clear the reclaimable flag and remove the page from the active page list.
 *
 * Return:
 *   0 on success,
 *   -EBUSY if the page is in the process of being reclaimed
 */
int sgx_unmark_page_reclaimable(struct sgx_epc_page *page)
{
	spin_lock(&sgx_reclaimer_lock);
	if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) {
		/* The page is being reclaimed. */
		if (list_empty(&page->list)) {
			spin_unlock(&sgx_reclaimer_lock);
			return -EBUSY;
		}

		list_del(&page->list);
		page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
	}
	spin_unlock(&sgx_reclaimer_lock);

	return 0;
}
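/*
 * Note on the -EBUSY case above: sgx_reclaim_pages() isolates pages with
 * list_del_init(), so a page that is still flagged
 * SGX_EPC_PAGE_RECLAIMER_TRACKED but whose page->list is empty has already
 * been taken off the active list by the reclaimer; -EBUSY signals that the
 * reclaimer currently owns the page.
 */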

/**
 * sgx_alloc_epc_page() - Allocate an EPC page
 * @owner:	the owner of the EPC page
 * @reclaim:	reclaim pages if necessary
 *
 * Iterate through the EPC sections and borrow a free EPC page for the caller.
 * When a page is no longer needed it must be released with sgx_free_epc_page().
 * If @reclaim is set to true, pages are reclaimed directly when the allocator
 * runs out of them. No mm locks may be held when @reclaim is set to true.
 *
 * Finally, wake up ksgxd when the number of free pages goes below the low
 * watermark before returning to the caller.
 *
 * Return:
 *   an EPC page,
 *   -errno on error
 */
struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
{
	struct sgx_epc_page *page;

	for ( ; ; ) {
		page = __sgx_alloc_epc_page();
		if (!IS_ERR(page)) {
			page->owner = owner;
			break;
		}

		if (list_empty(&sgx_active_page_list))
			return ERR_PTR(-ENOMEM);

		if (!reclaim) {
			page = ERR_PTR(-EBUSY);
			break;
		}

		if (signal_pending(current)) {
			page = ERR_PTR(-ERESTARTSYS);
			break;
		}

		sgx_reclaim_pages();
		cond_resched();
	}

	if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
		wake_up(&ksgxd_waitq);

	return page;
}
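/*
 * A minimal usage sketch of the allocation API for a hypothetical caller
 * that owns a struct sgx_encl_page (names and error handling abbreviated,
 * not code from this file):
 *
 *	epc_page = sgx_alloc_epc_page(encl_page, true);
 *	if (IS_ERR(epc_page))
 *		return PTR_ERR(epc_page);
 *
 *	... EADD the page contents and update encl_page->epc_page ...
 *
 *	sgx_mark_page_reclaimable(epc_page);
 *
 * and later, before handing the page back:
 *
 *	if (!sgx_unmark_page_reclaimable(epc_page))
 *		sgx_free_epc_page(epc_page);
 */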

/**
 * sgx_free_epc_page() - Free an EPC page
 * @page:	an EPC page
 *
 * Put the EPC page back on the list of free pages. It's the caller's
 * responsibility to make sure that the page is in the uninitialized state. In
 * other words, do EREMOVE, EWB or whatever operation is necessary before
 * calling this function.
 */
void sgx_free_epc_page(struct sgx_epc_page *page)
{
	struct sgx_epc_section *section = &sgx_epc_sections[page->section];
	struct sgx_numa_node *node = section->node;

	spin_lock(&node->lock);

	page->owner = NULL;
	if (page->poison)
		list_add(&page->list, &node->sgx_poison_page_list);
	else
		list_add_tail(&page->list, &node->free_page_list);
	sgx_nr_free_pages++;
	page->flags = SGX_EPC_PAGE_IS_FREE;

	spin_unlock(&node->lock);
}

static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
					 unsigned long index,
					 struct sgx_epc_section *section)
{
	unsigned long nr_pages = size >> PAGE_SHIFT;
	unsigned long i;

	section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB);
	if (!section->virt_addr)
		return false;

	section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page));
	if (!section->pages) {
		memunmap(section->virt_addr);
		return false;
	}

	section->phys_addr = phys_addr;
	xa_store_range(&sgx_epc_address_space, section->phys_addr,
		       phys_addr + size - 1, section, GFP_KERNEL);

	for (i = 0; i < nr_pages; i++) {
		section->pages[i].section = index;
		section->pages[i].flags = 0;
		section->pages[i].owner = NULL;
		section->pages[i].poison = 0;
		list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
	}

	return true;
}

bool arch_is_platform_page(u64 paddr)
{
	return !!xa_load(&sgx_epc_address_space, paddr);
}
EXPORT_SYMBOL_GPL(arch_is_platform_page);

static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr)
{
	struct sgx_epc_section *section;

	section = xa_load(&sgx_epc_address_space, paddr);
	if (!section)
		return NULL;

	return &section->pages[PFN_DOWN(paddr - section->phys_addr)];
}

/*
 * Called in process context to handle a hardware reported
 * error in an SGX EPC page.
 * If the MF_ACTION_REQUIRED bit is set in flags, then the
 * context is the task that consumed the poison data. Otherwise
 * this is called from a kernel thread unrelated to the page.
 */
int arch_memory_failure(unsigned long pfn, int flags)
{
	struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT);
	struct sgx_epc_section *section;
	struct sgx_numa_node *node;

	/*
	 * mm/memory-failure.c calls this routine for all errors
	 * where there isn't a "struct page" for the address. But that
	 * includes other address ranges besides SGX.
	 */
	if (!page)
		return -ENXIO;

	/*
	 * If the poison was consumed synchronously, send a SIGBUS to
	 * the task. The hardware has already exited the SGX enclave and
	 * will not allow re-entry to an enclave that has a memory
	 * error. The signal may help the task understand why the
	 * enclave is broken.
	 */
	if (flags & MF_ACTION_REQUIRED)
		force_sig(SIGBUS);

	section = &sgx_epc_sections[page->section];
	node = section->node;

	spin_lock(&node->lock);

	/* Already poisoned? Nothing more to do */
	if (page->poison)
		goto out;

	page->poison = 1;

	/*
	 * If the page is on a free list, move it to the per-node
	 * poison page list.
	 */
	if (page->flags & SGX_EPC_PAGE_IS_FREE) {
		list_move(&page->list, &node->sgx_poison_page_list);
		goto out;
	}

	/*
	 * TBD: Add additional plumbing to enable pre-emptive
	 * action for asynchronous poison notification. Until
	 * then just hope that the poison:
	 * a) is not accessed - sgx_free_epc_page() will deal with it
	 *    when the user gives it back
	 * b) results in a recoverable machine check rather than
	 *    a fatal one
	 */
out:
	spin_unlock(&node->lock);
	return 0;
}

/**
 * A section metric is concatenated in a way that @low bits 12-31 define the
 * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
 * metric.
 */
static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
{
	return (low & GENMASK_ULL(31, 12)) +
	       ((high & GENMASK_ULL(19, 0)) << 32);
}
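/*
 * Worked example with hypothetical CPUID values: low = 0x80001234 and
 * high = 0x00000001 give (0x80001234 & GENMASK_ULL(31, 12)) +
 * (0x1 << 32) = 0x80001000 + 0x100000000 = 0x180001000; the low 12 bits of
 * @low are discarded and @high supplies bits 32-51 of the metric.
 */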

static bool __init sgx_page_cache_init(void)
{
	u32 eax, ebx, ecx, edx, type;
	u64 pa, size;
	int nid;
	int i;

	sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
	if (!sgx_numa_nodes)
		return false;

	for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
		cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);

		type = eax & SGX_CPUID_EPC_MASK;
		if (type == SGX_CPUID_EPC_INVALID)
			break;

		if (type != SGX_CPUID_EPC_SECTION) {
			pr_err_once("Unknown EPC section type: %u\n", type);
			break;
		}

		pa   = sgx_calc_section_metric(eax, ebx);
		size = sgx_calc_section_metric(ecx, edx);

		pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1);

		if (!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) {
			pr_err("No free memory for an EPC section\n");
			break;
		}

		nid = numa_map_to_online_node(phys_to_target_node(pa));
		if (nid == NUMA_NO_NODE) {
			/* The physical address is already printed above. */
			pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
			nid = 0;
		}

		if (!node_isset(nid, sgx_numa_mask)) {
			spin_lock_init(&sgx_numa_nodes[nid].lock);
			INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
			INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list);
			node_set(nid, sgx_numa_mask);
		}

		sgx_epc_sections[i].node = &sgx_numa_nodes[nid];

		sgx_nr_epc_sections++;
	}

	if (!sgx_nr_epc_sections) {
		pr_err("There are zero EPC sections.\n");
		return false;
	}

	return true;
}
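/*
 * For orientation (per the SDM's EPC enumeration, not additional code in this
 * file): the loop above corresponds to CPUID leaf 0x12 with sub-leaves
 * starting at SGX_CPUID_EPC, where each valid sub-leaf describes one EPC
 * section as a (base, size) pair split across EAX/EBX and ECX/EDX, and an
 * invalid sub-leaf terminates the enumeration.
 */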

/*
 * Update the SGX_LEPUBKEYHASH MSRs to the values specified by the caller.
 * The bare-metal driver must update them to the hash of the enclave's signer
 * before EINIT. KVM needs to update them to the guest's virtual MSR values
 * before doing EINIT on behalf of the guest.
 */
void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
{
	int i;

	WARN_ON_ONCE(preemptible());

	for (i = 0; i < 4; i++)
		wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
}
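/*
 * For reference: the four MSR_IA32_SGXLEPUBKEYHASH0..3 registers together
 * hold the 256-bit SHA-256 hash of the launch enclave signer's public key,
 * which is why the loop above writes four 64-bit values. These MSRs are only
 * writable when SGX Launch Control is left unlocked by the firmware.
 */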

const struct file_operations sgx_provision_fops = {
	.owner			= THIS_MODULE,
};

static struct miscdevice sgx_dev_provision = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "sgx_provision",
	.nodename = "sgx_provision",
	.fops = &sgx_provision_fops,
};

/**
 * sgx_set_attribute() - Update allowed attributes given a file descriptor
 * @allowed_attributes:		Pointer to the allowed enclave attributes
 * @attribute_fd:		File descriptor for a specific attribute
 *
 * Append the enclave attribute indicated by the file descriptor to the allowed
 * attributes. Currently only SGX_ATTR_PROVISIONKEY, indicated by
 * /dev/sgx_provision, is supported.
 *
 * Return:
 * - 0:		SGX_ATTR_PROVISIONKEY was appended to @allowed_attributes
 * - -EINVAL:	Invalid or unsupported file descriptor
 */
int sgx_set_attribute(unsigned long *allowed_attributes,
		      unsigned int attribute_fd)
{
	struct file *file;

	file = fget(attribute_fd);
	if (!file)
		return -EINVAL;

	if (file->f_op != &sgx_provision_fops) {
		fput(file);
		return -EINVAL;
	}

	*allowed_attributes |= SGX_ATTR_PROVISIONKEY;

	fput(file);
	return 0;
}
EXPORT_SYMBOL_GPL(sgx_set_attribute);
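/*
 * A hedged usage sketch (hypothetical ioctl handler, not code from this
 * file): a driver honoring SGX_ATTR_PROVISIONKEY would have userspace open
 * /dev/sgx_provision and pass that fd in, e.g.:
 *
 *	if (copy_from_user(&params, arg, sizeof(params)))
 *		return -EFAULT;
 *
 *	return sgx_set_attribute(&encl->attributes_mask, params.fd);
 *
 * where "params" and "attributes_mask" are illustrative names.
 */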

static int __init sgx_init(void)
{
	int ret;
	int i;

	if (!cpu_feature_enabled(X86_FEATURE_SGX))
		return -ENODEV;

	if (!sgx_page_cache_init())
		return -ENOMEM;

	if (!sgx_page_reclaimer_init()) {
		ret = -ENOMEM;
		goto err_page_cache;
	}

	ret = misc_register(&sgx_dev_provision);
	if (ret)
		goto err_kthread;

	/*
	 * Always try to initialize the native *and* KVM drivers.
	 * The KVM driver is less picky than the native one and
	 * can function if the native one is not supported on the
	 * current system or fails to initialize.
	 *
	 * Error out only if both fail to initialize.
	 */
	ret = sgx_drv_init();

	if (sgx_vepc_init() && ret)
		goto err_provision;

	return 0;

err_provision:
	misc_deregister(&sgx_dev_provision);

err_kthread:
	kthread_stop(ksgxd_tsk);

err_page_cache:
	for (i = 0; i < sgx_nr_epc_sections; i++) {
		vfree(sgx_epc_sections[i].pages);
		memunmap(sgx_epc_sections[i].virt_addr);
	}

	return ret;
}

device_initcall(sgx_init);