1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * pSeries_lpar.c
4  * Copyright (C) 2001 Todd Inglett, IBM Corporation
5  *
6  * pSeries LPAR support.
7  */
8 
9 /* Enables debugging of low-level hash table routines - careful! */
10 #undef DEBUG
11 #define pr_fmt(fmt) "lpar: " fmt
12 
13 #include <linux/kernel.h>
14 #include <linux/dma-mapping.h>
15 #include <linux/console.h>
16 #include <linux/export.h>
17 #include <linux/jump_label.h>
18 #include <linux/delay.h>
19 #include <linux/stop_machine.h>
20 #include <linux/spinlock.h>
21 #include <linux/cpuhotplug.h>
22 #include <linux/workqueue.h>
23 #include <linux/proc_fs.h>
24 #include <asm/processor.h>
25 #include <asm/mmu.h>
26 #include <asm/page.h>
27 #include <asm/pgtable.h>
28 #include <asm/machdep.h>
29 #include <asm/mmu_context.h>
30 #include <asm/iommu.h>
31 #include <asm/tlb.h>
32 #include <asm/prom.h>
33 #include <asm/cputable.h>
34 #include <asm/udbg.h>
35 #include <asm/smp.h>
36 #include <asm/trace.h>
37 #include <asm/firmware.h>
38 #include <asm/plpar_wrappers.h>
39 #include <asm/kexec.h>
40 #include <asm/fadump.h>
41 #include <asm/asm-prototypes.h>
42 #include <asm/debugfs.h>
43 
44 #include "pseries.h"
45 
46 /* Flag bits for H_BULK_REMOVE */
47 #define HBR_REQUEST	0x4000000000000000UL
48 #define HBR_RESPONSE	0x8000000000000000UL
49 #define HBR_END		0xc000000000000000UL
50 #define HBR_AVPN	0x0200000000000000UL
51 #define HBR_ANDCOND	0x0100000000000000UL
52 
53 
54 /* in hvCall.S */
55 EXPORT_SYMBOL(plpar_hcall);
56 EXPORT_SYMBOL(plpar_hcall9);
57 EXPORT_SYMBOL(plpar_hcall_norets);
58 
59 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
60 static u8 dtl_mask = DTL_LOG_PREEMPT;
61 #else
62 static u8 dtl_mask;
63 #endif
64 
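/*
 * Allocate a dispatch trace log (DTL) buffer for every possible CPU that
 * does not already have one. @time_limit, when non-NULL, lets the caller
 * bound how long we run before yielding with cond_resched().
 */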
65 void alloc_dtl_buffers(unsigned long *time_limit)
66 {
67 	int cpu;
68 	struct paca_struct *pp;
69 	struct dtl_entry *dtl;
70 
71 	for_each_possible_cpu(cpu) {
72 		pp = paca_ptrs[cpu];
73 		if (pp->dispatch_log)
74 			continue;
75 		dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL);
76 		if (!dtl) {
77 			pr_warn("Failed to allocate dispatch trace log for cpu %d\n",
78 				cpu);
79 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
80 			pr_warn("Stolen time statistics will be unreliable\n");
81 #endif
82 			break;
83 		}
84 
85 		pp->dtl_ridx = 0;
86 		pp->dispatch_log = dtl;
87 		pp->dispatch_log_end = dtl + N_DISPATCH_LOG;
88 		pp->dtl_curr = dtl;
89 
90 		if (time_limit && time_after(jiffies, *time_limit)) {
91 			cond_resched();
92 			*time_limit = jiffies + HZ;
93 		}
94 	}
95 }
96 
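/*
 * Register @cpu's dispatch trace log buffer with the hypervisor, reset its
 * read index, and enable the DTL events selected by dtl_mask.
 */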
97 void register_dtl_buffer(int cpu)
98 {
99 	long ret;
100 	struct paca_struct *pp;
101 	struct dtl_entry *dtl;
102 	int hwcpu = get_hard_smp_processor_id(cpu);
103 
104 	pp = paca_ptrs[cpu];
105 	dtl = pp->dispatch_log;
106 	if (dtl && dtl_mask) {
107 		pp->dtl_ridx = 0;
108 		pp->dtl_curr = dtl;
109 		lppaca_of(cpu).dtl_idx = 0;
110 
111 		/* hypervisor reads buffer length from this field */
112 		dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES);
113 		ret = register_dtl(hwcpu, __pa(dtl));
114 		if (ret)
115 			pr_err("WARNING: DTL registration of cpu %d (hw %d) failed with %ld\n",
116 			       cpu, hwcpu, ret);
117 
118 		lppaca_of(cpu).dtl_enable_mask = dtl_mask;
119 	}
120 }
121 
122 #ifdef CONFIG_PPC_SPLPAR
123 struct dtl_worker {
124 	struct delayed_work work;
125 	int cpu;
126 };
127 
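/*
 * Per-cpu counters classifying each vcpu dispatch relative to the previous
 * dispatch (same core/chip, different chip, far chip) and relative to the
 * vcpu's home NUMA node.
 */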
128 struct vcpu_dispatch_data {
129 	int last_disp_cpu;
130 
131 	int total_disp;
132 
133 	int same_cpu_disp;
134 	int same_chip_disp;
135 	int diff_chip_disp;
136 	int far_chip_disp;
137 
138 	int numa_home_disp;
139 	int numa_remote_disp;
140 	int numa_far_disp;
141 };
142 
143 /*
144  * This represents the number of cpus in the hypervisor. Since there is no
145  * architected way to discover the number of processors in the host, we
146  * provision for dealing with NR_CPUS. This is currently 2048 by default, and
147  * is sufficient for our purposes. This will need to be tweaked if
148  * CONFIG_NR_CPUS is changed.
149  */
150 #define NR_CPUS_H	NR_CPUS
151 
152 DEFINE_RWLOCK(dtl_access_lock);
153 static DEFINE_PER_CPU(struct vcpu_dispatch_data, vcpu_disp_data);
154 static DEFINE_PER_CPU(u64, dtl_entry_ridx);
155 static DEFINE_PER_CPU(struct dtl_worker, dtl_workers);
156 static enum cpuhp_state dtl_worker_state;
157 static DEFINE_MUTEX(dtl_enable_mutex);
158 static int vcpudispatch_stats_on __read_mostly;
159 static int vcpudispatch_stats_freq = 50;
160 static __be32 *vcpu_associativity, *pcpu_associativity;
161 
162 
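/*
 * Free the per-cpu DTL buffers allocated by alloc_dtl_buffers(). When
 * CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is enabled the buffers are left in
 * place, since they are also used for stolen time accounting.
 */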
163 static void free_dtl_buffers(unsigned long *time_limit)
164 {
165 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
166 	int cpu;
167 	struct paca_struct *pp;
168 
169 	for_each_possible_cpu(cpu) {
170 		pp = paca_ptrs[cpu];
171 		if (!pp->dispatch_log)
172 			continue;
173 		kmem_cache_free(dtl_cache, pp->dispatch_log);
174 		pp->dtl_ridx = 0;
175 		pp->dispatch_log = NULL;
176 		pp->dispatch_log_end = NULL;
177 		pp->dtl_curr = NULL;
178 
179 		if (time_limit && time_after(jiffies, *time_limit)) {
180 			cond_resched();
181 			*time_limit = jiffies + HZ;
182 		}
183 	}
184 #endif
185 }
186 
187 static int init_cpu_associativity(void)
188 {
189 	vcpu_associativity = kcalloc(num_possible_cpus() / threads_per_core,
190 			VPHN_ASSOC_BUFSIZE * sizeof(__be32), GFP_KERNEL);
191 	pcpu_associativity = kcalloc(NR_CPUS_H / threads_per_core,
192 			VPHN_ASSOC_BUFSIZE * sizeof(__be32), GFP_KERNEL);
193 
194 	if (!vcpu_associativity || !pcpu_associativity) {
195 		pr_err("error allocating memory for associativity information\n");
196 		return -ENOMEM;
197 	}
198 
199 	return 0;
200 }
201 
202 static void destroy_cpu_associativity(void)
203 {
204 	kfree(vcpu_associativity);
205 	kfree(pcpu_associativity);
206 	vcpu_associativity = pcpu_associativity = NULL;
207 }
208 
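/*
 * Return the cached associativity array for the core containing @cpu,
 * fetching it through hcall_vphn() on first use. Returns NULL if the
 * hcall fails.
 */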
209 static __be32 *__get_cpu_associativity(int cpu, __be32 *cpu_assoc, int flag)
210 {
211 	__be32 *assoc;
212 	int rc = 0;
213 
214 	assoc = &cpu_assoc[(int)(cpu / threads_per_core) * VPHN_ASSOC_BUFSIZE];
215 	if (!assoc[0]) {
216 		rc = hcall_vphn(cpu, flag, &assoc[0]);
217 		if (rc)
218 			return NULL;
219 	}
220 
221 	return assoc;
222 }
223 
224 static __be32 *get_pcpu_associativity(int cpu)
225 {
226 	return __get_cpu_associativity(cpu, pcpu_associativity, VPHN_FLAG_PCPU);
227 }
228 
229 static __be32 *get_vcpu_associativity(int cpu)
230 {
231 	return __get_cpu_associativity(cpu, vcpu_associativity, VPHN_FLAG_VCPU);
232 }
233 
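/*
 * Relative dispatch distance between the previous and current physical
 * CPUs, derived from their associativity. The caller interprets the result
 * as 0 = same chip, 1 = different chip, 2 = far chip.
 */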
234 static int cpu_relative_dispatch_distance(int last_disp_cpu, int cur_disp_cpu)
235 {
236 	__be32 *last_disp_cpu_assoc, *cur_disp_cpu_assoc;
237 
238 	if (last_disp_cpu >= NR_CPUS_H || cur_disp_cpu >= NR_CPUS_H)
239 		return -EINVAL;
240 
241 	last_disp_cpu_assoc = get_pcpu_associativity(last_disp_cpu);
242 	cur_disp_cpu_assoc = get_pcpu_associativity(cur_disp_cpu);
243 
244 	if (!last_disp_cpu_assoc || !cur_disp_cpu_assoc)
245 		return -EIO;
246 
247 	return cpu_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc);
248 }
249 
250 static int cpu_home_node_dispatch_distance(int disp_cpu)
251 {
252 	__be32 *disp_cpu_assoc, *vcpu_assoc;
253 	int vcpu_id = smp_processor_id();
254 
255 	if (disp_cpu >= NR_CPUS_H) {
256 		pr_debug_ratelimited("vcpu dispatch cpu %d > %d\n",
257 						disp_cpu, NR_CPUS_H);
258 		return -EINVAL;
259 	}
260 
261 	disp_cpu_assoc = get_pcpu_associativity(disp_cpu);
262 	vcpu_assoc = get_vcpu_associativity(vcpu_id);
263 
264 	if (!disp_cpu_assoc || !vcpu_assoc)
265 		return -EIO;
266 
267 	return cpu_distance(disp_cpu_assoc, vcpu_assoc);
268 }
269 
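/*
 * Account one dispatch of the current vcpu on physical cpu @disp_cpu,
 * classifying it against the previous dispatch location and against the
 * vcpu's home node associativity.
 */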
270 static void update_vcpu_disp_stat(int disp_cpu)
271 {
272 	struct vcpu_dispatch_data *disp;
273 	int distance;
274 
275 	disp = this_cpu_ptr(&vcpu_disp_data);
276 	if (disp->last_disp_cpu == -1) {
277 		disp->last_disp_cpu = disp_cpu;
278 		return;
279 	}
280 
281 	disp->total_disp++;
282 
283 	if (disp->last_disp_cpu == disp_cpu ||
284 		(cpu_first_thread_sibling(disp->last_disp_cpu) ==
285 					cpu_first_thread_sibling(disp_cpu)))
286 		disp->same_cpu_disp++;
287 	else {
288 		distance = cpu_relative_dispatch_distance(disp->last_disp_cpu,
289 								disp_cpu);
290 		if (distance < 0)
291 			pr_debug_ratelimited("vcpudispatch_stats: cpu %d: error determining associativity\n",
292 					smp_processor_id());
293 		else {
294 			switch (distance) {
295 			case 0:
296 				disp->same_chip_disp++;
297 				break;
298 			case 1:
299 				disp->diff_chip_disp++;
300 				break;
301 			case 2:
302 				disp->far_chip_disp++;
303 				break;
304 			default:
305 				pr_debug_ratelimited("vcpudispatch_stats: cpu %d (%d -> %d): unexpected relative dispatch distance %d\n",
306 						 smp_processor_id(),
307 						 disp->last_disp_cpu,
308 						 disp_cpu,
309 						 distance);
310 			}
311 		}
312 	}
313 
314 	distance = cpu_home_node_dispatch_distance(disp_cpu);
315 	if (distance < 0)
316 		pr_debug_ratelimited("vcpudispatch_stats: cpu %d: error determining associativity\n",
317 				smp_processor_id());
318 	else {
319 		switch (distance) {
320 		case 0:
321 			disp->numa_home_disp++;
322 			break;
323 		case 1:
324 			disp->numa_remote_disp++;
325 			break;
326 		case 2:
327 			disp->numa_far_disp++;
328 			break;
329 		default:
330 			pr_debug_ratelimited("vcpudispatch_stats: cpu %d on %d: unexpected numa dispatch distance %d\n",
331 						 smp_processor_id(),
332 						 disp_cpu,
333 						 distance);
334 		}
335 	}
336 
337 	disp->last_disp_cpu = disp_cpu;
338 }
339 
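/*
 * Delayed work handler: consume DTL entries added since our last read
 * index, update the per-cpu dispatch statistics, skip ahead if the buffer
 * has overflowed, and re-arm ourselves at vcpudispatch_stats_freq.
 */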
340 static void process_dtl_buffer(struct work_struct *work)
341 {
342 	struct dtl_entry dtle;
343 	u64 i = __this_cpu_read(dtl_entry_ridx);
344 	struct dtl_entry *dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
345 	struct dtl_entry *dtl_end = local_paca->dispatch_log_end;
346 	struct lppaca *vpa = local_paca->lppaca_ptr;
347 	struct dtl_worker *d = container_of(work, struct dtl_worker, work.work);
348 
349 	if (!local_paca->dispatch_log)
350 		return;
351 
352 	/* If we have been migrated away, cancel ourselves. */
353 	if (d->cpu != smp_processor_id()) {
354 		pr_debug("vcpudispatch_stats: cpu %d worker migrated -- canceling worker\n",
355 						smp_processor_id());
356 		return;
357 	}
358 
359 	if (i == be64_to_cpu(vpa->dtl_idx))
360 		goto out;
361 
362 	while (i < be64_to_cpu(vpa->dtl_idx)) {
363 		dtle = *dtl;
364 		barrier();
365 		if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) {
366 			/* buffer has overflowed */
367 			pr_debug_ratelimited("vcpudispatch_stats: cpu %d lost %lld DTL samples\n",
368 				d->cpu,
369 				be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG - i);
370 			i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG;
371 			dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
372 			continue;
373 		}
374 		update_vcpu_disp_stat(be16_to_cpu(dtle.processor_id));
375 		++i;
376 		++dtl;
377 		if (dtl == dtl_end)
378 			dtl = local_paca->dispatch_log;
379 	}
380 
381 	__this_cpu_write(dtl_entry_ridx, i);
382 
383 out:
384 	schedule_delayed_work_on(d->cpu, to_delayed_work(work),
385 					HZ / vcpudispatch_stats_freq);
386 }
387 
388 static int dtl_worker_online(unsigned int cpu)
389 {
390 	struct dtl_worker *d = &per_cpu(dtl_workers, cpu);
391 
392 	memset(d, 0, sizeof(*d));
393 	INIT_DELAYED_WORK(&d->work, process_dtl_buffer);
394 	d->cpu = cpu;
395 
396 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
397 	per_cpu(dtl_entry_ridx, cpu) = 0;
398 	register_dtl_buffer(cpu);
399 #else
400 	per_cpu(dtl_entry_ridx, cpu) = be64_to_cpu(lppaca_of(cpu).dtl_idx);
401 #endif
402 
403 	schedule_delayed_work_on(cpu, &d->work, HZ / vcpudispatch_stats_freq);
404 	return 0;
405 }
406 
407 static int dtl_worker_offline(unsigned int cpu)
408 {
409 	struct dtl_worker *d = &per_cpu(dtl_workers, cpu);
410 
411 	cancel_delayed_work_sync(&d->work);
412 
413 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
414 	unregister_dtl(get_hard_smp_processor_id(cpu));
415 #endif
416 
417 	return 0;
418 }
419 
420 static void set_global_dtl_mask(u8 mask)
421 {
422 	int cpu;
423 
424 	dtl_mask = mask;
425 	for_each_present_cpu(cpu)
426 		lppaca_of(cpu).dtl_enable_mask = dtl_mask;
427 }
428 
429 static void reset_global_dtl_mask(void)
430 {
431 	int cpu;
432 
433 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
434 	dtl_mask = DTL_LOG_PREEMPT;
435 #else
436 	dtl_mask = 0;
437 #endif
438 	for_each_present_cpu(cpu)
439 		lppaca_of(cpu).dtl_enable_mask = dtl_mask;
440 }
441 
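/*
 * Switch DTL logging to all events, set up the per-cpu DTL buffers and
 * start a per-cpu worker through a dynamic cpuhp state. Holding
 * dtl_access_lock for write excludes any other consumer of the DTL
 * buffers while the workers are running.
 */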
442 static int dtl_worker_enable(unsigned long *time_limit)
443 {
444 	int rc = 0, state;
445 
446 	if (!write_trylock(&dtl_access_lock)) {
447 		rc = -EBUSY;
448 		goto out;
449 	}
450 
451 	set_global_dtl_mask(DTL_LOG_ALL);
452 
453 	/* Set up DTL buffers and register them */
454 	alloc_dtl_buffers(time_limit);
455 
456 	state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/dtl:online",
457 					dtl_worker_online, dtl_worker_offline);
458 	if (state < 0) {
459 		pr_err("vcpudispatch_stats: unable to setup workqueue for DTL processing\n");
460 		free_dtl_buffers(time_limit);
461 		reset_global_dtl_mask();
462 		write_unlock(&dtl_access_lock);
463 		rc = -EINVAL;
464 		goto out;
465 	}
466 	dtl_worker_state = state;
467 
468 out:
469 	return rc;
470 }
471 
472 static void dtl_worker_disable(unsigned long *time_limit)
473 {
474 	cpuhp_remove_state(dtl_worker_state);
475 	free_dtl_buffers(time_limit);
476 	reset_global_dtl_mask();
477 	write_unlock(&dtl_access_lock);
478 }
479 
480 static ssize_t vcpudispatch_stats_write(struct file *file, const char __user *p,
481 		size_t count, loff_t *ppos)
482 {
483 	unsigned long time_limit = jiffies + HZ;
484 	struct vcpu_dispatch_data *disp;
485 	int rc, cmd, cpu;
486 	char buf[16];
487 
488 	if (count > 15)
489 		return -EINVAL;
490 
491 	if (copy_from_user(buf, p, count))
492 		return -EFAULT;
493 
494 	buf[count] = 0;
495 	rc = kstrtoint(buf, 0, &cmd);
496 	if (rc || cmd < 0 || cmd > 1) {
497 		pr_err("vcpudispatch_stats: please use 0 to disable or 1 to enable dispatch statistics\n");
498 		return rc ? rc : -EINVAL;
499 	}
500 
501 	mutex_lock(&dtl_enable_mutex);
502 
503 	if ((cmd == 0 && !vcpudispatch_stats_on) ||
504 			(cmd == 1 && vcpudispatch_stats_on))
505 		goto out;
506 
507 	if (cmd) {
508 		rc = init_cpu_associativity();
509 		if (rc)
510 			goto out;
511 
512 		for_each_possible_cpu(cpu) {
513 			disp = per_cpu_ptr(&vcpu_disp_data, cpu);
514 			memset(disp, 0, sizeof(*disp));
515 			disp->last_disp_cpu = -1;
516 		}
517 
518 		rc = dtl_worker_enable(&time_limit);
519 		if (rc) {
520 			destroy_cpu_associativity();
521 			goto out;
522 		}
523 	} else {
524 		dtl_worker_disable(&time_limit);
525 		destroy_cpu_associativity();
526 	}
527 
528 	vcpudispatch_stats_on = cmd;
529 
530 out:
531 	mutex_unlock(&dtl_enable_mutex);
532 	if (rc)
533 		return rc;
534 	return count;
535 }
536 
537 static int vcpudispatch_stats_display(struct seq_file *p, void *v)
538 {
539 	int cpu;
540 	struct vcpu_dispatch_data *disp;
541 
542 	if (!vcpudispatch_stats_on) {
543 		seq_puts(p, "off\n");
544 		return 0;
545 	}
546 
547 	for_each_online_cpu(cpu) {
548 		disp = per_cpu_ptr(&vcpu_disp_data, cpu);
549 		seq_printf(p, "cpu%d", cpu);
550 		seq_put_decimal_ull(p, " ", disp->total_disp);
551 		seq_put_decimal_ull(p, " ", disp->same_cpu_disp);
552 		seq_put_decimal_ull(p, " ", disp->same_chip_disp);
553 		seq_put_decimal_ull(p, " ", disp->diff_chip_disp);
554 		seq_put_decimal_ull(p, " ", disp->far_chip_disp);
555 		seq_put_decimal_ull(p, " ", disp->numa_home_disp);
556 		seq_put_decimal_ull(p, " ", disp->numa_remote_disp);
557 		seq_put_decimal_ull(p, " ", disp->numa_far_disp);
558 		seq_puts(p, "\n");
559 	}
560 
561 	return 0;
562 }
563 
564 static int vcpudispatch_stats_open(struct inode *inode, struct file *file)
565 {
566 	return single_open(file, vcpudispatch_stats_display, NULL);
567 }
568 
569 static const struct file_operations vcpudispatch_stats_proc_ops = {
570 	.open		= vcpudispatch_stats_open,
571 	.read		= seq_read,
572 	.write		= vcpudispatch_stats_write,
573 	.llseek		= seq_lseek,
574 	.release	= single_release,
575 };
576 
577 static ssize_t vcpudispatch_stats_freq_write(struct file *file,
578 		const char __user *p, size_t count, loff_t *ppos)
579 {
580 	int rc, freq;
581 	char buf[16];
582 
583 	if (count > 15)
584 		return -EINVAL;
585 
586 	if (copy_from_user(buf, p, count))
587 		return -EFAULT;
588 
589 	buf[count] = 0;
590 	rc = kstrtoint(buf, 0, &freq);
591 	if (rc || freq < 1 || freq > HZ) {
592 		pr_err("vcpudispatch_stats_freq: please specify a frequency between 1 and %d\n",
593 				HZ);
594 		return rc ? rc : -EINVAL;
595 	}
596 
597 	vcpudispatch_stats_freq = freq;
598 
599 	return count;
600 }
601 
602 static int vcpudispatch_stats_freq_display(struct seq_file *p, void *v)
603 {
604 	seq_printf(p, "%d\n", vcpudispatch_stats_freq);
605 	return 0;
606 }
607 
608 static int vcpudispatch_stats_freq_open(struct inode *inode, struct file *file)
609 {
610 	return single_open(file, vcpudispatch_stats_freq_display, NULL);
611 }
612 
613 static const struct file_operations vcpudispatch_stats_freq_proc_ops = {
614 	.open		= vcpudispatch_stats_freq_open,
615 	.read		= seq_read,
616 	.write		= vcpudispatch_stats_freq_write,
617 	.llseek		= seq_lseek,
618 	.release	= single_release,
619 };
620 
621 static int __init vcpudispatch_stats_procfs_init(void)
622 {
623 	if (!lppaca_shared_proc(get_lppaca()))
624 		return 0;
625 
626 	if (!proc_create("powerpc/vcpudispatch_stats", 0600, NULL,
627 					&vcpudispatch_stats_proc_ops))
628 		pr_err("vcpudispatch_stats: error creating procfs file\n");
629 	else if (!proc_create("powerpc/vcpudispatch_stats_freq", 0600, NULL,
630 					&vcpudispatch_stats_freq_proc_ops))
631 		pr_err("vcpudispatch_stats_freq: error creating procfs file\n");
632 
633 	return 0;
634 }
635 
636 machine_device_initcall(pseries, vcpudispatch_stats_procfs_init);
637 #endif /* CONFIG_PPC_SPLPAR */
638 
639 void vpa_init(int cpu)
640 {
641 	int hwcpu = get_hard_smp_processor_id(cpu);
642 	unsigned long addr;
643 	long ret;
644 
645 	/*
646 	 * The spec says it "may be problematic" if CPU x registers the VPA of
647 	 * CPU y. We should never do that, but wail if we ever do.
648 	 */
649 	WARN_ON(cpu != smp_processor_id());
650 
651 	if (cpu_has_feature(CPU_FTR_ALTIVEC))
652 		lppaca_of(cpu).vmxregs_in_use = 1;
653 
654 	if (cpu_has_feature(CPU_FTR_ARCH_207S))
655 		lppaca_of(cpu).ebb_regs_in_use = 1;
656 
657 	addr = __pa(&lppaca_of(cpu));
658 	ret = register_vpa(hwcpu, addr);
659 
660 	if (ret) {
661 		pr_err("WARNING: VPA registration for cpu %d (hw %d) of area "
662 		       "%lx failed with %ld\n", cpu, hwcpu, addr, ret);
663 		return;
664 	}
665 
666 #ifdef CONFIG_PPC_BOOK3S_64
667 	/*
668 	 * PAPR calls this feature SLB-Buffer, but firmware never
669 	 * reports it. All SPLPAR systems support the SLB shadow buffer.
670 	 */
671 	if (!radix_enabled() && firmware_has_feature(FW_FEATURE_SPLPAR)) {
672 		addr = __pa(paca_ptrs[cpu]->slb_shadow_ptr);
673 		ret = register_slb_shadow(hwcpu, addr);
674 		if (ret)
675 			pr_err("WARNING: SLB shadow buffer registration for "
676 			       "cpu %d (hw %d) of area %lx failed with %ld\n",
677 			       cpu, hwcpu, addr, ret);
678 	}
679 #endif /* CONFIG_PPC_BOOK3S_64 */
680 
681 	/*
682 	 * Register dispatch trace log, if one has been allocated.
683 	 */
684 	register_dtl_buffer(cpu);
685 }
686 
687 #ifdef CONFIG_PPC_BOOK3S_64
688 
689 static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
690 				     unsigned long vpn, unsigned long pa,
691 				     unsigned long rflags, unsigned long vflags,
692 				     int psize, int apsize, int ssize)
693 {
694 	unsigned long lpar_rc;
695 	unsigned long flags;
696 	unsigned long slot;
697 	unsigned long hpte_v, hpte_r;
698 
699 	if (!(vflags & HPTE_V_BOLTED))
700 		pr_devel("hpte_insert(group=%lx, vpn=%016lx, "
701 			 "pa=%016lx, rflags=%lx, vflags=%lx, psize=%d)\n",
702 			 hpte_group, vpn,  pa, rflags, vflags, psize);
703 
704 	hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
705 	hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
706 
707 	if (!(vflags & HPTE_V_BOLTED))
708 		pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
709 
710 	/* Now fill in the actual HPTE */
711 	/* Set CEC cookie to 0         */
712 	/* Zero page = 0               */
713 	/* I-cache Invalidate = 0      */
714 	/* I-cache synchronize = 0     */
715 	/* Exact = 0                   */
716 	flags = 0;
717 
718 	if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N))
719 		flags |= H_COALESCE_CAND;
720 
721 	lpar_rc = plpar_pte_enter(flags, hpte_group, hpte_v, hpte_r, &slot);
722 	if (unlikely(lpar_rc == H_PTEG_FULL)) {
723 		pr_devel("Hash table group is full\n");
724 		return -1;
725 	}
726 
727 	/*
728 	 * Since we try to ioremap PHBs we don't own, the pte insert
729 	 * will fail. However, we must catch the failure in hash_page()
730 	 * or we will loop forever, so return -2 in this case.
731 	 */
732 	if (unlikely(lpar_rc != H_SUCCESS)) {
733 		pr_err("Failed hash pte insert with error %ld\n", lpar_rc);
734 		return -2;
735 	}
736 	if (!(vflags & HPTE_V_BOLTED))
737 		pr_devel(" -> slot: %lu\n", slot & 7);
738 
739 	/* Because of iSeries, we have to pass down the secondary
740 	 * bucket bit here as well
741 	 */
742 	return (slot & 7) | (!!(vflags & HPTE_V_SECONDARY) << 3);
743 }
744 
745 static DEFINE_SPINLOCK(pSeries_lpar_tlbie_lock);
746 
747 static long pSeries_lpar_hpte_remove(unsigned long hpte_group)
748 {
749 	unsigned long slot_offset;
750 	unsigned long lpar_rc;
751 	int i;
752 	unsigned long dummy1, dummy2;
753 
754 	/* pick a random slot to start at */
755 	slot_offset = mftb() & 0x7;
756 
757 	for (i = 0; i < HPTES_PER_GROUP; i++) {
758 
759 		/* don't remove a bolted entry */
760 		lpar_rc = plpar_pte_remove(H_ANDCOND, hpte_group + slot_offset,
761 					   (0x1UL << 4), &dummy1, &dummy2);
762 		if (lpar_rc == H_SUCCESS)
763 			return i;
764 
765 		/*
766 		 * The test for adjunct partition is performed before the
767 		 * ANDCOND test.  H_RESOURCE may be returned, so we need to
768 		 * check for that as well.
769 		 */
770 		BUG_ON(lpar_rc != H_NOT_FOUND && lpar_rc != H_RESOURCE);
771 
772 		slot_offset++;
773 		slot_offset &= 0x7;
774 	}
775 
776 	return -1;
777 }
778 
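/*
 * Walk the whole hash page table, reading HPTEs four at a time, and
 * invalidate every valid entry that is not part of the VRMA.
 */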
779 static void manual_hpte_clear_all(void)
780 {
781 	unsigned long size_bytes = 1UL << ppc64_pft_size;
782 	unsigned long hpte_count = size_bytes >> 4;
783 	struct {
784 		unsigned long pteh;
785 		unsigned long ptel;
786 	} ptes[4];
787 	long lpar_rc;
788 	unsigned long i, j;
789 
790 	/*
791 	 * Read in batches of 4; invalidate only valid entries that are not
792 	 * in the VRMA. hpte_count will be a multiple of 4.
793 	 */
794 	for (i = 0; i < hpte_count; i += 4) {
795 		lpar_rc = plpar_pte_read_4_raw(0, i, (void *)ptes);
796 		if (lpar_rc != H_SUCCESS) {
797 			pr_info("Failed to read hash page table at %ld err %ld\n",
798 				i, lpar_rc);
799 			continue;
800 		}
801 		for (j = 0; j < 4; j++) {
802 			if ((ptes[j].pteh & HPTE_V_VRMA_MASK) ==
803 				HPTE_V_VRMA_MASK)
804 				continue;
805 			if (ptes[j].pteh & HPTE_V_VALID)
806 				plpar_pte_remove_raw(0, i + j, 0,
807 					&(ptes[j].pteh), &(ptes[j].ptel));
808 		}
809 	}
810 }
811 
812 static int hcall_hpte_clear_all(void)
813 {
814 	int rc;
815 
816 	do {
817 		rc = plpar_hcall_norets(H_CLEAR_HPT);
818 	} while (rc == H_CONTINUE);
819 
820 	return rc;
821 }
822 
823 static void pseries_hpte_clear_all(void)
824 {
825 	int rc;
826 
827 	rc = hcall_hpte_clear_all();
828 	if (rc != H_SUCCESS)
829 		manual_hpte_clear_all();
830 
831 #ifdef __LITTLE_ENDIAN__
832 	/*
833 	 * Reset exceptions to big endian.
834 	 *
835 	 * FIXME this is a hack for kexec, we need to reset the exception
836 	 * endian before starting the new kernel and this is a convenient place
837 	 * to do it.
838 	 *
839 	 * This is also called on boot when a fadump happens. In that case we
840 	 * must not change the exception endian mode.
841 	 */
842 	if (firmware_has_feature(FW_FEATURE_SET_MODE) && !is_fadump_active())
843 		pseries_big_endian_exceptions();
844 #endif
845 }
846 
847 /*
848  * NOTE: for updatepp ops we are fortunate that the linux "newpp" bits and
849  * the low 3 bits of flags happen to line up.  So no transform is needed.
850  * We can probably optimize here and assume the high bits of newpp are
851  * already zero.  For now I am paranoid.
852  */
853 static long pSeries_lpar_hpte_updatepp(unsigned long slot,
854 				       unsigned long newpp,
855 				       unsigned long vpn,
856 				       int psize, int apsize,
857 				       int ssize, unsigned long inv_flags)
858 {
859 	unsigned long lpar_rc;
860 	unsigned long flags;
861 	unsigned long want_v;
862 
863 	want_v = hpte_encode_avpn(vpn, psize, ssize);
864 
865 	flags = (newpp & 7) | H_AVPN;
866 	if (mmu_has_feature(MMU_FTR_KERNEL_RO))
867 		/* Move pp0 into bit 8 (IBM 55) */
868 		flags |= (newpp & HPTE_R_PP0) >> 55;
869 
870 	pr_devel("    update: avpnv=%016lx, hash=%016lx, f=%lx, psize: %d ...",
871 		 want_v, slot, flags, psize);
872 
873 	lpar_rc = plpar_pte_protect(flags, slot, want_v);
874 
875 	if (lpar_rc == H_NOT_FOUND) {
876 		pr_devel("not found !\n");
877 		return -1;
878 	}
879 
880 	pr_devel("ok\n");
881 
882 	BUG_ON(lpar_rc != H_SUCCESS);
883 
884 	return 0;
885 }
886 
887 static long __pSeries_lpar_hpte_find(unsigned long want_v, unsigned long hpte_group)
888 {
889 	long lpar_rc;
890 	unsigned long i, j;
891 	struct {
892 		unsigned long pteh;
893 		unsigned long ptel;
894 	} ptes[4];
895 
896 	for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) {
897 
898 		lpar_rc = plpar_pte_read_4(0, hpte_group, (void *)ptes);
899 		if (lpar_rc != H_SUCCESS) {
900 			pr_info("Failed to read hash page table at %ld err %ld\n",
901 				hpte_group, lpar_rc);
902 			continue;
903 		}
904 
905 		for (j = 0; j < 4; j++) {
906 			if (HPTE_V_COMPARE(ptes[j].pteh, want_v) &&
907 			    (ptes[j].pteh & HPTE_V_VALID))
908 				return i + j;
909 		}
910 	}
911 
912 	return -1;
913 }
914 
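/*
 * Find the slot of the HPTE that maps @vpn. Only the primary hash group is
 * searched, which is sufficient for bolted entries.
 */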
915 static long pSeries_lpar_hpte_find(unsigned long vpn, int psize, int ssize)
916 {
917 	long slot;
918 	unsigned long hash;
919 	unsigned long want_v;
920 	unsigned long hpte_group;
921 
922 	hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
923 	want_v = hpte_encode_avpn(vpn, psize, ssize);
924 
925 	/* Bolted entries are always in the primary group */
926 	hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
927 	slot = __pSeries_lpar_hpte_find(want_v, hpte_group);
928 	if (slot < 0)
929 		return -1;
930 	return hpte_group + slot;
931 }
932 
933 static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
934 					     unsigned long ea,
935 					     int psize, int ssize)
936 {
937 	unsigned long vpn;
938 	unsigned long lpar_rc, slot, vsid, flags;
939 
940 	vsid = get_kernel_vsid(ea, ssize);
941 	vpn = hpt_vpn(ea, vsid, ssize);
942 
943 	slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
944 	BUG_ON(slot == -1);
945 
946 	flags = newpp & 7;
947 	if (mmu_has_feature(MMU_FTR_KERNEL_RO))
948 		/* Move pp0 into bit 8 (IBM 55) */
949 		flags |= (newpp & HPTE_R_PP0) >> 55;
950 
951 	lpar_rc = plpar_pte_protect(flags, slot, 0);
952 
953 	BUG_ON(lpar_rc != H_SUCCESS);
954 }
955 
956 static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
957 					 int psize, int apsize,
958 					 int ssize, int local)
959 {
960 	unsigned long want_v;
961 	unsigned long lpar_rc;
962 	unsigned long dummy1, dummy2;
963 
964 	pr_devel("    inval : slot=%lx, vpn=%016lx, psize: %d, local: %d\n",
965 		 slot, vpn, psize, local);
966 
967 	want_v = hpte_encode_avpn(vpn, psize, ssize);
968 	lpar_rc = plpar_pte_remove(H_AVPN, slot, want_v, &dummy1, &dummy2);
969 	if (lpar_rc == H_NOT_FOUND)
970 		return;
971 
972 	BUG_ON(lpar_rc != H_SUCCESS);
973 }
974 
975 
976 /*
977  * As defined in PAPR section 14.5.4.1.8, the control mask does not
978  * include the returned reference and change bits from the
979  * processed PTE.
980  */
981 #define HBLKR_AVPN		0x0100000000000000UL
982 #define HBLKR_CTRL_MASK		0xf800000000000000UL
983 #define HBLKR_CTRL_SUCCESS	0x8000000000000000UL
984 #define HBLKR_CTRL_ERRNOTFOUND	0x8800000000000000UL
985 #define HBLKR_CTRL_ERRBUSY	0xa000000000000000UL
986 
987 /**
988  * H_BLOCK_REMOVE caller.
989  * @idx should point to the latest @param entry set with a PTEX.
990  * If a PTE cannot be processed because another CPU has already locked its
991  * group, those entries are put back in @param starting at index 1.
992  * If entries have to be retried and @retry_busy is set to true, these
993  * entries are retried until they succeed. If @retry_busy is set to false,
994  * the return value is the number of entries yet to be processed.
995  */
996 static unsigned long call_block_remove(unsigned long idx, unsigned long *param,
997 				       bool retry_busy)
998 {
999 	unsigned long i, rc, new_idx;
1000 	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
1001 
1002 	if (idx < 2) {
1003 		pr_warn("Unexpected empty call to H_BLOCK_REMOVE\n");
1004 		return 0;
1005 	}
1006 again:
1007 	new_idx = 0;
1008 	if (idx > PLPAR_HCALL9_BUFSIZE) {
1009 		pr_err("Too many PTEs (%lu) for H_BLOCK_REMOVE\n", idx);
1010 		idx = PLPAR_HCALL9_BUFSIZE;
1011 	} else if (idx < PLPAR_HCALL9_BUFSIZE)
1012 		param[idx] = HBR_END;
1013 
1014 	rc = plpar_hcall9(H_BLOCK_REMOVE, retbuf,
1015 			  param[0], /* AVA */
1016 			  param[1],  param[2],  param[3],  param[4], /* TS0-7 */
1017 			  param[5],  param[6],  param[7],  param[8]);
1018 	if (rc == H_SUCCESS)
1019 		return 0;
1020 
1021 	BUG_ON(rc != H_PARTIAL);
1022 
1023 	/* Check that the unprocessed entries were 'not found' or 'busy' */
1024 	for (i = 0; i < idx-1; i++) {
1025 		unsigned long ctrl = retbuf[i] & HBLKR_CTRL_MASK;
1026 
1027 		if (ctrl == HBLKR_CTRL_ERRBUSY) {
1028 			param[++new_idx] = param[i+1];
1029 			continue;
1030 		}
1031 
1032 		BUG_ON(ctrl != HBLKR_CTRL_SUCCESS
1033 		       && ctrl != HBLKR_CTRL_ERRNOTFOUND);
1034 	}
1035 
1036 	/*
1037 	 * If there were entries found busy, retry these entries if requested,
1038 	 * or if all the entries have to be retried.
1039 	 */
1040 	if (new_idx && (retry_busy || new_idx == (PLPAR_HCALL9_BUFSIZE-1))) {
1041 		idx = new_idx + 1;
1042 		goto again;
1043 	}
1044 
1045 	return new_idx;
1046 }
1047 
1048 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1049 /*
1050  * Limit iterations holding pSeries_lpar_tlbie_lock to 3. We also need
1051  * to make sure that we avoid bouncing the hypervisor tlbie lock.
1052  */
1053 #define PPC64_HUGE_HPTE_BATCH 12
1054 
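/*
 * Invalidate the given huge page HPTEs with H_BLOCK_REMOVE, batching slots
 * that fall within the same naturally aligned 8-page virtual address block.
 */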
1055 static void hugepage_block_invalidate(unsigned long *slot, unsigned long *vpn,
1056 				      int count, int psize, int ssize)
1057 {
1058 	unsigned long param[PLPAR_HCALL9_BUFSIZE];
1059 	unsigned long shift, current_vpgb, vpgb;
1060 	int i, pix = 0;
1061 
1062 	shift = mmu_psize_defs[psize].shift;
1063 
1064 	for (i = 0; i < count; i++) {
1065 		/*
1066 		 * Shift 3 more bits to the right to get an
1067 		 * 8-page aligned virtual address.
1068 		 */
1069 		vpgb = (vpn[i] >> (shift - VPN_SHIFT + 3));
1070 		if (!pix || vpgb != current_vpgb) {
1071 			/*
1072 			 * Need to start a new 8 pages block, flush
1073 			 * the current one if needed.
1074 			 */
1075 			if (pix)
1076 				(void)call_block_remove(pix, param, true);
1077 			current_vpgb = vpgb;
1078 			param[0] = hpte_encode_avpn(vpn[i], psize, ssize);
1079 			pix = 1;
1080 		}
1081 
1082 		param[pix++] = HBR_REQUEST | HBLKR_AVPN | slot[i];
1083 		if (pix == PLPAR_HCALL9_BUFSIZE) {
1084 			pix = call_block_remove(pix, param, false);
1085 			/*
1086 			 * pix = 0 means that all the entries were
1087 			 * removed, we can start a new block.
1088 			 * Otherwise, this means that there are entries
1089 			 * to retry, and pix points to latest one, so
1090 			 * we should increment it and try to continue
1091 			 * the same block.
1092 			 */
1093 			if (pix)
1094 				pix++;
1095 		}
1096 	}
1097 	if (pix)
1098 		(void)call_block_remove(pix, param, true);
1099 }
1100 
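/*
 * Invalidate the given huge page HPTEs with H_BULK_REMOVE, batching up to
 * four slot/AVPN pairs per hcall, falling back to invalidating them one at
 * a time when the firmware feature is absent.
 */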
1101 static void hugepage_bulk_invalidate(unsigned long *slot, unsigned long *vpn,
1102 				     int count, int psize, int ssize)
1103 {
1104 	unsigned long param[PLPAR_HCALL9_BUFSIZE];
1105 	int i = 0, pix = 0, rc;
1106 
1107 	for (i = 0; i < count; i++) {
1108 
1109 		if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
1110 			pSeries_lpar_hpte_invalidate(slot[i], vpn[i], psize, 0,
1111 						     ssize, 0);
1112 		} else {
1113 			param[pix] = HBR_REQUEST | HBR_AVPN | slot[i];
1114 			param[pix+1] = hpte_encode_avpn(vpn[i], psize, ssize);
1115 			pix += 2;
1116 			if (pix == 8) {
1117 				rc = plpar_hcall9(H_BULK_REMOVE, param,
1118 						  param[0], param[1], param[2],
1119 						  param[3], param[4], param[5],
1120 						  param[6], param[7]);
1121 				BUG_ON(rc != H_SUCCESS);
1122 				pix = 0;
1123 			}
1124 		}
1125 	}
1126 	if (pix) {
1127 		param[pix] = HBR_END;
1128 		rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
1129 				  param[2], param[3], param[4], param[5],
1130 				  param[6], param[7]);
1131 		BUG_ON(rc != H_SUCCESS);
1132 	}
1133 }
1134 
1135 static inline void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
1136 						      unsigned long *vpn,
1137 						      int count, int psize,
1138 						      int ssize)
1139 {
1140 	unsigned long flags = 0;
1141 	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
1142 
1143 	if (lock_tlbie)
1144 		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
1145 
1146 	if (firmware_has_feature(FW_FEATURE_BLOCK_REMOVE))
1147 		hugepage_block_invalidate(slot, vpn, count, psize, ssize);
1148 	else
1149 		hugepage_bulk_invalidate(slot, vpn, count, psize, ssize);
1150 
1151 	if (lock_tlbie)
1152 		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
1153 }
1154 
1155 static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
1156 					     unsigned long addr,
1157 					     unsigned char *hpte_slot_array,
1158 					     int psize, int ssize, int local)
1159 {
1160 	int i, index = 0;
1161 	unsigned long s_addr = addr;
1162 	unsigned int max_hpte_count, valid;
1163 	unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH];
1164 	unsigned long slot_array[PPC64_HUGE_HPTE_BATCH];
1165 	unsigned long shift, hidx, vpn = 0, hash, slot;
1166 
1167 	shift = mmu_psize_defs[psize].shift;
1168 	max_hpte_count = 1U << (PMD_SHIFT - shift);
1169 
1170 	for (i = 0; i < max_hpte_count; i++) {
1171 		valid = hpte_valid(hpte_slot_array, i);
1172 		if (!valid)
1173 			continue;
1174 		hidx =  hpte_hash_index(hpte_slot_array, i);
1175 
1176 		/* get the vpn */
1177 		addr = s_addr + (i * (1ul << shift));
1178 		vpn = hpt_vpn(addr, vsid, ssize);
1179 		hash = hpt_hash(vpn, shift, ssize);
1180 		if (hidx & _PTEIDX_SECONDARY)
1181 			hash = ~hash;
1182 
1183 		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
1184 		slot += hidx & _PTEIDX_GROUP_IX;
1185 
1186 		slot_array[index] = slot;
1187 		vpn_array[index] = vpn;
1188 		if (index == PPC64_HUGE_HPTE_BATCH - 1) {
1189 			/*
1190 			 * Now do a bulk invalidate
1191 			 */
1192 			__pSeries_lpar_hugepage_invalidate(slot_array,
1193 							   vpn_array,
1194 							   PPC64_HUGE_HPTE_BATCH,
1195 							   psize, ssize);
1196 			index = 0;
1197 		} else
1198 			index++;
1199 	}
1200 	if (index)
1201 		__pSeries_lpar_hugepage_invalidate(slot_array, vpn_array,
1202 						   index, psize, ssize);
1203 }
1204 #else
1205 static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
1206 					     unsigned long addr,
1207 					     unsigned char *hpte_slot_array,
1208 					     int psize, int ssize, int local)
1209 {
1210 	WARN(1, "%s called without THP support\n", __func__);
1211 }
1212 #endif
1213 
1214 static int pSeries_lpar_hpte_removebolted(unsigned long ea,
1215 					  int psize, int ssize)
1216 {
1217 	unsigned long vpn;
1218 	unsigned long slot, vsid;
1219 
1220 	vsid = get_kernel_vsid(ea, ssize);
1221 	vpn = hpt_vpn(ea, vsid, ssize);
1222 
1223 	slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
1224 	if (slot == -1)
1225 		return -ENOENT;
1226 
1227 	/*
1228 	 * lpar doesn't use the passed actual page size
1229 	 */
1230 	pSeries_lpar_hpte_invalidate(slot, vpn, psize, 0, ssize, 0);
1231 	return 0;
1232 }
1233 
1234 
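/*
 * Compute the global HPTE slot for @vpn, using the hash value index stored
 * in the Linux PTE to select the primary or secondary group and the slot
 * within that group.
 */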
1235 static inline unsigned long compute_slot(real_pte_t pte,
1236 					 unsigned long vpn,
1237 					 unsigned long index,
1238 					 unsigned long shift,
1239 					 int ssize)
1240 {
1241 	unsigned long slot, hash, hidx;
1242 
1243 	hash = hpt_hash(vpn, shift, ssize);
1244 	hidx = __rpte_to_hidx(pte, index);
1245 	if (hidx & _PTEIDX_SECONDARY)
1246 		hash = ~hash;
1247 	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
1248 	slot += hidx & _PTEIDX_GROUP_IX;
1249 	return slot;
1250 }
1251 
1252 /**
1253  * The hcall H_BLOCK_REMOVE implies that the virtual pages to be processed are
1254  * "all within the same naturally aligned 8 page virtual address block".
1255  */
1256 static void do_block_remove(unsigned long number, struct ppc64_tlb_batch *batch,
1257 			    unsigned long *param)
1258 {
1259 	unsigned long vpn;
1260 	unsigned long i, pix = 0;
1261 	unsigned long index, shift, slot, current_vpgb, vpgb;
1262 	real_pte_t pte;
1263 	int psize, ssize;
1264 
1265 	psize = batch->psize;
1266 	ssize = batch->ssize;
1267 
1268 	for (i = 0; i < number; i++) {
1269 		vpn = batch->vpn[i];
1270 		pte = batch->pte[i];
1271 		pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
1272 			/*
1273 			 * Shift 3 more bits to the right to get an
1274 			 * 8-page aligned virtual address.
1275 			 */
1276 			vpgb = (vpn >> (shift - VPN_SHIFT + 3));
1277 			if (!pix || vpgb != current_vpgb) {
1278 				/*
1279 				 * Need to start a new 8 pages block, flush
1280 				 * the current one if needed.
1281 				 */
1282 				if (pix)
1283 					(void)call_block_remove(pix, param,
1284 								true);
1285 				current_vpgb = vpgb;
1286 				param[0] = hpte_encode_avpn(vpn, psize,
1287 							    ssize);
1288 				pix = 1;
1289 			}
1290 
1291 			slot = compute_slot(pte, vpn, index, shift, ssize);
1292 			param[pix++] = HBR_REQUEST | HBLKR_AVPN | slot;
1293 
1294 			if (pix == PLPAR_HCALL9_BUFSIZE) {
1295 				pix = call_block_remove(pix, param, false);
1296 				/*
1297 				 * pix = 0 means that all the entries were
1298 				 * removed, we can start a new block.
1299 				 * Otherwise, this means that there are entries
1300 				 * to retry, and pix points to latest one, so
1301 				 * we should increment it and try to continue
1302 				 * the same block.
1303 				 */
1304 				if (pix)
1305 					pix++;
1306 			}
1307 		} pte_iterate_hashed_end();
1308 	}
1309 
1310 	if (pix)
1311 		(void)call_block_remove(pix, param, true);
1312 }
1313 
1314 /*
1315  * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
1316  * lock.
1317  */
1318 static void pSeries_lpar_flush_hash_range(unsigned long number, int local)
1319 {
1320 	unsigned long vpn;
1321 	unsigned long i, pix, rc;
1322 	unsigned long flags = 0;
1323 	struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
1324 	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
1325 	unsigned long param[PLPAR_HCALL9_BUFSIZE];
1326 	unsigned long index, shift, slot;
1327 	real_pte_t pte;
1328 	int psize, ssize;
1329 
1330 	if (lock_tlbie)
1331 		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
1332 
1333 	if (firmware_has_feature(FW_FEATURE_BLOCK_REMOVE)) {
1334 		do_block_remove(number, batch, param);
1335 		goto out;
1336 	}
1337 
1338 	psize = batch->psize;
1339 	ssize = batch->ssize;
1340 	pix = 0;
1341 	for (i = 0; i < number; i++) {
1342 		vpn = batch->vpn[i];
1343 		pte = batch->pte[i];
1344 		pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
1345 			slot = compute_slot(pte, vpn, index, shift, ssize);
1346 			if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
1347 				/*
1348 				 * lpar doesn't use the passed actual page size
1349 				 */
1350 				pSeries_lpar_hpte_invalidate(slot, vpn, psize,
1351 							     0, ssize, local);
1352 			} else {
1353 				param[pix] = HBR_REQUEST | HBR_AVPN | slot;
1354 				param[pix+1] = hpte_encode_avpn(vpn, psize,
1355 								ssize);
1356 				pix += 2;
1357 				if (pix == 8) {
1358 					rc = plpar_hcall9(H_BULK_REMOVE, param,
1359 						param[0], param[1], param[2],
1360 						param[3], param[4], param[5],
1361 						param[6], param[7]);
1362 					BUG_ON(rc != H_SUCCESS);
1363 					pix = 0;
1364 				}
1365 			}
1366 		} pte_iterate_hashed_end();
1367 	}
1368 	if (pix) {
1369 		param[pix] = HBR_END;
1370 		rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
1371 				  param[2], param[3], param[4], param[5],
1372 				  param[6], param[7]);
1373 		BUG_ON(rc != H_SUCCESS);
1374 	}
1375 
1376 out:
1377 	if (lock_tlbie)
1378 		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
1379 }
1380 
1381 static int __init disable_bulk_remove(char *str)
1382 {
1383 	if (strcmp(str, "off") == 0 &&
1384 	    firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
1385 		pr_info("Disabling BULK_REMOVE firmware feature\n");
1386 		powerpc_firmware_features &= ~FW_FEATURE_BULK_REMOVE;
1387 	}
1388 	return 1;
1389 }
1390 
1391 __setup("bulk_remove=", disable_bulk_remove);
1392 
1393 #define HPT_RESIZE_TIMEOUT	10000 /* ms */
1394 
1395 struct hpt_resize_state {
1396 	unsigned long shift;
1397 	int commit_rc;
1398 };
1399 
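/*
 * Runs under stop_machine(): commit the prepared HPT resize and, on
 * success, update the global hash table geometry to match.
 */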
1400 static int pseries_lpar_resize_hpt_commit(void *data)
1401 {
1402 	struct hpt_resize_state *state = data;
1403 
1404 	state->commit_rc = plpar_resize_hpt_commit(0, state->shift);
1405 	if (state->commit_rc != H_SUCCESS)
1406 		return -EIO;
1407 
1408 	/* Hypervisor has transitioned the HTAB, update our globals */
1409 	ppc64_pft_size = state->shift;
1410 	htab_size_bytes = 1UL << ppc64_pft_size;
1411 	htab_hash_mask = (htab_size_bytes >> 7) - 1;
1412 
1413 	return 0;
1414 }
1415 
1416 /* Must be called in user context */
1417 static int pseries_lpar_resize_hpt(unsigned long shift)
1418 {
1419 	struct hpt_resize_state state = {
1420 		.shift = shift,
1421 		.commit_rc = H_FUNCTION,
1422 	};
1423 	unsigned int delay, total_delay = 0;
1424 	int rc;
1425 	ktime_t t0, t1, t2;
1426 
1427 	might_sleep();
1428 
1429 	if (!firmware_has_feature(FW_FEATURE_HPT_RESIZE))
1430 		return -ENODEV;
1431 
1432 	pr_info("Attempting to resize HPT to shift %lu\n", shift);
1433 
1434 	t0 = ktime_get();
1435 
1436 	rc = plpar_resize_hpt_prepare(0, shift);
1437 	while (H_IS_LONG_BUSY(rc)) {
1438 		delay = get_longbusy_msecs(rc);
1439 		total_delay += delay;
1440 		if (total_delay > HPT_RESIZE_TIMEOUT) {
1441 			/* prepare with shift==0 cancels an in-progress resize */
1442 			rc = plpar_resize_hpt_prepare(0, 0);
1443 			if (rc != H_SUCCESS)
1444 				pr_warn("Unexpected error %d cancelling timed out HPT resize\n",
1445 				       rc);
1446 			return -ETIMEDOUT;
1447 		}
1448 		msleep(delay);
1449 		rc = plpar_resize_hpt_prepare(0, shift);
1450 	}
1451 
1452 	switch (rc) {
1453 	case H_SUCCESS:
1454 		/* Continue on */
1455 		break;
1456 
1457 	case H_PARAMETER:
1458 		pr_warn("Invalid argument from H_RESIZE_HPT_PREPARE\n");
1459 		return -EINVAL;
1460 	case H_RESOURCE:
1461 		pr_warn("Operation not permitted from H_RESIZE_HPT_PREPARE\n");
1462 		return -EPERM;
1463 	default:
1464 		pr_warn("Unexpected error %d from H_RESIZE_HPT_PREPARE\n", rc);
1465 		return -EIO;
1466 	}
1467 
1468 	t1 = ktime_get();
1469 
1470 	rc = stop_machine(pseries_lpar_resize_hpt_commit, &state, NULL);
1471 
1472 	t2 = ktime_get();
1473 
1474 	if (rc != 0) {
1475 		switch (state.commit_rc) {
1476 		case H_PTEG_FULL:
1477 			return -ENOSPC;
1478 
1479 		default:
1480 			pr_warn("Unexpected error %d from H_RESIZE_HPT_COMMIT\n",
1481 				state.commit_rc);
1482 			return -EIO;
1483 		}
1484 	}
1485 
1486 	pr_info("HPT resize to shift %lu complete (%lld ms / %lld ms)\n",
1487 		shift, (long long) ktime_ms_delta(t1, t0),
1488 		(long long) ktime_ms_delta(t2, t1));
1489 
1490 	return 0;
1491 }
1492 
1493 static int pseries_lpar_register_process_table(unsigned long base,
1494 			unsigned long page_size, unsigned long table_size)
1495 {
1496 	long rc;
1497 	unsigned long flags = 0;
1498 
1499 	if (table_size)
1500 		flags |= PROC_TABLE_NEW;
1501 	if (radix_enabled())
1502 		flags |= PROC_TABLE_RADIX | PROC_TABLE_GTSE;
1503 	else
1504 		flags |= PROC_TABLE_HPT_SLB;
1505 	for (;;) {
1506 		rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base,
1507 					page_size, table_size);
1508 		if (!H_IS_LONG_BUSY(rc))
1509 			break;
1510 		mdelay(get_longbusy_msecs(rc));
1511 	}
1512 	if (rc != H_SUCCESS) {
1513 		pr_err("Failed to register process table (rc=%ld)\n", rc);
1514 		BUG();
1515 	}
1516 	return rc;
1517 }
1518 
1519 void __init hpte_init_pseries(void)
1520 {
1521 	mmu_hash_ops.hpte_invalidate	 = pSeries_lpar_hpte_invalidate;
1522 	mmu_hash_ops.hpte_updatepp	 = pSeries_lpar_hpte_updatepp;
1523 	mmu_hash_ops.hpte_updateboltedpp = pSeries_lpar_hpte_updateboltedpp;
1524 	mmu_hash_ops.hpte_insert	 = pSeries_lpar_hpte_insert;
1525 	mmu_hash_ops.hpte_remove	 = pSeries_lpar_hpte_remove;
1526 	mmu_hash_ops.hpte_removebolted   = pSeries_lpar_hpte_removebolted;
1527 	mmu_hash_ops.flush_hash_range	 = pSeries_lpar_flush_hash_range;
1528 	mmu_hash_ops.hpte_clear_all      = pseries_hpte_clear_all;
1529 	mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate;
1530 	register_process_table		 = pseries_lpar_register_process_table;
1531 
1532 	if (firmware_has_feature(FW_FEATURE_HPT_RESIZE))
1533 		mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt;
1534 }
1535 
1536 void radix_init_pseries(void)
1537 {
1538 	pr_info("Using radix MMU under hypervisor\n");
1539 	register_process_table = pseries_lpar_register_process_table;
1540 }
1541 
1542 #ifdef CONFIG_PPC_SMLPAR
1543 #define CMO_FREE_HINT_DEFAULT 1
1544 static int cmo_free_hint_flag = CMO_FREE_HINT_DEFAULT;
1545 
1546 static int __init cmo_free_hint(char *str)
1547 {
1548 	char *parm;
1549 	parm = strstrip(str);
1550 
1551 	if (strcasecmp(parm, "no") == 0 || strcasecmp(parm, "off") == 0) {
1552 		pr_info("%s: CMO free page hinting is not active.\n", __func__);
1553 		cmo_free_hint_flag = 0;
1554 		return 1;
1555 	}
1556 
1557 	cmo_free_hint_flag = 1;
1558 	pr_info("%s: CMO free page hinting is active.\n", __func__);
1559 
1560 	if (strcasecmp(parm, "yes") == 0 || strcasecmp(parm, "on") == 0)
1561 		return 1;
1562 
1563 	return 0;
1564 }
1565 
1566 __setup("cmo_free_hint=", cmo_free_hint);
1567 
1568 static void pSeries_set_page_state(struct page *page, int order,
1569 				   unsigned long state)
1570 {
1571 	int i, j;
1572 	unsigned long cmo_page_sz, addr;
1573 
1574 	cmo_page_sz = cmo_get_page_size();
1575 	addr = __pa((unsigned long)page_address(page));
1576 
1577 	for (i = 0; i < (1 << order); i++, addr += PAGE_SIZE) {
1578 		for (j = 0; j < PAGE_SIZE; j += cmo_page_sz)
1579 			plpar_hcall_norets(H_PAGE_INIT, state, addr + j, 0);
1580 	}
1581 }
1582 
1583 void arch_free_page(struct page *page, int order)
1584 {
1585 	if (radix_enabled())
1586 		return;
1587 	if (!cmo_free_hint_flag || !firmware_has_feature(FW_FEATURE_CMO))
1588 		return;
1589 
1590 	pSeries_set_page_state(page, order, H_PAGE_SET_UNUSED);
1591 }
1592 EXPORT_SYMBOL(arch_free_page);
1593 
1594 #endif /* CONFIG_PPC_SMLPAR */
1595 #endif /* CONFIG_PPC_BOOK3S_64 */
1596 
1597 #ifdef CONFIG_TRACEPOINTS
1598 #ifdef CONFIG_JUMP_LABEL
1599 struct static_key hcall_tracepoint_key = STATIC_KEY_INIT;
1600 
1601 int hcall_tracepoint_regfunc(void)
1602 {
1603 	static_key_slow_inc(&hcall_tracepoint_key);
1604 	return 0;
1605 }
1606 
1607 void hcall_tracepoint_unregfunc(void)
1608 {
1609 	static_key_slow_dec(&hcall_tracepoint_key);
1610 }
1611 #else
1612 /*
1613  * We optimise our hcall path by placing hcall_tracepoint_refcount
1614  * directly in the TOC so we can check if the hcall tracepoints are
1615  * enabled via a single load.
1616  */
1617 
1618 /* NB: reg/unreg are called while guarded with the tracepoints_mutex */
1619 extern long hcall_tracepoint_refcount;
1620 
1621 int hcall_tracepoint_regfunc(void)
1622 {
1623 	hcall_tracepoint_refcount++;
1624 	return 0;
1625 }
1626 
1627 void hcall_tracepoint_unregfunc(void)
1628 {
1629 	hcall_tracepoint_refcount--;
1630 }
1631 #endif
1632 
1633 /*
1634  * Since the tracing code might execute hcalls we need to guard against
1635  * recursion. One example of this are spinlocks calling H_YIELD on
1636  * shared processor partitions.
1637  */
1638 static DEFINE_PER_CPU(unsigned int, hcall_trace_depth);
1639 
1640 
1641 void __trace_hcall_entry(unsigned long opcode, unsigned long *args)
1642 {
1643 	unsigned long flags;
1644 	unsigned int *depth;
1645 
1646 	/*
1647 	 * We cannot call tracepoints inside RCU idle regions which
1648 	 * means we must not trace H_CEDE.
1649 	 */
1650 	if (opcode == H_CEDE)
1651 		return;
1652 
1653 	local_irq_save(flags);
1654 
1655 	depth = this_cpu_ptr(&hcall_trace_depth);
1656 
1657 	if (*depth)
1658 		goto out;
1659 
1660 	(*depth)++;
1661 	preempt_disable();
1662 	trace_hcall_entry(opcode, args);
1663 	(*depth)--;
1664 
1665 out:
1666 	local_irq_restore(flags);
1667 }
1668 
1669 void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf)
1670 {
1671 	unsigned long flags;
1672 	unsigned int *depth;
1673 
1674 	if (opcode == H_CEDE)
1675 		return;
1676 
1677 	local_irq_save(flags);
1678 
1679 	depth = this_cpu_ptr(&hcall_trace_depth);
1680 
1681 	if (*depth)
1682 		goto out;
1683 
1684 	(*depth)++;
1685 	trace_hcall_exit(opcode, retval, retbuf);
1686 	preempt_enable();
1687 	(*depth)--;
1688 
1689 out:
1690 	local_irq_restore(flags);
1691 }
1692 #endif
1693 
1694 /**
1695  * h_get_mpp() - Wrapper for the H_GET_MPP hcall.
1696  * The H_GET_MPP hcall returns info in 7 parms.
1697  */
1698 int h_get_mpp(struct hvcall_mpp_data *mpp_data)
1699 {
1700 	int rc;
1701 	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
1702 
1703 	rc = plpar_hcall9(H_GET_MPP, retbuf);
1704 
1705 	mpp_data->entitled_mem = retbuf[0];
1706 	mpp_data->mapped_mem = retbuf[1];
1707 
1708 	mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
1709 	mpp_data->pool_num = retbuf[2] & 0xffff;
1710 
1711 	mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff;
1712 	mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff;
1713 	mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffffUL;
1714 
1715 	mpp_data->pool_size = retbuf[4];
1716 	mpp_data->loan_request = retbuf[5];
1717 	mpp_data->backing_mem = retbuf[6];
1718 
1719 	return rc;
1720 }
1721 EXPORT_SYMBOL(h_get_mpp);
1722 
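/*
 * h_get_mpp_x - The H_GET_MPP_X hcall returns extended info in 4 parms:
 * coalesced bytes and pool PURR/SPURR cycles.
 */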
1723 int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data)
1724 {
1725 	int rc;
1726 	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = { 0 };
1727 
1728 	rc = plpar_hcall9(H_GET_MPP_X, retbuf);
1729 
1730 	mpp_x_data->coalesced_bytes = retbuf[0];
1731 	mpp_x_data->pool_coalesced_bytes = retbuf[1];
1732 	mpp_x_data->pool_purr_cycles = retbuf[2];
1733 	mpp_x_data->pool_spurr_cycles = retbuf[3];
1734 
1735 	return rc;
1736 }
1737 
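/*
 * Invert vsid_scramble(): recover the protovsid from @vsid by multiplying
 * with the modular multiplicative inverse of the VSID multiplier, taking
 * care not to overflow 64 bits.
 */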
1738 static unsigned long vsid_unscramble(unsigned long vsid, int ssize)
1739 {
1740 	unsigned long protovsid;
1741 	unsigned long va_bits = VA_BITS;
1742 	unsigned long modinv, vsid_modulus;
1743 	unsigned long max_mod_inv, tmp_modinv;
1744 
1745 	if (!mmu_has_feature(MMU_FTR_68_BIT_VA))
1746 		va_bits = 65;
1747 
1748 	if (ssize == MMU_SEGSIZE_256M) {
1749 		modinv = VSID_MULINV_256M;
1750 		vsid_modulus = ((1UL << (va_bits - SID_SHIFT)) - 1);
1751 	} else {
1752 		modinv = VSID_MULINV_1T;
1753 		vsid_modulus = ((1UL << (va_bits - SID_SHIFT_1T)) - 1);
1754 	}
1755 
1756 	/*
1757 	 * vsid outside our range.
1758 	 */
1759 	if (vsid >= vsid_modulus)
1760 		return 0;
1761 
1762 	/*
1763 	 * If modinv is the modular multiplicative inverse of (x % vsid_modulus)
1764 	 * and vsid = (protovsid * x) % vsid_modulus, then we say:
1765 	 *   protovsid = (vsid * modinv) % vsid_modulus
1766 	 */
1767 
1768 	/* Check if (vsid * modinv) overflow (63 bits) */
1769 	max_mod_inv = 0x7fffffffffffffffull / vsid;
1770 	if (modinv < max_mod_inv)
1771 		return (vsid * modinv) % vsid_modulus;
1772 
1773 	tmp_modinv = modinv/max_mod_inv;
1774 	modinv %= max_mod_inv;
1775 
1776 	protovsid = (((vsid * max_mod_inv) % vsid_modulus) * tmp_modinv) % vsid_modulus;
1777 	protovsid = (protovsid + vsid * modinv) % vsid_modulus;
1778 
1779 	return protovsid;
1780 }
1781 
1782 static int __init reserve_vrma_context_id(void)
1783 {
1784 	unsigned long protovsid;
1785 
1786 	/*
1787 	 * Reserve context ids which map to reserved virtual addresses. For now
1788 	 * we only reserve the context id which maps to the VRMA VSID. We ignore
1789 	 * the addresses in "ibm,adjunct-virtual-addresses" because we don't
1790 	 * enable adjunct support via the "ibm,client-architecture-support"
1791 	 * interface.
1792 	 */
1793 	protovsid = vsid_unscramble(VRMA_VSID, MMU_SEGSIZE_1T);
1794 	hash__reserve_context_id(protovsid >> ESID_BITS_1T);
1795 	return 0;
1796 }
1797 machine_device_initcall(pseries, reserve_vrma_context_id);
1798 
1799 #ifdef CONFIG_DEBUG_FS
1800 /* debugfs file interface for vpa data */
1801 static ssize_t vpa_file_read(struct file *filp, char __user *buf, size_t len,
1802 			      loff_t *pos)
1803 {
1804 	int cpu = (long)filp->private_data;
1805 	struct lppaca *lppaca = &lppaca_of(cpu);
1806 
1807 	return simple_read_from_buffer(buf, len, pos, lppaca,
1808 				sizeof(struct lppaca));
1809 }
1810 
1811 static const struct file_operations vpa_fops = {
1812 	.open		= simple_open,
1813 	.read		= vpa_file_read,
1814 	.llseek		= default_llseek,
1815 };
1816 
1817 static int __init vpa_debugfs_init(void)
1818 {
1819 	char name[16];
1820 	long i;
1821 	static struct dentry *vpa_dir;
1822 
1823 	if (!firmware_has_feature(FW_FEATURE_SPLPAR))
1824 		return 0;
1825 
1826 	vpa_dir = debugfs_create_dir("vpa", powerpc_debugfs_root);
1827 	if (!vpa_dir) {
1828 		pr_warn("%s: can't create vpa root dir\n", __func__);
1829 		return -ENOMEM;
1830 	}
1831 
1832 	/* set up the per-cpu vpa file */
1833 	for_each_possible_cpu(i) {
1834 		struct dentry *d;
1835 
1836 		sprintf(name, "cpu-%ld", i);
1837 
1838 		d = debugfs_create_file(name, 0400, vpa_dir, (void *)i,
1839 					&vpa_fops);
1840 		if (!d) {
1841 			pr_warn("%s: can't create per-cpu vpa file\n",
1842 					__func__);
1843 			return -ENOMEM;
1844 		}
1845 	}
1846 
1847 	return 0;
1848 }
1849 machine_arch_initcall(pseries, vpa_debugfs_init);
1850 #endif /* CONFIG_DEBUG_FS */
1851