xref: /openbmc/linux/arch/x86/kernel/process_64.c (revision 9144f784f852f9a125cabe9927b986d909bfa439)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   *  Copyright (C) 1995  Linus Torvalds
4   *
5   *  Pentium III FXSR, SSE support
6   *	Gareth Hughes <gareth@valinux.com>, May 2000
7   *
8   *  X86-64 port
9   *	Andi Kleen.
10   *
11   *	CPU hotplug support - ashok.raj@intel.com
12   */
13  
14  /*
15   * This file handles the architecture-dependent parts of process handling..
16   */
17  
18  #include <linux/cpu.h>
19  #include <linux/errno.h>
20  #include <linux/sched.h>
21  #include <linux/sched/task.h>
22  #include <linux/sched/task_stack.h>
23  #include <linux/fs.h>
24  #include <linux/kernel.h>
25  #include <linux/mm.h>
26  #include <linux/elfcore.h>
27  #include <linux/smp.h>
28  #include <linux/slab.h>
29  #include <linux/user.h>
30  #include <linux/interrupt.h>
31  #include <linux/delay.h>
32  #include <linux/export.h>
33  #include <linux/ptrace.h>
34  #include <linux/notifier.h>
35  #include <linux/kprobes.h>
36  #include <linux/kdebug.h>
37  #include <linux/prctl.h>
38  #include <linux/uaccess.h>
39  #include <linux/io.h>
40  #include <linux/ftrace.h>
41  #include <linux/syscalls.h>
42  #include <linux/iommu.h>
43  
44  #include <asm/processor.h>
45  #include <asm/pkru.h>
46  #include <asm/fpu/sched.h>
47  #include <asm/mmu_context.h>
48  #include <asm/prctl.h>
49  #include <asm/desc.h>
50  #include <asm/proto.h>
51  #include <asm/ia32.h>
52  #include <asm/debugreg.h>
53  #include <asm/switch_to.h>
54  #include <asm/xen/hypervisor.h>
55  #include <asm/vdso.h>
56  #include <asm/resctrl.h>
57  #include <asm/unistd.h>
58  #include <asm/fsgsbase.h>
59  #ifdef CONFIG_IA32_EMULATION
60  /* Not included via unistd.h */
61  #include <asm/unistd_32_ia32.h>
62  #endif
63  
64  #include "process.h"
65  
66  /* Also prints some state that isn't saved in the pt_regs */
67  void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
68  		 const char *log_lvl)
69  {
70  	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
71  	unsigned long d0, d1, d2, d3, d6, d7;
72  	unsigned int fsindex, gsindex;
73  	unsigned int ds, es;
74  
75  	show_iret_regs(regs, log_lvl);
76  
77  	if (regs->orig_ax != -1)
78  		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
79  	else
80  		pr_cont("\n");
81  
82  	printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n",
83  	       log_lvl, regs->ax, regs->bx, regs->cx);
84  	printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n",
85  	       log_lvl, regs->dx, regs->si, regs->di);
86  	printk("%sRBP: %016lx R08: %016lx R09: %016lx\n",
87  	       log_lvl, regs->bp, regs->r8, regs->r9);
88  	printk("%sR10: %016lx R11: %016lx R12: %016lx\n",
89  	       log_lvl, regs->r10, regs->r11, regs->r12);
90  	printk("%sR13: %016lx R14: %016lx R15: %016lx\n",
91  	       log_lvl, regs->r13, regs->r14, regs->r15);
92  
93  	if (mode == SHOW_REGS_SHORT)
94  		return;
95  
96  	if (mode == SHOW_REGS_USER) {
97  		rdmsrl(MSR_FS_BASE, fs);
98  		rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
99  		printk("%sFS:  %016lx GS:  %016lx\n",
100  		       log_lvl, fs, shadowgs);
101  		return;
102  	}
103  
104  	asm("movl %%ds,%0" : "=r" (ds));
105  	asm("movl %%es,%0" : "=r" (es));
106  	asm("movl %%fs,%0" : "=r" (fsindex));
107  	asm("movl %%gs,%0" : "=r" (gsindex));
108  
109  	rdmsrl(MSR_FS_BASE, fs);
110  	rdmsrl(MSR_GS_BASE, gs);
111  	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
112  
113  	cr0 = read_cr0();
114  	cr2 = read_cr2();
115  	cr3 = __read_cr3();
116  	cr4 = __read_cr4();
117  
118  	printk("%sFS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
119  	       log_lvl, fs, fsindex, gs, gsindex, shadowgs);
120  	printk("%sCS:  %04x DS: %04x ES: %04x CR0: %016lx\n",
121  		log_lvl, regs->cs, ds, es, cr0);
122  	printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
123  		log_lvl, cr2, cr3, cr4);
124  
125  	get_debugreg(d0, 0);
126  	get_debugreg(d1, 1);
127  	get_debugreg(d2, 2);
128  	get_debugreg(d3, 3);
129  	get_debugreg(d6, 6);
130  	get_debugreg(d7, 7);
131  
132  	/* Only print out debug registers if they are in their non-default state. */
133  	if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
134  	    (d6 == DR6_RESERVED) && (d7 == 0x400))) {
135  		printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n",
136  		       log_lvl, d0, d1, d2);
137  		printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n",
138  		       log_lvl, d3, d6, d7);
139  	}
140  
141  	if (cr4 & X86_CR4_PKE)
142  		printk("%sPKRU: %08x\n", log_lvl, read_pkru());
143  }
144  
145  void release_thread(struct task_struct *dead_task)
146  {
147  	WARN_ON(dead_task->mm);
148  }
149  
150  enum which_selector {
151  	FS,
152  	GS
153  };
154  
155  /*
156   * Out of line to be protected from kprobes and tracing. If this were
157   * traced or probed, any access to a per-CPU variable would happen with
158   * the wrong GS.
159   *
160   * It is not used on Xen paravirt. When paravirt support is needed, it
161   * needs to be renamed with native_ prefix.
162   */
163  static noinstr unsigned long __rdgsbase_inactive(void)
164  {
165  	unsigned long gsbase;
166  
167  	lockdep_assert_irqs_disabled();
168  
169  	if (!cpu_feature_enabled(X86_FEATURE_XENPV)) {
170  		native_swapgs();
171  		gsbase = rdgsbase();
172  		native_swapgs();
173  	} else {
174  		instrumentation_begin();
175  		rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
176  		instrumentation_end();
177  	}
178  
179  	return gsbase;
180  }
181  
182  /*
183   * Out of line to be protected from kprobes and tracing. If this were
184   * traced or probed, any access to a per-CPU variable would happen with
185   * the wrong GS.
186   *
187   * It is not used on Xen paravirt. When paravirt support is needed, it
188   * needs to be renamed with native_ prefix.
189   */
190  static noinstr void __wrgsbase_inactive(unsigned long gsbase)
191  {
192  	lockdep_assert_irqs_disabled();
193  
194  	if (!cpu_feature_enabled(X86_FEATURE_XENPV)) {
195  		native_swapgs();
196  		wrgsbase(gsbase);
197  		native_swapgs();
198  	} else {
199  		instrumentation_begin();
200  		wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
201  		instrumentation_end();
202  	}
203  }
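
/*
 * The SWAPGS "sandwich" used by the two helpers above is the cheap
 * equivalent of an MSR access: SWAPGS exchanges GSBASE with
 * MSR_KERNEL_GS_BASE, RDGSBASE/WRGSBASE then operate on what was the
 * inactive base, and the second SWAPGS restores the kernel's own GS base
 * before any per-CPU access can happen.  That is why both helpers run with
 * IRQs off and free of instrumentation.
 */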
204  
205  /*
206   * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
207   * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
208   * It's forcibly inlined because it'll generate better code and this function
209   * is hot.
210   */
211  static __always_inline void save_base_legacy(struct task_struct *prev_p,
212  					     unsigned short selector,
213  					     enum which_selector which)
214  {
215  	if (likely(selector == 0)) {
216  		/*
217  		 * On Intel (without X86_BUG_NULL_SEG), the segment base could
218  		 * be the pre-existing saved base or it could be zero.  On AMD
219  		 * (with X86_BUG_NULL_SEG), the segment base could be almost
220  		 * anything.
221  		 *
222  		 * This branch is very hot (it's hit twice on almost every
223  		 * context switch between 64-bit programs), and avoiding
224  		 * the RDMSR helps a lot, so we just assume that whatever
225  		 * value is already saved is correct.  This matches historical
226  		 * Linux behavior, so it won't break existing applications.
227  		 *
228  		 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
229  		 * report that the base is zero, it needs to actually be zero:
230  		 * see the corresponding logic in load_seg_legacy.
231  		 */
232  	} else {
233  		/*
234  		 * If the selector is 1, 2, or 3, then the base is zero on
235  		 * !X86_BUG_NULL_SEG CPUs and could be anything on
236  		 * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
237  		 * has never attempted to preserve the base across context
238  		 * switches.
239  		 *
240  		 * If selector > 3, then it refers to a real segment, and
241  		 * saving the base isn't necessary.
242  		 */
243  		if (which == FS)
244  			prev_p->thread.fsbase = 0;
245  		else
246  			prev_p->thread.gsbase = 0;
247  	}
248  }
249  
250  static __always_inline void save_fsgs(struct task_struct *task)
251  {
252  	savesegment(fs, task->thread.fsindex);
253  	savesegment(gs, task->thread.gsindex);
254  	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
255  		/*
256  		 * If FSGSBASE is enabled, we can't make any useful guesses
257  		 * about the base, and user code expects us to save the current
258  		 * value.  Fortunately, reading the base directly is efficient.
259  		 */
260  		task->thread.fsbase = rdfsbase();
261  		task->thread.gsbase = __rdgsbase_inactive();
262  	} else {
263  		save_base_legacy(task, task->thread.fsindex, FS);
264  		save_base_legacy(task, task->thread.gsindex, GS);
265  	}
266  }
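
/*
 * For reference, the FSGSBASE instructions used on the fast path above are
 * also usable from user space once the kernel has set CR4.FSGSBASE.  A
 * minimal user-space sketch, not part of this file (assumes a compiler
 * providing the -mfsgsbase intrinsics and a CPU/kernel advertising the
 * "fsgsbase" flag in /proc/cpuinfo):
 *
 *	#include <immintrin.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		// RDFSBASE: read the FS base directly, no MSR and no syscall.
 *		unsigned long long base = _readfsbase_u64();
 *
 *		printf("FS base: %#llx\n", base);
 *		// WRFSBASE: user space may also move its own FS base.
 *		_writefsbase_u64(base);
 *		return 0;
 *	}
 */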
267  
268  /*
269   * While a process is running, current->thread.fsbase and current->thread.gsbase
270   * may not match the corresponding CPU registers (see save_base_legacy()).
271   */
272  void current_save_fsgs(void)
273  {
274  	unsigned long flags;
275  
276  	/* Interrupts need to be off for FSGSBASE */
277  	local_irq_save(flags);
278  	save_fsgs(current);
279  	local_irq_restore(flags);
280  }
281  #if IS_ENABLED(CONFIG_KVM)
282  EXPORT_SYMBOL_GPL(current_save_fsgs);
283  #endif
284  
285  static __always_inline void loadseg(enum which_selector which,
286  				    unsigned short sel)
287  {
288  	if (which == FS)
289  		loadsegment(fs, sel);
290  	else
291  		load_gs_index(sel);
292  }
293  
294  static __always_inline void load_seg_legacy(unsigned short prev_index,
295  					    unsigned long prev_base,
296  					    unsigned short next_index,
297  					    unsigned long next_base,
298  					    enum which_selector which)
299  {
300  	if (likely(next_index <= 3)) {
301  		/*
302  		 * The next task is using 64-bit TLS, is not using this
303  		 * segment at all, or is having fun with arcane CPU features.
304  		 */
305  		if (next_base == 0) {
306  			/*
307  			 * Nasty case: on AMD CPUs, we need to forcibly zero
308  			 * the base.
309  			 */
310  			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
311  				loadseg(which, __USER_DS);
312  				loadseg(which, next_index);
313  			} else {
314  				/*
315  				 * We could try to exhaustively detect cases
316  				 * under which we can skip the segment load,
317  				 * but there's really only one case that matters
318  				 * for performance: if both the previous and
319  				 * next states are fully zeroed, we can skip
320  				 * the load.
321  				 *
322  				 * (This assumes that prev_base == 0 has no
323  				 * false positives.  This is the case on
324  				 * Intel-style CPUs.)
325  				 */
326  				if (likely(prev_index | next_index | prev_base))
327  					loadseg(which, next_index);
328  			}
329  		} else {
330  			if (prev_index != next_index)
331  				loadseg(which, next_index);
332  			wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
333  			       next_base);
334  		}
335  	} else {
336  		/*
337  		 * The next task is using a real segment.  Loading the selector
338  		 * is sufficient.
339  		 */
340  		loadseg(which, next_index);
341  	}
342  }
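
/*
 * Summary of the legacy (non-FSGSBASE) restore above:
 *
 *   next_index <= 3, next_base == 0:  on X86_BUG_NULL_SEG CPUs, bounce
 *                                     through __USER_DS so the stale base is
 *                                     really cleared; otherwise skip the
 *                                     segment load only when prev_index,
 *                                     prev_base and next_index are all zero.
 *   next_index <= 3, next_base != 0:  reload the selector if it changed,
 *                                     then write the base via the MSR.
 *   next_index  > 3:                  a real GDT/LDT segment; loading the
 *                                     selector also loads its base.
 */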
343  
344  /*
345   * Store prev's PKRU value and load next's PKRU value if they differ. PKRU
346   * is not XSTATE managed on context switch because that would require a
347   * lookup in the task's FPU xsave buffer and require to keep that updated
348   * in various places.
349   */
350  static __always_inline void x86_pkru_load(struct thread_struct *prev,
351  					  struct thread_struct *next)
352  {
353  	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
354  		return;
355  
356  	/* Stash the prev task's value: */
357  	prev->pkru = rdpkru();
358  
359  	/*
360  	 * PKRU writes are slightly expensive.  Avoid them when not
361  	 * strictly necessary:
362  	 */
363  	if (prev->pkru != next->pkru)
364  		wrpkru(next->pkru);
365  }
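
/*
 * PKRU is per-logical-CPU state that the switch above makes effectively
 * per-thread.  A minimal user-space sketch of what the register controls,
 * not part of this file (assumes glibc >= 2.27 for the pkey_* wrappers and
 * a CPU with "ospke" in /proc/cpuinfo):
 *
 *	#define _GNU_SOURCE
 *	#include <sys/mman.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		int pkey = pkey_alloc(0, 0);
 *		char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		if (pkey < 0 || p == MAP_FAILED)
 *			return 1;
 *		pkey_mprotect(p, 4096, PROT_READ | PROT_WRITE, pkey);
 *		// Flip this thread's PKRU bits: deny all access for the key.
 *		pkey_set(pkey, PKEY_DISABLE_ACCESS);
 *		// Touching the page would fault here, but only in threads
 *		// that changed their own PKRU value.
 *		pkey_set(pkey, 0);		// allow access again
 *		p[0] = 42;
 *		printf("data: %d\n", p[0]);
 *		pkey_free(pkey);
 *		return 0;
 *	}
 */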
366  
367  static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
368  					      struct thread_struct *next)
369  {
370  	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
371  		/* Update the FS and GS selectors if they could have changed. */
372  		if (unlikely(prev->fsindex || next->fsindex))
373  			loadseg(FS, next->fsindex);
374  		if (unlikely(prev->gsindex || next->gsindex))
375  			loadseg(GS, next->gsindex);
376  
377  		/* Update the bases. */
378  		wrfsbase(next->fsbase);
379  		__wrgsbase_inactive(next->gsbase);
380  	} else {
381  		load_seg_legacy(prev->fsindex, prev->fsbase,
382  				next->fsindex, next->fsbase, FS);
383  		load_seg_legacy(prev->gsindex, prev->gsbase,
384  				next->gsindex, next->gsbase, GS);
385  	}
386  }
387  
388  unsigned long x86_fsgsbase_read_task(struct task_struct *task,
389  				     unsigned short selector)
390  {
391  	unsigned short idx = selector >> 3;
392  	unsigned long base;
393  
394  	if (likely((selector & SEGMENT_TI_MASK) == 0)) {
395  		if (unlikely(idx >= GDT_ENTRIES))
396  			return 0;
397  
398  		/*
399  		 * There are no user segments in the GDT with nonzero bases
400  		 * other than the TLS segments.
401  		 */
402  		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
403  			return 0;
404  
405  		idx -= GDT_ENTRY_TLS_MIN;
406  		base = get_desc_base(&task->thread.tls_array[idx]);
407  	} else {
408  #ifdef CONFIG_MODIFY_LDT_SYSCALL
409  		struct ldt_struct *ldt;
410  
411  		/*
412  		 * If performance here mattered, we could protect the LDT
413  		 * with RCU.  This is a slow path, though, so we can just
414  		 * take the mutex.
415  		 */
416  		mutex_lock(&task->mm->context.lock);
417  		ldt = task->mm->context.ldt;
418  		if (unlikely(!ldt || idx >= ldt->nr_entries))
419  			base = 0;
420  		else
421  			base = get_desc_base(ldt->entries + idx);
422  		mutex_unlock(&task->mm->context.lock);
423  #else
424  		base = 0;
425  #endif
426  	}
427  
428  	return base;
429  }
430  
431  unsigned long x86_gsbase_read_cpu_inactive(void)
432  {
433  	unsigned long gsbase;
434  
435  	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
436  		unsigned long flags;
437  
438  		local_irq_save(flags);
439  		gsbase = __rdgsbase_inactive();
440  		local_irq_restore(flags);
441  	} else {
442  		rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
443  	}
444  
445  	return gsbase;
446  }
447  
448  void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
449  {
450  	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
451  		unsigned long flags;
452  
453  		local_irq_save(flags);
454  		__wrgsbase_inactive(gsbase);
455  		local_irq_restore(flags);
456  	} else {
457  		wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
458  	}
459  }
460  
461  unsigned long x86_fsbase_read_task(struct task_struct *task)
462  {
463  	unsigned long fsbase;
464  
465  	if (task == current)
466  		fsbase = x86_fsbase_read_cpu();
467  	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
468  		 (task->thread.fsindex == 0))
469  		fsbase = task->thread.fsbase;
470  	else
471  		fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
472  
473  	return fsbase;
474  }
475  
476  unsigned long x86_gsbase_read_task(struct task_struct *task)
477  {
478  	unsigned long gsbase;
479  
480  	if (task == current)
481  		gsbase = x86_gsbase_read_cpu_inactive();
482  	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
483  		 (task->thread.gsindex == 0))
484  		gsbase = task->thread.gsbase;
485  	else
486  		gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
487  
488  	return gsbase;
489  }
490  
491  void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
492  {
493  	WARN_ON_ONCE(task == current);
494  
495  	task->thread.fsbase = fsbase;
496  }
497  
498  void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
499  {
500  	WARN_ON_ONCE(task == current);
501  
502  	task->thread.gsbase = gsbase;
503  }
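
/*
 * The task read/write helpers above (x86_fsbase_read_task() and friends)
 * back the ptrace register accessors for fs_base and gs_base.  A minimal
 * sketch from a tracer's point of view, not part of this file (error
 * handling omitted):
 *
 *	#include <sys/ptrace.h>
 *	#include <sys/types.h>
 *	#include <sys/user.h>
 *	#include <sys/wait.h>
 *	#include <stdio.h>
 *
 *	static void dump_bases(pid_t pid)
 *	{
 *		struct user_regs_struct regs;
 *
 *		ptrace(PTRACE_ATTACH, pid, 0, 0);
 *		waitpid(pid, NULL, 0);
 *		ptrace(PTRACE_GETREGS, pid, 0, &regs);
 *		printf("fs_base=%#llx gs_base=%#llx\n",
 *		       regs.fs_base, regs.gs_base);
 *		ptrace(PTRACE_DETACH, pid, 0, 0);
 *	}
 */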
504  
505  static void
506  start_thread_common(struct pt_regs *regs, unsigned long new_ip,
507  		    unsigned long new_sp,
508  		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
509  {
510  	WARN_ON_ONCE(regs != current_pt_regs());
511  
512  	if (static_cpu_has(X86_BUG_NULL_SEG)) {
513  		/* Loading zero below won't clear the base. */
514  		loadsegment(fs, __USER_DS);
515  		load_gs_index(__USER_DS);
516  	}
517  
518  	reset_thread_features();
519  
520  	loadsegment(fs, 0);
521  	loadsegment(es, _ds);
522  	loadsegment(ds, _ds);
523  	load_gs_index(0);
524  
525  	regs->ip		= new_ip;
526  	regs->sp		= new_sp;
527  	regs->cs		= _cs;
528  	regs->ss		= _ss;
529  	regs->flags		= X86_EFLAGS_IF;
530  }
531  
532  void
533  start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
534  {
535  	start_thread_common(regs, new_ip, new_sp,
536  			    __USER_CS, __USER_DS, 0);
537  }
538  EXPORT_SYMBOL_GPL(start_thread);
539  
540  #ifdef CONFIG_COMPAT
541  void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32)
542  {
543  	start_thread_common(regs, new_ip, new_sp,
544  			    x32 ? __USER_CS : __USER32_CS,
545  			    __USER_DS, __USER_DS);
546  }
547  #endif
548  
549  /*
550   *	switch_to(x,y) should switch tasks from x to y.
551   *
552   * This could still be optimized:
553   * - fold all the options into a flag word and test it with a single test.
554   * - could test fs/gs bitsliced
555   *
556   * Kprobes not supported here. Set the probe on schedule instead.
557   * Function graph tracer not supported too.
558   */
559  __no_kmsan_checks
560  __visible __notrace_funcgraph struct task_struct *
561  __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
562  {
563  	struct thread_struct *prev = &prev_p->thread;
564  	struct thread_struct *next = &next_p->thread;
565  	struct fpu *prev_fpu = &prev->fpu;
566  	int cpu = smp_processor_id();
567  
568  	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
569  		     this_cpu_read(pcpu_hot.hardirq_stack_inuse));
570  
571  	if (!test_thread_flag(TIF_NEED_FPU_LOAD))
572  		switch_fpu_prepare(prev_fpu, cpu);
573  
574  	/* We must save %fs and %gs before load_TLS() because
575  	 * %fs and %gs may be cleared by load_TLS().
576  	 *
577  	 * (e.g. xen_load_tls())
578  	 */
579  	save_fsgs(prev_p);
580  
581  	/*
582  	 * Load TLS before restoring any segments so that segment loads
583  	 * reference the correct GDT entries.
584  	 */
585  	load_TLS(next, cpu);
586  
587  	/*
588  	 * Leave lazy mode, flushing any hypercalls made here.  This
589  	 * must be done after loading TLS entries in the GDT but before
590  	 * loading segments that might reference them.
591  	 */
592  	arch_end_context_switch(next_p);
593  
594  	/* Switch DS and ES.
595  	 *
596  	 * Reading them only returns the selectors, but writing them (if
597  	 * nonzero) loads the full descriptor from the GDT or LDT.  The
598  	 * LDT for next is loaded in switch_mm, and the GDT is loaded
599  	 * above.
600  	 *
601  	 * We therefore need to write new values to the segment
602  	 * registers on every context switch unless both the new and old
603  	 * values are zero.
604  	 *
605  	 * Note that we don't need to do anything for CS and SS, as
606  	 * those are saved and restored as part of pt_regs.
607  	 */
608  	savesegment(es, prev->es);
609  	if (unlikely(next->es | prev->es))
610  		loadsegment(es, next->es);
611  
612  	savesegment(ds, prev->ds);
613  	if (unlikely(next->ds | prev->ds))
614  		loadsegment(ds, next->ds);
615  
616  	x86_fsgsbase_load(prev, next);
617  
618  	x86_pkru_load(prev, next);
619  
620  	/*
621  	 * Switch the PDA and FPU contexts.
622  	 */
623  	raw_cpu_write(pcpu_hot.current_task, next_p);
624  	raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p));
625  
626  	switch_fpu_finish();
627  
628  	/* Reload sp0. */
629  	update_task_stack(next_p);
630  
631  	switch_to_extra(prev_p, next_p);
632  
633  	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
634  		/*
635  		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
636  		 * does not update the cached descriptor.  As a result, if we
637  		 * do SYSRET while SS is NULL, we'll end up in user mode with
638  		 * SS apparently equal to __USER_DS but actually unusable.
639  		 *
640  		 * The straightforward workaround would be to fix it up just
641  		 * before SYSRET, but that would slow down the system call
642  		 * fast paths.  Instead, we ensure that SS is never NULL in
643  		 * system call context.  We do this by replacing NULL SS
644  		 * selectors at every context switch.  SYSCALL sets up a valid
645  		 * SS, so the only way to get NULL is to re-enter the kernel
646  		 * from CPL 3 through an interrupt.  Since that can't happen
647  		 * in the same task as a running syscall, we are guaranteed to
648  		 * context switch between every interrupt vector entry and a
649  		 * subsequent SYSRET.
650  		 *
651  		 * We read SS first because SS reads are much faster than
652  		 * writes.  Out of caution, we force SS to __KERNEL_DS even if
653  		 * it previously had a different non-NULL value.
654  		 */
655  		unsigned short ss_sel;
656  		savesegment(ss, ss_sel);
657  		if (ss_sel != __KERNEL_DS)
658  			loadsegment(ss, __KERNEL_DS);
659  	}
660  
661  	/* Load the Intel cache allocation PQR MSR. */
662  	resctrl_sched_in(next_p);
663  
664  	return prev_p;
665  }
666  
667  void set_personality_64bit(void)
668  {
669  	/* inherit personality from parent */
670  
671  	/* Make sure to be in 64bit mode */
672  	clear_thread_flag(TIF_ADDR32);
673  	/* Pretend that this comes from a 64bit execve */
674  	task_pt_regs(current)->orig_ax = __NR_execve;
675  	current_thread_info()->status &= ~TS_COMPAT;
676  	if (current->mm)
677  		__set_bit(MM_CONTEXT_HAS_VSYSCALL, &current->mm->context.flags);
678  
679  	/* TBD: overwrites user setup. Should have two bits.
680  	   But 64bit processes have always behaved this way,
681  	   so it's not too bad. The main problem is just that
682  	   32bit children are affected again. */
683  	current->personality &= ~READ_IMPLIES_EXEC;
684  }
685  
686  static void __set_personality_x32(void)
687  {
688  #ifdef CONFIG_X86_X32_ABI
689  	if (current->mm)
690  		current->mm->context.flags = 0;
691  
692  	current->personality &= ~READ_IMPLIES_EXEC;
693  	/*
694  	 * in_32bit_syscall() uses the presence of the x32 syscall bit
695  	 * flag to determine compat status.  The x86 mmap() code relies on
696  	 * the syscall bitness so set x32 syscall bit right here to make
697  	 * in_32bit_syscall() work during exec().
698  	 *
699  	 * Pretend to come from a x32 execve.
700  	 */
701  	task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
702  	current_thread_info()->status &= ~TS_COMPAT;
703  #endif
704  }
705  
706  static void __set_personality_ia32(void)
707  {
708  #ifdef CONFIG_IA32_EMULATION
709  	if (current->mm) {
710  		/*
711  		 * uprobes applied to this MM need to know this and
712  		 * cannot use user_64bit_mode() at that time.
713  		 */
714  		__set_bit(MM_CONTEXT_UPROBE_IA32, &current->mm->context.flags);
715  	}
716  
717  	current->personality |= force_personality32;
718  	/* Prepare the first "return" to user space */
719  	task_pt_regs(current)->orig_ax = __NR_ia32_execve;
720  	current_thread_info()->status |= TS_COMPAT;
721  #endif
722  }
723  
724  void set_personality_ia32(bool x32)
725  {
726  	/* Make sure to be in 32bit mode */
727  	set_thread_flag(TIF_ADDR32);
728  
729  	if (x32)
730  		__set_personality_x32();
731  	else
732  		__set_personality_ia32();
733  }
734  EXPORT_SYMBOL_GPL(set_personality_ia32);
735  
736  #ifdef CONFIG_CHECKPOINT_RESTORE
737  static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
738  {
739  	int ret;
740  
741  	ret = map_vdso_once(image, addr);
742  	if (ret)
743  		return ret;
744  
745  	return (long)image->size;
746  }
747  #endif
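
/*
 * ARCH_MAP_VDSO_* exists for checkpoint/restore tools (e.g. CRIU) that need
 * to re-create a vDSO mapping at a chosen address.  A minimal user-space
 * sketch, not part of this file (assumes <asm/prctl.h> from the installed
 * kernel headers; the target address is a hypothetical illustration, and a
 * process that already has a vDSO will get -EEXIST):
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned long addr = 0x700000000000UL;	// hypothetical
 *		long size = syscall(SYS_arch_prctl, ARCH_MAP_VDSO_64, addr);
 *
 *		if (size < 0)
 *			perror("ARCH_MAP_VDSO_64");
 *		else
 *			printf("vDSO mapped, image size %ld bytes\n", size);
 *		return 0;
 *	}
 */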
748  
749  #ifdef CONFIG_ADDRESS_MASKING
750  
751  #define LAM_U57_BITS 6
752  
753  static void enable_lam_func(void *__mm)
754  {
755  	struct mm_struct *mm = __mm;
756  
757  	if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) {
758  		write_cr3(__read_cr3() | mm->context.lam_cr3_mask);
759  		set_tlbstate_lam_mode(mm);
760  	}
761  }
762  
763  static void mm_enable_lam(struct mm_struct *mm)
764  {
765  	/*
766  	 * Even though the process must still be single-threaded at this
767  	 * point, kernel threads may be using the mm.  IPI those kernel
768  	 * threads if they exist.
769  	 */
770  	on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true);
771  	set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
772  }
773  
774  static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
775  {
776  	if (!cpu_feature_enabled(X86_FEATURE_LAM))
777  		return -ENODEV;
778  
779  	/* PTRACE_ARCH_PRCTL */
780  	if (current->mm != mm)
781  		return -EINVAL;
782  
783  	if (mm_valid_pasid(mm) &&
784  	    !test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags))
785  		return -EINVAL;
786  
787  	if (mmap_write_lock_killable(mm))
788  		return -EINTR;
789  
790  	/*
791  	 * MM_CONTEXT_LOCK_LAM is set on clone.  Prevent LAM from
792  	 * being enabled unless the process is single threaded:
793  	 */
794  	if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) {
795  		mmap_write_unlock(mm);
796  		return -EBUSY;
797  	}
798  
799  	if (!nr_bits) {
800  		mmap_write_unlock(mm);
801  		return -EINVAL;
802  	} else if (nr_bits <= LAM_U57_BITS) {
803  		mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
804  		mm->context.untag_mask =  ~GENMASK(62, 57);
805  	} else {
806  		mmap_write_unlock(mm);
807  		return -EINVAL;
808  	}
809  
810  	mm_enable_lam(mm);
811  
812  	mmap_write_unlock(mm);
813  
814  	return 0;
815  }
816  #endif
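
/*
 * A minimal user-space sketch of the LAM_U57 flow implemented above, not
 * part of this file (assumes a LAM-capable CPU, <asm/prctl.h> exporting the
 * ARCH_* tagging constants, and that the call is made while the process is
 * still single-threaded):
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <stdlib.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned long bits = 0, mask = 0;
 *
 *		syscall(SYS_arch_prctl, ARCH_GET_MAX_TAG_BITS, &bits);
 *		if (syscall(SYS_arch_prctl, ARCH_ENABLE_TAGGED_ADDR, bits))
 *			return 1;	// no LAM, multi-threaded, ...
 *		syscall(SYS_arch_prctl, ARCH_GET_UNTAG_MASK, &mask);
 *
 *		// With LAM_U57, bits 62:57 of a pointer are ignored by the
 *		// hardware on dereference.
 *		unsigned long *p = malloc(sizeof(*p));
 *		unsigned long *tagged =
 *			(void *)((unsigned long)p | (0x2aUL << 57));
 *
 *		*tagged = 42;
 *		printf("untag mask %#lx, value %lu\n", mask, *p);
 *		free(p);
 *		return 0;
 *	}
 */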
817  
818  long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
819  {
820  	int ret = 0;
821  
822  	switch (option) {
823  	case ARCH_SET_GS: {
824  		if (unlikely(arg2 >= TASK_SIZE_MAX))
825  			return -EPERM;
826  
827  		preempt_disable();
828  		/*
829  		 * ARCH_SET_GS has always overwritten the index
830  		 * and the base. Zero is the most sensible value
831  		 * to put in the index, and is the only value that
832  		 * makes any sense if FSGSBASE is unavailable.
833  		 */
834  		if (task == current) {
835  			loadseg(GS, 0);
836  			x86_gsbase_write_cpu_inactive(arg2);
837  
838  			/*
839  			 * On non-FSGSBASE systems, save_base_legacy() expects
840  			 * that we also fill in thread.gsbase.
841  			 */
842  			task->thread.gsbase = arg2;
843  
844  		} else {
845  			task->thread.gsindex = 0;
846  			x86_gsbase_write_task(task, arg2);
847  		}
848  		preempt_enable();
849  		break;
850  	}
851  	case ARCH_SET_FS: {
852  		/*
853  		 * Not strictly needed for %fs, but do it for symmetry
854  		 * with %gs
855  		 */
856  		if (unlikely(arg2 >= TASK_SIZE_MAX))
857  			return -EPERM;
858  
859  		preempt_disable();
860  		/*
861  		 * Set the selector to 0 for the same reason
862  		 * as %gs above.
863  		 */
864  		if (task == current) {
865  			loadseg(FS, 0);
866  			x86_fsbase_write_cpu(arg2);
867  
868  			/*
869  			 * On non-FSGSBASE systems, save_base_legacy() expects
870  			 * that we also fill in thread.fsbase.
871  			 */
872  			task->thread.fsbase = arg2;
873  		} else {
874  			task->thread.fsindex = 0;
875  			x86_fsbase_write_task(task, arg2);
876  		}
877  		preempt_enable();
878  		break;
879  	}
880  	case ARCH_GET_FS: {
881  		unsigned long base = x86_fsbase_read_task(task);
882  
883  		ret = put_user(base, (unsigned long __user *)arg2);
884  		break;
885  	}
886  	case ARCH_GET_GS: {
887  		unsigned long base = x86_gsbase_read_task(task);
888  
889  		ret = put_user(base, (unsigned long __user *)arg2);
890  		break;
891  	}
892  
893  #ifdef CONFIG_CHECKPOINT_RESTORE
894  # ifdef CONFIG_X86_X32_ABI
895  	case ARCH_MAP_VDSO_X32:
896  		return prctl_map_vdso(&vdso_image_x32, arg2);
897  # endif
898  # if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
899  	case ARCH_MAP_VDSO_32:
900  		return prctl_map_vdso(&vdso_image_32, arg2);
901  # endif
902  	case ARCH_MAP_VDSO_64:
903  		return prctl_map_vdso(&vdso_image_64, arg2);
904  #endif
905  #ifdef CONFIG_ADDRESS_MASKING
906  	case ARCH_GET_UNTAG_MASK:
907  		return put_user(task->mm->context.untag_mask,
908  				(unsigned long __user *)arg2);
909  	case ARCH_ENABLE_TAGGED_ADDR:
910  		return prctl_enable_tagged_addr(task->mm, arg2);
911  	case ARCH_FORCE_TAGGED_SVA:
912  		if (current != task)
913  			return -EINVAL;
914  		set_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &task->mm->context.flags);
915  		return 0;
916  	case ARCH_GET_MAX_TAG_BITS:
917  		if (!cpu_feature_enabled(X86_FEATURE_LAM))
918  			return put_user(0, (unsigned long __user *)arg2);
919  		else
920  			return put_user(LAM_U57_BITS, (unsigned long __user *)arg2);
921  #endif
922  	case ARCH_SHSTK_ENABLE:
923  	case ARCH_SHSTK_DISABLE:
924  	case ARCH_SHSTK_LOCK:
925  	case ARCH_SHSTK_UNLOCK:
926  	case ARCH_SHSTK_STATUS:
927  		return shstk_prctl(task, option, arg2);
928  	default:
929  		ret = -EINVAL;
930  		break;
931  	}
932  
933  	return ret;
934  }
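
/*
 * The classic consumers of ARCH_SET_FS/ARCH_SET_GS are threading libraries
 * and emulators that need a custom segment base.  A minimal user-space
 * sketch, not part of this file (assumes <asm/prctl.h> from the installed
 * kernel headers; GS is used here because glibc keeps its own TLS in FS):
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned long scratch[16];	// stand-in per-thread block
 *		unsigned long gsbase = 0;
 *
 *		syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)scratch);
 *		syscall(SYS_arch_prctl, ARCH_GET_GS, &gsbase);
 *		printf("GS base now %#lx (expected %p)\n", gsbase, scratch);
 *		return 0;
 *	}
 */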
935  
936  SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
937  {
938  	long ret;
939  
940  	ret = do_arch_prctl_64(current, option, arg2);
941  	if (ret == -EINVAL)
942  		ret = do_arch_prctl_common(option, arg2);
943  
944  	return ret;
945  }
946  
947  #ifdef CONFIG_IA32_EMULATION
948  COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
949  {
950  	return do_arch_prctl_common(option, arg2);
951  }
952  #endif
953  
954  unsigned long KSTK_ESP(struct task_struct *task)
955  {
956  	return task_pt_regs(task)->sp;
957  }
958