xref: /openbmc/linux/arch/x86/kernel/process_64.c (revision ac8b6f14)
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/syscalls.h>

#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/fpu/internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/xen/hypervisor.h>
#include <asm/vdso.h>
#include <asm/intel_rdt_sched.h>
#include <asm/unistd.h>
#include <asm/fsgsbase.h>
#ifdef CONFIG_IA32_EMULATION
/* Not included via unistd.h */
#include <asm/unistd_32_ia32.h>
#endif

#include "process.h"

/* Also prints some state that isn't saved in pt_regs. */
void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	show_iret_regs(regs);

	if (regs->orig_ax != -1)
		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
	else
		pr_cont("\n");

	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	if (mode == SHOW_REGS_SHORT)
		return;

	if (mode == SHOW_REGS_USER) {
		rdmsrl(MSR_FS_BASE, fs);
		rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
		printk(KERN_DEFAULT "FS:  %016lx GS:  %016lx\n",
		       fs, shadowgs);
		return;
	}

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = __read_cr3();
	cr4 = __read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);

	/* Only print out debug registers if they are in their non-default state. */
	if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
	    (d6 == DR6_RESERVED) && (d7 == 0x400))) {
		printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n",
		       d0, d1, d2);
		printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n",
		       d3, d6, d7);
	}

	if (boot_cpu_has(X86_FEATURE_OSPKE))
		printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
#ifdef CONFIG_MODIFY_LDT_SYSCALL
		if (dead_task->mm->context.ldt) {
			pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
				dead_task->comm,
				dead_task->mm->context.ldt->entries,
				dead_task->mm->context.ldt->nr_entries);
			BUG();
		}
#endif
	}
}

enum which_selector {
	FS,
	GS
};

/*
 * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
 * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
 * It's forcibly inlined because it'll generate better code and this function
 * is hot.
 */
static __always_inline void save_base_legacy(struct task_struct *prev_p,
					     unsigned short selector,
					     enum which_selector which)
{
	if (likely(selector == 0)) {
		/*
		 * On Intel (without X86_BUG_NULL_SEG), the segment base could
		 * be the pre-existing saved base or it could be zero.  On AMD
		 * (with X86_BUG_NULL_SEG), the segment base could be almost
		 * anything.
		 *
		 * This branch is very hot (it's hit twice on almost every
		 * context switch between 64-bit programs), and avoiding
		 * the RDMSR helps a lot, so we just assume that whatever
		 * value is already saved is correct.  This matches historical
		 * Linux behavior, so it won't break existing applications.
		 *
		 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
		 * report that the base is zero, it needs to actually be zero:
		 * see the corresponding logic in load_seg_legacy.
		 */
	} else {
		/*
		 * If the selector is 1, 2, or 3, then the base is zero on
		 * !X86_BUG_NULL_SEG CPUs and could be anything on
		 * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
		 * has never attempted to preserve the base across context
		 * switches.
		 *
		 * If selector > 3, then it refers to a real segment, and
		 * saving the base isn't necessary.
		 */
		if (which == FS)
			prev_p->thread.fsbase = 0;
		else
			prev_p->thread.gsbase = 0;
	}
}
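
/*
 * A worked example of the cases above, as a sketch for a typical 64-bit
 * glibc program that set its TLS base with arch_prctl(ARCH_SET_FS, ...):
 *
 *  - fsindex == 0, thread.fsbase == <TLS base>: the hot path is taken and
 *    thread.fsbase is left alone; the previously saved value is assumed to
 *    still be correct.
 *
 *  - fsindex in {1, 2, 3} (a NULL selector with a nonzero RPL): the saved
 *    base is forced to 0 so that later loads can rely on it.
 *
 *  - fsindex > 3 (a GDT/LDT descriptor is in use): thread.fsbase is likewise
 *    cleared; the real base lives in the descriptor and is recovered by
 *    reloading the selector on switch-in (see load_seg_legacy()).
 */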

static __always_inline void save_fsgs(struct task_struct *task)
{
	savesegment(fs, task->thread.fsindex);
	savesegment(gs, task->thread.gsindex);
	save_base_legacy(task, task->thread.fsindex, FS);
	save_base_legacy(task, task->thread.gsindex, GS);
}

#if IS_ENABLED(CONFIG_KVM)
/*
 * While a process is running, current->thread.fsbase and current->thread.gsbase
 * may not match the corresponding CPU registers (see save_base_legacy()). KVM
 * wants an efficient way to save and restore FSBASE and GSBASE.
 * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE.
 */
void save_fsgs_for_kvm(void)
{
	save_fsgs(current);
}
EXPORT_SYMBOL_GPL(save_fsgs_for_kvm);
#endif

static __always_inline void loadseg(enum which_selector which,
				    unsigned short sel)
{
	if (which == FS)
		loadsegment(fs, sel);
	else
		load_gs_index(sel);
}

static __always_inline void load_seg_legacy(unsigned short prev_index,
					    unsigned long prev_base,
					    unsigned short next_index,
					    unsigned long next_base,
					    enum which_selector which)
{
	if (likely(next_index <= 3)) {
		/*
		 * The next task is using 64-bit TLS, is not using this
		 * segment at all, or is having fun with arcane CPU features.
		 */
		if (next_base == 0) {
			/*
			 * Nasty case: on AMD CPUs, we need to forcibly zero
			 * the base.
			 */
			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
				loadseg(which, __USER_DS);
				loadseg(which, next_index);
			} else {
				/*
				 * We could try to exhaustively detect cases
				 * under which we can skip the segment load,
				 * but there's really only one case that matters
				 * for performance: if both the previous and
				 * next states are fully zeroed, we can skip
				 * the load.
				 *
				 * (This assumes that prev_base == 0 has no
				 * false positives.  This is the case on
				 * Intel-style CPUs.)
				 */
				if (likely(prev_index | next_index | prev_base))
					loadseg(which, next_index);
			}
		} else {
			if (prev_index != next_index)
				loadseg(which, next_index);
			wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
			       next_base);
		}
	} else {
		/*
		 * The next task is using a real segment.  Loading the selector
		 * is sufficient.
		 */
		loadseg(which, next_index);
	}
}
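
/*
 * A few concrete transitions through load_seg_legacy(), as a sketch of the
 * branches above (indexes and bases are the saved thread values):
 *
 *  - prev_index == 0, prev_base == 0, next_index == 0, next_base == 0:
 *    everything is already zero, so on !X86_BUG_NULL_SEG CPUs the load is
 *    skipped entirely (the common 64-bit-to-64-bit switch).
 *
 *  - prev_index == 0, prev_base == 0, next_index == 0, next_base set to some
 *    user TLS address: the selector already matches, so only the WRMSR of
 *    the base runs.
 *
 *  - next_index == 0x63 (a TLS descriptor in the GDT): next_index > 3, so
 *    loading the selector alone pulls the base back out of the descriptor.
 */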

static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
					      struct thread_struct *next)
{
	load_seg_legacy(prev->fsindex, prev->fsbase,
			next->fsindex, next->fsbase, FS);
	load_seg_legacy(prev->gsindex, prev->gsbase,
			next->gsindex, next->gsbase, GS);
}

static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
					    unsigned short selector)
{
	unsigned short idx = selector >> 3;
	unsigned long base;

	if (likely((selector & SEGMENT_TI_MASK) == 0)) {
		if (unlikely(idx >= GDT_ENTRIES))
			return 0;

		/*
		 * There are no user segments in the GDT with nonzero bases
		 * other than the TLS segments.
		 */
		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
			return 0;

		idx -= GDT_ENTRY_TLS_MIN;
		base = get_desc_base(&task->thread.tls_array[idx]);
	} else {
#ifdef CONFIG_MODIFY_LDT_SYSCALL
		struct ldt_struct *ldt;

		/*
		 * If performance here mattered, we could protect the LDT
		 * with RCU.  This is a slow path, though, so we can just
		 * take the mutex.
		 */
		mutex_lock(&task->mm->context.lock);
		ldt = task->mm->context.ldt;
		if (unlikely(idx >= ldt->nr_entries))
			base = 0;
		else
			base = get_desc_base(ldt->entries + idx);
		mutex_unlock(&task->mm->context.lock);
#else
		base = 0;
#endif
	}

	return base;
}
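
/*
 * Worked example of the selector decoding above, assuming a 64-bit GDT
 * layout with GDT_ENTRY_TLS_MIN == 12: a TLS selector such as 0x63 splits
 * into index = 0x63 >> 3 = 12, TI = 0 (bit 2 clear, so the GDT is used) and
 * RPL = 3, which maps to tls_array[0].  A selector with the TI bit set
 * (e.g. 0x07, index 0 in the LDT) takes the LDT path instead.
 */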

void x86_fsbase_write_cpu(unsigned long fsbase)
{
	/*
	 * Set the selector to 0 to note that the segment base has been
	 * overwritten; the context-switch code checks for this when deciding
	 * whether the segment load can be skipped.
	 */
	loadseg(FS, 0);
	wrmsrl(MSR_FS_BASE, fsbase);
}

void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
{
	/* Set the selector to 0 for the same reason as %fs above. */
	loadseg(GS, 0);
	wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
}

unsigned long x86_fsbase_read_task(struct task_struct *task)
{
	unsigned long fsbase;

	if (task == current)
		fsbase = x86_fsbase_read_cpu();
	else if (task->thread.fsindex == 0)
		fsbase = task->thread.fsbase;
	else
		fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);

	return fsbase;
}

unsigned long x86_gsbase_read_task(struct task_struct *task)
{
	unsigned long gsbase;

	if (task == current)
		gsbase = x86_gsbase_read_cpu_inactive();
	else if (task->thread.gsindex == 0)
		gsbase = task->thread.gsbase;
	else
		gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);

	return gsbase;
}

int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
{
	/*
	 * Not strictly needed for %fs, but do it for symmetry
	 * with %gs.
	 */
	if (unlikely(fsbase >= TASK_SIZE_MAX))
		return -EPERM;

	preempt_disable();
	task->thread.fsbase = fsbase;
	if (task == current)
		x86_fsbase_write_cpu(fsbase);
	task->thread.fsindex = 0;
	preempt_enable();

	return 0;
}

int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
{
	if (unlikely(gsbase >= TASK_SIZE_MAX))
		return -EPERM;

	preempt_disable();
	task->thread.gsbase = gsbase;
	if (task == current)
		x86_gsbase_write_cpu_inactive(gsbase);
	task->thread.gsindex = 0;
	preempt_enable();

	return 0;
}

int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
		unsigned long arg, struct task_struct *p, unsigned long tls)
{
	int err;
	struct pt_regs *childregs;
	struct fork_frame *fork_frame;
	struct inactive_task_frame *frame;
	struct task_struct *me = current;

	childregs = task_pt_regs(p);
	fork_frame = container_of(childregs, struct fork_frame, regs);
	frame = &fork_frame->frame;
	frame->bp = 0;
	frame->ret_addr = (unsigned long) ret_from_fork;
	p->thread.sp = (unsigned long) fork_frame;
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase;
	savesegment(fs, p->thread.fsindex);
	p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(p->flags & PF_KTHREAD)) {
		/* kernel thread */
		memset(childregs, 0, sizeof(struct pt_regs));
		frame->bx = sp;		/* function */
		frame->r12 = arg;
		return 0;
	}
	frame->bx = 0;
	*childregs = *current_pt_regs();

	childregs->ax = 0;
	if (sp)
		childregs->sp = sp;

	err = -ENOMEM;
	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
						  IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (in_ia32_syscall())
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)tls, 0);
		else
#endif
			err = do_arch_prctl_64(p, ARCH_SET_FS, tls);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}
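
/*
 * To sketch the two paths above: for a user fork()/clone(), the child gets a
 * copy of the parent's pt_regs with ax forced to 0, which is why fork()
 * returns 0 in the child.  For a kernel thread, ret_from_fork (entry_64.S)
 * sees the non-zero bx and calls the function in bx with r12 as its argument
 * instead of returning to user mode.
 */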

static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	WARN_ON_ONCE(regs != current_pt_regs());

	if (static_cpu_has(X86_BUG_NULL_SEG)) {
		/* Loading zero below won't clear the base. */
		loadsegment(fs, __USER_DS);
		load_gs_index(__USER_DS);
	}

	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);

	regs->ip		= new_ip;
	regs->sp		= new_sp;
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
	force_iret();
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}
EXPORT_SYMBOL_GPL(start_thread);

#ifdef CONFIG_COMPAT
void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    test_thread_flag(TIF_X32)
			    ? __USER_CS : __USER32_CS,
			    __USER_DS, __USER_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here. Set the probe on schedule instead.
 * The function graph tracer is not supported either.
 */
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	struct fpu *prev_fpu = &prev->fpu;
	struct fpu *next_fpu = &next->fpu;
	int cpu = smp_processor_id();

	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
		     this_cpu_read(irq_count) != -1);

	switch_fpu_prepare(prev_fpu, cpu);

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	save_fsgs(prev_p);

	/*
	 * Load TLS before restoring any segments so that segment loads
	 * reference the correct GDT entries.
	 */
	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.  This
	 * must be done after loading TLS entries in the GDT but before
	 * loading segments that might reference them, and it must
	 * be done before fpu__restore(), so the TS bit is up to
	 * date.
	 */
	arch_end_context_switch(next_p);

	/* Switch DS and ES.
	 *
	 * Reading them only returns the selectors, but writing them (if
	 * nonzero) loads the full descriptor from the GDT or LDT.  The
	 * LDT for next is loaded in switch_mm, and the GDT is loaded
	 * above.
	 *
	 * We therefore need to write new values to the segment
	 * registers on every context switch unless both the new and old
	 * values are zero.
	 *
	 * Note that we don't need to do anything for CS and SS, as
	 * those are saved and restored as part of pt_regs.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	x86_fsgsbase_load(prev, next);

	switch_fpu_finish(next_fpu, cpu);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	this_cpu_write(current_task, next_p);
	this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));

	/* Reload sp0. */
	update_task_stack(next_p);

	switch_to_extra(prev_p, next_p);

#ifdef CONFIG_XEN_PV
	/*
	 * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
	 * current_pt_regs()->flags may not match the current task's
	 * intended IOPL.  We need to switch it manually.
	 */
	if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
		     prev->iopl != next->iopl))
		xen_set_iopl_mask(next->iopl);
#endif

	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
		/*
		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
		 * does not update the cached descriptor.  As a result, if we
		 * do SYSRET while SS is NULL, we'll end up in user mode with
		 * SS apparently equal to __USER_DS but actually unusable.
		 *
		 * The straightforward workaround would be to fix it up just
		 * before SYSRET, but that would slow down the system call
		 * fast paths.  Instead, we ensure that SS is never NULL in
		 * system call context.  We do this by replacing NULL SS
		 * selectors at every context switch.  SYSCALL sets up a valid
		 * SS, so the only way to get NULL is to re-enter the kernel
		 * from CPL 3 through an interrupt.  Since that can't happen
		 * in the same task as a running syscall, we are guaranteed to
		 * context switch between every interrupt vector entry and a
		 * subsequent SYSRET.
		 *
		 * We read SS first because SS reads are much faster than
		 * writes.  Out of caution, we force SS to __KERNEL_DS even if
		 * it previously had a different non-NULL value.
		 */
		unsigned short ss_sel;
		savesegment(ss, ss_sel);
		if (ss_sel != __KERNEL_DS)
			loadsegment(ss, __KERNEL_DS);
	}

	/* Load the Intel cache allocation PQR MSR. */
	intel_rdt_sched_in();

	return prev_p;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);
	clear_thread_flag(TIF_ADDR32);
	clear_thread_flag(TIF_X32);
	/* Pretend that this comes from a 64bit execve */
	task_pt_regs(current)->orig_ax = __NR_execve;
	current_thread_info()->status &= ~TS_COMPAT;

	/* Ensure the corresponding mm is not marked. */
	if (current->mm)
		current->mm->context.ia32_compat = 0;

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

static void __set_personality_x32(void)
{
#ifdef CONFIG_X86_X32
	clear_thread_flag(TIF_IA32);
	set_thread_flag(TIF_X32);
	if (current->mm)
		current->mm->context.ia32_compat = TIF_X32;
	current->personality &= ~READ_IMPLIES_EXEC;
	/*
	 * in_32bit_syscall() uses the presence of the x32 syscall bit
	 * flag to determine compat status.  The x86 mmap() code relies on
	 * the syscall bitness, so set the x32 syscall bit right here to make
	 * in_32bit_syscall() work during exec().
	 *
	 * Pretend to come from an x32 execve.
	 */
	task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
	current_thread_info()->status &= ~TS_COMPAT;
#endif
}

static void __set_personality_ia32(void)
{
#ifdef CONFIG_IA32_EMULATION
	set_thread_flag(TIF_IA32);
	clear_thread_flag(TIF_X32);
	if (current->mm)
		current->mm->context.ia32_compat = TIF_IA32;
	current->personality |= force_personality32;
	/* Prepare the first "return" to user space */
	task_pt_regs(current)->orig_ax = __NR_ia32_execve;
	current_thread_info()->status |= TS_COMPAT;
#endif
}

void set_personality_ia32(bool x32)
{
	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_ADDR32);

	if (x32)
		__set_personality_x32();
	else
		__set_personality_ia32();
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

#ifdef CONFIG_CHECKPOINT_RESTORE
static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
{
	int ret;

	ret = map_vdso_once(image, addr);
	if (ret)
		return ret;

	return (long)image->size;
}
#endif

long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
{
	int ret = 0;

	switch (option) {
	case ARCH_SET_GS: {
		ret = x86_gsbase_write_task(task, arg2);
		break;
	}
	case ARCH_SET_FS: {
		ret = x86_fsbase_write_task(task, arg2);
		break;
	}
	case ARCH_GET_FS: {
		unsigned long base = x86_fsbase_read_task(task);

		ret = put_user(base, (unsigned long __user *)arg2);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base = x86_gsbase_read_task(task);

		ret = put_user(base, (unsigned long __user *)arg2);
		break;
	}

#ifdef CONFIG_CHECKPOINT_RESTORE
# ifdef CONFIG_X86_X32_ABI
	case ARCH_MAP_VDSO_X32:
		return prctl_map_vdso(&vdso_image_x32, arg2);
# endif
# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
	case ARCH_MAP_VDSO_32:
		return prctl_map_vdso(&vdso_image_32, arg2);
# endif
	case ARCH_MAP_VDSO_64:
		return prctl_map_vdso(&vdso_image_64, arg2);
#endif

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
	long ret;

	ret = do_arch_prctl_64(current, option, arg2);
	if (ret == -EINVAL)
		ret = do_arch_prctl_common(current, option, arg2);

	return ret;
}
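
/*
 * A minimal user-space sketch of the interface implemented above (not part
 * of the kernel build).  It assumes glibc's syscall(2) wrapper and the
 * ARCH_{SET,GET}_{FS,GS} constants from <asm/prctl.h>:
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		static unsigned long scratch[16];
 *		unsigned long gsbase;
 *
 *		// Point this thread's GS base at a private buffer ...
 *		if (syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)scratch))
 *			perror("ARCH_SET_GS");
 *
 *		// ... and read it back through ARCH_GET_GS.
 *		if (syscall(SYS_arch_prctl, ARCH_GET_GS, &gsbase) == 0)
 *			printf("GS base: %#lx\n", gsbase);
 *		return 0;
 *	}
 */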

#ifdef CONFIG_IA32_EMULATION
COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
	return do_arch_prctl_common(current, option, arg2);
}
#endif

unsigned long KSTK_ESP(struct task_struct *task)
{
	return task_pt_regs(task)->sp;
}