xref: /openbmc/linux/arch/x86/kernel/process_64.c (revision e3d786a3)
1 /*
2  *  Copyright (C) 1995  Linus Torvalds
3  *
4  *  Pentium III FXSR, SSE support
5  *	Gareth Hughes <gareth@valinux.com>, May 2000
6  *
7  *  X86-64 port
8  *	Andi Kleen.
9  *
10  *	CPU hotplug support - ashok.raj@intel.com
11  */
12 
13 /*
 14  * This file handles the architecture-dependent parts of process handling.
15  */
16 
17 #include <linux/cpu.h>
18 #include <linux/errno.h>
19 #include <linux/sched.h>
20 #include <linux/sched/task.h>
21 #include <linux/sched/task_stack.h>
22 #include <linux/fs.h>
23 #include <linux/kernel.h>
24 #include <linux/mm.h>
25 #include <linux/elfcore.h>
26 #include <linux/smp.h>
27 #include <linux/slab.h>
28 #include <linux/user.h>
29 #include <linux/interrupt.h>
30 #include <linux/delay.h>
31 #include <linux/export.h>
32 #include <linux/ptrace.h>
33 #include <linux/notifier.h>
34 #include <linux/kprobes.h>
35 #include <linux/kdebug.h>
36 #include <linux/prctl.h>
37 #include <linux/uaccess.h>
38 #include <linux/io.h>
39 #include <linux/ftrace.h>
40 #include <linux/syscalls.h>
41 
42 #include <asm/pgtable.h>
43 #include <asm/processor.h>
44 #include <asm/fpu/internal.h>
45 #include <asm/mmu_context.h>
46 #include <asm/prctl.h>
47 #include <asm/desc.h>
48 #include <asm/proto.h>
49 #include <asm/ia32.h>
50 #include <asm/syscalls.h>
51 #include <asm/debugreg.h>
52 #include <asm/switch_to.h>
53 #include <asm/xen/hypervisor.h>
54 #include <asm/vdso.h>
55 #include <asm/intel_rdt_sched.h>
56 #include <asm/unistd.h>
57 #include <asm/fsgsbase.h>
58 #ifdef CONFIG_IA32_EMULATION
59 /* Not included via unistd.h */
60 #include <asm/unistd_32_ia32.h>
61 #endif
62 
 63 /* Also prints some state that isn't saved in pt_regs. */
64 void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
65 {
66 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
67 	unsigned long d0, d1, d2, d3, d6, d7;
68 	unsigned int fsindex, gsindex;
69 	unsigned int ds, cs, es;
70 
71 	show_iret_regs(regs);
72 
73 	if (regs->orig_ax != -1)
74 		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
75 	else
76 		pr_cont("\n");
77 
78 	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
79 	       regs->ax, regs->bx, regs->cx);
80 	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
81 	       regs->dx, regs->si, regs->di);
82 	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
83 	       regs->bp, regs->r8, regs->r9);
84 	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
85 	       regs->r10, regs->r11, regs->r12);
86 	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
87 	       regs->r13, regs->r14, regs->r15);
88 
89 	if (mode == SHOW_REGS_SHORT)
90 		return;
91 
92 	if (mode == SHOW_REGS_USER) {
93 		rdmsrl(MSR_FS_BASE, fs);
94 		rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
95 		printk(KERN_DEFAULT "FS:  %016lx GS:  %016lx\n",
96 		       fs, shadowgs);
97 		return;
98 	}
99 
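	/* Read the live selectors; the corresponding bases come from MSRs. */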
100 	asm("movl %%ds,%0" : "=r" (ds));
101 	asm("movl %%cs,%0" : "=r" (cs));
102 	asm("movl %%es,%0" : "=r" (es));
103 	asm("movl %%fs,%0" : "=r" (fsindex));
104 	asm("movl %%gs,%0" : "=r" (gsindex));
105 
106 	rdmsrl(MSR_FS_BASE, fs);
107 	rdmsrl(MSR_GS_BASE, gs);
108 	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
109 
110 	cr0 = read_cr0();
111 	cr2 = read_cr2();
112 	cr3 = __read_cr3();
113 	cr4 = __read_cr4();
114 
115 	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
116 	       fs, fsindex, gs, gsindex, shadowgs);
117 	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
118 			es, cr0);
119 	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
120 			cr4);
121 
122 	get_debugreg(d0, 0);
123 	get_debugreg(d1, 1);
124 	get_debugreg(d2, 2);
125 	get_debugreg(d3, 3);
126 	get_debugreg(d6, 6);
127 	get_debugreg(d7, 7);
128 
129 	/* Only print out debug registers if they are in their non-default state. */
130 	if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
131 	    (d6 == DR6_RESERVED) && (d7 == 0x400))) {
132 		printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n",
133 		       d0, d1, d2);
134 		printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n",
135 		       d3, d6, d7);
136 	}
137 
138 	if (boot_cpu_has(X86_FEATURE_OSPKE))
139 		printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
140 }
141 
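/*
 * Sanity check when a dead task is released: by this point the task's
 * LDT should be gone, so warn and BUG if one is still attached.
 */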
142 void release_thread(struct task_struct *dead_task)
143 {
144 	if (dead_task->mm) {
145 #ifdef CONFIG_MODIFY_LDT_SYSCALL
146 		if (dead_task->mm->context.ldt) {
147 			pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
148 				dead_task->comm,
149 				dead_task->mm->context.ldt->entries,
150 				dead_task->mm->context.ldt->nr_entries);
151 			BUG();
152 		}
153 #endif
154 	}
155 }
156 
157 enum which_selector {
158 	FS,
159 	GS
160 };
161 
162 /*
163  * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
164  * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
165  * It's forcibly inlined because it'll generate better code and this function
166  * is hot.
167  */
168 static __always_inline void save_base_legacy(struct task_struct *prev_p,
169 					     unsigned short selector,
170 					     enum which_selector which)
171 {
172 	if (likely(selector == 0)) {
173 		/*
174 		 * On Intel (without X86_BUG_NULL_SEG), the segment base could
175 		 * be the pre-existing saved base or it could be zero.  On AMD
176 		 * (with X86_BUG_NULL_SEG), the segment base could be almost
177 		 * anything.
178 		 *
179 		 * This branch is very hot (it's hit twice on almost every
180 		 * context switch between 64-bit programs), and avoiding
181 		 * the RDMSR helps a lot, so we just assume that whatever
182 		 * value is already saved is correct.  This matches historical
183 		 * Linux behavior, so it won't break existing applications.
184 		 *
185 		 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
186 		 * report that the base is zero, it needs to actually be zero:
187 		 * see the corresponding logic in load_seg_legacy.
188 		 */
189 	} else {
190 		/*
191 		 * If the selector is 1, 2, or 3, then the base is zero on
192 		 * !X86_BUG_NULL_SEG CPUs and could be anything on
193 		 * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
194 		 * has never attempted to preserve the base across context
195 		 * switches.
196 		 *
197 		 * If selector > 3, then it refers to a real segment, and
198 		 * saving the base isn't necessary.
199 		 */
200 		if (which == FS)
201 			prev_p->thread.fsbase = 0;
202 		else
203 			prev_p->thread.gsbase = 0;
204 	}
205 }
206 
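/*
 * Snapshot the current FS/GS selectors into the outgoing task and let
 * save_base_legacy() decide whether the saved bases must be zeroed.
 */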
207 static __always_inline void save_fsgs(struct task_struct *task)
208 {
209 	savesegment(fs, task->thread.fsindex);
210 	savesegment(gs, task->thread.gsindex);
211 	save_base_legacy(task, task->thread.fsindex, FS);
212 	save_base_legacy(task, task->thread.gsindex, GS);
213 }
214 
215 #if IS_ENABLED(CONFIG_KVM)
216 /*
 217  * While a process is running, current->thread.fsbase and current->thread.gsbase
218  * may not match the corresponding CPU registers (see save_base_legacy()). KVM
219  * wants an efficient way to save and restore FSBASE and GSBASE.
220  * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE.
221  */
222 void save_fsgs_for_kvm(void)
223 {
224 	save_fsgs(current);
225 }
226 EXPORT_SYMBOL_GPL(save_fsgs_for_kvm);
227 #endif
228 
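/*
 * Load a user FS or GS selector.  GS goes through load_gs_index() so
 * that the kernel's per-CPU GS base in MSR_GS_BASE is not clobbered.
 */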
229 static __always_inline void loadseg(enum which_selector which,
230 				    unsigned short sel)
231 {
232 	if (which == FS)
233 		loadsegment(fs, sel);
234 	else
235 		load_gs_index(sel);
236 }
237 
238 static __always_inline void load_seg_legacy(unsigned short prev_index,
239 					    unsigned long prev_base,
240 					    unsigned short next_index,
241 					    unsigned long next_base,
242 					    enum which_selector which)
243 {
244 	if (likely(next_index <= 3)) {
245 		/*
246 		 * The next task is using 64-bit TLS, is not using this
247 		 * segment at all, or is having fun with arcane CPU features.
248 		 */
249 		if (next_base == 0) {
250 			/*
251 			 * Nasty case: on AMD CPUs, we need to forcibly zero
252 			 * the base.
253 			 */
254 			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
255 				loadseg(which, __USER_DS);
256 				loadseg(which, next_index);
257 			} else {
258 				/*
259 				 * We could try to exhaustively detect cases
260 				 * under which we can skip the segment load,
261 				 * but there's really only one case that matters
262 				 * for performance: if both the previous and
263 				 * next states are fully zeroed, we can skip
264 				 * the load.
265 				 *
266 				 * (This assumes that prev_base == 0 has no
267 				 * false positives.  This is the case on
268 				 * Intel-style CPUs.)
269 				 */
270 				if (likely(prev_index | next_index | prev_base))
271 					loadseg(which, next_index);
272 			}
273 		} else {
274 			if (prev_index != next_index)
275 				loadseg(which, next_index);
276 			wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
277 			       next_base);
278 		}
279 	} else {
280 		/*
281 		 * The next task is using a real segment.  Loading the selector
282 		 * is sufficient.
283 		 */
284 		loadseg(which, next_index);
285 	}
286 }
287 
288 static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
289 					      struct thread_struct *next)
290 {
291 	load_seg_legacy(prev->fsindex, prev->fsbase,
292 			next->fsindex, next->fsbase, FS);
293 	load_seg_legacy(prev->gsindex, prev->gsbase,
294 			next->gsindex, next->gsbase, GS);
295 }
296 
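/*
 * Resolve the base address that a nonzero FS or GS selector points at,
 * using either the task's TLS slots in the GDT or its LDT.
 */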
297 static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
298 					    unsigned short selector)
299 {
300 	unsigned short idx = selector >> 3;
301 	unsigned long base;
302 
303 	if (likely((selector & SEGMENT_TI_MASK) == 0)) {
304 		if (unlikely(idx >= GDT_ENTRIES))
305 			return 0;
306 
307 		/*
308 		 * There are no user segments in the GDT with nonzero bases
309 		 * other than the TLS segments.
310 		 */
311 		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
312 			return 0;
313 
314 		idx -= GDT_ENTRY_TLS_MIN;
315 		base = get_desc_base(&task->thread.tls_array[idx]);
316 	} else {
317 #ifdef CONFIG_MODIFY_LDT_SYSCALL
318 		struct ldt_struct *ldt;
319 
320 		/*
321 		 * If performance here mattered, we could protect the LDT
322 		 * with RCU.  This is a slow path, though, so we can just
323 		 * take the mutex.
324 		 */
325 		mutex_lock(&task->mm->context.lock);
326 		ldt = task->mm->context.ldt;
 327 		if (unlikely(!ldt || idx >= ldt->nr_entries))
328 			base = 0;
329 		else
330 			base = get_desc_base(ldt->entries + idx);
331 		mutex_unlock(&task->mm->context.lock);
332 #else
333 		base = 0;
334 #endif
335 	}
336 
337 	return base;
338 }
339 
340 void x86_fsbase_write_cpu(unsigned long fsbase)
341 {
342 	/*
 343 	 * Set the selector to 0 to record that the segment base has been
 344 	 * overwritten.  The context-switch code checks for a zero selector
 345 	 * when deciding whether the segment load can be skipped.
346 	 */
347 	loadseg(FS, 0);
348 	wrmsrl(MSR_FS_BASE, fsbase);
349 }
350 
351 void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
352 {
353 	/* Set the selector to 0 for the same reason as %fs above. */
354 	loadseg(GS, 0);
355 	wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
356 }
357 
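/*
 * Read a task's FS base.  For the current task the value is read
 * straight from the CPU; otherwise use the saved software copy, falling
 * back to a descriptor table lookup when a nonzero selector is in use.
 */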
358 unsigned long x86_fsbase_read_task(struct task_struct *task)
359 {
360 	unsigned long fsbase;
361 
362 	if (task == current)
363 		fsbase = x86_fsbase_read_cpu();
364 	else if (task->thread.fsindex == 0)
365 		fsbase = task->thread.fsbase;
366 	else
367 		fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
368 
369 	return fsbase;
370 }
371 
372 unsigned long x86_gsbase_read_task(struct task_struct *task)
373 {
374 	unsigned long gsbase;
375 
376 	if (task == current)
377 		gsbase = x86_gsbase_read_cpu_inactive();
378 	else if (task->thread.gsindex == 0)
379 		gsbase = task->thread.gsbase;
380 	else
381 		gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
382 
383 	return gsbase;
384 }
385 
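/*
 * Set a task's FS base and clear its FS selector so that the new base
 * takes effect at the next context switch (or immediately if the task
 * is current).
 */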
386 int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
387 {
388 	/*
389 	 * Not strictly needed for %fs, but do it for symmetry
 390 	 * with %gs.
391 	 */
392 	if (unlikely(fsbase >= TASK_SIZE_MAX))
393 		return -EPERM;
394 
395 	preempt_disable();
396 	task->thread.fsbase = fsbase;
397 	if (task == current)
398 		x86_fsbase_write_cpu(fsbase);
399 	task->thread.fsindex = 0;
400 	preempt_enable();
401 
402 	return 0;
403 }
404 
405 int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
406 {
407 	if (unlikely(gsbase >= TASK_SIZE_MAX))
408 		return -EPERM;
409 
410 	preempt_disable();
411 	task->thread.gsbase = gsbase;
412 	if (task == current)
413 		x86_gsbase_write_cpu_inactive(gsbase);
414 	task->thread.gsindex = 0;
415 	preempt_enable();
416 
417 	return 0;
418 }
419 
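/*
 * Set up the architecture-specific state of a freshly forked thread:
 * its kernel stack frame, segment state, an optional copy of the
 * parent's I/O bitmap and, for CLONE_SETTLS, its TLS base.
 */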
420 int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
421 		unsigned long arg, struct task_struct *p, unsigned long tls)
422 {
423 	int err;
424 	struct pt_regs *childregs;
425 	struct fork_frame *fork_frame;
426 	struct inactive_task_frame *frame;
427 	struct task_struct *me = current;
428 
429 	childregs = task_pt_regs(p);
430 	fork_frame = container_of(childregs, struct fork_frame, regs);
431 	frame = &fork_frame->frame;
432 	frame->bp = 0;
433 	frame->ret_addr = (unsigned long) ret_from_fork;
434 	p->thread.sp = (unsigned long) fork_frame;
435 	p->thread.io_bitmap_ptr = NULL;
436 
437 	savesegment(gs, p->thread.gsindex);
438 	p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase;
439 	savesegment(fs, p->thread.fsindex);
440 	p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase;
441 	savesegment(es, p->thread.es);
442 	savesegment(ds, p->thread.ds);
443 	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
444 
445 	if (unlikely(p->flags & PF_KTHREAD)) {
446 		/* kernel thread */
447 		memset(childregs, 0, sizeof(struct pt_regs));
448 		frame->bx = sp;		/* function */
449 		frame->r12 = arg;
450 		return 0;
451 	}
452 	frame->bx = 0;
453 	*childregs = *current_pt_regs();
454 
455 	childregs->ax = 0;
456 	if (sp)
457 		childregs->sp = sp;
458 
459 	err = -ENOMEM;
460 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
461 		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
462 						  IO_BITMAP_BYTES, GFP_KERNEL);
463 		if (!p->thread.io_bitmap_ptr) {
464 			p->thread.io_bitmap_max = 0;
465 			return -ENOMEM;
466 		}
467 		set_tsk_thread_flag(p, TIF_IO_BITMAP);
468 	}
469 
470 	/*
471 	 * Set a new TLS for the child thread?
472 	 */
473 	if (clone_flags & CLONE_SETTLS) {
474 #ifdef CONFIG_IA32_EMULATION
475 		if (in_ia32_syscall())
476 			err = do_set_thread_area(p, -1,
477 				(struct user_desc __user *)tls, 0);
478 		else
479 #endif
480 			err = do_arch_prctl_64(p, ARCH_SET_FS, tls);
481 		if (err)
482 			goto out;
483 	}
484 	err = 0;
485 out:
486 	if (err && p->thread.io_bitmap_ptr) {
487 		kfree(p->thread.io_bitmap_ptr);
488 		p->thread.io_bitmap_max = 0;
489 	}
490 
491 	return err;
492 }
493 
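/*
 * Reset the register and segment state of the current task for entry
 * into a freshly exec'd user program.
 */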
494 static void
495 start_thread_common(struct pt_regs *regs, unsigned long new_ip,
496 		    unsigned long new_sp,
497 		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
498 {
499 	WARN_ON_ONCE(regs != current_pt_regs());
500 
501 	if (static_cpu_has(X86_BUG_NULL_SEG)) {
502 		/* Loading zero below won't clear the base. */
503 		loadsegment(fs, __USER_DS);
504 		load_gs_index(__USER_DS);
505 	}
506 
507 	loadsegment(fs, 0);
508 	loadsegment(es, _ds);
509 	loadsegment(ds, _ds);
510 	load_gs_index(0);
511 
512 	regs->ip		= new_ip;
513 	regs->sp		= new_sp;
514 	regs->cs		= _cs;
515 	regs->ss		= _ss;
516 	regs->flags		= X86_EFLAGS_IF;
517 	force_iret();
518 }
519 
520 void
521 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
522 {
523 	start_thread_common(regs, new_ip, new_sp,
524 			    __USER_CS, __USER_DS, 0);
525 }
526 EXPORT_SYMBOL_GPL(start_thread);
527 
528 #ifdef CONFIG_COMPAT
529 void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
530 {
531 	start_thread_common(regs, new_ip, new_sp,
532 			    test_thread_flag(TIF_X32)
533 			    ? __USER_CS : __USER32_CS,
534 			    __USER_DS, __USER_DS);
535 }
536 #endif
537 
538 /*
539  *	switch_to(x,y) should switch tasks from x to y.
540  *
541  * This could still be optimized:
542  * - fold all the options into a flag word and test it with a single test.
543  * - could test fs/gs bitsliced
544  *
545  * Kprobes not supported here. Set the probe on schedule instead.
 546  * The function graph tracer is not supported either.
547  */
548 __visible __notrace_funcgraph struct task_struct *
549 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
550 {
551 	struct thread_struct *prev = &prev_p->thread;
552 	struct thread_struct *next = &next_p->thread;
553 	struct fpu *prev_fpu = &prev->fpu;
554 	struct fpu *next_fpu = &next->fpu;
555 	int cpu = smp_processor_id();
556 	struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
557 
558 	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
559 		     this_cpu_read(irq_count) != -1);
560 
561 	switch_fpu_prepare(prev_fpu, cpu);
562 
563 	/* We must save %fs and %gs before load_TLS() because
564 	 * %fs and %gs may be cleared by load_TLS().
565 	 *
566 	 * (e.g. xen_load_tls())
567 	 */
568 	save_fsgs(prev_p);
569 
570 	/*
571 	 * Load TLS before restoring any segments so that segment loads
572 	 * reference the correct GDT entries.
573 	 */
574 	load_TLS(next, cpu);
575 
576 	/*
577 	 * Leave lazy mode, flushing any hypercalls made here.  This
578 	 * must be done after loading TLS entries in the GDT but before
 579 	 * loading segments that might reference them, and it must
580 	 * be done before fpu__restore(), so the TS bit is up to
581 	 * date.
582 	 */
583 	arch_end_context_switch(next_p);
584 
585 	/* Switch DS and ES.
586 	 *
587 	 * Reading them only returns the selectors, but writing them (if
588 	 * nonzero) loads the full descriptor from the GDT or LDT.  The
589 	 * LDT for next is loaded in switch_mm, and the GDT is loaded
590 	 * above.
591 	 *
592 	 * We therefore need to write new values to the segment
593 	 * registers on every context switch unless both the new and old
594 	 * values are zero.
595 	 *
596 	 * Note that we don't need to do anything for CS and SS, as
597 	 * those are saved and restored as part of pt_regs.
598 	 */
599 	savesegment(es, prev->es);
600 	if (unlikely(next->es | prev->es))
601 		loadsegment(es, next->es);
602 
603 	savesegment(ds, prev->ds);
604 	if (unlikely(next->ds | prev->ds))
605 		loadsegment(ds, next->ds);
606 
607 	x86_fsgsbase_load(prev, next);
608 
609 	switch_fpu_finish(next_fpu, cpu);
610 
611 	/*
 612 	 * Switch the per-CPU current task and top-of-stack pointers.
613 	 */
614 	this_cpu_write(current_task, next_p);
615 	this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
616 
617 	/* Reload sp0. */
618 	update_task_stack(next_p);
619 
620 	/*
621 	 * Now maybe reload the debug registers and handle I/O bitmaps
622 	 */
623 	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
624 		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
625 		__switch_to_xtra(prev_p, next_p, tss);
626 
627 #ifdef CONFIG_XEN_PV
628 	/*
629 	 * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
630 	 * current_pt_regs()->flags may not match the current task's
631 	 * intended IOPL.  We need to switch it manually.
632 	 */
633 	if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
634 		     prev->iopl != next->iopl))
635 		xen_set_iopl_mask(next->iopl);
636 #endif
637 
638 	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
639 		/*
640 		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
641 		 * does not update the cached descriptor.  As a result, if we
642 		 * do SYSRET while SS is NULL, we'll end up in user mode with
643 		 * SS apparently equal to __USER_DS but actually unusable.
644 		 *
645 		 * The straightforward workaround would be to fix it up just
646 		 * before SYSRET, but that would slow down the system call
647 		 * fast paths.  Instead, we ensure that SS is never NULL in
648 		 * system call context.  We do this by replacing NULL SS
649 		 * selectors at every context switch.  SYSCALL sets up a valid
650 		 * SS, so the only way to get NULL is to re-enter the kernel
651 		 * from CPL 3 through an interrupt.  Since that can't happen
652 		 * in the same task as a running syscall, we are guaranteed to
653 		 * context switch between every interrupt vector entry and a
654 		 * subsequent SYSRET.
655 		 *
656 		 * We read SS first because SS reads are much faster than
657 		 * writes.  Out of caution, we force SS to __KERNEL_DS even if
658 		 * it previously had a different non-NULL value.
659 		 */
660 		unsigned short ss_sel;
661 		savesegment(ss, ss_sel);
662 		if (ss_sel != __KERNEL_DS)
663 			loadsegment(ss, __KERNEL_DS);
664 	}
665 
666 	/* Load the Intel cache allocation PQR MSR. */
667 	intel_rdt_sched_in();
668 
669 	return prev_p;
670 }
671 
672 void set_personality_64bit(void)
673 {
674 	/* inherit personality from parent */
675 
676 	/* Make sure to be in 64bit mode */
677 	clear_thread_flag(TIF_IA32);
678 	clear_thread_flag(TIF_ADDR32);
679 	clear_thread_flag(TIF_X32);
680 	/* Pretend that this comes from a 64bit execve */
681 	task_pt_regs(current)->orig_ax = __NR_execve;
682 	current_thread_info()->status &= ~TS_COMPAT;
683 
684 	/* Ensure the corresponding mm is not marked. */
685 	if (current->mm)
686 		current->mm->context.ia32_compat = 0;
687 
 688 	/* TBD: this overwrites the user's setup. Should have two bits.
 689 	   But 64-bit processes have always behaved this way,
 690 	   so it's not too bad. The main problem is just that
 691 	   32-bit children are affected again. */
692 	current->personality &= ~READ_IMPLIES_EXEC;
693 }
694 
695 static void __set_personality_x32(void)
696 {
697 #ifdef CONFIG_X86_X32
698 	clear_thread_flag(TIF_IA32);
699 	set_thread_flag(TIF_X32);
700 	if (current->mm)
701 		current->mm->context.ia32_compat = TIF_X32;
702 	current->personality &= ~READ_IMPLIES_EXEC;
703 	/*
704 	 * in_32bit_syscall() uses the presence of the x32 syscall bit
705 	 * flag to determine compat status.  The x86 mmap() code relies on
706 	 * the syscall bitness so set x32 syscall bit right here to make
707 	 * in_32bit_syscall() work during exec().
708 	 *
 709 	 * Pretend to come from an x32 execve.
710 	 */
711 	task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
712 	current_thread_info()->status &= ~TS_COMPAT;
713 #endif
714 }
715 
716 static void __set_personality_ia32(void)
717 {
718 #ifdef CONFIG_IA32_EMULATION
719 	set_thread_flag(TIF_IA32);
720 	clear_thread_flag(TIF_X32);
721 	if (current->mm)
722 		current->mm->context.ia32_compat = TIF_IA32;
723 	current->personality |= force_personality32;
724 	/* Prepare the first "return" to user space */
725 	task_pt_regs(current)->orig_ax = __NR_ia32_execve;
726 	current_thread_info()->status |= TS_COMPAT;
727 #endif
728 }
729 
730 void set_personality_ia32(bool x32)
731 {
732 	/* Make sure to be in 32bit mode */
733 	set_thread_flag(TIF_ADDR32);
734 
735 	if (x32)
736 		__set_personality_x32();
737 	else
738 		__set_personality_ia32();
739 }
740 EXPORT_SYMBOL_GPL(set_personality_ia32);
741 
742 #ifdef CONFIG_CHECKPOINT_RESTORE
743 static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
744 {
745 	int ret;
746 
747 	ret = map_vdso_once(image, addr);
748 	if (ret)
749 		return ret;
750 
751 	return (long)image->size;
752 }
753 #endif
754 
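/*
 * Handle the 64-bit arch_prctl() options: reading and writing the FS/GS
 * bases and, under CONFIG_CHECKPOINT_RESTORE, mapping a vDSO image.
 * Unrecognised options return -EINVAL so the caller can fall back to
 * do_arch_prctl_common().
 */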
755 long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
756 {
757 	int ret = 0;
758 
759 	switch (option) {
760 	case ARCH_SET_GS: {
761 		ret = x86_gsbase_write_task(task, arg2);
762 		break;
763 	}
764 	case ARCH_SET_FS: {
765 		ret = x86_fsbase_write_task(task, arg2);
766 		break;
767 	}
768 	case ARCH_GET_FS: {
769 		unsigned long base = x86_fsbase_read_task(task);
770 
771 		ret = put_user(base, (unsigned long __user *)arg2);
772 		break;
773 	}
774 	case ARCH_GET_GS: {
775 		unsigned long base = x86_gsbase_read_task(task);
776 
777 		ret = put_user(base, (unsigned long __user *)arg2);
778 		break;
779 	}
780 
781 #ifdef CONFIG_CHECKPOINT_RESTORE
782 # ifdef CONFIG_X86_X32_ABI
783 	case ARCH_MAP_VDSO_X32:
784 		return prctl_map_vdso(&vdso_image_x32, arg2);
785 # endif
786 # if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
787 	case ARCH_MAP_VDSO_32:
788 		return prctl_map_vdso(&vdso_image_32, arg2);
789 # endif
790 	case ARCH_MAP_VDSO_64:
791 		return prctl_map_vdso(&vdso_image_64, arg2);
792 #endif
793 
794 	default:
795 		ret = -EINVAL;
796 		break;
797 	}
798 
799 	return ret;
800 }
801 
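/*
 * arch_prctl(2) entry point.  Options the 64-bit handler does not
 * recognise (e.g. ARCH_GET_CPUID/ARCH_SET_CPUID) are passed on to
 * do_arch_prctl_common().
 *
 * Illustrative userspace usage (sketch):
 *
 *	unsigned long fsbase;
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, &fsbase);
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, new_gsbase);
 */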
802 SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
803 {
804 	long ret;
805 
806 	ret = do_arch_prctl_64(current, option, arg2);
807 	if (ret == -EINVAL)
808 		ret = do_arch_prctl_common(current, option, arg2);
809 
810 	return ret;
811 }
812 
813 #ifdef CONFIG_IA32_EMULATION
814 COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
815 {
816 	return do_arch_prctl_common(current, option, arg2);
817 }
818 #endif
819 
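/* User stack pointer of a task, as saved in its pt_regs on kernel entry. */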
820 unsigned long KSTK_ESP(struct task_struct *task)
821 {
822 	return task_pt_regs(task)->sp;
823 }
824