/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>

asmlinkage extern void ret_from_fork(void);

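/*
 * Per-CPU copy of the user-space %rsp of the current task while it runs
 * in the kernel: the SYSCALL instruction does not save the user stack
 * pointer, so the 64-bit syscall entry path stashes it here.
 */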
__visible DEFINE_PER_CPU(unsigned long, old_rsp);

/* Also prints some state that isn't saved in pt_regs. */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);

	/* Only print out debug registers if they are in their non-default state. */
	if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
	    (d6 == DR6_RESERVED) && (d7 == 0x400))
		return;

	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);

}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
				dead_task->comm,
				dead_task->mm->context.ldt,
				dead_task->mm->context.size);
			BUG();
		}
	}
}

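/*
 * Install a 32-bit TLS descriptor: limit 0xfffff with limit_in_pages set
 * describes a full 4 GiB flat segment based at @addr.
 */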
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long arg, struct task_struct *p)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
	childregs = task_pt_regs(p);
	p->thread.sp = (unsigned long) childregs;
	p->thread.usersp = me->thread.usersp;
	set_tsk_thread_flag(p, TIF_FORK);
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
	savesegment(fs, p->thread.fsindex);
	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(p->flags & PF_KTHREAD)) {
		/*
		 * Kernel thread: ret_from_fork will call the function
		 * kept in %rbx with the argument from %rbp.
		 */
		memset(childregs, 0, sizeof(struct pt_regs));
		childregs->sp = (unsigned long)childregs;
		childregs->ss = __KERNEL_DS;
		childregs->bx = sp; /* function */
		childregs->bp = arg;
		childregs->orig_ax = -1;
		childregs->cs = __KERNEL_CS | get_kernel_rpl();
		childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
		return 0;
	}
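	/* User thread: start from a copy of the parent's register frame. */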
	*childregs = *current_pt_regs();

	childregs->ax = 0;
	if (sp)
		childregs->sp = sp;

	err = -ENOMEM;
	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
						  IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?  The ia32 path passes a
	 * struct user_desc pointer in %esi; the 64-bit path passes the
	 * new FS base in %r8 (clone's tls argument).
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	current->thread.usersp	= new_sp;
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	this_cpu_write(old_rsp, new_sp);
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
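/*
 * x32 tasks run in long mode with the 32-bit ABI, so they keep the
 * 64-bit __USER_CS; only genuine ia32 tasks get __USER32_CS.
 */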
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    test_thread_flag(TIF_X32)
			    ? __USER_CS : __USER32_CS,
			    __USER_DS, __USER_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here.  Set the probe on schedule instead.
 * The function graph tracer is not supported here either.
 */
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	fpu_switch_t fpu;

	fpu = switch_fpu_prepare(prev_p, next_p, cpu);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/*
	 * We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * A segment register != 0 always requires a reload.  Also
	 * reload when the selector has changed.  When the previous
	 * process used a 64-bit base, always reload to avoid an
	 * information leak.
	 */
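	/*
	 * For instance, if prev had a 64-bit FS base (prev->fs != 0) and
	 * next uses the null selector with no base, the loadsegment()
	 * below still runs, so prev's base cannot leak into next.
	 */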
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * If the user used a selector != 0, clear the saved
		 * 64-bit base: a loaded selector overrides the base,
		 * which only applies with the null selector.
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* When the next process has a 64-bit base, use it. */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	switch_fpu_finish(next_p, fpu);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = this_cpu_read(old_rsp);
	this_cpu_write(old_rsp, next->usersp);
	this_cpu_write(current_task, next_p);

	/*
	 * If it were not for PREEMPT_ACTIVE we could guarantee that the
	 * preempt_count of all tasks was equal here and this would not be
	 * needed.
	 */
	task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
	this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);

	this_cpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	return prev_p;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);
	clear_thread_flag(TIF_ADDR32);
	clear_thread_flag(TIF_X32);

	/* Ensure the corresponding mm is not marked. */
	if (current->mm)
		current->mm->context.ia32_compat = 0;

	/*
	 * TBD: this overwrites the user's setup.  There should be two
	 * bits.  But 64-bit processes have always behaved this way, so
	 * it's not too bad.  The main problem is just that 32-bit
	 * children are affected again.
	 */
	current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(bool x32)
{
	/* inherit personality from parent */

	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_ADDR32);

	/* Mark the associated mm as containing 32-bit tasks. */
	if (x32) {
		clear_thread_flag(TIF_IA32);
		set_thread_flag(TIF_X32);
		if (current->mm)
			current->mm->context.ia32_compat = TIF_X32;
		current->personality &= ~READ_IMPLIES_EXEC;
		/*
		 * is_compat_task() uses the presence of the x32
		 * syscall bit flag to determine compat status.
		 */
		current_thread_info()->status &= ~TS_COMPAT;
	} else {
		set_thread_flag(TIF_IA32);
		clear_thread_flag(TIF_X32);
		if (current->mm)
			current->mm->context.ia32_compat = TIF_IA32;
		current->personality |= force_personality32;
		/* Prepare the first "return" to user space */
		current_thread_info()->status |= TS_COMPAT;
	}
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
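	/*
	 * Walk the frame-pointer chain: each frame holds the caller's
	 * %rbp at fp and the return address at fp+8; report the first
	 * return address outside the scheduler.
	 */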
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/*
		 * handle small bases via the GDT because that's faster
		 * to switch.
		 */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/*
		 * Not strictly needed for fs, but do it for symmetry
		 * with gs.
		 */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/*
		 * handle small bases via the GDT because that's faster
		 * to switch.
		 */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/*
				 * set the selector to 0 to not confuse
				 * __switch_to
				 */
				loadsegment(fs, 0);
				ret = wrmsrl_safe(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
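
/*
 * A minimal sketch of how userspace might exercise this interface
 * (hypothetical example, not part of this file; assumes a libc without
 * an arch_prctl() wrapper, hence the raw syscall):
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, &base);   // read FS base
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, 0x1000);  // small base: GDT path
 */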

unsigned long KSTK_ESP(struct task_struct *task)
{
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}