xref: /openbmc/linux/arch/x86/kernel/process_64.c (revision 94c7b6fc)
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>

asmlinkage extern void ret_from_fork(void);

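/*
 * Per-CPU cache of the user-space stack pointer. On this kernel the
 * SYSCALL entry fast path saves the user %rsp here rather than in
 * pt_regs; __switch_to() below transfers it between tasks through
 * thread.usersp.
 */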
__visible DEFINE_PER_CPU(unsigned long, old_rsp);

/*
 * Also prints some state that isn't saved in pt_regs. With @all set,
 * the segment bases and the control and debug registers are dumped too.
 */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);

	/* Only print out debug registers if they are in their non-default state. */
	if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
	    (d6 == DR6_RESERVED) && (d7 == 0x400))
		return;

	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
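
/*
 * Illustrative note: the oops/backtrace code typically reaches this via
 * show_regs() with all != 0 to get the full dump; passing all == 0
 * limits the output to the pt_regs content printed above.
 */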

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
				dead_task->comm,
				dead_task->mm->context.ldt,
				dead_task->mm->context.size);
			BUG();
		}
	}
}

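/*
 * Install a 32-bit TLS descriptor in the task's GDT TLS slot @tls. A
 * limit of 0xfffff with limit_in_pages set describes the full 4GB
 * address space; the base carries the actual TLS pointer.
 */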
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

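/*
 * Set up the thread state of a newly forked/cloned task @p. Kernel
 * threads get a pt_regs frame that makes ret_from_fork call the
 * function in ->bx with the argument in ->bp; user threads inherit a
 * copy of the parent's pt_regs with ->ax forced to 0 so the child sees
 * fork()/clone() return 0.
 */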
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long arg, struct task_struct *p)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
	childregs = task_pt_regs(p);
	p->thread.sp = (unsigned long) childregs;
	p->thread.usersp = me->thread.usersp;
	set_tsk_thread_flag(p, TIF_FORK);
	p->thread.fpu_counter = 0;
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
	savesegment(fs, p->thread.fsindex);
	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(p->flags & PF_KTHREAD)) {
		/* kernel thread */
		memset(childregs, 0, sizeof(struct pt_regs));
		childregs->sp = (unsigned long)childregs;
		childregs->ss = __KERNEL_DS;
		childregs->bx = sp; /* function */
		childregs->bp = arg;
		childregs->orig_ax = -1;
		childregs->cs = __KERNEL_CS | get_kernel_rpl();
		childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
		return 0;
	}
	*childregs = *current_pt_regs();

	childregs->ax = 0;
	if (sp)
		childregs->sp = sp;

	err = -ENOMEM;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
						  IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
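	/*
	 * Illustrative note: the new TLS value was passed in the clone()
	 * register arguments and is picked up from the childregs copy
	 * made above: an ia32 caller supplies a struct user_desc pointer
	 * in %esi, a 64-bit caller supplies the new FS base in %r8.
	 */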
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

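/*
 * Common register setup for entering a freshly exec'ed user program:
 * reset the data segment registers, load the new instruction and stack
 * pointers and select the requested user code/stack segments.
 */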
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	current->thread.usersp	= new_sp;
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	this_cpu_write(old_rsp, new_sp);
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    test_thread_flag(TIF_X32)
			    ? __USER_CS : __USER32_CS,
			    __USER_DS, __USER_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here. Set the probe on schedule instead.
 * The function graph tracer is not supported here either.
 */
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	fpu_switch_t fpu;

	fpu = switch_fpu_prepare(prev_p, next_p, cpu);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
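	/*
	 * (next->es | prev->es) is just a cheap "either selector is
	 * nonzero" test: a reload is needed only when the old or the
	 * new value differs from the kernel default of 0.
	 */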
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload.  Also
	 * reload when it has changed.  When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 *  clear 64bit base, since overloaded base is always
		 *  mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	switch_fpu_finish(next_p, fpu);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = this_cpu_read(old_rsp);
	this_cpu_write(old_rsp, next->usersp);
	this_cpu_write(current_task, next_p);

	/*
	 * If it were not for PREEMPT_ACTIVE we could guarantee that the
	 * preempt_count of all tasks was equal here and this would not be
	 * needed.
	 */
	task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
	this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);

	this_cpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	return prev_p;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);
	clear_thread_flag(TIF_ADDR32);
	clear_thread_flag(TIF_X32);

	/* Ensure the corresponding mm is not marked. */
	if (current->mm)
		current->mm->context.ia32_compat = 0;
	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32-bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

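/*
 * Note: mm->context.ia32_compat doubles as a marker of which compat
 * personality the mm was set up with; it reuses the TIF_X32/TIF_IA32
 * flag values below, with 0 meaning a plain 64-bit process (see
 * set_personality_64bit() above).
 */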
void set_personality_ia32(bool x32)
{
	/* inherit personality from parent */

	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_ADDR32);

	/* Mark the associated mm as containing 32-bit tasks. */
	if (x32) {
		clear_thread_flag(TIF_IA32);
		set_thread_flag(TIF_X32);
		if (current->mm)
			current->mm->context.ia32_compat = TIF_X32;
		current->personality &= ~READ_IMPLIES_EXEC;
		/* is_compat_task() uses the presence of the x32
		   syscall bit flag to determine compat status */
		current_thread_info()->status &= ~TS_COMPAT;
	} else {
		set_thread_flag(TIF_IA32);
		clear_thread_flag(TIF_X32);
		if (current->mm)
			current->mm->context.ia32_compat = TIF_IA32;
		current->personality |= force_personality32;
		/* Prepare the first "return" to user space */
		current_thread_info()->status |= TS_COMPAT;
	}
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

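/*
 * Walk the sleeping task's frame-pointer chain to find the first return
 * address that is not in the scheduler. The frame layout assumed here:
 *
 *	[fp]		saved caller frame pointer (next frame)
 *	[fp + 8]	return address
 *
 * The walk gives up after 16 frames or as soon as fp leaves the task's
 * stack.
 */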
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

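/*
 * Back end of the arch_prctl() syscall, also used in-kernel (e.g. by
 * copy_thread() for CLONE_SETTLS). Bases that fit in 32 bits are
 * installed as GDT TLS entries because those are cheaper to switch;
 * larger bases are written to the FS/GS base MSRs with a null selector.
 */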
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = wrmsrl_safe(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

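/*
 * Illustrative userspace usage (tls_block being some caller-provided
 * buffer; glibc has no wrapper, so syscall(2) is used directly):
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_SET_FS, (unsigned long)tls_block);
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, (unsigned long)&base);
 *
 * Note the asymmetry: ARCH_SET_* pass the new base in @addr itself,
 * while ARCH_GET_* treat @addr as a pointer to store the result.
 */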
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

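/*
 * User stack pointer of @task. For 64-bit tasks the SYSCALL fast path
 * keeps the user %rsp in thread.usersp (via per-cpu old_rsp) instead of
 * pt_regs, hence the distinction from the IA32 case.
 */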
unsigned long KSTK_ESP(struct task_struct *task)
{
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}
566