/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>

asmlinkage extern void ret_from_fork(void);

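/*
 * Per-CPU save slot for the user %rsp: the SYSCALL entry path stashes
 * the user stack pointer here (it has no stack to spill to yet), and
 * __switch_to() below swaps it with thread.usersp on every switch.
 */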
DEFINE_PER_CPU(unsigned long, old_rsp);

/* Also prints some state that isn't saved in pt_regs. */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	show_regs_common();
	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

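	/* The caller wants the full dump: include control and debug registers. */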
	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

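/*
 * Called from release_task() when the task is finally being reaped; any
 * LDT still attached to the mm at this point is a leak, hence the BUG().
 */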
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk(KERN_WARNING "WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

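/*
 * Helpers for the ARCH_SET_FS/ARCH_SET_GS "small base" fast path in
 * do_arch_prctl() below: a base that fits in 32 bits is kept in a GDT
 * TLS slot rather than in the FS/GS base MSR, as that is cheaper to
 * switch.
 */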
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
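	/*
	 * Flush current's live FPU registers out to the task struct, so
	 * that the task-struct copy made for the child is consistent.
	 */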
	unlazy_fpu(tsk);
}

156 
157 int copy_thread(unsigned long clone_flags, unsigned long sp,
158 		unsigned long unused,
159 	struct task_struct *p, struct pt_regs *regs)
160 {
161 	int err;
162 	struct pt_regs *childregs;
163 	struct task_struct *me = current;
164 
165 	childregs = ((struct pt_regs *)
166 			(THREAD_SIZE + task_stack_page(p))) - 1;
167 	*childregs = *regs;
168 
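	/* The child returns 0 from fork/clone: %rax carries the value. */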
	childregs->ax = 0;
	if (user_mode(regs))
		childregs->sp = sp;
	else
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->fpu_counter = 0;
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
	savesegment(fs, p->thread.fsindex);
	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	err = -ENOMEM;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
						  IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	current->thread.usersp	= new_sp;
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}

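/*
 * Called by the binfmt loaders (e.g. load_elf_binary()) to set up the
 * initial user register state for a freshly exec'ed 64-bit image.
 */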
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    test_thread_flag(TIF_X32)
			    ? __USER_CS : __USER32_CS,
			    __USER_DS, __USER_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here; set the probe on schedule() instead.
 * The function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	fpu_switch_t fpu;

	fpu = switch_fpu_prepare(prev_p, next_p, cpu);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/*
	 * We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * A non-zero segment register always requires a reload, as does
	 * one whose value has changed.  When the previous process used a
	 * 64-bit base, always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * If the user loaded a non-zero selector, clear the saved
		 * 64-bit base: the saved base is only meaningful while the
		 * selector is null.
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* When the next process has a 64-bit base, use it. */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	switch_fpu_finish(next_p, fpu);

	/*
	 * Switch the per-CPU "PDA" state: old_rsp and current_task.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

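	/*
	 * The syscall entry path loads %rsp from this per-CPU variable;
	 * KERNEL_STACK_OFFSET leaves headroom at the top of the stack
	 * for the entry code's initial frame.
	 */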
	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	return prev_p;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);
	clear_thread_flag(TIF_ADDR32);
	clear_thread_flag(TIF_X32);

	/* Ensure the corresponding mm is not marked. */
	if (current->mm)
		current->mm->context.ia32_compat = 0;

	/*
	 * TBD: this overwrites the user's setup.  We should really have
	 * two bits.  But 64-bit processes have always behaved this way,
	 * so it's not too bad.  The main problem is just that 32-bit
	 * children are affected again.
	 */
	current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(bool x32)
{
	/* inherit personality from parent */

	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_ADDR32);

	/* Mark the associated mm as containing 32-bit tasks. */
	if (current->mm)
		current->mm->context.ia32_compat = 1;

	if (x32) {
		clear_thread_flag(TIF_IA32);
		set_thread_flag(TIF_X32);
		current->personality &= ~READ_IMPLIES_EXEC;
		/*
		 * is_compat_task() uses the presence of the x32 syscall
		 * bit flag to determine compat status.
		 */
		current_thread_info()->status &= ~TS_COMPAT;
	} else {
		set_thread_flag(TIF_IA32);
		clear_thread_flag(TIF_X32);
		current->personality |= force_personality32;
		/* Prepare the first "return" to user space */
		current_thread_info()->status |= TS_COMPAT;
	}
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

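/*
 * Walk the sleeping task's saved frame pointers to find the first
 * return address outside the scheduler.  This assumes frame pointers:
 * the saved %rbp sits at the top of the saved stack and each frame
 * holds the return address at fp+8.
 */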
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

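/*
 * Back end of arch_prctl(2): get and set the FS/GS base addresses.
 * Small bases (<= 4GB) go through a GDT TLS descriptor, which is
 * cheaper to switch; larger bases are written to the base MSRs.
 */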
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/*
		 * Handle small bases via the GDT because that's faster to
		 * switch.
		 */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/*
		 * Not strictly needed for %fs, but do it for symmetry
		 * with %gs.
		 */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/*
		 * Handle small bases via the GDT because that's faster to
		 * switch.
		 */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/*
				 * Set the selector to 0 so as not to
				 * confuse __switch_to().
				 */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

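/*
 * Userspace reaches this via the arch_prctl(2) syscall.  An
 * illustrative 64-bit caller (a sketch, not part of this file):
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, &base);	// read the FS base
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, 0x10000);	// set the GS base
 */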
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

unsigned long KSTK_ESP(struct task_struct *task)
{
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}
557