xref: /openbmc/linux/arch/x86/kernel/process_64.c (revision 565d76cb)
1 /*
2  *  Copyright (C) 1995  Linus Torvalds
3  *
4  *  Pentium III FXSR, SSE support
5  *	Gareth Hughes <gareth@valinux.com>, May 2000
6  *
7  *  X86-64 port
8  *	Andi Kleen.
9  *
10  *	CPU hotplug support - ashok.raj@intel.com
11  */
12 
13 /*
14  * This file handles the architecture-dependent parts of process handling.
15  */
16 
17 #include <linux/stackprotector.h>
18 #include <linux/cpu.h>
19 #include <linux/errno.h>
20 #include <linux/sched.h>
21 #include <linux/fs.h>
22 #include <linux/kernel.h>
23 #include <linux/mm.h>
24 #include <linux/elfcore.h>
25 #include <linux/smp.h>
26 #include <linux/slab.h>
27 #include <linux/user.h>
28 #include <linux/interrupt.h>
29 #include <linux/delay.h>
30 #include <linux/module.h>
31 #include <linux/ptrace.h>
32 #include <linux/notifier.h>
33 #include <linux/kprobes.h>
34 #include <linux/kdebug.h>
35 #include <linux/tick.h>
36 #include <linux/prctl.h>
37 #include <linux/uaccess.h>
38 #include <linux/io.h>
39 #include <linux/ftrace.h>
40 
41 #include <asm/pgtable.h>
42 #include <asm/system.h>
43 #include <asm/processor.h>
44 #include <asm/i387.h>
45 #include <asm/mmu_context.h>
46 #include <asm/prctl.h>
47 #include <asm/desc.h>
48 #include <asm/proto.h>
49 #include <asm/ia32.h>
50 #include <asm/idle.h>
51 #include <asm/syscalls.h>
52 #include <asm/debugreg.h>
53 
54 asmlinkage extern void ret_from_fork(void);
55 
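/*
 * old_rsp holds the user stack pointer saved on syscall entry; is_idle
 * flags whether this CPU is currently inside the idle loop (see
 * enter_idle()/__exit_idle() below).
 */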
56 DEFINE_PER_CPU(unsigned long, old_rsp);
57 static DEFINE_PER_CPU(unsigned char, is_idle);
58 
59 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
60 
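/*
 * Drivers interested in idle transitions hook in here.  A minimal sketch
 * (my_idle_cb/my_idle_nb are made-up names); the callback receives
 * IDLE_START or IDLE_END as the action:
 *
 *	static int my_idle_cb(struct notifier_block *nb,
 *			      unsigned long action, void *unused)
 *	{
 *		return NOTIFY_OK;
 *	}
 *	static struct notifier_block my_idle_nb = { .notifier_call = my_idle_cb };
 *
 *	idle_notifier_register(&my_idle_nb);
 */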
61 void idle_notifier_register(struct notifier_block *n)
62 {
63 	atomic_notifier_chain_register(&idle_notifier, n);
64 }
65 EXPORT_SYMBOL_GPL(idle_notifier_register);
66 
67 void idle_notifier_unregister(struct notifier_block *n)
68 {
69 	atomic_notifier_chain_unregister(&idle_notifier, n);
70 }
71 EXPORT_SYMBOL_GPL(idle_notifier_unregister);
72 
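/*
 * enter_idle()/__exit_idle() bracket the per-CPU idle state and run the
 * IDLE_START/IDLE_END notifier chains.
 */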
73 void enter_idle(void)
74 {
75 	percpu_write(is_idle, 1);
76 	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
77 }
78 
79 static void __exit_idle(void)
80 {
81 	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
82 		return;
83 	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
84 }
85 
86 /* Called from interrupts to signify idle end */
87 void exit_idle(void)
88 {
89 	/* idle loop has pid 0 */
90 	if (current->pid)
91 		return;
92 	__exit_idle();
93 }
94 
95 #ifndef CONFIG_SMP
96 static inline void play_dead(void)
97 {
98 	BUG();
99 }
100 #endif
101 
102 /*
103  * The idle thread. There's no useful work to be
104  * done, so just try to conserve power and have a
105  * low exit latency (i.e. sit in a loop waiting for
106  * somebody to say that they'd like to reschedule).
107  */
108 void cpu_idle(void)
109 {
110 	current_thread_info()->status |= TS_POLLING;
111 
112 	/*
113 	 * If we're the non-boot CPU, nothing set the stack canary up
114 	 * for us.  CPU0 already has it initialized but no harm in
115 	 * doing it again.  This is a good place for updating it, as
116 	 * we won't ever return from this function (so the invalid
117 	 * canaries already on the stack won't ever trigger).
118 	 */
119 	boot_init_stack_canary();
120 
121 	/* endless idle loop with no priority at all */
122 	while (1) {
123 		tick_nohz_stop_sched_tick(1);
124 		while (!need_resched()) {
125 
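			/*
			 * Memory barrier: pick up state published before we
			 * were woken (such as this CPU being marked offline)
			 * before deciding how to idle.
			 */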
126 			rmb();
127 
128 			if (cpu_is_offline(smp_processor_id()))
129 				play_dead();
130 			/*
131 			 * Idle routines should keep interrupts disabled
132 			 * from here on, until they go to idle.
133 			 * Otherwise, idle callbacks can misfire.
134 			 */
135 			local_irq_disable();
136 			enter_idle();
137 			/* Don't trace irqs off for idle */
138 			stop_critical_timings();
139 			pm_idle();
140 			start_critical_timings();
141 
142 			/* In many cases the interrupt that ended idle
143 			   has already called exit_idle. But some idle
144 			   loops can be woken up without an interrupt. */
145 			__exit_idle();
146 		}
147 
148 		tick_nohz_restart_sched_tick();
149 		preempt_enable_no_resched();
150 		schedule();
151 		preempt_disable();
152 	}
153 }
154 
155 /* Also prints some state that isn't saved in the pt_regs */
156 void __show_regs(struct pt_regs *regs, int all)
157 {
158 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
159 	unsigned long d0, d1, d2, d3, d6, d7;
160 	unsigned int fsindex, gsindex;
161 	unsigned int ds, cs, es;
162 
163 	show_regs_common();
164 	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
165 	printk_address(regs->ip, 1);
166 	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
167 			regs->sp, regs->flags);
168 	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
169 	       regs->ax, regs->bx, regs->cx);
170 	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
171 	       regs->dx, regs->si, regs->di);
172 	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
173 	       regs->bp, regs->r8, regs->r9);
174 	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
175 	       regs->r10, regs->r11, regs->r12);
176 	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
177 	       regs->r13, regs->r14, regs->r15);
178 
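	/*
	 * Read the live segment selectors and the FS/GS base MSRs directly;
	 * most of these are not part of pt_regs.
	 */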
179 	asm("movl %%ds,%0" : "=r" (ds));
180 	asm("movl %%cs,%0" : "=r" (cs));
181 	asm("movl %%es,%0" : "=r" (es));
182 	asm("movl %%fs,%0" : "=r" (fsindex));
183 	asm("movl %%gs,%0" : "=r" (gsindex));
184 
185 	rdmsrl(MSR_FS_BASE, fs);
186 	rdmsrl(MSR_GS_BASE, gs);
187 	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
188 
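	/* The control and debug registers below are only dumped on request. */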
189 	if (!all)
190 		return;
191 
192 	cr0 = read_cr0();
193 	cr2 = read_cr2();
194 	cr3 = read_cr3();
195 	cr4 = read_cr4();
196 
197 	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
198 	       fs, fsindex, gs, gsindex, shadowgs);
199 	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
200 			es, cr0);
201 	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
202 			cr4);
203 
204 	get_debugreg(d0, 0);
205 	get_debugreg(d1, 1);
206 	get_debugreg(d2, 2);
207 	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
208 	get_debugreg(d3, 3);
209 	get_debugreg(d6, 6);
210 	get_debugreg(d7, 7);
211 	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
212 }
213 
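/*
 * Sanity check at thread teardown: a dead task should no longer own an
 * LDT, so a non-zero context.size indicates a leak; warn and BUG().
 */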
214 void release_thread(struct task_struct *dead_task)
215 {
216 	if (dead_task->mm) {
217 		if (dead_task->mm->context.size) {
218 			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
219 					dead_task->comm,
220 					dead_task->mm->context.ldt,
221 					dead_task->mm->context.size);
222 			BUG();
223 		}
224 	}
225 }
226 
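/*
 * Install a 32-bit, 4GB flat user segment with the given base address
 * into the requested TLS slot of the task's descriptor array.
 */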
227 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
228 {
229 	struct user_desc ud = {
230 		.base_addr = addr,
231 		.limit = 0xfffff,
232 		.seg_32bit = 1,
233 		.limit_in_pages = 1,
234 		.useable = 1,
235 	};
236 	struct desc_struct *desc = t->thread.tls_array;
237 	desc += tls;
238 	fill_ldt(desc, &ud);
239 }
240 
241 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
242 {
243 	return get_desc_base(&t->thread.tls_array[tls]);
244 }
245 
246 /*
247  * This gets called before we allocate a new thread and copy
248  * the current task into it.
249  */
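/* Flush the live FPU state so the child copies a consistent image. */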
250 void prepare_to_copy(struct task_struct *tsk)
251 {
252 	unlazy_fpu(tsk);
253 }
254 
255 int copy_thread(unsigned long clone_flags, unsigned long sp,
256 		unsigned long unused,
257 	struct task_struct *p, struct pt_regs *regs)
258 {
259 	int err;
260 	struct pt_regs *childregs;
261 	struct task_struct *me = current;
262 
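	/*
	 * The child's pt_regs sit at the top of its kernel stack; start from
	 * a copy of the parent's registers.
	 */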
263 	childregs = ((struct pt_regs *)
264 			(THREAD_SIZE + task_stack_page(p))) - 1;
265 	*childregs = *regs;
266 
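	/*
	 * The child reports 0 as the return value of fork()/clone().  A
	 * kernel thread (regs not from user mode) gets its stack pointer
	 * pointed at the freshly copied pt_regs instead of a user sp.
	 */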
267 	childregs->ax = 0;
268 	if (user_mode(regs))
269 		childregs->sp = sp;
270 	else
271 		childregs->sp = (unsigned long)childregs;
272 
273 	p->thread.sp = (unsigned long) childregs;
274 	p->thread.sp0 = (unsigned long) (childregs+1);
275 	p->thread.usersp = me->thread.usersp;
276 
277 	set_tsk_thread_flag(p, TIF_FORK);
278 
279 	p->thread.io_bitmap_ptr = NULL;
280 
281 	savesegment(gs, p->thread.gsindex);
282 	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
283 	savesegment(fs, p->thread.fsindex);
284 	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
285 	savesegment(es, p->thread.es);
286 	savesegment(ds, p->thread.ds);
287 
288 	err = -ENOMEM;
289 	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
290 
291 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
292 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
293 		if (!p->thread.io_bitmap_ptr) {
294 			p->thread.io_bitmap_max = 0;
295 			return -ENOMEM;
296 		}
297 		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
298 				IO_BITMAP_BYTES);
299 		set_tsk_thread_flag(p, TIF_IO_BITMAP);
300 	}
301 
302 	/*
303 	 * Set a new TLS for the child thread?
304 	 */
305 	if (clone_flags & CLONE_SETTLS) {
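		/*
		 * The tls argument of clone() arrives in childregs->si for
		 * compat tasks and in childregs->r8 for native 64-bit tasks.
		 */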
306 #ifdef CONFIG_IA32_EMULATION
307 		if (test_thread_flag(TIF_IA32))
308 			err = do_set_thread_area(p, -1,
309 				(struct user_desc __user *)childregs->si, 0);
310 		else
311 #endif
312 			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
313 		if (err)
314 			goto out;
315 	}
316 	err = 0;
317 out:
318 	if (err && p->thread.io_bitmap_ptr) {
319 		kfree(p->thread.io_bitmap_ptr);
320 		p->thread.io_bitmap_max = 0;
321 	}
322 
323 	return err;
324 }
325 
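/*
 * Common register and segment setup for a task starting a new program;
 * the callers below pass the selectors for native 64-bit or compat
 * (IA-32) mode.
 */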
326 static void
327 start_thread_common(struct pt_regs *regs, unsigned long new_ip,
328 		    unsigned long new_sp,
329 		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
330 {
331 	loadsegment(fs, 0);
332 	loadsegment(es, _ds);
333 	loadsegment(ds, _ds);
334 	load_gs_index(0);
335 	regs->ip		= new_ip;
336 	regs->sp		= new_sp;
337 	percpu_write(old_rsp, new_sp);
338 	regs->cs		= _cs;
339 	regs->ss		= _ss;
340 	regs->flags		= X86_EFLAGS_IF;
341 	set_fs(USER_DS);
342 	/*
343 	 * Free the old FP and other extended state
344 	 */
345 	free_thread_xstate(current);
346 }
347 
348 void
349 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
350 {
351 	start_thread_common(regs, new_ip, new_sp,
352 			    __USER_CS, __USER_DS, 0);
353 }
354 
355 #ifdef CONFIG_IA32_EMULATION
356 void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
357 {
358 	start_thread_common(regs, new_ip, new_sp,
359 			    __USER32_CS, __USER32_DS, __USER32_DS);
360 }
361 #endif
362 
363 /*
364  *	switch_to(x,y) should switch tasks from x to y.
365  *
366  * This could still be optimized:
367  * - fold all the options into a flag word and test it with a single test.
368  * - could test fs/gs bitsliced
369  *
370  * Kprobes not supported here. Set the probe on schedule instead.
371  * The function graph tracer is not supported here either.
372  */
373 __notrace_funcgraph struct task_struct *
374 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
375 {
376 	struct thread_struct *prev = &prev_p->thread;
377 	struct thread_struct *next = &next_p->thread;
378 	int cpu = smp_processor_id();
379 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
380 	unsigned fsindex, gsindex;
381 	bool preload_fpu;
382 
383 	/*
384 	 * If the task has used the FPU in the last 5 timeslices, just do a
385 	 * full restore of the math state immediately to avoid the trap; the
386 	 * chances of needing the FPU soon are obviously high now.
387 	 */
388 	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
389 
390 	/* we're going to use this soon, after a few expensive things */
391 	if (preload_fpu)
392 		prefetch(next->fpu.state);
393 
394 	/*
395 	 * Reload esp0, LDT and the page table pointer:
396 	 */
397 	load_sp0(tss, next);
398 
399 	/*
400 	 * Switch DS and ES.
401 	 * This won't pick up thread selector changes, but I guess that is ok.
402 	 */
403 	savesegment(es, prev->es);
404 	if (unlikely(next->es | prev->es))
405 		loadsegment(es, next->es);
406 
407 	savesegment(ds, prev->ds);
408 	if (unlikely(next->ds | prev->ds))
409 		loadsegment(ds, next->ds);
410 
411 
412 	/* We must save %fs and %gs before load_TLS() because
413 	 * %fs and %gs may be cleared by load_TLS().
414 	 *
415 	 * (e.g. xen_load_tls())
416 	 */
417 	savesegment(fs, fsindex);
418 	savesegment(gs, gsindex);
419 
420 	load_TLS(next, cpu);
421 
422 	/* Must be after DS reload */
423 	__unlazy_fpu(prev_p);
424 
425 	/* Make sure cpu is ready for new context */
426 	if (preload_fpu)
427 		clts();
428 
429 	/*
430 	 * Leave lazy mode, flushing any hypercalls made here.
431 	 * This must be done before restoring TLS segments so
432 	 * the GDT and LDT are properly updated, and must be
433 	 * done before math_state_restore, so the TS bit is up
434 	 * to date.
435 	 */
436 	arch_end_context_switch(next_p);
437 
438 	/*
439 	 * Switch FS and GS.
440 	 *
441 	 * A segment register != 0 always requires a reload.  Also
442 	 * reload when it has changed.  When the previous process used a
443 	 * 64-bit base, always reload to avoid an information leak.
444 	 */
445 	if (unlikely(fsindex | next->fsindex | prev->fs)) {
446 		loadsegment(fs, next->fsindex);
447 		/*
448 		 * Check if the user used a selector != 0; if yes,
449 		 * clear the 64-bit base, since the overloaded base is
450 		 * always mapped to the null selector.
451 		 */
452 		if (fsindex)
453 			prev->fs = 0;
454 	}
455 	/* when the next process has a 64-bit base, use it */
456 	if (next->fs)
457 		wrmsrl(MSR_FS_BASE, next->fs);
458 	prev->fsindex = fsindex;
459 
460 	if (unlikely(gsindex | next->gsindex | prev->gs)) {
461 		load_gs_index(next->gsindex);
462 		if (gsindex)
463 			prev->gs = 0;
464 	}
465 	if (next->gs)
466 		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
467 	prev->gsindex = gsindex;
468 
469 	/*
470 	 * Switch the PDA and FPU contexts.
471 	 */
472 	prev->usersp = percpu_read(old_rsp);
473 	percpu_write(old_rsp, next->usersp);
474 	percpu_write(current_task, next_p);
475 
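	/*
	 * Point the per-CPU kernel_stack at the new task's stack so the
	 * syscall/interrupt entry code finds it.
	 */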
476 	percpu_write(kernel_stack,
477 		  (unsigned long)task_stack_page(next_p) +
478 		  THREAD_SIZE - KERNEL_STACK_OFFSET);
479 
480 	/*
481 	 * Now maybe reload the debug registers and handle I/O bitmaps
482 	 */
483 	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
484 		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
485 		__switch_to_xtra(prev_p, next_p, tss);
486 
487 	/*
488 	 * Preload the FPU context, now that we've determined that the
489 	 * task is likely to be using it.
490 	 */
491 	if (preload_fpu)
492 		__math_state_restore();
493 
494 	return prev_p;
495 }
496 
497 void set_personality_64bit(void)
498 {
499 	/* inherit personality from parent */
500 
501 	/* Make sure to be in 64bit mode */
502 	clear_thread_flag(TIF_IA32);
503 
504 	/* TBD: overwrites user setup. Should have two bits.
505 	   But 64-bit processes have always behaved this way,
506 	   so it's not too bad. The main problem is just that
507 	   32-bit children are affected again. */
508 	current->personality &= ~READ_IMPLIES_EXEC;
509 }
510 
511 void set_personality_ia32(void)
512 {
513 	/* inherit personality from parent */
514 
515 	/* Make sure to be in 32bit mode */
516 	set_thread_flag(TIF_IA32);
517 	current->personality |= force_personality32;
518 
519 	/* Prepare the first "return" to user space */
520 	current_thread_info()->status |= TS_COMPAT;
521 }
522 
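/*
 * Walk the sleeping task's frame-pointer chain (bounded to 16 frames)
 * and return the first return address outside the scheduler: the place
 * where the task went to sleep.
 */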
523 unsigned long get_wchan(struct task_struct *p)
524 {
525 	unsigned long stack;
526 	u64 fp, ip;
527 	int count = 0;
528 
529 	if (!p || p == current || p->state == TASK_RUNNING)
530 		return 0;
531 	stack = (unsigned long)task_stack_page(p);
532 	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
533 		return 0;
534 	fp = *(u64 *)(p->thread.sp);
535 	do {
536 		if (fp < (unsigned long)stack ||
537 		    fp >= (unsigned long)stack+THREAD_SIZE)
538 			return 0;
539 		ip = *(u64 *)(fp+8);
540 		if (!in_sched_functions(ip))
541 			return ip;
542 		fp = *(u64 *)fp;
543 	} while (count++ < 16);
544 	return 0;
545 }
546 
547 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
548 {
549 	int ret = 0;
550 	int doit = task == current;
551 	int cpu;
552 
553 	switch (code) {
554 	case ARCH_SET_GS:
555 		if (addr >= TASK_SIZE_OF(task))
556 			return -EPERM;
557 		cpu = get_cpu();
558 		/* handle small bases via the GDT because that's faster to
559 		   switch. */
560 		if (addr <= 0xffffffff) {
561 			set_32bit_tls(task, GS_TLS, addr);
562 			if (doit) {
563 				load_TLS(&task->thread, cpu);
564 				load_gs_index(GS_TLS_SEL);
565 			}
566 			task->thread.gsindex = GS_TLS_SEL;
567 			task->thread.gs = 0;
568 		} else {
569 			task->thread.gsindex = 0;
570 			task->thread.gs = addr;
571 			if (doit) {
572 				load_gs_index(0);
573 				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
574 			}
575 		}
576 		put_cpu();
577 		break;
578 	case ARCH_SET_FS:
579 		/* Not strictly needed for fs, but do it for symmetry
580 		   with gs */
581 		if (addr >= TASK_SIZE_OF(task))
582 			return -EPERM;
583 		cpu = get_cpu();
584 		/* handle small bases via the GDT because that's faster to
585 		   switch. */
586 		if (addr <= 0xffffffff) {
587 			set_32bit_tls(task, FS_TLS, addr);
588 			if (doit) {
589 				load_TLS(&task->thread, cpu);
590 				loadsegment(fs, FS_TLS_SEL);
591 			}
592 			task->thread.fsindex = FS_TLS_SEL;
593 			task->thread.fs = 0;
594 		} else {
595 			task->thread.fsindex = 0;
596 			task->thread.fs = addr;
597 			if (doit) {
598 				/* set the selector to 0 so as not to
599 				   confuse __switch_to */
600 				loadsegment(fs, 0);
601 				ret = checking_wrmsrl(MSR_FS_BASE, addr);
602 			}
603 		}
604 		put_cpu();
605 		break;
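	/* For the GET operations, 'addr' is a user pointer to store the base in. */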
606 	case ARCH_GET_FS: {
607 		unsigned long base;
608 		if (task->thread.fsindex == FS_TLS_SEL)
609 			base = read_32bit_tls(task, FS_TLS);
610 		else if (doit)
611 			rdmsrl(MSR_FS_BASE, base);
612 		else
613 			base = task->thread.fs;
614 		ret = put_user(base, (unsigned long __user *)addr);
615 		break;
616 	}
617 	case ARCH_GET_GS: {
618 		unsigned long base;
619 		unsigned gsindex;
620 		if (task->thread.gsindex == GS_TLS_SEL)
621 			base = read_32bit_tls(task, GS_TLS);
622 		else if (doit) {
623 			savesegment(gs, gsindex);
624 			if (gsindex)
625 				rdmsrl(MSR_KERNEL_GS_BASE, base);
626 			else
627 				base = task->thread.gs;
628 		} else
629 			base = task->thread.gs;
630 		ret = put_user(base, (unsigned long __user *)addr);
631 		break;
632 	}
633 
634 	default:
635 		ret = -EINVAL;
636 		break;
637 	}
638 
639 	return ret;
640 }
641 
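/*
 * System call entry point.  A minimal userspace sketch (ARCH_* codes from
 * <asm/prctl.h>; gs_block is a made-up example buffer):
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)gs_block);
 *	syscall(SYS_arch_prctl, ARCH_GET_GS, (unsigned long)&base);
 */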
642 long sys_arch_prctl(int code, unsigned long addr)
643 {
644 	return do_arch_prctl(current, code, addr);
645 }
646 
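/*
 * Report the task's user stack pointer: compat (IA-32) tasks keep it in
 * pt_regs, 64-bit tasks in thread.usersp.
 */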
647 unsigned long KSTK_ESP(struct task_struct *task)
648 {
649 	return (test_tsk_thread_flag(task, TIF_IA32)) ?
650 			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
651 }
652