xref: /openbmc/linux/arch/x86/kernel/process_64.c (revision 9c1f8594)
1 /*
2  *  Copyright (C) 1995  Linus Torvalds
3  *
4  *  Pentium III FXSR, SSE support
5  *	Gareth Hughes <gareth@valinux.com>, May 2000
6  *
7  *  X86-64 port
8  *	Andi Kleen.
9  *
10  *	CPU hotplug support - ashok.raj@intel.com
11  */
12 
13 /*
14  * This file handles the architecture-dependent parts of process handling.
15  */
16 
17 #include <linux/stackprotector.h>
18 #include <linux/cpu.h>
19 #include <linux/errno.h>
20 #include <linux/sched.h>
21 #include <linux/fs.h>
22 #include <linux/kernel.h>
23 #include <linux/mm.h>
24 #include <linux/elfcore.h>
25 #include <linux/smp.h>
26 #include <linux/slab.h>
27 #include <linux/user.h>
28 #include <linux/interrupt.h>
29 #include <linux/delay.h>
30 #include <linux/module.h>
31 #include <linux/ptrace.h>
32 #include <linux/notifier.h>
33 #include <linux/kprobes.h>
34 #include <linux/kdebug.h>
35 #include <linux/tick.h>
36 #include <linux/prctl.h>
37 #include <linux/uaccess.h>
38 #include <linux/io.h>
39 #include <linux/ftrace.h>
40 #include <linux/cpuidle.h>
41 
42 #include <asm/pgtable.h>
43 #include <asm/system.h>
44 #include <asm/processor.h>
45 #include <asm/i387.h>
46 #include <asm/mmu_context.h>
47 #include <asm/prctl.h>
48 #include <asm/desc.h>
49 #include <asm/proto.h>
50 #include <asm/ia32.h>
51 #include <asm/idle.h>
52 #include <asm/syscalls.h>
53 #include <asm/debugreg.h>
54 
55 asmlinkage extern void ret_from_fork(void);
56 
57 DEFINE_PER_CPU(unsigned long, old_rsp);
58 static DEFINE_PER_CPU(unsigned char, is_idle);
59 
60 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
61 
62 void idle_notifier_register(struct notifier_block *n)
63 {
64 	atomic_notifier_chain_register(&idle_notifier, n);
65 }
66 EXPORT_SYMBOL_GPL(idle_notifier_register);
67 
68 void idle_notifier_unregister(struct notifier_block *n)
69 {
70 	atomic_notifier_chain_unregister(&idle_notifier, n);
71 }
72 EXPORT_SYMBOL_GPL(idle_notifier_unregister);
73 
74 void enter_idle(void)
75 {
76 	percpu_write(is_idle, 1);
77 	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
78 }
79 
80 static void __exit_idle(void)
81 {
82 	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
83 		return;
84 	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
85 }
86 
87 /* Called from interrupts to signify idle end */
88 void exit_idle(void)
89 {
90 	/* idle loop has pid 0 */
91 	if (current->pid)
92 		return;
93 	__exit_idle();
94 }
95 
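/*
 * Example (not part of this file): a minimal sketch of how a driver
 * might hook the idle notifications broadcast above.  The callback and
 * notifier_block names are hypothetical; only idle_notifier_register()
 * and the IDLE_START / IDLE_END events from <asm/idle.h> are real.
 *
 *	static int my_idle_notify(struct notifier_block *nb,
 *				  unsigned long action, void *data)
 *	{
 *		switch (action) {
 *		case IDLE_START:
 *			break;		// this CPU is entering idle
 *		case IDLE_END:
 *			break;		// this CPU has left idle
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call	= my_idle_notify,
 *	};
 *
 *	idle_notifier_register(&my_idle_nb);
 */
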
96 #ifndef CONFIG_SMP
97 static inline void play_dead(void)
98 {
99 	BUG();
100 }
101 #endif
102 
103 /*
104  * The idle thread. There's no useful work to be
105  * done, so just try to conserve power and have a
106  * low exit latency (i.e. sit in a loop waiting for
107  * somebody to say that they'd like to reschedule).
108  */
109 void cpu_idle(void)
110 {
111 	current_thread_info()->status |= TS_POLLING;
112 
113 	/*
114 	 * If we're the non-boot CPU, nothing set the stack canary up
115 	 * for us.  CPU0 already has it initialized but no harm in
116 	 * doing it again.  This is a good place for updating it, as
117  * we won't ever return from this function (so the invalid
118  * canaries already on the stack won't ever trigger).
119 	 */
120 	boot_init_stack_canary();
121 
122 	/* endless idle loop with no priority at all */
123 	while (1) {
124 		tick_nohz_stop_sched_tick(1);
125 		while (!need_resched()) {
126 
127 			rmb();
128 
129 			if (cpu_is_offline(smp_processor_id()))
130 				play_dead();
131 			/*
132 			 * Idle routines should keep interrupts disabled
133 			 * from here on, until they go to idle.
134 			 * Otherwise, idle callbacks can misfire.
135 			 */
136 			local_irq_disable();
137 			enter_idle();
138 			/* Don't trace irqs off for idle */
139 			stop_critical_timings();
140 			if (cpuidle_idle_call())
141 				pm_idle();
142 			start_critical_timings();
143 
144 			/* In many cases the interrupt that ended idle
145 			   has already called exit_idle. But some idle
146 			   loops can be woken up without an interrupt. */
147 			__exit_idle();
148 		}
149 
150 		tick_nohz_restart_sched_tick();
151 		preempt_enable_no_resched();
152 		schedule();
153 		preempt_disable();
154 	}
155 }
156 
157 /* Also prints some state that isn't saved in the pt_regs */
158 void __show_regs(struct pt_regs *regs, int all)
159 {
160 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
161 	unsigned long d0, d1, d2, d3, d6, d7;
162 	unsigned int fsindex, gsindex;
163 	unsigned int ds, cs, es;
164 
165 	show_regs_common();
166 	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
167 	printk_address(regs->ip, 1);
168 	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
169 			regs->sp, regs->flags);
170 	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
171 	       regs->ax, regs->bx, regs->cx);
172 	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
173 	       regs->dx, regs->si, regs->di);
174 	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
175 	       regs->bp, regs->r8, regs->r9);
176 	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
177 	       regs->r10, regs->r11, regs->r12);
178 	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
179 	       regs->r13, regs->r14, regs->r15);
180 
181 	asm("movl %%ds,%0" : "=r" (ds));
182 	asm("movl %%cs,%0" : "=r" (cs));
183 	asm("movl %%es,%0" : "=r" (es));
184 	asm("movl %%fs,%0" : "=r" (fsindex));
185 	asm("movl %%gs,%0" : "=r" (gsindex));
186 
187 	rdmsrl(MSR_FS_BASE, fs);
188 	rdmsrl(MSR_GS_BASE, gs);
189 	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
190 
191 	if (!all)
192 		return;
193 
194 	cr0 = read_cr0();
195 	cr2 = read_cr2();
196 	cr3 = read_cr3();
197 	cr4 = read_cr4();
198 
199 	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
200 	       fs, fsindex, gs, gsindex, shadowgs);
201 	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
202 			es, cr0);
203 	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
204 			cr4);
205 
206 	get_debugreg(d0, 0);
207 	get_debugreg(d1, 1);
208 	get_debugreg(d2, 2);
209 	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
210 	get_debugreg(d3, 3);
211 	get_debugreg(d6, 6);
212 	get_debugreg(d7, 7);
213 	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
214 }
215 
216 void release_thread(struct task_struct *dead_task)
217 {
218 	if (dead_task->mm) {
219 		if (dead_task->mm->context.size) {
220 			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
221 					dead_task->comm,
222 					dead_task->mm->context.ldt,
223 					dead_task->mm->context.size);
224 			BUG();
225 		}
226 	}
227 }
228 
229 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
230 {
231 	struct user_desc ud = {
232 		.base_addr = addr,
233 		.limit = 0xfffff,
234 		.seg_32bit = 1,
235 		.limit_in_pages = 1,
236 		.useable = 1,
237 	};
238 	struct desc_struct *desc = t->thread.tls_array;
239 	desc += tls;
240 	fill_ldt(desc, &ud);
241 }
242 
243 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
244 {
245 	return get_desc_base(&t->thread.tls_array[tls]);
246 }
247 
248 /*
249  * This gets called before we allocate a new thread and copy
250  * the current task into it.
251  */
252 void prepare_to_copy(struct task_struct *tsk)
253 {
254 	unlazy_fpu(tsk);
255 }
256 
257 int copy_thread(unsigned long clone_flags, unsigned long sp,
258 		unsigned long unused,
259 	struct task_struct *p, struct pt_regs *regs)
260 {
261 	int err;
262 	struct pt_regs *childregs;
263 	struct task_struct *me = current;
264 
265 	childregs = ((struct pt_regs *)
266 			(THREAD_SIZE + task_stack_page(p))) - 1;
267 	*childregs = *regs;
268 
269 	childregs->ax = 0;
270 	if (user_mode(regs))
271 		childregs->sp = sp;
272 	else
273 		childregs->sp = (unsigned long)childregs;
274 
275 	p->thread.sp = (unsigned long) childregs;
276 	p->thread.sp0 = (unsigned long) (childregs+1);
277 	p->thread.usersp = me->thread.usersp;
278 
279 	set_tsk_thread_flag(p, TIF_FORK);
280 
281 	p->thread.io_bitmap_ptr = NULL;
282 
283 	savesegment(gs, p->thread.gsindex);
284 	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
285 	savesegment(fs, p->thread.fsindex);
286 	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
287 	savesegment(es, p->thread.es);
288 	savesegment(ds, p->thread.ds);
289 
290 	err = -ENOMEM;
291 	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
292 
293 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
294 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
295 		if (!p->thread.io_bitmap_ptr) {
296 			p->thread.io_bitmap_max = 0;
297 			return -ENOMEM;
298 		}
299 		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
300 				IO_BITMAP_BYTES);
301 		set_tsk_thread_flag(p, TIF_IO_BITMAP);
302 	}
303 
304 	/*
305 	 * Set a new TLS for the child thread?
306 	 */
307 	if (clone_flags & CLONE_SETTLS) {
308 #ifdef CONFIG_IA32_EMULATION
309 		if (test_thread_flag(TIF_IA32))
310 			err = do_set_thread_area(p, -1,
311 				(struct user_desc __user *)childregs->si, 0);
312 		else
313 #endif
314 			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
315 		if (err)
316 			goto out;
317 	}
318 	err = 0;
319 out:
320 	if (err && p->thread.io_bitmap_ptr) {
321 		kfree(p->thread.io_bitmap_ptr);
322 		p->thread.io_bitmap_max = 0;
323 	}
324 
325 	return err;
326 }
327 
328 static void
329 start_thread_common(struct pt_regs *regs, unsigned long new_ip,
330 		    unsigned long new_sp,
331 		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
332 {
333 	loadsegment(fs, 0);
334 	loadsegment(es, _ds);
335 	loadsegment(ds, _ds);
336 	load_gs_index(0);
337 	regs->ip		= new_ip;
338 	regs->sp		= new_sp;
339 	percpu_write(old_rsp, new_sp);
340 	regs->cs		= _cs;
341 	regs->ss		= _ss;
342 	regs->flags		= X86_EFLAGS_IF;
343 	/*
344 	 * Free the old FP and other extended state
345 	 */
346 	free_thread_xstate(current);
347 }
348 
349 void
350 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
351 {
352 	start_thread_common(regs, new_ip, new_sp,
353 			    __USER_CS, __USER_DS, 0);
354 }
355 
356 #ifdef CONFIG_IA32_EMULATION
357 void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
358 {
359 	start_thread_common(regs, new_ip, new_sp,
360 			    __USER32_CS, __USER32_DS, __USER32_DS);
361 }
362 #endif
363 
364 /*
365  *	switch_to(x,y) should switch tasks from x to y.
366  *
367  * This could still be optimized:
368  * - fold all the options into a flag word and test it with a single test.
369  * - could test fs/gs bitsliced
370  *
371  * Kprobes not supported here. Set the probe on schedule instead.
372  * The function graph tracer is not supported here either.
373  */
374 __notrace_funcgraph struct task_struct *
375 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
376 {
377 	struct thread_struct *prev = &prev_p->thread;
378 	struct thread_struct *next = &next_p->thread;
379 	int cpu = smp_processor_id();
380 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
381 	unsigned fsindex, gsindex;
382 	bool preload_fpu;
383 
384 	/*
385 	 * If the task has used the FPU in the last 5 timeslices, just do a
386 	 * full restore of the math state immediately to avoid the trap; the
387 	 * chances of needing the FPU again soon are obviously high now.
388 	 */
389 	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
390 
391 	/* we're going to use this soon, after a few expensive things */
392 	if (preload_fpu)
393 		prefetch(next->fpu.state);
394 
395 	/*
396 	 * Reload esp0, LDT and the page table pointer:
397 	 */
398 	load_sp0(tss, next);
399 
400 	/*
401 	 * Switch DS and ES.
402 	 * This won't pick up thread selector changes, but I guess that is ok.
403 	 */
404 	savesegment(es, prev->es);
405 	if (unlikely(next->es | prev->es))
406 		loadsegment(es, next->es);
407 
408 	savesegment(ds, prev->ds);
409 	if (unlikely(next->ds | prev->ds))
410 		loadsegment(ds, next->ds);
411 
412 
413 	/* We must save %fs and %gs before load_TLS() because
414 	 * %fs and %gs may be cleared by load_TLS().
415 	 *
416 	 * (e.g. xen_load_tls())
417 	 */
418 	savesegment(fs, fsindex);
419 	savesegment(gs, gsindex);
420 
421 	load_TLS(next, cpu);
422 
423 	/* Must be after DS reload */
424 	__unlazy_fpu(prev_p);
425 
426 	/* Make sure cpu is ready for new context */
427 	if (preload_fpu)
428 		clts();
429 
430 	/*
431 	 * Leave lazy mode, flushing any hypercalls made here.
432 	 * This must be done before restoring TLS segments so
433 	 * the GDT and LDT are properly updated, and must be
434 	 * done before math_state_restore, so the TS bit is up
435 	 * to date.
436 	 */
437 	arch_end_context_switch(next_p);
438 
439 	/*
440 	 * Switch FS and GS.
441 	 *
442 	 * A non-zero segment register always requires a reload.  Also
443 	 * reload when it has changed.  When the previous process used a
444 	 * 64-bit base, always reload to avoid an information leak.
445 	 */
446 	if (unlikely(fsindex | next->fsindex | prev->fs)) {
447 		loadsegment(fs, next->fsindex);
448 		/*
449 		 * If the user used a non-zero selector, clear the
450 		 * 64-bit base, since an overloaded base is always
451 		 * mapped to the null selector.
452 		 */
453 		if (fsindex)
454 			prev->fs = 0;
455 	}
456 	/* When the next process has a 64-bit base, use it. */
457 	if (next->fs)
458 		wrmsrl(MSR_FS_BASE, next->fs);
459 	prev->fsindex = fsindex;
460 
461 	if (unlikely(gsindex | next->gsindex | prev->gs)) {
462 		load_gs_index(next->gsindex);
463 		if (gsindex)
464 			prev->gs = 0;
465 	}
466 	if (next->gs)
467 		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
468 	prev->gsindex = gsindex;
469 
470 	/*
471 	 * Switch the PDA and FPU contexts.
472 	 */
473 	prev->usersp = percpu_read(old_rsp);
474 	percpu_write(old_rsp, next->usersp);
475 	percpu_write(current_task, next_p);
476 
477 	percpu_write(kernel_stack,
478 		  (unsigned long)task_stack_page(next_p) +
479 		  THREAD_SIZE - KERNEL_STACK_OFFSET);
480 
481 	/*
482 	 * Now maybe reload the debug registers and handle I/O bitmaps
483 	 */
484 	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
485 		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
486 		__switch_to_xtra(prev_p, next_p, tss);
487 
488 	/*
489 	 * Preload the FPU context, now that we've determined that the
490 	 * task is likely to be using it.
491 	 */
492 	if (preload_fpu)
493 		__math_state_restore();
494 
495 	return prev_p;
496 }
497 
498 void set_personality_64bit(void)
499 {
500 	/* inherit personality from parent */
501 
502 	/* Make sure to be in 64bit mode */
503 	clear_thread_flag(TIF_IA32);
504 
505 	/* Ensure the corresponding mm is not marked. */
506 	if (current->mm)
507 		current->mm->context.ia32_compat = 0;
508 
509 	/* TBD: overwrites user setup. Should have two bits.
510 	   But 64bit processes have always behaved this way,
511 	   so it's not too bad. The main problem is just that
512 	   32bit children are affected again. */
513 	current->personality &= ~READ_IMPLIES_EXEC;
514 }
515 
516 void set_personality_ia32(void)
517 {
518 	/* inherit personality from parent */
519 
520 	/* Make sure to be in 32bit mode */
521 	set_thread_flag(TIF_IA32);
522 	current->personality |= force_personality32;
523 
524 	/* Mark the associated mm as containing 32-bit tasks. */
525 	if (current->mm)
526 		current->mm->context.ia32_compat = 1;
527 
528 	/* Prepare the first "return" to user space */
529 	current_thread_info()->status |= TS_COMPAT;
530 }
531 
532 unsigned long get_wchan(struct task_struct *p)
533 {
534 	unsigned long stack;
535 	u64 fp, ip;
536 	int count = 0;
537 
538 	if (!p || p == current || p->state == TASK_RUNNING)
539 		return 0;
540 	stack = (unsigned long)task_stack_page(p);
541 	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
542 		return 0;
543 	fp = *(u64 *)(p->thread.sp);
544 	do {
545 		if (fp < (unsigned long)stack ||
546 		    fp >= (unsigned long)stack+THREAD_SIZE)
547 			return 0;
548 		ip = *(u64 *)(fp+8);
549 		if (!in_sched_functions(ip))
550 			return ip;
551 		fp = *(u64 *)fp;
552 	} while (count++ < 16);
553 	return 0;
554 }
555 
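/*
 * Note on get_wchan() above: it assumes the standard x86-64 frame
 * pointer chain and is only meaningful when the kernel keeps frame
 * pointers.  thread.sp points at the saved %rbp of the switched-out
 * task, so fp = *sp is the current frame, fp + 8 holds that frame's
 * return address, and *fp links to the caller's frame.  The first
 * return address outside the scheduler (per in_sched_functions())
 * is reported as the wait channel.
 */
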
556 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
557 {
558 	int ret = 0;
559 	int doit = task == current;
560 	int cpu;
561 
562 	switch (code) {
563 	case ARCH_SET_GS:
564 		if (addr >= TASK_SIZE_OF(task))
565 			return -EPERM;
566 		cpu = get_cpu();
567 		/* handle small bases via the GDT because that's faster to
568 		   switch. */
569 		if (addr <= 0xffffffff) {
570 			set_32bit_tls(task, GS_TLS, addr);
571 			if (doit) {
572 				load_TLS(&task->thread, cpu);
573 				load_gs_index(GS_TLS_SEL);
574 			}
575 			task->thread.gsindex = GS_TLS_SEL;
576 			task->thread.gs = 0;
577 		} else {
578 			task->thread.gsindex = 0;
579 			task->thread.gs = addr;
580 			if (doit) {
581 				load_gs_index(0);
582 				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
583 			}
584 		}
585 		put_cpu();
586 		break;
587 	case ARCH_SET_FS:
588 		/* Not strictly needed for fs, but do it for symmetry
589 		   with gs */
590 		if (addr >= TASK_SIZE_OF(task))
591 			return -EPERM;
592 		cpu = get_cpu();
593 		/* handle small bases via the GDT because that's faster to
594 		   switch. */
595 		if (addr <= 0xffffffff) {
596 			set_32bit_tls(task, FS_TLS, addr);
597 			if (doit) {
598 				load_TLS(&task->thread, cpu);
599 				loadsegment(fs, FS_TLS_SEL);
600 			}
601 			task->thread.fsindex = FS_TLS_SEL;
602 			task->thread.fs = 0;
603 		} else {
604 			task->thread.fsindex = 0;
605 			task->thread.fs = addr;
606 			if (doit) {
607 				/* set the selector to 0 to not confuse
608 				   __switch_to */
609 				loadsegment(fs, 0);
610 				ret = checking_wrmsrl(MSR_FS_BASE, addr);
611 			}
612 		}
613 		put_cpu();
614 		break;
615 	case ARCH_GET_FS: {
616 		unsigned long base;
617 		if (task->thread.fsindex == FS_TLS_SEL)
618 			base = read_32bit_tls(task, FS_TLS);
619 		else if (doit)
620 			rdmsrl(MSR_FS_BASE, base);
621 		else
622 			base = task->thread.fs;
623 		ret = put_user(base, (unsigned long __user *)addr);
624 		break;
625 	}
626 	case ARCH_GET_GS: {
627 		unsigned long base;
628 		unsigned gsindex;
629 		if (task->thread.gsindex == GS_TLS_SEL)
630 			base = read_32bit_tls(task, GS_TLS);
631 		else if (doit) {
632 			savesegment(gs, gsindex);
633 			if (gsindex)
634 				rdmsrl(MSR_KERNEL_GS_BASE, base);
635 			else
636 				base = task->thread.gs;
637 		} else
638 			base = task->thread.gs;
639 		ret = put_user(base, (unsigned long __user *)addr);
640 		break;
641 	}
642 
643 	default:
644 		ret = -EINVAL;
645 		break;
646 	}
647 
648 	return ret;
649 }
650 
651 long sys_arch_prctl(int code, unsigned long addr)
652 {
653 	return do_arch_prctl(current, code, addr);
654 }
655 
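/*
 * Example (userspace, not part of this file): a minimal sketch of how
 * the arch_prctl() interface implemented above is typically driven.
 * glibc provides no wrapper, so the raw syscall is used; "tls_area"
 * is a hypothetical buffer supplied by the caller.
 *
 *	#include <asm/prctl.h>		// ARCH_SET_GS, ARCH_GET_GS
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long base;
 *
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)tls_area);
 *	syscall(SYS_arch_prctl, ARCH_GET_GS, (unsigned long)&base);
 *	// base now holds the GS base recorded by do_arch_prctl()
 */
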
656 unsigned long KSTK_ESP(struct task_struct *task)
657 {
658 	return (test_tsk_thread_flag(task, TIF_IA32)) ?
659 			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
660 }
661