xref: /openbmc/linux/arch/x86/kernel/process_64.c (revision 7490ca1e)
1 /*
2  *  Copyright (C) 1995  Linus Torvalds
3  *
4  *  Pentium III FXSR, SSE support
5  *	Gareth Hughes <gareth@valinux.com>, May 2000
6  *
7  *  X86-64 port
8  *	Andi Kleen.
9  *
10  *	CPU hotplug support - ashok.raj@intel.com
11  */
12 
13 /*
14  * This file handles the architecture-dependent parts of process handling..
15  */
16 
17 #include <linux/stackprotector.h>
18 #include <linux/cpu.h>
19 #include <linux/errno.h>
20 #include <linux/sched.h>
21 #include <linux/fs.h>
22 #include <linux/kernel.h>
23 #include <linux/mm.h>
24 #include <linux/elfcore.h>
25 #include <linux/smp.h>
26 #include <linux/slab.h>
27 #include <linux/user.h>
28 #include <linux/interrupt.h>
29 #include <linux/delay.h>
30 #include <linux/module.h>
31 #include <linux/ptrace.h>
32 #include <linux/notifier.h>
33 #include <linux/kprobes.h>
34 #include <linux/kdebug.h>
35 #include <linux/tick.h>
36 #include <linux/prctl.h>
37 #include <linux/uaccess.h>
38 #include <linux/io.h>
39 #include <linux/ftrace.h>
40 #include <linux/cpuidle.h>
41 
42 #include <asm/pgtable.h>
43 #include <asm/system.h>
44 #include <asm/processor.h>
45 #include <asm/i387.h>
46 #include <asm/mmu_context.h>
47 #include <asm/prctl.h>
48 #include <asm/desc.h>
49 #include <asm/proto.h>
50 #include <asm/ia32.h>
51 #include <asm/idle.h>
52 #include <asm/syscalls.h>
53 #include <asm/debugreg.h>
54 #include <asm/nmi.h>
55 
56 asmlinkage extern void ret_from_fork(void);
57 
58 DEFINE_PER_CPU(unsigned long, old_rsp);
59 static DEFINE_PER_CPU(unsigned char, is_idle);
60 
61 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
62 
63 void idle_notifier_register(struct notifier_block *n)
64 {
65 	atomic_notifier_chain_register(&idle_notifier, n);
66 }
67 EXPORT_SYMBOL_GPL(idle_notifier_register);
68 
69 void idle_notifier_unregister(struct notifier_block *n)
70 {
71 	atomic_notifier_chain_unregister(&idle_notifier, n);
72 }
73 EXPORT_SYMBOL_GPL(idle_notifier_unregister);
74 
75 void enter_idle(void)
76 {
77 	percpu_write(is_idle, 1);
78 	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
79 }
80 
81 static void __exit_idle(void)
82 {
83 	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
84 		return;
85 	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
86 }
87 
88 /* Called from interrupts to signify idle end */
89 void exit_idle(void)
90 {
91 	/* idle loop has pid 0 */
92 	if (current->pid)
93 		return;
94 	__exit_idle();
95 }
96 
97 #ifndef CONFIG_SMP
98 static inline void play_dead(void)
99 {
100 	BUG();
101 }
102 #endif
103 
104 /*
105  * The idle thread. There's no useful work to be
106  * done, so just try to conserve power and have a
107  * low exit latency (ie sit in a loop waiting for
108  * somebody to say that they'd like to reschedule)
109  */
110 void cpu_idle(void)
111 {
112 	current_thread_info()->status |= TS_POLLING;
113 
114 	/*
115 	 * If we're the non-boot CPU, nothing set the stack canary up
116 	 * for us.  CPU0 already has it initialized but no harm in
117 	 * doing it again.  This is a good place for updating it, as
118 	 * we wont ever return from this function (so the invalid
119 	 * canaries already on the stack wont ever trigger).
120 	 */
121 	boot_init_stack_canary();
122 
123 	/* endless idle loop with no priority at all */
124 	while (1) {
125 		tick_nohz_idle_enter();
126 		while (!need_resched()) {
127 
128 			rmb();
129 
130 			if (cpu_is_offline(smp_processor_id()))
131 				play_dead();
132 			/*
133 			 * Idle routines should keep interrupts disabled
134 			 * from here on, until they go to idle.
135 			 * Otherwise, idle callbacks can misfire.
136 			 */
137 			local_touch_nmi();
138 			local_irq_disable();
139 			enter_idle();
140 			/* Don't trace irqs off for idle */
141 			stop_critical_timings();
142 
143 			/* enter_idle() needs rcu for notifiers */
144 			rcu_idle_enter();
145 
146 			if (cpuidle_idle_call())
147 				pm_idle();
148 
149 			rcu_idle_exit();
150 			start_critical_timings();
151 
152 			/* In many cases the interrupt that ended idle
153 			   has already called exit_idle. But some idle
154 			   loops can be woken up without interrupt. */
155 			__exit_idle();
156 		}
157 
158 		tick_nohz_idle_exit();
159 		preempt_enable_no_resched();
160 		schedule();
161 		preempt_disable();
162 	}
163 }
164 
165 /* Prints also some state that isn't saved in the pt_regs */
166 void __show_regs(struct pt_regs *regs, int all)
167 {
168 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
169 	unsigned long d0, d1, d2, d3, d6, d7;
170 	unsigned int fsindex, gsindex;
171 	unsigned int ds, cs, es;
172 
173 	show_regs_common();
174 	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
175 	printk_address(regs->ip, 1);
176 	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
177 			regs->sp, regs->flags);
178 	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
179 	       regs->ax, regs->bx, regs->cx);
180 	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
181 	       regs->dx, regs->si, regs->di);
182 	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
183 	       regs->bp, regs->r8, regs->r9);
184 	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
185 	       regs->r10, regs->r11, regs->r12);
186 	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
187 	       regs->r13, regs->r14, regs->r15);
188 
189 	asm("movl %%ds,%0" : "=r" (ds));
190 	asm("movl %%cs,%0" : "=r" (cs));
191 	asm("movl %%es,%0" : "=r" (es));
192 	asm("movl %%fs,%0" : "=r" (fsindex));
193 	asm("movl %%gs,%0" : "=r" (gsindex));
194 
195 	rdmsrl(MSR_FS_BASE, fs);
196 	rdmsrl(MSR_GS_BASE, gs);
197 	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
198 
199 	if (!all)
200 		return;
201 
202 	cr0 = read_cr0();
203 	cr2 = read_cr2();
204 	cr3 = read_cr3();
205 	cr4 = read_cr4();
206 
207 	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
208 	       fs, fsindex, gs, gsindex, shadowgs);
209 	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
210 			es, cr0);
211 	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
212 			cr4);
213 
214 	get_debugreg(d0, 0);
215 	get_debugreg(d1, 1);
216 	get_debugreg(d2, 2);
217 	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
218 	get_debugreg(d3, 3);
219 	get_debugreg(d6, 6);
220 	get_debugreg(d7, 7);
221 	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
222 }
223 
224 void release_thread(struct task_struct *dead_task)
225 {
226 	if (dead_task->mm) {
227 		if (dead_task->mm->context.size) {
228 			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
229 					dead_task->comm,
230 					dead_task->mm->context.ldt,
231 					dead_task->mm->context.size);
232 			BUG();
233 		}
234 	}
235 }
236 
237 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
238 {
239 	struct user_desc ud = {
240 		.base_addr = addr,
241 		.limit = 0xfffff,
242 		.seg_32bit = 1,
243 		.limit_in_pages = 1,
244 		.useable = 1,
245 	};
246 	struct desc_struct *desc = t->thread.tls_array;
247 	desc += tls;
248 	fill_ldt(desc, &ud);
249 }
250 
251 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
252 {
253 	return get_desc_base(&t->thread.tls_array[tls]);
254 }
255 
256 /*
257  * This gets called before we allocate a new thread and copy
258  * the current task into it.
259  */
260 void prepare_to_copy(struct task_struct *tsk)
261 {
262 	unlazy_fpu(tsk);
263 }
264 
265 int copy_thread(unsigned long clone_flags, unsigned long sp,
266 		unsigned long unused,
267 	struct task_struct *p, struct pt_regs *regs)
268 {
269 	int err;
270 	struct pt_regs *childregs;
271 	struct task_struct *me = current;
272 
273 	childregs = ((struct pt_regs *)
274 			(THREAD_SIZE + task_stack_page(p))) - 1;
275 	*childregs = *regs;
276 
277 	childregs->ax = 0;
278 	if (user_mode(regs))
279 		childregs->sp = sp;
280 	else
281 		childregs->sp = (unsigned long)childregs;
282 
283 	p->thread.sp = (unsigned long) childregs;
284 	p->thread.sp0 = (unsigned long) (childregs+1);
285 	p->thread.usersp = me->thread.usersp;
286 
287 	set_tsk_thread_flag(p, TIF_FORK);
288 
289 	p->fpu_counter = 0;
290 	p->thread.io_bitmap_ptr = NULL;
291 
292 	savesegment(gs, p->thread.gsindex);
293 	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
294 	savesegment(fs, p->thread.fsindex);
295 	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
296 	savesegment(es, p->thread.es);
297 	savesegment(ds, p->thread.ds);
298 
299 	err = -ENOMEM;
300 	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
301 
302 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
303 		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
304 						  IO_BITMAP_BYTES, GFP_KERNEL);
305 		if (!p->thread.io_bitmap_ptr) {
306 			p->thread.io_bitmap_max = 0;
307 			return -ENOMEM;
308 		}
309 		set_tsk_thread_flag(p, TIF_IO_BITMAP);
310 	}
311 
312 	/*
313 	 * Set a new TLS for the child thread?
314 	 */
315 	if (clone_flags & CLONE_SETTLS) {
316 #ifdef CONFIG_IA32_EMULATION
317 		if (test_thread_flag(TIF_IA32))
318 			err = do_set_thread_area(p, -1,
319 				(struct user_desc __user *)childregs->si, 0);
320 		else
321 #endif
322 			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
323 		if (err)
324 			goto out;
325 	}
326 	err = 0;
327 out:
328 	if (err && p->thread.io_bitmap_ptr) {
329 		kfree(p->thread.io_bitmap_ptr);
330 		p->thread.io_bitmap_max = 0;
331 	}
332 
333 	return err;
334 }
335 
336 static void
337 start_thread_common(struct pt_regs *regs, unsigned long new_ip,
338 		    unsigned long new_sp,
339 		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
340 {
341 	loadsegment(fs, 0);
342 	loadsegment(es, _ds);
343 	loadsegment(ds, _ds);
344 	load_gs_index(0);
345 	regs->ip		= new_ip;
346 	regs->sp		= new_sp;
347 	percpu_write(old_rsp, new_sp);
348 	regs->cs		= _cs;
349 	regs->ss		= _ss;
350 	regs->flags		= X86_EFLAGS_IF;
351 	/*
352 	 * Free the old FP and other extended state
353 	 */
354 	free_thread_xstate(current);
355 }
356 
357 void
358 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
359 {
360 	start_thread_common(regs, new_ip, new_sp,
361 			    __USER_CS, __USER_DS, 0);
362 }
363 
364 #ifdef CONFIG_IA32_EMULATION
365 void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
366 {
367 	start_thread_common(regs, new_ip, new_sp,
368 			    __USER32_CS, __USER32_DS, __USER32_DS);
369 }
370 #endif
371 
372 /*
373  *	switch_to(x,y) should switch tasks from x to y.
374  *
375  * This could still be optimized:
376  * - fold all the options into a flag word and test it with a single test.
377  * - could test fs/gs bitsliced
378  *
379  * Kprobes not supported here. Set the probe on schedule instead.
380  * Function graph tracer not supported too.
381  */
382 __notrace_funcgraph struct task_struct *
383 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
384 {
385 	struct thread_struct *prev = &prev_p->thread;
386 	struct thread_struct *next = &next_p->thread;
387 	int cpu = smp_processor_id();
388 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
389 	unsigned fsindex, gsindex;
390 	fpu_switch_t fpu;
391 
392 	fpu = switch_fpu_prepare(prev_p, next_p, cpu);
393 
394 	/*
395 	 * Reload esp0, LDT and the page table pointer:
396 	 */
397 	load_sp0(tss, next);
398 
399 	/*
400 	 * Switch DS and ES.
401 	 * This won't pick up thread selector changes, but I guess that is ok.
402 	 */
403 	savesegment(es, prev->es);
404 	if (unlikely(next->es | prev->es))
405 		loadsegment(es, next->es);
406 
407 	savesegment(ds, prev->ds);
408 	if (unlikely(next->ds | prev->ds))
409 		loadsegment(ds, next->ds);
410 
411 
412 	/* We must save %fs and %gs before load_TLS() because
413 	 * %fs and %gs may be cleared by load_TLS().
414 	 *
415 	 * (e.g. xen_load_tls())
416 	 */
417 	savesegment(fs, fsindex);
418 	savesegment(gs, gsindex);
419 
420 	load_TLS(next, cpu);
421 
422 	/*
423 	 * Leave lazy mode, flushing any hypercalls made here.
424 	 * This must be done before restoring TLS segments so
425 	 * the GDT and LDT are properly updated, and must be
426 	 * done before math_state_restore, so the TS bit is up
427 	 * to date.
428 	 */
429 	arch_end_context_switch(next_p);
430 
431 	/*
432 	 * Switch FS and GS.
433 	 *
434 	 * Segment register != 0 always requires a reload.  Also
435 	 * reload when it has changed.  When prev process used 64bit
436 	 * base always reload to avoid an information leak.
437 	 */
438 	if (unlikely(fsindex | next->fsindex | prev->fs)) {
439 		loadsegment(fs, next->fsindex);
440 		/*
441 		 * Check if the user used a selector != 0; if yes
442 		 *  clear 64bit base, since overloaded base is always
443 		 *  mapped to the Null selector
444 		 */
445 		if (fsindex)
446 			prev->fs = 0;
447 	}
448 	/* when next process has a 64bit base use it */
449 	if (next->fs)
450 		wrmsrl(MSR_FS_BASE, next->fs);
451 	prev->fsindex = fsindex;
452 
453 	if (unlikely(gsindex | next->gsindex | prev->gs)) {
454 		load_gs_index(next->gsindex);
455 		if (gsindex)
456 			prev->gs = 0;
457 	}
458 	if (next->gs)
459 		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
460 	prev->gsindex = gsindex;
461 
462 	switch_fpu_finish(next_p, fpu);
463 
464 	/*
465 	 * Switch the PDA and FPU contexts.
466 	 */
467 	prev->usersp = percpu_read(old_rsp);
468 	percpu_write(old_rsp, next->usersp);
469 	percpu_write(current_task, next_p);
470 
471 	percpu_write(kernel_stack,
472 		  (unsigned long)task_stack_page(next_p) +
473 		  THREAD_SIZE - KERNEL_STACK_OFFSET);
474 
475 	/*
476 	 * Now maybe reload the debug registers and handle I/O bitmaps
477 	 */
478 	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
479 		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
480 		__switch_to_xtra(prev_p, next_p, tss);
481 
482 	return prev_p;
483 }
484 
485 void set_personality_64bit(void)
486 {
487 	/* inherit personality from parent */
488 
489 	/* Make sure to be in 64bit mode */
490 	clear_thread_flag(TIF_IA32);
491 
492 	/* Ensure the corresponding mm is not marked. */
493 	if (current->mm)
494 		current->mm->context.ia32_compat = 0;
495 
496 	/* TBD: overwrites user setup. Should have two bits.
497 	   But 64bit processes have always behaved this way,
498 	   so it's not too bad. The main problem is just that
499 	   32bit childs are affected again. */
500 	current->personality &= ~READ_IMPLIES_EXEC;
501 }
502 
503 void set_personality_ia32(void)
504 {
505 	/* inherit personality from parent */
506 
507 	/* Make sure to be in 32bit mode */
508 	set_thread_flag(TIF_IA32);
509 	current->personality |= force_personality32;
510 
511 	/* Mark the associated mm as containing 32-bit tasks. */
512 	if (current->mm)
513 		current->mm->context.ia32_compat = 1;
514 
515 	/* Prepare the first "return" to user space */
516 	current_thread_info()->status |= TS_COMPAT;
517 }
518 
519 unsigned long get_wchan(struct task_struct *p)
520 {
521 	unsigned long stack;
522 	u64 fp, ip;
523 	int count = 0;
524 
525 	if (!p || p == current || p->state == TASK_RUNNING)
526 		return 0;
527 	stack = (unsigned long)task_stack_page(p);
528 	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
529 		return 0;
530 	fp = *(u64 *)(p->thread.sp);
531 	do {
532 		if (fp < (unsigned long)stack ||
533 		    fp >= (unsigned long)stack+THREAD_SIZE)
534 			return 0;
535 		ip = *(u64 *)(fp+8);
536 		if (!in_sched_functions(ip))
537 			return ip;
538 		fp = *(u64 *)fp;
539 	} while (count++ < 16);
540 	return 0;
541 }
542 
543 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
544 {
545 	int ret = 0;
546 	int doit = task == current;
547 	int cpu;
548 
549 	switch (code) {
550 	case ARCH_SET_GS:
551 		if (addr >= TASK_SIZE_OF(task))
552 			return -EPERM;
553 		cpu = get_cpu();
554 		/* handle small bases via the GDT because that's faster to
555 		   switch. */
556 		if (addr <= 0xffffffff) {
557 			set_32bit_tls(task, GS_TLS, addr);
558 			if (doit) {
559 				load_TLS(&task->thread, cpu);
560 				load_gs_index(GS_TLS_SEL);
561 			}
562 			task->thread.gsindex = GS_TLS_SEL;
563 			task->thread.gs = 0;
564 		} else {
565 			task->thread.gsindex = 0;
566 			task->thread.gs = addr;
567 			if (doit) {
568 				load_gs_index(0);
569 				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
570 			}
571 		}
572 		put_cpu();
573 		break;
574 	case ARCH_SET_FS:
575 		/* Not strictly needed for fs, but do it for symmetry
576 		   with gs */
577 		if (addr >= TASK_SIZE_OF(task))
578 			return -EPERM;
579 		cpu = get_cpu();
580 		/* handle small bases via the GDT because that's faster to
581 		   switch. */
582 		if (addr <= 0xffffffff) {
583 			set_32bit_tls(task, FS_TLS, addr);
584 			if (doit) {
585 				load_TLS(&task->thread, cpu);
586 				loadsegment(fs, FS_TLS_SEL);
587 			}
588 			task->thread.fsindex = FS_TLS_SEL;
589 			task->thread.fs = 0;
590 		} else {
591 			task->thread.fsindex = 0;
592 			task->thread.fs = addr;
593 			if (doit) {
594 				/* set the selector to 0 to not confuse
595 				   __switch_to */
596 				loadsegment(fs, 0);
597 				ret = checking_wrmsrl(MSR_FS_BASE, addr);
598 			}
599 		}
600 		put_cpu();
601 		break;
602 	case ARCH_GET_FS: {
603 		unsigned long base;
604 		if (task->thread.fsindex == FS_TLS_SEL)
605 			base = read_32bit_tls(task, FS_TLS);
606 		else if (doit)
607 			rdmsrl(MSR_FS_BASE, base);
608 		else
609 			base = task->thread.fs;
610 		ret = put_user(base, (unsigned long __user *)addr);
611 		break;
612 	}
613 	case ARCH_GET_GS: {
614 		unsigned long base;
615 		unsigned gsindex;
616 		if (task->thread.gsindex == GS_TLS_SEL)
617 			base = read_32bit_tls(task, GS_TLS);
618 		else if (doit) {
619 			savesegment(gs, gsindex);
620 			if (gsindex)
621 				rdmsrl(MSR_KERNEL_GS_BASE, base);
622 			else
623 				base = task->thread.gs;
624 		} else
625 			base = task->thread.gs;
626 		ret = put_user(base, (unsigned long __user *)addr);
627 		break;
628 	}
629 
630 	default:
631 		ret = -EINVAL;
632 		break;
633 	}
634 
635 	return ret;
636 }
637 
638 long sys_arch_prctl(int code, unsigned long addr)
639 {
640 	return do_arch_prctl(current, code, addr);
641 }
642 
643 unsigned long KSTK_ESP(struct task_struct *task)
644 {
645 	return (test_tsk_thread_flag(task, TIF_IA32)) ?
646 			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
647 }
648