/* xref: /openbmc/linux/arch/x86/kernel/process_64.c (revision a09d2831) */
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>
#include <asm/debugreg.h>

asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
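
/*
 * Illustrative sketch (not part of the original file): a driver could hook
 * the idle notifier chain exported above roughly like this; the example_*
 * names are hypothetical.
 *
 *	static int example_idle_notify(struct notifier_block *nb,
 *				       unsigned long action, void *unused)
 *	{
 *		if (action == IDLE_START)
 *			pr_debug("entering idle\n");
 *		else if (action == IDLE_END)
 *			pr_debug("leaving idle\n");
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block example_idle_nb = {
 *		.notifier_call = example_idle_notify,
 *	};
 *
 *	...
 *	idle_notifier_register(&example_idle_nb);
 */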

void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us.  CPU0 already has it initialized but no harm in
	 * doing it again.  This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/*
			 * In many cases the interrupt that ended idle
			 * has already called exit_idle.  But some idle
			 * loops can be woken up without interrupt.
			 */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	show_regs_common();
	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
	show_registers(regs);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

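/*
 * GDT-based TLS helpers: set_32bit_tls() installs a 32-bit descriptor
 * (4GB limit, page granularity) in the given TLS slot of @t, and
 * read_32bit_tls() returns its base.  do_arch_prctl() below uses these
 * to service FS/GS bases that fit in 32 bits through the GDT instead of
 * the FS_BASE/KERNEL_GS_BASE MSRs.
 */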
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	if (user_mode(regs))
		childregs->sp = sp;
	else
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	err = -ENOMEM;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}

	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
	p->thread.ds_ctx = NULL;

	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
	p->thread.debugctlmsr = 0;

	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER32_CS, __USER32_DS, __USER32_DS);
}
#endif
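
/*
 * Note (added for illustration, not in the original file): the start_thread()
 * variants above are invoked by the binfmt loaders when a new program image
 * is exec'ed; e.g. the ELF loader hands control to user space with roughly
 *
 *	start_thread(regs, elf_entry, bprm->p);
 *
 * which goes through start_thread_common() to reset the segment registers
 * and load the new user IP/SP.
 */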

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here; set the probe on schedule() instead.
 * The function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	bool preload_fpu;

	/*
	 * If the task has used the FPU in the last 5 timeslices, just do a
	 * full restore of the math state immediately to avoid the trap; the
	 * chances of needing the FPU soon are obviously high now
	 */
	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

	/* we're going to use this soon, after a few expensive things */
	if (preload_fpu)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);


	/*
	 * We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/* Make sure cpu is ready for new context */
	if (preload_fpu)
		clts();

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * A segment register != 0 always requires a reload.  Also
	 * reload when it has changed.  When the prev process used a
	 * 64-bit base, always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear the 64-bit base, since the overloaded base is
		 * always mapped to the NULL selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/*
	 * Preload the FPU context, now that we've determined that the
	 * task is likely to be using it.
	 */
	if (preload_fpu)
		__math_state_restore();

	return prev_p;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/*
	 * TBD: this overwrites the user setup.  Should have two bits.
	 * But 64-bit processes have always behaved this way, so it's
	 * not too bad.  The main problem is just that 32-bit children
	 * are affected again.
	 */
	current->personality &= ~READ_IMPLIES_EXEC;
}

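/*
 * get_wchan(): walk the saved frame-pointer chain of a sleeping task to
 * find the first return address outside the scheduler.  Each frame is
 * assumed to look like
 *
 *	fp    -> saved caller %rbp
 *	fp+8  -> return address
 *
 * and the walk gives up after 16 frames or when the chain leaves the
 * task's stack.
 */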
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
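
/*
 * Illustrative user-space sketch (not part of the original file): glibc
 * does not wrap this syscall, so it is normally reached via syscall(2).
 * The some_base value below is a caller-chosen placeholder.
 *
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <asm/prctl.h>
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, &base);
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, some_base);
 */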

unsigned long KSTK_ESP(struct task_struct *task)
{
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}