xref: /openbmc/linux/arch/x86/kernel/process_64.c (revision baa7eb025ab14f3cba2e35c0a8648f9c9f01d24f)
1 /*
2  *  Copyright (C) 1995  Linus Torvalds
3  *
4  *  Pentium III FXSR, SSE support
5  *	Gareth Hughes <gareth@valinux.com>, May 2000
6  *
7  *  X86-64 port
8  *	Andi Kleen.
9  *
10  *	CPU hotplug support - ashok.raj@intel.com
11  */
12 
13 /*
14  * This file handles the architecture-dependent parts of process handling.
15  */
16 
17 #include <linux/stackprotector.h>
18 #include <linux/cpu.h>
19 #include <linux/errno.h>
20 #include <linux/sched.h>
21 #include <linux/fs.h>
22 #include <linux/kernel.h>
23 #include <linux/mm.h>
24 #include <linux/elfcore.h>
25 #include <linux/smp.h>
26 #include <linux/slab.h>
27 #include <linux/user.h>
28 #include <linux/interrupt.h>
29 #include <linux/delay.h>
30 #include <linux/module.h>
31 #include <linux/ptrace.h>
32 #include <linux/notifier.h>
33 #include <linux/kprobes.h>
34 #include <linux/kdebug.h>
35 #include <linux/tick.h>
36 #include <linux/prctl.h>
37 #include <linux/uaccess.h>
38 #include <linux/io.h>
39 #include <linux/ftrace.h>
40 
41 #include <asm/pgtable.h>
42 #include <asm/system.h>
43 #include <asm/processor.h>
44 #include <asm/i387.h>
45 #include <asm/mmu_context.h>
46 #include <asm/prctl.h>
47 #include <asm/desc.h>
48 #include <asm/proto.h>
49 #include <asm/ia32.h>
50 #include <asm/idle.h>
51 #include <asm/syscalls.h>
52 #include <asm/debugreg.h>
53 
54 #include <trace/events/power.h>
55 
56 asmlinkage extern void ret_from_fork(void);
57 
58 DEFINE_PER_CPU(unsigned long, old_rsp);
59 static DEFINE_PER_CPU(unsigned char, is_idle);
60 
61 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
62 
63 void idle_notifier_register(struct notifier_block *n)
64 {
65 	atomic_notifier_chain_register(&idle_notifier, n);
66 }
67 EXPORT_SYMBOL_GPL(idle_notifier_register);
68 
69 void idle_notifier_unregister(struct notifier_block *n)
70 {
71 	atomic_notifier_chain_unregister(&idle_notifier, n);
72 }
73 EXPORT_SYMBOL_GPL(idle_notifier_unregister);
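/*
 * Added usage sketch (not part of the original file): a driver that wants to
 * know when this CPU enters or leaves idle could hook the chain roughly like
 * this; "my_idle_nb" and "my_idle_event" are made-up names.  The chain is
 * atomic and runs in the idle/interrupt path, so the callback must not sleep.
 *
 *	static int my_idle_event(struct notifier_block *nb,
 *				 unsigned long action, void *unused)
 *	{
 *		if (action == IDLE_START)
 *			pr_debug("cpu %d entering idle\n", smp_processor_id());
 *		else if (action == IDLE_END)
 *			pr_debug("cpu %d leaving idle\n", smp_processor_id());
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call = my_idle_event,
 *	};
 *
 *	idle_notifier_register(&my_idle_nb);
 */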
74 
75 void enter_idle(void)
76 {
77 	percpu_write(is_idle, 1);
78 	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
79 }
80 
81 static void __exit_idle(void)
82 {
83 	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
84 		return;
85 	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
86 }
87 
88 /* Called from interrupts to signify idle end */
89 void exit_idle(void)
90 {
91 	/* idle loop has pid 0 */
92 	if (current->pid)
93 		return;
94 	__exit_idle();
95 }
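/*
 * Added note: exit_idle() is called from the interrupt entry paths (for
 * example do_IRQ() and the APIC interrupt handlers), so the pid check above
 * keeps the IDLE_END notification from firing when an interrupt lands in an
 * ordinary task rather than in the idle loop, whose task has pid 0.
 */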
96 
97 #ifndef CONFIG_SMP
98 static inline void play_dead(void)
99 {
100 	BUG();
101 }
102 #endif
103 
104 /*
105  * The idle thread. There's no useful work to be
106  * done, so just try to conserve power and have a
107  * low exit latency (i.e. sit in a loop waiting for
108  * somebody to say that they'd like to reschedule).
109  */
110 void cpu_idle(void)
111 {
112 	current_thread_info()->status |= TS_POLLING;
113 
114 	/*
115 	 * If we're the non-boot CPU, nothing set the stack canary up
116 	 * for us.  CPU0 already has it initialized but no harm in
117 	 * doing it again.  This is a good place for updating it, as
118 	 * we won't ever return from this function (so the invalid
119 	 * canaries already on the stack won't ever trigger).
120 	 */
121 	boot_init_stack_canary();
122 
123 	/* endless idle loop with no priority at all */
124 	while (1) {
125 		tick_nohz_stop_sched_tick(1);
126 		while (!need_resched()) {
127 
128 			rmb();
129 
130 			if (cpu_is_offline(smp_processor_id()))
131 				play_dead();
132 			/*
133 			 * Idle routines should keep interrupts disabled
134 			 * from here on, until they go to idle.
135 			 * Otherwise, idle callbacks can misfire.
136 			 */
137 			local_irq_disable();
138 			enter_idle();
139 			/* Don't trace irqs off for idle */
140 			stop_critical_timings();
141 			pm_idle();
142 			start_critical_timings();
143 
144 			trace_power_end(smp_processor_id());
145 			trace_cpu_idle(PWR_EVENT_EXIT,
146 				       smp_processor_id());
147 
148 			/* In many cases the interrupt that ended idle
149 			   has already called exit_idle. But some idle
150 			   loops can be woken up without an interrupt. */
151 			__exit_idle();
152 		}
153 
154 		tick_nohz_restart_sched_tick();
155 		preempt_enable_no_resched();
156 		schedule();
157 		preempt_disable();
158 	}
159 }
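/*
 * Added sketch (hedged; "my_idle" is a made-up name and this mirrors the
 * core of default_idle() rather than code in this file): the "keep
 * interrupts disabled until you go idle" contract noted in the loop above
 * means a pm_idle() implementation looks roughly like
 *
 *	static void my_idle(void)
 *	{
 *		if (!need_resched())
 *			safe_halt();
 *		else
 *			local_irq_enable();
 *	}
 *
 * where safe_halt() issues STI immediately followed by HLT, so a wakeup
 * interrupt can only be taken once the CPU is actually halted and the
 * enter_idle()/__exit_idle() notifications stay properly paired.
 */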
160 
161 /* Also prints some state that isn't saved in pt_regs */
162 void __show_regs(struct pt_regs *regs, int all)
163 {
164 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
165 	unsigned long d0, d1, d2, d3, d6, d7;
166 	unsigned int fsindex, gsindex;
167 	unsigned int ds, cs, es;
168 
169 	show_regs_common();
170 	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
171 	printk_address(regs->ip, 1);
172 	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
173 			regs->sp, regs->flags);
174 	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
175 	       regs->ax, regs->bx, regs->cx);
176 	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
177 	       regs->dx, regs->si, regs->di);
178 	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
179 	       regs->bp, regs->r8, regs->r9);
180 	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
181 	       regs->r10, regs->r11, regs->r12);
182 	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
183 	       regs->r13, regs->r14, regs->r15);
184 
185 	asm("movl %%ds,%0" : "=r" (ds));
186 	asm("movl %%cs,%0" : "=r" (cs));
187 	asm("movl %%es,%0" : "=r" (es));
188 	asm("movl %%fs,%0" : "=r" (fsindex));
189 	asm("movl %%gs,%0" : "=r" (gsindex));
190 
191 	rdmsrl(MSR_FS_BASE, fs);
192 	rdmsrl(MSR_GS_BASE, gs);
193 	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
194 
195 	if (!all)
196 		return;
197 
198 	cr0 = read_cr0();
199 	cr2 = read_cr2();
200 	cr3 = read_cr3();
201 	cr4 = read_cr4();
202 
203 	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
204 	       fs, fsindex, gs, gsindex, shadowgs);
205 	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
206 			es, cr0);
207 	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
208 			cr4);
209 
210 	get_debugreg(d0, 0);
211 	get_debugreg(d1, 1);
212 	get_debugreg(d2, 2);
213 	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
214 	get_debugreg(d3, 3);
215 	get_debugreg(d6, 6);
216 	get_debugreg(d7, 7);
217 	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
218 }
219 
220 void release_thread(struct task_struct *dead_task)
221 {
222 	if (dead_task->mm) {
223 		if (dead_task->mm->context.size) {
224 			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
225 					dead_task->comm,
226 					dead_task->mm->context.ldt,
227 					dead_task->mm->context.size);
228 			BUG();
229 		}
230 	}
231 }
232 
233 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
234 {
235 	struct user_desc ud = {
236 		.base_addr = addr,
237 		.limit = 0xfffff,
238 		.seg_32bit = 1,
239 		.limit_in_pages = 1,
240 		.useable = 1,
241 	};
242 	struct desc_struct *desc = t->thread.tls_array;
243 	desc += tls;
244 	fill_ldt(desc, &ud);
245 }
246 
247 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
248 {
249 	return get_desc_base(&t->thread.tls_array[tls]);
250 }
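/*
 * Added note: the two helpers above are each other's inverse.  fill_ldt()
 * scatters ud.base_addr across the base0/base1/base2 fields of the GDT
 * descriptor, and get_desc_base() reassembles it, roughly
 *
 *	base = desc->base0 | (desc->base1 << 16) | (desc->base2 << 24);
 *
 * so read_32bit_tls() recovers exactly the 32-bit base that set_32bit_tls()
 * stored in the FS_TLS/GS_TLS slot.
 */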
251 
252 /*
253  * This gets called before we allocate a new thread and copy
254  * the current task into it.
255  */
256 void prepare_to_copy(struct task_struct *tsk)
257 {
258 	unlazy_fpu(tsk);
259 }
260 
261 int copy_thread(unsigned long clone_flags, unsigned long sp,
262 		unsigned long unused,
263 	struct task_struct *p, struct pt_regs *regs)
264 {
265 	int err;
266 	struct pt_regs *childregs;
267 	struct task_struct *me = current;
268 
269 	childregs = ((struct pt_regs *)
270 			(THREAD_SIZE + task_stack_page(p))) - 1;
271 	*childregs = *regs;
272 
273 	childregs->ax = 0;
274 	if (user_mode(regs))
275 		childregs->sp = sp;
276 	else
277 		childregs->sp = (unsigned long)childregs;
278 
279 	p->thread.sp = (unsigned long) childregs;
280 	p->thread.sp0 = (unsigned long) (childregs+1);
281 	p->thread.usersp = me->thread.usersp;
282 
283 	set_tsk_thread_flag(p, TIF_FORK);
284 
285 	p->thread.io_bitmap_ptr = NULL;
286 
287 	savesegment(gs, p->thread.gsindex);
288 	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
289 	savesegment(fs, p->thread.fsindex);
290 	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
291 	savesegment(es, p->thread.es);
292 	savesegment(ds, p->thread.ds);
293 
294 	err = -ENOMEM;
295 	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
296 
297 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
298 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
299 		if (!p->thread.io_bitmap_ptr) {
300 			p->thread.io_bitmap_max = 0;
301 			return -ENOMEM;
302 		}
303 		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
304 				IO_BITMAP_BYTES);
305 		set_tsk_thread_flag(p, TIF_IO_BITMAP);
306 	}
307 
308 	/*
309 	 * Set a new TLS for the child thread?
310 	 */
311 	if (clone_flags & CLONE_SETTLS) {
312 #ifdef CONFIG_IA32_EMULATION
313 		if (test_thread_flag(TIF_IA32))
314 			err = do_set_thread_area(p, -1,
315 				(struct user_desc __user *)childregs->si, 0);
316 		else
317 #endif
318 			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
319 		if (err)
320 			goto out;
321 	}
322 	err = 0;
323 out:
324 	if (err && p->thread.io_bitmap_ptr) {
325 		kfree(p->thread.io_bitmap_ptr);
326 		p->thread.io_bitmap_max = 0;
327 	}
328 
329 	return err;
330 }
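/*
 * Added example (illustrative, not from this file): for a 64-bit caller the
 * raw clone() syscall passes the new TLS value as its fifth argument, which
 * the syscall ABI places in %r8; that is the childregs->r8 consumed above
 * and turned into the child's FS base via do_arch_prctl(ARCH_SET_FS, ...):
 *
 *	syscall(SYS_clone, CLONE_VM | CLONE_THREAD | CLONE_SETTLS | ...,
 *		child_stack, NULL, NULL, new_tls);
 *
 * A 32-bit caller under ia32 emulation instead passes a struct user_desc
 * pointer, which arrives in %esi (childregs->si) and is handled by
 * do_set_thread_area() above.
 */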
331 
332 static void
333 start_thread_common(struct pt_regs *regs, unsigned long new_ip,
334 		    unsigned long new_sp,
335 		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
336 {
337 	loadsegment(fs, 0);
338 	loadsegment(es, _ds);
339 	loadsegment(ds, _ds);
340 	load_gs_index(0);
341 	regs->ip		= new_ip;
342 	regs->sp		= new_sp;
343 	percpu_write(old_rsp, new_sp);
344 	regs->cs		= _cs;
345 	regs->ss		= _ss;
346 	regs->flags		= X86_EFLAGS_IF;
347 	set_fs(USER_DS);
348 	/*
349 	 * Free the old FP and other extended state
350 	 */
351 	free_thread_xstate(current);
352 }
353 
354 void
355 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
356 {
357 	start_thread_common(regs, new_ip, new_sp,
358 			    __USER_CS, __USER_DS, 0);
359 }
360 
361 #ifdef CONFIG_IA32_EMULATION
362 void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
363 {
364 	start_thread_common(regs, new_ip, new_sp,
365 			    __USER32_CS, __USER32_DS, __USER32_DS);
366 }
367 #endif
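/*
 * Added context (hedged): these are the exec-time entry points.  A binfmt
 * loader such as load_elf_binary() finishes roughly with
 *
 *	start_thread(regs, elf_entry, bprm->p);
 *
 * which rewrites the saved register image so that the return to user mode
 * from execve() lands at the new program's entry point on the freshly built
 * user stack, with fresh flat segments.
 */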
368 
369 /*
370  *	switch_to(x,y) should switch tasks from x to y.
371  *
372  * This could still be optimized:
373  * - fold all the options into a flag word and test it with a single test.
374  * - could test fs/gs bitsliced
375  *
376  * Kprobes not supported here. Set the probe on schedule instead.
377  * The function graph tracer is not supported either.
378  */
379 __notrace_funcgraph struct task_struct *
380 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
381 {
382 	struct thread_struct *prev = &prev_p->thread;
383 	struct thread_struct *next = &next_p->thread;
384 	int cpu = smp_processor_id();
385 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
386 	unsigned fsindex, gsindex;
387 	bool preload_fpu;
388 
389 	/*
390 	 * If the task has used the FPU in the last 5 timeslices, just do a
391 	 * full restore of the math state immediately to avoid the trap; the
392 	 * chances of needing the FPU soon are obviously high now.
393 	 */
394 	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
395 
396 	/* we're going to use this soon, after a few expensive things */
397 	if (preload_fpu)
398 		prefetch(next->fpu.state);
399 
400 	/*
401 	 * Reload esp0, LDT and the page table pointer:
402 	 */
403 	load_sp0(tss, next);
404 
405 	/*
406 	 * Switch DS and ES.
407 	 * This won't pick up thread selector changes, but I guess that is ok.
408 	 */
409 	savesegment(es, prev->es);
410 	if (unlikely(next->es | prev->es))
411 		loadsegment(es, next->es);
412 
413 	savesegment(ds, prev->ds);
414 	if (unlikely(next->ds | prev->ds))
415 		loadsegment(ds, next->ds);
416 
417 
418 	/* We must save %fs and %gs before load_TLS() because
419 	 * %fs and %gs may be cleared by load_TLS().
420 	 *
421 	 * (e.g. xen_load_tls())
422 	 */
423 	savesegment(fs, fsindex);
424 	savesegment(gs, gsindex);
425 
426 	load_TLS(next, cpu);
427 
428 	/* Must be after DS reload */
429 	__unlazy_fpu(prev_p);
430 
431 	/* Make sure cpu is ready for new context */
432 	if (preload_fpu)
433 		clts();
434 
435 	/*
436 	 * Leave lazy mode, flushing any hypercalls made here.
437 	 * This must be done before restoring TLS segments so
438 	 * the GDT and LDT are properly updated, and must be
439 	 * done before math_state_restore, so the TS bit is up
440 	 * to date.
441 	 */
442 	arch_end_context_switch(next_p);
443 
444 	/*
445 	 * Switch FS and GS.
446 	 *
447 	 * A non-zero segment register always requires a reload.  Also
448 	 * reload when it has changed.  When the previous process used a
449 	 * 64-bit base, always reload to avoid an information leak.
450 	 */
451 	if (unlikely(fsindex | next->fsindex | prev->fs)) {
452 		loadsegment(fs, next->fsindex);
453 		/*
454 		 * If the user used a non-zero selector, clear the 64-bit
455 		 * base, since the overloaded base is always mapped to the
456 		 * null selector.
457 		 */
458 		if (fsindex)
459 			prev->fs = 0;
460 	}
461 	/* When the next process has a 64-bit base, use it */
462 	if (next->fs)
463 		wrmsrl(MSR_FS_BASE, next->fs);
464 	prev->fsindex = fsindex;
465 
466 	if (unlikely(gsindex | next->gsindex | prev->gs)) {
467 		load_gs_index(next->gsindex);
468 		if (gsindex)
469 			prev->gs = 0;
470 	}
471 	if (next->gs)
472 		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
473 	prev->gsindex = gsindex;
474 
475 	/*
476 	 * Switch the PDA and FPU contexts.
477 	 */
478 	prev->usersp = percpu_read(old_rsp);
479 	percpu_write(old_rsp, next->usersp);
480 	percpu_write(current_task, next_p);
481 
482 	percpu_write(kernel_stack,
483 		  (unsigned long)task_stack_page(next_p) +
484 		  THREAD_SIZE - KERNEL_STACK_OFFSET);
485 
486 	/*
487 	 * Now maybe reload the debug registers and handle I/O bitmaps
488 	 */
489 	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
490 		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
491 		__switch_to_xtra(prev_p, next_p, tss);
492 
493 	/*
494 	 * Preload the FPU context, now that we've determined that the
495 	 * task is likely to be using it.
496 	 */
497 	if (preload_fpu)
498 		__math_state_restore();
499 
500 	return prev_p;
501 }
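/*
 * Added commentary (not in the original) summarizing the FS handling above:
 *
 *   - The selector is reloaded whenever the live selector (fsindex), the
 *     incoming task's selector (next->fsindex) or the outgoing task's
 *     64-bit base (prev->fs) is non-zero; forcing the reload in the last
 *     case is what prevents leaking the old base to the next task, per the
 *     comment in the function.
 *   - If the live selector was non-zero, the outgoing task was using
 *     selector-based TLS, so its saved 64-bit base is cleared.
 *   - If the incoming task has a 64-bit base, it is installed afterwards
 *     with wrmsrl(MSR_FS_BASE, next->fs).
 *
 * GS follows the same pattern via load_gs_index() and MSR_KERNEL_GS_BASE.
 */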
502 
503 void set_personality_64bit(void)
504 {
505 	/* inherit personality from parent */
506 
507 	/* Make sure to be in 64bit mode */
508 	clear_thread_flag(TIF_IA32);
509 
510 	/* TBD: overwrites user setup. Should have two bits.
511 	   But 64bit processes have always behaved this way,
512 	   so it's not too bad. The main problem is just that
513 	   32-bit children are affected again. */
514 	current->personality &= ~READ_IMPLIES_EXEC;
515 }
516 
517 void set_personality_ia32(void)
518 {
519 	/* inherit personality from parent */
520 
521 	/* Make sure to be in 32bit mode */
522 	set_thread_flag(TIF_IA32);
523 	current->personality |= force_personality32;
524 
525 	/* Prepare the first "return" to user space */
526 	current_thread_info()->status |= TS_COMPAT;
527 }
528 
529 unsigned long get_wchan(struct task_struct *p)
530 {
531 	unsigned long stack;
532 	u64 fp, ip;
533 	int count = 0;
534 
535 	if (!p || p == current || p->state == TASK_RUNNING)
536 		return 0;
537 	stack = (unsigned long)task_stack_page(p);
538 	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
539 		return 0;
540 	fp = *(u64 *)(p->thread.sp);
541 	do {
542 		if (fp < (unsigned long)stack ||
543 		    fp >= (unsigned long)stack+THREAD_SIZE)
544 			return 0;
545 		ip = *(u64 *)(fp+8);
546 		if (!in_sched_functions(ip))
547 			return ip;
548 		fp = *(u64 *)fp;
549 	} while (count++ < 16);
550 	return 0;
551 }
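/*
 * Added note: get_wchan() above does a bounded (16-step) frame-pointer walk
 * of the sleeping task's kernel stack.  With frame pointers, each frame is
 * laid out as
 *
 *	[fp + 8]  return address
 *	[fp + 0]  saved caller %rbp (the next fp)
 *
 * starting from the frame pointer found at p->thread.sp, and the first
 * return address outside the scheduler (in_sched_functions() returns false)
 * is reported as the task's wait channel.
 */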
552 
553 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
554 {
555 	int ret = 0;
556 	int doit = task == current;
557 	int cpu;
558 
559 	switch (code) {
560 	case ARCH_SET_GS:
561 		if (addr >= TASK_SIZE_OF(task))
562 			return -EPERM;
563 		cpu = get_cpu();
564 		/* handle small bases via the GDT because that's faster to
565 		   switch. */
566 		if (addr <= 0xffffffff) {
567 			set_32bit_tls(task, GS_TLS, addr);
568 			if (doit) {
569 				load_TLS(&task->thread, cpu);
570 				load_gs_index(GS_TLS_SEL);
571 			}
572 			task->thread.gsindex = GS_TLS_SEL;
573 			task->thread.gs = 0;
574 		} else {
575 			task->thread.gsindex = 0;
576 			task->thread.gs = addr;
577 			if (doit) {
578 				load_gs_index(0);
579 				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
580 			}
581 		}
582 		put_cpu();
583 		break;
584 	case ARCH_SET_FS:
585 		/* Not strictly needed for fs, but do it for symmetry
586 		   with gs */
587 		if (addr >= TASK_SIZE_OF(task))
588 			return -EPERM;
589 		cpu = get_cpu();
590 		/* handle small bases via the GDT because that's faster to
591 		   switch. */
592 		if (addr <= 0xffffffff) {
593 			set_32bit_tls(task, FS_TLS, addr);
594 			if (doit) {
595 				load_TLS(&task->thread, cpu);
596 				loadsegment(fs, FS_TLS_SEL);
597 			}
598 			task->thread.fsindex = FS_TLS_SEL;
599 			task->thread.fs = 0;
600 		} else {
601 			task->thread.fsindex = 0;
602 			task->thread.fs = addr;
603 			if (doit) {
604 				/* set the selector to 0 to not confuse
605 				   __switch_to */
606 				loadsegment(fs, 0);
607 				ret = checking_wrmsrl(MSR_FS_BASE, addr);
608 			}
609 		}
610 		put_cpu();
611 		break;
612 	case ARCH_GET_FS: {
613 		unsigned long base;
614 		if (task->thread.fsindex == FS_TLS_SEL)
615 			base = read_32bit_tls(task, FS_TLS);
616 		else if (doit)
617 			rdmsrl(MSR_FS_BASE, base);
618 		else
619 			base = task->thread.fs;
620 		ret = put_user(base, (unsigned long __user *)addr);
621 		break;
622 	}
623 	case ARCH_GET_GS: {
624 		unsigned long base;
625 		unsigned gsindex;
626 		if (task->thread.gsindex == GS_TLS_SEL)
627 			base = read_32bit_tls(task, GS_TLS);
628 		else if (doit) {
629 			savesegment(gs, gsindex);
630 			if (gsindex)
631 				rdmsrl(MSR_KERNEL_GS_BASE, base);
632 			else
633 				base = task->thread.gs;
634 		} else
635 			base = task->thread.gs;
636 		ret = put_user(base, (unsigned long __user *)addr);
637 		break;
638 	}
639 
640 	default:
641 		ret = -EINVAL;
642 		break;
643 	}
644 
645 	return ret;
646 }
647 
648 long sys_arch_prctl(int code, unsigned long addr)
649 {
650 	return do_arch_prctl(current, code, addr);
651 }
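/*
 * Added userspace sketch (illustrative; "tls_block" is a made-up pointer and
 * the raw syscall is shown):
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)tls_block);
 *	syscall(SYS_arch_prctl, ARCH_GET_GS, (unsigned long)&base);
 *
 * Note the asymmetry handled in do_arch_prctl(): for ARCH_SET_* the addr
 * argument is the base value itself, while for ARCH_GET_* it is a user
 * pointer the kernel stores the base through with put_user().
 */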
652 
653 unsigned long KSTK_ESP(struct task_struct *task)
654 {
655 	return (test_tsk_thread_flag(task, TIF_IA32)) ?
656 			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
657 }
658