xref: /openbmc/linux/arch/x86/kernel/process_64.c (revision fd589a8f)
1 /*
2  *  Copyright (C) 1995  Linus Torvalds
3  *
4  *  Pentium III FXSR, SSE support
5  *	Gareth Hughes <gareth@valinux.com>, May 2000
6  *
7  *  X86-64 port
8  *	Andi Kleen.
9  *
10  *	CPU hotplug support - ashok.raj@intel.com
11  */
12 
13 /*
14  * This file handles the architecture-dependent parts of process handling.
15  */
16 
17 #include <linux/stackprotector.h>
18 #include <linux/cpu.h>
19 #include <linux/errno.h>
20 #include <linux/sched.h>
21 #include <linux/fs.h>
22 #include <linux/kernel.h>
23 #include <linux/mm.h>
24 #include <linux/elfcore.h>
25 #include <linux/smp.h>
26 #include <linux/slab.h>
27 #include <linux/user.h>
28 #include <linux/interrupt.h>
29 #include <linux/utsname.h>
30 #include <linux/delay.h>
31 #include <linux/module.h>
32 #include <linux/ptrace.h>
33 #include <linux/notifier.h>
34 #include <linux/kprobes.h>
35 #include <linux/kdebug.h>
36 #include <linux/tick.h>
37 #include <linux/prctl.h>
38 #include <linux/uaccess.h>
39 #include <linux/io.h>
40 #include <linux/ftrace.h>
41 #include <linux/dmi.h>
42 
43 #include <asm/pgtable.h>
44 #include <asm/system.h>
45 #include <asm/processor.h>
46 #include <asm/i387.h>
47 #include <asm/mmu_context.h>
48 #include <asm/prctl.h>
49 #include <asm/desc.h>
50 #include <asm/proto.h>
51 #include <asm/ia32.h>
52 #include <asm/idle.h>
53 #include <asm/syscalls.h>
54 #include <asm/ds.h>
55 
56 asmlinkage extern void ret_from_fork(void);
57 
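/*
 * Per-CPU slot holding the user-mode stack pointer of the current task
 * while it runs in the kernel (updated in start_thread() and __switch_to()).
 */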
58 DEFINE_PER_CPU(unsigned long, old_rsp);
59 static DEFINE_PER_CPU(unsigned char, is_idle);
60 
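/* Base clone flags applied when creating kernel threads. */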
61 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
62 
63 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
64 
65 void idle_notifier_register(struct notifier_block *n)
66 {
67 	atomic_notifier_chain_register(&idle_notifier, n);
68 }
69 EXPORT_SYMBOL_GPL(idle_notifier_register);
70 
71 void idle_notifier_unregister(struct notifier_block *n)
72 {
73 	atomic_notifier_chain_unregister(&idle_notifier, n);
74 }
75 EXPORT_SYMBOL_GPL(idle_notifier_unregister);
76 
77 void enter_idle(void)
78 {
79 	percpu_write(is_idle, 1);
80 	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
81 }
82 
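/*
 * Atomically clear the per-CPU is_idle flag; if it was actually set,
 * tell the idle notifier chain that the idle period has ended.
 */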
83 static void __exit_idle(void)
84 {
85 	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
86 		return;
87 	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
88 }
89 
90 /* Called from interrupt context to signal the end of an idle period */
91 void exit_idle(void)
92 {
93 	/* idle loop has pid 0 */
94 	if (current->pid)
95 		return;
96 	__exit_idle();
97 }
98 
99 #ifndef CONFIG_SMP
100 static inline void play_dead(void)
101 {
102 	BUG();
103 }
104 #endif
105 
106 /*
107  * The idle thread. There's no useful work to be
108  * done, so just try to conserve power and have a
109  * low exit latency (ie sit in a loop waiting for
110  * low exit latency (i.e. sit in a loop waiting for
111  * somebody to say that they'd like to reschedule).
112 void cpu_idle(void)
113 {
114 	current_thread_info()->status |= TS_POLLING;
115 
116 	/*
117 	 * If we're the non-boot CPU, nothing set the stack canary up
118 	 * for us.  CPU0 already has it initialized but no harm in
119 	 * doing it again.  This is a good place for updating it, as
120 	 * we won't ever return from this function (so the invalid
121 	 * canaries already on the stack won't ever trigger).
122 	 */
123 	boot_init_stack_canary();
124 
125 	/* endless idle loop with no priority at all */
126 	while (1) {
127 		tick_nohz_stop_sched_tick(1);
128 		while (!need_resched()) {
129 
130 			rmb();
131 
132 			if (cpu_is_offline(smp_processor_id()))
133 				play_dead();
134 			/*
135 			 * Idle routines should keep interrupts disabled
136 			 * from here on, until they go to idle.
137 			 * Otherwise, idle callbacks can misfire.
138 			 */
139 			local_irq_disable();
140 			enter_idle();
141 			/* Don't trace irqs off for idle */
142 			stop_critical_timings();
143 			pm_idle();
144 			start_critical_timings();
145 			/* In many cases the interrupt that ended idle
146 			   has already called exit_idle. But some idle
147 			   loops can be woken up without an interrupt. */
148 			__exit_idle();
149 		}
150 
151 		tick_nohz_restart_sched_tick();
152 		preempt_enable_no_resched();
153 		schedule();
154 		preempt_disable();
155 	}
156 }
157 
158 /* Also prints some state that isn't saved in pt_regs */
159 void __show_regs(struct pt_regs *regs, int all)
160 {
161 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
162 	unsigned long d0, d1, d2, d3, d6, d7;
163 	unsigned int fsindex, gsindex;
164 	unsigned int ds, cs, es;
165 	const char *board;
166 
167 	printk("\n");
168 	print_modules();
169 	board = dmi_get_system_info(DMI_PRODUCT_NAME);
170 	if (!board)
171 		board = "";
172 	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
173 		current->pid, current->comm, print_tainted(),
174 		init_utsname()->release,
175 		(int)strcspn(init_utsname()->version, " "),
176 		init_utsname()->version, board);
177 	printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
178 	printk_address(regs->ip, 1);
179 	printk(KERN_INFO "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
180 			regs->sp, regs->flags);
181 	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
182 	       regs->ax, regs->bx, regs->cx);
183 	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
184 	       regs->dx, regs->si, regs->di);
185 	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
186 	       regs->bp, regs->r8, regs->r9);
187 	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
188 	       regs->r10, regs->r11, regs->r12);
189 	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
190 	       regs->r13, regs->r14, regs->r15);
191 
192 	asm("movl %%ds,%0" : "=r" (ds));
193 	asm("movl %%cs,%0" : "=r" (cs));
194 	asm("movl %%es,%0" : "=r" (es));
195 	asm("movl %%fs,%0" : "=r" (fsindex));
196 	asm("movl %%gs,%0" : "=r" (gsindex));
197 
198 	rdmsrl(MSR_FS_BASE, fs);
199 	rdmsrl(MSR_GS_BASE, gs);
200 	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
201 
202 	if (!all)
203 		return;
204 
205 	cr0 = read_cr0();
206 	cr2 = read_cr2();
207 	cr3 = read_cr3();
208 	cr4 = read_cr4();
209 
210 	printk(KERN_INFO "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
211 	       fs, fsindex, gs, gsindex, shadowgs);
212 	printk(KERN_INFO "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
213 			es, cr0);
214 	printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
215 			cr4);
216 
217 	get_debugreg(d0, 0);
218 	get_debugreg(d1, 1);
219 	get_debugreg(d2, 2);
220 	printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
221 	get_debugreg(d3, 3);
222 	get_debugreg(d6, 6);
223 	get_debugreg(d7, 7);
224 	printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
225 }
226 
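/*
 * Dump the full register state (including control and debug registers)
 * plus a stack trace for the current CPU.
 */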
227 void show_regs(struct pt_regs *regs)
228 {
229 	printk(KERN_INFO "CPU %d:", smp_processor_id());
230 	__show_regs(regs, 1);
231 	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
232 }
233 
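/*
 * Sanity check when a dead task is released: a 64-bit task should not
 * still own an LDT at this point.
 */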
234 void release_thread(struct task_struct *dead_task)
235 {
236 	if (dead_task->mm) {
237 		if (dead_task->mm->context.size) {
238 			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
239 					dead_task->comm,
240 					dead_task->mm->context.ldt,
241 					dead_task->mm->context.size);
242 			BUG();
243 		}
244 	}
245 }
246 
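/*
 * Install a 32-bit user TLS descriptor with the given base address into
 * the task's TLS array.
 */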
247 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
248 {
249 	struct user_desc ud = {
250 		.base_addr = addr,
251 		.limit = 0xfffff,
252 		.seg_32bit = 1,
253 		.limit_in_pages = 1,
254 		.useable = 1,
255 	};
256 	struct desc_struct *desc = t->thread.tls_array;
257 	desc += tls;
258 	fill_ldt(desc, &ud);
259 }
260 
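/* Return the base address recorded in one of the task's TLS descriptors. */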
261 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
262 {
263 	return get_desc_base(&t->thread.tls_array[tls]);
264 }
265 
266 /*
267  * This gets called before we allocate a new thread and copy
268  * the current task into it.
269  */
270 void prepare_to_copy(struct task_struct *tsk)
271 {
272 	unlazy_fpu(tsk);
273 }
274 
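/*
 * Set up the kernel stack and pt_regs of a newly forked task (the child
 * returns 0), copy the parent's segment selectors, duplicate the I/O
 * permission bitmap when one is in use, and apply CLONE_SETTLS if requested.
 */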
275 int copy_thread(unsigned long clone_flags, unsigned long sp,
276 		unsigned long unused,
277 	struct task_struct *p, struct pt_regs *regs)
278 {
279 	int err;
280 	struct pt_regs *childregs;
281 	struct task_struct *me = current;
282 
283 	childregs = ((struct pt_regs *)
284 			(THREAD_SIZE + task_stack_page(p))) - 1;
285 	*childregs = *regs;
286 
287 	childregs->ax = 0;
288 	childregs->sp = sp;
289 	if (sp == ~0UL)
290 		childregs->sp = (unsigned long)childregs;
291 
292 	p->thread.sp = (unsigned long) childregs;
293 	p->thread.sp0 = (unsigned long) (childregs+1);
294 	p->thread.usersp = me->thread.usersp;
295 
296 	set_tsk_thread_flag(p, TIF_FORK);
297 
298 	p->thread.fs = me->thread.fs;
299 	p->thread.gs = me->thread.gs;
300 
301 	savesegment(gs, p->thread.gsindex);
302 	savesegment(fs, p->thread.fsindex);
303 	savesegment(es, p->thread.es);
304 	savesegment(ds, p->thread.ds);
305 
306 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
307 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
308 		if (!p->thread.io_bitmap_ptr) {
309 			p->thread.io_bitmap_max = 0;
310 			return -ENOMEM;
311 		}
312 		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
313 				IO_BITMAP_BYTES);
314 		set_tsk_thread_flag(p, TIF_IO_BITMAP);
315 	}
316 
317 	/*
318 	 * Set a new TLS for the child thread?
319 	 */
320 	if (clone_flags & CLONE_SETTLS) {
321 #ifdef CONFIG_IA32_EMULATION
322 		if (test_thread_flag(TIF_IA32))
323 			err = do_set_thread_area(p, -1,
324 				(struct user_desc __user *)childregs->si, 0);
325 		else
326 #endif
327 			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
328 		if (err)
329 			goto out;
330 	}
331 
332 	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
333 	p->thread.ds_ctx = NULL;
334 
335 	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
336 	p->thread.debugctlmsr = 0;
337 
338 	err = 0;
339 out:
340 	if (err && p->thread.io_bitmap_ptr) {
341 		kfree(p->thread.io_bitmap_ptr);
342 		p->thread.io_bitmap_max = 0;
343 	}
344 	return err;
345 }
346 
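/*
 * Reset the register and segment state for a freshly exec'ed 64-bit
 * binary: clear the data segment registers, point ip/sp at the new
 * image, enable interrupts in EFLAGS and free the old extended FP state.
 */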
347 void
348 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
349 {
350 	loadsegment(fs, 0);
351 	loadsegment(es, 0);
352 	loadsegment(ds, 0);
353 	load_gs_index(0);
354 	regs->ip		= new_ip;
355 	regs->sp		= new_sp;
356 	percpu_write(old_rsp, new_sp);
357 	regs->cs		= __USER_CS;
358 	regs->ss		= __USER_DS;
359 	regs->flags		= 0x200;
360 	set_fs(USER_DS);
361 	/*
362 	 * Free the old FP and other extended state
363 	 */
364 	free_thread_xstate(current);
365 }
366 EXPORT_SYMBOL_GPL(start_thread);
367 
368 /*
369  *	switch_to(x,y) should switch tasks from x to y.
370  *
371  * This could still be optimized:
372  * - fold all the options into a flag word and test it with a single test.
373  * - could test fs/gs bitsliced
374  *
375  * Kprobes not supported here. Set the probe on schedule instead.
376  * The function graph tracer is not supported here either.
377  */
378 __notrace_funcgraph struct task_struct *
379 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
380 {
381 	struct thread_struct *prev = &prev_p->thread;
382 	struct thread_struct *next = &next_p->thread;
383 	int cpu = smp_processor_id();
384 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
385 	unsigned fsindex, gsindex;
386 	bool preload_fpu;
387 
388 	/*
389 	 * If the task has used the FPU in the last 5 timeslices, just do a
390 	 * full restore of the math state immediately to avoid the trap; the
391 	 * chances of needing the FPU again soon are obviously high now.
392 	 */
393 	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
394 
395 	/* we're going to use this soon, after a few expensive things */
396 	if (preload_fpu)
397 		prefetch(next->xstate);
398 
399 	/*
400 	 * Reload esp0, LDT and the page table pointer:
401 	 */
402 	load_sp0(tss, next);
403 
404 	/*
405 	 * Switch DS and ES.
406 	 * This won't pick up thread selector changes, but I guess that is ok.
407 	 */
408 	savesegment(es, prev->es);
409 	if (unlikely(next->es | prev->es))
410 		loadsegment(es, next->es);
411 
412 	savesegment(ds, prev->ds);
413 	if (unlikely(next->ds | prev->ds))
414 		loadsegment(ds, next->ds);
415 
416 
417 	/* We must save %fs and %gs before load_TLS() because
418 	 * %fs and %gs may be cleared by load_TLS().
419 	 *
420 	 * (e.g. xen_load_tls())
421 	 */
422 	savesegment(fs, fsindex);
423 	savesegment(gs, gsindex);
424 
425 	load_TLS(next, cpu);
426 
427 	/* Must be after DS reload */
428 	unlazy_fpu(prev_p);
429 
430 	/* Make sure the CPU is ready for the new FPU context */
431 	if (preload_fpu)
432 		clts();
433 
434 	/*
435 	 * Leave lazy mode, flushing any hypercalls made here.
436 	 * This must be done before restoring TLS segments so
437 	 * the GDT and LDT are properly updated, and must be
438 	 * done before math_state_restore, so the TS bit is up
439 	 * to date.
440 	 */
441 	arch_end_context_switch(next_p);
442 
443 	/*
444 	 * Switch FS and GS.
445 	 *
446 	 * A segment register != 0 always requires a reload.  Also
447 	 * reload when it has changed.  When the previous process used a
448 	 * 64-bit base, always reload to avoid an information leak.
449 	 */
450 	if (unlikely(fsindex | next->fsindex | prev->fs)) {
451 		loadsegment(fs, next->fsindex);
452 		/*
453 		 * Check if the user used a selector != 0; if yes,
454 		 * clear the 64-bit base, since the overloaded base is
455 		 * always mapped to the null selector.
456 		 */
457 		if (fsindex)
458 			prev->fs = 0;
459 	}
460 	/* When the next process has a 64-bit base, use it */
461 	if (next->fs)
462 		wrmsrl(MSR_FS_BASE, next->fs);
463 	prev->fsindex = fsindex;
464 
465 	if (unlikely(gsindex | next->gsindex | prev->gs)) {
466 		load_gs_index(next->gsindex);
467 		if (gsindex)
468 			prev->gs = 0;
469 	}
470 	if (next->gs)
471 		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
472 	prev->gsindex = gsindex;
473 
474 	/*
475 	 * Switch the PDA and FPU contexts.
476 	 */
477 	prev->usersp = percpu_read(old_rsp);
478 	percpu_write(old_rsp, next->usersp);
479 	percpu_write(current_task, next_p);
480 
481 	percpu_write(kernel_stack,
482 		  (unsigned long)task_stack_page(next_p) +
483 		  THREAD_SIZE - KERNEL_STACK_OFFSET);
484 
485 	/*
486 	 * Now maybe reload the debug registers and handle I/O bitmaps
487 	 */
488 	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
489 		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
490 		__switch_to_xtra(prev_p, next_p, tss);
491 
492 	/*
493 	 * Preload the FPU context, now that we've determined that the
494 	 * task is likely to be using it.
495 	 */
496 	if (preload_fpu)
497 		__math_state_restore();
498 	return prev_p;
499 }
500 
501 /*
502  * sys_execve() executes a new program.
503  */
504 asmlinkage
505 long sys_execve(char __user *name, char __user * __user *argv,
506 		char __user * __user *envp, struct pt_regs *regs)
507 {
508 	long error;
509 	char *filename;
510 
511 	filename = getname(name);
512 	error = PTR_ERR(filename);
513 	if (IS_ERR(filename))
514 		return error;
515 	error = do_execve(filename, argv, envp, regs);
516 	putname(filename);
517 	return error;
518 }
519 
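/* Switch the exec'ing task to native 64-bit mode. */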
520 void set_personality_64bit(void)
521 {
522 	/* inherit personality from parent */
523 
524 	/* Make sure to be in 64bit mode */
525 	clear_thread_flag(TIF_IA32);
526 
527 	/* TBD: overwrites user setup. Should have two bits.
528 	   But 64bit processes have always behaved this way,
529 	   so it's not too bad. The main problem is just that
530 	   32-bit children are affected again. */
531 	current->personality &= ~READ_IMPLIES_EXEC;
532 }
533 
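/*
 * clone() syscall entry: if no new stack pointer is supplied, the child
 * keeps using the parent's current stack pointer.
 */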
534 asmlinkage long
535 sys_clone(unsigned long clone_flags, unsigned long newsp,
536 	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
537 {
538 	if (!newsp)
539 		newsp = regs->sp;
540 	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
541 }
542 
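/*
 * Follow the sleeping task's saved frame pointers up its kernel stack and
 * return the first return address outside the scheduler, or 0 on failure.
 */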
543 unsigned long get_wchan(struct task_struct *p)
544 {
545 	unsigned long stack;
546 	u64 fp, ip;
547 	int count = 0;
548 
549 	if (!p || p == current || p->state == TASK_RUNNING)
550 		return 0;
551 	stack = (unsigned long)task_stack_page(p);
552 	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
553 		return 0;
554 	fp = *(u64 *)(p->thread.sp);
555 	do {
556 		if (fp < (unsigned long)stack ||
557 		    fp >= (unsigned long)stack+THREAD_SIZE)
558 			return 0;
559 		ip = *(u64 *)(fp+8);
560 		if (!in_sched_functions(ip))
561 			return ip;
562 		fp = *(u64 *)fp;
563 	} while (count++ < 16);
564 	return 0;
565 }
566 
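/*
 * Set or query a task's FS/GS base.  Bases that fit in 32 bits are
 * installed via a GDT TLS slot; larger bases go through the FS_BASE and
 * KERNEL_GS_BASE MSRs.
 */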
567 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
568 {
569 	int ret = 0;
570 	int doit = task == current;
571 	int cpu;
572 
573 	switch (code) {
574 	case ARCH_SET_GS:
575 		if (addr >= TASK_SIZE_OF(task))
576 			return -EPERM;
577 		cpu = get_cpu();
578 		/* handle small bases via the GDT because that's faster to
579 		   switch. */
580 		if (addr <= 0xffffffff) {
581 			set_32bit_tls(task, GS_TLS, addr);
582 			if (doit) {
583 				load_TLS(&task->thread, cpu);
584 				load_gs_index(GS_TLS_SEL);
585 			}
586 			task->thread.gsindex = GS_TLS_SEL;
587 			task->thread.gs = 0;
588 		} else {
589 			task->thread.gsindex = 0;
590 			task->thread.gs = addr;
591 			if (doit) {
592 				load_gs_index(0);
593 				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
594 			}
595 		}
596 		put_cpu();
597 		break;
598 	case ARCH_SET_FS:
599 		/* Not strictly needed for fs, but do it for symmetry
600 		   with gs */
601 		if (addr >= TASK_SIZE_OF(task))
602 			return -EPERM;
603 		cpu = get_cpu();
604 		/* handle small bases via the GDT because that's faster to
605 		   switch. */
606 		if (addr <= 0xffffffff) {
607 			set_32bit_tls(task, FS_TLS, addr);
608 			if (doit) {
609 				load_TLS(&task->thread, cpu);
610 				loadsegment(fs, FS_TLS_SEL);
611 			}
612 			task->thread.fsindex = FS_TLS_SEL;
613 			task->thread.fs = 0;
614 		} else {
615 			task->thread.fsindex = 0;
616 			task->thread.fs = addr;
617 			if (doit) {
618 				/* set the selector to 0 so as not to
619 				   confuse __switch_to */
620 				loadsegment(fs, 0);
621 				ret = checking_wrmsrl(MSR_FS_BASE, addr);
622 			}
623 		}
624 		put_cpu();
625 		break;
626 	case ARCH_GET_FS: {
627 		unsigned long base;
628 		if (task->thread.fsindex == FS_TLS_SEL)
629 			base = read_32bit_tls(task, FS_TLS);
630 		else if (doit)
631 			rdmsrl(MSR_FS_BASE, base);
632 		else
633 			base = task->thread.fs;
634 		ret = put_user(base, (unsigned long __user *)addr);
635 		break;
636 	}
637 	case ARCH_GET_GS: {
638 		unsigned long base;
639 		unsigned gsindex;
640 		if (task->thread.gsindex == GS_TLS_SEL)
641 			base = read_32bit_tls(task, GS_TLS);
642 		else if (doit) {
643 			savesegment(gs, gsindex);
644 			if (gsindex)
645 				rdmsrl(MSR_KERNEL_GS_BASE, base);
646 			else
647 				base = task->thread.gs;
648 		} else
649 			base = task->thread.gs;
650 		ret = put_user(base, (unsigned long __user *)addr);
651 		break;
652 	}
653 
654 	default:
655 		ret = -EINVAL;
656 		break;
657 	}
658 
659 	return ret;
660 }
661 
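/* arch_prctl() syscall entry point, operating on the current task. */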
662 long sys_arch_prctl(int code, unsigned long addr)
663 {
664 	return do_arch_prctl(current, code, addr);
665 }
666 
667