/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <stdarg.h>

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/dmi.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>

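/*
 * Assembly entry point that newly forked tasks first return through
 * before reaching user space.
 */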
asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);

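/* Per-CPU copy of the user-mode stack pointer, saved across kernel entry. */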
DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);

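/* Called from the idle loop just before going idle; notifies the idle chain. */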
void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us.  CPU0 already has it initialized but no harm in
	 * doing it again.  This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;
	const char *board;

	printk("\n");
	print_modules();
	board = dmi_get_system_info(DMI_PRODUCT_NAME);
	if (!board)
		board = "";
	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version, board);
	printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_INFO "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_INFO "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_INFO "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
	printk(KERN_INFO "CPU %d:", smp_processor_id());
	__show_regs(regs, 1);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

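/*
 * Install a 32-bit TLS descriptor (4GB limit, page granular) with the
 * given base address into one of the task's TLS slots.
 */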
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

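/*
 * Set up the kernel stack and the saved register copy for a newly
 * created task; ax is cleared so fork() returns 0 in the child.
 */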
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}

	ds_copy_thread(p, me);

	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
	p->thread.debugctlmsr = 0;

	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	loadsegment(fs, 0);
	loadsegment(es, 0);
	loadsegment(ds, 0);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs		= __USER_CS;
	regs->ss		= __USER_DS;
	regs->flags		= 0x200;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * The function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);


	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_leave_lazy_cpu_mode();

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload.  Also
	 * reload when it has changed.  When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 *  clear 64bit base, since overloaded base is always
		 *  mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 *
	 * tsk_used_math() checks prevent calling math_state_restore(),
	 * which can sleep in the case of !tsk_used_math()
	 */
	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
		math_state_restore();
	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

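/*
 * Return the address where a sleeping task is blocked: walk the saved
 * frame pointers on its kernel stack until a return address outside
 * the scheduler is found, giving up after 16 frames.
 */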
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

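/*
 * Set or read the FS/GS base of a task.  Small (32-bit) bases are
 * handled through GDT TLS slots; larger bases go through the
 * FS_BASE/KERNEL_GS_BASE MSRs.
 */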
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

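/*
 * Apply up to 8kB of random offset below the given stack pointer
 * (unless randomization is disabled) and align it to 16 bytes.
 */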
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

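/* Place the heap start at a random offset within 32MB above mm->brk,
   falling back to mm->brk if randomization fails. */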
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}