xref: /openbmc/linux/arch/x86/kernel/process_64.c (revision 78c99ba1)
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/dmi.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>

asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
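
/*
 * Usage sketch (illustrative only, not part of this file): a subsystem
 * that wants to be told when this CPU enters or leaves idle can hook the
 * IDLE_START/IDLE_END events raised by enter_idle()/__exit_idle() below.
 * The names my_idle_notify/my_idle_nb are made up for the example.
 *
 *	static int my_idle_notify(struct notifier_block *nb,
 *				  unsigned long action, void *unused)
 *	{
 *		if (action == IDLE_START)
 *			pr_debug("cpu going idle\n");
 *		else if (action == IDLE_END)
 *			pr_debug("cpu leaving idle\n");
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call	= my_idle_notify,
 *	};
 *
 *	idle_notifier_register(&my_idle_nb);
 */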

void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us.  CPU0 already has it initialized but no harm in
	 * doing it again.  This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;
	const char *board;

	printk("\n");
	print_modules();
	board = dmi_get_system_info(DMI_PRODUCT_NAME);
	if (!board)
		board = "";
	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version, board);
	printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_INFO "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_INFO "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_INFO "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
	printk(KERN_INFO "CPU %d:", smp_processor_id());
	__show_regs(regs, 1);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}

	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
	p->thread.ds_ctx = NULL;

	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
	p->thread.debugctlmsr = 0;

	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
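
/*
 * Illustrative call chain (not spelled out in this file): a user-level
 * fork/clone reaches copy_thread() roughly as
 *
 *	sys_clone() -> do_fork() -> copy_process() -> copy_thread()
 *
 * The sp == ~0UL case above covers kernel threads, which have no user
 * stack yet and therefore start out with childregs->sp pointing at the
 * new task's own kernel stack frame.
 */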

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	loadsegment(fs, 0);
	loadsegment(es, 0);
	loadsegment(ds, 0);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs		= __USER_CS;
	regs->ss		= __USER_DS;
	regs->flags		= 0x200;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);
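
/*
 * Example caller (a sketch of the generic binfmt code, not part of this
 * file): an ELF loader typically finishes exec with something like
 *
 *	start_thread(regs, elf_entry, bprm->p);
 *
 * i.e. the new instruction pointer is the ELF entry point and the new
 * stack pointer is the freshly set up user stack.
 */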

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * The function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);


	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * A segment register != 0 always requires a reload.  Also
	 * reload when it has changed.  When the previous process used a
	 * 64-bit base, always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes,
		 * clear the 64-bit base, since the overloaded base is
		 * always mapped to the null selector.
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used the FPU in the last 5 timeslices, just do a
	 * full restore of the math state immediately to avoid the trap; the
	 * chances of needing the FPU soon are obviously high now.
	 *
	 * The tsk_used_math() check prevents calling math_state_restore(),
	 * which can sleep in the case of !tsk_used_math().
	 */
	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
		math_state_restore();
	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
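
/*
 * Note (informal): on x86-64 the raw clone system call gets the flags and
 * the new stack pointer directly; passing newsp == 0 makes the child reuse
 * the parent's current stack pointer, which is what fork()-style callers
 * rely on.  When CLONE_SETTLS is used, the new TLS base is picked up from
 * the r8 register in copy_thread() above.
 */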

unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
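
/*
 * Frame layout assumed by get_wchan() above (a sketch): for a sleeping
 * task, thread.sp points at a frame whose first word is the saved frame
 * pointer, so the walk is
 *
 *	fp      = *(u64 *)p->thread.sp;		saved rbp
 *	ip      = *(u64 *)(fp + 8);		return address in the caller
 *	next fp = *(u64 *)fp;			caller's saved rbp
 *
 * and it stops at the first return address outside the scheduler, or
 * after 16 frames.
 */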

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
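
/*
 * Userspace usage sketch (illustrative, not part of this file): a 64-bit
 * process can change or read its FS base roughly like this; new_base and
 * cur_base are made-up variable names.
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	syscall(SYS_arch_prctl, ARCH_SET_FS, new_base);
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, (unsigned long)&cur_base);
 *
 * For the GET operations, addr is a pointer that the kernel writes the
 * current base through via put_user(), as in do_arch_prctl() above.
 */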
661