xref: /openbmc/linux/arch/x86/kernel/process_64.c (revision 96de0e252cedffad61b3cb5e05662c591898e69a)
1 /*
2  *  Copyright (C) 1995  Linus Torvalds
3  *
4  *  Pentium III FXSR, SSE support
5  *	Gareth Hughes <gareth@valinux.com>, May 2000
6  *
7  *  X86-64 port
8  *	Andi Kleen.
9  *
10  *	CPU hotplug support - ashok.raj@intel.com
11  */
12 
13 /*
14  * This file handles the architecture-dependent parts of process handling.
15  */
16 
17 #include <stdarg.h>
18 
19 #include <linux/cpu.h>
20 #include <linux/errno.h>
21 #include <linux/sched.h>
22 #include <linux/kernel.h>
23 #include <linux/mm.h>
24 #include <linux/fs.h>
25 #include <linux/elfcore.h>
26 #include <linux/smp.h>
27 #include <linux/slab.h>
28 #include <linux/user.h>
29 #include <linux/module.h>
30 #include <linux/a.out.h>
31 #include <linux/interrupt.h>
32 #include <linux/delay.h>
33 #include <linux/ptrace.h>
34 #include <linux/utsname.h>
35 #include <linux/random.h>
36 #include <linux/notifier.h>
37 #include <linux/kprobes.h>
38 #include <linux/kdebug.h>
39 #include <linux/tick.h>
40 
41 #include <asm/uaccess.h>
42 #include <asm/pgtable.h>
43 #include <asm/system.h>
44 #include <asm/io.h>
45 #include <asm/processor.h>
46 #include <asm/i387.h>
47 #include <asm/mmu_context.h>
48 #include <asm/pda.h>
49 #include <asm/prctl.h>
50 #include <asm/desc.h>
51 #include <asm/proto.h>
52 #include <asm/ia32.h>
53 #include <asm/idle.h>
54 
55 asmlinkage extern void ret_from_fork(void);
56 
57 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
58 
59 unsigned long boot_option_idle_override = 0;
60 EXPORT_SYMBOL(boot_option_idle_override);
61 
62 /*
63  * Power management idle function, if any.
64  */
65 void (*pm_idle)(void);
66 EXPORT_SYMBOL(pm_idle);
67 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
68 
69 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
70 
71 void idle_notifier_register(struct notifier_block *n)
72 {
73 	atomic_notifier_chain_register(&idle_notifier, n);
74 }
75 EXPORT_SYMBOL_GPL(idle_notifier_register);
76 
77 void idle_notifier_unregister(struct notifier_block *n)
78 {
79 	atomic_notifier_chain_unregister(&idle_notifier, n);
80 }
81 EXPORT_SYMBOL_GPL(idle_notifier_unregister);
82 
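/*
 * enter_idle()/exit_idle() bracket the time a CPU spends in its idle
 * routine: enter_idle() marks the CPU idle in the PDA and fires the
 * IDLE_START notifier chain; exit_idle() (called from interrupt context)
 * and __exit_idle() clear the flag and fire IDLE_END exactly once.
 */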
83 void enter_idle(void)
84 {
85 	write_pda(isidle, 1);
86 	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
87 }
88 
89 static void __exit_idle(void)
90 {
91 	if (test_and_clear_bit_pda(0, isidle) == 0)
92 		return;
93 	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
94 }
95 
96 /* Called from interrupts to signify idle end */
97 void exit_idle(void)
98 {
99 	/* idle loop has pid 0 */
100 	if (current->pid)
101 		return;
102 	__exit_idle();
103 }
104 
105 /*
106  * We use this if we don't have any better
107  * idle routine.
108  */
109 static void default_idle(void)
110 {
111 	current_thread_info()->status &= ~TS_POLLING;
112 	/*
113 	 * TS_POLLING-cleared state must be visible before we
114 	 * test NEED_RESCHED:
115 	 */
116 	smp_mb();
117 	local_irq_disable();
118 	if (!need_resched()) {
119 		/* Enables interrupts one instruction before HLT.
120 		   x86 special cases this so there is no race. */
121 		safe_halt();
122 	} else
123 		local_irq_enable();
124 	current_thread_info()->status |= TS_POLLING;
125 }
126 
127 /*
128  * On SMP it's slightly faster (but much more power-consuming!)
129  * to poll the ->need_resched flag instead of waiting for the
130  * cross-CPU IPI to arrive. Use this option with caution.
131  */
132 static void poll_idle(void)
133 {
134 	local_irq_enable();
135 	cpu_relax();
136 }
137 
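/*
 * cpu_idle_wait() is meant for callers that change pm_idle at run time.
 * It flags every online CPU via cpu_idle_state and then sleeps until the
 * idle loop on each CPU has cleared its flag, so no CPU is still running
 * a stale idle routine when we return.
 */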
138 void cpu_idle_wait(void)
139 {
140 	unsigned int cpu, this_cpu = get_cpu();
141 	cpumask_t map, tmp = current->cpus_allowed;
142 
143 	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
144 	put_cpu();
145 
146 	cpus_clear(map);
147 	for_each_online_cpu(cpu) {
148 		per_cpu(cpu_idle_state, cpu) = 1;
149 		cpu_set(cpu, map);
150 	}
151 
152 	__get_cpu_var(cpu_idle_state) = 0;
153 
154 	wmb();
155 	do {
156 		ssleep(1);
157 		for_each_online_cpu(cpu) {
158 			if (cpu_isset(cpu, map) &&
159 					!per_cpu(cpu_idle_state, cpu))
160 				cpu_clear(cpu, map);
161 		}
162 		cpus_and(map, map, cpu_online_map);
163 	} while (!cpus_empty(map));
164 
165 	set_cpus_allowed(current, tmp);
166 }
167 EXPORT_SYMBOL_GPL(cpu_idle_wait);
168 
169 #ifdef CONFIG_HOTPLUG_CPU
170 DECLARE_PER_CPU(int, cpu_state);
171 
172 #include <asm/nmi.h>
173 /* With physical CPU hotplug, we simply halt the CPU. */
174 static inline void play_dead(void)
175 {
176 	idle_task_exit();
177 	wbinvd();
178 	mb();
179 	/* Ack it */
180 	__get_cpu_var(cpu_state) = CPU_DEAD;
181 
182 	local_irq_disable();
183 	while (1)
184 		halt();
185 }
186 #else
187 static inline void play_dead(void)
188 {
189 	BUG();
190 }
191 #endif /* CONFIG_HOTPLUG_CPU */
192 
193 /*
194  * The idle thread. There's no useful work to be
195  * done, so just try to conserve power and have a
196  * low exit latency (i.e. sit in a loop waiting for
197  * somebody to say that they'd like to reschedule)
198  */
199 void cpu_idle(void)
200 {
201 	current_thread_info()->status |= TS_POLLING;
202 	/* endless idle loop with no priority at all */
203 	while (1) {
204 		while (!need_resched()) {
205 			void (*idle)(void);
206 
207 			if (__get_cpu_var(cpu_idle_state))
208 				__get_cpu_var(cpu_idle_state) = 0;
209 
210 			tick_nohz_stop_sched_tick();
211 
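			/*
			 * pm_idle can change at run time (see
			 * cpu_idle_wait()); re-read it after the
			 * barrier on every iteration.
			 */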
212 			rmb();
213 			idle = pm_idle;
214 			if (!idle)
215 				idle = default_idle;
216 			if (cpu_is_offline(smp_processor_id()))
217 				play_dead();
218 			/*
219 			 * Idle routines should keep interrupts disabled
220 			 * from here on, until they go to idle.
221 			 * Otherwise, idle callbacks can misfire.
222 			 */
223 			local_irq_disable();
224 			enter_idle();
225 			idle();
226 			/* In many cases the interrupt that ended idle
227 			   has already called exit_idle. But some idle
228 			   loops can be woken up without an interrupt. */
229 			__exit_idle();
230 		}
231 
232 		tick_nohz_restart_sched_tick();
233 		preempt_enable_no_resched();
234 		schedule();
235 		preempt_disable();
236 	}
237 }
238 
239 /*
240  * This uses the new MONITOR/MWAIT instructions available on P4 processors
241  * with PNI, which can obviate the IPI normally needed to trigger a check of
242  * need_resched.  We execute MONITOR against need_resched and enter an
243  * optimized wait state through MWAIT.  Whenever someone changes
244  * need_resched, we are woken up from MWAIT (without an IPI).
245  *
246  * Beginning with Core Duo processors, MWAIT can take some hints based on CPU
247  * capability.
248  */
249 void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
250 {
251 	if (!need_resched()) {
252 		__monitor((void *)&current_thread_info()->flags, 0, 0);
253 		smp_mb();
254 		if (!need_resched())
255 			__mwait(eax, ecx);
256 	}
257 }
258 
259 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
260 static void mwait_idle(void)
261 {
262 	if (!need_resched()) {
263 		__monitor((void *)&current_thread_info()->flags, 0, 0);
264 		smp_mb();
265 		if (!need_resched())
266 			__sti_mwait(0, 0);
267 		else
268 			local_irq_enable();
269 	} else {
270 		local_irq_enable();
271 	}
272 }
273 
274 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
275 {
276 	static int printed;
277 	if (cpu_has(c, X86_FEATURE_MWAIT)) {
278 		/*
279 		 * Skip if setup has overridden the idle routine.
280 		 * If one CPU supports mwait, all CPUs support mwait.
281 		 */
282 		if (!pm_idle) {
283 			if (!printed) {
284 				printk(KERN_INFO "using mwait in idle threads.\n");
285 				printed = 1;
286 			}
287 			pm_idle = mwait_idle;
288 		}
289 	}
290 }
291 
292 static int __init idle_setup(char *str)
293 {
294 	if (!strcmp(str, "poll")) {
295 		printk(KERN_INFO "using polling idle threads.\n");
296 		pm_idle = poll_idle;
297 	} else if (!strcmp(str, "mwait"))
298 		force_mwait = 1;
299 	else
300 		return -1;
301 
302 	boot_option_idle_override = 1;
303 	return 0;
304 }
305 early_param("idle", idle_setup);
306 
307 /* Also prints some state that isn't saved in pt_regs. */
308 void __show_regs(struct pt_regs *regs)
309 {
310 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
311 	unsigned long d0, d1, d2, d3, d6, d7;
312 	unsigned int fsindex, gsindex;
313 	unsigned int ds, cs, es;
314 
315 	printk("\n");
316 	print_modules();
317 	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
318 		current->pid, current->comm, print_tainted(),
319 		init_utsname()->release,
320 		(int)strcspn(init_utsname()->version, " "),
321 		init_utsname()->version);
322 	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
323 	printk_address(regs->rip);
324 	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp,
325 		regs->eflags);
326 	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
327 	       regs->rax, regs->rbx, regs->rcx);
328 	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
329 	       regs->rdx, regs->rsi, regs->rdi);
330 	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
331 	       regs->rbp, regs->r8, regs->r9);
332 	printk("R10: %016lx R11: %016lx R12: %016lx\n",
333 	       regs->r10, regs->r11, regs->r12);
334 	printk("R13: %016lx R14: %016lx R15: %016lx\n",
335 	       regs->r13, regs->r14, regs->r15);
336 
337 	asm("movl %%ds,%0" : "=r" (ds));
338 	asm("movl %%cs,%0" : "=r" (cs));
339 	asm("movl %%es,%0" : "=r" (es));
340 	asm("movl %%fs,%0" : "=r" (fsindex));
341 	asm("movl %%gs,%0" : "=r" (gsindex));
342 
343 	rdmsrl(MSR_FS_BASE, fs);
344 	rdmsrl(MSR_GS_BASE, gs);
345 	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
346 
347 	cr0 = read_cr0();
348 	cr2 = read_cr2();
349 	cr3 = read_cr3();
350 	cr4 = read_cr4();
351 
352 	printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
353 	       fs, fsindex, gs, gsindex, shadowgs);
354 	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
355 	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
356 
357 	get_debugreg(d0, 0);
358 	get_debugreg(d1, 1);
359 	get_debugreg(d2, 2);
360 	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
361 	get_debugreg(d3, 3);
362 	get_debugreg(d6, 6);
363 	get_debugreg(d7, 7);
364 	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
365 }
366 
367 void show_regs(struct pt_regs *regs)
368 {
369 	printk("CPU %d:", smp_processor_id());
370 	__show_regs(regs);
371 	show_trace(NULL, regs, (void *)(regs + 1));
372 }
373 
374 /*
375  * Free current thread data structures etc.
376  */
377 void exit_thread(void)
378 {
379 	struct task_struct *me = current;
380 	struct thread_struct *t = &me->thread;
381 
382 	if (me->thread.io_bitmap_ptr) {
383 		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
384 
385 		kfree(t->io_bitmap_ptr);
386 		t->io_bitmap_ptr = NULL;
387 		clear_thread_flag(TIF_IO_BITMAP);
388 		/*
389 		 * Careful, clear this in the TSS too:
390 		 */
391 		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
392 		t->io_bitmap_max = 0;
393 		put_cpu();
394 	}
395 }
396 
397 void flush_thread(void)
398 {
399 	struct task_struct *tsk = current;
400 
401 	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
402 		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
403 		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
404 			clear_tsk_thread_flag(tsk, TIF_IA32);
405 		} else {
406 			set_tsk_thread_flag(tsk, TIF_IA32);
407 			current_thread_info()->status |= TS_COMPAT;
408 		}
409 	}
410 	clear_tsk_thread_flag(tsk, TIF_DEBUG);
411 
412 	tsk->thread.debugreg0 = 0;
413 	tsk->thread.debugreg1 = 0;
414 	tsk->thread.debugreg2 = 0;
415 	tsk->thread.debugreg3 = 0;
416 	tsk->thread.debugreg6 = 0;
417 	tsk->thread.debugreg7 = 0;
418 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
419 	/*
420 	 * Forget the coprocessor state.
421 	 */
422 	clear_fpu(tsk);
423 	clear_used_math();
424 }
425 
426 void release_thread(struct task_struct *dead_task)
427 {
428 	if (dead_task->mm) {
429 		if (dead_task->mm->context.size) {
430 			printk(KERN_WARNING "WARNING: dead process %8s still has LDT? <%p/%d>\n",
431 					dead_task->comm,
432 					dead_task->mm->context.ldt,
433 					dead_task->mm->context.size);
434 			BUG();
435 		}
436 	}
437 }
438 
439 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
440 {
441 	struct user_desc ud = {
442 		.base_addr = addr,
443 		.limit = 0xfffff,
444 		.seg_32bit = 1,
445 		.limit_in_pages = 1,
446 		.useable = 1,
447 	};
448 	struct n_desc_struct *desc = (void *)t->thread.tls_array;
449 	desc += tls;
450 	desc->a = LDT_entry_a(&ud);
451 	desc->b = LDT_entry_b(&ud);
452 }
453 
454 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
455 {
456 	struct desc_struct *desc = (void *)t->thread.tls_array;
457 	desc += tls;
458 	return desc->base0 |
459 		(((u32)desc->base1) << 16) |
460 		(((u32)desc->base2) << 24);
461 }
462 
463 /*
464  * This gets called before we allocate a new thread and copy
465  * the current task into it.
466  */
467 void prepare_to_copy(struct task_struct *tsk)
468 {
469 	unlazy_fpu(tsk);
470 }
471 
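/*
 * Set up the child's kernel stack and thread state for fork/clone: the
 * child gets a copy of the parent's pt_regs with rax forced to 0 (so it
 * sees a zero return value), kernel threads (rsp == ~0UL) point their
 * stack at the child's own pt_regs, and the parent's segment state and,
 * if present, its I/O permission bitmap are copied as well.
 */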
472 int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
473 		unsigned long unused,
474 		struct task_struct *p, struct pt_regs *regs)
475 {
476 	int err;
477 	struct pt_regs * childregs;
478 	struct task_struct *me = current;
479 
480 	childregs = ((struct pt_regs *)
481 			(THREAD_SIZE + task_stack_page(p))) - 1;
482 	*childregs = *regs;
483 
484 	childregs->rax = 0;
485 	childregs->rsp = rsp;
486 	if (rsp == ~0UL)
487 		childregs->rsp = (unsigned long)childregs;
488 
489 	p->thread.rsp = (unsigned long) childregs;
490 	p->thread.rsp0 = (unsigned long) (childregs+1);
491 	p->thread.userrsp = me->thread.userrsp;
492 
493 	set_tsk_thread_flag(p, TIF_FORK);
494 
495 	p->thread.fs = me->thread.fs;
496 	p->thread.gs = me->thread.gs;
497 
498 	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
499 	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
500 	asm("mov %%es,%0" : "=m" (p->thread.es));
501 	asm("mov %%ds,%0" : "=m" (p->thread.ds));
502 
503 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
504 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
505 		if (!p->thread.io_bitmap_ptr) {
506 			p->thread.io_bitmap_max = 0;
507 			return -ENOMEM;
508 		}
509 		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
510 				IO_BITMAP_BYTES);
511 		set_tsk_thread_flag(p, TIF_IO_BITMAP);
512 	}
513 
514 	/*
515 	 * Set a new TLS for the child thread?
516 	 */
517 	if (clone_flags & CLONE_SETTLS) {
518 #ifdef CONFIG_IA32_EMULATION
519 		if (test_thread_flag(TIF_IA32))
520 			err = ia32_child_tls(p, childregs);
521 		else
522 #endif
523 			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
524 		if (err)
525 			goto out;
526 	}
527 	err = 0;
528 out:
529 	if (err && p->thread.io_bitmap_ptr) {
530 		kfree(p->thread.io_bitmap_ptr);
531 		p->thread.io_bitmap_max = 0;
532 	}
533 	return err;
534 }
535 
536 /*
537  * This special macro can be used to load a debugging register
538  */
539 #define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
540 
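/*
 * Slow-path context-switch work: reload the hardware debug registers
 * when the incoming task uses them, and install (or invalidate) the
 * incoming task's I/O permission bitmap in the per-cpu TSS.
 */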
541 static inline void __switch_to_xtra(struct task_struct *prev_p,
542 				    struct task_struct *next_p,
543 				    struct tss_struct *tss)
544 {
545 	struct thread_struct *prev, *next;
546 
547 	prev = &prev_p->thread;
548 	next = &next_p->thread;
549 
550 	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
551 		loaddebug(next, 0);
552 		loaddebug(next, 1);
553 		loaddebug(next, 2);
554 		loaddebug(next, 3);
555 		/* no 4 and 5 */
556 		loaddebug(next, 6);
557 		loaddebug(next, 7);
558 	}
559 
560 	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
561 		/*
562 		 * Copy the relevant range of the IO bitmap.
563 		 * Normally this is 128 bytes or less:
564 		 */
565 		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
566 		       max(prev->io_bitmap_max, next->io_bitmap_max));
567 	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
568 		/*
569 		 * Clear any possible leftover bits:
570 		 */
571 		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
572 	}
573 }
574 
575 /*
576  *	switch_to(x,y) should switch tasks from x to y.
577  *
578  * This could still be optimized:
579  * - fold all the options into a flag word and test it with a single test.
580  * - the fs/gs checks could be done bitsliced
581  *
582  * Kprobes are not supported here. Set the probe on schedule() instead.
583  */
584 struct task_struct *
585 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
586 {
587 	struct thread_struct *prev = &prev_p->thread,
588 				 *next = &next_p->thread;
589 	int cpu = smp_processor_id();
590 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
591 
592 	/* we're going to use this soon, after a few expensive things */
593 	if (next_p->fpu_counter > 5)
594 		prefetch(&next->i387.fxsave);
595 
596 	/*
597 	 * Reload the kernel stack pointer (rsp0) in the TSS:
598 	 */
599 	tss->rsp0 = next->rsp0;
600 
601 	/*
602 	 * Switch DS and ES.
603 	 * This won't pick up thread selector changes, but I guess that is ok.
604 	 */
605 	asm volatile("mov %%es,%0" : "=m" (prev->es));
606 	if (unlikely(next->es | prev->es))
607 		loadsegment(es, next->es);
608 
609 	asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
610 	if (unlikely(next->ds | prev->ds))
611 		loadsegment(ds, next->ds);
612 
613 	load_TLS(next, cpu);
614 
615 	/*
616 	 * Switch FS and GS.
617 	 */
618 	{
619 		unsigned fsindex;
620 		asm volatile("movl %%fs,%0" : "=r" (fsindex));
621 		/* A segment register != 0 always requires a reload.
622 		   Also reload when it has changed.
623 		   When the previous process used a 64bit base, always
624 		   reload to avoid an information leak. */
625 		if (unlikely(fsindex | next->fsindex | prev->fs)) {
626 			loadsegment(fs, next->fsindex);
627 			/* Check if the user used a selector != 0;
628 			 * if so, clear the 64bit base, since an overloaded
629 			 * base is always mapped to the null selector.
630 			 */
631 			if (fsindex)
632 				prev->fs = 0;
633 		}
634 		/* when next process has a 64bit base use it */
635 		if (next->fs)
636 			wrmsrl(MSR_FS_BASE, next->fs);
637 		prev->fsindex = fsindex;
638 	}
639 	{
640 		unsigned gsindex;
641 		asm volatile("movl %%gs,%0" : "=r" (gsindex));
642 		if (unlikely(gsindex | next->gsindex | prev->gs)) {
643 			load_gs_index(next->gsindex);
644 			if (gsindex)
645 				prev->gs = 0;
646 		}
647 		if (next->gs)
648 			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
649 		prev->gsindex = gsindex;
650 	}
651 
652 	/* Must be after DS reload */
653 	unlazy_fpu(prev_p);
654 
655 	/*
656 	 * Switch the PDA and FPU contexts.
657 	 */
658 	prev->userrsp = read_pda(oldrsp);
659 	write_pda(oldrsp, next->userrsp);
660 	write_pda(pcurrent, next_p);
661 
662 	write_pda(kernelstack,
663 		  (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
664 #ifdef CONFIG_CC_STACKPROTECTOR
665 	write_pda(stack_canary, next_p->stack_canary);
666 	/*
667 	 * Build time only check to make sure the stack_canary is at
668 	 * offset 40 in the pda; this is a gcc ABI requirement
669 	 */
670 	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
671 #endif
672 
673 	/*
674 	 * Now maybe reload the debug registers and handle I/O bitmaps
675 	 */
676 	if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
677 	    || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
678 		__switch_to_xtra(prev_p, next_p, tss);
679 
680 	/* If the task has used the FPU during the last 5 timeslices, just do
681 	 * a full restore of the math state immediately to avoid the trap; the
682 	 * chances of needing the FPU soon are obviously high now.
683 	 */
684 	if (next_p->fpu_counter > 5)
685 		math_state_restore();
686 	return prev_p;
687 }
688 
689 /*
690  * sys_execve() executes a new program.
691  */
692 asmlinkage
693 long sys_execve(char __user *name, char __user * __user *argv,
694 		char __user * __user *envp, struct pt_regs regs)
695 {
696 	long error;
697 	char * filename;
698 
699 	filename = getname(name);
700 	error = PTR_ERR(filename);
701 	if (IS_ERR(filename))
702 		return error;
703 	error = do_execve(filename, argv, envp, &regs);
704 	if (error == 0) {
705 		task_lock(current);
706 		current->ptrace &= ~PT_DTRACE;
707 		task_unlock(current);
708 	}
709 	putname(filename);
710 	return error;
711 }
712 
713 void set_personality_64bit(void)
714 {
715 	/* inherit personality from parent */
716 
717 	/* Make sure to be in 64bit mode */
718 	clear_thread_flag(TIF_IA32);
719 
720 	/* TBD: this overwrites the user's setup. Should have two bits.
721 	   But 64bit processes have always behaved this way,
722 	   so it's not too bad. The main problem is just that
723 	   32bit children are affected again. */
724 	current->personality &= ~READ_IMPLIES_EXEC;
725 }
726 
727 asmlinkage long sys_fork(struct pt_regs *regs)
728 {
729 	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
730 }
731 
732 asmlinkage long
733 sys_clone(unsigned long clone_flags, unsigned long newsp,
734 	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
735 {
736 	if (!newsp)
737 		newsp = regs->rsp;
738 	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
739 }
740 
741 /*
742  * This is trivial, and on the face of it looks like it
743  * could equally well be done in user mode.
744  *
745  * Not so, for quite unobvious reasons - register pressure.
746  * In user mode vfork() cannot have a stack frame, and if
747  * done by calling the "clone()" system call directly, you
748  * do not have enough call-clobbered registers to hold all
749  * the information you need.
750  */
751 asmlinkage long sys_vfork(struct pt_regs *regs)
752 {
753 	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
754 		    NULL, NULL);
755 }
756 
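/*
 * Walk a sleeping task's saved frame pointers and return the first
 * return address outside the scheduler, i.e. the place where the task
 * is blocked ("wchan").
 */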
757 unsigned long get_wchan(struct task_struct *p)
758 {
759 	unsigned long stack;
760 	u64 fp,rip;
761 	int count = 0;
762 
763 	if (!p || p == current || p->state==TASK_RUNNING)
764 		return 0;
765 	stack = (unsigned long)task_stack_page(p);
766 	if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
767 		return 0;
768 	fp = *(u64 *)(p->thread.rsp);
769 	do {
770 		if (fp < (unsigned long)stack ||
771 		    fp > (unsigned long)stack+THREAD_SIZE)
772 			return 0;
773 		rip = *(u64 *)(fp+8);
774 		if (!in_sched_functions(rip))
775 			return rip;
776 		fp = *(u64 *)fp;
777 	} while (count++ < 16);
778 	return 0;
779 }
780 
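/*
 * arch_prctl() backend: get or set the FS/GS base of a task.  Bases that
 * fit in 32 bits go through a GDT TLS slot, because reloading a segment
 * register is cheaper than a wrmsr; larger bases are written to the
 * FS_BASE / KERNEL_GS_BASE MSRs with the selector cleared to 0.
 */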
781 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
782 {
783 	int ret = 0;
784 	int doit = task == current;
785 	int cpu;
786 
787 	switch (code) {
788 	case ARCH_SET_GS:
789 		if (addr >= TASK_SIZE_OF(task))
790 			return -EPERM;
791 		cpu = get_cpu();
792 		/* handle small bases via the GDT because that's faster to
793 		   switch. */
794 		if (addr <= 0xffffffff) {
795 			set_32bit_tls(task, GS_TLS, addr);
796 			if (doit) {
797 				load_TLS(&task->thread, cpu);
798 				load_gs_index(GS_TLS_SEL);
799 			}
800 			task->thread.gsindex = GS_TLS_SEL;
801 			task->thread.gs = 0;
802 		} else {
803 			task->thread.gsindex = 0;
804 			task->thread.gs = addr;
805 			if (doit) {
806 				load_gs_index(0);
807 				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
808 			}
809 		}
810 		put_cpu();
811 		break;
812 	case ARCH_SET_FS:
813 		/* Not strictly needed for fs, but do it for symmetry
814 		   with gs */
815 		if (addr >= TASK_SIZE_OF(task))
816 			return -EPERM;
817 		cpu = get_cpu();
818 		/* handle small bases via the GDT because that's faster to
819 		   switch. */
820 		if (addr <= 0xffffffff) {
821 			set_32bit_tls(task, FS_TLS, addr);
822 			if (doit) {
823 				load_TLS(&task->thread, cpu);
824 				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
825 			}
826 			task->thread.fsindex = FS_TLS_SEL;
827 			task->thread.fs = 0;
828 		} else {
829 			task->thread.fsindex = 0;
830 			task->thread.fs = addr;
831 			if (doit) {
832 				/* set the selector to 0 to not confuse
833 				   __switch_to */
834 				asm volatile("movl %0,%%fs" :: "r" (0));
835 				ret = checking_wrmsrl(MSR_FS_BASE, addr);
836 			}
837 		}
838 		put_cpu();
839 		break;
840 	case ARCH_GET_FS: {
841 		unsigned long base;
842 		if (task->thread.fsindex == FS_TLS_SEL)
843 			base = read_32bit_tls(task, FS_TLS);
844 		else if (doit)
845 			rdmsrl(MSR_FS_BASE, base);
846 		else
847 			base = task->thread.fs;
848 		ret = put_user(base, (unsigned long __user *)addr);
849 		break;
850 	}
851 	case ARCH_GET_GS: {
852 		unsigned long base;
853 		unsigned gsindex;
854 		if (task->thread.gsindex == GS_TLS_SEL)
855 			base = read_32bit_tls(task, GS_TLS);
856 		else if (doit) {
857 			asm("movl %%gs,%0" : "=r" (gsindex));
858 			if (gsindex)
859 				rdmsrl(MSR_KERNEL_GS_BASE, base);
860 			else
861 				base = task->thread.gs;
862 		}
863 		else
864 			base = task->thread.gs;
865 		ret = put_user(base, (unsigned long __user *)addr);
866 		break;
867 	}
868 
869 	default:
870 		ret = -EINVAL;
871 		break;
872 	}
873 
874 	return ret;
875 }
876 
877 long sys_arch_prctl(int code, unsigned long addr)
878 {
879 	return do_arch_prctl(current, code, addr);
880 }
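
/*
 * Illustrative user-space sketch (an assumption for documentation, not
 * part of this file): arch_prctl() is normally reached via the raw
 * syscall interface, with new_base being any caller-chosen address:
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, &base);
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, new_base);
 */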
881 
882 /*
883  * Capture a task's saved user-space registers (the task must not be running in user space).
884  */
885 int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
886 {
887 	struct pt_regs *pp, ptregs;
888 
889 	pp = task_pt_regs(tsk);
890 
891 	ptregs = *pp;
892 	ptregs.cs &= 0xffff;
893 	ptregs.ss &= 0xffff;
894 
895 	elf_core_copy_regs(regs, &ptregs);
896 
897 	return 1;
898 }
899 
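/*
 * Randomize the initial user stack pointer by up to 8KB (unless address
 * space randomization is disabled) and align it to 16 bytes.
 */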
900 unsigned long arch_align_stack(unsigned long sp)
901 {
902 	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
903 		sp -= get_random_int() % 8192;
904 	return sp & ~0xf;
905 }
906