/*
 * x86 single-step support code, common to 32-bit and 64-bit.
 */
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/ptrace.h>

#ifdef CONFIG_X86_32
#include <linux/uaccess.h>

#include <asm/desc.h>

/*
 * Return EIP plus the CS segment base.  The segment limit is also
 * adjusted, clamped to the kernel/user address space (whichever is
 * appropriate), and returned in *eip_limit.
 *
 * The segment is checked, because it might have been changed by another
 * task between the original faulting instruction and here.
 *
 * If CS is no longer a valid code segment, or if EIP is beyond the
 * limit, or if it is a kernel address when CS is not a kernel segment,
 * then the returned value will be greater than *eip_limit.
 *
 * This is slow, but is very rarely executed.
 */
unsigned long get_segment_eip(struct pt_regs *regs,
					    unsigned long *eip_limit)
{
	unsigned long ip = regs->ip;
	unsigned seg = regs->cs & 0xffff;
	u32 seg_ar, seg_limit, base, *desc;

	/* Unlikely, but must come before segment checks. */
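	/*
	 * In VM86 mode CS holds a real-mode paragraph number: the linear
	 * address is (CS << 4) + (IP & 0xffff) and the segment is always
	 * 64K, so there is no descriptor to consult.
	 */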
	if (unlikely(regs->flags & VM_MASK)) {
		base = seg << 4;
		*eip_limit = base + 0xffff;
		return base + (ip & 0xffff);
	}

	/* The standard kernel/user address space limit. */
	*eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;

	/* By far the most common cases. */
	if (likely(SEGMENT_IS_FLAT_CODE(seg)))
		return ip;

	/* Check the segment exists, is within the current LDT/GDT size,
	   that kernel/user (ring 0..3) has the appropriate privilege,
	   that it's a code segment, and get the limit. */
	__asm__("larl %3,%0; lsll %3,%1"
		 : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
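	/*
	 * LAR/LSL fetch the access rights and limit for the selector.  The
	 * 0x9800 mask checks that the segment is present (bit 15), is a
	 * code/data rather than a system descriptor (bit 12), and is
	 * executable (bit 11); if any of those is clear, or EIP is past
	 * the limit, the selector no longer names a usable code segment.
	 */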
	if ((~seg_ar & 0x9800) || ip > seg_limit) {
		*eip_limit = 0;
		return 1;	 /* So that returned ip > *eip_limit. */
	}

	/* Get the GDT/LDT descriptor base.
	   When you look for races in this code remember that
	   LDT and other horrors are only used in user space. */
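	/*
	 * Bit 2 of a selector is the Table Indicator (1 = LDT, 0 = GDT);
	 * the low three bits are TI plus the RPL, so seg & ~7 is the byte
	 * offset of the 8-byte descriptor within the table.
	 */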
	if (seg & (1<<2)) {
		/* Must lock the LDT while reading it. */
		mutex_lock(&current->mm->context.lock);
		desc = current->mm->context.ldt;
		desc = (void *)desc + (seg & ~7);
	} else {
		/* Must disable preemption while reading the GDT. */
		desc = (u32 *)get_cpu_gdt_table(get_cpu());
		desc = (void *)desc + (seg & ~7);
	}

	/* Decode the code segment base from the descriptor */
	base = get_desc_base((struct desc_struct *)desc);

	if (seg & (1<<2))
		mutex_unlock(&current->mm->context.lock);
	else
		put_cpu();

	/* Adjust EIP and segment limit, and clamp at the kernel limit.
	   It's legitimate for segments to wrap at 0xffffffff. */
	seg_limit += base;
	if (seg_limit < *eip_limit && seg_limit >= base)
		*eip_limit = seg_limit;
	return ip + base;
}
#endif

#ifdef CONFIG_X86_32
static
#endif
unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs)
{
	unsigned long addr, seg;

	addr = regs->ip;
	seg = regs->cs & 0xffff;
	if (v8086_mode(regs)) {
		addr = (addr & 0xffff) + (seg << 4);
		return addr;
	}

	/*
	 * We'll assume that the code segments in the GDT
	 * are all zero-based. That is largely true: the
	 * TLS segments are used for data, and the PNPBIOS
	 * and APM bios ones we just ignore here.
	 */
	if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) {
		u32 *desc;
		unsigned long base;

		seg &= ~7UL;

		mutex_lock(&child->mm->context.lock);
		if (unlikely((seg >> 3) >= child->mm->context.size))
			addr = -1L; /* bogus selector, access would fault */
		else {
			desc = child->mm->context.ldt + seg;
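			/*
			 * The 32-bit base is scattered across the descriptor:
			 * bits 15:0 sit in the top half of the first word,
			 * bits 23:16 and 31:24 in the second.
			 */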
			base = ((desc[0] >> 16) |
				((desc[1] & 0xff) << 16) |
				(desc[1] & 0xff000000));

			/* 16-bit code segment? */
			if (!((desc[1] >> 22) & 1))
				addr &= 0xffff;
			addr += base;
		}
		mutex_unlock(&child->mm->context.lock);
	}

	return addr;
}

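/*
 * Check whether the instruction at the child's current IP is one that can
 * change EFLAGS.TF itself (popf or iret, possibly behind prefixes).  If so,
 * the caller must not claim ownership of the TF bit it is about to set.
 * Up to 15 bytes are fetched, the maximum length of an x86 instruction.
 */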
static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
{
	int i, copied;
	unsigned char opcode[15];
	unsigned long addr = convert_rip_to_linear(child, regs);

	copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
	for (i = 0; i < copied; i++) {
		switch (opcode[i]) {
		/* popf and iret */
		case 0x9d: case 0xcf:
			return 1;

			/* CHECKME: 64 65 */

		/* operand and address size prefixes */
		case 0x66: case 0x67:
			continue;
		/* irrelevant prefixes (segment overrides and repeats) */
		case 0x26: case 0x2e:
		case 0x36: case 0x3e:
		case 0x64: case 0x65:
		case 0xf0: case 0xf2: case 0xf3:
			continue;

#ifdef CONFIG_X86_64
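		/*
		 * 0x40..0x4f are REX prefixes only in 64-bit code; in 32-bit
		 * or compat code the same bytes encode inc/dec, so they are
		 * skipped only when CS says the task is running 64-bit code.
		 */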
		case 0x40 ... 0x4f:
			if (regs->cs != __USER_CS)
				/* 32-bit mode: register increment */
				return 0;
			/* 64-bit mode: REX prefix */
			continue;
#endif

			/* CHECKME: f2, f3 */

		/*
		 * pushf: NOTE! We should probably not let
		 * the user see the TF bit being set. But
		 * it's more pain than it's worth to avoid
		 * it, and a debugger could emulate this
		 * all in user space if it _really_ cares.
		 */
		case 0x9c:
		default:
			return 0;
		}
	}
	return 0;
}

/*
 * Enable single-stepping.  Return nonzero if user mode is not using TF itself.
 */
static int enable_single_step(struct task_struct *child)
{
	struct pt_regs *regs = task_pt_regs(child);

	/*
	 * Always set TIF_SINGLESTEP - this guarantees that
	 * we single-step system calls etc..  This will also
	 * cause us to set TF when returning to user mode.
	 */
	set_tsk_thread_flag(child, TIF_SINGLESTEP);

	/*
	 * If TF was already set, don't do anything else
	 */
	if (regs->flags & X86_EFLAGS_TF)
		return 0;

	/* Set TF on the kernel stack.. */
	regs->flags |= X86_EFLAGS_TF;

	/*
	 * ..but if TF is changed by the instruction we will trace,
	 * don't mark it as being "us" that set it, so that we
	 * won't clear it by hand later.
	 */
	if (is_setting_trap_flag(child, regs))
		return 0;

	set_tsk_thread_flag(child, TIF_FORCED_TF);

	return 1;
}

/*
 * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running.
 */
static void write_debugctlmsr(struct task_struct *child, unsigned long val)
{
	child->thread.debugctlmsr = val;

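	/*
	 * If the child is not the current task, the saved value will be
	 * written to the MSR when the child is next switched in.
	 */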
	if (child != current)
		return;

#ifdef CONFIG_X86_64
	wrmsrl(MSR_IA32_DEBUGCTLMSR, val);
#else
	wrmsr(MSR_IA32_DEBUGCTLMSR, val, 0);
#endif
}

/*
 * Enable single or block step.
 */
static void enable_step(struct task_struct *child, bool block)
{
	/*
	 * Make sure block stepping (BTF) is not enabled unless it should be.
	 * Note that we don't try to worry about any is_setting_trap_flag()
	 * instructions after the first when using block stepping.
	 * So no one should try to use debugger block stepping in a program
	 * that uses user-mode single stepping itself.
	 */
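	/*
	 * With both TF and DEBUGCTLMSR_BTF set, the CPU raises the
	 * single-step trap only when a branch is taken, which is what
	 * turns ordinary single stepping into block stepping.
	 */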
	if (enable_single_step(child) && block) {
		set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
		write_debugctlmsr(child,
				  child->thread.debugctlmsr | DEBUGCTLMSR_BTF);
	} else {
		write_debugctlmsr(child,
				  child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);

		if (!child->thread.debugctlmsr)
			clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
	}
}

void user_enable_single_step(struct task_struct *child)
{
	enable_step(child, 0);
}

void user_enable_block_step(struct task_struct *child)
{
	enable_step(child, 1);
}

void user_disable_single_step(struct task_struct *child)
{
	/*
	 * Make sure block stepping (BTF) is disabled.
	 */
	write_debugctlmsr(child,
			  child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);

	if (!child->thread.debugctlmsr)
		clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);

	/* Always clear TIF_SINGLESTEP... */
	clear_tsk_thread_flag(child, TIF_SINGLESTEP);

	/* But touch TF only if it was set by us.. */
	if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF))
		task_pt_regs(child)->flags &= ~X86_EFLAGS_TF;
}