linux/arch/x86/kernel/ptrace_64.c
Roland McGrath df5d438e33 x86: ptrace fs/gs_base
The fs_base and gs_base fields are available in user_regs_struct.
But reading these via ptrace (PTRACE_GETREGS or PTRACE_PEEKUSR) does
not give a reliably useful value.  The thread_struct fields are 0
when do_arch_prctl decided to use a GDT slot instead of MSR_FS_BASE,
which it does for a value under 1<<32.

This changes ptrace access to fs_base and gs_base to work like
PTRACE_ARCH_PRCTL does.  That is, it reads the base address that
user-mode memory access using the fs/gs instruction prefixes will
use, regardless of how it's being implemented in the kernel.  The
MSR vs GDT is an implementation detail that is pretty much hidden
from userland in actual use, and there is no reason that
ptrace should give the internal implementation picture rather than
the user-mode semantic picture.  In the case of setting the value,
this can implicitly change the fsindex/gsindex value (also
separately in user_regs_struct), which is what happens when the
thread calls arch_prctl itself.  In a PTRACE_SETREGS, the fs_base
change will come after the fsindex change due to the order of the
struct, and so a change the debugger made to fs_base will have the
effect intended, but another part of the user_regs_struct will now
differ when read back from what the debugger wrote.

This makes PTRACE_ARCH_PRCTL obsolete.  We could consider declaring
it deprecated and removing it one day, though there is no hurry.
For the foreseeable future, debuggers have to assume an old kernel
that does not report reliable fs_base/gs_base values in user_regs_struct
and stick to PTRACE_ARCH_PRCTL anyway.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2008-01-30 13:30:45 +01:00

648 lines
17 KiB
C

/* By Ross Biro 1/23/92 */
/*
* Pentium III FXSR, SSE support
* Gareth Hughes <gareth@valinux.com>, May 2000
*
* x86-64 port 2000-2002 Andi Kleen
*/
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/user.h>
#include <linux/security.h>
#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/signal.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/prctl.h>
#include <asm/i387.h>
#include <asm/debugreg.h>
#include <asm/ldt.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
/*
* does not yet catch signals sent when the child dies.
* in exit.c or in signal.c.
*/
/*
* Determines which flags the user has access to [1 = access, 0 = no access].
* Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9).
* Also masks reserved bits (63-22, 15, 5, 3, 1).
*/
#define FLAG_MASK 0x54dd5UL
/* The trap flag (TF), bit 8 of eflags; set to single-step the child. */
#define TRAP_FLAG 0x100UL
/*
* EFLAGS is the byte offset of eflags inside struct pt_regs.
* EFL_OFFSET is that same offset measured back from the end of
* struct pt_regs, so it is negative; it is the offset handed to
* get_stack_long() to reach the child's saved eflags.
*/
#define EFLAGS offsetof(struct pt_regs, eflags)
#define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs)))
/*
 * Fetch one word from the task's privileged stack.
 *
 * 'offset' is a byte displacement relative to the stack base kept in
 * task->thread.rsp0.  The privileged stack is assumed to be directly
 * addressable from our data space, so it is read with a plain load.
 */
static inline unsigned long get_stack_long(struct task_struct *task, int offset)
{
	unsigned long *slot;

	slot = (unsigned long *)((unsigned char *)task->thread.rsp0 + offset);
	return *slot;
}
/*
 * Store one word on the task's privileged stack.
 *
 * 'offset' is a byte displacement relative to the stack base kept in
 * task->thread.rsp0.  The privileged stack is assumed to be directly
 * addressable from our data space, so it is written with a plain store.
 *
 * Always returns 0.
 */
static inline long put_stack_long(struct task_struct *task, int offset,
				  unsigned long data)
{
	unsigned long *slot;

	slot = (unsigned long *)((unsigned char *)task->thread.rsp0 + offset);
	*slot = data;
	return 0;
}
/* Bit 2 of a selector: set when the selector indexes the LDT. */
#define LDT_SEGMENT 4

/*
 * Translate the child's saved cs:rip into a linear address.
 *
 * Selectors without the LDT bit are treated as zero-based, so rip is
 * returned unchanged.  That is largely true for the GDT: the TLS
 * segments are used for data, and the PNPBIOS and APM bios ones are
 * simply ignored here.
 *
 * For an LDT selector the descriptor is looked up under the mm's
 * context lock.  An index beyond the LDT size yields -1L (the access
 * would fault anyway); otherwise the descriptor base is added, after
 * truncating rip to 16 bits when bit 22 of the descriptor's high word
 * is clear.
 */
unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs)
{
	unsigned long linear = regs->rip;
	unsigned long selector = regs->cs & 0xffff;
	unsigned long seg_base;
	u32 *entry;

	if (!(selector & LDT_SEGMENT))
		return linear;

	/* Drop RPL and the table-indicator bit: what remains is index * 8. */
	selector &= ~7UL;

	mutex_lock(&child->mm->context.lock);
	if (unlikely((selector >> 3) >= child->mm->context.size)) {
		/* bogus selector, access would fault */
		linear = -1L;
	} else {
		entry = child->mm->context.ldt + selector;
		seg_base = ((entry[0] >> 16) |
			    ((entry[1] & 0xff) << 16) |
			    (entry[1] & 0xff000000));

		/* 16-bit code segment? */
		if (!((entry[1] >> 22) & 1))
			linear &= 0xffff;
		linear += seg_base;
	}
	mutex_unlock(&child->mm->context.lock);

	return linear;
}
/*
 * Decide whether the instruction the child is about to execute can
 * itself change TF.
 *
 * Up to 15 bytes are copied from the child at its linear rip, and
 * prefix bytes are skipped until the first non-prefix byte.  Returns 1
 * when that byte is popf (0x9d) or iret (0xcf), and 0 for anything
 * else, including the case where no byte could be read.
 */
static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
{
	unsigned char insn[15];
	unsigned long pc = convert_rip_to_linear(child, regs);
	int nbytes = access_process_vm(child, pc, insn, sizeof(insn), 0);
	int pos;

	for (pos = 0; pos < nbytes; pos++) {
		switch (insn[pos]) {
		/* popf and iret */
		case 0x9d:
		case 0xcf:
			return 1;

		/* CHECKME: 64 65 */

		/* opcode and address size prefixes */
		case 0x66:
		case 0x67:
			continue;

		/* irrelevant prefixes (segment overrides and repeats) */
		case 0x26:
		case 0x2e:
		case 0x36:
		case 0x3e:
		case 0x64:
		case 0x65:
		case 0xf2:
		case 0xf3:
			continue;

		case 0x40 ... 0x4f:
			/* 64-bit mode: REX prefix, keep scanning */
			if (regs->cs == __USER_CS)
				continue;
			/* 32-bit mode: register increment */
			return 0;

		/* CHECKME: f2, f3 */

		/*
		 * pushf: NOTE! We should probably not let
		 * the user see the TF bit being set. But
		 * it's more pain than it's worth to avoid
		 * it, and a debugger could emulate this
		 * all in user space if it _really_ cares.
		 */
		case 0x9c:
		default:
			return 0;
		}
	}

	return 0;
}
/*
 * Arm single-stepping for the child.
 *
 * TIF_SINGLESTEP is always set.  If TF is not yet set in the saved
 * eflags it is set here, and PT_DTRACE records that we were the ones
 * who set it -- unless the instruction about to run changes TF itself.
 */
static void set_singlestep(struct task_struct *child)
{
	struct pt_regs *frame = task_pt_regs(child);

	/*
	 * Always set TIF_SINGLESTEP - this guarantees that
	 * we single-step system calls etc..  This will also
	 * cause us to set TF when returning to user mode.
	 */
	set_tsk_thread_flag(child, TIF_SINGLESTEP);

	/* Nothing more to do when TF was already set. */
	if (!(frame->eflags & TRAP_FLAG)) {
		/* Set TF on the kernel stack.. */
		frame->eflags |= TRAP_FLAG;

		/*
		 * ..but if TF is changed by the instruction we will trace,
		 * don't mark it as being "us" that set it, so that we
		 * won't clear it by hand later.
		 */
		if (!is_setting_trap_flag(child, frame))
			child->ptrace |= PT_DTRACE;
	}
}
/*
 * Disarm single-stepping for the child.
 *
 * TIF_SINGLESTEP is always cleared.  TF in the saved eflags is only
 * cleared when PT_DTRACE says we set it; PT_DTRACE is dropped with it.
 */
static void clear_singlestep(struct task_struct *child)
{
	struct pt_regs *frame;

	/* Always clear TIF_SINGLESTEP... */
	clear_tsk_thread_flag(child, TIF_SINGLESTEP);

	/* But touch TF only if it was set by us.. */
	if (!(child->ptrace & PT_DTRACE))
		return;

	frame = task_pt_regs(child);
	frame->eflags &= ~TRAP_FLAG;
	child->ptrace &= ~PT_DTRACE;
}
/*
 * Detach hook, called by kernel/ptrace.c.
 *
 * Ensures the child is not left single-stepping once the tracer is gone.
 */
void ptrace_disable(struct task_struct *child)
{
	clear_singlestep(child);
}
/*
 * Write one register of the traced child.
 *
 * @child: the traced task
 * @regno: byte offset of the register inside struct user_regs_struct
 * @value: new value supplied by the tracer
 *
 * fs, gs, ds and es selectors are kept in child->thread; a non-zero
 * selector must have RPL 3.  fs_base and gs_base go through
 * do_arch_prctl().  Everything else lives in the saved register frame
 * on the child's kernel stack and is stored with put_stack_long().
 *
 * Returns 0 on success, -EIO for a rejected value, or whatever
 * do_arch_prctl() returns for the segment bases.
 */
static int putreg(struct task_struct *child,
		  unsigned long regno, unsigned long value)
{
	unsigned long tmp;

	switch (regno) {
	case offsetof(struct user_regs_struct,fs):
		if (value && (value & 3) != 3)
			return -EIO;
		child->thread.fsindex = value & 0xffff;
		return 0;
	case offsetof(struct user_regs_struct,gs):
		if (value && (value & 3) != 3)
			return -EIO;
		child->thread.gsindex = value & 0xffff;
		return 0;
	case offsetof(struct user_regs_struct,ds):
		if (value && (value & 3) != 3)
			return -EIO;
		child->thread.ds = value & 0xffff;
		return 0;
	case offsetof(struct user_regs_struct,es):
		if (value && (value & 3) != 3)
			return -EIO;
		child->thread.es = value & 0xffff;
		return 0;
	case offsetof(struct user_regs_struct,fs_base):
		if (value >= TASK_SIZE_OF(child))
			return -EIO;
		/*
		 * When changing the segment base, use do_arch_prctl
		 * to set either thread.fs or thread.fsindex and the
		 * corresponding GDT slot.
		 */
		if (child->thread.fs != value)
			return do_arch_prctl(child, ARCH_SET_FS, value);
		return 0;
	case offsetof(struct user_regs_struct,gs_base):
		/*
		 * Exactly the same here as the %fs handling above.
		 */
		if (value >= TASK_SIZE_OF(child))
			return -EIO;
		if (child->thread.gs != value)
			return do_arch_prctl(child, ARCH_SET_GS, value);
		return 0;
	case offsetof(struct user_regs_struct, eflags):
		/*
		 * Take only the user-changeable bits from the tracer and
		 * keep every other bit as currently saved for the child.
		 */
		value &= FLAG_MASK;
		tmp = get_stack_long(child, EFL_OFFSET);
		tmp &= ~FLAG_MASK;
		value |= tmp;
		break;
	case offsetof(struct user_regs_struct,cs):
	case offsetof(struct user_regs_struct,ss):
		/*
		 * cs and ss must always carry RPL 3.  Both are stored in
		 * the saved frame below.  ss used to return success right
		 * after masking, so the write was silently discarded even
		 * though getreg() reads ss back from that same frame.
		 */
		if ((value & 3) != 3)
			return -EIO;
		value &= 0xffff;
		break;
	default:
		break;
	}

	/*
	 * The register offset is rebased to the end of struct pt_regs,
	 * which is how put_stack_long() addresses the saved frame.
	 */
	put_stack_long(child, regno - sizeof(struct pt_regs), value);
	return 0;
}
/*
 * Read one register of the traced child.
 *
 * @child: the traced task
 * @regno: byte offset of the register inside struct user_regs_struct
 *
 * Segment selectors and segment bases come from child->thread; every
 * other register is read from the saved frame on the child's kernel
 * stack, truncated to 32 bits when the child has TIF_IA32 set.
 */
static unsigned long getreg(struct task_struct *child, unsigned long regno)
{
	unsigned long word;

	switch (regno) {
	case offsetof(struct user_regs_struct, fs):
		return child->thread.fsindex;
	case offsetof(struct user_regs_struct, gs):
		return child->thread.gsindex;
	case offsetof(struct user_regs_struct, ds):
		return child->thread.ds;
	case offsetof(struct user_regs_struct, es):
		return child->thread.es;

	case offsetof(struct user_regs_struct, fs_base):
		/*
		 * do_arch_prctl may have used a GDT slot instead of
		 * the MSR.  To userland, it appears the same either
		 * way, except the %fs segment selector might not be 0.
		 */
		if (child->thread.fs != 0)
			return child->thread.fs;
		if (child->thread.fsindex == FS_TLS_SEL)
			return get_desc_base(&child->thread.tls_array[FS_TLS]);
		return 0;

	case offsetof(struct user_regs_struct, gs_base):
		/*
		 * Exactly the same here as the %fs handling above.
		 */
		if (child->thread.gs != 0)
			return child->thread.gs;
		if (child->thread.gsindex == GS_TLS_SEL)
			return get_desc_base(&child->thread.tls_array[GS_TLS]);
		return 0;

	default:
		break;
	}

	/* Everything else is read from the saved register frame. */
	word = get_stack_long(child, regno - sizeof(struct pt_regs));
	if (test_tsk_thread_flag(child, TIF_IA32))
		word &= 0xffffffff;
	return word;
}
/*
 * Architecture-specific ptrace request dispatcher.
 *
 * child:   the traced task the request operates on
 * request: PTRACE_* request code
 * addr:    request-specific address/offset argument
 * data:    request-specific value or user-space pointer
 *
 * Returns the result of the selected request: 0 or a negative error
 * for the cases handled here, or whatever ptrace_request() returns for
 * requests this function does not handle itself.
 */
long arch_ptrace(struct task_struct *child, long request, long addr, long data)
{
long i, ret;
unsigned ui;
switch (request) {
/* when I and D space are separate, these will need to be fixed. */
case PTRACE_PEEKTEXT: /* read word at location addr. */
case PTRACE_PEEKDATA:
ret = generic_ptrace_peekdata(child, addr, data);
break;
/* read the word at location addr in the USER area. */
case PTRACE_PEEKUSR: {
unsigned long tmp;
ret = -EIO;
/* addr must be 8-byte aligned and lie inside struct user. */
if ((addr & 7) ||
addr > sizeof(struct user) - 7)
break;
switch (addr) {
/* General registers are read through getreg(). */
case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
tmp = getreg(child, addr);
break;
/* Debug registers 0-3, 6 and 7 are kept in child->thread. */
case offsetof(struct user, u_debugreg[0]):
tmp = child->thread.debugreg0;
break;
case offsetof(struct user, u_debugreg[1]):
tmp = child->thread.debugreg1;
break;
case offsetof(struct user, u_debugreg[2]):
tmp = child->thread.debugreg2;
break;
case offsetof(struct user, u_debugreg[3]):
tmp = child->thread.debugreg3;
break;
case offsetof(struct user, u_debugreg[6]):
tmp = child->thread.debugreg6;
break;
case offsetof(struct user, u_debugreg[7]):
tmp = child->thread.debugreg7;
break;
/* Any other offset inside struct user reads back as 0. */
default:
tmp = 0;
break;
}
/* The word is returned through the user pointer in 'data'. */
ret = put_user(tmp,(unsigned long __user *) data);
break;
}
/* when I and D space are separate, this will have to be fixed. */
case PTRACE_POKETEXT: /* write the word at location addr. */
case PTRACE_POKEDATA:
ret = generic_ptrace_pokedata(child, addr, data);
break;
case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
{
/* Slack subtracted from the address limit below: 3 for TIF_IA32 tasks, else 7. */
int dsize = test_tsk_thread_flag(child, TIF_IA32) ? 3 : 7;
/* ret stays -EIO for every path below that does not set it to 0. */
ret = -EIO;
/* addr must be 8-byte aligned and lie inside struct user. */
if ((addr & 7) ||
addr > sizeof(struct user) - 7)
break;
switch (addr) {
/* General registers are written through putreg(). */
case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
ret = putreg(child, addr, data);
break;
/* Disallows to set a breakpoint into the vsyscall */
case offsetof(struct user, u_debugreg[0]):
if (data >= TASK_SIZE_OF(child) - dsize) break;
child->thread.debugreg0 = data;
ret = 0;
break;
case offsetof(struct user, u_debugreg[1]):
if (data >= TASK_SIZE_OF(child) - dsize) break;
child->thread.debugreg1 = data;
ret = 0;
break;
case offsetof(struct user, u_debugreg[2]):
if (data >= TASK_SIZE_OF(child) - dsize) break;
child->thread.debugreg2 = data;
ret = 0;
break;
case offsetof(struct user, u_debugreg[3]):
if (data >= TASK_SIZE_OF(child) - dsize) break;
child->thread.debugreg3 = data;
ret = 0;
break;
case offsetof(struct user, u_debugreg[6]):
/* Reject values with any of the upper 32 bits set. */
if (data >> 32)
break;
child->thread.debugreg6 = data;
ret = 0;
break;
case offsetof(struct user, u_debugreg[7]):
/* See arch/i386/kernel/ptrace.c for an explanation of
* this awkward check.*/
data &= ~DR_CONTROL_RESERVED;
/*
* Each of the four 4-bit fields starting at bit 16 is used as an
* index into the bitmap 0x5554, which has bits 2, 4, ..., 14 set:
* any field holding a non-zero even value stops the loop early
* and the write is rejected.
*/
for(i=0; i<4; i++)
if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
break;
/* i == 4 means all four fields passed the check. */
if (i == 4) {
child->thread.debugreg7 = data;
/* TIF_DEBUG tracks whether the stored debugreg7 value is non-zero. */
if (data)
set_tsk_thread_flag(child, TIF_DEBUG);
else
clear_tsk_thread_flag(child, TIF_DEBUG);
ret = 0;
}
break;
}
break;
}
case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
case PTRACE_CONT: /* restart after signal. */
ret = -EIO;
if (!valid_signal(data))
break;
if (request == PTRACE_SYSCALL)
set_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
else
clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
/* NOTE(review): redundant, clear_singlestep() below clears this flag too. */
clear_tsk_thread_flag(child, TIF_SINGLESTEP);
/* The signal number to continue with is handed over in exit_code. */
child->exit_code = data;
/* make sure the single step bit is not set. */
clear_singlestep(child);
wake_up_process(child);
ret = 0;
break;
#ifdef CONFIG_IA32_EMULATION
/* This makes only sense with 32bit programs. Allow a
64bit debugger to fully examine them too. Better
don't use it against 64bit processes, use
PTRACE_ARCH_PRCTL instead. */
/*
* Both thread-area requests temporarily overwrite p->entry_number in
* the tracer's user_desc with 'addr', call the helper, and then put
* the old value back.
* NOTE(review): the get_user()/put_user() results are ignored, so a
* faulting 'data' pointer is only noticed if the helper itself fails.
* NOTE(review): 'case PTRACE_GET_THREAD_AREA' sits inside the braces
* opened by PTRACE_SET_THREAD_AREA so that it can share 'p' and 'old'.
*/
case PTRACE_SET_THREAD_AREA: {
struct user_desc __user *p;
int old;
p = (struct user_desc __user *)data;
get_user(old, &p->entry_number);
put_user(addr, &p->entry_number);
ret = do_set_thread_area(&child->thread, p);
put_user(old, &p->entry_number);
break;
case PTRACE_GET_THREAD_AREA:
p = (struct user_desc __user *)data;
get_user(old, &p->entry_number);
put_user(addr, &p->entry_number);
ret = do_get_thread_area(&child->thread, p);
put_user(old, &p->entry_number);
break;
}
#endif
/* normal 64bit interface to access TLS data.
Works just like arch_prctl, except that the arguments
are reversed. */
case PTRACE_ARCH_PRCTL:
ret = do_arch_prctl(child, data, addr);
break;
/*
* make the child exit. Best I can do is send it a sigkill.
* perhaps it should be put in the status that it wants to
* exit.
*/
case PTRACE_KILL:
ret = 0;
if (child->exit_state == EXIT_ZOMBIE) /* already dead */
break;
/* NOTE(review): redundant, clear_singlestep() below clears this flag too. */
clear_tsk_thread_flag(child, TIF_SINGLESTEP);
child->exit_code = SIGKILL;
/* make sure the single step bit is not set. */
clear_singlestep(child);
wake_up_process(child);
break;
case PTRACE_SINGLESTEP: /* set the trap flag. */
ret = -EIO;
if (!valid_signal(data))
break;
clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
set_singlestep(child);
child->exit_code = data;
/* give it a chance to run. */
wake_up_process(child);
ret = 0;
break;
case PTRACE_GETREGS: { /* Get all gp regs from the child. */
if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
sizeof(struct user_regs_struct))) {
ret = -EIO;
break;
}
ret = 0;
/*
* Copy out one long per register offset.  __put_user() results are
* OR-ed together, so the loop always runs to the end and ret is
* non-zero if any store failed.
*/
for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
ret |= __put_user(getreg(child, ui),(unsigned long __user *) data);
data += sizeof(long);
}
break;
}
case PTRACE_SETREGS: { /* Set all gp regs in the child. */
unsigned long tmp;
if (!access_ok(VERIFY_READ, (unsigned __user *)data,
sizeof(struct user_regs_struct))) {
ret = -EIO;
break;
}
ret = 0;
/*
* Copy in one long per register offset and stop at the first failing
* read or putreg(); registers written before the failure stay written.
*/
for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
ret = __get_user(tmp, (unsigned long __user *) data);
if (ret)
break;
ret = putreg(child, ui, tmp);
if (ret)
break;
data += sizeof(long);
}
break;
}
case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */
if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
sizeof(struct user_i387_struct))) {
ret = -EIO;
break;
}
ret = get_fpregs((struct user_i387_struct __user *)data, child);
break;
}
case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */
if (!access_ok(VERIFY_READ, (unsigned __user *)data,
sizeof(struct user_i387_struct))) {
ret = -EIO;
break;
}
set_stopped_child_used_math(child);
ret = set_fpregs(child, (struct user_i387_struct __user *)data);
break;
}
/* Everything else goes to ptrace_request(). */
default:
ret = ptrace_request(child, request, addr, data);
break;
}
return ret;
}
/*
 * Report a syscall stop to the tracer.
 *
 * The stop is signalled as SIGTRAP, with bit 0x80 added when the task
 * was traced with PT_TRACESYSGOOD.  If the tracer left a signal number
 * in current->exit_code, that signal is sent to the current task and
 * exit_code is reset.
 */
static void syscall_trace(struct pt_regs *regs)
{
	int stop_code = SIGTRAP;

#if 0
	printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n",
	       current->comm,
	       regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0),
	       current_thread_info()->flags, current->ptrace);
#endif

	if (current->ptrace & PT_TRACESYSGOOD)
		stop_code |= 0x80;
	ptrace_notify(stop_code);

	/*
	 * this isn't the same as continuing with a signal, but it will do
	 * for normal use.  strace only continues with a signal if the
	 * stopping signal is not SIGTRAP.  -brl
	 */
	if (!current->exit_code)
		return;

	send_sig(current->exit_code, current, 1);
	current->exit_code = 0;
}
/*
 * Syscall-entry hook.
 *
 * Runs the secure-computing check, then reports a syscall stop when
 * the task is ptraced with TIF_SYSCALL_TRACE set, and finally records
 * the syscall for auditing when the task has an audit context.  The
 * argument registers handed to audit depend on whether the task has
 * TIF_IA32 set.
 */
asmlinkage void syscall_trace_enter(struct pt_regs *regs)
{
	/* do the secure computing check first */
	secure_computing(regs->orig_rax);

	if (test_thread_flag(TIF_SYSCALL_TRACE) &&
	    (current->ptrace & PT_PTRACED)) {
		syscall_trace(regs);
	}

	if (unlikely(current->audit_context)) {
		if (!test_thread_flag(TIF_IA32)) {
			audit_syscall_entry(AUDIT_ARCH_X86_64,
					    regs->orig_rax,
					    regs->rdi, regs->rsi,
					    regs->rdx, regs->r10);
		} else {
			audit_syscall_entry(AUDIT_ARCH_I386,
					    regs->orig_rax,
					    regs->rbx, regs->rcx,
					    regs->rdx, regs->rsi);
		}
	}
}
/*
 * Syscall-exit hook.
 *
 * Records the syscall result for auditing when the task has an audit
 * context, then reports a syscall stop when the task is ptraced and
 * has either TIF_SYSCALL_TRACE or TIF_SINGLESTEP set.
 */
asmlinkage void syscall_trace_leave(struct pt_regs *regs)
{
	if (unlikely(current->audit_context))
		audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax);

	/* Neither syscall tracing nor single-stepping: nothing to report. */
	if (!test_thread_flag(TIF_SYSCALL_TRACE) &&
	    !test_thread_flag(TIF_SINGLESTEP))
		return;

	if (current->ptrace & PT_PTRACED)
		syscall_trace(regs);
}