实现Linux ptrace

共计 5172 个字符，预计需要花费 13 分钟才能阅读完成。

导读	本文介绍这些工具的底层 ptrace 是如何实现的。这里选用了 1.2.13 的早期版本，原理是类似的，新版内核代码过多，没必要陷入过多细节中。

实现 Linux ptrace

ptrace 是 Linux 内核提供的非常强大的系统调用，通过 ptrace 可以实现进程的单步调试和收集系统调用情况。比如 strace 和 gdb 都是基于 ptrace 实现的，strace 可以显示进程调用了哪些系统调用，gdb 可以实现对进程的调试。本文介绍这些工具的底层 ptrace 是如何实现的。这里选用了 1.2.13 的早期版本，原理是类似的，新版内核代码过多，没必要陷入过多细节中。

进程调试

ptrace 系统调用的实现中包含了很多功能，首先来看一下单步调试的实现。通过 ptrace 实现单步调试的方式有两种。

1. 父进程执行 fork 创建一个子进程，子进程通过 ptrace(PTRACE_TRACEME) 给自己设置 PF_PTRACED 标记，然后执行 execve 加载被调试的程序。

2. 通过 ptrace attach 到指定的 pid 完成对进程的调试 (控制)。

首先看一下第一种的实现。

方式 1

 pid_t pid = fork();// 子进程 if (pid == 0) {ptrace(PTRACE_TRACEME,0,NULL,NULL); 
    // 加载被调试的程序 
    execve(argv[1], NULL, NULL); 
}

执行 fork 创建子进程后，通过 ptrace 的 PTRACE_TRACEME 指示操作系统设置子进程为被调试 (设置 PF_PTRACED 标记)。来看一下这一步操作系统做了什么事情。

 /*
  * Excerpt of the ptrace system call: only the PTRACE_TRACEME request is shown.
  * For that request the calling task sets PF_PTRACED in its own flags and
  * the call returns 0.
  */
 asmlinkage int sys_ptrace(long request, long pid, long addr, long data)
 {
     if (request == PTRACE_TRACEME) {
         current->flags = current->flags | PF_PTRACED;
         return 0;
     }
     /* Handling of every other request is omitted from this excerpt. */
 }

这一步非常简单，接着看 execve 加载程序到内存执行时又是如何处理的。

 /*
  * Excerpt of do_execve: walks the `formats` list and calls each entry's
  * load_binary hook with &bprm and regs, storing the result in retval.
  * The declarations of fmt, retval and bprm are not part of this excerpt.
  */
 int do_execve(char *filename, char **argv, char **envp, struct pt_regs *regs)
 {
     /* Hand the program to every registered binary-format loader in list order. */
     for (fmt = formats; fmt; fmt = fmt->next) {
         retval = fmt->load_binary(&bprm, regs);
     }
 }

do_execve 逻辑非常复杂，不过我们只关注需要的就好。do_execve 通过钩子函数加载程序，我们看看 formats 是什么。

 /* One node of the binary-format list: a set of loader hooks for one format. */
 struct linux_binfmt {
     /* Link to the following node in the list. */
     struct linux_binfmt *next;
     /* NOTE(review): presumably a usage counter; not referenced in this excerpt. */
     int *use_count;
     /* Hook that loads an executable, given the binprm and the saved registers. */
     int (*load_binary)(struct linux_binprm *, struct pt_regs *regs);
     /* Hook that takes a file descriptor; presumably loads a shared library. */
     int (*load_shlib)(int fd);
     /* Hook that takes a signal number and registers; presumably writes a core dump. */
     int (*core_dump)(long signr, struct pt_regs *regs);
 };
 
/* Head of the singly linked list of binary formats; starts with aout_format. */
static struct linux_binfmt *formats = &aout_format;

/*
 * Append fmt to the end of the formats list.
 *
 * Returns 0 on success, -EINVAL when fmt is a null pointer, and -EBUSY when
 * fmt already has a successor (fmt->next is set) or is already on the list.
 */
int register_binfmt(struct linux_binfmt *fmt)
{
    struct linux_binfmt **link;

    if (!fmt)
        return -EINVAL;
    if (fmt->next)
        return -EBUSY;

    /* Walk to the terminating null link, rejecting a duplicate on the way. */
    for (link = &formats; *link; link = &(*link)->next) {
        if (*link == fmt)
            return -EBUSY;
    }
    *link = fmt;
    return 0;
}

可以看到 formats 是一个链表。可以通过 register_binfmt 函数注册节点。那么谁调用了这个函数呢?

 /* ELF binary format: provides a program loader and a library loader only. */
 struct linux_binfmt elf_format = {
     NULL,             /* next */
     NULL,             /* use_count */
     load_elf_binary,  /* load_binary */
     load_elf_library, /* load_shlib */
     NULL              /* core_dump */
 };

 /*
  * Module entry point: registers the ELF format.
  * The result of register_binfmt is not checked; this always returns 0.
  */
 int init_module(void)
 {
     register_binfmt(&elf_format);
     return 0;
 }

所以最终调用了 load_elf_binary 函数加载程序。同样我们只关注相关的逻辑。

 if (current->flags & PF_PTRACED) 
        send_sig(SIGTRAP, current, 0);

load_elf_binary 中会判断如果进程设置了 PF_PTRACED 标记，那么会给当前进程发送一个 SIGTRAP 信号。接着看信号处理函数的相关逻辑。

 if ((current->flags & PF_PTRACED) && signr != SIGKILL) { 
    current->exit_code = signr; 
    // 修改当前进程（被调试的进程）为暂停状态 
    current->state = TASK_STOPPED; 
    // 通知父进程 
    notify_parent(current); 
    // 调度其他进程执行 
    schedule();}

所以程序被加载到内存后，根本没有机会执行就直接被修改为暂停状态了，接下来看看 notify_parent 通知父进程干什么。

 /*
  * Signal the parent of tsk and wake anything sleeping on the parent's
  * wait_chldexit queue. The signal sent is tsk->exit_signal, which is first
  * forced to SIGCHLD when the parent is task[1].
  */
 void notify_parent(struct task_struct *tsk)
 {
     struct task_struct *parent = tsk->p_pptr;

     if (parent == task[1])
         tsk->exit_signal = SIGCHLD;
     send_sig(tsk->exit_signal, parent, 1);
     wake_up_interruptible(&parent->wait_chldexit);
 }

父进程收到信号后，可以通过 sys_ptrace 控制子进程，sys_ptrace 还提供了很多功能，比如读取子进程的数据。

 // pid 为子进程 id 
 
num = ptrace(PTRACE_PEEKUSER, pid, ORIG_RAX * 8, NULL);

这个就不展开了，主要是内存的校验和数据读取。这里讲一下 PTRACE_SINGLESTEP 命令，这个命令用于控制子进程单步执行。

 case PTRACE_SINGLESTEP: {  /* set the trap flag. */ 
        long tmp; 
        child->flags &= ~PF_TRACESYS; 
        // 设置 eflags 的单步调试 flag 
        tmp = get_stack_long(child, sizeof(long)*EFL-MAGICNUMBER) | TRAP_FLAG; 
        put_stack_long(child, sizeof(long)*EFL-MAGICNUMBER,tmp); 
        // 修改子进程状态为可执行 
        child->state = TASK_RUNNING; 
        child->exit_code = data; 
        return 0; 
}

PTRACE_SINGLESTEP 让子进程重新进入运行状态，但是有一个很关键的是，设置好了单步调试 flag。我们看看 trap flag 是什么。

A trap flag permits operation of a processor in single-step mode. If such a flag is available, debuggers can use it to step through the execution of a computer program.

也就是说，子进程执行一个指令后，就会被中断，然后系统会给被调试进程发送 SIGTRAP 信号。同样，被调试进程在信号处理函数里，通知父进程，从而控制权又回到了父进程手中，如此循环。

方式 2

除了开始时通过 ptrace 设置进程调试，也可以通过 ptrace 动态设置调试进程的能力，具体是通过 PTRACE_ATTACH 命令实现的。

 if (request == PTRACE_ATTACH) { 
        // 设置被调试标记 
        child->flags |= PF_PTRACED; 
        // 设置和父进程的关系 
        if (child->p_pptr != current) {REMOVE_LINKS(child); 
            child->p_pptr = current; 
            SET_LINKS(child); 
        } 
        // 给被调试进程发送 SIGSTOP 信号 
        send_sig(SIGSTOP, child, 1); 
        return 0; 
}

前面已经分析过，信号处理函数里会设置进程为暂停状态，然后通知父进程（此时即执行 attach 的调试进程），父进程就可以控制被调试进程，具体和前面流程一样。

跟踪系统调用

ptrace 除了追踪进程执行过程之外，还可以实现跟踪系统调用。具体是通过 PTRACE_SYSCALL 命令实现。

 case PTRACE_SYSCALL: 
case PTRACE_CONT: { 
    long tmp; 
    // 设置 PF_TRACESYS 标记 
    if (request == PTRACE_SYSCALL) 
        child->flags |= PF_TRACESYS; 
    child->exit_code = data; 
    child->state = TASK_RUNNING; 
    // 清除 trap flag 标记 
    tmp = get_stack_long(child, sizeof(long)*EFL-MAGICNUMBER) & ~TRAP_FLAG; 
    put_stack_long(child, sizeof(long)*EFL-MAGICNUMBER,tmp); 
    return 0; 
}

看起来很简单，就是设置了一个新的标记 PF_TRACESYS。看看这个标记有什么用。

 // Excerpt of the system-call entry path.
 // Call syscall_trace before the system call is dispatched.
1:  call _syscall_trace 
    movl  # NOTE(review): operands are missing; this line looks truncated by extraction, confirm against the kernel source
    movl ORIG_EAX(%esp),%eax 
    // Dispatch the system call through the table, indexed by %eax.
    call _sys_call_table(,%eax,4) 
    movl %eax,EAX(%esp)     # save the return value 
    movl _current,%eax 
    movl errno(%eax),%edx 
    negl %edx 
    je 1f 
    movl %edx,EAX(%esp) 
    orl $(CF_MASK),EFLAGS(%esp) # set carry to indicate error 
// Call syscall_trace again after the system call has returned.
1:  call _syscall_trace

可以看到在系统调用的前后都有一个 syscall_trace 的逻辑，所以在系统调用前和后，我们都可以做点事情。来看看这个函数做了什么。

 /*
  * Excerpt: stop the current task with SIGTRAP as its exit code, notify its
  * parent, and give up the CPU.
  */
 asmlinkage void syscall_trace(void){ 
    // Stop the traced task, notify the parent, and schedule another task to run.
    current->exit_code = SIGTRAP; 
    current->state = TASK_STOPPED; 
    notify_parent(current); 
    schedule();}

这里的逻辑就是把控制权切换到父进程（调试进程）中，然后父进程就可以通过命令获取被调试进程的系统调用信息。下面是一个追踪进程所有系统调用的例子。

 /*
  * Use ptrace to print every system call made by a child process.
  *
  * Usage: tracer <program> [args...]
  *
  * The child asks to be traced and then execs <program>. The parent resumes it
  * with PTRACE_SYSCALL, so the child stops at each system-call entry and exit,
  * and the parent reads the call number or the return value from the child's
  * user area at each stop.
  *
  * The register offsets below are for x86-64 (ORIG_RAX * 8, RAX * 8); on
  * 32-bit x86 use ORIG_EAX * 4 and EAX * 4 instead.
  */
#include <stdio.h>
#include <stdlib.h>
#include <sys/ptrace.h>
#include <sys/reg.h>
#include <sys/wait.h>
#include <unistd.h>

int main(int argc, char *argv[]) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s program [args...]\n", argv[0]);
        return 1;
    }

    pid_t pid = fork();
    if (pid < 0) {
        perror("fork");
        return 1;
    } else if (pid == 0) {
        // child: request tracing, then load the program to be traced
        ptrace(PTRACE_TRACEME, 0, NULL, NULL);
        execve(argv[1], &argv[1], NULL);
        // only reached when execve failed
        perror("execve");
        _exit(127);
    } else {
        int status;
        int bit = 1; // 1 => next stop is a system-call entry, 0 => a return
        long num;
        long ret;

        // wait for the child's first stop, which is reported for execve
        if (wait(&status) < 0)
            return 1;
        if (WIFEXITED(status) || WIFSIGNALED(status))
            return 0;
        // for os of 64-bit => ORIG_RAX * 8 or os of 32-bit => ORIG_EAX * 4
        num = ptrace(PTRACE_PEEKUSER, pid, ORIG_RAX * 8, NULL);
        printf("system call num = %ld\n", num);
        ptrace(PTRACE_SYSCALL, pid, NULL, NULL);
        while (1) {
            // stop tracing when wait fails or the child is gone for any reason
            if (wait(&status) < 0)
                return 1;
            if (WIFEXITED(status) || WIFSIGNALED(status))
                return 0;
            // for enter system call
            if (bit) {
                num = ptrace(PTRACE_PEEKUSER, pid, ORIG_RAX * 8, NULL);
                printf("system call num = %ld", num);
                bit = 0;
            } else { // for return of system call
                ret = ptrace(PTRACE_PEEKUSER, pid, RAX * 8, NULL);
                printf("system call return = %ld \n", ret);
                bit = 1;
            }
            // let this child process continue to run until call next system call
            ptrace(PTRACE_SYSCALL, pid, NULL, NULL);
        }
    }
}

总结

ptrace 功能复杂而强大，理解它的原理对理解其他技术和工具都非常有意义，本文大概做了一个介绍，有兴趣的同学可以自行查看源码。

阿里云 2 核 2G 服务器 3M 带宽 61 元 1 年，有高配

腾讯云新客低至 82 元 / 年，老客户 99 元 / 年

代金券：在阿里云专用满减优惠券

申请腾讯混元的API Key并且使用LobeChat调用混元AI

基于Docker快速搭建一个开源的IT人员在线工具箱-it-tools

让每个人都可以轻松使用Git-腾讯自研Git客户端

使用Docker部署开源的WPS-Office

案例：php利用淘宝IP库获取用户ip地理位置

解析Linux中出现的错误：toomanyopenfiles

阿里云对象存储OSS资源包是自动抵扣吗？需要配置吗？

Mariadb学习总结（六）：索引

Nginx设置404页面

如何才能快速发现钓鱼邮件？

	pid_t pid = fork();// 子进程 if (pid == 0) {ptrace(PTRACE_TRACEME,0,NULL,NULL);
	// 加载被调试的程序
	execve(argv[1], NULL, NULL);
	}

	/*
	 * Excerpt of the ptrace system call: only the PTRACE_TRACEME request is shown.
	 * For that request the calling task sets PF_PTRACED in its own flags and
	 * the call returns 0.
	 */
	asmlinkage int sys_ptrace(long request, long pid, long addr, long data)
	{
		if (request == PTRACE_TRACEME) {
			current->flags |= PF_PTRACED;
			return 0;
		}
		/* Handling of every other request is omitted from this excerpt. */
	}

	/*
	 * Excerpt of do_execve: walks the `formats` list and calls each entry's
	 * load_binary hook with &bprm and regs, storing the result in retval.
	 * The declarations of fmt, retval and bprm are not part of this excerpt.
	 */
	int do_execve(char *filename, char **argv, char **envp, struct pt_regs *regs)
	{
		/* Hand the program to every registered binary-format loader in list order. */
		for (fmt = formats; fmt; fmt = fmt->next) {
			int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;

			retval = fn(&bprm, regs);
		}
	}

	/* One node of the binary-format list: a set of loader hooks for one format. */
	struct linux_binfmt {
		/* Link to the following node in the list. */
		struct linux_binfmt *next;
		/* NOTE(review): presumably a usage counter; not referenced in this excerpt. */
		int *use_count;
		/* Hook that loads an executable, given the binprm and the saved registers. */
		int (*load_binary)(struct linux_binprm *, struct pt_regs *regs);
		/* Hook that takes a file descriptor; presumably loads a shared library. */
		int (*load_shlib)(int fd);
		/* Hook that takes a signal number and registers; presumably writes a core dump. */
		int (*core_dump)(long signr, struct pt_regs *regs);
	};

	/* Head of the singly linked list of binary formats; starts with aout_format. */
	static struct linux_binfmt *formats = &aout_format;

	/*
	 * Append fmt to the end of the formats list.
	 *
	 * Returns 0 on success, -EINVAL when fmt is a null pointer, and -EBUSY when
	 * fmt already has a successor (fmt->next is set) or is already on the list.
	 */
	int register_binfmt(struct linux_binfmt *fmt)
	{
		struct linux_binfmt **tmp = &formats;

		if (!fmt)
			return -EINVAL;
		if (fmt->next)
			return -EBUSY;
		/* Walk to the terminating null link, rejecting a duplicate on the way. */
		while (*tmp) {
			if (fmt == *tmp)
				return -EBUSY;
			tmp = &(*tmp)->next;
		}
		*tmp = fmt;
		return 0;
	}

	/* ELF binary format: provides a program loader and a library loader only. */
	struct linux_binfmt elf_format = {
		NULL,             /* first member */
		NULL,             /* second member */
		load_elf_binary,  /* third member: program loader */
		load_elf_library, /* fourth member: library loader */
		NULL              /* fifth member */
	};

	/*
	 * Module entry point: registers the ELF format.
	 * The result of register_binfmt is not checked; this always returns 0.
	 */
	int init_module(void)
	{
		register_binfmt(&elf_format);
		return 0;
	}

	if (current->flags & PF_PTRACED)
	send_sig(SIGTRAP, current, 0);

	if ((current->flags & PF_PTRACED) && signr != SIGKILL) {
	current->exit_code = signr;
	// 修改当前进程（被调试的进程）为暂停状态
	current->state = TASK_STOPPED;
	// 通知父进程
	notify_parent(current);
	// 调度其他进程执行
	schedule();}

	/*
	 * Signal the parent of tsk and wake anything sleeping on the parent's
	 * wait_chldexit queue. The signal sent is tsk->exit_signal, which is first
	 * forced to SIGCHLD when the parent is task[1].
	 */
	void notify_parent(struct task_struct *tsk)
	{
		struct task_struct *parent = tsk->p_pptr;

		if (parent == task[1])
			tsk->exit_signal = SIGCHLD;
		send_sig(tsk->exit_signal, parent, 1);
		wake_up_interruptible(&parent->wait_chldexit);
	}

	// pid 为子进程 id

	num = ptrace(PTRACE_PEEKUSER, pid, ORIG_RAX * 8, NULL);

	case PTRACE_SINGLESTEP: { /* set the trap flag. */
		long tmp;

		child->flags &= ~PF_TRACESYS;
		// Set the single-step (trap) flag in the child's saved eflags.
		tmp = get_stack_long(child, sizeof(long)*EFL-MAGICNUMBER) | TRAP_FLAG;
		put_stack_long(child, sizeof(long)*EFL-MAGICNUMBER, tmp);
		// Make the child runnable again.
		child->state = TASK_RUNNING;
		child->exit_code = data;
		return 0;
	}

	if (request == PTRACE_ATTACH) {
		// Mark the target as being traced.
		child->flags |= PF_PTRACED;
		// Re-link the target so that the caller becomes its parent.
		if (child->p_pptr != current) {
			REMOVE_LINKS(child);
			child->p_pptr = current;
			SET_LINKS(child);
		}
		// Send SIGSTOP to the traced process.
		send_sig(SIGSTOP, child, 1);
		return 0;
	}

	case PTRACE_SYSCALL:
	case PTRACE_CONT: {
		long tmp;

		// Only PTRACE_SYSCALL turns on the PF_TRACESYS flag.
		if (request == PTRACE_SYSCALL)
			child->flags |= PF_TRACESYS;
		child->exit_code = data;
		child->state = TASK_RUNNING;
		// Clear the trap flag in the child's saved eflags.
		tmp = get_stack_long(child, sizeof(long)*EFL-MAGICNUMBER) & ~TRAP_FLAG;
		put_stack_long(child, sizeof(long)*EFL-MAGICNUMBER, tmp);
		return 0;
	}

	// Excerpt of the system-call entry path.
	// Call syscall_trace before the system call is dispatched.
	1: call _syscall_trace
	movl # NOTE(review): operands are missing; this line looks truncated by extraction, confirm against the kernel source
	movl ORIG_EAX(%esp),%eax
	// Dispatch the system call through the table, indexed by %eax.
	call _sys_call_table(,%eax,4)
	movl %eax,EAX(%esp) # save the return value
	movl _current,%eax
	movl errno(%eax),%edx
	negl %edx
	je 1f
	movl %edx,EAX(%esp)
	orl $(CF_MASK),EFLAGS(%esp) # set carry to indicate error
	// Call syscall_trace again after the system call has returned.
	1: call _syscall_trace

	/*
	 * Excerpt: stop the current task with SIGTRAP as its exit code, notify its
	 * parent, and give up the CPU.
	 */
	asmlinkage void syscall_trace(void){
	// Stop the traced task, notify the parent, and schedule another task to run.
	current->exit_code = SIGTRAP;
	current->state = TASK_STOPPED;
	notify_parent(current);
	schedule();}

	/*
	 * Use ptrace to print every system call made by a child process.
	 *
	 * Usage: tracer <program> [args...]
	 *
	 * The child asks to be traced and then execs <program>. The parent resumes it
	 * with PTRACE_SYSCALL, so the child stops at each system-call entry and exit,
	 * and the parent reads the call number or the return value from the child's
	 * user area at each stop.
	 *
	 * The register offsets below are for x86-64 (ORIG_RAX * 8, RAX * 8); on
	 * 32-bit x86 use ORIG_EAX * 4 and EAX * 4 instead.
	 */
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/ptrace.h>
	#include <sys/reg.h>
	#include <sys/wait.h>
	#include <unistd.h>

	int main(int argc, char *argv[]) {
		if (argc < 2) {
			fprintf(stderr, "usage: %s program [args...]\n", argv[0]);
			return 1;
		}

		pid_t pid = fork();
		if (pid < 0) {
			perror("fork");
			return 1;
		} else if (pid == 0) {
			// child: request tracing, then load the program to be traced
			ptrace(PTRACE_TRACEME, 0, NULL, NULL);
			execve(argv[1], &argv[1], NULL);
			// only reached when execve failed
			perror("execve");
			_exit(127);
		} else {
			int status;
			int bit = 1; // 1 => next stop is a system-call entry, 0 => a return
			long num;
			long ret;

			// wait for the child's first stop, which is reported for execve
			if (wait(&status) < 0)
				return 1;
			if (WIFEXITED(status) || WIFSIGNALED(status))
				return 0;
			// for os of 64-bit => ORIG_RAX * 8 or os of 32-bit => ORIG_EAX * 4
			num = ptrace(PTRACE_PEEKUSER, pid, ORIG_RAX * 8, NULL);
			printf("system call num = %ld\n", num);
			ptrace(PTRACE_SYSCALL, pid, NULL, NULL);
			while (1) {
				// stop tracing when wait fails or the child is gone for any reason
				if (wait(&status) < 0)
					return 1;
				if (WIFEXITED(status) || WIFSIGNALED(status))
					return 0;
				// for enter system call
				if (bit) {
					num = ptrace(PTRACE_PEEKUSER, pid, ORIG_RAX * 8, NULL);
					printf("system call num = %ld", num);
					bit = 0;
				} else { // for return of system call
					ret = ptrace(PTRACE_PEEKUSER, pid, RAX * 8, NULL);
					printf("system call return = %ld \n", ret);
					bit = 1;
				}
				// let this child process continue to run until call next system call
				ptrace(PTRACE_SYSCALL, pid, NULL, NULL);
			}
		}
	}