diff options
Diffstat (limited to 'exec/trace.c')
-rw-r--r-- | exec/trace.c | 1702 |
1 files changed, 1702 insertions, 0 deletions
diff --git a/exec/trace.c b/exec/trace.c new file mode 100644 index 00000000000..05d862f5b9f --- /dev/null +++ b/exec/trace.c @@ -0,0 +1,1702 @@ +/* Program execution for Emacs. + +Copyright (C) 2023-2024 Free Software Foundation, Inc. + +This file is part of GNU Emacs. + +GNU Emacs is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or (at +your option) any later version. + +GNU Emacs is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>. */ + +#include <config.h> + +#include <sys/ptrace.h> +#include <sys/types.h> +#include <sys/wait.h> + +#include <limits.h> +#include <stddef.h> +#include <string.h> +#include <assert.h> +#include <signal.h> +#include <unistd.h> +#include <stdlib.h> +#include <errno.h> +#include <fcntl.h> + +#include "exec.h" + +#include SYSCALL_HEADER +#include USER_HEADER + +#ifdef __aarch64__ +#include <sys/uio.h> /* for struct iovec */ +#include <linux/elf.h> /* for NT_* */ +#endif /* __aarch64__ */ + +#ifdef HAVE_SYS_UIO_H +#include <sys/uio.h> /* for process_vm_readv */ +#endif /* HAVE_SYS_UIO_H */ + +#ifndef SYS_SECCOMP +#define SYS_SECCOMP 1 +#endif /* SYS_SECCOMP */ + +#ifndef PTRACE_GETEVENTMSG +#define PTRACE_GETEVENTMSG 0x4201 +#endif /* PTRACE_GETEVENTMSG */ + + + +/* Program tracing functions. + + The main entry point is the function `tracing_execve', which traces + the thread and calls exec. Each time that thread calls `clone', + the new child is traced as well. + + Instead of calling `waitpid', call `exec_waitpid' instead. */ + + + +/* Number of tracees children are allowed to create. */ +#define MAX_TRACEES 4096 + +#ifdef __aarch64__ + +/* Place PID's registers into *REGS. Return 1 upon failure, else + 0. */ + +int +aarch64_get_regs (pid_t pid, USER_REGS_STRUCT *regs) +{ + struct iovec iov; + + iov.iov_base = regs; + iov.iov_len = sizeof *regs; + + return (ptrace (PTRACE_GETREGSET, pid, NT_PRSTATUS, + &iov) != 0); +} + +/* Set PID's registers to *REGS. If SYSCALL_P, also update the + current system call number to the `x8' register. + + Value is 1 upon failure, else 0. */ + +int +aarch64_set_regs (pid_t pid, USER_REGS_STRUCT *regs, + bool syscall_p) +{ + struct iovec iov; + USER_WORD callno; + long rc; + + /* Write the user registers. */ + + iov.iov_base = regs; + iov.iov_len = sizeof *regs; + + rc = ptrace (PTRACE_SETREGSET, pid, NT_PRSTATUS, + &iov); + if (rc < 0) + return 1; + + /* Now, write the system call number if necessary. */ + + if (syscall_p) + { + callno = regs->regs[8]; + iov.iov_base = &callno; + iov.iov_len = sizeof callno; + + return (ptrace (PTRACE_SETREGSET, pid, NT_ARM_SYSTEM_CALL, + &iov) != 0); + } + + return 0; +} + +#endif /* __aarch64__ */ + + + +/* List of all processes which are being traced. */ +static struct exec_tracee *tracing_processes; + + + +/* Read N bytes from TRACEE's memory, starting at the specified user + ADDRESS. Return its contents in BUFFER. + + If there are unreadable pages within ADDRESS + N, the contents of + BUFFER after the first such page becomes undefined. */ + +static void +read_memory (struct exec_tracee *tracee, char *buffer, + USER_WORD n, USER_WORD address) +{ + USER_WORD word, n_words, n_bytes, i; + long rc; +#ifdef HAVE_PROCESS_VM + struct iovec iov, remote; + + /* If `process_vm_readv' is available, use it instead. */ + + iov.iov_base = buffer; + iov.iov_len = n; + remote.iov_base = (void *) address; + remote.iov_len = n; + + /* Return immediately if successful. As long as some bytes were + read, consider the read to have been a success. */ + + if (n <= SSIZE_MAX + && ((size_t) process_vm_readv (tracee->pid, &iov, 1, + &remote, 1, 0) != -1)) + return; + +#endif /* HAVE_PROCESS_VM */ + + /* First, read entire words from the tracee. */ + n_words = n & ~(sizeof (USER_WORD) - 1); + + /* Next, determine the number of bytes to read from the last + word. */ + n_bytes = n & (sizeof (USER_WORD) - 1); + + /* Start reading words. */ + i = 0; + while (n_words) + { + rc = ptrace (PTRACE_PEEKTEXT, tracee->pid, + (void *) address + i, NULL); + word = rc; + memcpy (buffer, &word, sizeof word); + buffer += sizeof word; + i += sizeof word; + n_words -= sizeof word; + } + + /* Now, read the remaining bytes. */ + assert (n_bytes < sizeof (word)); + + if (n_bytes) + { + rc = ptrace (PTRACE_PEEKTEXT, tracee->pid, + (void *) address + i, NULL); + word = rc; + + /* Copy only n_bytes to the caller. */ + memcpy (buffer, &word, n_bytes); + } +} + +/* Allocate N bytes of memory from TRACEE's stack. Return the address + of that memory upon success, else 0. + + Place the updated user-mode registers of TRACEE in *NEW_REGS, which + should initially contain the current stack pointer of TRACEE. + + REGS should contain the user mode registers of TRACEE prior to the + system call starting; it is not updated to reflect any changes. */ + +USER_WORD +user_alloca (struct exec_tracee *tracee, USER_REGS_STRUCT *regs, + USER_REGS_STRUCT *new_regs, USER_WORD n) +{ + USER_WORD sp, old_sp; + + /* Get the current stack pointer. */ + old_sp = sp = new_regs->STACK_POINTER; + +#if RED_ZONE_SIZE + /* Some ABI rules specify a ``red zone'' around the stack pointer + that is reserved for compiler optimizations. */ + +#ifdef STACK_GROWS_DOWNWARDS + if (sp == regs->STACK_POINTER) + sp -= RED_ZONE_SIZE; +#else /* !STACK_GROWS_DOWNWARDS */ + if (sp == regs->STACK_POINTER) + sp += RED_ZONE_SIZE; +#endif /* STACK_GROWS_DOWNWARDS */ +#endif /* RED_ZONE_SIZE */ + + /* Now take N off the stack. */ + +#ifdef STACK_GROWS_DOWNWARDS + sp = sp - n; + + /* Check for overflow. */ + + if (sp > new_regs->STACK_POINTER) + return 0; +#else /* !STACK_GROWS_DOWNWARDS */ + sp = sp + n; + + /* Check for overflow. */ + + if (sp < new_regs->STACK_POINTER) + return 0; +#endif /* STACK_GROWS_DOWNWARDS */ + + /* Set the stack pointer. */ + new_regs->STACK_POINTER = sp; + +#ifdef __aarch64__ + if (aarch64_set_regs (tracee->pid, new_regs, false)) + goto fail; +#else /* !__aarch64__ */ + if (ptrace (PTRACE_SETREGS, tracee->pid, NULL, + new_regs)) + goto fail; +#endif /* __aarch64__ */ + + /* Now return the start of the new area. */ +#ifdef STACK_GROWS_DOWNWARDS + return sp; +#else /* !STACK_GROWS_DOWNWARDS */ + return sp - n; +#endif /* STACK_GROWS_DOWNWARDS */ + + fail: + /* Restore the old stack pointer. */ + new_regs->STACK_POINTER = old_sp; + return 0; +} + +/* Copy N bytes to ADDRESS in TRACEE's address space from BUFFER. + Value is 0 upon success, else 1. */ + +int +user_copy (struct exec_tracee *tracee, const unsigned char *buffer, + USER_WORD address, USER_WORD n) +{ + USER_WORD start, end, word; + unsigned char *bytes; +#ifdef HAVE_PROCESS_VM + struct iovec iov, remote; + + /* Try to use `process_vm_writev' if possible, but fall back to + ptrace if something bad happens. */ + + iov.iov_base = (void *) buffer; + iov.iov_len = n; + remote.iov_base = (void *) address; + remote.iov_len = n; + + if (n <= SSIZE_MAX + && ((size_t) process_vm_writev (tracee->pid, &iov, 1, + &remote, 1, 0) == n)) + return 0; +#endif /* HAVE_PROCESS_VM */ + + /* Calculate the start and end positions for the write. */ + + start = address; + end = address + n; + + /* Write from start to the last word. */ + + while (start < end) + { + if (start + sizeof word <= end) + { + /* Write a word by itself and increment start. */ + memcpy (&word, buffer, sizeof word); + buffer += sizeof word; + + if (ptrace (PTRACE_POKEDATA, tracee->pid, + (void *) start, (void *) word)) + return 1; + + start += sizeof word; + } + else + { + /* Only end - start bytes should be written. + Read the word at start from tracee->pid, then write + it back with changes. */ + + word = ptrace (PTRACE_PEEKDATA, tracee->pid, + (void *) start, NULL); + bytes = (unsigned char *) &word; + memcpy (bytes, buffer, end - start); + + if (ptrace (PTRACE_POKEDATA, tracee->pid, + (void *) start, (void *) word)) + return 1; + + /* Writing was successful. */ + return 0; + } + } + + return 0; +} + + + +/* Chain of free exec_tracee structures. */ +static struct exec_tracee *free_tracees; + +/* Remove the specified TRACEE from the chain of all processes being + traced. */ + +static void +remove_tracee (struct exec_tracee *tracee) +{ + struct exec_tracee **last; + + last = &tracing_processes; + while (*last) + { + if (*last == tracee) + { + *last = tracee->next; + + /* Link the tracee onto the list of free tracees. */ + tracee->next = free_tracees; + +#ifndef REENTRANT + /* Free the exec file, if any. */ + free (tracee->exec_file); + tracee->exec_file = NULL; +#endif /* REENTRANT */ + + free_tracees = tracee; + + return; + } + else + last = &(*last)->next; + } +} + + + +/* Child process tracing. */ + +/* Array of `struct exec_tracees' that they are allocated from. */ +static struct exec_tracee static_tracees[MAX_TRACEES]; + +/* Number of tracees currently allocated. */ +static int tracees; + +/* Return the `struct exec_tracee' corresponding to the specified + PROCESS. */ + +static struct exec_tracee * +find_tracee (pid_t process) +{ + struct exec_tracee *tracee; + + for (tracee = tracing_processes; tracee; tracee = tracee->next) + { + if (tracee->pid == process) + return tracee; + } + + return NULL; +} + +/* Prepare to handle the completion of a `clone' system call. + + If the new clone is not yet being traced, create a new tracee for + PARENT's child, copying over its current command line. Then, set + `new_child' in the new tracee. Otherwise, continue it until the + next syscall. */ + +static void +handle_clone_prepare (struct exec_tracee *parent) +{ +#ifndef REENTRANT + long rc; + unsigned long pid; + struct exec_tracee *tracee; + + rc = ptrace (PTRACE_GETEVENTMSG, parent->pid, NULL, + &pid); + if (rc) + return; + + /* See if the tracee already exists. */ + tracee = find_tracee (pid); + + if (tracee) + { + /* Continue the tracee. Record its command line, as that has + not yet been done. */ + + assert (tracee->new_child); + tracee->new_child = false; + tracee->exec_file = NULL; + ptrace (PTRACE_SYSCALL, tracee->pid, 0, 0); + + if (parent->exec_file) + tracee->exec_file = strdup (parent->exec_file); + return; + } + + if (free_tracees) + { + tracee = free_tracees; + free_tracees = free_tracees->next; + } + else if (tracees < MAX_TRACEES) + { + tracee = &static_tracees[tracees]; + tracees++; + } +#ifndef REENTRANT + /* Try to allocate a tracee using `malloc' if this library is + not being built to run inside a signal handler. */ + else if ((tracee = malloc (sizeof *tracee))) + ; +#endif /* REENTRANT */ + else + return; + + tracee->pid = pid; + tracee->next = tracing_processes; + tracee->waiting_for_syscall = false; + tracee->new_child = true; + tracee->exec_file = NULL; + tracing_processes = tracee; + + /* Copy over the command line. */ + + if (parent->exec_file) + tracee->exec_file = strdup (parent->exec_file); +#endif /* REENTRANT */ +} + +/* Handle the completion of a `clone' or `clone3' system call, + resulting in the creation of the process PID. If TRACEE is NULL, + allocate a new tracee structure from a static area for the + processes's pid, then set TRACEE->new_child to true and await the + parent's corresponding ptrace event to arrive; otherwise, just + clear TRACEE->new_child. + + Value is 0 upon success, 2 if TRACEE should remain suspended until + the parent's ptrace-stop, and 1 otherwise. */ + +static int +handle_clone (struct exec_tracee *tracee, pid_t pid) +{ + long rc; + int flags, value; + + /* Now allocate a new tracee, either from static_tracees or the free + list, if no tracee was supplied. */ + + value = 0; + + if (!tracee) + { + if (free_tracees) + { + tracee = free_tracees; + free_tracees = free_tracees->next; + } + else if (tracees < MAX_TRACEES) + { + tracee = &static_tracees[tracees]; + tracees++; + } +#ifndef REENTRANT + /* Try to allocate a tracee using `malloc' if this library is + not being built to run inside a signal handler. */ + else if ((tracee = malloc (sizeof *tracee))) + ; +#endif /* REENTRANT */ + else + return 1; + + tracee->pid = pid; + tracee->next = tracing_processes; + tracee->waiting_for_syscall = false; +#ifndef REENTRANT + tracee->exec_file = NULL; +#endif /* REENTRANT */ + tracing_processes = tracee; + tracee->new_child = true; + + /* Wait for the ptrace-stop to happen in the parent. */ + value = 2; + } + else + /* Clear the flag saying that this is a newly created child + process. */ + tracee->new_child = false; + + /* Apply required options to the child, so that the kernel + automatically traces children and makes it easy to differentiate + between system call traps and other kinds of traps. */ + + flags = PTRACE_O_TRACECLONE; + flags |= PTRACE_O_TRACEVFORK; + flags |= PTRACE_O_TRACEFORK; + flags |= PTRACE_O_TRACESYSGOOD; + flags |= PTRACE_O_TRACEEXIT; + + rc = ptrace (PTRACE_SETOPTIONS, pid, 0, flags); + + if (rc) + goto bail; + + if (value != 2) + { + /* The new tracee is currently stopped. Continue it until the next + system call. */ + + rc = ptrace (PTRACE_SYSCALL, pid, 0, 0); + + if (rc) + goto bail; + } + + return value; + + bail: + remove_tracee (tracee); + return 1; +} + + + +/* NOTICE: none of these functions should ever call `malloc' or + another async signal unsafe function. */ + +/* File name of the loader binary. */ +static const char *loader_name; + + + +/* Return whether or not the trap signal described by SIGNAL is + generated by a system call being attempted by a tracee. */ + +static bool +syscall_trap_p (siginfo_t *signal) +{ + /* SIGTRAP delivered by the kernel means this is a system call + stop. */ + return (signal->si_code == SIGTRAP + || signal->si_code == (SIGTRAP | SI_KERNEL)); +} + +/* Check if the wait status STATUS indicates a system call trap. + TRACEE is the process whose stop STATUS describes. If TRACEE exits + while this information is being determined, return -1; if STATUS + indicates some other kind of stop, return 1 after continuing + TRACEE. Value is 0 otherwise. */ + +static int +check_signal (struct exec_tracee *tracee, int status) +{ + siginfo_t siginfo; + + switch ((status & 0xfff00) >> 8) + { + case SIGTRAP: + /* Now, use PTRACE_GETSIGINFO to determine whether or not the + signal was delivered in response to a system call. */ + + if (ptrace (PTRACE_GETSIGINFO, tracee->pid, 0, &siginfo)) + return -1; + + if (!syscall_trap_p (&siginfo)) + { + if (siginfo.si_code < 0) + /* SIGTRAP delivered from userspace. Pass it on. */ + ptrace (PTRACE_SYSCALL, tracee->pid, 0, SIGTRAP); + else + ptrace (PTRACE_SYSCALL, tracee->pid, 0, 0); + + return 1; + } + + case SIGTRAP | 0x80: /* SIGTRAP | 0x80 specifically refers to + system call traps. */ + break; + +#ifdef SIGSYS + case SIGSYS: + if (ptrace (PTRACE_GETSIGINFO, tracee->pid, 0, &siginfo)) + return -1; + + /* Continue the process until the next syscall, but don't + pass through the signal if an emulated syscall led to + it. */ +#ifdef HAVE_SIGINFO_T_SI_SYSCALL +#ifndef __arm__ + ptrace (PTRACE_SYSCALL, tracee->pid, + 0, ((siginfo.si_code == SYS_SECCOMP + && siginfo.si_syscall == -1) + ? 0 : status)); +#else /* __arm__ */ + ptrace (PTRACE_SYSCALL, tracee->pid, + 0, ((siginfo.si_code == SYS_SECCOMP + && siginfo.si_syscall == 222) + ? 0 : status)); +#endif /* !__arm__ */ +#else /* !HAVE_SIGINFO_T_SI_SYSCALL */ + /* Drop this signal, since what caused it is unknown. */ + ptrace (PTRACE_SYSCALL, tracee->pid, 0, 0); +#endif /* HAVE_SIGINFO_T_SI_SYSCALL */ + return 1; +#endif /* SIGSYS */ + + default: + /* Continue the process until the next syscall. */ + ptrace (PTRACE_SYSCALL, tracee->pid, 0, status); + return 1; + } + + return 0; +} + + + +/* Handle an `exec' system call from the given TRACEE. REGS are the + tracee's current user-mode registers. + + Rewrite the system call arguments to use the loader binary. Then, + continue the system call until the loader is loaded. Write the + information necessary to load the original executable into the + loader's stack. + + Value is 0 upon success, 1 upon a generic failure before the loader + is loaded, 2 if the process has stopped, and 3 if something failed, + but it is too late to handle it. + + Set errno appropriately upon returning a generic failure. */ + +static int +handle_exec (struct exec_tracee *tracee, USER_REGS_STRUCT *regs) +{ + char buffer[PATH_MAX + 80], *area; + USER_REGS_STRUCT original; + size_t size, loader_size; + USER_WORD loader, size1, sp; + int rc, wstatus; + siginfo_t siginfo; + + /* Save the old stack pointer. */ + sp = regs->STACK_POINTER; + + /* Read the file name. */ + read_memory (tracee, buffer, PATH_MAX, + regs->SYSCALL_ARG_REG); + + /* Make sure BUFFER is NULL terminated. */ + + if (!memchr (buffer, '\0', PATH_MAX)) + { + errno = ENAMETOOLONG; + return 1; + } + + /* Copy over the registers as they originally were. */ + memcpy (&original, regs, sizeof *regs); + + /* Figure out what the loader needs to do. */ + again1: + area = exec_0 (buffer, tracee, &size, regs); + + if (!area) + { + /* Handle SIGINTR errors caused by IO. */ + if (errno == EINTR) + goto again1; + + return 1; + } + + /* Rewrite the first argument to point to the loader. */ + + loader_size = strlen (loader_name) + 1; + loader = user_alloca (tracee, &original, regs, + loader_size); + + if (!loader) + { + errno = ENOMEM; + return 1; + } + + if (user_copy (tracee, (unsigned char *) loader_name, + loader, loader_size)) + { + errno = EIO; + return 1; + } + + regs->SYSCALL_ARG_REG = loader; + +#ifdef __aarch64__ + + if (aarch64_set_regs (tracee->pid, regs, false)) + { + errno = EIO; + return 1; + } + +#else /* !__aarch64__ */ + + if (ptrace (PTRACE_SETREGS, tracee->pid, NULL, + regs)) + { + errno = EIO; + return 1; + } + +#endif /* __aarch64__ */ + + /* Continue the system call until loader starts. */ + + if (ptrace (PTRACE_SYSCALL, tracee->pid, NULL, NULL)) + { + errno = EIO; + return 1; + } + +#ifndef REENTRANT + /* Now that the loader has started, record the value to use for + /proc/self/exe. Don't give up just because strdup fails. + + Note that exec_0 copies the absolute file name into buffer. */ + + if (tracee->exec_file) + free (tracee->exec_file); + tracee->exec_file = strdup (buffer); +#endif /* REENTRANT */ + + again: + rc = waitpid (tracee->pid, &wstatus, __WALL); + if (rc == -1 && errno == EINTR) + goto again; + + if (rc < 0) + return 1; + + if (!WIFSTOPPED (wstatus)) + /* The process has been killed in response to a signal. + In this case, simply return 2. */ + return 2; + else + { + /* Then, check if STATUS is not a syscall-stop, and try again if + it isn't. */ + rc = check_signal (tracee, wstatus); + + if (rc == -1) + return 2; + else if (rc) + goto again; + + /* Retrieve the signal information and determine whether or not + the system call has completed. */ + + if (ptrace (PTRACE_GETSIGINFO, tracee->pid, 0, + &siginfo)) + return 3; + + if (!syscall_trap_p (&siginfo)) + { + /* Continue. */ + if (ptrace (PTRACE_SYSCALL, tracee->pid, 0, 0)) + return 3; + + goto again; + } + } + +#ifdef __aarch64__ + + if (aarch64_get_regs (tracee->pid, &original)) + return 3; + +#else /* !__aarch64__ */ + + /* The system call has now completed. Get the registers again. */ + + if (ptrace (PTRACE_GETREGS, tracee->pid, NULL, + &original)) + return 3; + +#endif /* __aarch64__ */ + + *regs = original; + + /* Upon failure, wait for the next system call and return + success. */ + + if (original.SYSCALL_RET_REG) + { + /* Restore the original stack pointer. */ + regs->STACK_POINTER = sp; + +#ifdef __aarch64__ + aarch64_set_regs (tracee->pid, regs, false); +#else /* !__aarch64__ */ + ptrace (PTRACE_SETREGS, tracee->pid, NULL, regs); +#endif /* __aarch64__ */ + + goto exec_failure; + } + + /* Write the loader area to the stack, followed by its size and the + original stack pointer. */ + + loader = user_alloca (tracee, &original, regs, + size + sizeof loader * 2); + if (!loader) + return 3; + + size1 = size; + +#ifndef STACK_GROWS_DOWNWARDS + + NOT_IMPLEMENTED; + +#else /* STACK_GROWS_DOWNWARDS */ + + if (user_copy (tracee, (unsigned char *) area, + loader + sizeof size1 * 2, size) + || user_copy (tracee, (unsigned char *) &size1, + loader + sizeof size1, sizeof size1)) + return 3; + + size1 = original.STACK_POINTER; + + if (user_copy (tracee, (unsigned char *) &size1, + loader, sizeof size1)) + return 3; + +#endif /* STACK_GROWS_DOWNWARDS */ + + /* Continue. */ + if (ptrace (PTRACE_SYSCALL, tracee->pid, 0, 0)) + return 3; + + return 0; + + exec_failure: + return 3; +} + + + +/* Define replacements for required string functions. */ + +#if !defined HAVE_STPCPY || !defined HAVE_DECL_STPCPY + +/* Copy SRC to DEST, returning the address of the terminating '\0' in + DEST. */ + +static char * +rpl_stpcpy (char *dest, const char *src) +{ + register char *d; + register const char *s; + + d = dest; + s = src; + + do + *d++ = *s; + while (*s++ != '\0'); + + return d - 1; +} + +#define stpcpy rpl_stpcpy +#endif /* !defined HAVE_STPCPY || !defined HAVE_DECL_STPCPY */ + + + +/* Modify BUFFER, of size SIZE, so that it holds the absolute name of + the file identified by BUFFER, relative to the current working + directory of TRACEE if FD be AT_FDCWD, or the file referenced by FD + otherwise. + + Value is 1 if this information is unavailable (of which there are + variety of causes), and 0 on success. */ + +static int +canon_path (struct exec_tracee *tracee, int fd, char *buffer, + ptrdiff_t size) +{ + char link[sizeof "/proc//fd/" + 48], *p; /* Or /proc/pid/cwd. */ + char target[PATH_MAX]; + ssize_t rc, length; + + if (buffer[0] == '/') + /* Absolute file name; return immediately. */ + return 0; + else if (fd == AT_FDCWD) + { + p = stpcpy (link, "/proc/"); + p = format_pid (p, tracee->pid); + stpcpy (p, "/cwd"); + } + else if (fd < 0) + /* Invalid file descriptor. */ + return 1; + else + { + p = stpcpy (link, "/proc/"); + p = format_pid (p, tracee->pid); + p = stpcpy (p, "/fd/"); + format_pid (p, fd); + } + + /* Read LINK's target, and should it be oversized, punt. */ + rc = readlink (link, target, PATH_MAX); + if (rc < 0 || rc >= PATH_MAX) + return 1; + + /* Consider the amount by which BUFFER's existing contents should be + displaced. */ + + length = strlen (buffer) + 1; + if ((length + rc + (target[rc - 1] != '/')) > size) + /* Punt if this would overflow. */ + return 1; + + memmove ((buffer + rc + (target[rc - 1] != '/')), + buffer, length); + + /* Copy the new file name into BUFFER. */ + memcpy (buffer, target, rc); + + /* Insert separator in between if need be. */ + if (target[rc - 1] != '/') + buffer[rc] = '/'; + + return 0; +} + +/* Handle a `readlink' or `readlinkat' system call. + + CALLNO is the system call number, and REGS are the current user + registers of the TRACEE. + + If the file name specified in either a `readlink' or `readlinkat' + system call is `/proc/self/exe', write the name of the executable + being run into the buffer specified in the system call. Do not + handle relative file names at the moment. + + Return the number of bytes written to the tracee's buffer in + *RESULT. + + Value is 0 upon success. Value is 1 upon failure, and 2 if the + system call has been emulated. */ + +static int +handle_readlinkat (USER_WORD callno, USER_REGS_STRUCT *regs, + struct exec_tracee *tracee, USER_WORD *result) +{ +#ifdef REENTRANT + /* readlinkat cannot be handled specially when the library is built + to be reentrant, as the file name information cannot be + recorded. */ + return 0; +#else /* !REENTRANT */ + + char buffer[PATH_MAX + 1]; + USER_WORD address, return_buffer, size; + size_t length; + char proc_pid_exe[sizeof "/proc//exe" + 24], *p; + int dirfd; + + /* Read the file name. */ + +#ifdef READLINK_SYSCALL + if (callno == READLINK_SYSCALL) + { + dirfd = AT_FDCWD; + address = regs->SYSCALL_ARG_REG; + return_buffer = regs->SYSCALL_ARG1_REG; + size = regs->SYSCALL_ARG2_REG; + } + else +#endif /* READLINK_SYSCALL */ + { + dirfd = (USER_SWORD) regs->SYSCALL_ARG_REG; + address = regs->SYSCALL_ARG1_REG; + return_buffer = regs->SYSCALL_ARG2_REG; + size = regs->SYSCALL_ARG3_REG; + } + + read_memory (tracee, buffer, PATH_MAX, address); + + /* Make sure BUFFER is NULL terminated. */ + + if (!memchr (buffer, '\0', PATH_MAX)) + { + errno = ENAMETOOLONG; + return 1; + } + + /* Expand BUFFER into an absolute file name. TODO: + AT_SYMLINK_FOLLOW? */ + + if (canon_path (tracee, dirfd, buffer, sizeof buffer)) + return 0; + + /* Now check if the caller is looking for /proc/self/exe or its + equivalent with the PID made explicit. + + dirfd can be ignored, as for now only absolute file names are + handled. FIXME. */ + + p = stpcpy (proc_pid_exe, "/proc/"); + p = format_pid (p, tracee->pid); + stpcpy (p, "/exe"); + + if ((strcmp (buffer, "/proc/self/exe") + && strcmp (buffer, proc_pid_exe)) + || !tracee->exec_file) + return 0; + + /* Copy over tracee->exec_file. Truncate it to PATH_MAX, length, or + size, whichever is smaller. */ + + length = strlen (tracee->exec_file); + length = MIN (size, MIN (PATH_MAX, length)); + strncpy (buffer, tracee->exec_file, length); + + if (user_copy (tracee, (unsigned char *) buffer, + return_buffer, length)) + { + errno = EIO; + return 1; + } + + *result = length; + return 2; +#endif /* REENTRANT */ +} + +/* Handle an `open' or `openat' system call. + + CALLNO is the system call number, and REGS are the current user + registers of the TRACEE. + + If the file name specified in such system call is `/proc/self/exe', + replace the file name with the executable loaded into the process + issuing this system call. + + Value is 0 upon success and 1 upon failure. */ + +static int +handle_openat (USER_WORD callno, USER_REGS_STRUCT *regs, + struct exec_tracee *tracee, USER_WORD *result) +{ +#ifdef REENTRANT + /* readlinkat cannot be handled specially when the library is built + to be reentrant, as the file name information cannot be + recorded. */ + return 0; +#else /* !REENTRANT */ + char buffer[PATH_MAX + 1]; + USER_WORD address; + size_t length; + USER_REGS_STRUCT original; + char proc_pid_exe[sizeof "/proc//exe" + 24], *p; + int dirfd; + + /* Read the file name. */ + +#ifdef OPEN_SYSCALL + if (callno == OPEN_SYSCALL) + { + dirfd = AT_FDCWD; + address = regs->SYSCALL_ARG_REG; + } + else +#endif /* OPEN_SYSCALL */ + { + dirfd = (USER_SWORD) regs->SYSCALL_ARG_REG; + address = regs->SYSCALL_ARG1_REG; + } + + /* Read the file name into the buffer and verify that it is NULL + terminated. */ + read_memory (tracee, buffer, PATH_MAX, address); + + if (!memchr (buffer, '\0', PATH_MAX)) + { + errno = ENAMETOOLONG; + return 1; + } + + /* Expand BUFFER into an absolute file name. TODO: + AT_SYMLINK_FOLLOW? */ + + if (canon_path (tracee, dirfd, buffer, sizeof buffer)) + return 0; + + /* Now check if the caller is looking for /proc/self/exe or its + equivalent with the PID made explicit. + + dirfd can be ignored, as for now only absolute file names are + handled. FIXME. */ + + p = stpcpy (proc_pid_exe, "/proc/"); + p = format_pid (p, tracee->pid); + stpcpy (p, "/exe"); + + if ((strcmp (buffer, "/proc/self/exe") + && strcmp (buffer, proc_pid_exe)) + || !tracee->exec_file) + return 0; + + /* Copy over tracee->exec_file. This doesn't correctly handle the + scenario where tracee->exec_file is longer than PATH_MAX, but + that has yet to be encountered in practice. */ + + original = *regs; + length = strlen (tracee->exec_file); + address = user_alloca (tracee, &original, regs, length + 1); + + if (!address + || user_copy (tracee, (unsigned char *) tracee->exec_file, + address, length)) + goto fail; + + /* Replace the file name buffer with ADDRESS. */ + +#ifdef OPEN_SYSCALL + if (callno == OPEN_SYSCALL) + regs->SYSCALL_ARG_REG = address; + else +#endif /* OPEN_SYSCALL */ + regs->SYSCALL_ARG1_REG = address; + +#ifdef __aarch64__ + if (aarch64_set_regs (tracee->pid, regs, false)) + goto fail; +#else /* !__aarch64__ */ + if (ptrace (PTRACE_SETREGS, tracee->pid, NULL, regs)) + goto fail; +#endif /* __aarch64__ */ + + /* Resume the system call. */ + return 0; + + fail: + errno = EIO; + return 1; +#endif /* REENTRANT */ +} + +/* Process the system call at which TRACEE is stopped. If the system + call is not known or not exec, send TRACEE on its way. Otherwise, + rewrite it to load the loader and perform an appropriate action. */ + +static void +process_system_call (struct exec_tracee *tracee) +{ + USER_REGS_STRUCT regs; + int rc, wstatus, save_errno; + USER_WORD callno, sp; + USER_WORD result; + bool reporting_error; + +#ifdef __aarch64__ + rc = aarch64_get_regs (tracee->pid, ®s); +#else /* !__aarch64__ */ + rc = ptrace (PTRACE_GETREGS, tracee->pid, NULL, + ®s); +#endif /* __aarch64__ */ + + /* TODO: what to do if this fails? */ + if (rc < 0) + return; + + /* Save the stack pointer. */ + sp = regs.STACK_POINTER; + + /* Now dispatch based on the system call. */ + callno = regs.SYSCALL_NUM_REG; + switch (callno) + { + case EXEC_SYSCALL: + + /* exec system calls should be handled synchronously. */ + assert (!tracee->waiting_for_syscall); + rc = handle_exec (tracee, ®s); + + switch (rc) + { + case 3: + /* It's too late to do anything about this error,. */ + break; + + case 2: + /* The process has gone away. */ + remove_tracee (tracee); + break; + + case 1: + /* An error has occurred; errno is set to the error. */ + goto report_syscall_error; + } + + break; + +#ifdef READLINK_SYSCALL + case READLINK_SYSCALL: +#endif /* READLINK_SYSCALL */ + case READLINKAT_SYSCALL: + + /* This system call is already in progress if + TRACEE->waiting_for_syscall is true. */ + + if (!tracee->waiting_for_syscall) + { + /* Handle this readlinkat system call. */ + rc = handle_readlinkat (callno, ®s, tracee, + &result); + + /* rc means the same as in `handle_exec'. */ + + if (rc == 1) + goto report_syscall_error; + else if (rc == 2) + goto emulate_syscall; + } + + goto continue_syscall; + +#ifdef OPEN_SYSCALL + case OPEN_SYSCALL: +#endif /* OPEN_SYSCALL */ + case OPENAT_SYSCALL: + + /* This system call is already in progress if + TRACEE->waiting_for_syscall is true. */ + + if (!tracee->waiting_for_syscall) + { + /* Handle this open system call. */ + rc = handle_openat (callno, ®s, tracee, &result); + + /* rc means the same as in `handle_exec', except that `open' + is never emulated. */ + + if (rc == 1) + goto report_syscall_error; + + /* The stack pointer must be restored after it was modified + by `user_alloca'; record sp in TRACEE, which will be + restored after this system call completes. */ + tracee->sp = sp; + } + else + { + /* Restore that stack pointer. */ + regs.STACK_POINTER = tracee->sp; + +#ifdef __aarch64__ + if (aarch64_set_regs (tracee->pid, ®s, true)) + return; +#else /* !__aarch64__ */ + if (ptrace (PTRACE_SETREGS, tracee->pid, NULL, ®s)) + return; +#endif /* __aarch64__ */ + } + + /* Fallthrough. */ + + default: + continue_syscall: + /* Don't wait for the system call to finish; instead, the system + will DTRT upon the next call to PTRACE_SYSCALL after the + syscall-trap signal is delivered. */ + + rc = ptrace (PTRACE_SYSCALL, tracee->pid, + NULL, NULL); + if (rc < 0) + return; + + tracee->waiting_for_syscall = !tracee->waiting_for_syscall; + } + + return; + + report_syscall_error: + reporting_error = true; + goto common; + + emulate_syscall: + reporting_error = false; + common: + + /* Reporting an error or emulating a system call works by setting + the system call number to -1, letting it continue, and then + substituting errno for ENOSYS in the case of an error. + + Make sure that the stack pointer is restored to its original + position upon exit, or bad things can happen. */ + + /* First, save errno; system calls below will clobber it. */ + save_errno = errno; + + regs.SYSCALL_NUM_REG = -1; + regs.STACK_POINTER = sp; + +#ifdef __aarch64__ + if (aarch64_set_regs (tracee->pid, ®s, true)) + return; +#else /* !__aarch64__ */ + +#ifdef __arm__ + /* On ARM systems, a special request is used to update the system + call number as known to the kernel. In addition, the system call + number must be valid, so use `tuxcall'. Hopefully, nobody will + run this on a kernel with Tux. */ + + if (ptrace (PTRACE_SET_SYSCALL, tracee->pid, NULL, 222)) + return; +#endif /* __arm__ */ + + if (ptrace (PTRACE_SETREGS, tracee->pid, NULL, ®s)) + return; +#endif /* __aarch64__ */ + + /* Do this invalid system call. */ + if (ptrace (PTRACE_SYSCALL, tracee->pid, NULL, NULL)) + return; + + again1: + rc = waitpid (tracee->pid, &wstatus, __WALL); + if (rc == -1 && errno == EINTR) + goto again1; + + /* Return if waitpid fails. */ + + if (rc == -1) + return; + + /* If the process received a signal, see if the signal is SIGSYS and + from seccomp. If so, discard it. */ + + if (WIFSTOPPED (wstatus)) + { + rc = check_signal (tracee, wstatus); + + if (rc == -1) + return; + else if (rc) + goto again1; + } + + if (!WIFSTOPPED (wstatus)) + /* The process has been killed in response to a signal. In this + case, simply unlink the tracee and return. */ + remove_tracee (tracee); + else if (reporting_error) + { +#ifdef __mips__ + /* MIPS systems place errno in v0 and set a3 to 1. */ + regs.gregs[2] = save_errno; + regs.gregs[7] = 1; +#else /* !__mips__ */ + regs.SYSCALL_RET_REG = -save_errno; +#endif /* __mips__ */ + + /* Report errno. */ +#ifdef __aarch64__ + aarch64_set_regs (tracee->pid, ®s, false); +#else /* !__aarch64__ */ + ptrace (PTRACE_SETREGS, tracee->pid, NULL, ®s); +#endif /* __aarch64__ */ + + /* Now wait for the next system call to happen. */ + ptrace (PTRACE_SYSCALL, tracee->pid, NULL, NULL); + } + else + { + /* No error is being reported. Return the result in the + appropriate registers. */ + +#ifdef __mips__ + /* MIPS systems place errno in v0 and set a3 to 1. */ + regs.gregs[2] = result; + regs.gregs[7] = 0; +#else /* !__mips__ */ + regs.SYSCALL_RET_REG = result; +#endif /* __mips__ */ + + /* Report errno. */ +#ifdef __aarch64__ + aarch64_set_regs (tracee->pid, ®s, false); +#else /* !__aarch64__ */ + ptrace (PTRACE_SETREGS, tracee->pid, NULL, ®s); +#endif /* __aarch64__ */ + + /* Now wait for the next system call to happen. */ + ptrace (PTRACE_SYSCALL, tracee->pid, NULL, NULL); + } +} + + + +/* Like `execve', but asks the parent to begin tracing this thread. + Fail if tracing is unsuccessful. */ + +int +tracing_execve (const char *file, char *const *argv, + char *const *envp) +{ + int rc; + + /* Start tracing self. */ + rc = ptrace (PTRACE_TRACEME, 0, NULL, NULL); + if (rc) + return rc; + + /* Notify the parent to enter signal-delivery-stop. */ + raise (SIGSTOP); + return execve (file, argv, envp); +} + +/* Wait for PID to trace itself, and make a record of that process. + Value is 1 or 2 upon failure, 0 otherwise. Make sure that SIGCHLD + is blocked around calls to this function. + + If failure occurs because PID exited, value is 2; upon any other + kind of failure, value is 1. */ + +int +after_fork (pid_t pid) +{ + int wstatus, rc, flags; + struct exec_tracee *tracee; + + /* First, wait for something to happen to PID. */ + again: + rc = waitpid (pid, &wstatus, __WALL); + if (rc != pid && errno == EINTR) + goto again; + + if (rc != pid) + return 1; + + /* If the child exited (or in general wasn't traced), return 2. */ + + if (!WIFSTOPPED (wstatus)) + return 2; + + /* Apply required options to the child, so that the kernel + automatically traces children and makes it easy to differentiate + between system call traps and other kinds of traps. */ + + flags = PTRACE_O_TRACECLONE; + flags |= PTRACE_O_TRACEVFORK; + flags |= PTRACE_O_TRACEFORK; + flags |= PTRACE_O_TRACESYSGOOD; + flags |= PTRACE_O_TRACEEXIT; + + rc = ptrace (PTRACE_SETOPTIONS, pid, 0, flags); + + if (rc) + { + /* If the kernel can't trace child processes upon creation and + exit, then it can't work reliably. */ + ptrace (PTRACE_DETACH, pid, 0, 0); + return 1; + } + + /* Request that the child stop upon the next system call. */ + rc = ptrace (PTRACE_SYSCALL, pid, 0, 0); + if (rc) + return 1; + + /* Enter the child in `tracing_processes'. */ + + if (free_tracees) + { + tracee = free_tracees; + free_tracees = free_tracees->next; + } + else + tracee = malloc (sizeof *tracee); + + if (!tracee) + return 1; + + tracee->pid = pid; + tracee->next = tracing_processes; + tracee->waiting_for_syscall = false; + tracee->new_child = false; +#ifndef REENTRANT + tracee->exec_file = NULL; +#endif /* REENTRANT */ + tracing_processes = tracee; + return 0; +} + +/* Wait for a child process to exit, like `waitpid'. However, if a + child stops to perform a system call, send it on its way and return + -1. OPTIONS must not contain WUNTRACED. */ + +pid_t +exec_waitpid (pid_t pid, int *wstatus, int options) +{ + int status; + struct exec_tracee *tracee; + siginfo_t siginfo; + + pid = waitpid (pid, &status, options | __WALL); + if (pid < 0) + return pid; + + /* Copy status into *WSTATUS if specified. */ + if (wstatus) + *wstatus = status; + + /* WIFSTOPPED (status) means that the process has been stopped in + response to a system call. Find its tracee and process the + system call. */ + + if (WIFSTOPPED (status)) + { + tracee = find_tracee (pid); + + if (!tracee || tracee->new_child) + { + if (WSTOPSIG (status) == SIGSTOP) + /* A new process has been created and stopped. Record + it now. */ + handle_clone (tracee, pid); + + return -1; + } + + /* Now extract the stop signal, including ptrace event bits. */ + status &= 0xfff00; + status = status >> 8; + + switch (status) + { + case SIGTRAP: + /* Now, use PTRACE_GETSIGINFO to determine whether or not the + signal was delivered in response to a system call. */ + + if (ptrace (PTRACE_GETSIGINFO, pid, 0, &siginfo)) + return -1; + + if (!syscall_trap_p (&siginfo)) + { + if (siginfo.si_code < 0) + /* SIGTRAP delivered from userspace. Pass it on. */ + ptrace (PTRACE_SYSCALL, pid, 0, SIGTRAP); + else + ptrace (PTRACE_SYSCALL, pid, 0, 0); + + return -1; + } + + case SIGTRAP | 0x80: /* SIGTRAP | 0x80 specifically refers to + system call traps. */ + /* Otherwise, process the system call and continue waiting. */ + process_system_call (tracee); + return -1; + + case SIGTRAP | (PTRACE_EVENT_EXIT << 8): + /* The tracee has exited. Make it finish correctly. */ + ptrace (PTRACE_SYSCALL, pid, 0, 0); + remove_tracee (tracee); + return -1; + + case SIGTRAP | (PTRACE_EVENT_FORK << 8): + case SIGTRAP | (PTRACE_EVENT_VFORK << 8): + case SIGTRAP | (PTRACE_EVENT_CLONE << 8): + + /* Both PTRACE_EVENT_CLONE and SIGSTOP must arrive before a + process is continued. Otherwise, its parent's cmdline + cannot be obtained and propagated. + + If the PID of the new process is currently not being + traced, create a new tracee. Set `new_child' to true, + and copy over the old command line in preparation for a + SIGSTOP signal being delivered to it. + + Otherwise, start the tracee running until the next + syscall. */ + + handle_clone_prepare (tracee); + + /* These events are handled by tracing SIGSTOP signals sent + to unknown tracees. Make sure not to pass through + status, as there's no signal really being delivered. */ + ptrace (PTRACE_SYSCALL, pid, 0, 0); + return -1; + +#ifdef SIGSYS + case SIGSYS: + if (ptrace (PTRACE_GETSIGINFO, pid, 0, &siginfo)) + return -1; + + /* Continue the process until the next syscall, but don't + pass through the signal if an emulated syscall led to + it. */ +#ifdef HAVE_SIGINFO_T_SI_SYSCALL +#ifndef __arm__ + ptrace (PTRACE_SYSCALL, pid, 0, ((siginfo.si_code == SYS_SECCOMP + && siginfo.si_syscall == -1) + ? 0 : status)); +#else /* __arm__ */ + ptrace (PTRACE_SYSCALL, pid, 0, ((siginfo.si_code == SYS_SECCOMP + && siginfo.si_syscall == 222) + ? 0 : status)); +#endif /* !__arm__ */ +#else /* !HAVE_SIGINFO_T_SI_SYSCALL */ + /* Drop this signal, since what caused it is unknown. */ + ptrace (PTRACE_SYSCALL, pid, 0, 0); +#endif /* HAVE_SIGINFO_T_SI_SYSCALL */ + return -1; +#endif /* SIGSYS */ + + default: + /* Continue the process until the next syscall. */ + ptrace (PTRACE_SYSCALL, pid, 0, status); + return -1; + } + } + else + { + /* The process has exited. Unlink the associated tracee. */ + tracee = find_tracee (pid); + + if (tracee) + remove_tracee (tracee); + + return pid; + } +} + + + +/* Initialize the exec library. LOADER should be the file name of the + loader binary; it is not copied. */ + +void +exec_init (const char *loader) +{ + loader_name = loader; +} |