Rollup merge of #113939 - the8472:pidfd-from-child, r=Mark-Simulacrum
open pidfd in child process and send to the parent via SOCK_SEQPACKET+CMSG. This avoids using `clone3` when a pidfd is requested while still getting it in a 100% race-free manner by passing it up from the child process. This should solve most concerns in #82971.
This commit is contained in:
commit
3feab00093
@ -10,9 +10,6 @@
|
|||||||
#[cfg(target_os = "linux")]
|
#[cfg(target_os = "linux")]
|
||||||
use crate::os::linux::process::PidFd;
|
use crate::os::linux::process::PidFd;
|
||||||
|
|
||||||
#[cfg(target_os = "linux")]
|
|
||||||
use crate::sys::weak::raw_syscall;
|
|
||||||
|
|
||||||
#[cfg(any(
|
#[cfg(any(
|
||||||
target_os = "macos",
|
target_os = "macos",
|
||||||
target_os = "watchos",
|
target_os = "watchos",
|
||||||
@ -91,6 +88,11 @@ pub fn spawn(
|
|||||||
if let Some(ret) = self.posix_spawn(&theirs, envp.as_ref())? {
|
if let Some(ret) = self.posix_spawn(&theirs, envp.as_ref())? {
|
||||||
return Ok((ret, ours));
|
return Ok((ret, ours));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
let (input, output) = sys::net::Socket::new_pair(libc::AF_UNIX, libc::SOCK_SEQPACKET)?;
|
||||||
|
|
||||||
|
#[cfg(not(target_os = "linux"))]
|
||||||
let (input, output) = sys::pipe::anon_pipe()?;
|
let (input, output) = sys::pipe::anon_pipe()?;
|
||||||
|
|
||||||
// Whatever happens after the fork is almost for sure going to touch or
|
// Whatever happens after the fork is almost for sure going to touch or
|
||||||
@ -104,12 +106,16 @@ pub fn spawn(
|
|||||||
// The child calls `mem::forget` to leak the lock, which is crucial because
|
// The child calls `mem::forget` to leak the lock, which is crucial because
|
||||||
// releasing a lock is not async-signal-safe.
|
// releasing a lock is not async-signal-safe.
|
||||||
let env_lock = sys::os::env_read_lock();
|
let env_lock = sys::os::env_read_lock();
|
||||||
let (pid, pidfd) = unsafe { self.do_fork()? };
|
let pid = unsafe { self.do_fork()? };
|
||||||
|
|
||||||
if pid == 0 {
|
if pid == 0 {
|
||||||
crate::panic::always_abort();
|
crate::panic::always_abort();
|
||||||
mem::forget(env_lock); // avoid non-async-signal-safe unlocking
|
mem::forget(env_lock); // avoid non-async-signal-safe unlocking
|
||||||
drop(input);
|
drop(input);
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
if self.get_create_pidfd() {
|
||||||
|
self.send_pidfd(&output);
|
||||||
|
}
|
||||||
let Err(err) = unsafe { self.do_exec(theirs, envp.as_ref()) };
|
let Err(err) = unsafe { self.do_exec(theirs, envp.as_ref()) };
|
||||||
let errno = err.raw_os_error().unwrap_or(libc::EINVAL) as u32;
|
let errno = err.raw_os_error().unwrap_or(libc::EINVAL) as u32;
|
||||||
let errno = errno.to_be_bytes();
|
let errno = errno.to_be_bytes();
|
||||||
@ -133,6 +139,12 @@ pub fn spawn(
|
|||||||
drop(env_lock);
|
drop(env_lock);
|
||||||
drop(output);
|
drop(output);
|
||||||
|
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
let pidfd = if self.get_create_pidfd() { self.recv_pidfd(&input) } else { -1 };
|
||||||
|
|
||||||
|
#[cfg(not(target_os = "linux"))]
|
||||||
|
let pidfd = -1;
|
||||||
|
|
||||||
// Safety: We obtained the pidfd from calling `clone3` with
|
// Safety: On Linux the pidfd was received from the child via `recv_pidfd`
|
||||||
// `CLONE_PIDFD` so it's valid and otherwise unowned.
|
// (or is -1 when none was requested or obtained), so it's valid and otherwise unowned.
|
||||||
let mut p = unsafe { Process::new(pid, pidfd) };
|
let mut p = unsafe { Process::new(pid, pidfd) };
|
||||||
@ -160,6 +172,7 @@ pub fn spawn(
|
|||||||
}
|
}
|
||||||
Ok(..) => {
|
Ok(..) => {
|
||||||
// pipe I/O up to PIPE_BUF bytes should be atomic
|
// pipe I/O up to PIPE_BUF bytes should be atomic
|
||||||
|
// similarly SOCK_SEQPACKET messages should arrive whole
|
||||||
assert!(p.wait().is_ok(), "wait() should either return Ok or panic");
|
assert!(p.wait().is_ok(), "wait() should either return Ok or panic");
|
||||||
panic!("short read on the CLOEXEC pipe")
|
panic!("short read on the CLOEXEC pipe")
|
||||||
}
|
}
|
||||||
@ -185,20 +198,19 @@ pub fn output(&mut self) -> io::Result<(ExitStatus, Vec<u8>, Vec<u8>)> {
|
|||||||
);
|
);
|
||||||
|
|
||||||
#[cfg(any(target_os = "tvos", target_os = "watchos"))]
|
#[cfg(any(target_os = "tvos", target_os = "watchos"))]
|
||||||
unsafe fn do_fork(&mut self) -> Result<(pid_t, pid_t), io::Error> {
|
unsafe fn do_fork(&mut self) -> Result<pid_t, io::Error> {
|
||||||
return Err(Self::ERR_APPLE_TV_WATCH_NO_FORK_EXEC);
|
return Err(Self::ERR_APPLE_TV_WATCH_NO_FORK_EXEC);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Attempts to fork the process. If successful, returns Ok((0, -1))
|
// Attempts to fork the process. If successful, returns Ok(0)
|
||||||
// in the child, and Ok((child_pid, -1)) in the parent.
|
// in the child, and Ok(child_pid) in the parent.
|
||||||
#[cfg(not(any(
|
#[cfg(not(any(
|
||||||
target_os = "linux",
|
|
||||||
target_os = "watchos",
|
target_os = "watchos",
|
||||||
target_os = "tvos",
|
target_os = "tvos",
|
||||||
all(target_os = "nto", target_env = "nto71"),
|
all(target_os = "nto", target_env = "nto71"),
|
||||||
)))]
|
)))]
|
||||||
unsafe fn do_fork(&mut self) -> Result<(pid_t, pid_t), io::Error> {
|
unsafe fn do_fork(&mut self) -> Result<pid_t, io::Error> {
|
||||||
cvt(libc::fork()).map(|res| (res, -1))
|
cvt(libc::fork())
|
||||||
}
|
}
|
||||||
|
|
||||||
// On QNX Neutrino, fork can fail with EBADF in case "another thread might have opened
|
// On QNX Neutrino, fork can fail with EBADF in case "another thread might have opened
|
||||||
@ -206,7 +218,7 @@ unsafe fn do_fork(&mut self) -> Result<(pid_t, pid_t), io::Error> {
|
|||||||
// Documentation says "... or try calling fork() again". This is what we do here.
|
// Documentation says "... or try calling fork() again". This is what we do here.
|
||||||
// See also https://www.qnx.com/developers/docs/7.1/#com.qnx.doc.neutrino.lib_ref/topic/f/fork.html
|
// See also https://www.qnx.com/developers/docs/7.1/#com.qnx.doc.neutrino.lib_ref/topic/f/fork.html
|
||||||
#[cfg(all(target_os = "nto", target_env = "nto71"))]
|
#[cfg(all(target_os = "nto", target_env = "nto71"))]
|
||||||
unsafe fn do_fork(&mut self) -> Result<(pid_t, pid_t), io::Error> {
|
unsafe fn do_fork(&mut self) -> Result<pid_t, io::Error> {
|
||||||
use crate::sys::os::errno;
|
use crate::sys::os::errno;
|
||||||
|
|
||||||
let mut delay = MIN_FORKSPAWN_SLEEP;
|
let mut delay = MIN_FORKSPAWN_SLEEP;
|
||||||
@ -229,91 +241,11 @@ unsafe fn do_fork(&mut self) -> Result<(pid_t, pid_t), io::Error> {
|
|||||||
delay *= 2;
|
delay *= 2;
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
return cvt(r).map(|res| (res, -1));
|
return cvt(r);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Attempts to fork the process. If successful, returns Ok((0, -1))
|
|
||||||
// in the child, and Ok((child_pid, child_pidfd)) in the parent.
|
|
||||||
#[cfg(target_os = "linux")]
|
|
||||||
unsafe fn do_fork(&mut self) -> Result<(pid_t, pid_t), io::Error> {
|
|
||||||
use crate::sync::atomic::{AtomicBool, Ordering};
|
|
||||||
|
|
||||||
static HAS_CLONE3: AtomicBool = AtomicBool::new(true);
|
|
||||||
const CLONE_PIDFD: u64 = 0x00001000;
|
|
||||||
|
|
||||||
#[repr(C)]
|
|
||||||
struct clone_args {
|
|
||||||
flags: u64,
|
|
||||||
pidfd: u64,
|
|
||||||
child_tid: u64,
|
|
||||||
parent_tid: u64,
|
|
||||||
exit_signal: u64,
|
|
||||||
stack: u64,
|
|
||||||
stack_size: u64,
|
|
||||||
tls: u64,
|
|
||||||
set_tid: u64,
|
|
||||||
set_tid_size: u64,
|
|
||||||
cgroup: u64,
|
|
||||||
}
|
|
||||||
|
|
||||||
raw_syscall! {
|
|
||||||
fn clone3(cl_args: *mut clone_args, len: libc::size_t) -> libc::c_long
|
|
||||||
}
|
|
||||||
|
|
||||||
// Bypassing libc for `clone3` can make further libc calls unsafe,
|
|
||||||
// so we use it sparingly for now. See #89522 for details.
|
|
||||||
// Some tools (e.g. sandboxing tools) may also expect `fork`
|
|
||||||
// rather than `clone3`.
|
|
||||||
let want_clone3_pidfd = self.get_create_pidfd();
|
|
||||||
|
|
||||||
// If we fail to create a pidfd for any reason, this will
|
|
||||||
// stay as -1, which indicates an error.
|
|
||||||
let mut pidfd: pid_t = -1;
|
|
||||||
|
|
||||||
// Attempt to use the `clone3` syscall, which supports more arguments
|
|
||||||
// (in particular, the ability to create a pidfd). If this fails,
|
|
||||||
// we will fall through this block to a call to `fork()`
|
|
||||||
if want_clone3_pidfd && HAS_CLONE3.load(Ordering::Relaxed) {
|
|
||||||
let mut args = clone_args {
|
|
||||||
flags: CLONE_PIDFD,
|
|
||||||
pidfd: &mut pidfd as *mut pid_t as u64,
|
|
||||||
child_tid: 0,
|
|
||||||
parent_tid: 0,
|
|
||||||
exit_signal: libc::SIGCHLD as u64,
|
|
||||||
stack: 0,
|
|
||||||
stack_size: 0,
|
|
||||||
tls: 0,
|
|
||||||
set_tid: 0,
|
|
||||||
set_tid_size: 0,
|
|
||||||
cgroup: 0,
|
|
||||||
};
|
|
||||||
|
|
||||||
let args_ptr = &mut args as *mut clone_args;
|
|
||||||
let args_size = crate::mem::size_of::<clone_args>();
|
|
||||||
|
|
||||||
let res = cvt(clone3(args_ptr, args_size));
|
|
||||||
match res {
|
|
||||||
Ok(n) => return Ok((n as pid_t, pidfd)),
|
|
||||||
Err(e) => match e.raw_os_error() {
|
|
||||||
// Multiple threads can race to execute this store,
|
|
||||||
// but that's fine - that just means that multiple threads
|
|
||||||
// will have tried and failed to execute the same syscall,
|
|
||||||
// with no other side effects.
|
|
||||||
Some(libc::ENOSYS) => HAS_CLONE3.store(false, Ordering::Relaxed),
|
|
||||||
// Fallback to fork if `EPERM` is returned. (e.g. blocked by seccomp)
|
|
||||||
Some(libc::EPERM) => {}
|
|
||||||
_ => return Err(e),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Generally, we just call `fork`. If we get here after wanting `clone3`,
|
|
||||||
// then the syscall does not exist or we do not have permission to call it.
|
|
||||||
cvt(libc::fork()).map(|res| (res, pidfd))
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn exec(&mut self, default: Stdio) -> io::Error {
|
pub fn exec(&mut self, default: Stdio) -> io::Error {
|
||||||
let envp = self.capture_env();
|
let envp = self.capture_env();
|
||||||
|
|
||||||
@ -722,6 +654,115 @@ fn drop(&mut self) {
|
|||||||
Ok(Some(p))
|
Ok(Some(p))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
fn send_pidfd(&self, sock: &crate::sys::net::Socket) {
|
||||||
|
use crate::io::IoSlice;
|
||||||
|
use crate::os::fd::RawFd;
|
||||||
|
use crate::sys::cvt_r;
|
||||||
|
use libc::{CMSG_DATA, CMSG_FIRSTHDR, CMSG_LEN, CMSG_SPACE, SCM_RIGHTS, SOL_SOCKET};
|
||||||
|
|
||||||
|
unsafe {
|
||||||
|
let child_pid = libc::getpid();
|
||||||
|
// pidfd_open sets CLOEXEC by default
|
||||||
|
let pidfd = libc::syscall(libc::SYS_pidfd_open, child_pid, 0);
|
||||||
|
|
||||||
|
let fds: [c_int; 1] = [pidfd as RawFd];
|
||||||
|
|
||||||
|
const SCM_MSG_LEN: usize = mem::size_of::<[c_int; 1]>();
|
||||||
|
|
||||||
|
#[repr(C)]
|
||||||
|
union Cmsg {
|
||||||
|
buf: [u8; unsafe { CMSG_SPACE(SCM_MSG_LEN as u32) as usize }],
|
||||||
|
_align: libc::cmsghdr,
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut cmsg: Cmsg = mem::zeroed();
|
||||||
|
|
||||||
|
// 0-length message to send through the socket so we can pass along the fd
|
||||||
|
let mut iov = [IoSlice::new(b"")];
|
||||||
|
let mut msg: libc::msghdr = mem::zeroed();
|
||||||
|
|
||||||
|
msg.msg_iov = &mut iov as *mut _ as *mut _;
|
||||||
|
msg.msg_iovlen = 1;
|
||||||
|
msg.msg_controllen = mem::size_of_val(&cmsg.buf) as _;
|
||||||
|
msg.msg_control = &mut cmsg.buf as *mut _ as *mut _;
|
||||||
|
|
||||||
|
// only attach cmsg if we successfully acquired the pidfd
|
||||||
|
if pidfd >= 0 {
|
||||||
|
let hdr = CMSG_FIRSTHDR(&mut msg as *mut _ as *mut _);
|
||||||
|
(*hdr).cmsg_level = SOL_SOCKET;
|
||||||
|
(*hdr).cmsg_type = SCM_RIGHTS;
|
||||||
|
(*hdr).cmsg_len = CMSG_LEN(SCM_MSG_LEN as _) as _;
|
||||||
|
let data = CMSG_DATA(hdr);
|
||||||
|
crate::ptr::copy_nonoverlapping(
|
||||||
|
fds.as_ptr().cast::<u8>(),
|
||||||
|
data as *mut _,
|
||||||
|
SCM_MSG_LEN,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// we send the 0-length message even if we failed to acquire the pidfd
|
||||||
|
// so we get a consistent SEQPACKET order
|
||||||
|
match cvt_r(|| libc::sendmsg(sock.as_raw(), &msg, 0)) {
|
||||||
|
Ok(0) => {}
|
||||||
|
_ => rtabort!("failed to communicate with parent process"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
fn recv_pidfd(&self, sock: &crate::sys::net::Socket) -> pid_t {
|
||||||
|
use crate::io::IoSliceMut;
|
||||||
|
use crate::sys::cvt_r;
|
||||||
|
|
||||||
|
use libc::{CMSG_DATA, CMSG_FIRSTHDR, CMSG_LEN, CMSG_SPACE, SCM_RIGHTS, SOL_SOCKET};
|
||||||
|
|
||||||
|
unsafe {
|
||||||
|
const SCM_MSG_LEN: usize = mem::size_of::<[c_int; 1]>();
|
||||||
|
|
||||||
|
#[repr(C)]
|
||||||
|
union Cmsg {
|
||||||
|
_buf: [u8; unsafe { CMSG_SPACE(SCM_MSG_LEN as u32) as usize }],
|
||||||
|
_align: libc::cmsghdr,
|
||||||
|
}
|
||||||
|
let mut cmsg: Cmsg = mem::zeroed();
|
||||||
|
// 0-length read to get the fd
|
||||||
|
let mut iov = [IoSliceMut::new(&mut [])];
|
||||||
|
|
||||||
|
let mut msg: libc::msghdr = mem::zeroed();
|
||||||
|
|
||||||
|
msg.msg_iov = &mut iov as *mut _ as *mut _;
|
||||||
|
msg.msg_iovlen = 1;
|
||||||
|
msg.msg_controllen = mem::size_of::<Cmsg>() as _;
|
||||||
|
msg.msg_control = &mut cmsg as *mut _ as *mut _;
|
||||||
|
|
||||||
|
match cvt_r(|| libc::recvmsg(sock.as_raw(), &mut msg, 0)) {
|
||||||
|
Err(_) => return -1,
|
||||||
|
Ok(_) => {}
|
||||||
|
}
|
||||||
|
|
||||||
|
let hdr = CMSG_FIRSTHDR(&mut msg as *mut _ as *mut _);
|
||||||
|
if hdr.is_null()
|
||||||
|
|| (*hdr).cmsg_level != SOL_SOCKET
|
||||||
|
|| (*hdr).cmsg_type != SCM_RIGHTS
|
||||||
|
|| (*hdr).cmsg_len != CMSG_LEN(SCM_MSG_LEN as _) as _
|
||||||
|
{
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
let data = CMSG_DATA(hdr);
|
||||||
|
|
||||||
|
let mut fds = [-1 as c_int];
|
||||||
|
|
||||||
|
crate::ptr::copy_nonoverlapping(
|
||||||
|
data as *const _,
|
||||||
|
fds.as_mut_ptr().cast::<u8>(),
|
||||||
|
SCM_MSG_LEN,
|
||||||
|
);
|
||||||
|
|
||||||
|
fds[0]
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
@ -60,3 +60,28 @@ fn test_command_fork_no_unwind() {
|
|||||||
|| signal == libc::SIGSEGV
|
|| signal == libc::SIGSEGV
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
fn test_command_pidfd() {
|
||||||
|
use crate::os::fd::RawFd;
|
||||||
|
use crate::os::linux::process::{ChildExt, CommandExt};
|
||||||
|
use crate::process::Command;
|
||||||
|
|
||||||
|
let our_pid = crate::process::id();
|
||||||
|
let pidfd = unsafe { libc::syscall(libc::SYS_pidfd_open, our_pid, 0) };
|
||||||
|
let pidfd_open_available = if pidfd >= 0 {
|
||||||
|
unsafe { libc::close(pidfd as RawFd) };
|
||||||
|
true
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
};
|
||||||
|
|
||||||
|
// always exercise creation attempts
|
||||||
|
let child = Command::new("echo").create_pidfd(true).spawn().unwrap();
|
||||||
|
|
||||||
|
// but only check if we know that the kernel supports pidfds
|
||||||
|
if pidfd_open_available {
|
||||||
|
assert!(child.pidfd().is_ok())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user