cloud-hypervisor/vhost_user_fs/src/sandbox.rs

293 lines
10 KiB
Rust
Raw Normal View History

// Copyright 2020 Red Hat, Inc. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
use std::ffi::CString;
use std::os::unix::io::RawFd;
use std::{fmt, io};
use tempdir::TempDir;
#[derive(Debug)]
pub enum Error {
/// Failed to bind mount `/proc/self/fd` into a temporary directory.
BindMountProcSelfFd(io::Error),
/// Failed to bind mount shared directory.
BindMountSharedDir(io::Error),
/// Failed to change to the old root directory.
ChdirOldRoot(io::Error),
/// Failed to change to the new root directory.
ChdirNewRoot(io::Error),
/// Failed to clean the properties of the mount point.
CleanMount(io::Error),
/// Failed to create a temporary directory.
CreateTempDir(io::Error),
/// Call to libc::fork returned an error.
Fork(io::Error),
/// Error bind-mounting a directory.
MountBind(io::Error),
/// Failed to mount old root.
MountOldRoot(io::Error),
/// Error mounting proc.
MountProc(io::Error),
/// Failed to mount new root.
MountNewRoot(io::Error),
/// Error mounting target directory.
MountTarget(io::Error),
/// Failed to open new root.
OpenNewRoot(io::Error),
/// Failed to open old root.
OpenOldRoot(io::Error),
/// Failed to open `/proc/self/fd`.
OpenProcSelfFd(io::Error),
/// Error switching root directory.
PivotRoot(io::Error),
/// Failed to remove temporary directory.
RmdirTempDir(io::Error),
/// Failed to lazily unmount old root.
UmountOldRoot(io::Error),
/// Failed to lazily unmount temporary directory.
UmountTempDir(io::Error),
/// Call to libc::unshare returned an error.
Unshare(io::Error),
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "vhost_user_fs_sandbox_error: {:?}", self)
}
}
/// A helper for creating a sandbox for isolating the service.
pub struct Sandbox {
/// The directory that is going to be shared with the VM. The sandbox will be constructed on top
/// of this directory.
shared_dir: String,
/// A file descriptor for `/proc/self/fd` obtained from the sandboxed context.
proc_self_fd: Option<RawFd>,
}
impl Sandbox {
pub fn new(shared_dir: String) -> Self {
Sandbox {
shared_dir,
proc_self_fd: None,
}
}
// Make `self.shared_dir` our root directory, and get an isolated file descriptor for
// `/proc/self/fd`.
//
// This is based on virtiofsd's setup_namespaces() and setup_mounts(), and it's very similar to
// the strategy used in containers. Consists on a careful sequence of mounts and bind-mounts to
// ensure it's not possible to escape the sandbox through `self.shared_dir` nor the file
// descriptor obtained for `/proc/self/fd`.
//
// It's ugly, but it's the only way until Linux implements a proper containerization API.
fn setup_mounts(&mut self) -> Result<(), Error> {
// Ensure our mount changes don't affect the parent mount namespace.
let c_root_dir = CString::new("/").unwrap();
let ret = unsafe {
libc::mount(
std::ptr::null(),
c_root_dir.as_ptr(),
std::ptr::null(),
libc::MS_SLAVE | libc::MS_REC,
std::ptr::null(),
)
};
if ret != 0 {
return Err(Error::CleanMount(std::io::Error::last_os_error()));
}
// Mount `/proc` in this context.
let c_proc_dir = CString::new("/proc").unwrap();
let c_proc_fs = CString::new("proc").unwrap();
let ret = unsafe {
libc::mount(
c_proc_fs.as_ptr(),
c_proc_dir.as_ptr(),
c_proc_fs.as_ptr(),
libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_NOSUID | libc::MS_RELATIME,
std::ptr::null(),
)
};
if ret != 0 {
return Err(Error::MountProc(std::io::Error::last_os_error()));
}
// Bind-mount `/proc/self/fd` info a temporary directory, preventing access to ancestor
// directories.
let c_proc_self_fd = CString::new("/proc/self/fd").unwrap();
let tmp_dir = TempDir::new("vhostuserfs-")
.map_err(|_| Error::CreateTempDir(std::io::Error::last_os_error()))?;
let c_tmp_dir = CString::new(tmp_dir.into_path().to_str().unwrap()).unwrap();
let ret = unsafe {
libc::mount(
c_proc_self_fd.as_ptr(),
c_tmp_dir.as_ptr(),
std::ptr::null(),
libc::MS_BIND,
std::ptr::null(),
)
};
if ret < 0 {
return Err(Error::BindMountProcSelfFd(std::io::Error::last_os_error()));
}
// Obtain a file descriptor for `/proc/self/fd` through the bind-mounted temporary directory.
let proc_self_fd = unsafe { libc::open(c_tmp_dir.as_ptr(), libc::O_PATH) };
if proc_self_fd < 0 {
return Err(Error::OpenProcSelfFd(std::io::Error::last_os_error()));
}
self.proc_self_fd = Some(proc_self_fd);
// Now that we have a file descriptor for `/proc/self/fd`, we no longer need the bind-mount.
// Unmount it and remove the temporary directory.
let ret = unsafe { libc::umount2(c_tmp_dir.as_ptr(), libc::MNT_DETACH) };
if ret < 0 {
return Err(Error::UmountTempDir(std::io::Error::last_os_error()));
}
let ret = unsafe { libc::rmdir(c_tmp_dir.as_ptr()) };
if ret < 0 {
return Err(Error::RmdirTempDir(std::io::Error::last_os_error()));
}
// Bind-mount `self.shared_dir` on itself so we can use as new root on `pivot_root` syscall.
let c_shared_dir = CString::new(self.shared_dir.clone()).unwrap();
let ret = unsafe {
libc::mount(
c_shared_dir.as_ptr(),
c_shared_dir.as_ptr(),
std::ptr::null(),
libc::MS_BIND | libc::MS_REC,
std::ptr::null(),
)
};
if ret < 0 {
return Err(Error::BindMountSharedDir(std::io::Error::last_os_error()));
}
// Get a file descriptor to our old root so we can reference it after switching root.
let oldroot_fd = unsafe {
libc::open(
c_root_dir.as_ptr(),
libc::O_DIRECTORY | libc::O_RDONLY | libc::O_CLOEXEC,
)
};
if oldroot_fd < 0 {
return Err(Error::OpenOldRoot(std::io::Error::last_os_error()));
}
// Get a file descriptor to the new root so we can reference it after switching root.
let newroot_fd = unsafe {
libc::open(
c_shared_dir.as_ptr(),
libc::O_DIRECTORY | libc::O_RDONLY | libc::O_CLOEXEC,
)
};
if newroot_fd < 0 {
return Err(Error::OpenNewRoot(std::io::Error::last_os_error()));
}
// Change to new root directory to prepare for `pivot_root` syscall.
let ret = unsafe { libc::fchdir(newroot_fd) };
if ret < 0 {
return Err(Error::ChdirNewRoot(std::io::Error::last_os_error()));
}
// Call to `pivot_root` using `.` as both new and old root.
let c_current_dir = CString::new(".").unwrap();
let ret = unsafe {
libc::syscall(
libc::SYS_pivot_root,
c_current_dir.as_ptr(),
c_current_dir.as_ptr(),
)
};
if ret < 0 {
return Err(Error::PivotRoot(std::io::Error::last_os_error()));
}
// Change to old root directory to prepare for cleaning up and unmounting it.
let ret = unsafe { libc::fchdir(oldroot_fd) };
if ret < 0 {
return Err(Error::ChdirOldRoot(std::io::Error::last_os_error()));
}
// Clean up old root to avoid mount namespace propagation.
let c_empty = CString::new("").unwrap();
let ret = unsafe {
libc::mount(
c_empty.as_ptr(),
c_current_dir.as_ptr(),
c_empty.as_ptr(),
libc::MS_SLAVE | libc::MS_REC,
std::ptr::null(),
)
};
if ret != 0 {
return Err(Error::CleanMount(std::io::Error::last_os_error()));
}
// Lazily unmount old root.
let ret = unsafe { libc::umount2(c_current_dir.as_ptr(), libc::MNT_DETACH) };
if ret < 0 {
return Err(Error::UmountOldRoot(std::io::Error::last_os_error()));
}
// Change to new root.
let ret = unsafe { libc::fchdir(newroot_fd) };
if ret < 0 {
return Err(Error::ChdirNewRoot(std::io::Error::last_os_error()));
}
// We no longer need these file descriptors, so close them.
unsafe { libc::close(newroot_fd) };
unsafe { libc::close(oldroot_fd) };
Ok(())
}
/// Set up sandbox, fork and jump into it.
///
/// On success, the returned value will be the PID of the child for the parent and `None` for
/// the child itself, with the latter running isolated in `self.shared_dir`.
pub fn enter(&mut self) -> Result<Option<i32>, Error> {
let uid = unsafe { libc::geteuid() };
let flags = if uid == 0 {
libc::CLONE_NEWPID | libc::CLONE_NEWNS | libc::CLONE_NEWNET
} else {
// If running as an unprivileged user, rely on user_namespaces(7) for isolation. The
// main limitation of this strategy is that only the current uid/gid are mapped into
// the new namespace, so most operations on permissions will fail.
libc::CLONE_NEWPID | libc::CLONE_NEWNS | libc::CLONE_NEWNET | libc::CLONE_NEWUSER
};
let ret = unsafe { libc::unshare(flags) };
if ret != 0 {
return Err(Error::Unshare(std::io::Error::last_os_error()));
}
let child = unsafe { libc::fork() };
match child {
0 => {
// This is the child. Request to receive SIGTERM on parent's death.
unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM) };
self.setup_mounts()?;
Ok(None)
}
x if x > 0 => {
// This is the parent.
Ok(Some(child))
}
_ => Err(Error::Fork(std::io::Error::last_os_error())),
}
}
pub fn get_proc_self_fd(&self) -> Option<RawFd> {
self.proc_self_fd
}
}