vmm: notify virtio-console of pty resizes

When a pty is resized (using the TIOCSWINSZ ioctl -- see ioctl_tty(2)),
the kernel will send a SIGWINCH signal to the pty's foreground process
group to notify it of the resize.  This is the only way to be notified
by the kernel of a pty resize.

We can't just make the cloud-hypervisor process's process group the
foreground process group though, because a process can only set the
foreground process group of its controlling terminal, and
cloud-hypervisor's controlling terminal will often be the terminal the
user is running it in.  To work around this, we fork a subprocess in a
new process group, and set its process group to be the foreground
process group of the pty.  The subprocess additionally must be running
in a new session so that it can have a different controlling
terminal.  This subprocess writes a byte to a pipe every time the pty
is resized, and the virtio-console device can listen for this in its
epoll loop.

Alternatives I considered were to have the subprocess just send
SIGWINCH to its parent, and to use an eventfd instead of a pipe.
I decided against the signal approach because re-purposing a signal
that has a very specific meaning (even if this use was only slightly
different to its normal meaning) felt unclean, and because it would
have required using pidfds to avoid race conditions if
cloud-hypervisor had terminated, which added complexity.  I decided
against using an eventfd because using a pipe instead allows the child
to be notified (via poll(2)) when nothing is reading from the pipe any
more, meaning it can be reliably notified of parent death and
terminate itself immediately.

I used clone3(2) instead of fork(2) because without
CLONE_CLEAR_SIGHAND the subprocess would inherit signal-hook's signal
handlers, and there's no other straightforward way to restore all signal
handlers to their defaults in the child process.  The only way to do
it would be to iterate through all possible signals, or maintain a
global list of monitored signals ourselves (vmm::vm::HANDLED_SIGNALS is
insufficient because it doesn't take into account e.g. the SIGSYS
signal handler that catches seccomp violations).

Signed-off-by: Alyssa Ross <hi@alyssa.is>
This commit is contained in:
Alyssa Ross 2021-09-10 11:12:17 +00:00 committed by Rob Bradford
parent 98bfd1e988
commit 330b5ea3be
7 changed files with 276 additions and 5 deletions

View File

@ -42,6 +42,8 @@ const INPUT_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 3;
const CONFIG_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 4;
// File written to (input ready)
const FILE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 5;
// Console resized
const RESIZE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 6;
//Console size feature bit
const VIRTIO_CONSOLE_F_SIZE: u64 = 0;
@ -74,11 +76,13 @@ struct ConsoleEpollHandler {
mem: GuestMemoryAtomic<GuestMemoryMmap>,
interrupt_cb: Arc<dyn VirtioInterrupt>,
in_buffer: Arc<Mutex<VecDeque<u8>>>,
resizer: Arc<ConsoleResizer>,
endpoint: Endpoint,
input_queue_evt: EventFd,
output_queue_evt: EventFd,
input_evt: EventFd,
config_evt: EventFd,
resize_pipe: Option<File>,
kill_evt: EventFd,
pause_evt: EventFd,
}
@ -210,6 +214,9 @@ impl ConsoleEpollHandler {
helper.add_event(self.output_queue_evt.as_raw_fd(), OUTPUT_QUEUE_EVENT)?;
helper.add_event(self.input_evt.as_raw_fd(), INPUT_EVENT)?;
helper.add_event(self.config_evt.as_raw_fd(), CONFIG_EVENT)?;
if let Some(resize_pipe) = self.resize_pipe.as_ref() {
helper.add_event(resize_pipe.as_raw_fd(), RESIZE_EVENT)?;
}
if let Some(in_file) = self.endpoint.in_file() {
helper.add_event(in_file.as_raw_fd(), FILE_EVENT)?;
}
@ -265,6 +272,14 @@ impl EpollHelperHandler for ConsoleEpollHandler {
return true;
}
}
RESIZE_EVENT => {
if let Err(e) = self.resize_pipe.as_ref().unwrap().read_exact(&mut [0]) {
error!("Failed to get resize event: {:?}", e);
return true;
}
self.resizer.update_console_size();
}
FILE_EVENT => {
let mut input = [0u8; 64];
if let Some(ref mut in_file) = self.endpoint.in_file() {
@ -328,6 +343,7 @@ pub struct Console {
id: String,
config: Arc<Mutex<VirtioConsoleConfig>>,
resizer: Arc<ConsoleResizer>,
resize_pipe: Option<File>,
endpoint: Endpoint,
seccomp_action: SeccompAction,
in_buffer: Arc<Mutex<VecDeque<u8>>>,
@ -367,6 +383,7 @@ impl Console {
pub fn new(
id: String,
endpoint: Endpoint,
resize_pipe: Option<File>,
iommu: bool,
seccomp_action: SeccompAction,
exit_evt: EventFd,
@ -401,6 +418,7 @@ impl Console {
id,
config: console_config,
resizer: resizer.clone(),
resize_pipe,
endpoint,
seccomp_action,
in_buffer: Arc::new(Mutex::new(VecDeque::new())),
@ -488,6 +506,8 @@ impl VirtioDevice for Console {
output_queue_evt: queue_evts.remove(0),
input_evt,
config_evt: self.resizer.config_evt.try_clone().unwrap(),
resize_pipe: self.resize_pipe.as_ref().map(|p| p.try_clone().unwrap()),
resizer: Arc::clone(&self.resizer),
kill_evt,
pause_evt,
};

27
vmm/src/clone3.rs Normal file
View File

@ -0,0 +1,27 @@
// Copyright 2021 Alyssa Ross <hi@alyssa.is>
// SPDX-License-Identifier: Apache-2.0
use libc::{c_long, size_t, syscall, SYS_clone3};
/// Flag for [`clone_args::flags`]: the child is created without inheriting the
/// parent's signal handlers (see clone3(2)).
pub const CLONE_CLEAR_SIGHAND: u64 = 0x100000000;
/// Argument block handed to the `clone3` system call by [`clone3`].
///
/// `#[repr(C)]` keeps the fields laid out in declaration order, so the order
/// below must not be changed. The struct is intended to mirror the kernel's
/// `struct clone_args`; see clone3(2) for the meaning of each field.
///
/// The derived `Default` sets every field to zero.
#[repr(C)]
#[derive(Default)]
// NOTE(review): presumably named in lower case to match the C struct name.
#[allow(non_camel_case_types)]
pub struct clone_args {
pub flags: u64,
pub pidfd: u64,
pub child_tid: u64,
pub parent_tid: u64,
pub exit_signal: u64,
pub stack: u64,
pub stack_size: u64,
pub tls: u64,
pub set_tid: u64,
pub set_tid_size: u64,
pub cgroup: u64,
}
/// Thin wrapper that issues the raw `clone3` system call.
///
/// `args` is passed to the kernel by pointer and `size` is passed alongside it
/// (callers in this crate pass `size_of::<clone_args>()`). The raw return
/// value of `syscall` is handed back unchanged; callers treat `-1` as failure
/// and `0` as "running in the child".
///
/// # Safety
///
/// This creates a new process. The caller is responsible for making sure that
/// `size` matches the size of `args` and that whatever the child goes on to do
/// is sound in a freshly cloned process.
pub unsafe fn clone3(args: &mut clone_args, size: size_t) -> c_long {
    // Make the reference-to-pointer conversion explicit before it goes
    // through the variadic `syscall` interface.
    let args_ptr: *mut clone_args = args;
    syscall(SYS_clone3, args_ptr, size)
}

View File

@ -22,7 +22,9 @@ use crate::interrupt::LegacyUserspaceInterruptManager;
#[cfg(feature = "acpi")]
use crate::memory_manager::MEMORY_MANAGER_ACPI_SIZE;
use crate::memory_manager::{Error as MemoryManagerError, MemoryManager};
use crate::seccomp_filters::{get_seccomp_filter, Thread};
use crate::serial_buffer::SerialBuffer;
use crate::sigwinch_listener::start_sigwinch_listener;
use crate::GuestRegionMmap;
use crate::PciDeviceInfo;
use crate::{device_node, DEVICE_MANAGER_SNAPSHOT_ID};
@ -791,6 +793,9 @@ pub struct DeviceManager {
// serial PTY
serial_pty: Option<Arc<Mutex<PtyPair>>>,
// Read end of the pipe on which console pty resize notifications arrive
console_resize_pipe: Option<Arc<File>>,
// Interrupt controller
#[cfg(target_arch = "x86_64")]
interrupt_controller: Option<Arc<Mutex<ioapic::Ioapic>>>,
@ -977,6 +982,7 @@ impl DeviceManager {
acpi_address,
serial_pty: None,
console_pty: None,
console_resize_pipe: None,
virtio_mem_devices: Vec::new(),
#[cfg(target_arch = "aarch64")]
gpio_device: None,
@ -1011,10 +1017,15 @@ impl DeviceManager {
.map(|pty| pty.lock().unwrap().clone())
}
/// Returns another handle to the console resize pipe, or `None` when no
/// pipe has been set up.
pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
    // Cloning the `Option<Arc<_>>` only bumps the reference count.
    self.console_resize_pipe.clone()
}
pub fn create_devices(
&mut self,
serial_pty: Option<PtyPair>,
console_pty: Option<PtyPair>,
console_resize_pipe: Option<File>,
) -> DeviceManagerResult<()> {
let mut virtio_devices: Vec<(VirtioDeviceArc, bool, String)> = Vec::new();
@ -1069,6 +1080,7 @@ impl DeviceManager {
&mut virtio_devices,
serial_pty,
console_pty,
console_resize_pipe,
)?;
// Reserve some IRQs for PCI devices in case they need to support INTx.
@ -1653,10 +1665,22 @@ impl DeviceManager {
self.modify_mode(f.as_raw_fd(), |t| t.c_lflag &= !(ICANON | ECHO | ISIG))
}
/// Starts the SIGWINCH listener for `pty` and stores the read end of its
/// notification pipe in `self.console_resize_pipe`.
///
/// # Errors
///
/// Returns the I/O error reported by `start_sigwinch_listener`.
///
/// # Panics
///
/// Panics if the `Thread::PtyForeground` seccomp filter cannot be built.
fn listen_for_sigwinch_on_tty(&mut self, pty: &File) -> std::io::Result<()> {
    let filter = get_seccomp_filter(&self.seccomp_action, Thread::PtyForeground).unwrap();
    start_sigwinch_listener(filter, pty).map(|resize_pipe| {
        self.console_resize_pipe = Some(Arc::new(resize_pipe));
    })
}
fn add_virtio_console_device(
&mut self,
virtio_devices: &mut Vec<(VirtioDeviceArc, bool, String)>,
console_pty: Option<PtyPair>,
resize_pipe: Option<File>,
) -> DeviceManagerResult<Option<Arc<virtio_devices::ConsoleResizer>>> {
let console_config = self.config.lock().unwrap().console.clone();
let endpoint = match console_config.mode {
@ -1670,6 +1694,7 @@ impl DeviceManager {
self.config.lock().unwrap().console.file = Some(pty.path.clone());
let file = pty.main.try_clone().unwrap();
self.console_pty = Some(Arc::new(Mutex::new(pty)));
self.console_resize_pipe = Some(Arc::new(resize_pipe.unwrap()));
Endpoint::FilePair(file.try_clone().unwrap(), file)
} else {
let (main, mut sub, path) =
@ -1678,6 +1703,8 @@ impl DeviceManager {
.map_err(DeviceManagerError::SetPtyRaw)?;
self.config.lock().unwrap().console.file = Some(path.clone());
let file = main.try_clone().unwrap();
assert!(resize_pipe.is_none());
self.listen_for_sigwinch_on_tty(&sub).unwrap();
self.console_pty = Some(Arc::new(Mutex::new(PtyPair { main, sub, path })));
Endpoint::FilePair(file.try_clone().unwrap(), file)
}
@ -1703,6 +1730,9 @@ impl DeviceManager {
let (virtio_console_device, console_resizer) = virtio_devices::Console::new(
id.clone(),
endpoint,
self.console_resize_pipe
.as_ref()
.map(|p| p.try_clone().unwrap()),
self.force_iommu | console_config.iommu,
self.seccomp_action.clone(),
self.exit_evt
@ -1739,6 +1769,7 @@ impl DeviceManager {
virtio_devices: &mut Vec<(VirtioDeviceArc, bool, String)>,
serial_pty: Option<PtyPair>,
console_pty: Option<PtyPair>,
console_resize_pipe: Option<File>,
) -> DeviceManagerResult<Arc<Console>> {
let serial_config = self.config.lock().unwrap().serial.clone();
let serial_writer: Option<Box<dyn io::Write + Send>> = match serial_config.mode {
@ -1774,7 +1805,8 @@ impl DeviceManager {
None
};
let console_resizer = self.add_virtio_console_device(virtio_devices, console_pty)?;
let console_resizer =
self.add_virtio_console_device(virtio_devices, console_pty, console_resize_pipe)?;
Ok(Arc::new(Console {
serial,
@ -4243,7 +4275,7 @@ impl Snapshottable for DeviceManager {
// Now that DeviceManager is updated with the right states, it's time
// to create the devices based on the configuration.
self.create_devices(None, None)
self.create_devices(None, None, None)
.map_err(|e| MigratableError::Restore(anyhow!("Could not create devices {:?}", e)))?;
Ok(())

View File

@ -47,6 +47,7 @@ use vm_migration::{MigratableError, Pausable, Snapshot, Snapshottable, Transport
use vmm_sys_util::eventfd::EventFd;
pub mod api;
mod clone3;
pub mod config;
pub mod cpu;
pub mod device_manager;
@ -55,6 +56,7 @@ pub mod interrupt;
pub mod memory_manager;
pub mod migration;
pub mod seccomp_filters;
mod sigwinch_listener;
pub mod vm;
#[cfg(feature = "acpi")]
@ -405,6 +407,7 @@ impl Vmm {
activate_evt,
None,
None,
None,
)?;
if let Some(serial_pty) = vm.serial_pty() {
self.epoll
@ -532,6 +535,10 @@ impl Vmm {
let config = vm.get_config();
let serial_pty = vm.serial_pty();
let console_pty = vm.console_pty();
let console_resize_pipe = vm
.console_resize_pipe()
.as_ref()
.map(|pipe| pipe.try_clone().unwrap());
self.vm_shutdown()?;
let exit_evt = self.exit_evt.try_clone().map_err(VmError::EventFdClone)?;
@ -556,6 +563,7 @@ impl Vmm {
activate_evt,
serial_pty,
console_pty,
console_resize_pipe,
)?);
}

View File

@ -15,6 +15,7 @@ pub enum Thread {
SignalHandler,
Vcpu,
Vmm,
PtyForeground,
}
/// Shorthand for chaining `SeccompCondition`s with the `and` operator in a `SeccompRule`.
@ -39,6 +40,8 @@ macro_rules! or {
// See include/uapi/asm-generic/ioctls.h in the kernel code.
const TCGETS: u64 = 0x5401;
const TCSETS: u64 = 0x5402;
const TIOCSCTTY: u64 = 0x540E;
const TIOCSPGRP: u64 = 0x5410;
const TIOCGWINSZ: u64 = 0x5413;
const TIOCSPTLCK: u64 = 0x4004_5431;
const TIOCGTPEER: u64 = 0x5441;
@ -217,9 +220,11 @@ fn create_vmm_ioctl_seccomp_rule_common() -> Result<Vec<SeccompRule>, BackendErr
and![Cond::new(1, ArgLen::Dword, Eq, SIOCSIFNETMASK)?],
and![Cond::new(1, ArgLen::Dword, Eq, TCSETS)?],
and![Cond::new(1, ArgLen::Dword, Eq, TCGETS)?],
and![Cond::new(1, ArgLen::Dword, Eq, TIOCGWINSZ)?],
and![Cond::new(1, ArgLen::Dword, Eq, TIOCSPTLCK)?],
and![Cond::new(1, ArgLen::Dword, Eq, TIOCGTPEER)?],
and![Cond::new(1, ArgLen::Dword, Eq, TIOCGWINSZ)?],
and![Cond::new(1, ArgLen::Dword, Eq, TIOCSCTTY)?],
and![Cond::new(1, ArgLen::Dword, Eq, TIOCSPGRP)?],
and![Cond::new(1, ArgLen::Dword, Eq, TIOCSPTLCK)?],
and![Cond::new(1, ArgLen::Dword, Eq, TUNGETFEATURES)?],
and![Cond::new(1, ArgLen::Dword, Eq, TUNGETIFF)?],
and![Cond::new(1, ArgLen::Dword, Eq, TUNSETIFF)?],
@ -367,6 +372,35 @@ fn signal_handler_thread_rules() -> Result<Vec<(i64, Vec<SeccompRule>)>, Backend
])
}
/// Builds the `ioctl` rules used by the pty foreground (SIGWINCH listener)
/// filter: one alternative per permitted request, `TIOCSCTTY` and
/// `TIOCSPGRP`.
fn create_pty_foreground_ioctl_seccomp_rule() -> Result<Vec<SeccompRule>, BackendError> {
Ok(or![
// Each condition compares syscall argument 1 against a request constant.
and![Cond::new(1, ArgLen::Dword, Eq, TIOCSCTTY)?],
and![Cond::new(1, ArgLen::Dword, Eq, TIOCSPGRP)?],
])
}
/// Syscall list for `Thread::PtyForeground`, the filter applied by the
/// SIGWINCH listener subprocess.
///
/// Each entry pairs a syscall number with its argument rules; entries with an
/// empty rule list carry no argument conditions.
fn pty_foreground_thread_rules() -> Result<Vec<(i64, Vec<SeccompRule>)>, BackendError> {
Ok(vec![
(libc::SYS_close, vec![]),
(libc::SYS_exit_group, vec![]),
(libc::SYS_getpgid, vec![]),
#[cfg(target_arch = "x86_64")]
(libc::SYS_getpgrp, vec![]),
// ioctl is restricted to the requests listed in
// create_pty_foreground_ioctl_seccomp_rule.
(libc::SYS_ioctl, create_pty_foreground_ioctl_seccomp_rule()?),
(libc::SYS_munmap, vec![]),
// poll on x86_64, ppoll on aarch64.
#[cfg(target_arch = "x86_64")]
(libc::SYS_poll, vec![]),
#[cfg(target_arch = "aarch64")]
(libc::SYS_ppoll, vec![]),
(libc::SYS_read, vec![]),
(libc::SYS_rt_sigaction, vec![]),
(libc::SYS_rt_sigreturn, vec![]),
(libc::SYS_setsid, vec![]),
(libc::SYS_sigaltstack, vec![]),
(libc::SYS_write, vec![]),
])
}
// The filter containing the white listed syscall rules required by the VMM to
// function.
fn vmm_thread_rules() -> Result<Vec<(i64, Vec<SeccompRule>)>, BackendError> {
@ -381,6 +415,7 @@ fn vmm_thread_rules() -> Result<Vec<(i64, Vec<SeccompRule>)>, BackendError> {
(libc::SYS_clock_gettime, vec![]),
(libc::SYS_clock_nanosleep, vec![]),
(libc::SYS_clone, vec![]),
(libc::SYS_clone3, vec![]),
(libc::SYS_close, vec![]),
(libc::SYS_connect, vec![]),
(libc::SYS_dup, vec![]),
@ -406,6 +441,9 @@ fn vmm_thread_rules() -> Result<Vec<(i64, Vec<SeccompRule>)>, BackendError> {
#[cfg(target_arch = "aarch64")]
(libc::SYS_newfstatat, vec![]),
(libc::SYS_futex, vec![]),
(libc::SYS_getpgid, vec![]),
#[cfg(target_arch = "x86_64")]
(libc::SYS_getpgrp, vec![]),
(libc::SYS_getpid, vec![]),
(libc::SYS_getrandom, vec![]),
(libc::SYS_gettid, vec![]),
@ -431,6 +469,10 @@ fn vmm_thread_rules() -> Result<Vec<(i64, Vec<SeccompRule>)>, BackendError> {
(libc::SYS_open, vec![]),
(libc::SYS_openat, vec![]),
(libc::SYS_pipe2, vec![]),
#[cfg(target_arch = "x86_64")]
(libc::SYS_poll, vec![]),
#[cfg(target_arch = "aarch64")]
(libc::SYS_ppoll, vec![]),
(libc::SYS_prctl, vec![]),
(libc::SYS_pread64, vec![]),
(libc::SYS_preadv, vec![]),
@ -454,6 +496,7 @@ fn vmm_thread_rules() -> Result<Vec<(i64, Vec<SeccompRule>)>, BackendError> {
(libc::SYS_sendto, vec![]),
(libc::SYS_set_robust_list, vec![]),
(libc::SYS_set_tid_address, vec![]),
(libc::SYS_setsid, vec![]),
(libc::SYS_sigaltstack, vec![]),
(
libc::SYS_socket,
@ -615,6 +658,7 @@ fn get_seccomp_rules(thread_type: Thread) -> Result<Vec<(i64, Vec<SeccompRule>)>
Thread::SignalHandler => Ok(signal_handler_thread_rules()?),
Thread::Vcpu => Ok(vcpu_thread_rules()?),
Thread::Vmm => Ok(vmm_thread_rules()?),
Thread::PtyForeground => Ok(pty_foreground_thread_rules()?),
}
}

View File

@ -0,0 +1,135 @@
// Copyright 2021 Alyssa Ross <hi@alyssa.is>
// SPDX-License-Identifier: Apache-2.0
use crate::clone3::{clone3, clone_args, CLONE_CLEAR_SIGHAND};
use libc::{
c_int, c_void, close, getpgrp, ioctl, pipe2, poll, pollfd, setsid, sigemptyset, siginfo_t,
sigprocmask, tcsetpgrp, O_CLOEXEC, POLLERR, SIGWINCH, SIG_SETMASK, STDIN_FILENO, STDOUT_FILENO,
TIOCSCTTY,
};
use seccompiler::{apply_filter, BpfProgram};
use std::cell::RefCell;
use std::fs::File;
use std::io::{self, ErrorKind, Read, Write};
use std::mem::size_of;
use std::mem::MaybeUninit;
use std::os::unix::prelude::*;
use std::process::exit;
use std::ptr::null_mut;
use vmm_sys_util::signal::register_signal_handler;
thread_local! {
// Write end of the notification pipe. It is kept in thread-local
// storage so that the SIGWINCH handler, which cannot be handed any
// extra arguments, can reach it.
static TX: RefCell<Option<File>> = RefCell::new(None);
}
/// Runs `f` with a shared reference to the pipe stored in `TX` and returns
/// whatever `f` returns.
///
/// Panics if `TX` has not been populated yet.
fn with_tx<R, F: FnOnce(&File) -> R>(f: F) -> R {
    TX.with(|slot| {
        let guard = slot.borrow();
        let pipe = guard.as_ref().unwrap();
        f(pipe)
    })
}
// This function has to be safe to call from a signal handler, and
// therefore must not panic.
fn notify() {
if let Err(e) = with_tx(|mut tx| tx.write_all(b"\n")) {
if e.kind() == ErrorKind::BrokenPipe {
exit(0);
}
exit(1);
}
}
/// Signal handler installed for `SIGWINCH`: forwards the event by calling
/// [`notify`]. All three arguments are ignored.
extern "C" fn sigwinch_handler(_signo: c_int, _info: *mut siginfo_t, _unused: *mut c_void) {
notify();
}
/// Replaces the calling thread's signal mask with an empty set, so that no
/// signal is blocked.
///
/// # Errors
///
/// Returns the last OS error if `sigemptyset` or `sigprocmask` reports
/// failure.
fn unblock_all_signals() -> io::Result<()> {
    let mut empty_mask = MaybeUninit::uninit();

    // SAFETY: the pointer refers to writable storage for a signal set.
    let rc = unsafe { sigemptyset(empty_mask.as_mut_ptr()) };
    if rc == -1 {
        return Err(io::Error::last_os_error());
    }

    // SAFETY: `sigemptyset` succeeded, so the set has been initialised.
    let empty_mask = unsafe { empty_mask.assume_init() };

    // SAFETY: `empty_mask` is a valid signal set and the old mask is not
    // requested.
    match unsafe { sigprocmask(SIG_SETMASK, &empty_mask, null_mut()) } {
        -1 => Err(io::Error::last_os_error()),
        _ => Ok(()),
    }
}
/// Body of the SIGWINCH listener subprocess; never returns.
///
/// Stores `tx` in `TX` for the signal handler, closes stdin and stdout,
/// unblocks all signals, applies `seccomp_filter`, installs the `SIGWINCH`
/// handler, and then makes `tty` the controlling terminal of a new session
/// with this process as its foreground process group. It sends one
/// notification to announce readiness, then blocks in `poll` on the pipe and
/// exits with status 0 once `poll` returns.
///
/// Panics (through `unwrap`/`assert`) if any setup step fails or if `poll`
/// reports anything other than `POLLERR`.
fn sigwinch_listener_main(seccomp_filter: BpfProgram, tx: File, tty: &File) -> ! {
// Make the pipe reachable from the signal handler before it can fire.
TX.with(|opt| opt.replace(Some(tx)));
unsafe {
close(STDIN_FILENO);
close(STDOUT_FILENO);
}
// NOTE(review): presumably needed because the signal mask is inherited
// from the thread that created this process -- confirm.
unblock_all_signals().unwrap();
// Everything below runs under the seccomp filter.
apply_filter(&seccomp_filter).unwrap();
register_signal_handler(SIGWINCH, sigwinch_handler).unwrap();
unsafe {
// Create a new session (and therefore a new process group).
assert_ne!(setsid(), -1);
// Set the tty to be this process's controlling terminal.
assert_ne!(ioctl(tty.as_raw_fd(), TIOCSCTTY, 0), -1);
// Become the foreground process group of the tty.
assert_ne!(tcsetpgrp(tty.as_raw_fd(), getpgrp()), -1);
}
// Tell the parent that setup is complete; it waits for this first byte.
notify();
// Wait for the pipe to close, indicating the parent has exited.
with_tx(|tx| {
let mut pollfd = pollfd {
fd: tx.as_raw_fd(),
// No events are requested; only the error condition is of interest.
events: 0,
revents: 0,
};
// Retry when poll fails with Interrupted or WouldBlock; any other
// failure is fatal.
while unsafe { poll(&mut pollfd, 1, -1) } == -1 {
let e = io::Error::last_os_error();
if !matches!(e.kind(), ErrorKind::Interrupted | ErrorKind::WouldBlock) {
panic!("poll: {}", e);
}
}
assert_eq!(pollfd.revents, POLLERR);
});
exit(0);
}
pub fn start_sigwinch_listener(seccomp_filter: BpfProgram, pty: &File) -> io::Result<File> {
let mut pipe = [-1; 2];
if unsafe { pipe2(pipe.as_mut_ptr(), O_CLOEXEC) } == -1 {
return Err(io::Error::last_os_error());
}
let mut rx = unsafe { File::from_raw_fd(pipe[0]) };
let tx = unsafe { File::from_raw_fd(pipe[1]) };
let mut args = clone_args::default();
args.flags |= CLONE_CLEAR_SIGHAND;
match unsafe { clone3(&mut args, size_of::<clone_args>()) } {
-1 => return Err(io::Error::last_os_error()),
0 => {
drop(rx);
sigwinch_listener_main(seccomp_filter, tx, pty);
}
_ => (),
}
drop(tx);
// Wait for a notification indicating readiness.
rx.read_exact(&mut [0])?;
Ok(rx)
}

View File

@ -708,6 +708,7 @@ impl Vm {
activate_evt: EventFd,
serial_pty: Option<PtyPair>,
console_pty: Option<PtyPair>,
console_resize_pipe: Option<File>,
) -> Result<Self> {
#[cfg(feature = "tdx")]
let tdx_enabled = config.lock().unwrap().tdx.is_some();
@ -771,7 +772,7 @@ impl Vm {
.device_manager
.lock()
.unwrap()
.create_devices(serial_pty, console_pty)
.create_devices(serial_pty, console_pty, console_resize_pipe)
.map_err(Error::DeviceManager)?;
Ok(new_vm)
}
@ -1140,6 +1141,10 @@ impl Vm {
self.device_manager.lock().unwrap().console_pty()
}
/// Returns the device manager's console resize pipe, if there is one.
///
/// Panics if the device manager mutex is poisoned.
pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
    let device_manager = self.device_manager.lock().unwrap();
    device_manager.console_resize_pipe()
}
pub fn shutdown(&mut self) -> Result<()> {
let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
let new_state = VmState::Shutdown;