// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Copyright © 2020 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
use hypervisor::HypervisorType;
use seccompiler::{
    BackendError, BpfProgram, Error, SeccompAction, SeccompCmpArgLen as ArgLen, SeccompCmpOp::Eq,
    SeccompCondition as Cond, SeccompFilter, SeccompRule,
};
use std::convert::TryInto;
2020-03-20 16:57:03 +00:00
|
|
|
/// The VMM thread kinds that each get their own seccomp filter.
pub enum Thread {
    /// Thread servicing the external API.
    Api,
    /// Thread running the signal handlers.
    SignalHandler,
    /// A guest vCPU thread.
    Vcpu,
    /// The main VMM thread.
    Vmm,
    /// Helper process placed in the foreground process group of a pty so
    /// it receives SIGWINCH and can forward resize notifications.
    PtyForeground,
}
|
|
|
|
|
2020-03-19 12:26:16 +00:00
|
|
|
/// Shorthand for chaining `SeccompCondition`s with the `and` operator in a `SeccompRule`.
/// The rule will take the `Allow` action if _all_ the conditions are true.
///
/// [`SeccompCondition`]: struct.SeccompCondition.html
/// [`SeccompRule`]: struct.SeccompRule.html
macro_rules! and {
    ($($x:expr),*) => {
        SeccompRule::new(vec![$($x),*]).unwrap()
    };
}
|
|
|
|
|
|
|
|
/// Shorthand for chaining `SeccompRule`s with the `or` operator in a `SeccompFilter`.
///
/// [`SeccompFilter`]: struct.SeccompFilter.html
/// [`SeccompRule`]: struct.SeccompRule.html
// A single arm with an optional trailing comma (`$(,)?`) replaces the two
// near-duplicate arms that only differed in trailing-comma handling.
macro_rules! or {
    ($($x:expr),* $(,)?) => {
        vec![$($x),*]
    };
}
|
|
|
|
|
|
|
|
// Terminal and file-descriptor ioctl request numbers.
// See include/uapi/asm-generic/ioctls.h in the kernel code.
const TCGETS: u64 = 0x5401;
const TCSETS: u64 = 0x5402;
const TIOCSCTTY: u64 = 0x540E;
const TIOCSPGRP: u64 = 0x5410;
const TIOCGWINSZ: u64 = 0x5413;
const TIOCSPTLCK: u64 = 0x4004_5431;
const TIOCGTPEER: u64 = 0x5441;
const FIOCLEX: u64 = 0x5451;
const FIONBIO: u64 = 0x5421;
|
|
|
|
|
2021-12-16 17:31:30 +00:00
|
|
|
// Block-device geometry ioctl request numbers.
// See include/uapi/linux/fs.h in the kernel code.
const BLKSSZGET: u64 = 0x1268;
const BLKPBSZGET: u64 = 0x127b;
const BLKIOMIN: u64 = 0x1278;
const BLKIOOPT: u64 = 0x1279;
|
|
|
|
|
2020-03-19 12:26:16 +00:00
|
|
|
// TUN/TAP device ioctl request numbers.
// See include/uapi/linux/if_tun.h in the kernel code.
const TUNGETIFF: u64 = 0x8004_54d2;
const TUNSETIFF: u64 = 0x4004_54ca;
const TUNSETOFFLOAD: u64 = 0x4004_54d0;
const TUNSETVNETHDRSZ: u64 = 0x4004_54d8;
const TUNGETFEATURES: u64 = 0x8004_54cf;
|
|
|
|
|
|
|
|
// Network interface configuration ioctl request numbers.
// See include/uapi/linux/sockios.h in the kernel code.
const SIOCGIFFLAGS: u64 = 0x8913;
const SIOCGIFHWADDR: u64 = 0x8927;
const SIOCSIFFLAGS: u64 = 0x8914;
const SIOCSIFADDR: u64 = 0x8916;
const SIOCGIFMTU: u64 = 0x8921;
const SIOCSIFMTU: u64 = 0x8922;
const SIOCSIFHWADDR: u64 = 0x8924;
const SIOCSIFNETMASK: u64 = 0x891c;
|
|
|
|
|
|
|
|
// VFIO ioctl request numbers.
// See include/uapi/linux/vfio.h in the kernel code.
const VFIO_GET_API_VERSION: u64 = 0x3b64;
const VFIO_CHECK_EXTENSION: u64 = 0x3b65;
const VFIO_SET_IOMMU: u64 = 0x3b66;
const VFIO_GROUP_GET_STATUS: u64 = 0x3b67;
const VFIO_GROUP_SET_CONTAINER: u64 = 0x3b68;
const VFIO_GROUP_UNSET_CONTAINER: u64 = 0x3b69;
const VFIO_GROUP_GET_DEVICE_FD: u64 = 0x3b6a;
const VFIO_DEVICE_GET_INFO: u64 = 0x3b6b;
const VFIO_DEVICE_GET_REGION_INFO: u64 = 0x3b6c;
const VFIO_DEVICE_GET_IRQ_INFO: u64 = 0x3b6d;
const VFIO_DEVICE_SET_IRQS: u64 = 0x3b6e;
const VFIO_DEVICE_RESET: u64 = 0x3b6f;
const VFIO_IOMMU_MAP_DMA: u64 = 0x3b71;
const VFIO_IOMMU_UNMAP_DMA: u64 = 0x3b72;
const VFIO_DEVICE_IOEVENTFD: u64 = 0x3b74;
|
|
|
|
|
2022-03-11 11:31:25 +00:00
|
|
|
// vhost and vhost-vDPA ioctl request numbers.
// See include/uapi/linux/vhost.h in the kernel code
const VHOST_GET_FEATURES: u64 = 0x8008af00;
const VHOST_SET_FEATURES: u64 = 0x4008af00;
const VHOST_SET_OWNER: u64 = 0xaf01;
const VHOST_SET_VRING_NUM: u64 = 0x4008af10;
const VHOST_SET_VRING_ADDR: u64 = 0x4028af11;
const VHOST_SET_VRING_BASE: u64 = 0x4008af12;
const VHOST_SET_VRING_KICK: u64 = 0x4008af20;
const VHOST_SET_VRING_CALL: u64 = 0x4008af21;
const VHOST_SET_BACKEND_FEATURES: u64 = 0x4008af25;
const VHOST_GET_BACKEND_FEATURES: u64 = 0x8008af26;
const VHOST_VDPA_GET_DEVICE_ID: u64 = 0x8004af70;
const VHOST_VDPA_GET_STATUS: u64 = 0x8001af71;
const VHOST_VDPA_SET_STATUS: u64 = 0x4001af72;
const VHOST_VDPA_GET_CONFIG: u64 = 0x8008af73;
const VHOST_VDPA_SET_CONFIG: u64 = 0x4008af74;
const VHOST_VDPA_SET_VRING_ENABLE: u64 = 0x4008af75;
const VHOST_VDPA_GET_VRING_NUM: u64 = 0x8002af76;
const VHOST_VDPA_SET_CONFIG_CALL: u64 = 0x4004af77;
const VHOST_VDPA_GET_IOVA_RANGE: u64 = 0x8010af78;
const VHOST_VDPA_GET_CONFIG_SIZE: u64 = 0x8004af79;
const VHOST_VDPA_SUSPEND: u64 = 0xaf7d;
|
2022-03-11 11:31:25 +00:00
|
|
|
|
2020-09-09 22:15:26 +00:00
|
|
|
// KVM ioctl request numbers.
// See include/uapi/linux/kvm.h in the kernel code.
#[cfg(feature = "kvm")]
mod kvm {
    pub const KVM_GET_API_VERSION: u64 = 0xae00;
    pub const KVM_CREATE_VM: u64 = 0xae01;
    pub const KVM_CHECK_EXTENSION: u64 = 0xae03;
    pub const KVM_GET_VCPU_MMAP_SIZE: u64 = 0xae04;
    pub const KVM_CREATE_VCPU: u64 = 0xae41;
    pub const KVM_CREATE_IRQCHIP: u64 = 0xae60;
    pub const KVM_RUN: u64 = 0xae80;
    pub const KVM_SET_MP_STATE: u64 = 0x4004_ae99;
    pub const KVM_SET_GSI_ROUTING: u64 = 0x4008_ae6a;
    pub const KVM_SET_DEVICE_ATTR: u64 = 0x4018_aee1;
    pub const KVM_HAS_DEVICE_ATTR: u64 = 0x4018_aee3;
    pub const KVM_SET_ONE_REG: u64 = 0x4010_aeac;
    pub const KVM_SET_USER_MEMORY_REGION: u64 = 0x4020_ae46;
    pub const KVM_IRQFD: u64 = 0x4020_ae76;
    pub const KVM_IOEVENTFD: u64 = 0x4040_ae79;
    pub const KVM_SET_VCPU_EVENTS: u64 = 0x4040_aea0;
    pub const KVM_ENABLE_CAP: u64 = 0x4068_aea3;
    pub const KVM_SET_REGS: u64 = 0x4090_ae82;
    pub const KVM_GET_MP_STATE: u64 = 0x8004_ae98;
    pub const KVM_GET_DEVICE_ATTR: u64 = 0x4018_aee2;
    pub const KVM_GET_DIRTY_LOG: u64 = 0x4010_ae42;
    pub const KVM_GET_VCPU_EVENTS: u64 = 0x8040_ae9f;
    pub const KVM_GET_ONE_REG: u64 = 0x4010_aeab;
    pub const KVM_GET_REGS: u64 = 0x8090_ae81;
    pub const KVM_GET_SUPPORTED_CPUID: u64 = 0xc008_ae05;
    pub const KVM_CREATE_DEVICE: u64 = 0xc00c_aee0;
    pub const KVM_GET_REG_LIST: u64 = 0xc008_aeb0;
    pub const KVM_MEMORY_ENCRYPT_OP: u64 = 0xc008_aeba;
}

#[cfg(feature = "kvm")]
use kvm::*;
|
2020-07-06 05:37:14 +00:00
|
|
|
|
2021-07-02 15:40:20 +00:00
|
|
|
// MSHV IOCTL code. This is unstable until the kernel code has been declared stable.
#[cfg(feature = "mshv")]
mod mshv {
    pub const MSHV_GET_API_VERSION: u64 = 0xb800;
    pub const MSHV_CREATE_VM: u64 = 0x4028_b801;
    pub const MSHV_MAP_GUEST_MEMORY: u64 = 0x4020_b802;
    pub const MSHV_UNMAP_GUEST_MEMORY: u64 = 0x4020_b803;
    pub const MSHV_CREATE_VP: u64 = 0x4004_b804;
    pub const MSHV_IRQFD: u64 = 0x4010_b80e;
    pub const MSHV_IOEVENTFD: u64 = 0x4020_b80f;
    pub const MSHV_SET_MSI_ROUTING: u64 = 0x4008_b811;
    pub const MSHV_GET_VP_REGISTERS: u64 = 0xc010_b805;
    pub const MSHV_SET_VP_REGISTERS: u64 = 0x4010_b806;
    pub const MSHV_RUN_VP: u64 = 0x8100_b807;
    pub const MSHV_GET_VP_STATE: u64 = 0xc028_b80a;
    pub const MSHV_SET_VP_STATE: u64 = 0xc028_b80b;
    pub const MSHV_SET_PARTITION_PROPERTY: u64 = 0x4010_b80c;
    pub const MSHV_GET_GPA_ACCESS_STATES: u64 = 0xc01c_b812;
    pub const MSHV_VP_TRANSLATE_GVA: u64 = 0xc020_b80e;
    pub const MSHV_CREATE_PARTITION: u64 = 0x4030_b801;
}

#[cfg(feature = "mshv")]
use mshv::*;
|
|
|
|
|
|
|
|
#[cfg(feature = "mshv")]
// Builds the list of MSHV ioctl request numbers the VMM thread is allowed
// to issue; each rule matches on the second `ioctl(2)` argument.
fn create_vmm_ioctl_seccomp_rule_common_mshv() -> Result<Vec<SeccompRule>, BackendError> {
    Ok(or![
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_GET_API_VERSION)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_CREATE_VM)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_MAP_GUEST_MEMORY)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_UNMAP_GUEST_MEMORY)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_CREATE_VP)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_IRQFD)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_IOEVENTFD)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_SET_MSI_ROUTING)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_GET_VP_REGISTERS)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_SET_VP_REGISTERS)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_RUN_VP)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_GET_VP_STATE)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_SET_VP_STATE)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_SET_PARTITION_PROPERTY)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_GET_GPA_ACCESS_STATES)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_VP_TRANSLATE_GVA)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_CREATE_PARTITION)?],
    ])
}
|
|
|
|
|
2021-07-02 14:39:56 +00:00
|
|
|
#[cfg(feature = "kvm")]
// Builds the list of architecture-independent KVM ioctl request numbers the
// VMM thread is allowed to issue; each rule matches on the second `ioctl(2)`
// argument.
fn create_vmm_ioctl_seccomp_rule_common_kvm() -> Result<Vec<SeccompRule>, BackendError> {
    Ok(or![
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_CHECK_EXTENSION)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_CREATE_DEVICE)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_CREATE_IRQCHIP)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_CREATE_VCPU)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_CREATE_VM)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_ENABLE_CAP)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_API_VERSION)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_DEVICE_ATTR)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_DIRTY_LOG)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_MP_STATE)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_ONE_REG)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_REGS)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_REG_LIST)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_SUPPORTED_CPUID)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_VCPU_EVENTS)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_VCPU_MMAP_SIZE)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_IOEVENTFD)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_IRQFD)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_RUN)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_MEMORY_ENCRYPT_OP)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_DEVICE_ATTR)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_HAS_DEVICE_ATTR)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_GSI_ROUTING)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_MP_STATE)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_ONE_REG)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_REGS)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_USER_MEMORY_REGION)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_VCPU_EVENTS)?],
    ])
}
|
|
|
|
|
2022-07-20 22:51:15 +00:00
|
|
|
fn create_vmm_ioctl_seccomp_rule_hypervisor(
|
|
|
|
hypervisor_type: HypervisorType,
|
|
|
|
) -> Result<Vec<SeccompRule>, BackendError> {
|
|
|
|
match hypervisor_type {
|
|
|
|
#[cfg(feature = "kvm")]
|
|
|
|
HypervisorType::Kvm => create_vmm_ioctl_seccomp_rule_common_kvm(),
|
|
|
|
#[cfg(feature = "mshv")]
|
|
|
|
HypervisorType::Mshv => create_vmm_ioctl_seccomp_rule_common_mshv(),
|
|
|
|
#[allow(unreachable_patterns)]
|
|
|
|
_ => panic!("Invalid hypervisor {:?}", hypervisor_type),
|
|
|
|
}
|
2021-07-02 14:39:56 +00:00
|
|
|
}
|
|
|
|
|
2022-07-20 22:51:15 +00:00
|
|
|
fn create_vmm_ioctl_seccomp_rule_common(
|
|
|
|
hypervisor_type: HypervisorType,
|
|
|
|
) -> Result<Vec<SeccompRule>, BackendError> {
|
2021-07-02 14:39:56 +00:00
|
|
|
let mut common_rules = or![
|
2021-12-16 17:31:30 +00:00
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, BLKSSZGET)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, BLKPBSZGET)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, BLKIOMIN)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, BLKIOOPT)?],
|
2021-08-17 03:40:11 +00:00
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, FIOCLEX)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, FIONBIO)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, SIOCGIFFLAGS)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, SIOCGIFHWADDR)?],
|
2022-09-21 09:56:24 +00:00
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, SIOCGIFMTU)?],
|
2021-08-17 03:40:11 +00:00
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, SIOCSIFADDR)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, SIOCSIFFLAGS)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, SIOCSIFHWADDR)?],
|
2022-09-21 09:56:24 +00:00
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, SIOCSIFMTU)?],
|
2021-08-17 03:40:11 +00:00
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, SIOCSIFNETMASK)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, TCSETS)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, TCGETS)?],
|
vmm: notify virtio-console of pty resizes
When a pty is resized (using the TIOCSWINSZ ioctl -- see ioctl_tty(2)),
the kernel will send a SIGWINCH signal to the pty's foreground process
group to notify it of the resize. This is the only way to be notified
by the kernel of a pty resize.
We can't just make the cloud-hypervisor process's process group the
foreground process group though, because a process can only set the
foreground process group of its controlling terminal, and
cloud-hypervisor's controlling terminal will often be the terminal the
user is running it in. To work around this, we fork a subprocess in a
new process group, and set its process group to be the foreground
process group of the pty. The subprocess additionally must be running
in a new session so that it can have a different controlling
terminal. This subprocess writes a byte to a pipe every time the pty
is resized, and the virtio-console device can listen for this in its
epoll loop.
Alternatives I considered were to have the subprocess just send
SIGWINCH to its parent, and to use an eventfd instead of a pipe.
I decided against the signal approach because re-purposing a signal
that has a very specific meaning (even if this use was only slightly
different to its normal meaning) felt unclean, and because it would
have required using pidfds to avoid race conditions if
cloud-hypervisor had terminated, which added complexity. I decided
against using an eventfd because using a pipe instead allows the child
to be notified (via poll(2)) when nothing is reading from the pipe any
more, meaning it can be reliably notified of parent death and
terminate itself immediately.
I used clone3(2) instead of fork(2) because without
CLONE_CLEAR_SIGHAND the subprocess would inherit signal-hook's signal
handlers, and there's no other straightforward way to restore all signal
handlers to their defaults in the child process. The only way to do
it would be to iterate through all possible signals, or maintain a
global list of monitored signals ourselves (vmm:vm::HANDLED_SIGNALS is
insufficient because it doesn't take into account e.g. the SIGSYS
signal handler that catches seccomp violations).
Signed-off-by: Alyssa Ross <hi@alyssa.is>
2021-09-10 11:12:17 +00:00
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, TIOCGTPEER)?],
|
2021-08-17 03:40:11 +00:00
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, TIOCGWINSZ)?],
|
vmm: notify virtio-console of pty resizes
When a pty is resized (using the TIOCSWINSZ ioctl -- see ioctl_tty(2)),
the kernel will send a SIGWINCH signal to the pty's foreground process
group to notify it of the resize. This is the only way to be notified
by the kernel of a pty resize.
We can't just make the cloud-hypervisor process's process group the
foreground process group though, because a process can only set the
foreground process group of its controlling terminal, and
cloud-hypervisor's controlling terminal will often be the terminal the
user is running it in. To work around this, we fork a subprocess in a
new process group, and set its process group to be the foreground
process group of the pty. The subprocess additionally must be running
in a new session so that it can have a different controlling
terminal. This subprocess writes a byte to a pipe every time the pty
is resized, and the virtio-console device can listen for this in its
epoll loop.
Alternatives I considered were to have the subprocess just send
SIGWINCH to its parent, and to use an eventfd instead of a pipe.
I decided against the signal approach because re-purposing a signal
that has a very specific meaning (even if this use was only slightly
different to its normal meaning) felt unclean, and because it would
have required using pidfds to avoid race conditions if
cloud-hypervisor had terminated, which added complexity. I decided
against using an eventfd because using a pipe instead allows the child
to be notified (via poll(2)) when nothing is reading from the pipe any
more, meaning it can be reliably notified of parent death and
terminate itself immediately.
I used clone3(2) instead of fork(2) because without
CLONE_CLEAR_SIGHAND the subprocess would inherit signal-hook's signal
handlers, and there's no other straightforward way to restore all signal
handlers to their defaults in the child process. The only way to do
it would be to iterate through all possible signals, or maintain a
global list of monitored signals ourselves (vmm:vm::HANDLED_SIGNALS is
insufficient because it doesn't take into account e.g. the SIGSYS
signal handler that catches seccomp violations).
Signed-off-by: Alyssa Ross <hi@alyssa.is>
2021-09-10 11:12:17 +00:00
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, TIOCSCTTY)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, TIOCSPGRP)?],
|
2021-08-17 03:40:11 +00:00
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, TIOCSPTLCK)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, TUNGETFEATURES)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, TUNGETIFF)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, TUNSETIFF)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, TUNSETOFFLOAD)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, TUNSETVNETHDRSZ)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VFIO_GET_API_VERSION)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VFIO_CHECK_EXTENSION)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VFIO_SET_IOMMU)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VFIO_GROUP_GET_STATUS)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VFIO_GROUP_SET_CONTAINER)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VFIO_GROUP_UNSET_CONTAINER)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VFIO_GROUP_GET_DEVICE_FD)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VFIO_DEVICE_GET_INFO)?],
|
2020-03-19 12:26:16 +00:00
|
|
|
and![Cond::new(
|
|
|
|
1,
|
2021-08-17 03:40:11 +00:00
|
|
|
ArgLen::Dword,
|
2020-03-19 12:26:16 +00:00
|
|
|
Eq,
|
|
|
|
VFIO_DEVICE_GET_REGION_INFO
|
|
|
|
)?],
|
2021-08-17 03:40:11 +00:00
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VFIO_DEVICE_GET_IRQ_INFO)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VFIO_DEVICE_SET_IRQS)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VFIO_DEVICE_RESET)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VFIO_IOMMU_MAP_DMA)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VFIO_IOMMU_UNMAP_DMA)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VFIO_DEVICE_IOEVENTFD)?],
|
2022-03-11 11:31:25 +00:00
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VHOST_GET_FEATURES)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VHOST_SET_FEATURES)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VHOST_SET_OWNER)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VHOST_SET_VRING_NUM)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VHOST_SET_VRING_ADDR)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VHOST_SET_VRING_BASE)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VHOST_SET_VRING_KICK)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VHOST_SET_VRING_CALL)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VHOST_SET_BACKEND_FEATURES)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VHOST_GET_BACKEND_FEATURES)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_GET_DEVICE_ID)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_GET_STATUS)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_SET_STATUS)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_GET_CONFIG)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_SET_CONFIG)?],
|
|
|
|
and![Cond::new(
|
|
|
|
1,
|
|
|
|
ArgLen::Dword,
|
|
|
|
Eq,
|
|
|
|
VHOST_VDPA_SET_VRING_ENABLE
|
|
|
|
)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_GET_VRING_NUM)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_SET_CONFIG_CALL)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_GET_IOVA_RANGE)?],
|
2022-10-11 13:59:38 +00:00
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_GET_CONFIG_SIZE)?],
|
|
|
|
and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_SUSPEND)?],
|
2021-07-02 14:39:56 +00:00
|
|
|
];
|
|
|
|
|
2022-07-20 22:51:15 +00:00
|
|
|
let hypervisor_rules = create_vmm_ioctl_seccomp_rule_hypervisor(hypervisor_type)?;
|
2021-07-02 14:39:56 +00:00
|
|
|
|
|
|
|
common_rules.extend(hypervisor_rules);
|
|
|
|
|
|
|
|
Ok(common_rules)
|
2020-03-19 12:26:16 +00:00
|
|
|
}
|
|
|
|
|
2021-07-02 14:39:56 +00:00
|
|
|
#[cfg(all(target_arch = "x86_64", feature = "kvm"))]
// VMM-thread ioctl allow-list for KVM on x86_64: the x86-specific KVM
// requests below plus the common rules.
fn create_vmm_ioctl_seccomp_rule_kvm() -> Result<Vec<SeccompRule>, BackendError> {
    // x86_64-only KVM ioctl request numbers.
    const KVM_CREATE_PIT2: u64 = 0x4040_ae77;
    const KVM_GET_CLOCK: u64 = 0x8030_ae7c;
    const KVM_GET_CPUID2: u64 = 0xc008_ae91;
    const KVM_GET_FPU: u64 = 0x81a0_ae8c;
    const KVM_GET_LAPIC: u64 = 0x8400_ae8e;
    const KVM_GET_MSR_INDEX_LIST: u64 = 0xc004_ae02;
    const KVM_GET_MSRS: u64 = 0xc008_ae88;
    const KVM_GET_SREGS: u64 = 0x8138_ae83;
    const KVM_GET_XCRS: u64 = 0x8188_aea6;
    const KVM_GET_XSAVE: u64 = 0x9000_aea4;
    const KVM_KVMCLOCK_CTRL: u64 = 0xaead;
    const KVM_SET_CLOCK: u64 = 0x4030_ae7b;
    const KVM_SET_CPUID2: u64 = 0x4008_ae90;
    const KVM_SET_FPU: u64 = 0x41a0_ae8d;
    const KVM_SET_IDENTITY_MAP_ADDR: u64 = 0x4008_ae48;
    const KVM_SET_LAPIC: u64 = 0x4400_ae8f;
    const KVM_SET_MSRS: u64 = 0x4008_ae89;
    const KVM_SET_SREGS: u64 = 0x4138_ae84;
    const KVM_SET_TSS_ADDR: u64 = 0xae47;
    const KVM_SET_XCRS: u64 = 0x4188_aea7;
    const KVM_SET_XSAVE: u64 = 0x5000_aea5;
    const KVM_SET_GUEST_DEBUG: u64 = 0x4048_ae9b;
    const KVM_TRANSLATE: u64 = 0xc018_ae85;

    let common_rules = create_vmm_ioctl_seccomp_rule_common(HypervisorType::Kvm)?;
    let mut arch_rules = or![
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_CREATE_PIT2)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_CLOCK)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_CPUID2)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_FPU)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_LAPIC)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_MSR_INDEX_LIST)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_MSRS)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_SREGS)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_XCRS)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_XSAVE)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_KVMCLOCK_CTRL)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_CLOCK)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_CPUID2)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_FPU)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_IDENTITY_MAP_ADDR)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_LAPIC)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_SREGS)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_TSS_ADDR)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_MSRS)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_XCRS)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_XSAVE)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_GUEST_DEBUG)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_TRANSLATE)?],
    ];
    arch_rules.extend(common_rules);

    Ok(arch_rules)
}
|
|
|
|
|
2021-07-02 14:39:56 +00:00
|
|
|
#[cfg(all(target_arch = "aarch64", feature = "kvm"))]
// VMM-thread ioctl allow-list for KVM on aarch64: the arm-specific KVM
// requests below plus the common rules.
fn create_vmm_ioctl_seccomp_rule_kvm() -> Result<Vec<SeccompRule>, BackendError> {
    // aarch64-only KVM ioctl request numbers.
    const KVM_ARM_PREFERRED_TARGET: u64 = 0x8020_aeaf;
    const KVM_ARM_VCPU_INIT: u64 = 0x4020_aeae;
    const KVM_SET_GUEST_DEBUG: u64 = 0x4208_ae9b;

    let common_rules = create_vmm_ioctl_seccomp_rule_common(HypervisorType::Kvm)?;
    let mut arch_rules = or![
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_ARM_PREFERRED_TARGET)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_ARM_VCPU_INIT)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_GUEST_DEBUG)?],
    ];
    arch_rules.extend(common_rules);

    Ok(arch_rules)
}
|
|
|
|
|
2021-07-02 15:40:20 +00:00
|
|
|
#[cfg(all(target_arch = "x86_64", feature = "mshv"))]
// VMM-thread ioctl allow-list for MSHV; there are no extra arch-specific
// requests beyond the common set.
fn create_vmm_ioctl_seccomp_rule_mshv() -> Result<Vec<SeccompRule>, BackendError> {
    create_vmm_ioctl_seccomp_rule_common(HypervisorType::Mshv)
}
|
|
|
|
|
2022-07-20 22:51:15 +00:00
|
|
|
/// Dispatch to the hypervisor-specific VMM-thread ioctl rules.
///
/// Exactly one hypervisor backend is expected to be compiled in; the
/// fallback arm is reachable only when the matching feature is compiled
/// out, which is a build/configuration error, hence the panic.
fn create_vmm_ioctl_seccomp_rule(
    hypervisor_type: HypervisorType,
) -> Result<Vec<SeccompRule>, BackendError> {
    match hypervisor_type {
        #[cfg(feature = "kvm")]
        HypervisorType::Kvm => create_vmm_ioctl_seccomp_rule_kvm(),
        #[cfg(feature = "mshv")]
        HypervisorType::Mshv => create_vmm_ioctl_seccomp_rule_mshv(),
        #[allow(unreachable_patterns)]
        _ => panic!("Invalid hypervisor {:?}", hypervisor_type),
    }
}
|
|
|
|
|
2021-08-17 03:40:11 +00:00
|
|
|
/// ioctl rules for the HTTP API thread.
///
/// Only `FIONBIO` (toggling non-blocking mode on a file descriptor) is
/// permitted; any other ioctl request from this thread trips the filter.
fn create_api_ioctl_seccomp_rule() -> Result<Vec<SeccompRule>, BackendError> {
    // Argument 1 of ioctl(2) is the request code.
    Ok(or![and![Cond::new(1, ArgLen::Dword, Eq, FIONBIO)?]])
}
|
|
|
|
|
2021-08-17 03:40:11 +00:00
|
|
|
/// ioctl rules for the signal-handler thread.
///
/// Limited to terminal ioctls: TCGETS/TCSETS (get/set tty attributes)
/// and TIOCGWINSZ (read the terminal window size).
fn create_signal_handler_ioctl_seccomp_rule() -> Result<Vec<SeccompRule>, BackendError> {
    Ok(or![
        and![Cond::new(1, ArgLen::Dword, Eq, TCGETS)?],
        and![Cond::new(1, ArgLen::Dword, Eq, TCSETS)?],
        and![Cond::new(1, ArgLen::Dword, Eq, TIOCGWINSZ)?],
    ])
}
|
|
|
|
|
2021-08-17 03:40:11 +00:00
|
|
|
/// Syscall allow-list for the dedicated signal-handler thread.
///
/// An empty rule vector means the syscall is allowed with any arguments;
/// `ioctl` is restricted to the tty requests returned by
/// `create_signal_handler_ioctl_seccomp_rule`.
fn signal_handler_thread_rules() -> Result<Vec<(i64, Vec<SeccompRule>)>, BackendError> {
    Ok(vec![
        (libc::SYS_brk, vec![]),
        (libc::SYS_close, vec![]),
        (libc::SYS_exit, vec![]),
        (libc::SYS_exit_group, vec![]),
        (libc::SYS_futex, vec![]),
        (libc::SYS_ioctl, create_signal_handler_ioctl_seccomp_rule()?),
        (libc::SYS_madvise, vec![]),
        (libc::SYS_mmap, vec![]),
        (libc::SYS_munmap, vec![]),
        (libc::SYS_recvfrom, vec![]),
        (libc::SYS_rt_sigprocmask, vec![]),
        // Signal handlers may actually run on this thread, so returning
        // from one (rt_sigreturn) must be permitted.
        (libc::SYS_rt_sigreturn, vec![]),
        (libc::SYS_sendto, vec![]),
        (libc::SYS_sigaltstack, vec![]),
        (libc::SYS_write, vec![]),
    ])
}
|
|
|
|
|
vmm: notify virtio-console of pty resizes
When a pty is resized (using the TIOCSWINSZ ioctl -- see ioctl_tty(2)),
the kernel will send a SIGWINCH signal to the pty's foreground process
group to notify it of the resize. This is the only way to be notified
by the kernel of a pty resize.
We can't just make the cloud-hypervisor process's process group the
foreground process group though, because a process can only set the
foreground process group of its controlling terminal, and
cloud-hypervisor's controlling terminal will often be the terminal the
user is running it in. To work around this, we fork a subprocess in a
new process group, and set its process group to be the foreground
process group of the pty. The subprocess additionally must be running
in a new session so that it can have a different controlling
terminal. This subprocess writes a byte to a pipe every time the pty
is resized, and the virtio-console device can listen for this in its
epoll loop.
Alternatives I considered were to have the subprocess just send
SIGWINCH to its parent, and to use an eventfd instead of a pipe.
I decided against the signal approach because re-purposing a signal
that has a very specific meaning (even if this use was only slightly
different to its normal meaning) felt unclean, and because it would
have required using pidfds to avoid race conditions if
cloud-hypervisor had terminated, which added complexity. I decided
against using an eventfd because using a pipe instead allows the child
to be notified (via poll(2)) when nothing is reading from the pipe any
more, meaning it can be reliably notified of parent death and
terminate itself immediately.
I used clone3(2) instead of fork(2) because without
CLONE_CLEAR_SIGHAND the subprocess would inherit signal-hook's signal
handlers, and there's no other straightforward way to restore all signal
handlers to their defaults in the child process. The only way to do
it would be to iterate through all possible signals, or maintain a
global list of monitored signals ourselves (vmm:vm::HANDLED_SIGNALS is
insufficient because it doesn't take into account e.g. the SIGSYS
signal handler that catches seccomp violations).
Signed-off-by: Alyssa Ross <hi@alyssa.is>
2021-09-10 11:12:17 +00:00
|
|
|
/// ioctl rules for the pty "foreground" helper subprocess.
///
/// Limited to TIOCSCTTY (acquire the pty as controlling terminal) and
/// TIOCSPGRP (set the pty's foreground process group).
fn create_pty_foreground_ioctl_seccomp_rule() -> Result<Vec<SeccompRule>, BackendError> {
    Ok(or![
        and![Cond::new(1, ArgLen::Dword, Eq, TIOCSCTTY)?],
        and![Cond::new(1, ArgLen::Dword, Eq, TIOCSPGRP)?],
    ])
}
|
|
|
|
|
|
|
|
/// Syscall allow-list for the pty "foreground" helper subprocess.
///
/// This child only needs enough syscalls to become the pty's foreground
/// process group, poll for resize notifications, and exit; the allowed
/// ioctls come from `create_pty_foreground_ioctl_seccomp_rule`.
fn pty_foreground_thread_rules() -> Result<Vec<(i64, Vec<SeccompRule>)>, BackendError> {
    Ok(vec![
        (libc::SYS_close, vec![]),
        (libc::SYS_exit_group, vec![]),
        (libc::SYS_getpgid, vec![]),
        // getpgrp is only allowed (and only needed) on x86_64 builds.
        #[cfg(target_arch = "x86_64")]
        (libc::SYS_getpgrp, vec![]),
        (libc::SYS_ioctl, create_pty_foreground_ioctl_seccomp_rule()?),
        (libc::SYS_munmap, vec![]),
        // poll(2) on x86_64; aarch64 only has ppoll(2).
        #[cfg(target_arch = "x86_64")]
        (libc::SYS_poll, vec![]),
        #[cfg(target_arch = "aarch64")]
        (libc::SYS_ppoll, vec![]),
        (libc::SYS_read, vec![]),
        (libc::SYS_rt_sigaction, vec![]),
        (libc::SYS_rt_sigreturn, vec![]),
        (libc::SYS_setsid, vec![]),
        (libc::SYS_sigaltstack, vec![]),
        (libc::SYS_write, vec![]),
    ])
}
|
|
|
|
|
2020-08-04 02:39:07 +00:00
|
|
|
// The filter containing the white listed syscall rules required by the VMM to
|
|
|
|
// function.
|
2022-07-20 22:51:15 +00:00
|
|
|
fn vmm_thread_rules(
|
|
|
|
hypervisor_type: HypervisorType,
|
|
|
|
) -> Result<Vec<(i64, Vec<SeccompRule>)>, BackendError> {
|
2020-08-04 02:39:07 +00:00
|
|
|
Ok(vec![
|
2021-08-17 03:40:11 +00:00
|
|
|
(libc::SYS_accept4, vec![]),
|
2020-08-04 02:39:07 +00:00
|
|
|
#[cfg(target_arch = "x86_64")]
|
2021-08-17 03:40:11 +00:00
|
|
|
(libc::SYS_access, vec![]),
|
|
|
|
(libc::SYS_bind, vec![]),
|
|
|
|
(libc::SYS_brk, vec![]),
|
|
|
|
(libc::SYS_clock_gettime, vec![]),
|
|
|
|
(libc::SYS_clock_nanosleep, vec![]),
|
|
|
|
(libc::SYS_clone, vec![]),
|
vmm: notify virtio-console of pty resizes
When a pty is resized (using the TIOCSWINSZ ioctl -- see ioctl_tty(2)),
the kernel will send a SIGWINCH signal to the pty's foreground process
group to notify it of the resize. This is the only way to be notified
by the kernel of a pty resize.
We can't just make the cloud-hypervisor process's process group the
foreground process group though, because a process can only set the
foreground process group of its controlling terminal, and
cloud-hypervisor's controlling terminal will often be the terminal the
user is running it in. To work around this, we fork a subprocess in a
new process group, and set its process group to be the foreground
process group of the pty. The subprocess additionally must be running
in a new session so that it can have a different controlling
terminal. This subprocess writes a byte to a pipe every time the pty
is resized, and the virtio-console device can listen for this in its
epoll loop.
Alternatives I considered were to have the subprocess just send
SIGWINCH to its parent, and to use an eventfd instead of a pipe.
I decided against the signal approach because re-purposing a signal
that has a very specific meaning (even if this use was only slightly
different to its normal meaning) felt unclean, and because it would
have required using pidfds to avoid race conditions if
cloud-hypervisor had terminated, which added complexity. I decided
against using an eventfd because using a pipe instead allows the child
to be notified (via poll(2)) when nothing is reading from the pipe any
more, meaning it can be reliably notified of parent death and
terminate itself immediately.
I used clone3(2) instead of fork(2) because without
CLONE_CLEAR_SIGHAND the subprocess would inherit signal-hook's signal
handlers, and there's no other straightforward way to restore all signal
handlers to their defaults in the child process. The only way to do
it would be to iterate through all possible signals, or maintain a
global list of monitored signals ourselves (vmm:vm::HANDLED_SIGNALS is
insufficient because it doesn't take into account e.g. the SIGSYS
signal handler that catches seccomp violations).
Signed-off-by: Alyssa Ross <hi@alyssa.is>
2021-09-10 11:12:17 +00:00
|
|
|
(libc::SYS_clone3, vec![]),
|
2021-08-17 03:40:11 +00:00
|
|
|
(libc::SYS_close, vec![]),
|
|
|
|
(libc::SYS_connect, vec![]),
|
|
|
|
(libc::SYS_dup, vec![]),
|
|
|
|
(libc::SYS_epoll_create1, vec![]),
|
|
|
|
(libc::SYS_epoll_ctl, vec![]),
|
|
|
|
(libc::SYS_epoll_pwait, vec![]),
|
2020-08-04 02:39:07 +00:00
|
|
|
#[cfg(target_arch = "x86_64")]
|
2021-08-17 03:40:11 +00:00
|
|
|
(libc::SYS_epoll_wait, vec![]),
|
|
|
|
(libc::SYS_eventfd2, vec![]),
|
|
|
|
(libc::SYS_exit, vec![]),
|
|
|
|
(libc::SYS_exit_group, vec![]),
|
|
|
|
(libc::SYS_fallocate, vec![]),
|
|
|
|
(libc::SYS_fcntl, vec![]),
|
|
|
|
(libc::SYS_fdatasync, vec![]),
|
|
|
|
(libc::SYS_fstat, vec![]),
|
|
|
|
(libc::SYS_fsync, vec![]),
|
2021-08-17 23:49:07 +00:00
|
|
|
(libc::SYS_ftruncate, vec![]),
|
2020-08-26 04:56:51 +00:00
|
|
|
#[cfg(target_arch = "aarch64")]
|
2021-08-17 03:40:11 +00:00
|
|
|
(libc::SYS_faccessat, vec![]),
|
2020-08-26 04:56:51 +00:00
|
|
|
#[cfg(target_arch = "aarch64")]
|
2021-08-17 03:40:11 +00:00
|
|
|
(libc::SYS_newfstatat, vec![]),
|
|
|
|
(libc::SYS_futex, vec![]),
|
vmm: notify virtio-console of pty resizes
When a pty is resized (using the TIOCSWINSZ ioctl -- see ioctl_tty(2)),
the kernel will send a SIGWINCH signal to the pty's foreground process
group to notify it of the resize. This is the only way to be notified
by the kernel of a pty resize.
We can't just make the cloud-hypervisor process's process group the
foreground process group though, because a process can only set the
foreground process group of its controlling terminal, and
cloud-hypervisor's controlling terminal will often be the terminal the
user is running it in. To work around this, we fork a subprocess in a
new process group, and set its process group to be the foreground
process group of the pty. The subprocess additionally must be running
in a new session so that it can have a different controlling
terminal. This subprocess writes a byte to a pipe every time the pty
is resized, and the virtio-console device can listen for this in its
epoll loop.
Alternatives I considered were to have the subprocess just send
SIGWINCH to its parent, and to use an eventfd instead of a pipe.
I decided against the signal approach because re-purposing a signal
that has a very specific meaning (even if this use was only slightly
different to its normal meaning) felt unclean, and because it would
have required using pidfds to avoid race conditions if
cloud-hypervisor had terminated, which added complexity. I decided
against using an eventfd because using a pipe instead allows the child
to be notified (via poll(2)) when nothing is reading from the pipe any
more, meaning it can be reliably notified of parent death and
terminate itself immediately.
I used clone3(2) instead of fork(2) because without
CLONE_CLEAR_SIGHAND the subprocess would inherit signal-hook's signal
handlers, and there's no other straightforward way to restore all signal
handlers to their defaults in the child process. The only way to do
it would be to iterate through all possible signals, or maintain a
global list of monitored signals ourselves (vmm:vm::HANDLED_SIGNALS is
insufficient because it doesn't take into account e.g. the SIGSYS
signal handler that catches seccomp violations).
Signed-off-by: Alyssa Ross <hi@alyssa.is>
2021-09-10 11:12:17 +00:00
|
|
|
(libc::SYS_getpgid, vec![]),
|
|
|
|
#[cfg(target_arch = "x86_64")]
|
|
|
|
(libc::SYS_getpgrp, vec![]),
|
2021-08-17 03:40:11 +00:00
|
|
|
(libc::SYS_getpid, vec![]),
|
|
|
|
(libc::SYS_getrandom, vec![]),
|
|
|
|
(libc::SYS_gettid, vec![]),
|
|
|
|
(libc::SYS_gettimeofday, vec![]),
|
|
|
|
(libc::SYS_getuid, vec![]),
|
2022-07-20 22:51:15 +00:00
|
|
|
(
|
|
|
|
libc::SYS_ioctl,
|
|
|
|
create_vmm_ioctl_seccomp_rule(hypervisor_type)?,
|
|
|
|
),
|
2021-08-17 23:49:07 +00:00
|
|
|
(libc::SYS_io_uring_enter, vec![]),
|
|
|
|
(libc::SYS_io_uring_setup, vec![]),
|
|
|
|
(libc::SYS_io_uring_register, vec![]),
|
2021-08-17 03:40:11 +00:00
|
|
|
(libc::SYS_kill, vec![]),
|
|
|
|
(libc::SYS_listen, vec![]),
|
|
|
|
(libc::SYS_lseek, vec![]),
|
|
|
|
(libc::SYS_madvise, vec![]),
|
|
|
|
(libc::SYS_mbind, vec![]),
|
|
|
|
(libc::SYS_memfd_create, vec![]),
|
|
|
|
(libc::SYS_mmap, vec![]),
|
|
|
|
(libc::SYS_mprotect, vec![]),
|
|
|
|
(libc::SYS_mremap, vec![]),
|
|
|
|
(libc::SYS_munmap, vec![]),
|
|
|
|
(libc::SYS_nanosleep, vec![]),
|
|
|
|
(libc::SYS_newfstatat, vec![]),
|
2020-08-04 02:39:07 +00:00
|
|
|
#[cfg(target_arch = "x86_64")]
|
2021-08-17 03:40:11 +00:00
|
|
|
(libc::SYS_open, vec![]),
|
|
|
|
(libc::SYS_openat, vec![]),
|
|
|
|
(libc::SYS_pipe2, vec![]),
|
vmm: notify virtio-console of pty resizes
When a pty is resized (using the TIOCSWINSZ ioctl -- see ioctl_tty(2)),
the kernel will send a SIGWINCH signal to the pty's foreground process
group to notify it of the resize. This is the only way to be notified
by the kernel of a pty resize.
We can't just make the cloud-hypervisor process's process group the
foreground process group though, because a process can only set the
foreground process group of its controlling terminal, and
cloud-hypervisor's controlling terminal will often be the terminal the
user is running it in. To work around this, we fork a subprocess in a
new process group, and set its process group to be the foreground
process group of the pty. The subprocess additionally must be running
in a new session so that it can have a different controlling
terminal. This subprocess writes a byte to a pipe every time the pty
is resized, and the virtio-console device can listen for this in its
epoll loop.
Alternatives I considered were to have the subprocess just send
SIGWINCH to its parent, and to use an eventfd instead of a pipe.
I decided against the signal approach because re-purposing a signal
that has a very specific meaning (even if this use was only slightly
different to its normal meaning) felt unclean, and because it would
have required using pidfds to avoid race conditions if
cloud-hypervisor had terminated, which added complexity. I decided
against using an eventfd because using a pipe instead allows the child
to be notified (via poll(2)) when nothing is reading from the pipe any
more, meaning it can be reliably notified of parent death and
terminate itself immediately.
I used clone3(2) instead of fork(2) because without
CLONE_CLEAR_SIGHAND the subprocess would inherit signal-hook's signal
handlers, and there's no other straightforward way to restore all signal
handlers to their defaults in the child process. The only way to do
it would be to iterate through all possible signals, or maintain a
global list of monitored signals ourselves (vmm:vm::HANDLED_SIGNALS is
insufficient because it doesn't take into account e.g. the SIGSYS
signal handler that catches seccomp violations).
Signed-off-by: Alyssa Ross <hi@alyssa.is>
2021-09-10 11:12:17 +00:00
|
|
|
#[cfg(target_arch = "x86_64")]
|
|
|
|
(libc::SYS_poll, vec![]),
|
|
|
|
#[cfg(target_arch = "aarch64")]
|
|
|
|
(libc::SYS_ppoll, vec![]),
|
2021-08-17 03:40:11 +00:00
|
|
|
(libc::SYS_prctl, vec![]),
|
|
|
|
(libc::SYS_pread64, vec![]),
|
|
|
|
(libc::SYS_preadv, vec![]),
|
|
|
|
(libc::SYS_prlimit64, vec![]),
|
|
|
|
(libc::SYS_pwrite64, vec![]),
|
|
|
|
(libc::SYS_pwritev, vec![]),
|
|
|
|
(libc::SYS_read, vec![]),
|
|
|
|
(libc::SYS_readv, vec![]),
|
2020-08-04 02:39:07 +00:00
|
|
|
#[cfg(target_arch = "x86_64")]
|
2021-08-17 03:40:11 +00:00
|
|
|
(libc::SYS_readlink, vec![]),
|
2021-06-10 11:38:13 +00:00
|
|
|
#[cfg(target_arch = "aarch64")]
|
2021-08-17 03:40:11 +00:00
|
|
|
(libc::SYS_readlinkat, vec![]),
|
|
|
|
(libc::SYS_recvfrom, vec![]),
|
|
|
|
(libc::SYS_recvmsg, vec![]),
|
|
|
|
(libc::SYS_restart_syscall, vec![]),
|
2022-04-21 09:31:06 +00:00
|
|
|
// musl is missing this constant
|
|
|
|
// (libc::SYS_rseq, vec![]),
|
|
|
|
#[cfg(target_arch = "x86_64")]
|
|
|
|
(334, vec![]),
|
|
|
|
#[cfg(target_arch = "aarch64")]
|
|
|
|
(293, vec![]),
|
2021-08-17 03:40:11 +00:00
|
|
|
(libc::SYS_rt_sigaction, vec![]),
|
|
|
|
(libc::SYS_rt_sigprocmask, vec![]),
|
|
|
|
(libc::SYS_rt_sigreturn, vec![]),
|
|
|
|
(libc::SYS_sched_getaffinity, vec![]),
|
2021-11-10 09:43:52 +00:00
|
|
|
(libc::SYS_sched_setaffinity, vec![]),
|
2021-08-17 03:40:11 +00:00
|
|
|
(libc::SYS_sendmsg, vec![]),
|
|
|
|
(libc::SYS_sendto, vec![]),
|
|
|
|
(libc::SYS_set_robust_list, vec![]),
|
vmm: notify virtio-console of pty resizes
When a pty is resized (using the TIOCSWINSZ ioctl -- see ioctl_tty(2)),
the kernel will send a SIGWINCH signal to the pty's foreground process
group to notify it of the resize. This is the only way to be notified
by the kernel of a pty resize.
We can't just make the cloud-hypervisor process's process group the
foreground process group though, because a process can only set the
foreground process group of its controlling terminal, and
cloud-hypervisor's controlling terminal will often be the terminal the
user is running it in. To work around this, we fork a subprocess in a
new process group, and set its process group to be the foreground
process group of the pty. The subprocess additionally must be running
in a new session so that it can have a different controlling
terminal. This subprocess writes a byte to a pipe every time the pty
is resized, and the virtio-console device can listen for this in its
epoll loop.
Alternatives I considered were to have the subprocess just send
SIGWINCH to its parent, and to use an eventfd instead of a pipe.
I decided against the signal approach because re-purposing a signal
that has a very specific meaning (even if this use was only slightly
different to its normal meaning) felt unclean, and because it would
have required using pidfds to avoid race conditions if
cloud-hypervisor had terminated, which added complexity. I decided
against using an eventfd because using a pipe instead allows the child
to be notified (via poll(2)) when nothing is reading from the pipe any
more, meaning it can be reliably notified of parent death and
terminate itself immediately.
I used clone3(2) instead of fork(2) because without
CLONE_CLEAR_SIGHAND the subprocess would inherit signal-hook's signal
handlers, and there's no other straightforward way to restore all signal
handlers to their defaults in the child process. The only way to do
it would be to iterate through all possible signals, or maintain a
global list of monitored signals ourselves (vmm:vm::HANDLED_SIGNALS is
insufficient because it doesn't take into account e.g. the SIGSYS
signal handler that catches seccomp violations).
Signed-off-by: Alyssa Ross <hi@alyssa.is>
2021-09-10 11:12:17 +00:00
|
|
|
(libc::SYS_setsid, vec![]),
|
2022-03-29 17:54:44 +00:00
|
|
|
(libc::SYS_shutdown, vec![]),
|
2021-08-17 03:40:11 +00:00
|
|
|
(libc::SYS_sigaltstack, vec![]),
|
|
|
|
(
|
2020-08-04 02:39:07 +00:00
|
|
|
libc::SYS_socket,
|
|
|
|
or![
|
2021-08-17 03:40:11 +00:00
|
|
|
and![Cond::new(0, ArgLen::Dword, Eq, libc::AF_UNIX as u64)?],
|
|
|
|
and![Cond::new(0, ArgLen::Dword, Eq, libc::AF_INET as u64)?],
|
2020-08-04 02:39:07 +00:00
|
|
|
],
|
|
|
|
),
|
2021-08-17 03:40:11 +00:00
|
|
|
(libc::SYS_socketpair, vec![]),
|
2020-08-04 02:39:07 +00:00
|
|
|
#[cfg(target_arch = "x86_64")]
|
2021-08-17 03:40:11 +00:00
|
|
|
(libc::SYS_stat, vec![]),
|
|
|
|
(libc::SYS_statx, vec![]),
|
|
|
|
(libc::SYS_tgkill, vec![]),
|
|
|
|
(libc::SYS_timerfd_create, vec![]),
|
|
|
|
(libc::SYS_timerfd_settime, vec![]),
|
|
|
|
(libc::SYS_tkill, vec![]),
|
|
|
|
(
|
2020-08-04 02:39:07 +00:00
|
|
|
libc::SYS_umask,
|
2021-08-17 03:40:11 +00:00
|
|
|
or![and![Cond::new(0, ArgLen::Dword, Eq, 0o077)?]],
|
2020-08-04 02:39:07 +00:00
|
|
|
),
|
|
|
|
#[cfg(target_arch = "x86_64")]
|
2021-08-17 03:40:11 +00:00
|
|
|
(libc::SYS_unlink, vec![]),
|
2020-08-04 02:39:07 +00:00
|
|
|
#[cfg(target_arch = "aarch64")]
|
2021-08-17 03:40:11 +00:00
|
|
|
(libc::SYS_unlinkat, vec![]),
|
|
|
|
(libc::SYS_wait4, vec![]),
|
|
|
|
(libc::SYS_write, vec![]),
|
|
|
|
(libc::SYS_writev, vec![]),
|
2020-08-04 02:39:07 +00:00
|
|
|
])
|
|
|
|
}
|
|
|
|
|
2021-07-02 14:39:56 +00:00
|
|
|
/// Hypervisor ioctls the vCPU threads need when running on KVM:
/// capability checks, eventfd/irqfd wiring, routing and memory-region
/// updates, and KVM_RUN itself.
#[cfg(feature = "kvm")]
fn create_vcpu_ioctl_seccomp_rule_kvm() -> Result<Vec<SeccompRule>, BackendError> {
    // Argument 1 of ioctl(2) is the request code.
    Ok(or![
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_CHECK_EXTENSION,)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_IOEVENTFD)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_IRQFD,)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_DEVICE_ATTR,)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_GSI_ROUTING,)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_USER_MEMORY_REGION,)?],
        and![Cond::new(1, ArgLen::Dword, Eq, KVM_RUN,)?],
    ])
}
|
|
|
|
|
2021-07-02 15:40:20 +00:00
|
|
|
/// Hypervisor ioctls the vCPU threads need when running on MSHV:
/// MSI routing, eventfd/irqfd wiring, register access, guest memory
/// mapping, GVA translation, and running the virtual processor.
#[cfg(feature = "mshv")]
fn create_vcpu_ioctl_seccomp_rule_mshv() -> Result<Vec<SeccompRule>, BackendError> {
    // Argument 1 of ioctl(2) is the request code.
    Ok(or![
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_SET_MSI_ROUTING)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_IOEVENTFD)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_IRQFD)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_RUN_VP)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_GET_VP_REGISTERS)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_SET_VP_REGISTERS)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_MAP_GUEST_MEMORY)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_UNMAP_GUEST_MEMORY)?],
        and![Cond::new(1, ArgLen::Dword, Eq, MSHV_VP_TRANSLATE_GVA)?],
    ])
}
|
|
|
|
|
2022-07-20 22:51:15 +00:00
|
|
|
/// Dispatch to the hypervisor-specific vCPU-thread ioctl rules.
///
/// The fallback arm is reachable only when the matching hypervisor
/// feature is compiled out, which is a build/configuration error, hence
/// the panic.
fn create_vcpu_ioctl_seccomp_rule_hypervisor(
    hypervisor_type: HypervisorType,
) -> Result<Vec<SeccompRule>, BackendError> {
    match hypervisor_type {
        #[cfg(feature = "kvm")]
        HypervisorType::Kvm => create_vcpu_ioctl_seccomp_rule_kvm(),
        #[cfg(feature = "mshv")]
        HypervisorType::Mshv => create_vcpu_ioctl_seccomp_rule_mshv(),
        #[allow(unreachable_patterns)]
        _ => panic!("Invalid hypervisor {:?}", hypervisor_type),
    }
}
|
|
|
|
|
2022-07-20 22:51:15 +00:00
|
|
|
/// Full ioctl rule set for the vCPU threads: device ioctls (VFIO and
/// vhost-vdpa) plus the hypervisor-specific rules for the selected
/// backend.
fn create_vcpu_ioctl_seccomp_rule(
    hypervisor_type: HypervisorType,
) -> Result<Vec<SeccompRule>, BackendError> {
    // Device ioctls issued from vCPU threads: VFIO interrupt/DMA
    // management and vhost-vdpa configuration. Argument 1 of ioctl(2) is
    // the request code.
    let mut rules = or![
        and![Cond::new(1, ArgLen::Dword, Eq, VFIO_DEVICE_SET_IRQS)?],
        and![Cond::new(1, ArgLen::Dword, Eq, VFIO_GROUP_UNSET_CONTAINER)?],
        and![Cond::new(1, ArgLen::Dword, Eq, VFIO_IOMMU_UNMAP_DMA)?],
        and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_SET_STATUS)?],
        and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_GET_CONFIG)?],
        and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_SET_CONFIG)?],
        and![Cond::new(
            1,
            ArgLen::Dword,
            Eq,
            VHOST_VDPA_SET_VRING_ENABLE
        )?],
    ];

    // Append the KVM- or MSHV-specific rules.
    let hypervisor_rules = create_vcpu_ioctl_seccomp_rule_hypervisor(hypervisor_type)?;

    rules.extend(hypervisor_rules);

    Ok(rules)
}
|
|
|
|
|
2022-07-20 22:51:15 +00:00
|
|
|
/// Syscall allow-list for the vCPU threads.
///
/// An empty rule vector allows the syscall with any arguments; `ioctl`
/// is restricted to the device/hypervisor requests returned by
/// `create_vcpu_ioctl_seccomp_rule`.
fn vcpu_thread_rules(
    hypervisor_type: HypervisorType,
) -> Result<Vec<(i64, Vec<SeccompRule>)>, BackendError> {
    Ok(vec![
        (libc::SYS_brk, vec![]),
        (libc::SYS_clock_gettime, vec![]),
        (libc::SYS_clock_nanosleep, vec![]),
        (libc::SYS_close, vec![]),
        (libc::SYS_dup, vec![]),
        (libc::SYS_exit, vec![]),
        // Needed when flushing the serial buffer adds its fd to the
        // epoll loop from vCPU context.
        (libc::SYS_epoll_ctl, vec![]),
        (libc::SYS_fstat, vec![]),
        (libc::SYS_futex, vec![]),
        (libc::SYS_getrandom, vec![]),
        (libc::SYS_getpid, vec![]),
        (
            libc::SYS_ioctl,
            create_vcpu_ioctl_seccomp_rule(hypervisor_type)?,
        ),
        (libc::SYS_lseek, vec![]),
        (libc::SYS_madvise, vec![]),
        (libc::SYS_mmap, vec![]),
        (libc::SYS_mprotect, vec![]),
        (libc::SYS_munmap, vec![]),
        (libc::SYS_nanosleep, vec![]),
        (libc::SYS_newfstatat, vec![]),
        #[cfg(target_arch = "x86_64")]
        (libc::SYS_open, vec![]),
        (libc::SYS_openat, vec![]),
        (libc::SYS_pread64, vec![]),
        (libc::SYS_pwrite64, vec![]),
        (libc::SYS_read, vec![]),
        (libc::SYS_recvfrom, vec![]),
        (libc::SYS_recvmsg, vec![]),
        (libc::SYS_rt_sigaction, vec![]),
        (libc::SYS_rt_sigprocmask, vec![]),
        (libc::SYS_rt_sigreturn, vec![]),
        (libc::SYS_sendmsg, vec![]),
        // Needed when hot-removing a vfio-user device (socket shutdown
        // happens on the vCPU thread handling the eject).
        (libc::SYS_shutdown, vec![]),
        (libc::SYS_sigaltstack, vec![]),
        (libc::SYS_tgkill, vec![]),
        (libc::SYS_tkill, vec![]),
        // unlink(2) on x86_64; aarch64 only has unlinkat(2).
        #[cfg(target_arch = "x86_64")]
        (libc::SYS_unlink, vec![]),
        #[cfg(target_arch = "aarch64")]
        (libc::SYS_unlinkat, vec![]),
        (libc::SYS_write, vec![]),
        (libc::SYS_writev, vec![]),
    ])
}
|
|
|
|
|
2020-08-04 02:39:07 +00:00
|
|
|
// The filter containing the white listed syscall rules required by the API to
|
|
|
|
// function.
|
2021-08-17 03:40:11 +00:00
|
|
|
fn api_thread_rules() -> Result<Vec<(i64, Vec<SeccompRule>)>, BackendError> {
|
2020-08-04 02:39:07 +00:00
|
|
|
Ok(vec![
|
2021-08-17 03:40:11 +00:00
|
|
|
(libc::SYS_accept4, vec![]),
|
|
|
|
(libc::SYS_brk, vec![]),
|
|
|
|
(libc::SYS_close, vec![]),
|
|
|
|
(libc::SYS_dup, vec![]),
|
|
|
|
(libc::SYS_epoll_create1, vec![]),
|
|
|
|
(libc::SYS_epoll_ctl, vec![]),
|
|
|
|
(libc::SYS_epoll_pwait, vec![]),
|
2020-08-04 02:39:07 +00:00
|
|
|
#[cfg(target_arch = "x86_64")]
|
2021-08-17 03:40:11 +00:00
|
|
|
(libc::SYS_epoll_wait, vec![]),
|
|
|
|
(libc::SYS_exit, vec![]),
|
|
|
|
(libc::SYS_fcntl, vec![]),
|
|
|
|
(libc::SYS_futex, vec![]),
|
|
|
|
(libc::SYS_getrandom, vec![]),
|
|
|
|
(libc::SYS_ioctl, create_api_ioctl_seccomp_rule()?),
|
|
|
|
(libc::SYS_madvise, vec![]),
|
2021-09-10 18:16:17 +00:00
|
|
|
(libc::SYS_mmap, vec![]),
|
2021-08-17 03:40:11 +00:00
|
|
|
(libc::SYS_mprotect, vec![]),
|
|
|
|
(libc::SYS_munmap, vec![]),
|
|
|
|
(libc::SYS_recvfrom, vec![]),
|
|
|
|
(libc::SYS_recvmsg, vec![]),
|
|
|
|
(libc::SYS_sigaltstack, vec![]),
|
|
|
|
(libc::SYS_write, vec![]),
|
2020-08-04 02:39:07 +00:00
|
|
|
])
|
|
|
|
}
|
|
|
|
|
2022-07-20 22:51:15 +00:00
|
|
|
fn get_seccomp_rules(
|
|
|
|
thread_type: Thread,
|
|
|
|
hypervisor_type: HypervisorType,
|
|
|
|
) -> Result<Vec<(i64, Vec<SeccompRule>)>, BackendError> {
|
2021-08-17 23:05:08 +00:00
|
|
|
match thread_type {
|
|
|
|
Thread::Api => Ok(api_thread_rules()?),
|
|
|
|
Thread::SignalHandler => Ok(signal_handler_thread_rules()?),
|
2022-07-20 22:51:15 +00:00
|
|
|
Thread::Vcpu => Ok(vcpu_thread_rules(hypervisor_type)?),
|
|
|
|
Thread::Vmm => Ok(vmm_thread_rules(hypervisor_type)?),
|
vmm: notify virtio-console of pty resizes
When a pty is resized (using the TIOCSWINSZ ioctl -- see ioctl_tty(2)),
the kernel will send a SIGWINCH signal to the pty's foreground process
group to notify it of the resize. This is the only way to be notified
by the kernel of a pty resize.
We can't just make the cloud-hypervisor process's process group the
foreground process group though, because a process can only set the
foreground process group of its controlling terminal, and
cloud-hypervisor's controlling terminal will often be the terminal the
user is running it in. To work around this, we fork a subprocess in a
new process group, and set its process group to be the foreground
process group of the pty. The subprocess additionally must be running
in a new session so that it can have a different controlling
terminal. This subprocess writes a byte to a pipe every time the pty
is resized, and the virtio-console device can listen for this in its
epoll loop.
Alternatives I considered were to have the subprocess just send
SIGWINCH to its parent, and to use an eventfd instead of a pipe.
I decided against the signal approach because re-purposing a signal
that has a very specific meaning (even if this use was only slightly
different to its normal meaning) felt unclean, and because it would
have required using pidfds to avoid race conditions if
cloud-hypervisor had terminated, which added complexity. I decided
against using an eventfd because using a pipe instead allows the child
to be notified (via poll(2)) when nothing is reading from the pipe any
more, meaning it can be reliably notified of parent death and
terminate itself immediately.
I used clone3(2) instead of fork(2) because without
CLONE_CLEAR_SIGHAND the subprocess would inherit signal-hook's signal
handlers, and there's no other straightforward way to restore all signal
handlers to their defaults in the child process. The only way to do
it would be to iterate through all possible signals, or maintain a
global list of monitored signals ourselves (vmm:vm::HANDLED_SIGNALS is
insufficient because it doesn't take into account e.g. the SIGSYS
signal handler that catches seccomp violations).
Signed-off-by: Alyssa Ross <hi@alyssa.is>
2021-09-10 11:12:17 +00:00
|
|
|
Thread::PtyForeground => Ok(pty_foreground_thread_rules()?),
|
2021-08-17 23:05:08 +00:00
|
|
|
}
|
2020-03-20 16:57:03 +00:00
|
|
|
}
|
|
|
|
|
2020-07-30 21:21:58 +00:00
|
|
|
/// Generate a BPF program based on the seccomp_action value
|
2020-03-20 16:57:03 +00:00
|
|
|
pub fn get_seccomp_filter(
|
2020-07-30 21:21:58 +00:00
|
|
|
seccomp_action: &SeccompAction,
|
2020-03-20 16:57:03 +00:00
|
|
|
thread_type: Thread,
|
2022-07-20 22:51:15 +00:00
|
|
|
hypervisor_type: HypervisorType,
|
2021-08-17 03:40:11 +00:00
|
|
|
) -> Result<BpfProgram, Error> {
|
2020-07-30 21:21:58 +00:00
|
|
|
match seccomp_action {
|
|
|
|
SeccompAction::Allow => Ok(vec![]),
|
2021-08-17 23:05:08 +00:00
|
|
|
SeccompAction::Log => SeccompFilter::new(
|
2022-07-20 22:51:15 +00:00
|
|
|
get_seccomp_rules(thread_type, hypervisor_type)
|
2021-08-17 23:05:08 +00:00
|
|
|
.map_err(Error::Backend)?
|
|
|
|
.into_iter()
|
|
|
|
.collect(),
|
|
|
|
SeccompAction::Log,
|
|
|
|
SeccompAction::Allow,
|
|
|
|
std::env::consts::ARCH.try_into().unwrap(),
|
|
|
|
)
|
|
|
|
.and_then(|filter| filter.try_into())
|
|
|
|
.map_err(Error::Backend),
|
|
|
|
_ => SeccompFilter::new(
|
2022-07-20 22:51:15 +00:00
|
|
|
get_seccomp_rules(thread_type, hypervisor_type)
|
2021-08-17 23:05:08 +00:00
|
|
|
.map_err(Error::Backend)?
|
|
|
|
.into_iter()
|
|
|
|
.collect(),
|
|
|
|
SeccompAction::Trap,
|
|
|
|
SeccompAction::Allow,
|
|
|
|
std::env::consts::ARCH.try_into().unwrap(),
|
|
|
|
)
|
|
|
|
.and_then(|filter| filter.try_into())
|
|
|
|
.map_err(Error::Backend),
|
2020-03-19 12:26:16 +00:00
|
|
|
}
|
|
|
|
}
|