vmm: Only return from reset driven I/O once event received

The reset system is asynchronous with an I/O event (PIO or MMIO) for
ACPI/i8042/CMOS triggering a write to the reset_evt event handler. The
VMM thread will pick up this event on the VMM main loop and then trigger
a shutdown in the CpuManager. However since there is some delay between
the CPU threads being marked to be killed (through the
CpuManager::cpus_kill_signalled bool) it is possible for the guest vCPU
that triggered the exit to be re-entered when the vCPU KVM_RUN is called
after the I/O exit is completed.

This is undesirable and in particular the Linux kernel will attempt to
jump to real mode after a CMOS based exit - this is unsupported in
nested KVM on AMD on Azure and will trigger an error in KVM_RUN.

Solve this problem by spinning in the device that has triggered the
reset until the vcpus_kill_signalled boolean has been updated
indicating that the VMM thread has received the event and called
CpuManager::shutdown(). In particular if this bool is set then the vCPU
threads will not re-enter the guest.

Signed-off-by: Rob Bradford <rbradford@rivosinc.com>
This commit is contained in:
Rob Bradford 2023-08-03 15:13:03 +01:00 committed by Bo Chen
parent 70cfd1be67
commit 06dc708515
6 changed files with 82 additions and 6 deletions

View File

@ -5,7 +5,9 @@
use super::AcpiNotificationFlags;
use acpi_tables::{aml, Aml, AmlSink};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Barrier};
use std::thread;
use std::time::Instant;
use vm_device::interrupt::InterruptSourceGroup;
use vm_device::BusDevice;
@ -18,14 +20,20 @@ pub const GED_DEVICE_ACPI_SIZE: usize = 0x1;
pub struct AcpiShutdownDevice {
exit_evt: EventFd,
reset_evt: EventFd,
vcpus_kill_signalled: Arc<AtomicBool>,
}
impl AcpiShutdownDevice {
/// Constructs a device that will signal the given event when the guest requests it.
pub fn new(exit_evt: EventFd, reset_evt: EventFd) -> AcpiShutdownDevice {
pub fn new(
exit_evt: EventFd,
reset_evt: EventFd,
vcpus_kill_signalled: Arc<AtomicBool>,
) -> AcpiShutdownDevice {
AcpiShutdownDevice {
exit_evt,
reset_evt,
vcpus_kill_signalled,
}
}
}
@ -43,6 +51,13 @@ impl BusDevice for AcpiShutdownDevice {
if let Err(e) = self.reset_evt.write(1) {
error!("Error triggering ACPI reset event: {}", e);
}
// Spin until we are sure the reset_evt has been handled and that when
// we return from the KVM_RUN we will exit rather than re-enter the guest.
while !self.vcpus_kill_signalled.load(Ordering::SeqCst) {
// This is more effective than thread::yield_now() at
// avoiding a priority inversion with the VMM thread
thread::sleep(std::time::Duration::from_millis(1));
}
}
// The ACPI DSDT table specifies the S5 sleep state (shutdown) as value 5
const S5_SLEEP_VALUE: u8 = 5;
@ -53,6 +68,13 @@ impl BusDevice for AcpiShutdownDevice {
if let Err(e) = self.exit_evt.write(1) {
error!("Error triggering ACPI shutdown event: {}", e);
}
// Spin until we are sure the reset_evt has been handled and that when
// we return from the KVM_RUN we will exit rather than re-enter the guest.
while !self.vcpus_kill_signalled.load(Ordering::SeqCst) {
// This is more effective than thread::yield_now() at
// avoiding a priority inversion with the VMM thread
thread::sleep(std::time::Duration::from_millis(1));
}
}
None
}

View File

@ -5,7 +5,9 @@
use libc::{clock_gettime, gmtime_r, timespec, tm, CLOCK_REALTIME};
use std::cmp::min;
use std::mem;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Barrier};
use std::thread;
use vm_device::BusDevice;
use vmm_sys_util::eventfd::EventFd;
@ -23,13 +25,19 @@ pub struct Cmos {
index: u8,
data: [u8; DATA_LEN],
reset_evt: EventFd,
vcpus_kill_signalled: Arc<AtomicBool>,
}
impl Cmos {
/// Constructs a CMOS/RTC device with initial data.
/// `mem_below_4g` is the size of memory in bytes below the 32-bit gap.
/// `mem_above_4g` is the size of memory in bytes above the 32-bit gap.
pub fn new(mem_below_4g: u64, mem_above_4g: u64, reset_evt: EventFd) -> Cmos {
pub fn new(
mem_below_4g: u64,
mem_above_4g: u64,
reset_evt: EventFd,
vcpus_kill_signalled: Arc<AtomicBool>,
) -> Cmos {
let mut data = [0u8; DATA_LEN];
// Extended memory from 16 MB to 4 GB in units of 64 KB
@ -50,6 +58,7 @@ impl Cmos {
index: 0,
data,
reset_evt,
vcpus_kill_signalled,
}
}
}
@ -67,6 +76,13 @@ impl BusDevice for Cmos {
if self.index == 0x8f && data[0] == 0 {
info!("CMOS reset");
self.reset_evt.write(1).unwrap();
// Spin until we are sure the reset_evt has been handled and that when
// we return from the KVM_RUN we will exit rather than re-enter the guest.
while !self.vcpus_kill_signalled.load(Ordering::SeqCst) {
// This is more effective than thread::yield_now() at
// avoiding a priority inversion with the VMM thread
thread::sleep(std::time::Duration::from_millis(1));
}
} else {
self.data[(self.index & INDEX_MASK) as usize] = data[0]
}

View File

@ -2,19 +2,27 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
use std::sync::{Arc, Barrier};
use std::sync::{
atomic::{AtomicBool, Ordering},
Arc, Barrier,
};
use std::thread;
use vm_device::BusDevice;
use vmm_sys_util::eventfd::EventFd;
/// A i8042 PS/2 controller that emulates just enough to shutdown the machine.
pub struct I8042Device {
reset_evt: EventFd,
vcpus_kill_signalled: Arc<AtomicBool>,
}
impl I8042Device {
/// Constructs a i8042 device that will signal the given event when the guest requests it.
pub fn new(reset_evt: EventFd) -> I8042Device {
I8042Device { reset_evt }
pub fn new(reset_evt: EventFd, vcpus_kill_signalled: Arc<AtomicBool>) -> I8042Device {
I8042Device {
reset_evt,
vcpus_kill_signalled,
}
}
}
@ -38,6 +46,13 @@ impl BusDevice for I8042Device {
if let Err(e) = self.reset_evt.write(1) {
error!("Error triggering i8042 reset event: {}", e);
}
// Spin until we are sure the reset_evt has been handled and that when
// we return from the KVM_RUN we will exit rather than re-enter the guest.
while !self.vcpus_kill_signalled.load(Ordering::SeqCst) {
// This is more effective than thread::yield_now() at
// avoiding a priority inversion with the VMM thread
thread::sleep(std::time::Duration::from_millis(1));
}
}
None

View File

@ -6,6 +6,8 @@
use devices::legacy::Cmos;
use libc::EFD_NONBLOCK;
use libfuzzer_sys::fuzz_target;
use std::sync::atomic::AtomicBool;
use std::sync::Arc;
use vm_device::BusDevice;
use vmm_sys_util::eventfd::EventFd;
@ -25,6 +27,7 @@ fuzz_target!(|bytes| {
u64::from_le_bytes(below_4g),
u64::from_le_bytes(above_4g),
EventFd::new(EFD_NONBLOCK).unwrap(),
Arc::new(AtomicBool::default()),
);
let mut i = 16;

View File

@ -1725,6 +1725,10 @@ impl CpuManager {
) {
self.interrupt_controller = Some(interrupt_controller);
}
pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> {
&self.vcpus_kill_signalled
}
}
struct Cpu {

View File

@ -1485,8 +1485,16 @@ impl DeviceManager {
reset_evt: EventFd,
exit_evt: EventFd,
) -> DeviceManagerResult<Option<Arc<Mutex<devices::AcpiGedDevice>>>> {
let vcpus_kill_signalled = self
.cpu_manager
.lock()
.unwrap()
.vcpus_kill_signalled()
.clone();
let shutdown_device = Arc::new(Mutex::new(devices::AcpiShutdownDevice::new(
exit_evt, reset_evt,
exit_evt,
reset_evt,
vcpus_kill_signalled,
)));
self.bus_devices
@ -1585,9 +1593,16 @@ impl DeviceManager {
#[cfg(target_arch = "x86_64")]
fn add_legacy_devices(&mut self, reset_evt: EventFd) -> DeviceManagerResult<()> {
let vcpus_kill_signalled = self
.cpu_manager
.lock()
.unwrap()
.vcpus_kill_signalled()
.clone();
// Add a shutdown device (i8042)
let i8042 = Arc::new(Mutex::new(devices::legacy::I8042Device::new(
reset_evt.try_clone().unwrap(),
vcpus_kill_signalled.clone(),
)));
self.bus_devices
@ -1615,6 +1630,7 @@ impl DeviceManager {
mem_below_4g,
mem_above_4g,
reset_evt,
vcpus_kill_signalled,
)));
self.bus_devices