cloud-hypervisor/vmm/src/cpu.rs

// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

#[cfg(target_arch = "x86_64")]
use crate::config::CpuTopology;
use crate::config::CpusConfig;
use crate::device_manager::DeviceManager;
use crate::CPU_MANAGER_SNAPSHOT_ID;
#[cfg(feature = "acpi")]
use acpi_tables::{aml, aml::Aml, sdt::SDT};
use anyhow::anyhow;
#[cfg(feature = "acpi")]
use arch::layout;
use arch::EntryPoint;
#[cfg(target_arch = "x86_64")]
use arch::{CpuidPatch, CpuidReg};
use devices::{interrupt_controller::InterruptController, BusDevice};
#[cfg(target_arch = "aarch64")]
use hypervisor::kvm::kvm_bindings::KVM_SYSTEM_EVENT_SHUTDOWN;
#[cfg(target_arch = "x86_64")]
use hypervisor::CpuId;
use hypervisor::{CpuState, VcpuExit};

use libc::{c_void, siginfo_t};

#[cfg(target_arch = "x86_64")]
use std::fmt;
use std::os::unix::thread::JoinHandleExt;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Barrier, Mutex};
use std::{cmp, io, result, thread};
#[cfg(target_arch = "x86_64")]
use vm_memory::GuestAddress;
use vm_memory::{GuestMemoryAtomic, GuestMemoryMmap};
use vm_migration::{
    Migratable, MigratableError, Pausable, Snapshot, SnapshotDataSection, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};

// CPUID feature bits
// Both are bit positions in the ECX register and are applied to CPUID
// function 1 by `CpuManager::patch_cpuid`.
#[cfg(target_arch = "x86_64")]
const TSC_DEADLINE_TIMER_ECX_BIT: u8 = 24; // tsc deadline timer ecx bit.
#[cfg(target_arch = "x86_64")]
const HYPERVISOR_ECX_BIT: u8 = 31; // Hypervisor ecx bit.

// Debug I/O port
// Single byte writes to this port are logged by `Vcpu::run`.
#[cfg(target_arch = "x86_64")]
const DEBUG_IOPORT: u16 = 0x80;
// Prefix of every string produced by the `Display` impl of `DebugIoPortRange`.
#[cfg(target_arch = "x86_64")]
const DEBUG_IOPORT_PREFIX: &str = "Debug I/O port";

#[cfg(target_arch = "x86_64")]
/// Classification of the codes a guest writes to the debug I/O port, see:
/// https://www.intel.com/content/www/us/en/support/articles/000005500/boards-and-kits.html
///
/// As a virtual platform we are free to carve the code space into ranges
/// that identify which part of the boot flow emitted a given code.
pub enum DebugIoPortRange {
    Firmware,
    Bootloader,
    Kernel,
    Userspace,
    Custom,
}
#[cfg(target_arch = "x86_64")]
impl DebugIoPortRange {
    /// Maps a debug code onto its range.
    ///
    /// The first four ranges are 0x20 codes wide each and start at 0x00;
    /// every code from 0x80 upwards is reported as `Custom`.
    fn from_u8(value: u8) -> DebugIoPortRange {
        if value < 0x20 {
            DebugIoPortRange::Firmware
        } else if value < 0x40 {
            DebugIoPortRange::Bootloader
        } else if value < 0x60 {
            DebugIoPortRange::Kernel
        } else if value < 0x80 {
            DebugIoPortRange::Userspace
        } else {
            DebugIoPortRange::Custom
        }
    }
}

#[cfg(target_arch = "x86_64")]
impl fmt::Display for DebugIoPortRange {
    /// Writes `DEBUG_IOPORT_PREFIX`, a colon and the name of the range.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            DebugIoPortRange::Firmware => {
                f.write_fmt(format_args!("{}: Firmware", DEBUG_IOPORT_PREFIX))
            }
            DebugIoPortRange::Bootloader => {
                f.write_fmt(format_args!("{}: Bootloader", DEBUG_IOPORT_PREFIX))
            }
            DebugIoPortRange::Kernel => {
                f.write_fmt(format_args!("{}: Kernel", DEBUG_IOPORT_PREFIX))
            }
            DebugIoPortRange::Userspace => {
                f.write_fmt(format_args!("{}: Userspace", DEBUG_IOPORT_PREFIX))
            }
            DebugIoPortRange::Custom => {
                f.write_fmt(format_args!("{}: Custom", DEBUG_IOPORT_PREFIX))
            }
        }
    }
}

/// Errors returned by the vCPU wrapper and the CPU manager of this module.
#[derive(Debug)]
pub enum Error {
    /// Cannot open the VCPU file descriptor.
    VcpuFd(anyhow::Error),

    /// Cannot run the VCPUs.
    VcpuRun(anyhow::Error),

    /// Cannot spawn a new vCPU thread.
    VcpuSpawn(io::Error),

    /// Cannot patch the CPU ID
    PatchCpuId(anyhow::Error),

    /// The call to KVM_SET_CPUID2 failed.
    SetSupportedCpusFailed(anyhow::Error),

    #[cfg(target_arch = "x86_64")]
    /// Cannot set the local interruption due to bad configuration.
    LocalIntConfiguration(anyhow::Error),

    /// Error configuring VCPU
    VcpuConfiguration(arch::Error),

    /// Unexpected KVM_RUN exit reason
    VcpuUnhandledKvmExit,

    /// Failed to join on vCPU threads
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    /// Cannot add legacy device to Bus.
    BusError(devices::BusError),

    /// Failed to allocate IO port
    AllocateIOPort,

    /// Asking for more vCPUs than we can have
    DesiredVCPUCountExceedsMax,

    /// Failed to get KVM vcpu lapic.
    VcpuGetLapic(anyhow::Error),

    /// Failed to set KVM vcpu lapic.
    VcpuSetLapic(anyhow::Error),

    /// Failed to get KVM vcpu MP state.
    VcpuGetMpState(anyhow::Error),

    /// Failed to set KVM vcpu MP state.
    VcpuSetMpState(anyhow::Error),

    /// Failed to get KVM vcpu msrs.
    VcpuGetMsrs(anyhow::Error),

    /// Failed to set KVM vcpu msrs.
    VcpuSetMsrs(anyhow::Error),

    /// Failed to get KVM vcpu regs.
    VcpuGetRegs(anyhow::Error),

    /// Failed to set KVM vcpu regs.
    VcpuSetRegs(anyhow::Error),

    /// Failed to get KVM vcpu sregs.
    VcpuGetSregs(anyhow::Error),

    /// Failed to set KVM vcpu sregs.
    VcpuSetSregs(anyhow::Error),

    /// Failed to get KVM vcpu events.
    VcpuGetVcpuEvents(anyhow::Error),

    /// Failed to set KVM vcpu events.
    VcpuSetVcpuEvents(anyhow::Error),

    /// Failed to get KVM vcpu FPU.
    VcpuGetFpu(anyhow::Error),

    /// Failed to set KVM vcpu FPU.
    VcpuSetFpu(anyhow::Error),

    /// Failed to get KVM vcpu XSAVE.
    VcpuGetXsave(anyhow::Error),

    /// Failed to set KVM vcpu XSAVE.
    VcpuSetXsave(anyhow::Error),

    /// Failed to get KVM vcpu XCRS.
    VcpuGetXcrs(anyhow::Error),

    /// Failed to set KVM vcpu XCRS.
    VcpuSetXcrs(anyhow::Error),

    /// Error resuming vCPU on shutdown
    ResumeOnShutdown(MigratableError),
}
/// Result type of this module, with the error fixed to [`Error`].
pub type Result<T> = result::Result<T, Error>;

/// Per-CPU local APIC entry, appended once per possible vCPU to the table
/// built by `CpuManager::create_madt`.
///
/// The struct is packed because its bytes are copied verbatim into the
/// table, so the field order and sizes must not change.
#[cfg(feature = "acpi")]
#[repr(packed)]
struct LocalAPIC {
    // Entry type; the code in this file always sets it to 0.
    pub r#type: u8,
    // Entry length in bytes; always set to 8, the size of this struct.
    pub length: u8,
    // Both identifiers are set to the vCPU index by the code in this file.
    pub processor_id: u8,
    pub apic_id: u8,
    // Bit `MADT_CPU_ENABLE_FLAG` marks the CPU as enabled.
    pub flags: u32,
}

/// I/O APIC entry appended to the table built by `CpuManager::create_madt`.
///
/// Packed because its bytes are copied verbatim into the table.
// NOTE(review): unlike `LocalAPIC` this type is not gated on the `acpi`
// feature although its only user visible here (`create_madt`) is — confirm
// whether the gate is missing.
#[repr(packed)]
#[derive(Default)]
struct IOAPIC {
    // Entry type; `create_madt` sets it to 1.
    pub r#type: u8,
    // Entry length in bytes; `create_madt` sets it to 12, the size of this struct.
    pub length: u8,
    pub ioapic_id: u8,
    // Never set explicitly, left at its `Default` value of 0.
    _reserved: u8,
    // Filled from `layout::IOAPIC_START` truncated to 32 bits.
    pub apic_address: u32,
    pub gsi_base: u32,
}

/// Interrupt source override entry appended to the table built by
/// `CpuManager::create_madt`.
///
/// Packed because its bytes are copied verbatim into the table.
// NOTE(review): not gated on the `acpi` feature although its only user
// visible here (`create_madt`) is — confirm whether the gate is missing.
#[repr(packed)]
#[derive(Default)]
struct InterruptSourceOverride {
    // Entry type; `create_madt` sets it to 2.
    pub r#type: u8,
    // Entry length in bytes; `create_madt` sets it to 10, the size of this struct.
    pub length: u8,
    pub bus: u8,
    // Source interrupt and the GSI it is routed to (both 4 in `create_madt`).
    pub source: u8,
    pub gsi: u32,
    pub flags: u16,
}

/// A wrapper around creating and using a kvm-based VCPU.
pub struct Vcpu {
    // Hypervisor handle used to run the vCPU and to get/set its state.
    fd: Arc<dyn hypervisor::Vcpu>,
    // Index of this vCPU; also used to name its thread and its snapshot.
    id: u8,
    // Bus receiving the port I/O exits of this vCPU.
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<devices::Bus>,
    // Bus receiving the MMIO exits of this vCPU.
    mmio_bus: Arc<devices::Bus>,
    // Notified on IOAPIC end-of-interrupt exits, when present.
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
    // Reference point for the timestamps logged with debug I/O port codes.
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    vm_ts: std::time::Instant,
    // Value returned by `arch::configure_vcpu`; 0 until `configure` ran.
    #[cfg(target_arch = "aarch64")]
    mpidr: u64,
    // State captured by `pause()` or loaded by `restore()`, re-applied by `resume()`.
    saved_state: Option<CpuState>,
}

impl Vcpu {
    /// Constructs a new VCPU for `vm`.
    ///
    /// # Arguments
    ///
    /// * `id` - Represents the CPU number between [0, max vcpus).
    /// * `vm` - The virtual machine this vcpu will get attached to.
    pub fn new(
        id: u8,
        fd: &Arc<dyn hypervisor::Vm>,
        #[cfg(target_arch = "x86_64")] io_bus: Arc<devices::Bus>,
        mmio_bus: Arc<devices::Bus>,
        interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
        creation_ts: std::time::Instant,
    ) -> Result<Arc<Mutex<Self>>> {
        let kvm_vcpu = fd.create_vcpu(id).map_err(|e| Error::VcpuFd(e.into()))?;
        // Initially the cpuid per vCPU is the one supported by this VM.
        Ok(Arc::new(Mutex::new(Vcpu {
            fd: kvm_vcpu,
            id,
            #[cfg(target_arch = "x86_64")]
            io_bus,
            mmio_bus,
            interrupt_controller,
            vm_ts: creation_ts,
            #[cfg(target_arch = "aarch64")]
            mpidr: 0,
            saved_state: None,
        })))
    }

    /// Applies the architecture specific boot configuration to this vCPU.
    /// Should be called once per vcpu when created.
    ///
    /// # Arguments
    ///
    /// * `vm_fd` - (aarch64) The VM this vcpu belongs to.
    /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used.
    /// * `vm_memory` - Guest memory.
    /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
    ///
    /// Any failure of `arch::configure_vcpu` is reported as
    /// `Error::VcpuConfiguration`.
    pub fn configure(
        &mut self,
        #[cfg(target_arch = "aarch64")] vm_fd: &Arc<dyn hypervisor::Vm>,
        kernel_entry_point: Option<EntryPoint>,
        vm_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        #[cfg(target_arch = "x86_64")] cpuid: CpuId,
    ) -> Result<()> {
        #[cfg(target_arch = "aarch64")]
        {
            // On aarch64 the architecture code hands back the MPIDR value,
            // which is kept for `get_mpidr()`.
            let mpidr =
                arch::configure_vcpu(&self.fd, self.id, vm_fd, kernel_entry_point, vm_memory)
                    .map_err(Error::VcpuConfiguration)?;
            self.mpidr = mpidr;
        }

        #[cfg(target_arch = "x86_64")]
        {
            arch::configure_vcpu(&self.fd, self.id, kernel_entry_point, vm_memory, cpuid)
                .map_err(Error::VcpuConfiguration)?;
        }

        Ok(())
    }

    /// Gets the MPIDR register value.
    ///
    /// This is the value stored by `configure()`; it is 0 on a vCPU that has
    /// not been configured yet.
    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidr(&self) -> u64 {
        self.mpidr
    }

    /// Runs the VCPU until it exits, returning the reason.
    ///
    /// Note that the state of the VCPU and associated VM must be setup first for this to do
    /// anything useful.
    pub fn run(&self) -> Result<bool> {
        match self.fd.run() {
            Ok(run) => match run {
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoIn(addr, data) => {
                    self.io_bus.read(u64::from(addr), data);
                    Ok(true)
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoOut(addr, data) => {
                    if addr == DEBUG_IOPORT && data.len() == 1 {
                        self.log_debug_ioport(data[0]);
                    }
                    self.io_bus.write(u64::from(addr), data);
                    Ok(true)
                }
                VcpuExit::MmioRead(addr, data) => {
                    self.mmio_bus.read(addr as u64, data);
                    Ok(true)
                }
                VcpuExit::MmioWrite(addr, data) => {
                    self.mmio_bus.write(addr as u64, data);
                    Ok(true)
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoapicEoi(vector) => {
                    if let Some(interrupt_controller) = &self.interrupt_controller {
                        interrupt_controller
                            .lock()
                            .unwrap()
                            .end_of_interrupt(vector);
                    }
                    Ok(true)
                }
                VcpuExit::Shutdown => {
                    // Triple fault to trigger a reboot
                    Ok(false)
                }
                #[cfg(target_arch = "aarch64")]
                VcpuExit::SystemEvent(event_type, flags) => {
                    // On Aarch64, when the VM is shutdown, run() returns
                    // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN
                    if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
                        Ok(false)
                    } else {
                        error!(
                            "Unexpected system event with type 0x{:x}, flags 0x{:x}",
                            event_type, flags
                        );
                        Err(Error::VcpuUnhandledKvmExit)
                    }
                }
                r => {
                    error!("Unexpected exit reason on vcpu run: {:?}", r);
                    Err(Error::VcpuUnhandledKvmExit)
                }
            },

            Err(ref e) => match e.errno() {
                libc::EAGAIN | libc::EINTR => Ok(true),
                _ => {
                    error!("VCPU {:?} error {:?}", self.id, e);
                    Err(Error::VcpuUnhandledKvmExit)
                }
            },
        }
    }

    #[cfg(target_arch = "x86_64")]
    // Log debug io port codes.
    //
    // Each code is logged with the range it belongs to and the time elapsed
    // since `vm_ts`, formatted as `<seconds>.<microseconds>`.
    fn log_debug_ioport(&self, code: u8) {
        let ts = self.vm_ts.elapsed();

        debug!(
            "[{} code 0x{:x}] {}.{:>06} seconds",
            DebugIoPortRange::from_u8(code),
            code,
            ts.as_secs(),
            // Only the sub-second part belongs after the decimal point;
            // `as_micros()` would print the whole duration in microseconds.
            ts.subsec_micros()
        );
    }
}

// Identifier reported by `Snapshottable::id()` for a vCPU; it is also the
// prefix of the data section id ("vcpu-section") used by snapshot/restore.
const VCPU_SNAPSHOT_ID: &str = "vcpu";
impl Pausable for Vcpu {
    /// Captures the current hypervisor state of the vCPU into `saved_state`.
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        let state = self
            .fd
            .cpu_state()
            .map_err(|e| MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e)))?;
        self.saved_state = Some(state);

        Ok(())
    }

    /// Pushes the saved state, if any, back to the hypervisor vCPU.
    ///
    /// A vCPU that was never paused or restored has nothing to apply and
    /// resumes successfully.
    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        let vcpu_state = match &self.saved_state {
            Some(vcpu_state) => vcpu_state,
            None => return Ok(()),
        };

        // NOTE(review): a failure to resume is reported with the `Pause`
        // variant — confirm whether a dedicated resume variant should be
        // used instead.
        self.fd.set_cpu_state(vcpu_state).map_err(|e| {
            MigratableError::Pause(anyhow!("Could not set the vCPU state {:?}", e))
        })?;

        Ok(())
    }
}
impl Snapshottable for Vcpu {
    fn id(&self) -> String {
        VCPU_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&self) -> std::result::Result<Snapshot, MigratableError> {
        let snapshot = serde_json::to_vec(&self.saved_state)
            .map_err(|e| MigratableError::Snapshot(e.into()))?;

        let mut vcpu_snapshot = Snapshot::new(&format!("{}", self.id));
        vcpu_snapshot.add_data_section(SnapshotDataSection {
            id: format!("{}-section", VCPU_SNAPSHOT_ID),
            snapshot,
        });

        Ok(vcpu_snapshot)
    }

    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        if let Some(vcpu_section) = snapshot
            .snapshot_data
            .get(&format!("{}-section", VCPU_SNAPSHOT_ID))
        {
            let vcpu_state = match serde_json::from_slice(&vcpu_section.snapshot) {
                Ok(state) => state,
                Err(error) => {
                    return Err(MigratableError::Restore(anyhow!(
                        "Could not deserialize the vCPU snapshot {}",
                        error
                    )))
                }
            };

            self.saved_state = Some(vcpu_state);

            Ok(())
        } else {
            Err(MigratableError::Restore(anyhow!(
                "Could not find the vCPU snapshot section"
            )))
        }
    }
}

/// Owns every vCPU of the VM together with the threads running them, and is
/// itself registered as a bus device so the guest can query and acknowledge
/// CPU hotplug events.
pub struct CpuManager {
    // Boot/max vCPU counts and optional topology.
    config: CpusConfig,
    // Buses handed to every vCPU that gets created.
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<devices::Bus>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    mmio_bus: Arc<devices::Bus>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    vm_memory: GuestMemoryAtomic<GuestMemoryMmap>,
    // CPUID computed once by `patch_cpuid`, cloned for every vCPU.
    #[cfg(target_arch = "x86_64")]
    cpuid: CpuId,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    fd: Arc<dyn hypervisor::Vm>,
    // Set by `shutdown()` to make every vCPU thread leave its run loop.
    vcpus_kill_signalled: Arc<AtomicBool>,
    // While set, vCPU threads park themselves instead of running the guest.
    vcpus_pause_signalled: Arc<AtomicBool>,
    // Written by a vCPU thread when its vCPU reports a shutdown exit.
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    reset_evt: EventFd,
    // One entry per possible vCPU (`max_vcpus`), indexed by vCPU id.
    vcpu_states: Vec<VcpuState>,
    // vCPU id last written by the guest to the selection register.
    selected_cpu: u8,
    // Every vCPU created so far, in creation order.
    vcpus: Vec<Arc<Mutex<Vcpu>>>,
}

// Bit positions within the byte exchanged at `CPU_STATUS_OFFSET`.
const CPU_ENABLE_FLAG: usize = 0;
const CPU_INSERTING_FLAG: usize = 1;
const CPU_REMOVING_FLAG: usize = 2;
const CPU_EJECT_FLAG: usize = 3;

// Offsets of the registers served by the `BusDevice` impl of `CpuManager`.
const CPU_STATUS_OFFSET: u64 = 4;
const CPU_SELECTION_OFFSET: u64 = 0;

impl BusDevice for CpuManager {
    /// Serves guest reads of the CPU hotplug registers.
    ///
    /// Only `CPU_STATUS_OFFSET` is readable: the first byte of `data`
    /// receives the enable/inserting/removing flags of the selected vCPU.
    /// Every other byte, and the whole buffer for any other offset or for a
    /// vCPU that is not present, reads as zero.
    fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
        // `Vcpu::run` hands the exit buffer to the bus untouched, so it may
        // still hold stale bytes: start from a clean slate instead of OR-ing
        // flags into whatever is there.
        for byte in data.iter_mut() {
            *byte = 0;
        }

        match offset {
            CPU_STATUS_OFFSET => {
                if self.selected_cpu < self.present_vcpus() {
                    let state = &self.vcpu_states[usize::from(self.selected_cpu)];

                    let mut status = 0u8;
                    if state.active() {
                        status |= 1 << CPU_ENABLE_FLAG;
                    }
                    if state.inserting {
                        status |= 1 << CPU_INSERTING_FLAG;
                    }
                    if state.removing {
                        status |= 1 << CPU_REMOVING_FLAG;
                    }

                    // Tolerate an empty buffer rather than panicking on it.
                    if let Some(first) = data.first_mut() {
                        *first = status;
                    }
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#}",
                    offset
                );
            }
        }
    }

    /// Serves guest writes to the CPU hotplug registers.
    ///
    /// * `CPU_SELECTION_OFFSET` selects the vCPU later accesses refer to.
    /// * `CPU_STATUS_OFFSET` acknowledges a pending insertion/removal of the
    ///   selected vCPU and, with the eject bit, stops its thread.
    ///
    /// Only the first byte of `data` is interpreted. Writes that carry no
    /// data or that target a vCPU id without a state entry are ignored with
    /// a warning: both are under guest control and must not bring down the
    /// VMM.
    fn write(&mut self, _base: u64, offset: u64, data: &[u8]) {
        let value = match data.first() {
            Some(value) => *value,
            None => {
                warn!("Empty write to CPU manager device at offset {:#}", offset);
                return;
            }
        };

        match offset {
            CPU_SELECTION_OFFSET => {
                self.selected_cpu = value;
            }
            CPU_STATUS_OFFSET => {
                // The selection register accepts any u8, while there are only
                // as many state entries as possible vCPUs.
                let selected = usize::from(self.selected_cpu);
                if selected >= self.vcpu_states.len() {
                    warn!(
                        "Ignoring status write for out of range vCPU: {}",
                        self.selected_cpu
                    );
                    return;
                }

                let state = &mut self.vcpu_states[selected];
                // The ACPI code writes back a 1 to acknowledge the insertion
                if value & (1 << CPU_INSERTING_FLAG) != 0 {
                    state.inserting = false;
                }
                // Ditto for removal
                if value & (1 << CPU_REMOVING_FLAG) != 0 {
                    state.removing = false;
                }
                // Trigger removal of vCPU
                if value & (1 << CPU_EJECT_FLAG) != 0 {
                    if let Err(e) = self.remove_vcpu(self.selected_cpu) {
                        error!("Error removing vCPU: {:?}", e);
                    }
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#}",
                    offset
                );
            }
        }
    }
}

/// Book-keeping for one vCPU slot of the `CpuManager`.
#[derive(Default)]
struct VcpuState {
    // Hotplug in progress: set when the vCPU is started as a hot-added CPU,
    // cleared when the guest acknowledges it.
    inserting: bool,
    // Hot-unplug requested: cleared when the guest acknowledges it.
    removing: bool,
    // Handle of the thread running the vCPU; `None` while the slot is inactive.
    handle: Option<thread::JoinHandle<()>>,
    // Asks this particular vCPU thread to leave its run loop.
    kill: Arc<AtomicBool>,
    // Set by the vCPU thread whenever it is parked or about to exit, i.e.
    // not running guest code.
    vcpu_run_interrupted: Arc<AtomicBool>,
}

impl VcpuState {
    fn active(&self) -> bool {
        self.handle.is_some()
    }

    fn signal_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            loop {
                unsafe {
                    libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
                }
                if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
                    break;
                } else {
                    // This is more effective than thread::yield_now() at
                    // avoiding a priority inversion with the vCPU thread
                    thread::sleep(std::time::Duration::from_millis(1));
                }
            }
        }
    }

    fn join_thread(&mut self) -> Result<()> {
        if let Some(handle) = self.handle.take() {
            handle.join().map_err(Error::ThreadCleanup)?
        }

        Ok(())
    }

    fn unpark_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            handle.thread().unpark()
        }
    }
}

impl CpuManager {
    /// Builds the CPU manager and, on x86_64, registers it on the I/O bus.
    ///
    /// No vCPU is created here; the manager starts with one inactive
    /// `VcpuState` per possible vCPU (`config.max_vcpus`).
    ///
    /// # Errors
    ///
    /// On x86_64: `Error::PatchCpuId` when the supported CPUID cannot be
    /// retrieved, `Error::AllocateIOPort` when the I/O range cannot be
    /// reserved and `Error::BusError` when the bus insertion fails.
    // `hypervisor` is only used on x86_64, hence the lint exception.
    #[allow(unused_variables)]
    pub fn new(
        config: &CpusConfig,
        device_manager: &Arc<Mutex<DeviceManager>>,
        guest_memory: GuestMemoryAtomic<GuestMemoryMmap>,
        fd: Arc<dyn hypervisor::Vm>,
        reset_evt: EventFd,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
    ) -> Result<Arc<Mutex<CpuManager>>> {
        let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
        vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);

        // The device manager stays locked until the end of this function.
        let device_manager = device_manager.lock().unwrap();
        #[cfg(target_arch = "x86_64")]
        let cpuid = CpuManager::patch_cpuid(hypervisor, &config.topology)?;
        let cpu_manager = Arc::new(Mutex::new(CpuManager {
            config: config.clone(),
            #[cfg(target_arch = "x86_64")]
            io_bus: device_manager.io_bus().clone(),
            mmio_bus: device_manager.mmio_bus().clone(),
            interrupt_controller: device_manager.interrupt_controller().clone(),
            vm_memory: guest_memory,
            #[cfg(target_arch = "x86_64")]
            cpuid,
            fd,
            vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
            vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
            vcpu_states,
            reset_evt,
            selected_cpu: 0,
            vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
        }));

        // Reserve the I/O ports of the hotplug registers...
        // NOTE(review): 0x8 bytes are reserved here while 0xc bytes are
        // registered on the bus below — confirm which size is intended.
        #[cfg(target_arch = "x86_64")]
        device_manager
            .allocator()
            .lock()
            .unwrap()
            .allocate_io_addresses(Some(GuestAddress(0x0cd8)), 0x8, None)
            .ok_or(Error::AllocateIOPort)?;

        // ...and route accesses to them to the manager's `BusDevice` impl.
        #[cfg(target_arch = "x86_64")]
        cpu_manager
            .lock()
            .unwrap()
            .io_bus
            .insert(cpu_manager.clone(), 0x0cd8, 0xc)
            .map_err(Error::BusError)?;

        Ok(cpu_manager)
    }

    #[cfg(target_arch = "x86_64")]
    fn patch_cpuid(
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        topology: &Option<CpuTopology>,
    ) -> Result<CpuId> {
        let mut cpuid_patches = Vec::new();

        // Patch tsc deadline timer bit
        cpuid_patches.push(CpuidPatch {
            function: 1,
            index: 0,
            flags_bit: None,
            eax_bit: None,
            ebx_bit: None,
            ecx_bit: Some(TSC_DEADLINE_TIMER_ECX_BIT),
            edx_bit: None,
        });

        // Patch hypervisor bit
        cpuid_patches.push(CpuidPatch {
            function: 1,
            index: 0,
            flags_bit: None,
            eax_bit: None,
            ebx_bit: None,
            ecx_bit: Some(HYPERVISOR_ECX_BIT),
            edx_bit: None,
        });

        // Supported CPUID
        let mut cpuid = hypervisor
            .get_cpuid()
            .map_err(|e| Error::PatchCpuId(e.into()))?;

        CpuidPatch::patch_cpuid(&mut cpuid, cpuid_patches);

        if let Some(t) = topology {
            arch::x86_64::update_cpuid_topology(
                &mut cpuid,
                t.threads_per_core,
                t.cores_per_die,
                t.dies_per_package,
            );
        }

        Ok(cpuid)
    }

    /// Creates vCPU `cpu_id` and records it in the manager's vCPU list.
    ///
    /// With a `snapshot` the vCPU gets its CPUID set (x86_64) and its state
    /// loaded from the snapshot; otherwise it is configured for a fresh boot
    /// with `entry_point`.
    ///
    /// # Errors
    ///
    /// * Whatever `Vcpu::new` and `Vcpu::configure` report.
    /// * `Error::SetSupportedCpusFailed` when the CPUID cannot be set on a
    ///   restored vCPU.
    ///
    /// # Panics
    ///
    /// Panics when the snapshot cannot be restored.
    fn create_vcpu(
        &mut self,
        cpu_id: u8,
        entry_point: Option<EntryPoint>,
        snapshot: Option<Snapshot>,
    ) -> Result<Arc<Mutex<Vcpu>>> {
        info!("Creating vCPU: cpu_id = {}", cpu_id);

        let interrupt_controller = self.interrupt_controller.clone();
        let creation_ts = std::time::Instant::now();

        let vcpu = Vcpu::new(
            cpu_id,
            &self.fd,
            #[cfg(target_arch = "x86_64")]
            self.io_bus.clone(),
            self.mmio_bus.clone(),
            interrupt_controller,
            creation_ts,
        )?;

        if let Some(snapshot) = snapshot {
            #[cfg(target_arch = "x86_64")]
            {
                // Give the vCPU the manager's CPUID with its own id in EDX
                // of functions 0xb and 0x1f.
                let mut cpuid = self.cpuid.clone();
                CpuidPatch::set_cpuid_reg(&mut cpuid, 0xb, None, CpuidReg::EDX, u32::from(cpu_id));
                CpuidPatch::set_cpuid_reg(&mut cpuid, 0x1f, None, CpuidReg::EDX, u32::from(cpu_id));

                vcpu.lock()
                    .unwrap()
                    .fd
                    .set_cpuid2(&cpuid)
                    .map_err(|e| Error::SetSupportedCpusFailed(e.into()))?;
            }
            // `Error` has no variant able to carry a restore failure, so this
            // remains a panic.
            vcpu.lock()
                .unwrap()
                .restore(snapshot)
                .expect("Failed to restore vCPU");
        } else {
            // `configure` already reports failures through this module's
            // `Error`, so hand them to the caller instead of panicking.
            #[cfg(target_arch = "x86_64")]
            vcpu.lock()
                .unwrap()
                .configure(entry_point, &self.vm_memory, self.cpuid.clone())?;

            #[cfg(target_arch = "aarch64")]
            vcpu.lock()
                .unwrap()
                .configure(&self.fd, entry_point, &self.vm_memory)?;
        }

        // Adding vCPU to the CpuManager's vCPU list.
        self.vcpus.push(Arc::clone(&vcpu));

        Ok(vcpu)
    }

    /// Makes sure at least `desired_vcpus` vCPUs have been created.
    ///
    /// vCPUs that already exist, active or not, are reused: only the ids
    /// from the current number of created vCPUs up to `desired_vcpus` get
    /// created. Fails with `Error::DesiredVCPUCountExceedsMax` when more
    /// vCPUs than `max_vcpus` are requested.
    fn create_vcpus(&mut self, desired_vcpus: u8, entry_point: Option<EntryPoint>) -> Result<()> {
        let max_vcpus = self.config.max_vcpus;

        info!(
            "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
            desired_vcpus,
            max_vcpus,
            self.vcpus.len(),
            self.present_vcpus()
        );

        if desired_vcpus > max_vcpus {
            return Err(Error::DesiredVCPUCountExceedsMax);
        }

        // Only create vCPUs in excess of all the allocated vCPUs.
        let first_new_id = self.vcpus.len() as u8;
        (first_new_id..desired_vcpus)
            .try_for_each(|cpu_id| self.create_vcpu(cpu_id, entry_point, None).map(|_| ()))
    }

    /// Spawns the thread that runs `vcpu` and records its handle.
    ///
    /// The thread first waits on `vcpu_thread_barrier`, then loops: park
    /// while a pause is requested, stop when a kill is requested, otherwise
    /// run the vCPU once. `inserting` is stored in the vCPU state and tells
    /// the guest, through the hotplug registers, that the CPU is being
    /// hot-added.
    ///
    /// Fails with `Error::VcpuSpawn` when the thread cannot be created.
    fn start_vcpu(
        &mut self,
        vcpu: Arc<Mutex<Vcpu>>,
        vcpu_thread_barrier: Arc<Barrier>,
        inserting: bool,
    ) -> Result<()> {
        let cpu_id = vcpu.lock().unwrap().id;
        let reset_evt = self.reset_evt.try_clone().unwrap();
        let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
        let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();

        let vcpu_kill = self.vcpu_states[usize::from(cpu_id)].kill.clone();
        let vcpu_run_interrupted = self.vcpu_states[usize::from(cpu_id)]
            .vcpu_run_interrupted
            .clone();

        info!("Starting vCPU: cpu_id = {}", cpu_id);

        let handle = Some(
            thread::Builder::new()
                .name(format!("vcpu{}", cpu_id))
                .spawn(move || {
                    extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
                    // This uses an async signal safe handler to kill the vcpu handles.
                    register_signal_handler(SIGRTMIN(), handle_signal)
                        .expect("Failed to register vcpu signal handler");

                    // Block until all CPUs are ready.
                    vcpu_thread_barrier.wait();

                    loop {
                        // If we are being told to pause, we park the thread
                        // until the pause boolean is toggled.
                        // The resume operation is responsible for toggling
                        // the boolean and unpark the thread.
                        // We enter a loop because park() could spuriously
                        // return. We will then park() again unless the
                        // pause boolean has been toggled.
                        if vcpu_pause_signalled.load(Ordering::SeqCst) {
                            vcpu_run_interrupted.store(true, Ordering::SeqCst);
                            while vcpu_pause_signalled.load(Ordering::SeqCst) {
                                thread::park();
                            }
                            vcpu_run_interrupted.store(false, Ordering::SeqCst);
                        }

                        // We've been told to terminate
                        if vcpu_kill_signalled.load(Ordering::SeqCst)
                            || vcpu_kill.load(Ordering::SeqCst)
                        {
                            vcpu_run_interrupted.store(true, Ordering::SeqCst);
                            break;
                        }

                        // vcpu.run() returns false on a KVM_EXIT_SHUTDOWN (triple-fault) so trigger a reset
                        match vcpu.lock().unwrap().run() {
                            Err(e) => {
                                error!("VCPU generated error: {:?}", e);
                                // Like every other exit path, flag that the
                                // vCPU no longer runs: `signal_thread()`
                                // loops until it sees this flag and would
                                // otherwise never return for this thread.
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }
                            Ok(true) => {}
                            Ok(false) => {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                reset_evt.write(1).unwrap();
                                break;
                            }
                        }

                        // We've been told to terminate
                        if vcpu_kill_signalled.load(Ordering::SeqCst)
                            || vcpu_kill.load(Ordering::SeqCst)
                        {
                            vcpu_run_interrupted.store(true, Ordering::SeqCst);
                            break;
                        }
                    }
                })
                .map_err(Error::VcpuSpawn)?,
        );

        // The caller passes `inserting = true` for hotplug CPU additions only;
        // the flag stays set until the guest acknowledges the insertion.
        self.vcpu_states[usize::from(cpu_id)].handle = handle;
        self.vcpu_states[usize::from(cpu_id)].inserting = inserting;

        Ok(())
    }

    /// Start up as many vCPUs threads as needed to reach `desired_vcpus`
    ///
    /// The vCPUs must have been created beforehand. All the threads started
    /// by one call are released together through a barrier once every one
    /// of them has been spawned. `inserting` is forwarded to `start_vcpu`.
    ///
    /// Fails with `Error::DesiredVCPUCountExceedsMax` when more vCPUs than
    /// `max_vcpus` are requested, or with the error of `start_vcpu`.
    fn activate_vcpus(&mut self, desired_vcpus: u8, inserting: bool) -> Result<()> {
        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCPUCountExceedsMax);
        }

        let present_vcpus = self.present_vcpus();

        // One barrier slot per thread to start plus one for this thread.
        // The sum is computed as `usize`: in `u8` it overflows when 255
        // vCPUs get started at once and underflows when fewer vCPUs than
        // currently present are requested.
        let threads_to_start = usize::from(desired_vcpus.saturating_sub(present_vcpus));
        let vcpu_thread_barrier = Arc::new(Barrier::new(threads_to_start + 1));

        info!(
            "Starting vCPUs: desired = {}, allocated = {}, present = {}",
            desired_vcpus,
            self.vcpus.len(),
            present_vcpus
        );

        // This reuses any inactive vCPUs as well as any that were newly created
        for cpu_id in present_vcpus..desired_vcpus {
            let vcpu = Arc::clone(&self.vcpus[usize::from(cpu_id)]);
            self.start_vcpu(vcpu, vcpu_thread_barrier.clone(), inserting)?;
        }

        // Unblock all CPU threads.
        vcpu_thread_barrier.wait();
        Ok(())
    }

    /// Flags every vCPU with an id from `desired_vcpus` up to the number of
    /// present vCPUs as being removed.
    ///
    /// Nothing is stopped here: the actual removal happens when the guest
    /// ejects the CPU.
    fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) -> Result<()> {
        let present_vcpus = usize::from(self.present_vcpus());

        self.vcpu_states
            .iter_mut()
            .take(present_vcpus)
            .skip(usize::from(desired_vcpus))
            .for_each(|state| state.removing = true);

        Ok(())
    }

    /// Stops the thread of vCPU `cpu_id` and leaves its slot inactive.
    ///
    /// The thread is asked to terminate through its `kill` flag, signalled
    /// until it left the guest, then joined. Fails with
    /// `Error::ThreadCleanup` when the thread panicked.
    fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
        info!("Removing vCPU: cpu_id = {}", cpu_id);

        let vcpu_state = &mut self.vcpu_states[usize::from(cpu_id)];
        vcpu_state.kill.store(true, Ordering::SeqCst);
        vcpu_state.signal_thread();
        vcpu_state.join_thread()?;
        vcpu_state.handle = None;

        // With the thread gone, reset "kill" so that the slot can be reused.
        vcpu_state.kill.store(false, Ordering::SeqCst);

        Ok(())
    }

    /// Creates the vCPUs the VM boots with, configured to start at
    /// `entry_point`.
    pub fn create_boot_vcpus(&mut self, entry_point: EntryPoint) -> Result<()> {
        let boot_vcpus = self.boot_vcpus();
        self.create_vcpus(boot_vcpus, Some(entry_point))
    }

    // Spawns the threads of all the boot vCPUs and returns once every one of
    // them has been released to run. Boot vCPUs are not flagged as inserting.
    pub fn start_boot_vcpus(&mut self) -> Result<()> {
        let boot_vcpus = self.boot_vcpus();
        self.activate_vcpus(boot_vcpus, false)
    }

    /// Brings the number of vCPUs towards `desired_vcpus`.
    ///
    /// Growing creates and starts the missing vCPUs as hot-added CPUs;
    /// shrinking only marks the surplus vCPUs for removal. Returns
    /// `Ok(true)` when something changed and `Ok(false)` when the VM already
    /// has `desired_vcpus` present vCPUs.
    pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
        let ordering = desired_vcpus.cmp(&self.present_vcpus());

        if ordering == cmp::Ordering::Equal {
            return Ok(false);
        }

        if ordering == cmp::Ordering::Greater {
            self.create_vcpus(desired_vcpus, None)?;
            self.activate_vcpus(desired_vcpus, true)?;
        } else {
            self.mark_vcpus_for_removal(desired_vcpus)?;
        }

        Ok(true)
    }

    /// Stops and joins every vCPU thread.
    ///
    /// Afterwards the manager has no vCPU state entries left. Fails with
    /// `Error::ThreadCleanup` as soon as one of the threads turns out to
    /// have panicked.
    pub fn shutdown(&mut self) -> Result<()> {
        // Ask the vCPU threads to stop on their next trip through the loop...
        self.vcpus_kill_signalled.store(true, Ordering::SeqCst);

        // ...and lift any pause so that parked threads get to see the request.
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the VCPU threads.
        for vcpu_state in &self.vcpu_states {
            vcpu_state.unpark_thread();
        }

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for vcpu_state in &self.vcpu_states {
            vcpu_state.signal_thread();
        }

        // Wait for all the threads to finish. This removes the state from the vector.
        for mut vcpu_state in self.vcpu_states.drain(..) {
            vcpu_state.join_thread()?;
        }

        Ok(())
    }

    /// Number of vCPUs the VM is configured to boot with.
    pub fn boot_vcpus(&self) -> u8 {
        self.config.boot_vcpus
    }

    /// Maximum number of vCPUs the VM is configured for.
    pub fn max_vcpus(&self) -> u8 {
        self.config.max_vcpus
    }

    /// Number of vCPUs that currently have a running thread.
    fn present_vcpus(&self) -> u8 {
        // There is one state per possible vCPU and `max_vcpus` is a `u8`, so
        // the count always fits.
        self.vcpu_states
            .iter()
            .filter(|state| state.active())
            .count() as u8
    }

    /// Collects the MPIDR value of every created vCPU, in creation order.
    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidrs(&self) -> Vec<u64> {
        self.vcpus
            .iter()
            .map(|vcpu| vcpu.lock().unwrap().get_mpidr())
            .collect()
    }

    /// Builds the "APIC" ACPI table describing the interrupt controllers.
    ///
    /// The table holds one local APIC entry per possible vCPU, of which only
    /// the boot vCPUs are flagged as enabled, followed by one I/O APIC entry
    /// and one interrupt source override.
    ///
    /// # Panics
    ///
    /// Panics if the configuration asks for more boot vCPUs than `max_vcpus`.
    #[cfg(feature = "acpi")]
    pub fn create_madt(&self) -> SDT {
        let boot_vcpus = self.config.boot_vcpus;
        let max_vcpus = self.config.max_vcpus;

        // This is also checked in the commandline parsing.
        assert!(boot_vcpus <= max_vcpus);

        let mut madt = SDT::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
        madt.write(36, layout::APIC_START);

        for cpu in 0..max_vcpus {
            // CPUs beyond the boot ones are listed but left disabled.
            let flags: u32 = if cpu < boot_vcpus {
                1 << MADT_CPU_ENABLE_FLAG
            } else {
                0
            };

            madt.append(LocalAPIC {
                r#type: 0,
                length: 8,
                processor_id: cpu,
                apic_id: cpu,
                flags,
            });
        }

        let ioapic = IOAPIC {
            r#type: 1,
            length: 12,
            ioapic_id: 0,
            apic_address: layout::IOAPIC_START.0 as u32,
            gsi_base: 0,
            ..Default::default()
        };
        madt.append(ioapic);

        let interrupt_override = InterruptSourceOverride {
            r#type: 2,
            length: 10,
            bus: 0,
            source: 4,
            gsi: 4,
            flags: 0,
        };
        madt.append(interrupt_override);

        madt
    }
}

/// ACPI description of a single vCPU, turned into AML by its `Aml` impl.
#[cfg(feature = "acpi")]
struct CPU {
    // Index of the vCPU; used for the AML device name, its `_UID` and the
    // APIC ids of its `_MAT` entry.
    cpu_id: u8,
}

/// Bit position of the "enabled" flag inside the `flags` field of a
/// `LocalAPIC` entry; always used as `1 << MADT_CPU_ENABLE_FLAG`.
#[cfg(feature = "acpi")]
const MADT_CPU_ENABLE_FLAG: usize = 0;

#[cfg(feature = "acpi")]
impl Aml for CPU {
    /// Serializes this vCPU as an AML device named `Cnnn` (zero-padded
    /// `cpu_id`) carrying `_HID`, `_UID`, `_STA`, `_MAT` and `_EJ0`.
    ///
    /// `_STA` and `_EJ0` simply forward to the shared `CSTA` / `CEJ0` methods,
    /// passing this vCPU's id.
    fn to_aml_bytes(&self) -> Vec<u8> {
        // The Linux kernel expects every CPU device to have a _MAT entry
        // containing the LAPIC for this processor with the enabled bit set
        // even if it is disabled in the MADT (non-boot CPU)
        let mat_entry = LocalAPIC {
            r#type: 0,
            length: 8,
            processor_id: self.cpu_id,
            apic_id: self.cpu_id,
            flags: 1 << MADT_CPU_ENABLE_FLAG,
        };

        // Copy the in-memory representation of the entry into a byte buffer
        // of exactly the same size.
        let mut mat_bytes = vec![0u8; std::mem::size_of_val(&mat_entry)];
        // NOTE(review): the buffer is sized to hold exactly one `LocalAPIC`;
        // this write also assumes `LocalAPIC` has an alignment of 1 (e.g. a
        // packed layout) -- confirm against the struct definition.
        unsafe { *(mat_bytes.as_mut_ptr() as *mut LocalAPIC) = mat_entry };

        let device_name = format!("C{:03}", self.cpu_id);

        let hid = aml::Name::new("_HID".into(), &"ACPI0007");
        let uid = aml::Name::new("_UID".into(), &self.cpu_id);

        /*
        _STA return value:
        Bit [0] – Set if the device is present.
        Bit [1] – Set if the device is enabled and decoding its resources.
        Bit [2] – Set if the device should be shown in the UI.
        Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
        Bit [4] – Set if the battery is present.
        Bits [31:5] – Reserved (must be cleared).
        */
        // Call into CSTA method which will interrogate device
        let sta_call = aml::MethodCall::new("CSTA".into(), vec![&self.cpu_id]);
        let sta_return = aml::Return::new(&sta_call);
        let sta = aml::Method::new("_STA".into(), 0, false, vec![&sta_return]);

        let mat = aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_bytes));

        // Trigger CPU ejection: call into CEJ0 method which will actually
        // eject device
        let eject_call = aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id]);
        let eject_return = aml::Return::new(&eject_call);
        let ej0 = aml::Method::new("_EJ0".into(), 1, false, vec![&eject_return]);

        let children: Vec<&dyn aml::Aml> = vec![&hid, &uid, &sta, &mat, &ej0];
        aml::Device::new(device_name.as_str().into(), children).to_aml_bytes()
    }
}

/// AML fragment that forwards a notification to the device object of the vCPU
/// identified by `cpu_id` (see its `Aml` implementation).
#[cfg(feature = "acpi")]
struct CPUNotify {
    // Selects both the value compared against Arg0 and the `Cnnn` target object.
    cpu_id: u8,
}

#[cfg(feature = "acpi")]
impl Aml for CPUNotify {
    /// Emits `If (Arg0 == cpu_id) { Notify (Cnnn, Arg1) }`, where `Cnnn` is
    /// the zero-padded device name of this vCPU.
    fn to_aml_bytes(&self) -> Vec<u8> {
        let selected_cpu = aml::Arg(0);
        let event = aml::Arg(1);
        let target = aml::Path::new(&format!("C{:03}", self.cpu_id));

        let notify = aml::Notify::new(&target, &event);
        let is_this_cpu = aml::Equal::new(&selected_cpu, &self.cpu_id);

        aml::If::new(&is_this_cpu, vec![&notify]).to_aml_bytes()
    }
}

/// Generator for the AML methods shared by all vCPU devices
/// (`CSTA`, `CTFY`, `CEJ0` and `CSCN`; see its `Aml` implementation).
#[cfg(feature = "acpi")]
struct CPUMethods {
    // Number of vCPU ids (`0..max_vcpus`) the generated methods iterate over.
    max_vcpus: u8,
}

#[cfg(feature = "acpi")]
impl Aml for CPUMethods {
    /// Serializes the four helper methods used by the per-vCPU devices:
    ///
    /// * `CSTA(cpu)` - selects `cpu` and returns `0xf` if its `CPEN` bit is
    ///   set, `0` otherwise.
    /// * `CTFY(cpu, event)` - forwards `event` to the device object of `cpu`.
    /// * `CEJ0(cpu)` - selects `cpu` and sets its `CEJ0` bit.
    /// * `CSCN()` - walks every vCPU id below `max_vcpus`, notifying and then
    ///   acknowledging any pending `CINS` / `CRMV` bit.
    ///
    /// Every method that touches the `\_SB_.PRES` fields holds the `CPLK`
    /// mutex for the whole select-then-access sequence. The mutex is acquired
    /// with a timeout of `0xffff`, which ACPI defines as "wait indefinitely";
    /// with a finite timeout the (unchecked) acquire could fail and the
    /// method would then touch the shared selector without the lock and
    /// release a mutex it does not own.
    fn to_aml_bytes(&self) -> Vec<u8> {
        let mut bytes = Vec::new();
        bytes.extend_from_slice(
            // CPU status method
            &aml::Method::new(
                "CSTA".into(),
                1,
                true,
                vec![
                    // Take lock defined above (0xffff: no timeout)
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    // Write CPU number (in first argument) to I/O port via field
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
                    &aml::Store::new(&aml::Local(0), &aml::ZERO),
                    // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning)
                    &aml::If::new(
                        &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
                        vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
                    ),
                    // Release lock
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                    // Return 0 or 0xf
                    &aml::Return::new(&aml::Local(0)),
                ],
            )
            .to_aml_bytes(),
        );

        // One `If (Arg0 == id) { Notify(...) }` fragment per possible vCPU.
        let cpu_notifies: Vec<CPUNotify> = (0..self.max_vcpus)
            .map(|cpu_id| CPUNotify { cpu_id })
            .collect();
        let cpu_notifies_refs: Vec<&dyn aml::Aml> = cpu_notifies
            .iter()
            .map(|notify| notify as &dyn aml::Aml)
            .collect();

        bytes.extend_from_slice(
            &aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(),
        );

        bytes.extend_from_slice(
            &aml::Method::new(
                "CEJ0".into(),
                1,
                true,
                vec![
                    // Take lock (0xffff: no timeout)
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    // Write CPU number (in first argument) to I/O port via field
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
                    // Set CEJ0 bit
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
                    // Release lock
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                ],
            )
            .to_aml_bytes(),
        );

        bytes.extend_from_slice(
            &aml::Method::new(
                "CSCN".into(),
                0,
                true,
                vec![
                    // Take lock defined above (0xffff: no timeout)
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    // Local0 is the loop counter over CPU numbers
                    &aml::Store::new(&aml::Local(0), &aml::ZERO),
                    &aml::While::new(
                        &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
                        vec![
                            // Write CPU number (loop counter in Local0) to I/O port via field
                            &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
                            // Check if CINS bit is set
                            &aml::If::new(
                                &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
                                // Notify device if it is
                                vec![
                                    &aml::MethodCall::new(
                                        "CTFY".into(),
                                        vec![&aml::Local(0), &aml::ONE],
                                    ),
                                    // Reset CINS bit (done by writing one to it)
                                    &aml::Store::new(
                                        &aml::Path::new("\\_SB_.PRES.CINS"),
                                        &aml::ONE,
                                    ),
                                ],
                            ),
                            // Check if CRMV bit is set
                            &aml::If::new(
                                &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
                                // Notify device if it is (with the eject constant 0x3)
                                vec![
                                    &aml::MethodCall::new(
                                        "CTFY".into(),
                                        vec![&aml::Local(0), &3u8],
                                    ),
                                    // Reset CRMV bit (done by writing one to it)
                                    &aml::Store::new(
                                        &aml::Path::new("\\_SB_.PRES.CRMV"),
                                        &aml::ONE,
                                    ),
                                ],
                            ),
                            // Advance to the next CPU number
                            &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
                        ],
                    ),
                    // Release lock
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                ],
            )
            .to_aml_bytes(),
        );
        bytes
    }
}

#[cfg(feature = "acpi")]
impl Aml for CpuManager {
    /// Serializes the two AML devices describing the vCPUs:
    ///
    /// * `_SB_.PRES`, the CPU hotplug controller: a mutex plus an I/O region
    ///   at port `0x0cd8` (`0x0c` bytes) exposed through named fields.
    /// * `_SB_.CPUS`, a container holding the shared `CPUMethods` and one
    ///   `CPU` device per id in `0..max_vcpus`.
    fn to_aml_bytes(&self) -> Vec<u8> {
        // CPU hotplug controller
        let mut bytes = aml::Device::new(
            "_SB_.PRES".into(),
            vec![
                &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
                // Mutex to protect concurrent access as we write to choose CPU and then read back status
                &aml::Mutex::new("CPLK".into(), 0),
                // I/O port for CPU controller
                &aml::Name::new(
                    "_CRS".into(),
                    &aml::ResourceTemplate::new(vec![&aml::IO::new(0x0cd8, 0x0cd8, 0x01, 0x0c)]),
                ),
                // OpRegion and Fields map I/O port into individual field values
                &aml::OpRegion::new("PRST".into(), aml::OpRegionSpace::SystemIO, 0x0cd8, 0x0c),
                // Byte-wide view: skip the first 32 bits, then the per-CPU
                // flag bits followed by the 8-bit CCMD field.
                &aml::Field::new(
                    "PRST".into(),
                    aml::FieldAccessType::Byte,
                    aml::FieldUpdateRule::WriteAsZeroes,
                    vec![
                        aml::FieldEntry::Reserved(32),
                        aml::FieldEntry::Named(*b"CPEN", 1),
                        aml::FieldEntry::Named(*b"CINS", 1),
                        aml::FieldEntry::Named(*b"CRMV", 1),
                        aml::FieldEntry::Named(*b"CEJ0", 1),
                        aml::FieldEntry::Reserved(4),
                        aml::FieldEntry::Named(*b"CCMD", 8),
                    ],
                ),
                // DWord-wide view of the same region: CSEL in the first 32
                // bits and CDAT in the last 32 bits.
                &aml::Field::new(
                    "PRST".into(),
                    aml::FieldAccessType::DWord,
                    aml::FieldUpdateRule::Preserve,
                    vec![
                        aml::FieldEntry::Named(*b"CSEL", 32),
                        aml::FieldEntry::Reserved(32),
                        aml::FieldEntry::Named(*b"CDAT", 32),
                    ],
                ),
            ],
        )
        .to_aml_bytes();

        // CPU devices
        let max_vcpus = self.config.max_vcpus;
        let container_hid = aml::Name::new("_HID".into(), &"ACPI0010");
        let container_cid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
        // Bundle methods together under a common object
        let shared_methods = CPUMethods { max_vcpus };
        let cpu_devices: Vec<CPU> = (0..max_vcpus).map(|cpu_id| CPU { cpu_id }).collect();

        // Identification first, then the shared methods, then every CPU device.
        let mut container_children: Vec<&dyn aml::Aml> =
            vec![&container_hid, &container_cid, &shared_methods];
        container_children.extend(cpu_devices.iter().map(|cpu| cpu as &dyn aml::Aml));

        bytes.extend_from_slice(
            &aml::Device::new("_SB_.CPUS".into(), container_children).to_aml_bytes(),
        );
        bytes
    }
}

impl Pausable for CpuManager {
    /// Requests every vCPU to pause.
    ///
    /// Sets the shared pause flag, signals each vCPU thread, then calls
    /// `pause()` on every vCPU object. On x86_64 each vCPU is additionally
    /// told that the guest clock has been paused.
    ///
    /// # Errors
    ///
    /// Propagates the first error returned by a vCPU's `pause()`, or returns
    /// `MigratableError::Pause` if the guest clock notification fails.
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        // Tell the vCPUs to pause themselves next time they exit
        self.vcpus_pause_signalled.store(true, Ordering::SeqCst);

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for vcpu_state in &self.vcpu_states {
            vcpu_state.signal_thread();
        }

        for vcpu_lock in &self.vcpus {
            let mut guard = vcpu_lock.lock().unwrap();
            guard.pause()?;
            #[cfg(target_arch = "x86_64")]
            guard.fd.notify_guest_clock_paused().map_err(|err| {
                MigratableError::Pause(anyhow!("Could not notify guest it has been paused {:?}", err))
            })?;
        }

        Ok(())
    }

    /// Resumes every vCPU.
    ///
    /// Calls `resume()` on every vCPU object, clears the shared pause flag
    /// and finally unparks each vCPU thread.
    ///
    /// # Errors
    ///
    /// Propagates the first error returned by a vCPU's `resume()`; in that
    /// case the pause flag is left set and no thread is unparked.
    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        for vcpu_lock in &self.vcpus {
            vcpu_lock.lock().unwrap().resume()?;
        }

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the VCPU threads.
        // Once unparked, the next thing they will do is checking for the pause
        // boolean. Since it'll be set to false, they will exit their pause loop
        // and go back to vmx root.
        for vcpu_state in &self.vcpu_states {
            vcpu_state.unpark_thread();
        }

        Ok(())
    }
}

impl Snapshottable for CpuManager {
    /// Returns the snapshot identifier of the CPU manager.
    fn id(&self) -> String {
        CPU_MANAGER_SNAPSHOT_ID.to_string()
    }

    /// Builds a snapshot that aggregates the snapshot of every vCPU.
    ///
    /// # Errors
    ///
    /// Propagates the first error returned by a vCPU's `snapshot()`.
    fn snapshot(&self) -> std::result::Result<Snapshot, MigratableError> {
        let mut cpu_manager_snapshot = Snapshot::new(CPU_MANAGER_SNAPSHOT_ID);

        // The CpuManager snapshot is a collection of all vCPUs snapshots.
        for vcpu in &self.vcpus {
            let cpu_snapshot = vcpu.lock().unwrap().snapshot()?;
            cpu_manager_snapshot.add_snapshot(cpu_snapshot);
        }

        Ok(cpu_manager_snapshot)
    }

    /// Recreates and starts one vCPU per entry of `snapshot.snapshots`, with
    /// the pause flag set, then releases them all through a shared barrier.
    ///
    /// The key of each entry is parsed as the vCPU id.
    ///
    /// # Errors
    ///
    /// Returns `MigratableError::Restore` if an entry's key is not a valid
    /// `u8` vCPU id, or if creating or starting a vCPU fails.
    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        // One participant per restored vCPU plus this thread.
        let vcpu_thread_barrier = Arc::new(Barrier::new(snapshot.snapshots.len() + 1));

        // Restore the vCPUs in "paused" state.
        self.vcpus_pause_signalled.store(true, Ordering::SeqCst);

        for (cpu_id, snapshot) in snapshot.snapshots.iter() {
            debug!("Restoring VCPU {}", cpu_id);
            // The id comes from snapshot data, so report a malformed value as
            // a restore error rather than panicking.
            let vcpu_id = cpu_id.parse::<u8>().map_err(|e| {
                MigratableError::Restore(anyhow!(
                    "Invalid vCPU id {:?} in snapshot: {:?}",
                    cpu_id,
                    e
                ))
            })?;
            // NOTE(review): if a later iteration fails, vCPUs started by
            // earlier iterations presumably stay blocked on the barrier --
            // confirm how callers handle a failed restore.
            let vcpu = self
                .create_vcpu(vcpu_id, None, Some(*snapshot.clone()))
                .map_err(|e| MigratableError::Restore(anyhow!("Could not create vCPU {:?}", e)))?;
            self.start_vcpu(vcpu, vcpu_thread_barrier.clone(), false)
                .map_err(|e| MigratableError::Restore(anyhow!("Could not restore vCPU {:?}", e)))?;
        }

        // Unblock all restored CPU threads.
        vcpu_thread_barrier.wait();
        Ok(())
    }
}

// No methods are overridden for these two traits; `CpuManager` uses whatever
// default implementations they provide.
impl Transportable for CpuManager {}
impl Migratable for CpuManager {}