// Copyright © 2020, Oracle and/or its affiliates. // // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. // // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE-BSD-3-Clause file. // // Copyright © 2019 Intel Corporation // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause // use crate::config::CpusConfig; #[cfg(feature = "guest_debug")] use crate::coredump::{ CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable, GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE, NT_PRSTATUS, }; use crate::device_manager::DeviceManager; #[cfg(feature = "gdb")] use crate::gdb::{get_raw_tid, Debuggable, DebuggableError}; use crate::memory_manager::MemoryManager; use crate::seccomp_filters::{get_seccomp_filter, Thread}; #[cfg(target_arch = "x86_64")] use crate::vm::physical_bits; use crate::GuestMemoryMmap; use crate::CPU_MANAGER_SNAPSHOT_ID; use acpi_tables::{aml, aml::Aml, sdt::Sdt}; use anyhow::anyhow; #[cfg(all(target_arch = "aarch64", feature = "gdb"))] use arch::aarch64::regs; use arch::EntryPoint; use arch::NumaNodes; use devices::interrupt_controller::InterruptController; #[cfg(all(target_arch = "aarch64", feature = "gdb"))] use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; #[cfg(all(target_arch = "x86_64", feature = "gdb"))] use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs}; #[cfg(all(target_arch = "aarch64", feature = "gdb"))] use hypervisor::aarch64::StandardRegisters; #[cfg(feature = "guest_debug")] use hypervisor::arch::x86::msr_index; #[cfg(target_arch = "x86_64")] use hypervisor::arch::x86::CpuIdEntry; #[cfg(feature = "guest_debug")] use hypervisor::arch::x86::MsrEntry; #[cfg(all(target_arch = "x86_64", feature = "gdb"))] use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters}; #[cfg(target_arch = "aarch64")] use hypervisor::kvm::kvm_bindings; #[cfg(feature = "tdx")] use hypervisor::kvm::{TdxExitDetails, TdxExitStatus}; use hypervisor::{CpuState, HypervisorCpuError, HypervisorType, VmExit, VmOps}; use libc::{c_void, siginfo_t}; #[cfg(feature = "guest_debug")] use linux_loader::elf::Elf64_Nhdr; use seccompiler::{apply_filter, SeccompAction}; use std::collections::BTreeMap; #[cfg(feature = "guest_debug")] use std::io::Write; #[cfg(feature = "guest_debug")] use std::mem::size_of; use std::os::unix::thread::JoinHandleExt; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Barrier, Mutex}; use std::{cmp, io, result, thread}; use thiserror::Error; use vm_device::BusDevice; #[cfg(feature = "guest_debug")] use vm_memory::ByteValued; #[cfg(feature = "gdb")] use vm_memory::{Bytes, GuestAddressSpace}; use vm_memory::{GuestAddress, GuestMemoryAtomic}; use vm_migration::{ Migratable, MigratableError, Pausable, Snapshot, SnapshotDataSection, Snapshottable, Transportable, }; use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN}; #[cfg(all(target_arch = "aarch64", feature = "gdb"))] /// Extract the specified bits of a 64-bit integer. /// For example, to extrace 2 bits from offset 1 (zero based) of `6u64`, /// following expression should return 3 (`0b11`): /// `extract_bits_64!(0b0000_0110u64, 1, 2)` /// macro_rules! 
extract_bits_64 {
    ($value: tt, $offset: tt, $length: tt) => {
        ($value >> $offset) & (!0u64 >> (64 - $length))
    };
}

pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;

#[derive(Debug, Error)]
pub enum Error {
    #[error("Error creating vCPU: {0}")]
    VcpuCreate(#[source] anyhow::Error),
    #[error("Error running vCPU: {0}")]
    VcpuRun(#[source] anyhow::Error),
    #[error("Error spawning vCPU thread: {0}")]
    VcpuSpawn(#[source] io::Error),
    #[error("Error generating common CPUID: {0}")]
    CommonCpuId(#[source] arch::Error),
    #[error("Error configuring vCPU: {0}")]
    VcpuConfiguration(#[source] arch::Error),
    #[cfg(target_arch = "aarch64")]
    #[error("Error fetching preferred target: {0}")]
    VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),
    #[cfg(target_arch = "aarch64")]
    #[error("Error initialising vCPU: {0}")]
    VcpuArmInit(#[source] hypervisor::HypervisorCpuError),
    #[error("Failed to join on vCPU threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + Send>),
    #[error("Error adding CpuManager to MMIO bus: {0}")]
    BusError(#[source] vm_device::BusError),
    #[error("Requested vCPUs exceed maximum")]
    DesiredVCpuCountExceedsMax,
    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),
    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),
    #[error("Error starting vCPU after restore: {0}")]
    StartRestoreVcpu(#[source] anyhow::Error),
    #[error("Unexpected VmExit")]
    UnexpectedVmExit,
    #[error("Failed to allocate MMIO address for CpuManager")]
    AllocateMmmioAddress,
    #[cfg(feature = "tdx")]
    #[error("Error initializing TDX: {0}")]
    InitializeTdx(#[source] hypervisor::HypervisorCpuError),
    #[cfg(target_arch = "aarch64")]
    #[error("Error initializing PMU: {0}")]
    InitPmu(#[source] hypervisor::HypervisorCpuError),
    #[cfg(feature = "gdb")]
    #[error("Error during CPU debug: {0}")]
    CpuDebug(#[source] hypervisor::HypervisorCpuError),
    #[cfg(feature = "gdb")]
    #[error("Error translating virtual address: {0}")]
    TranslateVirtualAddress(#[source] anyhow::Error),
    #[cfg(all(feature = "amx", target_arch = "x86_64"))]
    #[error("Error setting up AMX: {0}")]
    AmxEnable(#[source] anyhow::Error),
}
pub type Result<T> = result::Result<T, Error>;

#[cfg(target_arch = "x86_64")]
#[allow(dead_code)]
#[repr(packed)]
struct LocalApic {
    pub r#type: u8,
    pub length: u8,
    pub processor_id: u8,
    pub apic_id: u8,
    pub flags: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default)]
struct Ioapic {
    pub r#type: u8,
    pub length: u8,
    pub ioapic_id: u8,
    _reserved: u8,
    pub apic_address: u32,
    pub gsi_base: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct GicC {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub cpu_interface_number: u32,
    pub uid: u32,
    pub flags: u32,
    pub parking_version: u32,
    pub performance_interrupt: u32,
    pub parked_address: u64,
    pub base_address: u64,
    pub gicv_base_address: u64,
    pub gich_base_address: u64,
    pub vgic_interrupt: u32,
    pub gicr_base_address: u64,
    pub mpidr: u64,
    pub proc_power_effi_class: u8,
    pub reserved1: u8,
    pub spe_overflow_interrupt: u16,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct GicD {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub gic_id: u32,
    pub base_address: u64,
    pub global_irq_base: u32,
    pub version: u8,
    pub reserved1: [u8; 3],
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct GicR {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub base_address: u64,
    pub range_length: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct GicIts {
    pub
r#type: u8, pub length: u8, pub reserved0: u16, pub translation_id: u32, pub base_address: u64, pub reserved1: u32, } #[cfg(target_arch = "aarch64")] #[allow(dead_code)] #[repr(packed)] struct ProcessorHierarchyNode { pub r#type: u8, pub length: u8, pub reserved: u16, pub flags: u32, pub parent: u32, pub acpi_processor_id: u32, pub num_private_resources: u32, } #[allow(dead_code)] #[repr(packed)] #[derive(Default)] struct InterruptSourceOverride { pub r#type: u8, pub length: u8, pub bus: u8, pub source: u8, pub gsi: u32, pub flags: u16, } #[cfg(feature = "guest_debug")] macro_rules! round_up { ($n:expr,$d:expr) => { (($n / ($d + 1)) + 1) * $d }; } /// A wrapper around creating and using a kvm-based VCPU. pub struct Vcpu { // The hypervisor abstracted CPU. vcpu: Arc, id: u8, #[cfg(target_arch = "aarch64")] mpidr: u64, saved_state: Option, } impl Vcpu { /// Constructs a new VCPU for `vm`. /// /// # Arguments /// /// * `id` - Represents the CPU number between [0, max vcpus). /// * `vm` - The virtual machine this vcpu will get attached to. /// * `vm_ops` - Optional object for exit handling. pub fn new( id: u8, vm: &Arc, vm_ops: Option>, ) -> Result { let vcpu = vm .create_vcpu(id, vm_ops) .map_err(|e| Error::VcpuCreate(e.into()))?; // Initially the cpuid per vCPU is the one supported by this VM. Ok(Vcpu { vcpu, id, #[cfg(target_arch = "aarch64")] mpidr: 0, saved_state: None, }) } /// Configures a vcpu and should be called once per vcpu when created. /// /// # Arguments /// /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used. /// * `vm_memory` - Guest memory. /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure. pub fn configure( &mut self, #[cfg(target_arch = "aarch64")] vm: &Arc, kernel_entry_point: Option, #[cfg(target_arch = "x86_64")] vm_memory: &GuestMemoryAtomic, #[cfg(target_arch = "x86_64")] cpuid: Vec, #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, ) -> Result<()> { #[cfg(target_arch = "aarch64")] { self.init(vm)?; self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, kernel_entry_point) .map_err(Error::VcpuConfiguration)?; } info!("Configuring vCPU: cpu_id = {}", self.id); #[cfg(target_arch = "x86_64")] arch::configure_vcpu( &self.vcpu, self.id, kernel_entry_point, vm_memory, cpuid, kvm_hyperv, ) .map_err(Error::VcpuConfiguration)?; Ok(()) } /// Gets the MPIDR register value. #[cfg(target_arch = "aarch64")] pub fn get_mpidr(&self) -> u64 { self.mpidr } /// Gets the saved vCPU state. #[cfg(target_arch = "aarch64")] pub fn get_saved_state(&self) -> Option { self.saved_state.clone() } /// Initializes an aarch64 specific vcpu for booting Linux. #[cfg(target_arch = "aarch64")] pub fn init(&self, vm: &Arc) -> Result<()> { let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); // This reads back the kernel's preferred target type. vm.get_preferred_target(&mut kvi) .map_err(Error::VcpuArmPreferredTarget)?; // We already checked that the capability is supported. kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2; kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3; // Non-boot cpus are powered off initially. if self.id > 0 { kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF; } self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit) } /// Runs the VCPU until it exits, returning the reason. /// /// Note that the state of the VCPU and associated VM must be setup first for this to do /// anything useful. 
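    ///
    /// Illustrative call order (a sketch, not enforced by the types): `Vcpu::new()`,
    /// then `configure()`, then `run()` in a loop that dispatches on the returned
    /// `VmExit` variant, as the vCPU thread in `CpuManager::start_vcpu()` does.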
    pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
        self.vcpu.run()
    }
}

const VCPU_SNAPSHOT_ID: &str = "vcpu";
impl Pausable for Vcpu {}
impl Snapshottable for Vcpu {
    fn id(&self) -> String {
        VCPU_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let saved_state = self
            .vcpu
            .state()
            .map_err(|e| MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e)))?;

        let mut vcpu_snapshot = Snapshot::new(&format!("{:03}", self.id));
        vcpu_snapshot.add_data_section(SnapshotDataSection::new_from_state(
            VCPU_SNAPSHOT_ID,
            &saved_state,
        )?);

        self.saved_state = Some(saved_state);

        Ok(vcpu_snapshot)
    }

    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        let saved_state: CpuState = snapshot.to_state(VCPU_SNAPSHOT_ID)?;
        self.vcpu
            .set_state(&saved_state)
            .map_err(|e| MigratableError::Pause(anyhow!("Could not set the vCPU state {:?}", e)))?;

        self.saved_state = Some(saved_state);

        Ok(())
    }
}

pub struct CpuManager {
    hypervisor_type: HypervisorType,
    config: CpusConfig,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    vm_memory: GuestMemoryAtomic<GuestMemoryMmap>,
    #[cfg(target_arch = "x86_64")]
    cpuid: Vec<CpuIdEntry>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    vm: Arc<dyn hypervisor::Vm>,
    vcpus_kill_signalled: Arc<AtomicBool>,
    vcpus_pause_signalled: Arc<AtomicBool>,
    exit_evt: EventFd,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    reset_evt: EventFd,
    #[cfg(feature = "gdb")]
    vm_debug_evt: EventFd,
    vcpu_states: Vec<VcpuState>,
    selected_cpu: u8,
    vcpus: Vec<Arc<Mutex<Vcpu>>>,
    seccomp_action: SeccompAction,
    vm_ops: Arc<dyn VmOps>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    acpi_address: Option<GuestAddress>,
    proximity_domain_per_cpu: BTreeMap<u8, u32>,
    affinity: BTreeMap<u8, Vec<u8>>,
    dynamic: bool,
}

const CPU_ENABLE_FLAG: usize = 0;
const CPU_INSERTING_FLAG: usize = 1;
const CPU_REMOVING_FLAG: usize = 2;
const CPU_EJECT_FLAG: usize = 3;

const CPU_STATUS_OFFSET: u64 = 4;
const CPU_SELECTION_OFFSET: u64 = 0;

impl BusDevice for CpuManager {
    fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
        // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
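        // Clear the whole buffer up front so that any flag bits not explicitly
        // set below read back as zero.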
data.fill(0); match offset { CPU_SELECTION_OFFSET => { data[0] = self.selected_cpu; } CPU_STATUS_OFFSET => { if self.selected_cpu < self.max_vcpus() { let state = &self.vcpu_states[usize::from(self.selected_cpu)]; if state.active() { data[0] |= 1 << CPU_ENABLE_FLAG; } if state.inserting { data[0] |= 1 << CPU_INSERTING_FLAG; } if state.removing { data[0] |= 1 << CPU_REMOVING_FLAG; } } else { warn!("Out of range vCPU id: {}", self.selected_cpu); } } _ => { warn!( "Unexpected offset for accessing CPU manager device: {:#}", offset ); } } } fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { match offset { CPU_SELECTION_OFFSET => { self.selected_cpu = data[0]; } CPU_STATUS_OFFSET => { if self.selected_cpu < self.max_vcpus() { let state = &mut self.vcpu_states[usize::from(self.selected_cpu)]; // The ACPI code writes back a 1 to acknowledge the insertion if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) && state.inserting { state.inserting = false; } // Ditto for removal if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) && state.removing { state.removing = false; } // Trigger removal of vCPU if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG { if let Err(e) = self.remove_vcpu(self.selected_cpu) { error!("Error removing vCPU: {:?}", e); } } } else { warn!("Out of range vCPU id: {}", self.selected_cpu); } } _ => { warn!( "Unexpected offset for accessing CPU manager device: {:#}", offset ); } } None } } #[derive(Default)] struct VcpuState { inserting: bool, removing: bool, handle: Option>, kill: Arc, vcpu_run_interrupted: Arc, } impl VcpuState { fn active(&self) -> bool { self.handle.is_some() } fn signal_thread(&self) { if let Some(handle) = self.handle.as_ref() { loop { unsafe { libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN()); } if self.vcpu_run_interrupted.load(Ordering::SeqCst) { break; } else { // This is more effective than thread::yield_now() at // avoiding a priority inversion with the vCPU thread thread::sleep(std::time::Duration::from_millis(1)); } } } } fn join_thread(&mut self) -> Result<()> { if let Some(handle) = self.handle.take() { handle.join().map_err(Error::ThreadCleanup)? } Ok(()) } fn unpark_thread(&self) { if let Some(handle) = self.handle.as_ref() { handle.thread().unpark() } } } impl CpuManager { #[allow(unused_variables)] #[allow(clippy::too_many_arguments)] pub fn new( config: &CpusConfig, device_manager: &Arc>, memory_manager: &Arc>, vm: Arc, exit_evt: EventFd, reset_evt: EventFd, #[cfg(feature = "gdb")] vm_debug_evt: EventFd, hypervisor: Arc, seccomp_action: SeccompAction, vm_ops: Arc, #[cfg(feature = "tdx")] tdx_enabled: bool, numa_nodes: &NumaNodes, ) -> Result>> { let guest_memory = memory_manager.lock().unwrap().guest_memory(); let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus)); vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default); let hypervisor_type = hypervisor.hypervisor_type(); #[cfg(target_arch = "x86_64")] let sgx_epc_sections = memory_manager .lock() .unwrap() .sgx_epc_region() .as_ref() .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect()); #[cfg(target_arch = "x86_64")] let cpuid = { let phys_bits = physical_bits(config.max_phys_bits); arch::generate_common_cpuid( hypervisor, config .topology .clone() .map(|t| (t.threads_per_core, t.cores_per_die, t.dies_per_package)), sgx_epc_sections, phys_bits, config.kvm_hyperv, #[cfg(feature = "tdx")] tdx_enabled, ) .map_err(Error::CommonCpuId)? 
}; #[cfg(all(feature = "amx", target_arch = "x86_64"))] if config.features.amx { const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024; const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025; const XFEATURE_XTILEDATA: usize = 18; const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA; // This is safe as the syscall is only modifing kernel internal // data structures that the kernel is itself expected to safeguard. let amx_tile = unsafe { libc::syscall( libc::SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, XFEATURE_XTILEDATA, ) }; if amx_tile != 0 { return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); } else { // This is safe as the mask being modified (not marked mutable as it is // modified in unsafe only which is permitted) isn't in use elsewhere. let mask: usize = 0; let result = unsafe { libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask) }; if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK { return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); } } } let device_manager = device_manager.lock().unwrap(); let proximity_domain_per_cpu: BTreeMap = { let mut cpu_list = Vec::new(); for (proximity_domain, numa_node) in numa_nodes.iter() { for cpu in numa_node.cpus.iter() { cpu_list.push((*cpu, *proximity_domain)) } } cpu_list } .into_iter() .collect(); let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() { cpu_affinity .iter() .map(|a| (a.vcpu, a.host_cpus.clone())) .collect() } else { BTreeMap::new() }; #[cfg(feature = "tdx")] let dynamic = !tdx_enabled; #[cfg(not(feature = "tdx"))] let dynamic = true; let acpi_address = if dynamic { Some( device_manager .allocator() .lock() .unwrap() .allocate_platform_mmio_addresses(None, CPU_MANAGER_ACPI_SIZE as u64, None) .ok_or(Error::AllocateMmmioAddress)?, ) } else { None }; let cpu_manager = Arc::new(Mutex::new(CpuManager { hypervisor_type, config: config.clone(), interrupt_controller: device_manager.interrupt_controller().clone(), vm_memory: guest_memory, #[cfg(target_arch = "x86_64")] cpuid, vm, vcpus_kill_signalled: Arc::new(AtomicBool::new(false)), vcpus_pause_signalled: Arc::new(AtomicBool::new(false)), vcpu_states, exit_evt, reset_evt, #[cfg(feature = "gdb")] vm_debug_evt, selected_cpu: 0, vcpus: Vec::with_capacity(usize::from(config.max_vcpus)), seccomp_action, vm_ops, acpi_address, proximity_domain_per_cpu, affinity, dynamic, })); if let Some(acpi_address) = acpi_address { device_manager .mmio_bus() .insert( cpu_manager.clone(), acpi_address.0, CPU_MANAGER_ACPI_SIZE as u64, ) .map_err(Error::BusError)?; } Ok(cpu_manager) } fn create_vcpu( &mut self, cpu_id: u8, entry_point: Option, snapshot: Option, ) -> Result<()> { info!("Creating vCPU: cpu_id = {}", cpu_id); let mut vcpu = Vcpu::new(cpu_id, &self.vm, Some(self.vm_ops.clone()))?; if let Some(snapshot) = snapshot { // AArch64 vCPUs should be initialized after created. #[cfg(target_arch = "aarch64")] vcpu.init(&self.vm)?; vcpu.restore(snapshot).expect("Failed to restore vCPU"); } else { #[cfg(target_arch = "x86_64")] vcpu.configure( entry_point, &self.vm_memory, self.cpuid.clone(), self.config.kvm_hyperv, ) .expect("Failed to configure vCPU"); #[cfg(target_arch = "aarch64")] vcpu.configure(&self.vm, entry_point) .expect("Failed to configure vCPU"); } // Adding vCPU to the CpuManager's vCPU list. 
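        // Wrapping the vCPU in Arc<Mutex<..>> lets this same instance be handed
        // to the thread spawned later by start_vcpu().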
let vcpu = Arc::new(Mutex::new(vcpu)); self.vcpus.push(vcpu); Ok(()) } /// Only create new vCPUs if there aren't any inactive ones to reuse fn create_vcpus(&mut self, desired_vcpus: u8, entry_point: Option) -> Result<()> { info!( "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}", desired_vcpus, self.config.max_vcpus, self.vcpus.len(), self.present_vcpus() ); if desired_vcpus > self.config.max_vcpus { return Err(Error::DesiredVCpuCountExceedsMax); } // Only create vCPUs in excess of all the allocated vCPUs. for cpu_id in self.vcpus.len() as u8..desired_vcpus { self.create_vcpu(cpu_id, entry_point, None)?; } Ok(()) } #[cfg(target_arch = "aarch64")] pub fn init_pmu(&self, irq: u32) -> Result { for cpu in self.vcpus.iter() { let cpu = cpu.lock().unwrap(); // Check if PMU attr is available, if not, log the information. if cpu.vcpu.has_pmu_support() { cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?; } else { debug!( "PMU attribute is not supported in vCPU{}, skip PMU init!", cpu.id ); return Ok(false); } } Ok(true) } fn start_vcpu( &mut self, vcpu: Arc>, vcpu_id: u8, vcpu_thread_barrier: Arc, inserting: bool, ) -> Result<()> { let reset_evt = self.reset_evt.try_clone().unwrap(); let exit_evt = self.exit_evt.try_clone().unwrap(); #[cfg(feature = "gdb")] let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap(); let panic_exit_evt = self.exit_evt.try_clone().unwrap(); let vcpu_kill_signalled = self.vcpus_kill_signalled.clone(); let vcpu_pause_signalled = self.vcpus_pause_signalled.clone(); let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone(); let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)] .vcpu_run_interrupted .clone(); let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone(); // Prepare the CPU set the current vCPU is expected to run onto. let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| { let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() }; unsafe { libc::CPU_ZERO(&mut cpuset) }; for host_cpu in host_cpus { unsafe { libc::CPU_SET(*host_cpu as usize, &mut cpuset) }; } cpuset }); // Retrieve seccomp filter for vcpu thread let vcpu_seccomp_filter = get_seccomp_filter(&self.seccomp_action, Thread::Vcpu, self.hypervisor_type) .map_err(Error::CreateSeccompFilter)?; #[cfg(target_arch = "x86_64")] let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned(); info!("Starting vCPU: cpu_id = {}", vcpu_id); let handle = Some( thread::Builder::new() .name(format!("vcpu{}", vcpu_id)) .spawn(move || { // Schedule the thread to run on the expected CPU set if let Some(cpuset) = cpuset.as_ref() { let ret = unsafe { libc::sched_setaffinity( 0, std::mem::size_of::(), cpuset as *const libc::cpu_set_t, ) }; if ret != 0 { error!( "Failed scheduling the vCPU {} on the expected CPU set: {}", vcpu_id, io::Error::last_os_error() ); return; } } // Apply seccomp filter for vcpu thread. if !vcpu_seccomp_filter.is_empty() { if let Err(e) = apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter) { error!("Error applying seccomp filter: {:?}", e); return; } } extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} // This uses an async signal safe handler to kill the vcpu handles. register_signal_handler(SIGRTMIN(), handle_signal) .expect("Failed to register vcpu signal handler"); // Block until all CPUs are ready. vcpu_thread_barrier.wait(); std::panic::catch_unwind(move || { loop { // If we are being told to pause, we park the thread // until the pause boolean is toggled. 
// The resume operation is responsible for toggling // the boolean and unpark the thread. // We enter a loop because park() could spuriously // return. We will then park() again unless the // pause boolean has been toggled. // Need to use Ordering::SeqCst as we have multiple // loads and stores to different atomics and we need // to see them in a consistent order in all threads if vcpu_pause_signalled.load(Ordering::SeqCst) { // As a pause can be caused by PIO & MMIO exits then we need to ensure they are // completed by returning to KVM_RUN. From the kernel docs: // // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding // operations are complete (and guest state is consistent) only after userspace // has re-entered the kernel with KVM_RUN. The kernel side will first finish // incomplete operations and then check for pending signals. // The pending state of the operation is not preserved in state which is // visible to userspace, thus userspace should ensure that the operation is // completed before performing a live migration. Userspace can re-enter the // guest with an unmasked signal pending or with the immediate_exit field set // to complete pending operations without allowing any further instructions // to be executed. #[cfg(feature = "kvm")] { vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true); if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) { error!("Unexpected VM exit on \"immediate_exit\" run"); break; } vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false); } vcpu_run_interrupted.store(true, Ordering::SeqCst); while vcpu_pause_signalled.load(Ordering::SeqCst) { thread::park(); } vcpu_run_interrupted.store(false, Ordering::SeqCst); } // We've been told to terminate if vcpu_kill_signalled.load(Ordering::SeqCst) || vcpu_kill.load(Ordering::SeqCst) { vcpu_run_interrupted.store(true, Ordering::SeqCst); break; } #[cfg(feature = "tdx")] let mut vcpu = vcpu.lock().unwrap(); #[cfg(not(feature = "tdx"))] let vcpu = vcpu.lock().unwrap(); // vcpu.run() returns false on a triple-fault so trigger a reset match vcpu.run() { Ok(run) => match run { #[cfg(feature = "kvm")] VmExit::Debug => { info!("VmExit::Debug"); #[cfg(feature = "gdb")] { vcpu_pause_signalled.store(true, Ordering::SeqCst); let raw_tid = get_raw_tid(vcpu_id as usize); vm_debug_evt.write(raw_tid as u64).unwrap(); } } #[cfg(target_arch = "x86_64")] VmExit::IoapicEoi(vector) => { if let Some(interrupt_controller) = &interrupt_controller_clone { interrupt_controller .lock() .unwrap() .end_of_interrupt(vector); } } VmExit::Ignore => {} VmExit::Hyperv => {} VmExit::Reset => { info!("VmExit::Reset"); vcpu_run_interrupted.store(true, Ordering::SeqCst); reset_evt.write(1).unwrap(); break; } VmExit::Shutdown => { info!("VmExit::Shutdown"); vcpu_run_interrupted.store(true, Ordering::SeqCst); exit_evt.write(1).unwrap(); break; } #[cfg(feature = "tdx")] VmExit::Tdx => { if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) { match vcpu.get_tdx_exit_details() { Ok(details) => match details { TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"), TdxExitDetails::SetupEventNotifyInterrupt => { warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported") } }, Err(e) => error!("Unexpected TDX VMCALL: {}", e), } vcpu.set_tdx_status(TdxExitStatus::InvalidOperand); } else { // We should never reach this code as // this means the design from the code // is wrong. 
unreachable!("Couldn't get a mutable reference from Arc as there are multiple instances"); } } _ => { error!( "VCPU generated error: {:?}", Error::UnexpectedVmExit ); break; } }, Err(e) => { error!("VCPU generated error: {:?}", Error::VcpuRun(e.into())); break; } } // We've been told to terminate if vcpu_kill_signalled.load(Ordering::SeqCst) || vcpu_kill.load(Ordering::SeqCst) { vcpu_run_interrupted.store(true, Ordering::SeqCst); break; } } }) .or_else(|_| { panic_vcpu_run_interrupted.store(true, Ordering::SeqCst); error!("vCPU thread panicked"); panic_exit_evt.write(1) }) .ok(); }) .map_err(Error::VcpuSpawn)?, ); // On hot plug calls into this function entry_point is None. It is for // those hotplug CPU additions that we need to set the inserting flag. self.vcpu_states[usize::from(vcpu_id)].handle = handle; self.vcpu_states[usize::from(vcpu_id)].inserting = inserting; Ok(()) } /// Start up as many vCPUs threads as needed to reach `desired_vcpus` fn activate_vcpus( &mut self, desired_vcpus: u8, inserting: bool, paused: Option, ) -> Result<()> { if desired_vcpus > self.config.max_vcpus { return Err(Error::DesiredVCpuCountExceedsMax); } let vcpu_thread_barrier = Arc::new(Barrier::new( (desired_vcpus - self.present_vcpus() + 1) as usize, )); if let Some(paused) = paused { self.vcpus_pause_signalled.store(paused, Ordering::SeqCst); } info!( "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}", desired_vcpus, self.vcpus.len(), self.present_vcpus(), self.vcpus_pause_signalled.load(Ordering::SeqCst) ); // This reuses any inactive vCPUs as well as any that were newly created for vcpu_id in self.present_vcpus()..desired_vcpus { let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]); self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?; } // Unblock all CPU threads. vcpu_thread_barrier.wait(); Ok(()) } fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) { // Mark vCPUs for removal, actual removal happens on ejection for cpu_id in desired_vcpus..self.present_vcpus() { self.vcpu_states[usize::from(cpu_id)].removing = true; } } fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> { info!("Removing vCPU: cpu_id = {}", cpu_id); let mut state = &mut self.vcpu_states[usize::from(cpu_id)]; state.kill.store(true, Ordering::SeqCst); state.signal_thread(); state.join_thread()?; state.handle = None; // Once the thread has exited, clear the "kill" so that it can reused state.kill.store(false, Ordering::SeqCst); Ok(()) } pub fn create_boot_vcpus(&mut self, entry_point: Option) -> Result<()> { self.create_vcpus(self.boot_vcpus(), entry_point) } // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running. 
pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> { self.activate_vcpus(self.boot_vcpus(), false, Some(paused)) } pub fn start_restored_vcpus(&mut self) -> Result<()> { self.activate_vcpus(self.vcpus.len() as u8, false, Some(true)) .map_err(|e| { Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e)) })?; Ok(()) } pub fn resize(&mut self, desired_vcpus: u8) -> Result { if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal { return Ok(false); } if !self.dynamic { return Ok(false); } match desired_vcpus.cmp(&self.present_vcpus()) { cmp::Ordering::Greater => { self.create_vcpus(desired_vcpus, None)?; self.activate_vcpus(desired_vcpus, true, None)?; Ok(true) } cmp::Ordering::Less => { self.mark_vcpus_for_removal(desired_vcpus); Ok(true) } _ => Ok(false), } } pub fn shutdown(&mut self) -> Result<()> { // Tell the vCPUs to stop themselves next time they go through the loop self.vcpus_kill_signalled.store(true, Ordering::SeqCst); // Toggle the vCPUs pause boolean self.vcpus_pause_signalled.store(false, Ordering::SeqCst); // Unpark all the VCPU threads. for state in self.vcpu_states.iter() { state.unpark_thread(); } // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set // above. for state in self.vcpu_states.iter() { state.signal_thread(); } // Wait for all the threads to finish. This removes the state from the vector. for mut state in self.vcpu_states.drain(..) { state.join_thread()?; } Ok(()) } #[cfg(feature = "tdx")] pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> { for vcpu in &self.vcpus { vcpu.lock() .unwrap() .vcpu .tdx_init(hob_address) .map_err(Error::InitializeTdx)?; } Ok(()) } pub fn boot_vcpus(&self) -> u8 { self.config.boot_vcpus } pub fn max_vcpus(&self) -> u8 { self.config.max_vcpus } #[cfg(target_arch = "x86_64")] pub fn common_cpuid(&self) -> Vec { self.cpuid.clone() } fn present_vcpus(&self) -> u8 { self.vcpu_states .iter() .fold(0, |acc, state| acc + state.active() as u8) } #[cfg(target_arch = "aarch64")] pub fn get_mpidrs(&self) -> Vec { self.vcpus .iter() .map(|cpu| cpu.lock().unwrap().get_mpidr()) .collect() } #[cfg(target_arch = "aarch64")] pub fn get_saved_states(&self) -> Vec { self.vcpus .iter() .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap()) .collect() } #[cfg(target_arch = "aarch64")] pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> { self.config .topology .clone() .map(|t| (t.threads_per_core, t.cores_per_die, t.packages)) } pub fn create_madt(&self) -> Sdt { use crate::acpi; // This is also checked in the commandline parsing. 
assert!(self.config.boot_vcpus <= self.config.max_vcpus); let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT ", 1); #[cfg(target_arch = "x86_64")] { madt.write(36, arch::layout::APIC_START); for cpu in 0..self.config.max_vcpus { let lapic = LocalApic { r#type: acpi::ACPI_APIC_PROCESSOR, length: 8, processor_id: cpu, apic_id: cpu, flags: if cpu < self.config.boot_vcpus { 1 << MADT_CPU_ENABLE_FLAG } else { 0 } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG, }; madt.append(lapic); } madt.append(Ioapic { r#type: acpi::ACPI_APIC_IO, length: 12, ioapic_id: 0, apic_address: arch::layout::IOAPIC_START.0 as u32, gsi_base: 0, ..Default::default() }); madt.append(InterruptSourceOverride { r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE, length: 10, bus: 0, source: 4, gsi: 4, flags: 0, }); } #[cfg(target_arch = "aarch64")] { /* Notes: * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table. */ // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec. for cpu in 0..self.config.boot_vcpus { let vcpu = &self.vcpus[cpu as usize]; let mpidr = vcpu.lock().unwrap().get_mpidr(); /* ARMv8 MPIDR format: Bits [63:40] Must be zero Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR Bits [31:24] Must be zero Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR */ let mpidr_mask = 0xff_00ff_ffff; let gicc = GicC { r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE, length: 80, reserved0: 0, cpu_interface_number: cpu as u32, uid: cpu as u32, flags: 1, parking_version: 0, performance_interrupt: 0, parked_address: 0, base_address: 0, gicv_base_address: 0, gich_base_address: 0, vgic_interrupt: 0, gicr_base_address: 0, mpidr: mpidr & mpidr_mask, proc_power_effi_class: 0, reserved1: 0, spe_overflow_interrupt: 0, }; madt.append(gicc); } // GIC Distributor structure. See section 5.2.12.15 in ACPI spec. let gicd = GicD { r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR, length: 24, reserved0: 0, gic_id: 0, base_address: arch::layout::GIC_V3_DIST_START.0, global_irq_base: 0, version: 3, reserved1: [0; 3], }; madt.append(gicd); // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec. let gicr_size: u32 = (arch::layout::GIC_V3_REDIST_SIZE * self.config.boot_vcpus as u64) as u32; let gicr_base: u64 = arch::layout::GIC_V3_DIST_START.0 - gicr_size as u64; let gicr = GicR { r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR, length: 16, reserved: 0, base_address: gicr_base, range_length: gicr_size, }; madt.append(gicr); // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec. let gicits = GicIts { r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR, length: 20, reserved0: 0, translation_id: 0, base_address: gicr_base - arch::layout::GIC_V3_ITS_SIZE, reserved1: 0, }; madt.append(gicits); madt.update_checksum(); } madt } #[cfg(target_arch = "aarch64")] pub fn create_pptt(&self) -> Sdt { let pptt_start = 0; let mut cpus = 0; let mut uid = 0; // If topology is not specified, the default setting is: // 1 package, multiple cores, 1 thread per core // This is also the behavior when PPTT is missing. 
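        // The PPTT is emitted as a tree: one hierarchy node per package
        // (cluster), one per core when SMT is configured, and a leaf node per
        // thread; `uid` becomes the ACPI processor UID, handed out in creation
        // order.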
let (threads_per_core, cores_per_package, packages) = self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1)); let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT ", 1); for cluster_idx in 0..packages { if cpus < self.config.boot_vcpus as usize { let cluster_offset = pptt.len() - pptt_start; let cluster_hierarchy_node = ProcessorHierarchyNode { r#type: 0, length: 20, reserved: 0, flags: 0x2, parent: 0, acpi_processor_id: cluster_idx as u32, num_private_resources: 0, }; pptt.append(cluster_hierarchy_node); for core_idx in 0..cores_per_package { let core_offset = pptt.len() - pptt_start; if threads_per_core > 1 { let core_hierarchy_node = ProcessorHierarchyNode { r#type: 0, length: 20, reserved: 0, flags: 0x2, parent: cluster_offset as u32, acpi_processor_id: core_idx as u32, num_private_resources: 0, }; pptt.append(core_hierarchy_node); for _thread_idx in 0..threads_per_core { let thread_hierarchy_node = ProcessorHierarchyNode { r#type: 0, length: 20, reserved: 0, flags: 0xE, parent: core_offset as u32, acpi_processor_id: uid as u32, num_private_resources: 0, }; pptt.append(thread_hierarchy_node); uid += 1; } } else { let thread_hierarchy_node = ProcessorHierarchyNode { r#type: 0, length: 20, reserved: 0, flags: 0xA, parent: cluster_offset as u32, acpi_processor_id: uid as u32, num_private_resources: 0, }; pptt.append(thread_hierarchy_node); uid += 1; } } cpus += (cores_per_package * threads_per_core) as usize; } } pptt.update_checksum(); pptt } #[cfg(feature = "gdb")] fn get_regs(&self, cpu_id: u8) -> Result { self.vcpus[usize::from(cpu_id)] .lock() .unwrap() .vcpu .get_regs() .map_err(Error::CpuDebug) } #[cfg(feature = "gdb")] fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> { self.vcpus[usize::from(cpu_id)] .lock() .unwrap() .vcpu .set_regs(regs) .map_err(Error::CpuDebug) } #[cfg(all(target_arch = "x86_64", feature = "gdb"))] fn get_sregs(&self, cpu_id: u8) -> Result { self.vcpus[usize::from(cpu_id)] .lock() .unwrap() .vcpu .get_sregs() .map_err(Error::CpuDebug) } #[cfg(all(target_arch = "x86_64", feature = "gdb"))] fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> { self.vcpus[usize::from(cpu_id)] .lock() .unwrap() .vcpu .set_sregs(sregs) .map_err(Error::CpuDebug) } #[cfg(all(target_arch = "x86_64", feature = "gdb"))] fn translate_gva(&self, cpu_id: u8, gva: u64) -> Result { let (gpa, _) = self.vcpus[usize::from(cpu_id)] .lock() .unwrap() .vcpu .translate_gva(gva, /* flags: unused */ 0) .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; Ok(gpa) } /// /// On AArch64, `translate_gva` API is not provided by KVM. We implemented /// it in VMM by walking through translation tables. /// /// Address translation is big topic, here we only focus the scenario that /// happens in VMM while debugging kernel. 
    /// This `translate_gva` implementation is restricted to:
    /// - Exception Level 1
    /// - Translate high address range only (kernel space)
    ///
    /// This implementation supports the following Armv8-A features related to
    /// address translation:
    /// - FEAT_LPA
    /// - FEAT_LVA
    /// - FEAT_LPA2
    ///
    #[cfg(all(target_arch = "aarch64", feature = "gdb"))]
    fn translate_gva(&self, cpu_id: u8, gva: u64) -> Result<u64> {
        let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::TCR_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::TTBR1_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
        let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sys_reg(regs::ID_AA64MMFR0_EL1)
            .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;

        // Bit 55 of the VA determines the range, high (0xFFFxxx...)
        // or low (0x000xxx...).
        let high_range = extract_bits_64!(gva, 55, 1);
        if high_range == 0 {
            info!("VA (0x{:x}) range is not supported!", gva);
            return Ok(gva);
        }

        // High range size offset
        let tsz = extract_bits_64!(tcr_el1, 16, 6);
        // Granule size
        let tg = extract_bits_64!(tcr_el1, 30, 2);
        // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2
        let ds = extract_bits_64!(tcr_el1, 59, 1);

        if tsz == 0 {
            info!("VA translation is not ready!");
            return Ok(gva);
        }

        // VA size is determined by TCR_EL1.T1SZ
        let va_size = 64 - tsz;
        // Number of bits in VA consumed in each level of translation
        let stride = match tg {
            3 => 13, // 64KB granule size
            1 => 11, // 16KB granule size
            _ => 9,  // 4KB, default
        };
        // Starting level of walking
        let mut level = 4 - (va_size - 4) / stride;

        // PA or IPA size is determined from TCR_EL1.IPS and ID_AA64MMFR0_EL1.PARange
        let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
        #[allow(clippy::identity_op)]
        let pa_range = extract_bits_64!(id_aa64mmfr0_el1, 0, 4);
        // The IPA size in TCR_EL1 and PA Range in ID_AA64MMFR0_EL1 should match.
        // To be safe, we use the minimum value if they are different.
        let pa_range = std::cmp::min(tcr_ips, pa_range);
        // PA size in bits
        let pa_size = match pa_range {
            0 => 32,
            1 => 36,
            2 => 40,
            3 => 42,
            4 => 44,
            5 => 48,
            6 => 52,
            _ => {
                return Err(Error::TranslateVirtualAddress(anyhow!(format!(
                    "PA range not supported {}",
                    pa_range
                ))))
            }
        };

        let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
        let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
        // If FEAT_LPA2 is present, the translation table descriptor holds
        // 50 bits of the table address of next level.
        // Otherwise, it is 48 bits.
        let descaddrmask = if ds == 1 {
            !0u64 >> (64 - 50) // mask with 50 least significant bits
        } else {
            !0u64 >> (64 - 48) // mask with 48 least significant bits
        };
        let descaddrmask = descaddrmask & !indexmask_grainsize;

        // Translation table base address
        #[allow(clippy::identity_op)]
        let mut descaddr: u64 = extract_bits_64!(ttbr1_el1, 0, 48);
        // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
        // address bits [48:51] come from TTBR1_EL1 bits [2:5].
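        // Illustrative arithmetic (example values, not taken from the guest
        // state above): with a 4 KiB granule (stride = 9) and T1SZ = 16,
        // va_size = 64 - 16 = 48 and the walk below starts at
        // level = 4 - (48 - 4) / 9 = 0, i.e. a full four-level lookup; the
        // 52-bit fix-ups below only apply when pa_size == 52 (FEAT_LPA/FEAT_LPA2).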
if pa_size == 52 { descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48; } // Loop through tables of each level loop { // Table offset for current level let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask; descaddr |= table_offset; descaddr &= !7u64; let mut buf = [0; 8]; self.vm_memory .memory() .read(&mut buf, GuestAddress(descaddr)) .map_err(|e| Error::TranslateVirtualAddress(e.into()))?; let descriptor = u64::from_le_bytes(buf); descaddr = descriptor & descaddrmask; // In the case of FEAT_LPA, the next-level translation table address // bits [48:51] comes from bits [12:15] of the current descriptor. // For FEAT_LPA2, the next-level translation table address // bits [50:51] comes from bits [8:9] of the current descriptor, // bits [48:49] comes from bits [48:49] of the descriptor which was // handled previously. if pa_size == 52 { if ds == 1 { // FEAT_LPA2 descaddr |= extract_bits_64!(descriptor, 8, 2) << 50; } else { // FEAT_LPA descaddr |= extract_bits_64!(descriptor, 12, 4) << 48; } } if (descriptor & 2) != 0 && (level < 3) { // This is a table entry. Go down to next level. level += 1; indexmask = indexmask_grainsize; continue; } break; } // We have reached either: // - a page entry at level 3 or // - a block entry at level 1 or 2 let page_size = 1u64 << ((stride * (4 - level)) + 3); descaddr &= !(page_size - 1); descaddr |= gva & (page_size - 1); Ok(descaddr) } } struct Cpu { cpu_id: u8, proximity_domain: u32, dynamic: bool, } #[cfg(target_arch = "x86_64")] const MADT_CPU_ENABLE_FLAG: usize = 0; #[cfg(target_arch = "x86_64")] const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1; impl Cpu { #[cfg(target_arch = "x86_64")] fn generate_mat(&self) -> Vec { let lapic = LocalApic { r#type: 0, length: 8, processor_id: self.cpu_id, apic_id: self.cpu_id, flags: 1 << MADT_CPU_ENABLE_FLAG, }; let mut mat_data: Vec = Vec::new(); mat_data.resize(std::mem::size_of_val(&lapic), 0); unsafe { *(mat_data.as_mut_ptr() as *mut LocalApic) = lapic }; mat_data } } impl Aml for Cpu { fn append_aml_bytes(&self, bytes: &mut Vec) { #[cfg(target_arch = "x86_64")] let mat_data: Vec = self.generate_mat(); #[allow(clippy::if_same_then_else)] if self.dynamic { aml::Device::new( format!("C{:03}", self.cpu_id).as_str().into(), vec![ &aml::Name::new("_HID".into(), &"ACPI0007"), &aml::Name::new("_UID".into(), &self.cpu_id), // Currently, AArch64 cannot support following fields. /* _STA return value: Bit [0] – Set if the device is present. Bit [1] – Set if the device is enabled and decoding its resources. Bit [2] – Set if the device should be shown in the UI. Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics). Bit [4] – Set if the battery is present. Bits [31:5] – Reserved (must be cleared). 
*/ #[cfg(target_arch = "x86_64")] &aml::Method::new( "_STA".into(), 0, false, // Call into CSTA method which will interrogate device vec![&aml::Return::new(&aml::MethodCall::new( "CSTA".into(), vec![&self.cpu_id], ))], ), &aml::Method::new( "_PXM".into(), 0, false, vec![&aml::Return::new(&self.proximity_domain)], ), // The Linux kernel expects every CPU device to have a _MAT entry // containing the LAPIC for this processor with the enabled bit set // even it if is disabled in the MADT (non-boot CPU) #[cfg(target_arch = "x86_64")] &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)), // Trigger CPU ejection #[cfg(target_arch = "x86_64")] &aml::Method::new( "_EJ0".into(), 1, false, // Call into CEJ0 method which will actually eject device vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])], ), ], ) .append_aml_bytes(bytes); } else { aml::Device::new( format!("C{:03}", self.cpu_id).as_str().into(), vec![ &aml::Name::new("_HID".into(), &"ACPI0007"), &aml::Name::new("_UID".into(), &self.cpu_id), #[cfg(target_arch = "x86_64")] &aml::Method::new( "_STA".into(), 0, false, // Mark CPU present see CSTA implementation vec![&aml::Return::new(&0xfu8)], ), &aml::Method::new( "_PXM".into(), 0, false, vec![&aml::Return::new(&self.proximity_domain)], ), // The Linux kernel expects every CPU device to have a _MAT entry // containing the LAPIC for this processor with the enabled bit set // even it if is disabled in the MADT (non-boot CPU) #[cfg(target_arch = "x86_64")] &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)), ], ) .append_aml_bytes(bytes); } } } struct CpuNotify { cpu_id: u8, } impl Aml for CpuNotify { fn append_aml_bytes(&self, bytes: &mut Vec) { let object = aml::Path::new(&format!("C{:03}", self.cpu_id)); aml::If::new( &aml::Equal::new(&aml::Arg(0), &self.cpu_id), vec![&aml::Notify::new(&object, &aml::Arg(1))], ) .append_aml_bytes(bytes) } } struct CpuMethods { max_vcpus: u8, dynamic: bool, } impl Aml for CpuMethods { fn append_aml_bytes(&self, bytes: &mut Vec) { if self.dynamic { // CPU status method aml::Method::new( "CSTA".into(), 1, true, vec![ // Take lock defined above &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), // Write CPU number (in first argument) to I/O port via field &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), &aml::Store::new(&aml::Local(0), &aml::ZERO), // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning) &aml::If::new( &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE), vec![&aml::Store::new(&aml::Local(0), &0xfu8)], ), // Release lock &aml::Release::new("\\_SB_.PRES.CPLK".into()), // Return 0 or 0xf &aml::Return::new(&aml::Local(0)), ], ) .append_aml_bytes(bytes); let mut cpu_notifies = Vec::new(); for cpu_id in 0..self.max_vcpus { cpu_notifies.push(CpuNotify { cpu_id }); } let mut cpu_notifies_refs: Vec<&dyn aml::Aml> = Vec::new(); for cpu_id in 0..self.max_vcpus { cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]); } aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).append_aml_bytes(bytes); aml::Method::new( "CEJ0".into(), 1, true, vec![ &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), // Write CPU number (in first argument) to I/O port via field &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)), // Set CEJ0 bit &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE), &aml::Release::new("\\_SB_.PRES.CPLK".into()), ], ) .append_aml_bytes(bytes); aml::Method::new( "CSCN".into(), 0, true, vec![ // Take lock 
defined above &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff), &aml::Store::new(&aml::Local(0), &aml::ZERO), &aml::While::new( &aml::LessThan::new(&aml::Local(0), &self.max_vcpus), vec![ // Write CPU number (in first argument) to I/O port via field &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)), // Check if CINS bit is set &aml::If::new( &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE), // Notify device if it is vec![ &aml::MethodCall::new( "CTFY".into(), vec![&aml::Local(0), &aml::ONE], ), // Reset CINS bit &aml::Store::new( &aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE, ), ], ), // Check if CRMV bit is set &aml::If::new( &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE), // Notify device if it is (with the eject constant 0x3) vec![ &aml::MethodCall::new( "CTFY".into(), vec![&aml::Local(0), &3u8], ), // Reset CRMV bit &aml::Store::new( &aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE, ), ], ), &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE), ], ), // Release lock &aml::Release::new("\\_SB_.PRES.CPLK".into()), ], ) .append_aml_bytes(bytes) } else { aml::Method::new("CSCN".into(), 0, true, vec![]).append_aml_bytes(bytes) } } } impl Aml for CpuManager { fn append_aml_bytes(&self, bytes: &mut Vec) { #[cfg(target_arch = "x86_64")] if let Some(acpi_address) = self.acpi_address { // CPU hotplug controller aml::Device::new( "_SB_.PRES".into(), vec![ &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")), &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"), // Mutex to protect concurrent access as we write to choose CPU and then read back status &aml::Mutex::new("CPLK".into(), 0), &aml::Name::new( "_CRS".into(), &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( aml::AddressSpaceCachable::NotCacheable, true, acpi_address.0 as u64, acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1, )]), ), // OpRegion and Fields map MMIO range into individual field values &aml::OpRegion::new( "PRST".into(), aml::OpRegionSpace::SystemMemory, acpi_address.0 as usize, CPU_MANAGER_ACPI_SIZE, ), &aml::Field::new( "PRST".into(), aml::FieldAccessType::Byte, aml::FieldUpdateRule::WriteAsZeroes, vec![ aml::FieldEntry::Reserved(32), aml::FieldEntry::Named(*b"CPEN", 1), aml::FieldEntry::Named(*b"CINS", 1), aml::FieldEntry::Named(*b"CRMV", 1), aml::FieldEntry::Named(*b"CEJ0", 1), aml::FieldEntry::Reserved(4), aml::FieldEntry::Named(*b"CCMD", 8), ], ), &aml::Field::new( "PRST".into(), aml::FieldAccessType::DWord, aml::FieldUpdateRule::Preserve, vec![ aml::FieldEntry::Named(*b"CSEL", 32), aml::FieldEntry::Reserved(32), aml::FieldEntry::Named(*b"CDAT", 32), ], ), ], ) .append_aml_bytes(bytes); } // CPU devices let hid = aml::Name::new("_HID".into(), &"ACPI0010"); let uid = aml::Name::new("_CID".into(), &aml::EisaName::new("PNP0A05")); // Bundle methods together under a common object let methods = CpuMethods { max_vcpus: self.config.max_vcpus, dynamic: self.dynamic, }; let mut cpu_data_inner: Vec<&dyn aml::Aml> = vec![&hid, &uid, &methods]; let mut cpu_devices = Vec::new(); for cpu_id in 0..self.config.max_vcpus { let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0); let cpu_device = Cpu { cpu_id, proximity_domain, dynamic: self.dynamic, }; cpu_devices.push(cpu_device); } for cpu_device in cpu_devices.iter() { cpu_data_inner.push(cpu_device); } aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).append_aml_bytes(bytes) } } impl Pausable for CpuManager { fn pause(&mut self) -> std::result::Result<(), 
MigratableError> { // Tell the vCPUs to pause themselves next time they exit self.vcpus_pause_signalled.store(true, Ordering::SeqCst); // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set // above. for state in self.vcpu_states.iter() { state.signal_thread(); } for vcpu in self.vcpus.iter() { let mut vcpu = vcpu.lock().unwrap(); vcpu.pause()?; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] if !self.config.kvm_hyperv { vcpu.vcpu.notify_guest_clock_paused().map_err(|e| { MigratableError::Pause(anyhow!( "Could not notify guest it has been paused {:?}", e )) })?; } } Ok(()) } fn resume(&mut self) -> std::result::Result<(), MigratableError> { for vcpu in self.vcpus.iter() { vcpu.lock().unwrap().resume()?; } // Toggle the vCPUs pause boolean self.vcpus_pause_signalled.store(false, Ordering::SeqCst); // Unpark all the VCPU threads. // Once unparked, the next thing they will do is checking for the pause // boolean. Since it'll be set to false, they will exit their pause loop // and go back to vmx root. for state in self.vcpu_states.iter() { state.unpark_thread(); } Ok(()) } } impl Snapshottable for CpuManager { fn id(&self) -> String { CPU_MANAGER_SNAPSHOT_ID.to_string() } fn snapshot(&mut self) -> std::result::Result { let mut cpu_manager_snapshot = Snapshot::new(CPU_MANAGER_SNAPSHOT_ID); // The CpuManager snapshot is a collection of all vCPUs snapshots. for vcpu in &self.vcpus { let cpu_snapshot = vcpu.lock().unwrap().snapshot()?; cpu_manager_snapshot.add_snapshot(cpu_snapshot); } Ok(cpu_manager_snapshot) } fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> { for (cpu_id, snapshot) in snapshot.snapshots.iter() { info!("Restoring VCPU {}", cpu_id); self.create_vcpu(cpu_id.parse::().unwrap(), None, Some(*snapshot.clone())) .map_err(|e| MigratableError::Restore(anyhow!("Could not create vCPU {:?}", e)))?; } Ok(()) } } impl Transportable for CpuManager {} impl Migratable for CpuManager {} #[cfg(feature = "gdb")] impl Debuggable for CpuManager { #[cfg(feature = "kvm")] fn set_guest_debug( &self, cpu_id: usize, addrs: &[GuestAddress], singlestep: bool, ) -> std::result::Result<(), DebuggableError> { self.vcpus[cpu_id] .lock() .unwrap() .vcpu .set_guest_debug(addrs, singlestep) .map_err(DebuggableError::SetDebug) } fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> { Ok(()) } fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> { Ok(()) } #[cfg(target_arch = "x86_64")] fn read_regs(&self, cpu_id: usize) -> std::result::Result { // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15 let gregs = self .get_regs(cpu_id as u8) .map_err(DebuggableError::ReadRegs)?; let regs = [ gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp, gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15, ]; // GDB exposes 32-bit eflags instead of 64-bit rflags. 
// https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml let eflags = gregs.rflags as u32; let rip = gregs.rip; // Segment registers: CS, SS, DS, ES, FS, GS let sregs = self .get_sregs(cpu_id as u8) .map_err(DebuggableError::ReadRegs)?; let segments = X86SegmentRegs { cs: sregs.cs.selector as u32, ss: sregs.ss.selector as u32, ds: sregs.ds.selector as u32, es: sregs.es.selector as u32, fs: sregs.fs.selector as u32, gs: sregs.gs.selector as u32, }; // TODO: Add other registers Ok(CoreRegs { regs, eflags, rip, segments, ..Default::default() }) } #[cfg(target_arch = "aarch64")] fn read_regs(&self, cpu_id: usize) -> std::result::Result { let gregs = self .get_regs(cpu_id as u8) .map_err(DebuggableError::ReadRegs)?; Ok(CoreRegs { x: gregs.regs.regs, sp: gregs.regs.sp, pc: gregs.regs.pc, ..Default::default() }) } #[cfg(target_arch = "x86_64")] fn write_regs( &self, cpu_id: usize, regs: &CoreRegs, ) -> std::result::Result<(), DebuggableError> { let orig_gregs = self .get_regs(cpu_id as u8) .map_err(DebuggableError::ReadRegs)?; let gregs = StandardRegisters { rax: regs.regs[0], rbx: regs.regs[1], rcx: regs.regs[2], rdx: regs.regs[3], rsi: regs.regs[4], rdi: regs.regs[5], rbp: regs.regs[6], rsp: regs.regs[7], r8: regs.regs[8], r9: regs.regs[9], r10: regs.regs[10], r11: regs.regs[11], r12: regs.regs[12], r13: regs.regs[13], r14: regs.regs[14], r15: regs.regs[15], rip: regs.rip, // Update the lower 32-bit of rflags. rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64), }; self.set_regs(cpu_id as u8, &gregs) .map_err(DebuggableError::WriteRegs)?; // Segment registers: CS, SS, DS, ES, FS, GS // Since GDB care only selectors, we call get_sregs() first. let mut sregs = self .get_sregs(cpu_id as u8) .map_err(DebuggableError::ReadRegs)?; sregs.cs.selector = regs.segments.cs as u16; sregs.ss.selector = regs.segments.ss as u16; sregs.ds.selector = regs.segments.ds as u16; sregs.es.selector = regs.segments.es as u16; sregs.fs.selector = regs.segments.fs as u16; sregs.gs.selector = regs.segments.gs as u16; self.set_sregs(cpu_id as u8, &sregs) .map_err(DebuggableError::WriteRegs)?; // TODO: Add other registers Ok(()) } #[cfg(target_arch = "aarch64")] fn write_regs( &self, cpu_id: usize, regs: &CoreRegs, ) -> std::result::Result<(), DebuggableError> { let mut gregs = self .get_regs(cpu_id as u8) .map_err(DebuggableError::ReadRegs)?; gregs.regs.regs = regs.x; gregs.regs.sp = regs.sp; gregs.regs.pc = regs.pc; self.set_regs(cpu_id as u8, &gregs) .map_err(DebuggableError::WriteRegs)?; Ok(()) } fn read_mem( &self, cpu_id: usize, vaddr: GuestAddress, len: usize, ) -> std::result::Result, DebuggableError> { let mut buf = vec![0; len]; let mut total_read = 0_u64; while total_read < len as u64 { let gaddr = vaddr.0 + total_read; let paddr = match self.translate_gva(cpu_id as u8, gaddr) { Ok(paddr) => paddr, Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. 
Err(e) => return Err(DebuggableError::TranslateGva(e)), }; let psize = arch::PAGE_SIZE as u64; let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1))); self.vm_memory .memory() .read( &mut buf[total_read as usize..total_read as usize + read_len as usize], GuestAddress(paddr), ) .map_err(DebuggableError::ReadMem)?; total_read += read_len; } Ok(buf) } fn write_mem( &self, cpu_id: usize, vaddr: &GuestAddress, data: &[u8], ) -> std::result::Result<(), DebuggableError> { let mut total_written = 0_u64; while total_written < data.len() as u64 { let gaddr = vaddr.0 + total_written; let paddr = match self.translate_gva(cpu_id as u8, gaddr) { Ok(paddr) => paddr, Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0. Err(e) => return Err(DebuggableError::TranslateGva(e)), }; let psize = arch::PAGE_SIZE as u64; let write_len = std::cmp::min( data.len() as u64 - total_written, psize - (paddr & (psize - 1)), ); self.vm_memory .memory() .write( &data[total_written as usize..total_written as usize + write_len as usize], GuestAddress(paddr), ) .map_err(DebuggableError::WriteMem)?; total_written += write_len; } Ok(()) } fn active_vcpus(&self) -> usize { self.present_vcpus() as usize } } #[cfg(feature = "guest_debug")] impl Elf64Writable for CpuManager {} #[cfg(feature = "guest_debug")] impl CpuElf64Writable for CpuManager { fn cpu_write_elf64_note( &mut self, dump_state: &DumpState, ) -> std::result::Result<(), GuestDebuggableError> { let mut coredump_file = dump_state.file.as_ref().unwrap(); for vcpu in &self.vcpus { let note_size = self.get_note_size(NoteDescType::Elf, 1); let mut pos: usize = 0; let mut buf = vec![0; note_size as usize]; let descsz = size_of::(); let vcpu_id = vcpu.lock().unwrap().id; let note = Elf64_Nhdr { n_namesz: COREDUMP_NAME_SIZE, n_descsz: descsz as u32, n_type: NT_PRSTATUS, }; let bytes: &[u8] = note.as_slice(); buf.splice(0.., bytes.to_vec()); pos += round_up!(size_of::(), 4); buf.resize(pos + 4, 0); buf.splice(pos.., "CORE".to_string().into_bytes()); pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); buf.resize(pos + 32 + 4, 0); let pid = vcpu_id as u64; let bytes: &[u8] = pid.as_slice(); buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */ pos += descsz - size_of::() - size_of::(); let orig_rax: u64 = 0; let gregs = self.vcpus[usize::from(vcpu_id)] .lock() .unwrap() .vcpu .get_regs() .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?; let regs1 = [ gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11, gregs.r10, ]; let regs2 = [ gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax, ]; let sregs = self.vcpus[usize::from(vcpu_id)] .lock() .unwrap() .vcpu .get_sregs() .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?; debug!( "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}", gregs.rip, gregs.rsp, sregs.gs.base, sregs.cs.selector, sregs.ss.selector, sregs.ds.selector, ); let regs = X86_64UserRegs { regs1, regs2, rip: gregs.rip, cs: sregs.cs.selector as u64, eflags: gregs.rflags, rsp: gregs.rsp, ss: sregs.ss.selector as u64, fs_base: sregs.fs.base as u64, gs_base: sregs.gs.base as u64, ds: sregs.ds.selector as u64, es: sregs.es.selector as u64, fs: sregs.fs.selector as u64, gs: sregs.gs.selector as u64, }; // let bytes: &[u8] = unsafe { any_as_u8_slice(®s) }; let bytes: &[u8] = regs.as_slice(); buf.resize(note_size as usize, 0); buf.splice(pos.., bytes.to_vec()); buf.resize(note_size as usize, 0); coredump_file 
                .write(&buf)
                .map_err(GuestDebuggableError::CoredumpFile)?;
        }

        Ok(())
    }

    // Write one "QEMU"-named VMM note per vCPU with the full CPU state
    // (GPRs, segment/descriptor registers, control registers, kernel GS base).
    fn cpu_write_vmm_note(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let mut coredump_file = dump_state.file.as_ref().unwrap();
        for vcpu in &self.vcpus {
            let note_size = self.get_note_size(NoteDescType::Vmm, 1);
            let mut pos: usize = 0;
            let mut buf = vec![0; note_size as usize];
            let descsz = size_of::<DumpCpusState>();
            let vcpu_id = vcpu.lock().unwrap().id;

            let note = Elf64_Nhdr {
                n_namesz: COREDUMP_NAME_SIZE,
                n_descsz: descsz as u32,
                n_type: 0,
            };

            let bytes: &[u8] = note.as_slice();
            buf.splice(0.., bytes.to_vec());
            pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
            buf.resize(pos + 4, 0);
            buf.splice(pos.., "QEMU".to_string().into_bytes());

            pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);

            let gregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_regs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;

            let regs1 = [
                gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp,
                gregs.rbp,
            ];
            let regs2 = [
                gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14,
                gregs.r15,
            ];

            let sregs = self.vcpus[usize::from(vcpu_id)]
                .lock()
                .unwrap()
                .vcpu
                .get_sregs()
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;

            let mut msrs = vec![MsrEntry {
                index: msr_index::MSR_KERNEL_GS_BASE,
                ..Default::default()
            }];

            self.vcpus[vcpu_id as usize]
                .lock()
                .unwrap()
                .vcpu
                .get_msrs(&mut msrs)
                .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
            let kernel_gs_base = msrs[0].data;

            let cs = CpuSegment::new(sregs.cs);
            let ds = CpuSegment::new(sregs.ds);
            let es = CpuSegment::new(sregs.es);
            let fs = CpuSegment::new(sregs.fs);
            let gs = CpuSegment::new(sregs.gs);
            let ss = CpuSegment::new(sregs.ss);
            let ldt = CpuSegment::new(sregs.ldt);
            let tr = CpuSegment::new(sregs.tr);
            let gdt = CpuSegment::new_from_table(sregs.gdt);
            let idt = CpuSegment::new_from_table(sregs.idt);
            let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
            let regs = DumpCpusState {
                version: 1,
                size: size_of::<DumpCpusState>() as u32,
                regs1,
                regs2,
                rip: gregs.rip,
                rflags: gregs.rflags,
                cs,
                ds,
                es,
                fs,
                gs,
                ss,
                ldt,
                tr,
                gdt,
                idt,
                cr,
                kernel_gs_base,
            };

            let bytes: &[u8] = regs.as_slice();
            buf.resize(note_size as usize, 0);
            buf.splice(pos.., bytes.to_vec());
            buf.resize(note_size as usize, 0);

            coredump_file
                .write(&buf)
                .map_err(GuestDebuggableError::CoredumpFile)?;
        }

        Ok(())
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use arch::x86_64::interrupts::*;
    use arch::x86_64::regs::*;
    use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters};

    #[test]
    fn test_setlint() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        assert!(hv.check_required_extensions().is_ok());
        // Calling get_lapic will fail if there is no irqchip created beforehand.
        assert!(vm.create_irq_chip().is_ok());
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let klapic_before: LapicState = vcpu.get_lapic().unwrap();

        // Compute the value that is expected to represent LVT0 and LVT1.
        let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
        let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
        let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
        let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);

        set_lint(&vcpu).unwrap();

        // Compute the value that represents LVT0 and LVT1 after set_lint.
        let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
        let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
        let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
        assert_eq!(lint0_mode_expected, lint0_mode_actual);
        assert_eq!(lint1_mode_expected, lint1_mode_actual);
    }

    #[test]
    fn test_setup_fpu() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_fpu(&vcpu).unwrap();

        let expected_fpu: FpuState = FpuState {
            fcw: 0x37f,
            mxcsr: 0x1f80,
            ..Default::default()
        };
        let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
        // TODO: auto-generate kvm related structures with PartialEq on.
        assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
        // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
        // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
        // The mxcsr will stay 0 and the assert below would fail, so it stays commented out until
        // we decide whether to remove it altogether.
        // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
    }

    #[test]
    fn test_setup_msrs() {
        use hypervisor::arch::x86::{msr_index, MsrEntry};

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_msrs(&vcpu).unwrap();

        // This test will check against the last MSR entry configured (the tenth one).
        // See create_msr_entries for details.
        let mut msrs = vec![MsrEntry {
            index: msr_index::MSR_IA32_MISC_ENABLE,
            ..Default::default()
        }];

        // get_msrs returns the number of MSRs that it succeeded in reading. We only want to read
        // one in this test case scenario.
        let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
        assert_eq!(read_msrs, 1);

        // Official entries that were set up when we did setup_msrs. We need to assert that the
        // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
        // expect.
        let entry_vec = vcpu.boot_msr_entries();
        assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
    }

    #[test]
    fn test_setup_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let expected_regs: StandardRegisters = StandardRegisters {
            rflags: 0x0000000000000002u64,
            rbx: arch::layout::PVH_INFO_START.0,
            rip: 1,
            ..Default::default()
        };

        setup_regs(&vcpu, expected_regs.rip).unwrap();

        let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
        assert_eq!(actual_regs, expected_regs);
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use arch::{aarch64::regs, layout};
    use hypervisor::kvm::aarch64::is_system_register;
    use hypervisor::kvm::kvm_bindings::{
        kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
        KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
    };
    use hypervisor::{arm64_core_reg_id, offset__of};
    use std::mem;

    #[test]
    fn test_setup_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();

        // Must fail when vcpu is not initialized yet.
        let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
        assert!(res.is_err());

        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();
        vcpu.vcpu_init(&kvi).unwrap();

        assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
    }

    #[test]
    fn test_read_mpidr() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when vcpu is not initialized yet.
        assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err());

        vcpu.vcpu_init(&kvi).unwrap();
        assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
    }

    #[test]
    fn test_is_system_register() {
        let offset = offset__of!(user_pt_regs, pc);
        let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
        assert!(!is_system_register(regid));
        let regid = KVM_REG_ARM64 as u64 | KVM_REG_SIZE_U64 as u64 | KVM_REG_ARM64_SYSREG as u64;
        assert!(is_system_register(regid));
    }

    #[test]
    fn test_save_restore_core_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when vcpu is not initialized yet.
        let res = vcpu.get_regs();
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to get core register: Exec format error (os error 8)"
        );

        let mut state = kvm_regs::default();
        let res = vcpu.set_regs(&state);
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to set core register: Exec format error (os error 8)"
        );

        vcpu.vcpu_init(&kvi).unwrap();
        let res = vcpu.get_regs();
        assert!(res.is_ok());
        state = res.unwrap();
        assert_eq!(state.regs.pstate, 0x3C5);

        assert!(vcpu.set_regs(&state).is_ok());
    }

    #[test]
    fn test_get_set_mpstate() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        let res = vcpu.get_mp_state();
        assert!(res.is_ok());
        assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
    }
}