// Copyright © 2019 Intel Corporation // // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause // // Copyright © 2020, Microsoft Corporation // // Copyright 2018-2019 CrowdStrike, Inc. // // #[cfg(target_arch = "aarch64")] use crate::aarch64::gic::KvmGicV3Its; #[cfg(target_arch = "aarch64")] pub use crate::aarch64::{ check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit, VcpuKvmState, MPIDR_EL1, }; #[cfg(target_arch = "aarch64")] use crate::arch::aarch64::gic::Vgic; use crate::cpu; use crate::device; use crate::hypervisor; use crate::vec_with_array_field; use crate::vm::{self, InterruptSourceConfig, VmOps}; #[cfg(target_arch = "aarch64")] use crate::{arm64_core_reg_id, offset__of}; use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd}; use std::any::Any; use std::collections::HashMap; #[cfg(target_arch = "aarch64")] use std::convert::TryInto; #[cfg(target_arch = "x86_64")] use std::fs::File; #[cfg(target_arch = "x86_64")] use std::os::unix::io::AsRawFd; #[cfg(feature = "tdx")] use std::os::unix::io::RawFd; use std::result; #[cfg(target_arch = "x86_64")] use std::sync::atomic::{AtomicBool, Ordering}; #[cfg(target_arch = "aarch64")] use std::sync::Mutex; use std::sync::{Arc, RwLock}; use vmm_sys_util::eventfd::EventFd; // x86_64 dependencies #[cfg(target_arch = "x86_64")] pub mod x86_64; #[cfg(target_arch = "x86_64")] use crate::arch::x86::{ CpuIdEntry, FpuState, LapicState, SpecialRegisters, StandardRegisters, NUM_IOAPIC_PINS, }; #[cfg(target_arch = "x86_64")] use crate::ClockData; use crate::{ CpuState, IoEventAddress, IrqRoutingEntry, MpState, UserMemoryRegion, USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE, }; #[cfg(target_arch = "aarch64")] use aarch64::{RegList, Register, StandardRegisters}; #[cfg(target_arch = "x86_64")] use kvm_bindings::{ kvm_enable_cap, kvm_guest_debug, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP, KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_GUESTDBG_USE_HW_BP, }; #[cfg(target_arch = "x86_64")] use x86_64::check_required_kvm_extensions; #[cfg(target_arch = "x86_64")] pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState, Xsave}; // aarch64 dependencies #[cfg(target_arch = "aarch64")] pub mod aarch64; pub use kvm_bindings; #[cfg(feature = "tdx")] use kvm_bindings::KVMIO; pub use kvm_bindings::{ kvm_clock_data, kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing, kvm_irq_routing_entry, kvm_mp_state, kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID, }; #[cfg(target_arch = "aarch64")] use kvm_bindings::{ kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM_CORE, KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64, }; pub use kvm_ioctls; pub use kvm_ioctls::{Cap, Kvm}; #[cfg(target_arch = "aarch64")] use std::mem; use thiserror::Error; #[cfg(feature = "tdx")] use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_expr, ioctl_ioc_nr, ioctl_iowr_nr}; /// /// Export generically-named wrappers of kvm-bindings for Unix-based platforms /// pub use { kvm_bindings::kvm_create_device as CreateDevice, kvm_bindings::kvm_device_attr as DeviceAttr, kvm_bindings::kvm_run, kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::DeviceFd, kvm_ioctls::VcpuExit, }; #[cfg(target_arch = "x86_64")] const KVM_CAP_SGX_ATTRIBUTE: u32 = 196; #[cfg(feature = "tdx")] const KVM_EXIT_TDX: u32 = 35; #[cfg(feature = "tdx")] const TDG_VP_VMCALL_GET_QUOTE: u64 = 
0x10002;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SUCCESS: u64 = 0;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;

#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);

#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}

#[cfg(feature = "tdx")]
pub enum TdxExitDetails {
    GetQuote,
    SetupEventNotifyInterrupt,
}

#[cfg(feature = "tdx")]
pub enum TdxExitStatus {
    Success,
    InvalidOperand,
}

#[cfg(feature = "tdx")]
const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCpuidConfig {
    pub leaf: u32,
    pub sub_leaf: u32,
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCapabilities {
    pub attrs_fixed0: u64,
    pub attrs_fixed1: u64,
    pub xfam_fixed0: u64,
    pub xfam_fixed1: u64,
    pub nr_cpuid_configs: u32,
    pub padding: u32,
    pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
}

impl From<kvm_userspace_memory_region> for UserMemoryRegion {
    fn from(region: kvm_userspace_memory_region) -> Self {
        let mut flags = USER_MEMORY_REGION_READ;
        if region.flags & KVM_MEM_READONLY == 0 {
            flags |= USER_MEMORY_REGION_WRITE;
        }
        if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
            flags |= USER_MEMORY_REGION_LOG_DIRTY;
        }

        UserMemoryRegion {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<UserMemoryRegion> for kvm_userspace_memory_region {
    fn from(region: UserMemoryRegion) -> Self {
        assert!(
            region.flags & USER_MEMORY_REGION_READ != 0,
            "KVM mapped memory is always readable"
        );

        let mut flags = 0;
        if region.flags & USER_MEMORY_REGION_WRITE == 0 {
            flags |= KVM_MEM_READONLY;
        }
        if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
            flags |= KVM_MEM_LOG_DIRTY_PAGES;
        }

        kvm_userspace_memory_region {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<kvm_mp_state> for MpState {
    fn from(s: kvm_mp_state) -> Self {
        MpState::Kvm(s)
    }
}

impl From<MpState> for kvm_mp_state {
    fn from(ms: MpState) -> Self {
        match ms {
            MpState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("MpState is not valid"),
        }
    }
}

impl From<kvm_ioctls::IoEventAddress> for IoEventAddress {
    fn from(a: kvm_ioctls::IoEventAddress) -> Self {
        match a {
            kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x),
            kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<IoEventAddress> for kvm_ioctls::IoEventAddress {
    fn from(a: IoEventAddress) -> Self {
        match a {
            IoEventAddress::Pio(x) => Self::Pio(x),
            IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<VcpuKvmState> for CpuState {
    fn from(s: VcpuKvmState) -> Self {
        CpuState::Kvm(s)
    }
}

impl From<CpuState> for VcpuKvmState {
    fn from(s: CpuState) -> Self {
        match s {
            CpuState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("CpuState is not valid"),
        }
    }
}

#[cfg(target_arch = "x86_64")]
impl From<kvm_clock_data> for ClockData {
    fn from(d: kvm_clock_data) -> Self {
        ClockData::Kvm(d)
    }
}

#[cfg(target_arch = "x86_64")]
impl From<ClockData> for kvm_clock_data {
    fn from(ms: ClockData) -> Self {
        match ms {
            ClockData::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("ClockData is not valid"),
        }
    }
}

impl From<kvm_irq_routing_entry> for
IrqRoutingEntry { fn from(s: kvm_irq_routing_entry) -> Self { IrqRoutingEntry::Kvm(s) } } impl From for kvm_irq_routing_entry { fn from(e: IrqRoutingEntry) -> Self { match e { IrqRoutingEntry::Kvm(e) => e, /* Needed in case other hypervisors are enabled */ #[allow(unreachable_patterns)] _ => panic!("IrqRoutingEntry is not valid"), } } } struct KvmDirtyLogSlot { slot: u32, guest_phys_addr: u64, memory_size: u64, userspace_addr: u64, } /// Wrapper over KVM VM ioctls. pub struct KvmVm { fd: Arc, #[cfg(target_arch = "x86_64")] msrs: MsrEntries, dirty_log_slots: Arc>>, } /// /// Implementation of Vm trait for KVM /// Example: /// #[cfg(feature = "kvm")] /// extern crate hypervisor /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap(); /// let hypervisor: Arc = Arc::new(kvm); /// let vm = hypervisor.create_vm().expect("new VM fd creation failed"); /// vm.set/get().unwrap() /// impl vm::Vm for KvmVm { #[cfg(target_arch = "x86_64")] /// /// Sets the address of the one-page region in the VM's address space. /// fn set_identity_map_address(&self, address: u64) -> vm::Result<()> { self.fd .set_identity_map_address(address) .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into())) } #[cfg(target_arch = "x86_64")] /// /// Sets the address of the three-page region in the VM's address space. /// fn set_tss_address(&self, offset: usize) -> vm::Result<()> { self.fd .set_tss_address(offset) .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into())) } /// /// Creates an in-kernel interrupt controller. /// fn create_irq_chip(&self) -> vm::Result<()> { self.fd .create_irq_chip() .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into())) } /// /// Registers an event that will, when signaled, trigger the `gsi` IRQ. /// fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> { self.fd .register_irqfd(fd, gsi) .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into())) } /// /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ. /// fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> { self.fd .unregister_irqfd(fd, gsi) .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into())) } /// /// Creates a VcpuFd object from a vcpu RawFd. /// fn create_vcpu( &self, id: u8, vm_ops: Option>, ) -> vm::Result> { let vc = self .fd .create_vcpu(id as u64) .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?; let vcpu = KvmVcpu { fd: vc, #[cfg(target_arch = "x86_64")] msrs: self.msrs.clone(), vm_ops, #[cfg(target_arch = "x86_64")] hyperv_synic: AtomicBool::new(false), }; Ok(Arc::new(vcpu)) } #[cfg(target_arch = "aarch64")] /// /// Creates a virtual GIC device. /// fn create_vgic( &self, vcpu_count: u64, dist_addr: u64, dist_size: u64, redist_size: u64, msi_size: u64, nr_irqs: u32, ) -> vm::Result>> { let gic_device = KvmGicV3Its::new( self, vcpu_count, dist_addr, dist_size, redist_size, msi_size, nr_irqs, ) .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?; Ok(Arc::new(Mutex::new(gic_device))) } /// /// Registers an event to be signaled whenever a certain address is written to. 
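    /// A minimal usage sketch, assuming a `vm` handle implementing `vm::Vm` and a
    /// hypothetical MMIO notification address; KVM signals the `EventFd` on every
    /// matching guest write instead of exiting to the VMM:
    /// let notify = EventFd::new(libc::EFD_NONBLOCK).unwrap();
    /// let addr = IoEventAddress::Mmio(0xd000_0000);
    /// vm.register_ioevent(&notify, &addr, Some(vm::DataMatch::DataMatch32(0))).unwrap();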
/// fn register_ioevent( &self, fd: &EventFd, addr: &IoEventAddress, datamatch: Option, ) -> vm::Result<()> { let addr = &kvm_ioctls::IoEventAddress::from(*addr); if let Some(dm) = datamatch { match dm { vm::DataMatch::DataMatch32(kvm_dm32) => self .fd .register_ioevent(fd, addr, kvm_dm32) .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())), vm::DataMatch::DataMatch64(kvm_dm64) => self .fd .register_ioevent(fd, addr, kvm_dm64) .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())), } } else { self.fd .register_ioevent(fd, addr, NoDatamatch) .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())) } } /// /// Unregisters an event from a certain address it has been previously registered to. /// fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> { let addr = &kvm_ioctls::IoEventAddress::from(*addr); self.fd .unregister_ioevent(fd, addr, NoDatamatch) .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into())) } /// /// Constructs a routing entry /// fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry { match &config { InterruptSourceConfig::MsiIrq(cfg) => { let mut kvm_route = kvm_irq_routing_entry { gsi, type_: KVM_IRQ_ROUTING_MSI, ..Default::default() }; kvm_route.u.msi.address_lo = cfg.low_addr; kvm_route.u.msi.address_hi = cfg.high_addr; kvm_route.u.msi.data = cfg.data; if self.check_extension(crate::kvm::Cap::MsiDevid) { // On AArch64, there is limitation on the range of the 'devid', // it can not be greater than 65536 (the max of u16). // // BDF can not be used directly, because 'segment' is in high // 16 bits. The layout of the u32 BDF is: // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --| // | segment | bus | device | function | // // Now that we support 1 bus only in a segment, we can build a // 'devid' by replacing the 'bus' bits with the low 8 bits of // 'segment' data. // This way we can resolve the range checking problem and give // different `devid` to all the devices. Limitation is that at // most 256 segments can be supported. // let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff; kvm_route.flags = KVM_MSI_VALID_DEVID; kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid; } kvm_route.into() } InterruptSourceConfig::LegacyIrq(cfg) => { let mut kvm_route = kvm_irq_routing_entry { gsi, type_: KVM_IRQ_ROUTING_IRQCHIP, ..Default::default() }; kvm_route.u.irqchip.irqchip = cfg.irqchip; kvm_route.u.irqchip.pin = cfg.pin; kvm_route.into() } } } /// /// Sets the GSI routing table entries, overwriting any previously set /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl. /// fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> { let mut irq_routing = vec_with_array_field::(entries.len()); irq_routing[0].nr = entries.len() as u32; irq_routing[0].flags = 0; let entries: Vec = entries .iter() .map(|entry| match entry { IrqRoutingEntry::Kvm(e) => *e, #[allow(unreachable_patterns)] _ => panic!("IrqRoutingEntry type is wrong"), }) .collect(); // SAFETY: irq_routing initialized with entries.len() and now it is being turned into // entries_slice with entries.len() again. It is guaranteed to be large enough to hold // everything from entries. 
unsafe { let entries_slice: &mut [kvm_irq_routing_entry] = irq_routing[0].entries.as_mut_slice(entries.len()); entries_slice.copy_from_slice(&entries); } self.fd .set_gsi_routing(&irq_routing[0]) .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into())) } /// /// Creates a memory region structure that can be used with {create/remove}_user_memory_region /// fn make_user_memory_region( &self, slot: u32, guest_phys_addr: u64, memory_size: u64, userspace_addr: u64, readonly: bool, log_dirty_pages: bool, ) -> UserMemoryRegion { kvm_userspace_memory_region { slot, guest_phys_addr, memory_size, userspace_addr, flags: if readonly { KVM_MEM_READONLY } else { 0 } | if log_dirty_pages { KVM_MEM_LOG_DIRTY_PAGES } else { 0 }, } .into() } /// /// Creates a guest physical memory region. /// fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> { let mut region: kvm_userspace_memory_region = user_memory_region.into(); if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 { if (region.flags & KVM_MEM_READONLY) != 0 { return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!( "Error creating regions with both 'dirty-pages-log' and 'read-only'." ))); } // Keep track of the regions that need dirty pages log self.dirty_log_slots.write().unwrap().insert( region.slot, KvmDirtyLogSlot { slot: region.slot, guest_phys_addr: region.guest_phys_addr, memory_size: region.memory_size, userspace_addr: region.userspace_addr, }, ); // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`. // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`. region.flags = 0; } // SAFETY: Safe because guest regions are guaranteed not to overlap. unsafe { self.fd .set_user_memory_region(region) .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into())) } } /// /// Removes a guest physical memory region. /// fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> { let mut region: kvm_userspace_memory_region = user_memory_region.into(); // Remove the corresponding entry from "self.dirty_log_slots" if needed self.dirty_log_slots.write().unwrap().remove(®ion.slot); // Setting the size to 0 means "remove" region.memory_size = 0; // SAFETY: Safe because guest regions are guaranteed not to overlap. unsafe { self.fd .set_user_memory_region(region) .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into())) } } /// /// Creates an emulated device in the kernel. /// /// See the documentation for `KVM_CREATE_DEVICE`. fn create_device(&self, device: &mut CreateDevice) -> vm::Result> { let device_fd = self .fd .create_device(device) .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?; Ok(Arc::new(device_fd)) } /// /// Returns the preferred CPU target type which can be emulated by KVM on underlying host. /// #[cfg(target_arch = "aarch64")] fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> { self.fd .get_preferred_target(kvi) .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into())) } #[cfg(target_arch = "x86_64")] fn enable_split_irq(&self) -> vm::Result<()> { // Create split irqchip // Only the local APIC is emulated in kernel, both PICs and IOAPIC // are not. 
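        // With the split model the IOAPIC and PICs are emulated by the VMM, and
        // their interrupts are delivered through the GSI routing table built via
        // make_routing_entry()/set_gsi_routing().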
let mut cap = kvm_enable_cap { cap: KVM_CAP_SPLIT_IRQCHIP, ..Default::default() }; cap.args[0] = NUM_IOAPIC_PINS as u64; self.fd .enable_cap(&cap) .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?; Ok(()) } #[cfg(target_arch = "x86_64")] fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> { let mut cap = kvm_enable_cap { cap: KVM_CAP_SGX_ATTRIBUTE, ..Default::default() }; cap.args[0] = file.as_raw_fd() as u64; self.fd .enable_cap(&cap) .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?; Ok(()) } /// Retrieve guest clock. #[cfg(target_arch = "x86_64")] fn get_clock(&self) -> vm::Result { Ok(self .fd .get_clock() .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))? .into()) } /// Set guest clock. #[cfg(target_arch = "x86_64")] fn set_clock(&self, data: &ClockData) -> vm::Result<()> { let data = (*data).into(); self.fd .set_clock(&data) .map_err(|e| vm::HypervisorVmError::SetClock(e.into())) } /// Checks if a particular `Cap` is available. fn check_extension(&self, c: Cap) -> bool { self.fd.check_extension(c) } /// Create a device that is used for passthrough fn create_passthrough_device(&self) -> vm::Result> { let mut vfio_dev = kvm_create_device { type_: kvm_device_type_KVM_DEV_TYPE_VFIO, fd: 0, flags: 0, }; self.create_device(&mut vfio_dev) .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into())) } /// /// Start logging dirty pages /// fn start_dirty_log(&self) -> vm::Result<()> { let dirty_log_slots = self.dirty_log_slots.read().unwrap(); for (_, s) in dirty_log_slots.iter() { let region = kvm_userspace_memory_region { slot: s.slot, guest_phys_addr: s.guest_phys_addr, memory_size: s.memory_size, userspace_addr: s.userspace_addr, flags: KVM_MEM_LOG_DIRTY_PAGES, }; // SAFETY: Safe because guest regions are guaranteed not to overlap. unsafe { self.fd .set_user_memory_region(region) .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?; } } Ok(()) } /// /// Stop logging dirty pages /// fn stop_dirty_log(&self) -> vm::Result<()> { let dirty_log_slots = self.dirty_log_slots.read().unwrap(); for (_, s) in dirty_log_slots.iter() { let region = kvm_userspace_memory_region { slot: s.slot, guest_phys_addr: s.guest_phys_addr, memory_size: s.memory_size, userspace_addr: s.userspace_addr, flags: 0, }; // SAFETY: Safe because guest regions are guaranteed not to overlap. 
unsafe { self.fd .set_user_memory_region(region) .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?; } } Ok(()) } /// /// Get dirty pages bitmap (one bit per page) /// fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result> { self.fd .get_dirty_log(slot, memory_size as usize) .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into())) } /// /// Initialize TDX for this VM /// #[cfg(feature = "tdx")] fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> { use std::io::{Error, ErrorKind}; let cpuid: Vec = cpuid.iter().map(|e| (*e).into()).collect(); let kvm_cpuid = kvm_bindings::CpuId::from_entries(&cpuid).map_err(|_| { vm::HypervisorVmError::InitializeTdx(Error::new( ErrorKind::Other, "failed to allocate CpuId", )) })?; #[repr(C)] struct TdxInitVm { max_vcpus: u32, tsc_khz: u32, attributes: u64, cpuid: u64, mrconfigid: [u64; 6], mrowner: [u64; 6], mrownerconfig: [u64; 6], reserved: [u64; 43], } let data = TdxInitVm { max_vcpus, tsc_khz: 0, attributes: 0, cpuid: kvm_cpuid.as_fam_struct_ptr() as u64, mrconfigid: [0; 6], mrowner: [0; 6], mrownerconfig: [0; 6], reserved: [0; 43], }; tdx_command( &self.fd.as_raw_fd(), TdxCommand::InitVm, 0, &data as *const _ as u64, ) .map_err(vm::HypervisorVmError::InitializeTdx) } /// /// Finalize the TDX setup for this VM /// #[cfg(feature = "tdx")] fn tdx_finalize(&self) -> vm::Result<()> { tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0) .map_err(vm::HypervisorVmError::FinalizeTdx) } /// /// Initialize memory regions for the TDX VM /// #[cfg(feature = "tdx")] fn tdx_init_memory_region( &self, host_address: u64, guest_address: u64, size: u64, measure: bool, ) -> vm::Result<()> { #[repr(C)] struct TdxInitMemRegion { host_address: u64, guest_address: u64, pages: u64, } let data = TdxInitMemRegion { host_address, guest_address, pages: size / 4096, }; tdx_command( &self.fd.as_raw_fd(), TdxCommand::InitMemRegion, if measure { 1 } else { 0 }, &data as *const _ as u64, ) .map_err(vm::HypervisorVmError::InitMemRegionTdx) } } #[cfg(feature = "tdx")] fn tdx_command( fd: &RawFd, command: TdxCommand, metadata: u32, data: u64, ) -> std::result::Result<(), std::io::Error> { #[repr(C)] struct TdxIoctlCmd { command: TdxCommand, metadata: u32, data: u64, } let cmd = TdxIoctlCmd { command, metadata, data, }; // SAFETY: FFI call. All input parameters are valid. let ret = unsafe { ioctl_with_val( fd, KVM_MEMORY_ENCRYPT_OP(), &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong, ) }; if ret < 0 { return Err(std::io::Error::last_os_error()); } Ok(()) } /// Wrapper over KVM system ioctls. 
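///
/// Construction follows the same pattern as the trait-level examples in this
/// module; a minimal sketch:
/// let kvm = KvmHypervisor::new().unwrap();
/// let vm = kvm.create_vm().unwrap();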
pub struct KvmHypervisor { kvm: Kvm, } /// Enum for KVM related error #[derive(Debug, Error)] pub enum KvmError { #[error("Capability missing: {0:?}")] CapabilityMissing(Cap), } pub type KvmResult = result::Result; impl KvmHypervisor { /// Create a hypervisor based on Kvm pub fn new() -> hypervisor::Result { let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?; let api_version = kvm_obj.get_api_version(); if api_version != kvm_bindings::KVM_API_VERSION as i32 { return Err(hypervisor::HypervisorError::IncompatibleApiVersion); } Ok(KvmHypervisor { kvm: kvm_obj }) } } /// Implementation of Hypervisor trait for KVM /// Example: /// #[cfg(feature = "kvm")] /// extern crate hypervisor /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap(); /// let hypervisor: Arc = Arc::new(kvm); /// let vm = hypervisor.create_vm().expect("new VM fd creation failed"); /// impl hypervisor::Hypervisor for KvmHypervisor { /// Create a KVM vm object of a specific VM type and return the object as Vm trait object /// Example /// # extern crate hypervisor; /// # use hypervisor::KvmHypervisor; /// use hypervisor::KvmVm; /// let hypervisor = KvmHypervisor::new().unwrap(); /// let vm = hypervisor.create_vm_with_type(KvmVmType::LegacyVm).unwrap() /// fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result> { let fd: VmFd; loop { match self.kvm.create_vm_with_type(vm_type) { Ok(res) => fd = res, Err(e) => { if e.errno() == libc::EINTR { // If the error returned is EINTR, which means the // ioctl has been interrupted, we have to retry as // this can't be considered as a regular error. continue; } else { return Err(hypervisor::HypervisorError::VmCreate(e.into())); } } } break; } let vm_fd = Arc::new(fd); #[cfg(target_arch = "x86_64")] { let msr_list = self.get_msr_list()?; let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize; let mut msrs = MsrEntries::new(num_msrs).unwrap(); let indices = msr_list.as_slice(); let msr_entries = msrs.as_mut_slice(); for (pos, index) in indices.iter().enumerate() { msr_entries[pos].index = *index; } Ok(Arc::new(KvmVm { fd: vm_fd, msrs, dirty_log_slots: Arc::new(RwLock::new(HashMap::new())), })) } #[cfg(target_arch = "aarch64")] { Ok(Arc::new(KvmVm { fd: vm_fd, dirty_log_slots: Arc::new(RwLock::new(HashMap::new())), })) } } /// Create a KVM vm object and return the object as Vm trait object /// Example /// # extern crate hypervisor; /// # use hypervisor::KvmHypervisor; /// use hypervisor::KvmVm; /// let hypervisor = KvmHypervisor::new().unwrap(); /// let vm = hypervisor.create_vm().unwrap() /// fn create_vm(&self) -> hypervisor::Result> { #[allow(unused_mut)] let mut vm_type: u64 = 0; // Create with default platform type // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA // size from the host and use that when creating the VM, which may // avoid unnecessary VM creation failures. #[cfg(target_arch = "aarch64")] if self.kvm.check_extension(Cap::ArmVmIPASize) { vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap(); } self.create_vm_with_type(vm_type) } fn check_required_extensions(&self) -> hypervisor::Result<()> { check_required_kvm_extensions(&self.kvm) .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into())) } #[cfg(target_arch = "x86_64")] /// /// X86 specific call to get the system supported CPUID values. 
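    /// Callers typically fetch this once, patch the entries (feature filtering,
    /// topology leaves, etc.) and then apply them per vCPU; a rough sketch:
    /// let cpuid = hypervisor.get_cpuid().unwrap();
    /// // ... adjust entries as needed ...
    /// vcpu.set_cpuid2(&cpuid).unwrap();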
/// fn get_cpuid(&self) -> hypervisor::Result> { let kvm_cpuid = self .kvm .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES) .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?; let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect(); Ok(v) } #[cfg(target_arch = "x86_64")] /// /// Retrieve the list of MSRs supported by KVM. /// fn get_msr_list(&self) -> hypervisor::Result { self.kvm .get_msr_index_list() .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into())) } #[cfg(target_arch = "aarch64")] /// /// Retrieve AArch64 host maximum IPA size supported by KVM. /// fn get_host_ipa_limit(&self) -> i32 { self.kvm.get_host_ipa_limit() } /// /// Retrieve TDX capabilities /// #[cfg(feature = "tdx")] fn tdx_capabilities(&self) -> hypervisor::Result { let data = TdxCapabilities { nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32, ..Default::default() }; tdx_command( &self.kvm.as_raw_fd(), TdxCommand::Capabilities, 0, &data as *const _ as u64, ) .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?; Ok(data) } } /// Vcpu struct for KVM pub struct KvmVcpu { fd: VcpuFd, #[cfg(target_arch = "x86_64")] msrs: MsrEntries, vm_ops: Option>, #[cfg(target_arch = "x86_64")] hyperv_synic: AtomicBool, } /// Implementation of Vcpu trait for KVM /// Example: /// #[cfg(feature = "kvm")] /// extern crate hypervisor /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap(); /// let hypervisor: Arc = Arc::new(kvm); /// let vm = hypervisor.create_vm().expect("new VM fd creation failed"); /// let vcpu = vm.create_vcpu(0, None).unwrap(); /// vcpu.get/set().unwrap() /// impl cpu::Vcpu for KvmVcpu { #[cfg(target_arch = "x86_64")] /// /// Returns the vCPU general purpose registers. /// fn get_regs(&self) -> cpu::Result { Ok(self .fd .get_regs() .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))? .into()) } /// /// Returns the vCPU general purpose registers. /// The `KVM_GET_REGS` ioctl is not available on AArch64, `KVM_GET_ONE_REG` /// is used to get registers one by one. /// #[cfg(target_arch = "aarch64")] fn get_regs(&self) -> cpu::Result { let mut state: StandardRegisters = kvm_regs::default(); let mut off = offset__of!(user_pt_regs, regs); // There are 31 user_pt_regs: // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72 // These actually are the general-purpose registers of the Armv8-a // architecture (i.e x0-x30 if used as a 64bit register or w0-30 when used as a 32bit register). for i in 0..31 { state.regs.regs[i] = self .fd .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?; off += std::mem::size_of::(); } // We are now entering the "Other register" section of the ARMv8-a architecture. // First one, stack pointer. let off = offset__of!(user_pt_regs, sp); state.regs.sp = self .fd .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?; // Second one, the program counter. let off = offset__of!(user_pt_regs, pc); state.regs.pc = self .fd .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?; // Next is the processor state. 
let off = offset__of!(user_pt_regs, pstate); state.regs.pstate = self .fd .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?; // The stack pointer associated with EL1 let off = offset__of!(kvm_regs, sp_el1); state.sp_el1 = self .fd .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?; // Exception Link Register for EL1, when taking an exception to EL1, this register // holds the address to which to return afterwards. let off = offset__of!(kvm_regs, elr_el1); state.elr_el1 = self .fd .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?; // Saved Program Status Registers, there are 5 of them used in the kernel. let mut off = offset__of!(kvm_regs, spsr); for i in 0..KVM_NR_SPSR as usize { state.spsr[i] = self .fd .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?; off += std::mem::size_of::(); } // Now moving on to floting point registers which are stored in the user_fpsimd_state in the kernel: // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53 let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs); for i in 0..32 { state.fp_regs.vregs[i] = self .fd .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off)) .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))? .into(); off += mem::size_of::(); } // Floating-point Status Register let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr); state.fp_regs.fpsr = self .fd .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off)) .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))? as u32; // Floating-point Control Register let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr); state.fp_regs.fpcr = self .fd .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off)) .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))? as u32; Ok(state) } #[cfg(target_arch = "x86_64")] /// /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl. /// fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> { let regs = (*regs).into(); self.fd .set_regs(®s) .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into())) } /// /// Sets the vCPU general purpose registers. /// The `KVM_SET_REGS` ioctl is not available on AArch64, `KVM_SET_ONE_REG` /// is used to set registers one by one. /// #[cfg(target_arch = "aarch64")] fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> { // The function follows the exact identical order from `state`. Look there // for some additional info on registers. 
let mut off = offset__of!(user_pt_regs, regs); for i in 0..31 { self.fd .set_one_reg( arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.regs[i], ) .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; off += std::mem::size_of::(); } let off = offset__of!(user_pt_regs, sp); self.fd .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp) .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; let off = offset__of!(user_pt_regs, pc); self.fd .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc) .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; let off = offset__of!(user_pt_regs, pstate); self.fd .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate) .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; let off = offset__of!(kvm_regs, sp_el1); self.fd .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1) .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; let off = offset__of!(kvm_regs, elr_el1); self.fd .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1) .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; let mut off = offset__of!(kvm_regs, spsr); for i in 0..KVM_NR_SPSR as usize { self.fd .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i]) .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; off += std::mem::size_of::(); } let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs); for i in 0..32 { self.fd .set_one_reg( arm64_core_reg_id!(KVM_REG_SIZE_U128, off), state.fp_regs.vregs[i] as u64, ) .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; off += mem::size_of::(); } let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr); self.fd .set_one_reg( arm64_core_reg_id!(KVM_REG_SIZE_U32, off), state.fp_regs.fpsr as u64, ) .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr); self.fd .set_one_reg( arm64_core_reg_id!(KVM_REG_SIZE_U32, off), state.fp_regs.fpcr as u64, ) .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; Ok(()) } #[cfg(target_arch = "aarch64")] /// /// Set attribute for vcpu. /// fn set_vcpu_attr(&self, attr: &DeviceAttr) -> cpu::Result<()> { self.fd .set_device_attr(attr) .map_err(|e| cpu::HypervisorCpuError::SetVcpuAttribute(e.into())) } #[cfg(target_arch = "aarch64")] /// /// Check if vcpu has a certain attribute. /// fn has_vcpu_attr(&self, attr: &DeviceAttr) -> cpu::Result<()> { self.fd .has_device_attr(attr) .map_err(|e| cpu::HypervisorCpuError::HasVcpuAttribute(e.into())) } #[cfg(target_arch = "x86_64")] /// /// Returns the vCPU special registers. /// fn get_sregs(&self) -> cpu::Result { Ok(self .fd .get_sregs() .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))? .into()) } #[cfg(target_arch = "x86_64")] /// /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl. /// fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> { let sregs = (*sregs).into(); self.fd .set_sregs(&sregs) .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into())) } #[cfg(target_arch = "x86_64")] /// /// Returns the floating point state (FPU) from the vCPU. /// fn get_fpu(&self) -> cpu::Result { Ok(self .fd .get_fpu() .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))? 
.into()) } #[cfg(target_arch = "x86_64")] /// /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioct. /// fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> { let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into(); self.fd .set_fpu(&fpu) .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into())) } #[cfg(target_arch = "x86_64")] /// /// X86 specific call to setup the CPUID registers. /// fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> { let cpuid: Vec = cpuid.iter().map(|e| (*e).into()).collect(); let kvm_cpuid = ::from_entries(&cpuid) .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?; self.fd .set_cpuid2(&kvm_cpuid) .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into())) } #[cfg(target_arch = "x86_64")] /// /// X86 specific call to enable HyperV SynIC /// fn enable_hyperv_synic(&self) -> cpu::Result<()> { // Update the information about Hyper-V SynIC being enabled and // emulated as it will influence later which MSRs should be saved. self.hyperv_synic.store(true, Ordering::Release); let cap = kvm_enable_cap { cap: KVM_CAP_HYPERV_SYNIC, ..Default::default() }; self.fd .enable_cap(&cap) .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into())) } /// /// X86 specific call to retrieve the CPUID registers. /// #[cfg(target_arch = "x86_64")] fn get_cpuid2(&self, num_entries: usize) -> cpu::Result> { let kvm_cpuid = self .fd .get_cpuid2(num_entries) .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?; let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect(); Ok(v) } #[cfg(target_arch = "x86_64")] /// /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller). /// fn get_lapic(&self) -> cpu::Result { Ok(self .fd .get_lapic() .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))? .into()) } #[cfg(target_arch = "x86_64")] /// /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller). /// fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> { let klapic: kvm_bindings::kvm_lapic_state = (*klapic).clone().into(); self.fd .set_lapic(&klapic) .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into())) } #[cfg(target_arch = "x86_64")] /// /// Returns the model-specific registers (MSR) for this vCPU. /// fn get_msrs(&self, msrs: &mut MsrEntries) -> cpu::Result { self.fd .get_msrs(msrs) .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into())) } #[cfg(target_arch = "x86_64")] /// /// Setup the model-specific registers (MSR) for this vCPU. /// Returns the number of MSR entries actually written. /// fn set_msrs(&self, msrs: &MsrEntries) -> cpu::Result { self.fd .set_msrs(msrs) .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into())) } /// /// Returns the vcpu's current "multiprocessing state". /// fn get_mp_state(&self) -> cpu::Result { Ok(self .fd .get_mp_state() .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))? .into()) } /// /// Sets the vcpu's current "multiprocessing state". /// fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> { self.fd .set_mp_state(mp_state.into()) .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into())) } #[cfg(target_arch = "x86_64")] /// /// Translates guest virtual address to guest physical address using the `KVM_TRANSLATE` ioctl. 
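    /// Returns the translated GPA together with a flags word (currently always 0);
    /// for example, with a hypothetical guest-virtual address:
    /// let (gpa, _flags) = vcpu.translate_gva(0xffff_8000_0000_1000, 0).unwrap();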
/// fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> { let tr = self .fd .translate_gva(gva) .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?; // tr.valid is set if the GVA is mapped to valid GPA. match tr.valid { 0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!( "Invalid GVA: {:#x}", gva ))), _ => Ok((tr.physical_address, 0)), } } /// /// Triggers the running of the current virtual CPU returning an exit reason. /// fn run(&self) -> std::result::Result { match self.fd.run() { Ok(run) => match run { #[cfg(target_arch = "x86_64")] VcpuExit::IoIn(addr, data) => { if let Some(vm_ops) = &self.vm_ops { return vm_ops .pio_read(addr.into(), data) .map(|_| cpu::VmExit::Ignore) .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into())); } Ok(cpu::VmExit::IoIn(addr, data)) } #[cfg(target_arch = "x86_64")] VcpuExit::IoOut(addr, data) => { if let Some(vm_ops) = &self.vm_ops { return vm_ops .pio_write(addr.into(), data) .map(|_| cpu::VmExit::Ignore) .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into())); } Ok(cpu::VmExit::IoOut(addr, data)) } #[cfg(target_arch = "x86_64")] VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)), #[cfg(target_arch = "x86_64")] VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset), #[cfg(target_arch = "aarch64")] VcpuExit::SystemEvent(event_type, flags) => { use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN}; // On Aarch64, when the VM is shutdown, run() returns // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN if event_type == KVM_SYSTEM_EVENT_RESET { Ok(cpu::VmExit::Reset) } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN { Ok(cpu::VmExit::Shutdown) } else { Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( "Unexpected system event with type 0x{:x}, flags 0x{:x}", event_type, flags ))) } } VcpuExit::MmioRead(addr, data) => { if let Some(vm_ops) = &self.vm_ops { return vm_ops .mmio_read(addr, data) .map(|_| cpu::VmExit::Ignore) .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into())); } Ok(cpu::VmExit::MmioRead(addr, data)) } VcpuExit::MmioWrite(addr, data) => { if let Some(vm_ops) = &self.vm_ops { return vm_ops .mmio_write(addr, data) .map(|_| cpu::VmExit::Ignore) .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into())); } Ok(cpu::VmExit::MmioWrite(addr, data)) } VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv), #[cfg(feature = "tdx")] VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx), VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug), r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( "Unexpected exit reason on vcpu run: {:?}", r ))), }, Err(ref e) => match e.errno() { libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore), _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( "VCPU error {:?}", e ))), }, } } #[cfg(target_arch = "x86_64")] /// /// Let the guest know that it has been paused, which prevents from /// potential soft lockups when being resumed. /// fn notify_guest_clock_paused(&self) -> cpu::Result<()> { if let Err(e) = self.fd.kvmclock_ctrl() { // Linux kernel returns -EINVAL if the PV clock isn't yet initialised // which could be because we're still in firmware or the guest doesn't // use KVM clock. if e.errno() != libc::EINVAL { return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into())); } } Ok(()) } #[cfg(target_arch = "x86_64")] /// /// Sets debug registers to set hardware breakpoints and/or enable single step. 
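    /// At most four hardware breakpoints (DR0-DR3) are supported; for instance, with
    /// a hypothetical guest address and single-step enabled:
    /// vcpu.set_guest_debug(&[vm_memory::GuestAddress(0x10_0000)], true).unwrap();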
/// fn set_guest_debug( &self, addrs: &[vm_memory::GuestAddress], singlestep: bool, ) -> cpu::Result<()> { if addrs.len() > 4 { return Err(cpu::HypervisorCpuError::SetDebugRegs(anyhow!( "Support 4 breakpoints at most but {} addresses are passed", addrs.len() ))); } let mut dbg = kvm_guest_debug { control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP, ..Default::default() }; if singlestep { dbg.control |= KVM_GUESTDBG_SINGLESTEP; } // Set bits 9 and 10. // bit 9: GE (global exact breakpoint enable) flag. // bit 10: always 1. dbg.arch.debugreg[7] = 0x0600; for (i, addr) in addrs.iter().enumerate() { dbg.arch.debugreg[i] = addr.0; // Set global breakpoint enable flag dbg.arch.debugreg[7] |= 2 << (i * 2); } self.fd .set_guest_debug(&dbg) .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into())) } #[cfg(target_arch = "aarch64")] fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> { self.fd .vcpu_init(kvi) .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into())) } /// /// Sets the value of one register for this vCPU. /// #[cfg(target_arch = "aarch64")] fn set_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> { self.fd .set_one_reg(reg_id, data) .map_err(|e| cpu::HypervisorCpuError::SetRegister(e.into())) } /// /// Gets the value of one register for this vCPU. /// #[cfg(target_arch = "aarch64")] fn get_reg(&self, reg_id: u64) -> cpu::Result { self.fd .get_one_reg(reg_id) .map_err(|e| cpu::HypervisorCpuError::GetRegister(e.into())) } /// /// Gets a list of the guest registers that are supported for the /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls. /// #[cfg(target_arch = "aarch64")] fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> { self.fd .get_reg_list(reg_list) .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into())) } /// /// Save the state of the system registers. /// #[cfg(target_arch = "aarch64")] fn get_sys_regs(&self) -> cpu::Result> { // Call KVM_GET_REG_LIST to get all registers available to the guest. For ArmV8 there are // around 500 registers. let mut state: Vec = Vec::new(); let mut reg_list = RegList::new(500).unwrap(); self.fd .get_reg_list(&mut reg_list) .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?; // At this point reg_list should contain: core registers and system registers. // The register list contains the number of registers and their ids. We will be needing to // call KVM_GET_ONE_REG on each id in order to save all of them. We carve out from the list // the core registers which are represented in the kernel by kvm_regs structure and for which // we can calculate the id based on the offset in the structure. reg_list.retain(|regid| is_system_register(*regid)); // Now, for the rest of the registers left in the previously fetched register list, we are // simply calling KVM_GET_ONE_REG. let indices = reg_list.as_slice(); for index in indices.iter() { state.push(kvm_bindings::kvm_one_reg { id: *index, addr: self .fd .get_one_reg(*index) .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?, }); } Ok(state) } /// /// Restore the state of the system registers. /// #[cfg(target_arch = "aarch64")] fn set_sys_regs(&self, state: &[Register]) -> cpu::Result<()> { for reg in state { self.fd .set_one_reg(reg.id, reg.addr) .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?; } Ok(()) } /// /// Read the MPIDR - Multiprocessor Affinity Register. 
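    /// The affinity fields sit in bits [7:0] (Aff0), [15:8] (Aff1), [23:16] (Aff2)
    /// and [39:32] (Aff3) of the returned value, e.g. `mpidr & 0xff` is Aff0.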
/// #[cfg(target_arch = "aarch64")] fn read_mpidr(&self) -> cpu::Result { self.fd .get_one_reg(MPIDR_EL1) .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into())) } /// /// Configure core registers for a given CPU. /// #[cfg(target_arch = "aarch64")] fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> { #[allow(non_upper_case_globals)] // PSR (Processor State Register) bits. // Taken from arch/arm64/include/uapi/asm/ptrace.h. const PSR_MODE_EL1h: u64 = 0x0000_0005; const PSR_F_BIT: u64 = 0x0000_0040; const PSR_I_BIT: u64 = 0x0000_0080; const PSR_A_BIT: u64 = 0x0000_0100; const PSR_D_BIT: u64 = 0x0000_0200; // Taken from arch/arm64/kvm/inject_fault.c. const PSTATE_FAULT_BITS_64: u64 = PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT; let kreg_off = offset__of!(kvm_regs, regs); // Get the register index of the PSTATE (Processor State) register. let pstate = offset__of!(user_pt_regs, pstate) + kreg_off; self.set_reg( arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate), PSTATE_FAULT_BITS_64, ) .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; // Other vCPUs are powered off initially awaiting PSCI wakeup. if cpu_id == 0 { // Setting the PC (Processor Counter) to the current program address (kernel address). let pc = offset__of!(user_pt_regs, pc) + kreg_off; self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, pc), boot_ip as u64) .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; // Last mandatory thing to set -> the address pointing to the FDT (also called DTB). // "The device tree blob (dtb) must be placed on an 8-byte boundary and must // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt. // We are choosing to place it the end of DRAM. See `get_fdt_addr`. let regs0 = offset__of!(user_pt_regs, regs) + kreg_off; self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0), fdt_start) .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; } Ok(()) } #[cfg(target_arch = "x86_64")] /// /// Get the current CPU state /// /// Ordering requirements: /// /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify /// vCPU/LAPIC state. As such, it must be done before most everything /// else, otherwise we cannot restore everything and expect it to work. /// /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are /// still running. /// /// KVM_GET_LAPIC may change state of LAPIC before returning it. /// /// GET_VCPU_EVENTS should probably be last to save. The code looks as /// it might as well be affected by internal state modifications of the /// GET ioctls. /// /// SREGS saves/restores a pending interrupt, similar to what /// VCPU_EVENTS also does. /// /// GET_MSRS requires a pre-populated data structure to do something /// meaningful. For SET_MSRS it will then contain good data. 
/// /// # Example /// /// ```rust /// # extern crate hypervisor; /// # use hypervisor::KvmHypervisor; /// # use std::sync::Arc; /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap(); /// let hv: Arc = Arc::new(kvm); /// let vm = hv.create_vm().expect("new VM fd creation failed"); /// vm.enable_split_irq().unwrap(); /// let vcpu = vm.create_vcpu(0, None).unwrap(); /// let state = vcpu.state().unwrap(); /// ``` fn state(&self) -> cpu::Result { let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?; let mp_state = self.get_mp_state()?.into(); let regs = self.get_regs()?; let sregs = self.get_sregs()?; let xsave = self.get_xsave()?; let xcrs = self.get_xcrs()?; let lapic_state = self.get_lapic()?; let fpu = self.get_fpu()?; // Try to get all MSRs based on the list previously retrieved from KVM. // If the number of MSRs obtained from GET_MSRS is different from the // expected amount, we fallback onto a slower method by getting MSRs // by chunks. This is the only way to make sure we try to get as many // MSRs as possible, even if some MSRs are not supported. let mut msr_entries = self.msrs.clone(); // Save extra MSRs if the Hyper-V synthetic interrupt controller is // emulated. if self.hyperv_synic.load(Ordering::Acquire) { let hyperv_synic_msrs = vec![ 0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084, 0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096, 0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d, 0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4, 0x400000b5, 0x400000b6, 0x400000b7, ]; for index in hyperv_synic_msrs { let msr = kvm_msr_entry { index, ..Default::default() }; msr_entries.push(msr).unwrap(); } } let expected_num_msrs = msr_entries.as_fam_struct_ref().nmsrs as usize; let num_msrs = self.get_msrs(&mut msr_entries)?; let msrs = if num_msrs != expected_num_msrs { let mut faulty_msr_index = num_msrs; let mut msr_entries_tmp = MsrEntries::from_entries(&msr_entries.as_slice()[..faulty_msr_index]).unwrap(); loop { warn!( "Detected faulty MSR 0x{:x} while getting MSRs", msr_entries.as_slice()[faulty_msr_index].index ); let start_pos = faulty_msr_index + 1; let mut sub_msr_entries = MsrEntries::from_entries(&msr_entries.as_slice()[start_pos..]).unwrap(); let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize; let num_msrs = self.get_msrs(&mut sub_msr_entries)?; for i in 0..num_msrs { msr_entries_tmp .push(sub_msr_entries.as_slice()[i]) .map_err(|e| { cpu::HypervisorCpuError::GetMsrEntries(anyhow!( "Failed adding MSR entries: {:?}", e )) })?; } if num_msrs == expected_num_msrs { break; } faulty_msr_index = start_pos + num_msrs; } msr_entries_tmp } else { msr_entries }; let vcpu_events = self.get_vcpu_events()?; Ok(VcpuKvmState { cpuid, msrs, vcpu_events, regs: regs.into(), sregs: sregs.into(), fpu, lapic_state, xsave, xcrs, mp_state, } .into()) } /// /// Get the current AArch64 CPU state /// #[cfg(target_arch = "aarch64")] fn state(&self) -> cpu::Result { let mut state = VcpuKvmState { mp_state: self.get_mp_state()?.into(), mpidr: self.read_mpidr()?, ..Default::default() }; state.core_regs = self.get_regs()?; state.sys_regs = self.get_sys_regs()?; Ok(state.into()) } #[cfg(target_arch = "x86_64")] /// /// Restore the previously saved CPU state /// /// Ordering requirements: /// /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are /// still running. 
/// /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so /// if we ever change the BSP, we have to do that before restoring anything. /// The same seems to be true for CPUID stuff. /// /// SREGS saves/restores a pending interrupt, similar to what /// VCPU_EVENTS also does. /// /// SET_REGS clears pending exceptions unconditionally, thus, it must be /// done before SET_VCPU_EVENTS, which restores it. /// /// SET_LAPIC must come after SET_SREGS, because the latter restores /// the apic base msr. /// /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR /// only restores successfully, when the LAPIC is correctly configured. /// /// Arguments: CpuState /// # Example /// /// ```rust /// # extern crate hypervisor; /// # use hypervisor::KvmHypervisor; /// # use std::sync::Arc; /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap(); /// let hv: Arc = Arc::new(kvm); /// let vm = hv.create_vm().expect("new VM fd creation failed"); /// vm.enable_split_irq().unwrap(); /// let vcpu = vm.create_vcpu(0, None).unwrap(); /// let state = vcpu.state().unwrap(); /// vcpu.set_state(&state).unwrap(); /// ``` fn set_state(&self, state: &CpuState) -> cpu::Result<()> { let state: VcpuKvmState = state.clone().into(); self.set_cpuid2(&state.cpuid)?; self.set_mp_state(state.mp_state.into())?; self.set_regs(&state.regs.into())?; self.set_sregs(&state.sregs.into())?; self.set_xsave(&state.xsave)?; self.set_xcrs(&state.xcrs)?; self.set_lapic(&state.lapic_state)?; self.set_fpu(&state.fpu)?; // Try to set all MSRs previously stored. // If the number of MSRs set from SET_MSRS is different from the // expected amount, we fallback onto a slower method by setting MSRs // by chunks. This is the only way to make sure we try to set as many // MSRs as possible, even if some MSRs are not supported. 
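        //
        // For example, if SET_MSRS reports only 5 of 10 entries written, entry 5 is
        // treated as faulty: it is logged and skipped, and the loop retries from
        // entry 6, repeating until a whole chunk is accepted.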
let expected_num_msrs = state.msrs.as_fam_struct_ref().nmsrs as usize; let num_msrs = self.set_msrs(&state.msrs)?; if num_msrs != expected_num_msrs { let mut faulty_msr_index = num_msrs; loop { warn!( "Detected faulty MSR 0x{:x} while setting MSRs", state.msrs.as_slice()[faulty_msr_index].index ); let start_pos = faulty_msr_index + 1; let sub_msr_entries = MsrEntries::from_entries(&state.msrs.as_slice()[start_pos..]).unwrap(); let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize; let num_msrs = self.set_msrs(&sub_msr_entries)?; if num_msrs == expected_num_msrs { break; } faulty_msr_index = start_pos + num_msrs; } } self.set_vcpu_events(&state.vcpu_events)?; Ok(()) } /// /// Restore the previously saved AArch64 CPU state /// #[cfg(target_arch = "aarch64")] fn set_state(&self, state: &CpuState) -> cpu::Result<()> { let state: VcpuKvmState = state.clone().into(); self.set_regs(&state.core_regs)?; self.set_sys_regs(&state.sys_regs)?; self.set_mp_state(state.mp_state.into())?; Ok(()) } /// /// Initialize TDX for this CPU /// #[cfg(feature = "tdx")] fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> { tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address) .map_err(cpu::HypervisorCpuError::InitializeTdx) } /// /// Set the "immediate_exit" state /// fn set_immediate_exit(&self, exit: bool) { self.fd.set_kvm_immediate_exit(exit.into()); } /// /// Returns the details about TDX exit reason /// #[cfg(feature = "tdx")] fn get_tdx_exit_details(&mut self) -> cpu::Result { let kvm_run = self.fd.get_kvm_run(); let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall }; tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND; if tdx_vmcall.type_ != 0 { return Err(cpu::HypervisorCpuError::UnknownTdxVmCall); } match tdx_vmcall.subfunction { TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote), TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => { Ok(TdxExitDetails::SetupEventNotifyInterrupt) } _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall), } } /// /// Set the status code for TDX exit /// #[cfg(feature = "tdx")] fn set_tdx_status(&mut self, status: TdxExitStatus) { let kvm_run = self.fd.get_kvm_run(); let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall }; tdx_vmcall.status_code = match status { TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS, TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND, }; } #[cfg(target_arch = "x86_64")] /// /// Return the list of initial MSR entries for a VCPU /// fn boot_msr_entries(&self) -> MsrEntries { use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB}; use kvm_bindings::kvm_msr_entry as MsrEntry; MsrEntries::from_entries(&[ msr!(msr_index::MSR_IA32_SYSENTER_CS), msr!(msr_index::MSR_IA32_SYSENTER_ESP), msr!(msr_index::MSR_IA32_SYSENTER_EIP), msr!(msr_index::MSR_STAR), msr!(msr_index::MSR_CSTAR), msr!(msr_index::MSR_LSTAR), msr!(msr_index::MSR_KERNEL_GS_BASE), msr!(msr_index::MSR_SYSCALL_MASK), msr!(msr_index::MSR_IA32_TSC), msr_data!( msr_index::MSR_IA32_MISC_ENABLE, msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64 ), msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB), ]) .unwrap() } } impl KvmVcpu { #[cfg(target_arch = "x86_64")] /// /// X86 specific call that returns the vcpu's current "xsave struct". /// fn get_xsave(&self) -> cpu::Result { self.fd .get_xsave() .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into())) } #[cfg(target_arch = "x86_64")] /// /// X86 specific call that sets the vcpu's current "xsave struct". 
    ///
    fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
        self.fd
            .set_xsave(xsave)
            .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xcrs".
    ///
    fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
        self.fd
            .get_xcrs()
            .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xcrs".
    ///
    fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
        self.fd
            .set_xcrs(xcrs)
            .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns currently pending exceptions, interrupts, and NMIs as well as related
    /// states of the vcpu.
    ///
    fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
        self.fd
            .get_vcpu_events()
            .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets pending exceptions, interrupts, and NMIs as well as related states
    /// of the vcpu.
    ///
    fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
        self.fd
            .set_vcpu_events(events)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
    }
}

/// Device struct for KVM
pub type KvmDevice = DeviceFd;

impl device::Device for KvmDevice {
    ///
    /// Set device attribute
    ///
    fn set_device_attr(&self, attr: &DeviceAttr) -> device::Result<()> {
        self.set_device_attr(attr)
            .map_err(|e| device::HypervisorDeviceError::SetDeviceAttribute(e.into()))
    }

    ///
    /// Get device attribute
    ///
    fn get_device_attr(&self, attr: &mut DeviceAttr) -> device::Result<()> {
        self.get_device_attr(attr)
            .map_err(|e| device::HypervisorDeviceError::GetDeviceAttribute(e.into()))
    }

    ///
    /// Cast to the underlying KVM device fd
    ///
    fn as_any(&self) -> &dyn Any {
        self
    }
}