// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

extern crate arch;
extern crate devices;
extern crate epoll;
extern crate hypervisor;
extern crate libc;
extern crate linux_loader;
extern crate net_util;
extern crate signal_hook;
#[cfg(feature = "pci_support")]
extern crate vm_allocator;
extern crate vm_memory;

use crate::config::{
    DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig, ValidationError,
    VmConfig, VsockConfig,
};
use crate::cpu;
use crate::device_manager::{self, get_win_size, Console, DeviceManager, DeviceManagerError};
use crate::memory_manager::{Error as MemoryManagerError, MemoryManager};
use crate::migration::{get_vm_snapshot, url_to_path, VM_SNAPSHOT_FILE};
use crate::{
    PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID,
};
use anyhow::anyhow;
#[cfg(target_arch = "x86_64")]
use arch::BootProtocol;
use arch::EntryPoint;
use devices::HotPlugNotificationFlags;
use linux_loader::cmdline::Cmdline;
#[cfg(target_arch = "x86_64")]
use linux_loader::loader::elf::Error::InvalidElfMagicNumber;
#[cfg(target_arch = "x86_64")]
use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
use linux_loader::loader::KernelLoader;
use seccomp::SeccompAction;
use signal_hook::{iterator::Signals, SIGINT, SIGTERM, SIGWINCH};
use std::collections::HashMap;
use std::convert::TryInto;
use std::ffi::CString;
use std::fs::{File, OpenOptions};
use std::io::{self, Write};
use std::io::{Seek, SeekFrom};
use std::num::Wrapping;
use std::ops::Deref;
use std::path::PathBuf;
use std::sync::{Arc, Mutex, RwLock};
use std::{result, str, thread};
use url::Url;
use vm_memory::{Address, Bytes, GuestAddress, GuestAddressSpace, GuestMemoryMmap};
use vm_migration::{
    Migratable, MigratableError, Pausable, Snapshot, SnapshotDataSection, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::terminal::Terminal;

// 64-bit direct boot entry offset for bzImage
#[cfg(target_arch = "x86_64")]
const KERNEL_64BIT_ENTRY_OFFSET: u64 = 0x200;
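// (The offset above comes from the Linux x86 boot protocol: for a bzImage,
// the 64-bit entry point sits 0x200 bytes past the protected-mode kernel
// load address, which is why it is added to `kernel_load` when the entry
// point is built in load_kernel() below.)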
/// Errors associated with VM management
#[derive(Debug)]
pub enum Error {
    /// Cannot open the kernel image
    KernelFile(io::Error),
    /// Cannot open the initramfs image
    InitramfsFile(io::Error),
    /// Cannot load the kernel into memory
    KernelLoad(linux_loader::loader::Error),
    /// Cannot load the initramfs into memory
    InitramfsLoad,
    /// Cannot load the command line into memory
    LoadCmdLine(linux_loader::loader::Error),
    /// Cannot modify the command line
    CmdLineInsertStr(linux_loader::cmdline::Error),
    /// Cannot convert command line into CString
    CmdLineCString(std::ffi::NulError),
    /// Cannot configure system
    ConfigureSystem(arch::Error),
    /// Cannot enable interrupt controller
    EnableInterruptController(device_manager::DeviceManagerError),
    /// Cannot access the VM state (lock poisoned)
    PoisonedState,
    /// Cannot create a device manager.
    DeviceManager(DeviceManagerError),
    /// Write to the console failed.
    Console(vmm_sys_util::errno::Error),
    /// Cannot setup terminal in raw mode.
    SetTerminalRaw(vmm_sys_util::errno::Error),
    /// Cannot setup terminal in canonical mode.
    SetTerminalCanon(vmm_sys_util::errno::Error),
    /// Failed parsing network parameters
    ParseNetworkParameters,
    /// Memory overflow
    MemOverflow,
    /// Failed to allocate the IOAPIC memory range.
    IoapicRangeAllocation,
    /// Cannot spawn a signal handler thread
    SignalHandlerSpawn(io::Error),
    /// Failed to join on vCPU threads
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
    /// VM is not created
    VmNotCreated,
    /// VM is already created
    VmAlreadyCreated,
    /// VM is not running
    VmNotRunning,
    /// Cannot clone EventFd.
    EventFdClone(io::Error),
    /// Invalid VM state transition
    InvalidStateTransition(VmState, VmState),
    /// Error from CPU handling
    CpuManager(cpu::Error),
    /// Cannot pause devices
    PauseDevices(MigratableError),
    /// Cannot resume devices
    ResumeDevices(MigratableError),
    /// Cannot pause CPUs
    PauseCpus(MigratableError),
    /// Cannot resume CPUs
    ResumeCpus(MigratableError),
    /// Cannot pause VM
    Pause(MigratableError),
    /// Cannot resume VM
    Resume(MigratableError),
    /// Memory manager error
    MemoryManager(MemoryManagerError),
    /// No PCI support
    NoPciSupport,
    /// Eventfd write error
    EventfdError(std::io::Error),
    /// Cannot snapshot VM
    Snapshot(MigratableError),
    /// Cannot restore VM
    Restore(MigratableError),
    /// Cannot send VM snapshot
    SnapshotSend(MigratableError),
    /// Cannot convert source URL from Path into &str
    RestoreSourceUrlPathToStr,
    /// Failed to validate config
    ConfigValidation(ValidationError),
    /// No more than one virtio-vsock device is allowed
    TooManyVsockDevices,
    /// Failed serializing into JSON
    SerializeJson(serde_json::Error),
}
pub type Result<T> = result::Result<T, Error>;

#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq)]
pub enum VmState {
    Created,
    Running,
    Shutdown,
    Paused,
}

impl VmState {
    fn valid_transition(self, new_state: VmState) -> Result<()> {
        match self {
            VmState::Created => match new_state {
                VmState::Created | VmState::Shutdown => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Paused => Ok(()),
            },
            VmState::Running => match new_state {
                VmState::Created | VmState::Running => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Paused | VmState::Shutdown => Ok(()),
            },
            VmState::Shutdown => match new_state {
                VmState::Paused | VmState::Created | VmState::Shutdown => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running => Ok(()),
            },
            VmState::Paused => match new_state {
                VmState::Created | VmState::Paused => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Shutdown => Ok(()),
            },
        }
    }
}
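// A quick sketch of how the transition table above behaves (hypothetical
// caller):
//
//     let state = VmState::Created;
//     assert!(state.valid_transition(VmState::Running).is_ok());
//     assert!(state.valid_transition(VmState::Shutdown).is_err());
//
// A freshly created VM must be booted (or paused) before it can be shut
// down; the unit tests at the bottom of this file exercise every pair.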
pub struct Vm {
    kernel: File,
    initramfs: Option<File>,
    threads: Vec<thread::JoinHandle<()>>,
    device_manager: Arc<Mutex<DeviceManager>>,
    config: Arc<Mutex<VmConfig>>,
    on_tty: bool,
    signals: Option<Signals>,
    state: RwLock<VmState>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(not(feature = "kvm"), allow(dead_code))]
    // The hypervisor abstracted virtual machine.
    vm: Arc<dyn hypervisor::Vm>,
    #[cfg(target_arch = "x86_64")]
    saved_clock: Option<hypervisor::ClockData>,
}

impl Vm {
    #[allow(clippy::too_many_arguments)]
    fn new_from_memory_manager(
        config: Arc<Mutex<VmConfig>>,
        memory_manager: Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        vmm_path: PathBuf,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        _saved_clock: Option<hypervisor::ClockData>,
    ) -> Result<Self> {
        config
            .lock()
            .unwrap()
            .validate()
            .map_err(Error::ConfigValidation)?;

        let device_manager = DeviceManager::new(
            vm.clone(),
            config.clone(),
            memory_manager.clone(),
            &exit_evt,
            &reset_evt,
            vmm_path,
            seccomp_action.clone(),
        )
        .map_err(Error::DeviceManager)?;

        let cpu_manager = cpu::CpuManager::new(
            &config.lock().unwrap().cpus.clone(),
            &device_manager,
            &memory_manager,
            vm.clone(),
            reset_evt,
            hypervisor,
        )
        .map_err(Error::CpuManager)?;

        let on_tty = unsafe { libc::isatty(libc::STDIN_FILENO as i32) } != 0;

        let kernel = File::open(&config.lock().unwrap().kernel.as_ref().unwrap().path)
            .map_err(Error::KernelFile)?;

        let initramfs = config
            .lock()
            .unwrap()
            .initramfs
            .as_ref()
            .map(|i| File::open(&i.path))
            .transpose()
            .map_err(Error::InitramfsFile)?;

        Ok(Vm {
            kernel,
            initramfs,
            device_manager,
            config,
            on_tty,
            threads: Vec::with_capacity(1),
            signals: None,
            state: RwLock::new(VmState::Created),
            cpu_manager,
            memory_manager,
            vm,
            #[cfg(target_arch = "x86_64")]
            saved_clock: _saved_clock,
        })
    }

    pub fn new(
        config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        vmm_path: PathBuf,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
    ) -> Result<Self> {
        #[cfg(target_arch = "x86_64")]
        hypervisor.check_required_extensions().unwrap();
        let vm = hypervisor.create_vm().unwrap();
        #[cfg(target_arch = "x86_64")]
        vm.enable_split_irq().unwrap();

        let memory_manager = MemoryManager::new(
            vm.clone(),
            &config.lock().unwrap().memory.clone(),
            None,
            false,
        )
        .map_err(Error::MemoryManager)?;

        #[cfg(target_arch = "x86_64")]
        {
            if let Some(sgx_epc_config) = config.lock().unwrap().sgx_epc.clone() {
                memory_manager
                    .lock()
                    .unwrap()
                    .setup_sgx(sgx_epc_config)
                    .map_err(Error::MemoryManager)?;
            }
        }

        let new_vm = Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            vmm_path,
            seccomp_action,
            hypervisor,
            None,
        )?;

        // The device manager must create the devices from here as it is part
        // of the regular code path creating everything from scratch.
        new_vm
            .device_manager
            .lock()
            .unwrap()
            .create_devices()
            .map_err(Error::DeviceManager)?;

        Ok(new_vm)
    }
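    // The restore path below rebuilds the memory manager from its snapshot
    // before handing off to new_from_memory_manager(); device and vCPU state
    // is reinstated afterwards through Snapshottable::restore(). Unlike
    // Vm::new(), it therefore does not call create_devices() here.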
    #[allow(clippy::too_many_arguments)]
    pub fn new_from_snapshot(
        snapshot: &Snapshot,
        exit_evt: EventFd,
        reset_evt: EventFd,
        vmm_path: PathBuf,
        source_url: &str,
        prefault: bool,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
    ) -> Result<Self> {
        #[cfg(target_arch = "x86_64")]
        hypervisor.check_required_extensions().unwrap();
        let vm = hypervisor.create_vm().unwrap();
        #[cfg(target_arch = "x86_64")]
        vm.enable_split_irq().unwrap();

        let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
        let config = vm_snapshot.config;
        if let Some(state) = vm_snapshot.state {
            vm.set_state(&state)
                .map_err(|e| Error::Restore(MigratableError::Restore(e.into())))?;
        }

        let memory_manager = if let Some(memory_manager_snapshot) =
            snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID)
        {
            MemoryManager::new_from_snapshot(
                memory_manager_snapshot,
                vm.clone(),
                &config.lock().unwrap().memory.clone(),
                source_url,
                prefault,
            )
            .map_err(Error::MemoryManager)?
        } else {
            return Err(Error::Restore(MigratableError::Restore(anyhow!(
                "Missing memory manager snapshot"
            ))));
        };

        Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            vmm_path,
            seccomp_action,
            hypervisor,
            #[cfg(target_arch = "x86_64")]
            vm_snapshot.clock,
            #[cfg(target_arch = "aarch64")]
            None,
        )
    }

    fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
        let mut initramfs = self.initramfs.as_ref().unwrap();
        let size: usize = initramfs
            .seek(SeekFrom::End(0))
            .map_err(|_| Error::InitramfsLoad)?
            .try_into()
            .unwrap();
        initramfs
            .seek(SeekFrom::Start(0))
            .map_err(|_| Error::InitramfsLoad)?;

        let address =
            arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
        let address = GuestAddress(address);

        guest_mem
            .read_from(address, &mut initramfs, size)
            .map_err(|_| Error::InitramfsLoad)?;

        Ok(arch::InitramfsConfig { address, size })
    }

    fn get_cmdline(&mut self) -> Result<CString> {
        let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE);
        cmdline
            .insert_str(self.config.lock().unwrap().cmdline.args.clone())
            .map_err(Error::CmdLineInsertStr)?;
        for entry in self.device_manager.lock().unwrap().cmdline_additions() {
            cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
        }
        Ok(CString::new(cmdline).map_err(Error::CmdLineCString)?)
    }

    #[cfg(target_arch = "aarch64")]
    fn load_kernel(&mut self) -> Result<EntryPoint> {
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let entry_addr = match linux_loader::loader::pe::PE::load(
            mem.deref(),
            Some(GuestAddress(arch::get_kernel_start())),
            &mut self.kernel,
            None,
        ) {
            Ok(entry_addr) => entry_addr,
            Err(e) => {
                return Err(Error::KernelLoad(e));
            }
        };
        let entry_point_addr: GuestAddress = entry_addr.kernel_load;

        Ok(EntryPoint {
            entry_addr: entry_point_addr,
        })
    }
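    // On x86_64 the kernel is first parsed as an ELF (vmlinux); if the magic
    // number check fails, the loader falls back to the bzImage format. The
    // entry point then depends on what the image advertises: a bzImage boots
    // through the 64-bit Linux boot protocol, while an ELF exposing a PVH
    // capability boots through its PVH entry point.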
    #[cfg(target_arch = "x86_64")]
    fn load_kernel(&mut self) -> Result<EntryPoint> {
        let cmdline_cstring = self.get_cmdline()?;
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let entry_addr = match linux_loader::loader::elf::Elf::load(
            mem.deref(),
            None,
            &mut self.kernel,
            Some(arch::layout::HIGH_RAM_START),
        ) {
            Ok(entry_addr) => entry_addr,
            Err(linux_loader::loader::Error::Elf(InvalidElfMagicNumber)) => {
                linux_loader::loader::bzimage::BzImage::load(
                    mem.deref(),
                    None,
                    &mut self.kernel,
                    Some(arch::layout::HIGH_RAM_START),
                )
                .map_err(Error::KernelLoad)?
            }
            Err(e) => {
                return Err(Error::KernelLoad(e));
            }
        };

        linux_loader::loader::load_cmdline(
            mem.deref(),
            arch::layout::CMDLINE_START,
            &cmdline_cstring,
        )
        .map_err(Error::LoadCmdLine)?;

        if entry_addr.setup_header.is_some() {
            let load_addr = entry_addr
                .kernel_load
                .raw_value()
                .checked_add(KERNEL_64BIT_ENTRY_OFFSET)
                .ok_or(Error::MemOverflow)?;

            Ok(EntryPoint {
                entry_addr: GuestAddress(load_addr),
                protocol: BootProtocol::LinuxBoot,
                setup_header: entry_addr.setup_header,
            })
        } else {
            let entry_point_addr: GuestAddress;
            let boot_prot: BootProtocol;

            if let PvhEntryPresent(pvh_entry_addr) = entry_addr.pvh_boot_cap {
                // Use the PVH kernel entry point to boot the guest
                entry_point_addr = pvh_entry_addr;
                boot_prot = BootProtocol::PvhBoot;
            } else {
                // Use the Linux 64-bit boot protocol
                entry_point_addr = entry_addr.kernel_load;
                boot_prot = BootProtocol::LinuxBoot;
            }

            Ok(EntryPoint {
                entry_addr: entry_point_addr,
                protocol: boot_prot,
                setup_header: None,
            })
        }
    }

    #[cfg(target_arch = "x86_64")]
    fn configure_system(&mut self, entry_addr: EntryPoint) -> Result<()> {
        let cmdline_cstring = self.get_cmdline()?;
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(mem.deref())?),
            None => None,
        };
        let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();

        #[allow(unused_mut, unused_assignments)]
        let mut rsdp_addr: Option<GuestAddress> = None;

        #[cfg(feature = "acpi")]
        {
            rsdp_addr = Some(crate::acpi::create_acpi_tables(
                mem.deref(),
                &self.device_manager,
                &self.cpu_manager,
                &self.memory_manager,
            ));
        }

        let sgx_epc_region = self
            .memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .cloned();

        match entry_addr.setup_header {
            Some(hdr) => {
                arch::configure_system(
                    &mem,
                    arch::layout::CMDLINE_START,
                    cmdline_cstring.to_bytes().len() + 1,
                    &initramfs_config,
                    boot_vcpus,
                    Some(hdr),
                    rsdp_addr,
                    BootProtocol::LinuxBoot,
                    sgx_epc_region,
                )
                .map_err(Error::ConfigureSystem)?;
            }
            None => {
                arch::configure_system(
                    &mem,
                    arch::layout::CMDLINE_START,
                    cmdline_cstring.to_bytes().len() + 1,
                    &initramfs_config,
                    boot_vcpus,
                    None,
                    rsdp_addr,
                    entry_addr.protocol,
                    sgx_epc_region,
                )
                .map_err(Error::ConfigureSystem)?;
            }
        }

        Ok(())
    }
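    // The aarch64 variant describes the platform to the guest through a
    // device tree rather than ACPI tables and boot params: it needs the vCPU
    // MPIDRs, the MMIO device layout and, when pci_support is enabled, the
    // PCI window so they can all be encoded in the FDT.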
    #[cfg(target_arch = "aarch64")]
    fn configure_system(&mut self, _entry_addr: EntryPoint) -> Result<()> {
        let cmdline_cstring = self.get_cmdline()?;
        let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(mem.deref())?),
            None => None,
        };

        let device_info = &self
            .device_manager
            .lock()
            .unwrap()
            .get_device_info()
            .clone();

        let pci_space: Option<(u64, u64)> = if cfg!(feature = "pci_support") {
            let pci_space_start: GuestAddress = self
                .memory_manager
                .lock()
                .as_ref()
                .unwrap()
                .start_of_device_area();

            let pci_space_end: GuestAddress = self
                .memory_manager
                .lock()
                .as_ref()
                .unwrap()
                .end_of_device_area();

            let pci_space_size = pci_space_end
                .checked_offset_from(pci_space_start)
                .ok_or(Error::MemOverflow)?
                + 1;

            Some((pci_space_start.0, pci_space_size))
        } else {
            None
        };

        arch::configure_system(
            &self.memory_manager.lock().as_ref().unwrap().vm,
            &mem,
            &cmdline_cstring,
            self.cpu_manager.lock().unwrap().boot_vcpus() as u64,
            vcpu_mpidrs,
            device_info,
            &initramfs_config,
            &pci_space,
        )
        .map_err(Error::ConfigureSystem)?;

        self.device_manager
            .lock()
            .unwrap()
            .enable_interrupt_controller()
            .map_err(Error::EnableInterruptController)?;

        Ok(())
    }

    pub fn shutdown(&mut self) -> Result<()> {
        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        let new_state = VmState::Shutdown;

        state.valid_transition(new_state)?;

        if self.on_tty {
            // Don't forget to set the terminal back to canonical mode
            // before exiting.
            io::stdin()
                .lock()
                .set_canon_mode()
                .map_err(Error::SetTerminalCanon)?;
        }

        // Trigger the termination of the signal_handler thread
        if let Some(signals) = self.signals.take() {
            signals.close();
        }

        // Wake up the DeviceManager threads so they will get terminated cleanly
        self.device_manager
            .lock()
            .unwrap()
            .resume()
            .map_err(Error::Resume)?;

        self.cpu_manager
            .lock()
            .unwrap()
            .shutdown()
            .map_err(Error::CpuManager)?;

        // Wait for all the threads to finish
        for thread in self.threads.drain(..) {
            thread.join().map_err(Error::ThreadCleanup)?
        }

        *state = new_state;
        Ok(())
    }

    pub fn resize(
        &mut self,
        desired_vcpus: Option<u8>,
        desired_memory: Option<u64>,
        desired_ram_w_balloon: Option<u64>,
    ) -> Result<()> {
        if let Some(desired_vcpus) = desired_vcpus {
            if self
                .cpu_manager
                .lock()
                .unwrap()
                .resize(desired_vcpus)
                .map_err(Error::CpuManager)?
            {
                self.device_manager
                    .lock()
                    .unwrap()
                    .notify_hotplug(HotPlugNotificationFlags::CPU_DEVICES_CHANGED)
                    .map_err(Error::DeviceManager)?;
            }
            self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
        }

        if let Some(desired_memory) = desired_memory {
            let new_region = self
                .memory_manager
                .lock()
                .unwrap()
                .resize(desired_memory)
                .map_err(Error::MemoryManager)?;

            if let Some(new_region) = &new_region {
                self.device_manager
                    .lock()
                    .unwrap()
                    .update_memory(&new_region)
                    .map_err(Error::DeviceManager)?;

                // Clone the hotplug method out of the config so the lock is
                // not held while notifying the device manager.
                let hotplug_method = self.config.lock().unwrap().memory.hotplug_method.clone();
                match hotplug_method {
                    HotplugMethod::Acpi => {
                        self.device_manager
                            .lock()
                            .unwrap()
                            .notify_hotplug(HotPlugNotificationFlags::MEMORY_DEVICES_CHANGED)
                            .map_err(Error::DeviceManager)?;
                    }
                    HotplugMethod::VirtioMem => {}
                }
            }

            // We update the VM config regardless of the actual guest resize
            // operation result (happened or not), so that if the VM reboots
            // it will be running with the last configured memory size.
            self.config.lock().unwrap().memory.size = desired_memory;
        }

        if let Some(desired_ram_w_balloon) = desired_ram_w_balloon {
            // Update the configuration value for the balloon size to ensure
            // a reboot would use the right value.
            self.config.lock().unwrap().memory.balloon_size = self
                .memory_manager
                .lock()
                .unwrap()
                .balloon_resize(desired_ram_w_balloon)
                .map_err(Error::MemoryManager)?;
        }

        Ok(())
    }
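    // A minimal usage sketch for the hot-resize API above (hypothetical
    // values): growing a guest to 4 vCPUs and 2 GiB of RAM while leaving the
    // balloon alone would look like
    //
    //     vm.resize(Some(4), Some(2 << 30), None)?;
    //
    // Each `Some` field is applied independently and persisted into the
    // VmConfig so a subsequent reboot keeps the new sizes.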
    // Every add_* helper below follows the same pattern: hot-plug the device
    // through the DeviceManager, persist it into the VmConfig so a reboot
    // re-creates it, then notify the guest via ACPI.
    #[cfg(not(feature = "pci_support"))]
    pub fn add_device(&mut self, mut _device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
        Err(Error::NoPciSupport)
    }

    #[cfg(feature = "pci_support")]
    pub fn add_device(&mut self, mut _device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_device(&mut _device_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is re-created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            if let Some(devices) = config.devices.as_mut() {
                devices.push(_device_cfg);
            } else {
                config.devices = Some(vec![_device_cfg]);
            }
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(HotPlugNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn remove_device(&mut self, _id: String) -> Result<()> {
        if cfg!(feature = "pci_support") {
            #[cfg(feature = "pci_support")]
            {
                self.device_manager
                    .lock()
                    .unwrap()
                    .remove_device(_id.clone())
                    .map_err(Error::DeviceManager)?;

                // Update VmConfig by removing the device. This is important to
                // ensure the device is not re-created in case of a reboot.
                {
                    let mut config = self.config.lock().unwrap();

                    // Remove if VFIO device
                    if let Some(devices) = config.devices.as_mut() {
                        devices.retain(|dev| dev.id.as_ref() != Some(&_id));
                    }

                    // Remove if disk device
                    if let Some(disks) = config.disks.as_mut() {
                        disks.retain(|dev| dev.id.as_ref() != Some(&_id));
                    }

                    // Remove if net device
                    if let Some(net) = config.net.as_mut() {
                        net.retain(|dev| dev.id.as_ref() != Some(&_id));
                    }

                    // Remove if pmem device
                    if let Some(pmem) = config.pmem.as_mut() {
                        pmem.retain(|dev| dev.id.as_ref() != Some(&_id));
                    }

                    // Remove if vsock device
                    if let Some(vsock) = config.vsock.as_ref() {
                        if vsock.id.as_ref() == Some(&_id) {
                            config.vsock = None;
                        }
                    }
                }

                self.device_manager
                    .lock()
                    .unwrap()
                    .notify_hotplug(HotPlugNotificationFlags::PCI_DEVICES_CHANGED)
                    .map_err(Error::DeviceManager)?;
            }
            Ok(())
        } else {
            Err(Error::NoPciSupport)
        }
    }
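    // remove_device() only receives an id, not a device type, so it scrubs
    // the id from every hotpluggable section of the config (VFIO devices,
    // disks, net, pmem and vsock) rather than tracking which kind of device
    // the id belonged to.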
    #[cfg(not(feature = "pci_support"))]
    pub fn add_disk(&mut self, mut _disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
        Err(Error::NoPciSupport)
    }

    #[cfg(feature = "pci_support")]
    pub fn add_disk(&mut self, mut _disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_disk(&mut _disk_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is re-created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            if let Some(disks) = config.disks.as_mut() {
                disks.push(_disk_cfg);
            } else {
                config.disks = Some(vec![_disk_cfg]);
            }
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(HotPlugNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    #[cfg(not(feature = "pci_support"))]
    pub fn add_fs(&mut self, mut _fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
        Err(Error::NoPciSupport)
    }

    #[cfg(feature = "pci_support")]
    pub fn add_fs(&mut self, mut _fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_fs(&mut _fs_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is re-created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            if let Some(fs_config) = config.fs.as_mut() {
                fs_config.push(_fs_cfg);
            } else {
                config.fs = Some(vec![_fs_cfg]);
            }
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(HotPlugNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    #[cfg(not(feature = "pci_support"))]
    pub fn add_pmem(&mut self, mut _pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
        Err(Error::NoPciSupport)
    }

    #[cfg(feature = "pci_support")]
    pub fn add_pmem(&mut self, mut _pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_pmem(&mut _pmem_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is re-created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            if let Some(pmem) = config.pmem.as_mut() {
                pmem.push(_pmem_cfg);
            } else {
                config.pmem = Some(vec![_pmem_cfg]);
            }
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(HotPlugNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    #[cfg(not(feature = "pci_support"))]
    pub fn add_net(&mut self, mut _net_cfg: NetConfig) -> Result<PciDeviceInfo> {
        Err(Error::NoPciSupport)
    }

    #[cfg(feature = "pci_support")]
    pub fn add_net(&mut self, mut _net_cfg: NetConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_net(&mut _net_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is re-created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            if let Some(net) = config.net.as_mut() {
                net.push(_net_cfg);
            } else {
                config.net = Some(vec![_net_cfg]);
            }
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(HotPlugNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    #[cfg(not(feature = "pci_support"))]
    pub fn add_vsock(&mut self, mut _vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
        Err(Error::NoPciSupport)
    }

    #[cfg(feature = "pci_support")]
    pub fn add_vsock(&mut self, mut _vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
        if self.config.lock().unwrap().vsock.is_some() {
            return Err(Error::TooManyVsockDevices);
        }

        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_vsock(&mut _vsock_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is re-created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            config.vsock = Some(_vsock_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(HotPlugNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
        Ok(self.device_manager.lock().unwrap().counters())
    }

    // SIGWINCH propagates terminal resizes to the guest console; SIGTERM and
    // SIGINT restore the host terminal before exiting (exit code 0 for
    // SIGTERM, 1 otherwise).
    fn os_signal_handler(signals: Signals, console_input_clone: Arc<Console>, on_tty: bool) {
        for signal in signals.forever() {
            match signal {
                SIGWINCH => {
                    let (col, row) = get_win_size();
                    console_input_clone.update_console_size(col, row);
                }
                SIGTERM | SIGINT => {
                    if on_tty {
                        io::stdin()
                            .lock()
                            .set_canon_mode()
                            .expect("failed to restore terminal mode");
                    }
                    std::process::exit((signal != SIGTERM) as i32);
                }
                _ => (),
            }
        }
    }
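    // boot() doubles as the resume entry point: booting a paused VM simply
    // resumes it. A cold boot loads the kernel, creates and configures the
    // boot vCPUs, wires up the console signal handling, and only then flips
    // the state to Running.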
    pub fn boot(&mut self) -> Result<()> {
        let current_state = self.get_state()?;
        if current_state == VmState::Paused {
            return self.resume().map_err(Error::Resume);
        }

        let new_state = VmState::Running;
        current_state.valid_transition(new_state)?;

        let entry_point = self.load_kernel()?;

        // Create and configure the boot vCPUs.
        self.cpu_manager
            .lock()
            .unwrap()
            .create_boot_vcpus(entry_point)
            .map_err(Error::CpuManager)?;

        self.configure_system(entry_point)?;

        self.cpu_manager
            .lock()
            .unwrap()
            .start_boot_vcpus()
            .map_err(Error::CpuManager)?;

        if self
            .device_manager
            .lock()
            .unwrap()
            .console()
            .input_enabled()
        {
            let console = self.device_manager.lock().unwrap().console().clone();
            let signals = Signals::new(&[SIGWINCH, SIGINT, SIGTERM]);
            match signals {
                Ok(signals) => {
                    self.signals = Some(signals.clone());

                    let on_tty = self.on_tty;
                    self.threads.push(
                        thread::Builder::new()
                            .name("signal_handler".to_string())
                            .spawn(move || Vm::os_signal_handler(signals, console, on_tty))
                            .map_err(Error::SignalHandlerSpawn)?,
                    );
                }
                Err(e) => error!("Signal not found {}", e),
            }

            if self.on_tty {
                io::stdin()
                    .lock()
                    .set_raw_mode()
                    .map_err(Error::SetTerminalRaw)?;
            }
        }

        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        *state = new_state;

        Ok(())
    }

    pub fn handle_stdin(&self) -> Result<()> {
        let mut out = [0u8; 64];
        let count = io::stdin()
            .lock()
            .read_raw(&mut out)
            .map_err(Error::Console)?;

        if self
            .device_manager
            .lock()
            .unwrap()
            .console()
            .input_enabled()
        {
            self.device_manager
                .lock()
                .unwrap()
                .console()
                .queue_input_bytes(&out[..count])
                .map_err(Error::Console)?;
        }

        Ok(())
    }

    /// Gets a thread-safe reference counted pointer to the VM configuration.
    pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
        Arc::clone(&self.config)
    }

    /// Get the VM state. Returns an error if the state is poisoned.
    pub fn get_state(&self) -> Result<VmState> {
        self.state
            .try_read()
            .map_err(|_| Error::PoisonedState)
            .map(|state| *state)
    }
}

impl Pausable for Vm {
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Paused;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;

        #[cfg(target_arch = "x86_64")]
        {
            let mut clock = self
                .vm
                .get_clock()
                .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
            // Reset clock flags.
            clock.flags = 0;
            self.saved_clock = Some(clock);
        }

        self.cpu_manager.lock().unwrap().pause()?;
        self.device_manager.lock().unwrap().pause()?;

        *state = new_state;
        Ok(())
    }

    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Running;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;

        self.cpu_manager.lock().unwrap().resume()?;

        #[cfg(target_arch = "x86_64")]
        {
            if let Some(clock) = &self.saved_clock {
                self.vm.set_clock(clock).map_err(|e| {
                    MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
                })?;
            }
        }

        self.device_manager.lock().unwrap().resume()?;

        // And we're back to the Running state.
        *state = new_state;
        Ok(())
    }
}
#[derive(Serialize, Deserialize)]
pub struct VmSnapshot {
    pub config: Arc<Mutex<VmConfig>>,
    #[cfg(target_arch = "x86_64")]
    pub clock: Option<hypervisor::ClockData>,
    pub state: Option<hypervisor::VmState>,
}

pub const VM_SNAPSHOT_ID: &str = "vm";

impl Snapshottable for Vm {
    fn id(&self) -> String {
        VM_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&self) -> std::result::Result<Snapshot, MigratableError> {
        let current_state = self.get_state().unwrap();
        if current_state != VmState::Paused {
            return Err(MigratableError::Snapshot(anyhow!(
                "Trying to snapshot while VM is running"
            )));
        }

        let mut vm_snapshot = Snapshot::new(VM_SNAPSHOT_ID);
        let vm_state = self
            .vm
            .state()
            .map_err(|e| MigratableError::Snapshot(e.into()))?;
        let vm_snapshot_data = serde_json::to_vec(&VmSnapshot {
            config: self.get_config(),
            #[cfg(target_arch = "x86_64")]
            clock: self.saved_clock,
            state: Some(vm_state),
        })
        .map_err(|e| MigratableError::Snapshot(e.into()))?;

        vm_snapshot.add_snapshot(self.cpu_manager.lock().unwrap().snapshot()?);
        vm_snapshot.add_snapshot(self.memory_manager.lock().unwrap().snapshot()?);
        vm_snapshot.add_snapshot(self.device_manager.lock().unwrap().snapshot()?);
        vm_snapshot.add_data_section(SnapshotDataSection {
            id: format!("{}-section", VM_SNAPSHOT_ID),
            snapshot: vm_snapshot_data,
        });

        Ok(vm_snapshot)
    }

    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        let current_state = self
            .get_state()
            .map_err(|e| MigratableError::Restore(anyhow!("Could not get VM state: {:#?}", e)))?;
        let new_state = VmState::Paused;
        current_state.valid_transition(new_state).map_err(|e| {
            MigratableError::Restore(anyhow!("Could not restore VM state: {:#?}", e))
        })?;

        if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
            self.memory_manager
                .lock()
                .unwrap()
                .restore(*memory_manager_snapshot.clone())?;
        } else {
            return Err(MigratableError::Restore(anyhow!(
                "Missing memory manager snapshot"
            )));
        }

        if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) {
            self.device_manager
                .lock()
                .unwrap()
                .restore(*device_manager_snapshot.clone())?;
        } else {
            return Err(MigratableError::Restore(anyhow!(
                "Missing device manager snapshot"
            )));
        }

        if let Some(cpu_manager_snapshot) = snapshot.snapshots.get(CPU_MANAGER_SNAPSHOT_ID) {
            self.cpu_manager
                .lock()
                .unwrap()
                .restore(*cpu_manager_snapshot.clone())?;
        } else {
            return Err(MigratableError::Restore(anyhow!(
                "Missing CPU manager snapshot"
            )));
        }

        if self
            .device_manager
            .lock()
            .unwrap()
            .console()
            .input_enabled()
        {
            let console = self.device_manager.lock().unwrap().console().clone();
            let signals = Signals::new(&[SIGWINCH, SIGINT, SIGTERM]);
            match signals {
                Ok(signals) => {
                    self.signals = Some(signals.clone());

                    let on_tty = self.on_tty;
                    self.threads.push(
                        thread::Builder::new()
                            .name("signal_handler".to_string())
                            .spawn(move || Vm::os_signal_handler(signals, console, on_tty))
                            .map_err(|e| {
                                MigratableError::Restore(anyhow!(
                                    "Could not start console signal thread: {:#?}",
                                    e
                                ))
                            })?,
                    );
                }
                Err(e) => error!("Signal not found {}", e),
            }

            if self.on_tty {
                io::stdin().lock().set_raw_mode().map_err(|e| {
                    MigratableError::Restore(anyhow!(
                        "Could not set terminal in raw mode: {:#?}",
                        e
                    ))
                })?;
            }
        }

        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Restore(anyhow!("Could not set VM state: {:#?}", e)))?;
        *state = new_state;

        Ok(())
    }
}
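// send() currently only understands "file" URLs: the top-level snapshot is
// serialized as JSON into VM_SNAPSHOT_FILE under the destination directory,
// and the memory manager is then asked to transport its own snapshot (the
// guest RAM content) to the same URL.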
impl Transportable for Vm {
    fn send(
        &self,
        snapshot: &Snapshot,
        destination_url: &str,
    ) -> std::result::Result<(), MigratableError> {
        let url = Url::parse(destination_url).map_err(|e| {
            MigratableError::MigrateSend(anyhow!("Could not parse destination URL: {}", e))
        })?;

        match url.scheme() {
            "file" => {
                let mut vm_snapshot_path = url_to_path(&url)?;
                vm_snapshot_path.push(VM_SNAPSHOT_FILE);

                // Create the snapshot file
                let mut vm_snapshot_file = OpenOptions::new()
                    .read(true)
                    .write(true)
                    .create_new(true)
                    .open(vm_snapshot_path)
                    .map_err(|e| MigratableError::MigrateSend(e.into()))?;

                // Serialize and write the snapshot
                let vm_snapshot = serde_json::to_vec(snapshot)
                    .map_err(|e| MigratableError::MigrateSend(e.into()))?;

                vm_snapshot_file
                    .write(&vm_snapshot)
                    .map_err(|e| MigratableError::MigrateSend(e.into()))?;

                // Tell the memory manager to also send/write its own snapshot.
                if let Some(memory_manager_snapshot) =
                    snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID)
                {
                    self.memory_manager
                        .lock()
                        .unwrap()
                        .send(&*memory_manager_snapshot.clone(), destination_url)?;
                } else {
                    return Err(MigratableError::MigrateSend(anyhow!(
                        "Missing memory manager snapshot"
                    )));
                }
            }
            _ => {
                return Err(MigratableError::MigrateSend(anyhow!(
                    "Unsupported VM transport URL scheme: {}",
                    url.scheme()
                )))
            }
        }
        Ok(())
    }
}

impl Migratable for Vm {}

#[cfg(target_arch = "x86_64")]
#[cfg(test)]
mod tests {
    use super::*;

    fn test_vm_state_transitions(state: VmState) {
        match state {
            VmState::Created => {
                // Check the transitions from Created
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_ok());
            }
            VmState::Running => {
                // Check the transitions from Running
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_err());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
            }
            VmState::Shutdown => {
                // Check the transitions from Shutdown
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
            }
            VmState::Paused => {
                // Check the transitions from Paused
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_err());
            }
        }
    }

    #[test]
    fn test_vm_created_transitions() {
        test_vm_state_transitions(VmState::Created);
    }

    #[test]
    fn test_vm_running_transitions() {
        test_vm_state_transitions(VmState::Running);
    }

    #[test]
    fn test_vm_shutdown_transitions() {
        test_vm_state_transitions(VmState::Shutdown);
    }

    #[test]
    fn test_vm_paused_transitions() {
        test_vm_state_transitions(VmState::Paused);
    }
}
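// The aarch64 tests below build a minimal flattened device tree containing a
// serial port, a virtio-mmio transport and an RTC, which is enough to
// exercise the FDT generation path end to end.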
#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use super::*;
    use arch::aarch64::fdt::create_fdt;
    use arch::aarch64::gic::kvm::create_gic;
    use arch::aarch64::{layout, DeviceInfoForFDT};
    use arch::DeviceType;
    use vm_memory::{GuestAddress, GuestMemoryMmap};

    const LEN: u64 = 4096;

    #[derive(Clone, Debug)]
    pub struct MMIODeviceInfo {
        addr: u64,
        irq: u32,
    }

    impl DeviceInfoForFDT for MMIODeviceInfo {
        fn addr(&self) -> u64 {
            self.addr
        }
        fn irq(&self) -> u32 {
            self.irq
        }
        fn length(&self) -> u64 {
            LEN
        }
    }

    #[test]
    fn test_create_fdt_with_devices() {
        let mut regions = Vec::new();
        regions.push((
            GuestAddress(layout::RAM_64BIT_START),
            (layout::FDT_MAX_SIZE + 0x1000) as usize,
        ));
        let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");

        let dev_info: HashMap<(DeviceType, std::string::String), MMIODeviceInfo> = [
            (
                (DeviceType::Serial, DeviceType::Serial.to_string()),
                MMIODeviceInfo { addr: 0x00, irq: 1 },
            ),
            (
                (DeviceType::Virtio(1), "virtio".to_string()),
                MMIODeviceInfo {
                    addr: 0x00 + LEN,
                    irq: 2,
                },
            ),
            (
                (DeviceType::RTC, "rtc".to_string()),
                MMIODeviceInfo {
                    addr: 0x00 + 2 * LEN,
                    irq: 3,
                },
            ),
        ]
        .iter()
        .cloned()
        .collect();

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let gic = create_gic(&vm, 1, false).unwrap();
        assert!(create_fdt(
            &mem,
            &CString::new("console=tty0").unwrap(),
            vec![0],
            &dev_info,
            &*gic,
            &None,
            &None,
        )
        .is_ok())
    }
}

#[cfg(target_arch = "x86_64")]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{GuestMemory, GuestMemoryRegion};
    // This example is based on https://lwn.net/Articles/658511/
    let code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee, /* out %al, (%dx) */
        0xf4, /* hlt */
    ];

    let mem_size = 0x1000;
    let load_addr = GuestAddress(0x1000);
    let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    mem.with_regions(|index, region| {
        let mem_region = vm.make_user_memory_region(
            index as u32,
            region.start_addr().raw_value(),
            region.len() as u64,
            region.as_ptr() as u64,
            false,
        );

        vm.set_user_memory_region(mem_region)
    })
    .expect("Cannot configure guest memory");
    mem.write_slice(&code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0).expect("new Vcpu failed");

    let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
    vcpu_sregs.cs.base = 0;
    vcpu_sregs.cs.selector = 0;
    vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");

    let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
    vcpu_regs.rip = 0x1000;
    vcpu_regs.rax = 2;
    vcpu_regs.rbx = 3;
    vcpu_regs.rflags = 2;
    vcpu.set_regs(&vcpu_regs).expect("set regs failed");

    loop {
        match vcpu.run().expect("run failed") {
            VmExit::IoOut(addr, data) => {
                println!(
                    "IO out -- addr: {:#x} data [{:?}]",
                    addr,
                    str::from_utf8(&data).unwrap()
                );
            }
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            r => panic!("unexpected exit reason: {:?}", r),
        }
    }
}