// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. // // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE-BSD-3-Clause file. // // Copyright © 2019 Intel Corporation // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause // use crate::config::ConsoleOutputMode; use crate::vm::VmInfo; use devices::ioapic; use kvm_bindings::kvm_userspace_memory_region; use kvm_ioctls::*; use libc::O_TMPFILE; use libc::{EFD_NONBLOCK, TIOCGWINSZ}; use net_util::Tap; #[cfg(feature = "pci_support")] use pci::{ DeviceRelocation, InterruptDelivery, InterruptParameters, PciBarRegionType, PciBus, PciConfigIo, PciConfigMmio, PciDevice, PciInterruptPin, PciRoot, }; use qcow::{self, ImageType, QcowFile}; use std::fs::{File, OpenOptions}; use std::io::{self, sink, stdout}; use arch::layout::{APIC_START, IOAPIC_SIZE, IOAPIC_START}; use std::os::unix::fs::OpenOptionsExt; use std::os::unix::io::AsRawFd; use std::ptr::null_mut; use std::result; #[cfg(feature = "pci_support")] use std::sync::Weak; use std::sync::{Arc, Mutex, RwLock}; #[cfg(feature = "pci_support")] use vfio::{VfioDevice, VfioDmaMapping, VfioPciDevice, VfioPciError}; use vm_allocator::SystemAllocator; use vm_memory::GuestAddress; use vm_memory::{Address, GuestMemoryMmap, GuestUsize}; #[cfg(feature = "pci_support")] use vm_virtio::transport::VirtioPciDevice; use vm_virtio::transport::VirtioTransport; use vm_virtio::vhost_user::VhostUserConfig; #[cfg(feature = "pci_support")] use vm_virtio::{DmaRemapping, IommuMapping, VirtioIommuRemapping}; use vm_virtio::{VirtioSharedMemory, VirtioSharedMemoryList}; use vmm_sys_util::eventfd::EventFd; #[cfg(feature = "mmio_support")] const MMIO_LEN: u64 = 0x1000; /// Errors associated with device manager #[derive(Debug)] pub enum DeviceManagerError { /// Cannot create EventFd. EventFd(io::Error), /// Cannot open disk path Disk(io::Error), /// Cannot create vhost-user-net device CreateVhostUserNet(vm_virtio::vhost_user::Error), /// Cannot create virtio-blk device CreateVirtioBlock(io::Error), /// Cannot create virtio-net device CreateVirtioNet(vm_virtio::net::Error), /// Cannot create virtio-console device CreateVirtioConsole(io::Error), /// Cannot create virtio-rng device CreateVirtioRng(io::Error), /// Cannot create virtio-fs device CreateVirtioFs(vm_virtio::vhost_user::Error), /// Cannot create vhost-user-blk device CreateVhostUserBlk(vm_virtio::vhost_user::Error), /// Cannot create virtio-pmem device CreateVirtioPmem(io::Error), /// Cannot create virtio-vsock device CreateVirtioVsock(io::Error), /// Failed converting Path to &str for the virtio-vsock device. CreateVsockConvertPath, /// Cannot create virtio-vsock backend CreateVsockBackend(vm_virtio::vsock::VsockUnixError), /// Cannot create virtio-iommu device CreateVirtioIommu(io::Error), /// Failed parsing disk image format DetectImageType(qcow::Error), /// Cannot open qcow disk path QcowDeviceCreate(qcow::Error), /// Cannot open tap interface OpenTap(net_util::TapError), /// Cannot allocate IRQ. AllocateIrq, /// Cannot configure the IRQ. Irq(kvm_ioctls::Error), /// Cannot allocate PCI BARs #[cfg(feature = "pci_support")] AllocateBars(pci::PciDeviceError), /// Cannot register ioevent. RegisterIoevent(kvm_ioctls::Error), /// Cannot create virtio device VirtioDevice(vmm_sys_util::errno::Error), /// Cannot add PCI device #[cfg(feature = "pci_support")] AddPciDevice(pci::PciRootError), /// Cannot open persistent memory file PmemFileOpen(io::Error), /// Cannot set persistent memory file size PmemFileSetLen(io::Error), /// Cannot find a memory range for persistent memory PmemRangeAllocation, /// Cannot find a memory range for virtio-fs FsRangeAllocation, /// Error creating serial output file SerialOutputFileOpen(io::Error), /// Error creating console output file ConsoleOutputFileOpen(io::Error), /// Cannot create a VFIO device #[cfg(feature = "pci_support")] VfioCreate(vfio::VfioError), /// Cannot create a VFIO PCI device #[cfg(feature = "pci_support")] VfioPciCreate(vfio::VfioPciError), /// Failed to map VFIO MMIO region. #[cfg(feature = "pci_support")] VfioMapRegion(VfioPciError), /// Failed to create the KVM device. CreateKvmDevice(kvm_ioctls::Error), /// Failed to memory map. Mmap(io::Error), /// Cannot add legacy device to Bus. BusError(devices::BusError), /// Failed to allocate IO port AllocateIOPort, } pub type DeviceManagerResult = result::Result; struct InterruptInfo<'a> { _msi_capable: bool, ioapic: &'a Option>>, } struct KernelIoapicIrq { evt: EventFd, } impl KernelIoapicIrq { fn new(evt: EventFd) -> Self { KernelIoapicIrq { evt } } } impl devices::Interrupt for KernelIoapicIrq { fn deliver(&self) -> result::Result<(), io::Error> { self.evt.write(1) } } struct UserIoapicIrq { ioapic: Arc>, irq: usize, } impl UserIoapicIrq { fn new(ioapic: Arc>, irq: usize) -> Self { UserIoapicIrq { ioapic, irq } } } impl devices::Interrupt for UserIoapicIrq { fn deliver(&self) -> result::Result<(), io::Error> { self.ioapic .lock() .unwrap() .service_irq(self.irq) .map_err(|e| { std::io::Error::new( std::io::ErrorKind::Other, format!("failed to inject IRQ #{}: {:?}", self.irq, e), ) }) } } pub fn get_win_size() -> (u16, u16) { #[repr(C)] struct WS { rows: u16, cols: u16, }; let ws: WS = WS { rows: 0u16, cols: 0u16, }; unsafe { libc::ioctl(0, TIOCGWINSZ, &ws); } (ws.cols, ws.rows) } pub struct Console { // Serial port on 0x3f8 serial: Option>>, console_input: Option>, input_enabled: bool, } impl Console { pub fn queue_input_bytes(&self, out: &[u8]) -> vmm_sys_util::errno::Result<()> { if self.serial.is_some() { self.serial .as_ref() .unwrap() .lock() .expect("Failed to process stdin event due to poisoned lock") .queue_input_bytes(out)?; } if self.console_input.is_some() { self.console_input.as_ref().unwrap().queue_input_bytes(out); } Ok(()) } pub fn update_console_size(&self, cols: u16, rows: u16) { if self.console_input.is_some() { self.console_input .as_ref() .unwrap() .update_console_size(cols, rows) } } pub fn input_enabled(&self) -> bool { self.input_enabled } } struct AddressManager { allocator: Arc>, io_bus: Arc, mmio_bus: Arc, #[cfg(feature = "pci_support")] vm_fd: Arc, } #[cfg(feature = "pci_support")] impl DeviceRelocation for AddressManager { fn move_bar( &self, old_base: u64, new_base: u64, len: u64, pci_dev: &mut dyn PciDevice, region_type: PciBarRegionType, ) -> std::result::Result<(), std::io::Error> { match region_type { PciBarRegionType::IORegion => { // Update system allocator self.allocator .lock() .unwrap() .free_io_addresses(GuestAddress(old_base), len as GuestUsize); self.allocator .lock() .unwrap() .allocate_io_addresses(Some(GuestAddress(new_base)), len as GuestUsize, None) .ok_or_else(|| { io::Error::new(io::ErrorKind::Other, "failed allocating new IO range") })?; // Update PIO bus self.io_bus .update_range(old_base, len, new_base, len) .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; } PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { // Update system allocator self.allocator .lock() .unwrap() .free_mmio_addresses(GuestAddress(old_base), len as GuestUsize); if region_type == PciBarRegionType::Memory32BitRegion { self.allocator .lock() .unwrap() .allocate_mmio_hole_addresses( Some(GuestAddress(new_base)), len as GuestUsize, None, ) .ok_or_else(|| { io::Error::new( io::ErrorKind::Other, "failed allocating new 32 bits MMIO range", ) })?; } else { self.allocator .lock() .unwrap() .allocate_mmio_addresses( Some(GuestAddress(new_base)), len as GuestUsize, None, ) .ok_or_else(|| { io::Error::new( io::ErrorKind::Other, "failed allocating new 64 bits MMIO range", ) })?; } // Update MMIO bus self.mmio_bus .update_range(old_base, len, new_base, len) .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; } } let any_dev = pci_dev.as_any(); if let Some(virtio_pci_dev) = any_dev.downcast_ref::() { let bar_addr = virtio_pci_dev.config_bar_addr(); if bar_addr == new_base { for (event, addr) in virtio_pci_dev.ioeventfds(old_base) { let io_addr = IoEventAddress::Mmio(addr); self.vm_fd .unregister_ioevent(event, &io_addr) .map_err(|e| io::Error::from_raw_os_error(e.errno()))?; } for (event, addr) in virtio_pci_dev.ioeventfds(new_base) { let io_addr = IoEventAddress::Mmio(addr); self.vm_fd .register_ioevent(event, &io_addr, NoDatamatch) .map_err(|e| io::Error::from_raw_os_error(e.errno()))?; } } } pci_dev.move_bar(old_base, new_base) } } pub struct DeviceManager { // Manage address space related to devices address_manager: Arc, // Console abstraction console: Arc, // IOAPIC ioapic: Option>>, // mmap()ed region to unmap on drop mmap_regions: Vec<(*mut libc::c_void, usize)>, // Things to be added to the commandline (i.e. for virtio-mmio) cmdline_additions: Vec, // Virtual IOMMU ID along with the list of device IDs attached to the // virtual IOMMU. This is useful for filling the ACPI IORT table. virt_iommu: Option<(u32, Vec)>, } impl DeviceManager { pub fn new( vm_info: &VmInfo, allocator: SystemAllocator, _msi_capable: bool, userspace_ioapic: bool, mut mem_slots: u32, _exit_evt: &EventFd, reset_evt: &EventFd, ) -> DeviceManagerResult { let io_bus = devices::Bus::new(); let mmio_bus = devices::Bus::new(); let mut virtio_devices: Vec<(Box, bool)> = Vec::new(); let mut mmap_regions = Vec::new(); #[allow(unused_mut)] let mut cmdline_additions = Vec::new(); #[allow(unused_mut)] let mut virt_iommu: Option<(u32, Vec)> = None; let address_manager = Arc::new(AddressManager { allocator: Arc::new(Mutex::new(allocator)), io_bus: Arc::new(io_bus), mmio_bus: Arc::new(mmio_bus), #[cfg(feature = "pci_support")] vm_fd: vm_info.vm_fd.clone(), }); let ioapic = DeviceManager::add_ioapic(vm_info, &address_manager, userspace_ioapic)?; let interrupt_info = InterruptInfo { _msi_capable, ioapic: &ioapic, }; let console = DeviceManager::add_console_device( vm_info, &address_manager, &ioapic, &mut virtio_devices, )?; virtio_devices.append(&mut DeviceManager::make_virtio_devices( vm_info, &address_manager, &mut mem_slots, &mut mmap_regions, )?); DeviceManager::add_legacy_devices( vm_info, &address_manager, reset_evt.try_clone().map_err(DeviceManagerError::EventFd)?, )?; DeviceManager::add_acpi_device( &address_manager, reset_evt.try_clone().map_err(DeviceManagerError::EventFd)?, _exit_evt.try_clone().map_err(DeviceManagerError::EventFd)?, )?; if cfg!(feature = "pci_support") { DeviceManager::add_pci_devices( vm_info, &address_manager, mem_slots, &mut virt_iommu, virtio_devices, &interrupt_info, )?; } else if cfg!(feature = "mmio_support") { DeviceManager::add_mmio_devices( vm_info, &address_manager, virtio_devices, &interrupt_info, &mut cmdline_additions, )?; } Ok(DeviceManager { address_manager, console, ioapic, mmap_regions, cmdline_additions, virt_iommu, }) } #[allow(unused_variables)] fn add_pci_devices( vm_info: &VmInfo, address_manager: &Arc, mem_slots: u32, virt_iommu: &mut Option<(u32, Vec)>, virtio_devices: Vec<(Box, bool)>, interrupt_info: &InterruptInfo, ) -> DeviceManagerResult<()> { #[cfg(feature = "pci_support")] { let pci_root = PciRoot::new(None); let mut pci_bus = PciBus::new( pci_root, Arc::downgrade(&address_manager) as Weak, ); let (mut iommu_device, iommu_mapping) = if vm_info.vm_cfg.iommu { let (device, mapping) = vm_virtio::Iommu::new().map_err(DeviceManagerError::CreateVirtioIommu)?; (Some(device), Some(mapping)) } else { (None, None) }; let mut iommu_attached_devices = Vec::new(); for (device, iommu_attached) in virtio_devices { let mapping: &Option> = if iommu_attached { &iommu_mapping } else { &None }; let virtio_iommu_attach_dev = DeviceManager::add_virtio_pci_device( device, vm_info.memory, &address_manager, vm_info.vm_fd, &mut pci_bus, &interrupt_info, mapping, )?; if let Some(dev_id) = virtio_iommu_attach_dev { iommu_attached_devices.push(dev_id); } } let mut vfio_iommu_device_ids = DeviceManager::add_vfio_devices( vm_info, &address_manager, &mut pci_bus, mem_slots, &mut iommu_device, )?; iommu_attached_devices.append(&mut vfio_iommu_device_ids); if let Some(iommu_device) = iommu_device { // We need to shift the device id since the 3 first bits // are dedicated to the PCI function, and we know we don't // do multifunction. Also, because we only support one PCI // bus, the bus 0, we don't need to add anything to the // global device ID. let iommu_id = pci_bus.next_device_id() << 3; // Because we determined the virtio-iommu b/d/f, we have to // add the device to the PCI topology now. Otherwise, the // b/d/f won't match the virtio-iommu device as expected. DeviceManager::add_virtio_pci_device( Box::new(iommu_device), vm_info.memory, &address_manager, vm_info.vm_fd, &mut pci_bus, &interrupt_info, &None, )?; *virt_iommu = Some((iommu_id, iommu_attached_devices)); } let pci_bus = Arc::new(Mutex::new(pci_bus)); let pci_config_io = Arc::new(Mutex::new(PciConfigIo::new(pci_bus.clone()))); address_manager .io_bus .insert(pci_config_io, 0xcf8, 0x8) .map_err(DeviceManagerError::BusError)?; let pci_config_mmio = Arc::new(Mutex::new(PciConfigMmio::new(pci_bus))); address_manager .mmio_bus .insert( pci_config_mmio, arch::layout::PCI_MMCONFIG_START.0, arch::layout::PCI_MMCONFIG_SIZE, ) .map_err(DeviceManagerError::BusError)?; } Ok(()) } #[allow(unused_variables, unused_mut)] fn add_mmio_devices( vm_info: &VmInfo, address_manager: &Arc, virtio_devices: Vec<(Box, bool)>, interrupt_info: &InterruptInfo, mut cmdline_additions: &mut Vec, ) -> DeviceManagerResult<()> { #[cfg(feature = "mmio_support")] { for (device, _) in virtio_devices { let mmio_addr = address_manager .allocator .lock() .unwrap() .allocate_mmio_addresses(None, MMIO_LEN, Some(MMIO_LEN)); if let Some(addr) = mmio_addr { DeviceManager::add_virtio_mmio_device( device, vm_info.memory, &address_manager, vm_info.vm_fd, &interrupt_info, addr, &mut cmdline_additions, )?; } else { error!("Unable to allocate MMIO address!"); } } } Ok(()) } fn add_ioapic( vm_info: &VmInfo, address_manager: &Arc, userspace_ioapic: bool, ) -> DeviceManagerResult>>> { let ioapic = if userspace_ioapic { // Create IOAPIC let ioapic = Arc::new(Mutex::new(ioapic::Ioapic::new( vm_info.vm_fd.clone(), APIC_START, ))); address_manager .mmio_bus .insert(ioapic.clone(), IOAPIC_START.0, IOAPIC_SIZE) .map_err(DeviceManagerError::BusError)?; Some(ioapic) } else { None }; Ok(ioapic) } #[allow(unused_variables)] fn add_acpi_device( address_manager: &Arc, reset_evt: EventFd, exit_evt: EventFd, ) -> DeviceManagerResult<()> { #[cfg(feature = "acpi")] { let acpi_device = Arc::new(Mutex::new(devices::AcpiShutdownDevice::new( exit_evt, reset_evt, ))); address_manager .io_bus .insert(acpi_device.clone(), 0x3c0, 0x4) .map_err(DeviceManagerError::BusError)?; } Ok(()) } fn add_legacy_devices( _vm_info: &VmInfo, address_manager: &Arc, reset_evt: EventFd, ) -> DeviceManagerResult<()> { // Add a shutdown device (i8042) let i8042 = Arc::new(Mutex::new(devices::legacy::I8042Device::new(reset_evt))); address_manager .io_bus .insert(i8042.clone(), 0x61, 0x4) .map_err(DeviceManagerError::BusError)?; #[cfg(feature = "cmos")] { // Add a CMOS emulated device use vm_memory::GuestMemory; let mem_size = _vm_info.memory.as_ref().read().unwrap().end_addr().0 + 1; let mem_below_4g = std::cmp::min(arch::layout::MEM_32BIT_RESERVED_START.0, mem_size); let mem_above_4g = mem_size.saturating_sub(arch::layout::RAM_64BIT_START.0); let cmos = Arc::new(Mutex::new(devices::legacy::Cmos::new( mem_below_4g, mem_above_4g, ))); address_manager .io_bus .insert(cmos.clone(), 0x70, 0x2) .map_err(DeviceManagerError::BusError)?; } Ok(()) } fn add_console_device( vm_info: &VmInfo, address_manager: &Arc, ioapic: &Option>>, virtio_devices: &mut Vec<(Box, bool)>, ) -> DeviceManagerResult> { let serial_writer: Option> = match vm_info.vm_cfg.serial.mode { ConsoleOutputMode::File => Some(Box::new( File::create(vm_info.vm_cfg.serial.file.as_ref().unwrap()) .map_err(DeviceManagerError::SerialOutputFileOpen)?, )), ConsoleOutputMode::Tty => Some(Box::new(stdout())), ConsoleOutputMode::Off | ConsoleOutputMode::Null => None, }; let serial = if vm_info.vm_cfg.serial.mode != ConsoleOutputMode::Off { // Serial is tied to IRQ #4 let serial_irq = 4; let interrupt: Box = if let Some(ioapic) = &ioapic { Box::new(UserIoapicIrq::new(ioapic.clone(), serial_irq)) } else { let serial_evt = EventFd::new(EFD_NONBLOCK).map_err(DeviceManagerError::EventFd)?; vm_info .vm_fd .register_irqfd(&serial_evt, serial_irq as u32) .map_err(DeviceManagerError::Irq)?; Box::new(KernelIoapicIrq::new(serial_evt)) }; let serial = Arc::new(Mutex::new(devices::legacy::Serial::new( interrupt, serial_writer, ))); address_manager .allocator .lock() .unwrap() .allocate_io_addresses(Some(GuestAddress(0x3f8)), 0x8, None) .ok_or(DeviceManagerError::AllocateIOPort)?; address_manager .io_bus .insert(serial.clone(), 0x3f8, 0x8) .map_err(DeviceManagerError::BusError)?; Some(serial) } else { None }; // Create serial and virtio-console let console_writer: Option> = match vm_info.vm_cfg.console.mode { ConsoleOutputMode::File => Some(Box::new( File::create(vm_info.vm_cfg.console.file.as_ref().unwrap()) .map_err(DeviceManagerError::ConsoleOutputFileOpen)?, )), ConsoleOutputMode::Tty => Some(Box::new(stdout())), ConsoleOutputMode::Null => Some(Box::new(sink())), ConsoleOutputMode::Off => None, }; let (col, row) = get_win_size(); let console_input = if let Some(writer) = console_writer { let (virtio_console_device, console_input) = vm_virtio::Console::new(writer, col, row, vm_info.vm_cfg.console.iommu) .map_err(DeviceManagerError::CreateVirtioConsole)?; virtio_devices.push(( Box::new(virtio_console_device) as Box, false, )); Some(console_input) } else { None }; Ok(Arc::new(Console { serial, console_input, input_enabled: vm_info.vm_cfg.serial.mode.input_enabled() || vm_info.vm_cfg.console.mode.input_enabled(), })) } fn make_virtio_devices( vm_info: &VmInfo, address_manager: &Arc, mut mem_slots: &mut u32, mmap_regions: &mut Vec<(*mut libc::c_void, usize)>, ) -> DeviceManagerResult, bool)>> { let mut allocator = address_manager.allocator.lock().unwrap(); let mut devices: Vec<(Box, bool)> = Vec::new(); // Create "standard" virtio devices (net/block/rng) devices.append(&mut DeviceManager::make_virtio_block_devices(vm_info)?); devices.append(&mut DeviceManager::make_virtio_net_devices(vm_info)?); devices.append(&mut DeviceManager::make_virtio_rng_devices(vm_info)?); // Add virtio-fs if required devices.append(&mut DeviceManager::make_virtio_fs_devices( vm_info, &mut allocator, &mut mem_slots, mmap_regions, )?); // Add virtio-pmem if required devices.append(&mut DeviceManager::make_virtio_pmem_devices( vm_info, &mut allocator, &mut mem_slots, mmap_regions, )?); // Add virtio-vhost-user-net if required devices.append(&mut DeviceManager::make_virtio_vhost_user_net_devices( vm_info, )?); // Add virtio-vhost-user-blk if required devices.append(&mut DeviceManager::make_virtio_vhost_user_blk_devices( vm_info, )?); // Add virtio-vsock if required devices.append(&mut DeviceManager::make_virtio_vsock_devices(vm_info)?); Ok(devices) } fn make_virtio_block_devices( vm_info: &VmInfo, ) -> DeviceManagerResult, bool)>> { let mut devices = Vec::new(); if let Some(disk_list_cfg) = &vm_info.vm_cfg.disks { for disk_cfg in disk_list_cfg.iter() { // Open block device path let raw_img: File = OpenOptions::new() .read(true) .write(true) .open(&disk_cfg.path) .map_err(DeviceManagerError::Disk)?; let image_type = qcow::detect_image_type(&raw_img) .map_err(DeviceManagerError::DetectImageType)?; let block = match image_type { ImageType::Raw => { let raw_img = vm_virtio::RawFile::new(raw_img); let dev = vm_virtio::Block::new( raw_img, disk_cfg.path.clone(), false, disk_cfg.iommu, ) .map_err(DeviceManagerError::CreateVirtioBlock)?; Box::new(dev) as Box } ImageType::Qcow2 => { let qcow_img = QcowFile::from(raw_img) .map_err(DeviceManagerError::QcowDeviceCreate)?; let dev = vm_virtio::Block::new( qcow_img, disk_cfg.path.clone(), false, disk_cfg.iommu, ) .map_err(DeviceManagerError::CreateVirtioBlock)?; Box::new(dev) as Box } }; devices.push((block, disk_cfg.iommu)); } } Ok(devices) } fn make_virtio_net_devices( vm_info: &VmInfo, ) -> DeviceManagerResult, bool)>> { let mut devices = Vec::new(); // Add virtio-net if required if let Some(net_list_cfg) = &vm_info.vm_cfg.net { for net_cfg in net_list_cfg.iter() { let virtio_net_device = if let Some(ref tap_if_name) = net_cfg.tap { let tap = Tap::open_named(tap_if_name).map_err(DeviceManagerError::OpenTap)?; vm_virtio::Net::new_with_tap(tap, Some(&net_cfg.mac), net_cfg.iommu) .map_err(DeviceManagerError::CreateVirtioNet)? } else { vm_virtio::Net::new(net_cfg.ip, net_cfg.mask, Some(&net_cfg.mac), net_cfg.iommu) .map_err(DeviceManagerError::CreateVirtioNet)? }; devices.push(( Box::new(virtio_net_device) as Box, net_cfg.iommu, )); } } Ok(devices) } fn make_virtio_rng_devices( vm_info: &VmInfo, ) -> DeviceManagerResult, bool)>> { let mut devices = Vec::new(); // Add virtio-rng if required if let Some(rng_path) = vm_info.vm_cfg.rng.src.to_str() { let virtio_rng_device = vm_virtio::Rng::new(rng_path, vm_info.vm_cfg.rng.iommu) .map_err(DeviceManagerError::CreateVirtioRng)?; devices.push(( Box::new(virtio_rng_device) as Box, false, )); } Ok(devices) } fn make_virtio_fs_devices( vm_info: &VmInfo, allocator: &mut SystemAllocator, mem_slots: &mut u32, mmap_regions: &mut Vec<(*mut libc::c_void, usize)>, ) -> DeviceManagerResult, bool)>> { let mut devices = Vec::new(); // Add virtio-fs if required if let Some(fs_list_cfg) = &vm_info.vm_cfg.fs { for fs_cfg in fs_list_cfg.iter() { if let Some(fs_sock) = fs_cfg.sock.to_str() { let mut cache: Option<(VirtioSharedMemoryList, u64)> = None; if let Some(fs_cache) = fs_cfg.cache_size { // The memory needs to be 2MiB aligned in order to support // hugepages. let fs_guest_addr = allocator .allocate_mmio_addresses( None, fs_cache as GuestUsize, Some(0x0020_0000), ) .ok_or(DeviceManagerError::FsRangeAllocation)?; let addr = unsafe { libc::mmap( null_mut(), fs_cache as usize, libc::PROT_READ | libc::PROT_WRITE, libc::MAP_NORESERVE | libc::MAP_ANONYMOUS | libc::MAP_PRIVATE, -1, 0 as libc::off_t, ) }; if addr == libc::MAP_FAILED { return Err(DeviceManagerError::Mmap(io::Error::last_os_error())); } mmap_regions.push((addr, fs_cache as usize)); let mem_region = kvm_userspace_memory_region { slot: *mem_slots as u32, guest_phys_addr: fs_guest_addr.raw_value(), memory_size: fs_cache, userspace_addr: addr as u64, flags: 0, }; // Safe because the guest regions are guaranteed not to overlap. let _ = unsafe { vm_info.vm_fd.set_user_memory_region(mem_region) }; // Increment the KVM slot number *mem_slots += 1; let mut region_list = Vec::new(); region_list.push(VirtioSharedMemory { offset: 0, len: fs_cache, }); cache = Some(( VirtioSharedMemoryList { addr: fs_guest_addr, len: fs_cache as GuestUsize, region_list, }, addr as u64, )); } let virtio_fs_device = vm_virtio::vhost_user::Fs::new( fs_sock, &fs_cfg.tag, fs_cfg.num_queues, fs_cfg.queue_size, cache, ) .map_err(DeviceManagerError::CreateVirtioFs)?; devices.push(( Box::new(virtio_fs_device) as Box, false, )); } } } Ok(devices) } fn make_virtio_pmem_devices( vm_info: &VmInfo, allocator: &mut SystemAllocator, mem_slots: &mut u32, mmap_regions: &mut Vec<(*mut libc::c_void, usize)>, ) -> DeviceManagerResult, bool)>> { let mut devices = Vec::new(); // Add virtio-pmem if required if let Some(pmem_list_cfg) = &vm_info.vm_cfg.pmem { for pmem_cfg in pmem_list_cfg.iter() { let size = pmem_cfg.size; // The memory needs to be 2MiB aligned in order to support // hugepages. let pmem_guest_addr = allocator .allocate_mmio_addresses(None, size as GuestUsize, Some(0x0020_0000)) .ok_or(DeviceManagerError::PmemRangeAllocation)?; let (custom_flags, set_len) = if pmem_cfg.file.is_dir() { (O_TMPFILE, true) } else { (0, false) }; let file = OpenOptions::new() .read(true) .write(true) .custom_flags(custom_flags) .open(&pmem_cfg.file) .map_err(DeviceManagerError::PmemFileOpen)?; if set_len { file.set_len(size) .map_err(DeviceManagerError::PmemFileSetLen)?; } let addr = unsafe { libc::mmap( null_mut(), size as usize, libc::PROT_READ | libc::PROT_WRITE, libc::MAP_NORESERVE | libc::MAP_SHARED, file.as_raw_fd(), 0 as libc::off_t, ) }; mmap_regions.push((addr, size as usize)); let mem_region = kvm_userspace_memory_region { slot: *mem_slots as u32, guest_phys_addr: pmem_guest_addr.raw_value(), memory_size: size, userspace_addr: addr as u64, flags: 0, }; // Safe because the guest regions are guaranteed not to overlap. let _ = unsafe { vm_info.vm_fd.set_user_memory_region(mem_region) }; // Mark the pages as mergeable if explicitly asked for. if pmem_cfg.mergeable { // Safe because the address and size are valid since the // mmap succeeded.. let ret = unsafe { libc::madvise( addr as *mut libc::c_void, size as libc::size_t, libc::MADV_MERGEABLE, ) }; if ret != 0 { let err = io::Error::last_os_error(); // Safe to unwrap because the error is constructed with // last_os_error(), which ensures the output will be Some(). let errno = err.raw_os_error().unwrap(); if errno == libc::EINVAL { warn!("kernel not configured with CONFIG_KSM"); } else { warn!("madvise error: {}", err); } warn!("failed to mark pages as mergeable"); } } // Increment the KVM slot number *mem_slots += 1; let virtio_pmem_device = vm_virtio::Pmem::new(file, pmem_guest_addr, size as GuestUsize, pmem_cfg.iommu) .map_err(DeviceManagerError::CreateVirtioPmem)?; devices.push(( Box::new(virtio_pmem_device) as Box, false, )); } } Ok(devices) } fn make_virtio_vhost_user_net_devices( vm_info: &VmInfo, ) -> DeviceManagerResult, bool)>> { let mut devices = Vec::new(); // Add vhost-user-net if required if let Some(vhost_user_net_list_cfg) = &vm_info.vm_cfg.vhost_user_net { for vhost_user_net_cfg in vhost_user_net_list_cfg.iter() { let vu_cfg = VhostUserConfig { sock: vhost_user_net_cfg.vu_cfg.sock.clone(), num_queues: vhost_user_net_cfg.vu_cfg.num_queues, queue_size: vhost_user_net_cfg.vu_cfg.queue_size, }; let vhost_user_net_device = vm_virtio::vhost_user::Net::new(vhost_user_net_cfg.mac, vu_cfg) .map_err(DeviceManagerError::CreateVhostUserNet)?; devices.push(( Box::new(vhost_user_net_device) as Box, false, )); } } Ok(devices) } fn make_virtio_vhost_user_blk_devices( vm_info: &VmInfo, ) -> DeviceManagerResult, bool)>> { let mut devices = Vec::new(); // Add vhost-user-blk if required if let Some(vhost_user_blk_list_cfg) = &vm_info.vm_cfg.vhost_user_blk { for vhost_user_blk_cfg in vhost_user_blk_list_cfg.iter() { let vu_cfg = VhostUserConfig { sock: vhost_user_blk_cfg.vu_cfg.sock.clone(), num_queues: vhost_user_blk_cfg.vu_cfg.num_queues, queue_size: vhost_user_blk_cfg.vu_cfg.queue_size, }; let vhost_user_blk_device = vm_virtio::vhost_user::Blk::new(vhost_user_blk_cfg.wce, vu_cfg) .map_err(DeviceManagerError::CreateVhostUserBlk)?; devices.push(( Box::new(vhost_user_blk_device) as Box, false, )); } } Ok(devices) } fn make_virtio_vsock_devices( vm_info: &VmInfo, ) -> DeviceManagerResult, bool)>> { let mut devices = Vec::new(); // Add vsock if required if let Some(vsock_list_cfg) = &vm_info.vm_cfg.vsock { for vsock_cfg in vsock_list_cfg.iter() { let socket_path = vsock_cfg .sock .to_str() .ok_or(DeviceManagerError::CreateVsockConvertPath)?; let backend = vm_virtio::vsock::VsockUnixBackend::new(vsock_cfg.cid, socket_path.to_string()) .map_err(DeviceManagerError::CreateVsockBackend)?; let vsock_device = vm_virtio::Vsock::new(vsock_cfg.cid, backend, vsock_cfg.iommu) .map_err(DeviceManagerError::CreateVirtioVsock)?; devices.push(( Box::new(vsock_device) as Box, false, )); } } Ok(devices) } #[cfg(feature = "pci_support")] fn create_kvm_device(vm: &Arc) -> DeviceManagerResult { let mut vfio_dev = kvm_bindings::kvm_create_device { type_: kvm_bindings::kvm_device_type_KVM_DEV_TYPE_VFIO, fd: 0, flags: 0, }; vm.create_device(&mut vfio_dev) .map_err(DeviceManagerError::CreateKvmDevice) } #[cfg(feature = "pci_support")] fn add_vfio_devices( vm_info: &VmInfo, address_manager: &Arc, pci: &mut PciBus, mem_slots: u32, iommu_device: &mut Option, ) -> DeviceManagerResult> { let mut mem_slot = mem_slots; let mut iommu_attached_device_ids = Vec::new(); let mut allocator = address_manager.allocator.lock().unwrap(); if let Some(device_list_cfg) = &vm_info.vm_cfg.devices { // Create the KVM VFIO device let device_fd = DeviceManager::create_kvm_device(vm_info.vm_fd)?; let device_fd = Arc::new(device_fd); for device_cfg in device_list_cfg.iter() { // We need to shift the device id since the 3 first bits // are dedicated to the PCI function, and we know we don't // do multifunction. Also, because we only support one PCI // bus, the bus 0, we don't need to add anything to the // global device ID. let device_id = pci.next_device_id() << 3; let vfio_device = VfioDevice::new( &device_cfg.path, device_fd.clone(), vm_info.memory.clone(), device_cfg.iommu, ) .map_err(DeviceManagerError::VfioCreate)?; if device_cfg.iommu { if let Some(iommu) = iommu_device { let vfio_mapping = Arc::new(VfioDmaMapping::new( vfio_device.get_container(), Arc::clone(vm_info.memory), )); iommu_attached_device_ids.push(device_id); iommu.add_external_mapping(device_id, vfio_mapping); } } let mut vfio_pci_device = VfioPciDevice::new(vm_info.vm_fd, &mut allocator, vfio_device) .map_err(DeviceManagerError::VfioPciCreate)?; let bars = vfio_pci_device .allocate_bars(&mut allocator) .map_err(DeviceManagerError::AllocateBars)?; mem_slot = vfio_pci_device .map_mmio_regions(vm_info.vm_fd, mem_slot) .map_err(DeviceManagerError::VfioMapRegion)?; let vfio_pci_device = Arc::new(Mutex::new(vfio_pci_device)); pci.add_device(vfio_pci_device.clone()) .map_err(DeviceManagerError::AddPciDevice)?; pci.register_mapping( vfio_pci_device.clone(), address_manager.io_bus.as_ref(), address_manager.mmio_bus.as_ref(), bars, ) .map_err(DeviceManagerError::AddPciDevice)?; } } Ok(iommu_attached_device_ids) } #[cfg(feature = "pci_support")] #[allow(clippy::too_many_arguments)] fn add_virtio_pci_device( virtio_device: Box, memory: &Arc>, address_manager: &Arc, vm_fd: &Arc, pci: &mut PciBus, interrupt_info: &InterruptInfo, iommu_mapping: &Option>, ) -> DeviceManagerResult> { let msix_num = if interrupt_info._msi_capable { // Allows support for one MSI-X vector per queue. It also adds 1 // as we need to take into account the dedicated vector to notify // about a virtio config change. (virtio_device.queue_max_sizes().len() + 1) as u16 } else { 0 }; // We need to shift the device id since the 3 first bits are dedicated // to the PCI function, and we know we don't do multifunction. // Also, because we only support one PCI bus, the bus 0, we don't need // to add anything to the global device ID. let dev_id = pci.next_device_id() << 3; // Create the callback from the implementation of the DmaRemapping // trait. The point with the callback is to simplify the code as we // know about the device ID from this point. let iommu_mapping_cb: Option> = if let Some(mapping) = iommu_mapping { let mapping_clone = mapping.clone(); Some(Arc::new(Box::new(move |addr: u64| { mapping_clone.translate(dev_id, addr).map_err(|e| { std::io::Error::new( std::io::ErrorKind::Other, format!( "failed to translate addr 0x{:x} for device 00:{:02x}.0 {}", addr, dev_id, e ), ) }) }) as VirtioIommuRemapping)) } else { None }; let mut virtio_pci_device = VirtioPciDevice::new(memory.clone(), virtio_device, msix_num, iommu_mapping_cb) .map_err(DeviceManagerError::VirtioDevice)?; let mut allocator = address_manager.allocator.lock().unwrap(); let bars = virtio_pci_device .allocate_bars(&mut allocator) .map_err(DeviceManagerError::AllocateBars)?; let bar_addr = virtio_pci_device.config_bar_addr(); for (event, addr) in virtio_pci_device.ioeventfds(bar_addr) { let io_addr = IoEventAddress::Mmio(addr); vm_fd .register_ioevent(event, &io_addr, NoDatamatch) .map_err(DeviceManagerError::RegisterIoevent)?; } if interrupt_info._msi_capable { let vm_fd_clone = vm_fd.clone(); let msi_cb = Arc::new(Box::new(move |p: InterruptParameters| { if let Some(entry) = p.msix { use kvm_bindings::kvm_msi; let msi_queue = kvm_msi { address_lo: entry.msg_addr_lo, address_hi: entry.msg_addr_hi, data: entry.msg_data, flags: 0u32, devid: 0u32, pad: [0u8; 12], }; return vm_fd_clone .signal_msi(msi_queue) .map_err(|e| io::Error::from_raw_os_error(e.errno())) .map(|ret| { if ret > 0 { debug!("MSI message successfully delivered"); } else if ret == 0 { warn!("failed to deliver MSI message, blocked by guest"); } }); } Err(std::io::Error::new( std::io::ErrorKind::Other, "missing MSI-X entry", )) }) as InterruptDelivery); virtio_pci_device.assign_msix(msi_cb); } else { let irq_num = allocator .allocate_irq() .ok_or(DeviceManagerError::AllocateIrq)?; let irq_cb = if let Some(ioapic) = interrupt_info.ioapic { let ioapic_clone = ioapic.clone(); Box::new(move |_p: InterruptParameters| { ioapic_clone .lock() .unwrap() .service_irq(irq_num as usize) .map_err(|e| { std::io::Error::new( std::io::ErrorKind::Other, format!("failed to inject IRQ #{}: {:?}", irq_num, e), ) }) }) as InterruptDelivery } else { let irqfd = EventFd::new(EFD_NONBLOCK).map_err(DeviceManagerError::EventFd)?; vm_fd .register_irqfd(&irqfd, irq_num) .map_err(DeviceManagerError::Irq)?; Box::new(move |_p: InterruptParameters| irqfd.write(1)) as InterruptDelivery }; virtio_pci_device.assign_pin_irq( Arc::new(irq_cb), irq_num as u32, PciInterruptPin::IntA, ); } let virtio_pci_device = Arc::new(Mutex::new(virtio_pci_device)); pci.add_device(virtio_pci_device.clone()) .map_err(DeviceManagerError::AddPciDevice)?; pci.register_mapping( virtio_pci_device.clone(), address_manager.io_bus.as_ref(), address_manager.mmio_bus.as_ref(), bars, ) .map_err(DeviceManagerError::AddPciDevice)?; let ret = if iommu_mapping.is_some() { Some(dev_id) } else { None }; Ok(ret) } #[allow(clippy::too_many_arguments)] #[cfg(feature = "mmio_support")] fn add_virtio_mmio_device( virtio_device: Box, memory: &Arc>, address_manager: &Arc, vm_fd: &Arc, interrupt_info: &InterruptInfo, mmio_base: GuestAddress, cmdline_additions: &mut Vec, ) -> DeviceManagerResult<()> { let mut mmio_device = vm_virtio::transport::MmioDevice::new(memory.clone(), virtio_device) .map_err(DeviceManagerError::VirtioDevice)?; for (i, (event, addr)) in mmio_device.ioeventfds(mmio_base.0).iter().enumerate() { let io_addr = IoEventAddress::Mmio(*addr); vm_fd .register_ioevent(event, &io_addr, i as u32) .map_err(DeviceManagerError::RegisterIoevent)?; } let irq_num = address_manager .allocator .lock() .unwrap() .allocate_irq() .ok_or(DeviceManagerError::AllocateIrq)?; let interrupt: Box = if let Some(ioapic) = interrupt_info.ioapic { Box::new(UserIoapicIrq::new(ioapic.clone(), irq_num as usize)) } else { let irqfd = EventFd::new(EFD_NONBLOCK).map_err(DeviceManagerError::EventFd)?; vm_fd .register_irqfd(&irqfd, irq_num as u32) .map_err(DeviceManagerError::Irq)?; Box::new(KernelIoapicIrq::new(irqfd)) }; mmio_device.assign_interrupt(interrupt); address_manager .mmio_bus .insert(Arc::new(Mutex::new(mmio_device)), mmio_base.0, MMIO_LEN) .map_err(DeviceManagerError::BusError)?; cmdline_additions.push(format!( "virtio_mmio.device={}K@0x{:08x}:{}", MMIO_LEN / 1024, mmio_base.0, irq_num )); Ok(()) } pub fn io_bus(&self) -> &Arc { &self.address_manager.io_bus } pub fn mmio_bus(&self) -> &Arc { &self.address_manager.mmio_bus } pub fn allocator(&self) -> &Arc> { &self.address_manager.allocator } pub fn ioapic(&self) -> &Option>> { &self.ioapic } pub fn console(&self) -> &Arc { &self.console } pub fn cmdline_additions(&self) -> &[String] { self.cmdline_additions.as_slice() } pub fn virt_iommu(&self) -> Option<(u32, &[u32])> { if let Some((iommu_id, dev_ids)) = self.virt_iommu.as_ref() { Some((*iommu_id, dev_ids.as_slice())) } else { None } } } impl Drop for DeviceManager { fn drop(&mut self) { for (addr, size) in self.mmap_regions.drain(..) { unsafe { libc::munmap(addr, size); } } } }