diff --git a/vfio/Cargo.toml b/vfio/Cargo.toml index e9f2e91d3..a1abda8d4 100644 --- a/vfio/Cargo.toml +++ b/vfio/Cargo.toml @@ -5,11 +5,14 @@ authors = ["The Cloud Hypervisor Authors"] [dependencies] byteorder = ">=1.2.1" +devices = { path = "../devices" } kvm-bindings = "0.1" kvm-ioctls = { git = "https://github.com/rust-vmm/kvm-ioctls", branch = "master" } libc = ">=0.2.39" log = "*" +pci = { path = "../pci" } vfio-bindings = { path = "../vfio-bindings" } +vm-allocator = { path = "../vm-allocator" } vmm-sys-util = { git = "https://github.com/rust-vmm/vmm-sys-util" } [dependencies.vm-memory] diff --git a/vfio/src/lib.rs b/vfio/src/lib.rs index c7cc4e20c..b3d245f0d 100644 --- a/vfio/src/lib.rs +++ b/vfio/src/lib.rs @@ -5,22 +5,26 @@ //#![deny(missing_docs)] //! Virtual Function I/O (VFIO) API - extern crate byteorder; +extern crate devices; extern crate kvm_bindings; extern crate kvm_ioctls; extern crate log; +extern crate pci; extern crate vfio_bindings; +extern crate vm_allocator; extern crate vm_memory; #[macro_use] extern crate vmm_sys_util; mod vfio_device; mod vfio_ioctls; +mod vfio_pci; use std::mem::size_of; pub use vfio_device::{VfioDevice, VfioError}; +pub use vfio_pci::VfioPciDevice; // Returns a `Vec` with a size in bytes at least as large as `size_in_bytes`. 
fn vec_with_size_in_bytes(size_in_bytes: usize) -> Vec {
diff --git a/vfio/src/vfio_pci.rs b/vfio/src/vfio_pci.rs
new file mode 100644
index 000000000..f6b226278
--- /dev/null
+++ b/vfio/src/vfio_pci.rs
@@ -0,0 +1,356 @@
+// Copyright © 2019 Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
+//
+
+extern crate devices;
+extern crate pci;
+extern crate vm_allocator;
+
+use crate::vfio_device::VfioDevice;
+use devices::BusDevice;
+use kvm_ioctls::*;
+use pci::{
+    PciBarConfiguration, PciBarRegionType, PciClassCode, PciConfiguration, PciDevice,
+    PciDeviceError, PciHeaderType, PciSubclass,
+};
+use std::sync::Arc;
+use vfio_bindings::bindings::vfio::*;
+use vm_allocator::SystemAllocator;
+use vm_memory::{Address, GuestAddress, GuestUsize};
+
+/// PCI subclass used when building the local PCI configuration of a VFIO device.
+#[derive(Copy, Clone)]
+enum PciVfioSubclass {
+    VfioSubclass = 0xff,
+}
+
+impl PciSubclass for PciVfioSubclass {
+    /// Returns the enum discriminant as the subclass register value.
+    fn get_register_value(&self) -> u8 {
+        *self as u8
+    }
+}
+
+/// A guest address range that has been assigned to one VFIO region.
+#[derive(Copy, Clone)]
+struct MmioRegion {
+    /// First guest address of the range.
+    start: GuestAddress,
+    /// Length of the range in bytes.
+    length: GuestUsize,
+    /// VFIO region index the range is forwarded to.
+    index: u32,
+}
+
+/// Accessor for the PCI configuration region of a VFIO device.
+struct VfioPciConfig {
+    // NOTE(review): the `Arc` type argument was missing from the reviewed text; it is
+    // restored here as `VfioDevice`, which is what `new()` is called with.
+    device: Arc<VfioDevice>,
+}
+
+impl VfioPciConfig {
+    /// Wraps `device` so its PCI configuration region can be read and written.
+    fn new(device: Arc<VfioDevice>) -> Self {
+        VfioPciConfig { device }
+    }
+
+    /// Fills `buf` from the VFIO PCI config region, starting at `offset`.
+    fn read_config(&self, buf: &mut [u8], offset: u32) {
+        self.device
+            .region_read(VFIO_PCI_CONFIG_REGION_INDEX, buf, offset.into());
+    }
+
+    /// Reads one byte of the configuration region at `offset`.
+    fn read_config_byte(&self, offset: u32) -> u8 {
+        let mut data = [0u8; 1];
+        self.read_config(&mut data, offset);
+
+        data[0]
+    }
+
+    /// Reads a little-endian 16-bit value of the configuration region at `offset`.
+    fn read_config_word(&self, offset: u32) -> u16 {
+        let mut data = [0u8; 2];
+        self.read_config(&mut data, offset);
+
+        u16::from_le_bytes(data)
+    }
+
+    /// Reads a little-endian 32-bit value of the configuration region at `offset`.
+    fn read_config_dword(&self, offset: u32) -> u32 {
+        let mut data = [0u8; 4];
+        self.read_config(&mut data, offset);
+
+        u32::from_le_bytes(data)
+    }
+
+    /// Writes `buf` as a little-endian 32-bit value into the configuration region.
+    ///
+    /// Mind the argument order: the value to write comes first, the config-space
+    /// `offset` second. Both are `u32`, so a transposed call still compiles.
+    fn write_config_dword(&self, buf: u32, offset: u32) {
+        let data: [u8; 4] = buf.to_le_bytes();
+        self.device
+            .region_write(VFIO_PCI_CONFIG_REGION_INDEX, &data, offset.into())
+    }
+}
+
+/// VfioPciDevice represents a VFIO PCI device.
+/// This structure implements the BusDevice and PciDevice traits.
+///
+/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
+/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
+/// which then gets added to the PCI bus.
+pub struct VfioPciDevice {
+    // NOTE(review): the type arguments of the `Arc`/`Vec` fields were missing from the
+    // reviewed text and are restored from how the fields are used below; please confirm
+    // `VmFd` in particular (presumably the kvm-ioctls VM handle).
+    vm_fd: Arc<VmFd>,
+    /// The VFIO device every region access is forwarded to.
+    device: Arc<VfioDevice>,
+    /// Accessor for the VFIO device's own PCI configuration region.
+    vfio_pci_configuration: VfioPciConfig,
+    /// Locally emulated PCI configuration.
+    configuration: PciConfiguration,
+    /// Guest address ranges currently backed by a VFIO region.
+    mmio_regions: Vec<MmioRegion>,
+}
+
+impl VfioPciDevice {
+    /// Constructs a new Vfio Pci device for the given Vfio device.
+    ///
+    /// The VFIO device is reset and then shared between this structure and its
+    /// configuration-region accessor. The list of mapped regions starts out empty.
+    /// `allocator` is not used by this constructor.
+    ///
+    /// NOTE(review): the return type is restored as the one-parameter `Result` alias
+    /// assumed to come from the `kvm_ioctls::*` glob import; please confirm.
+    pub fn new(
+        vm_fd: &Arc<VmFd>,
+        allocator: &mut SystemAllocator,
+        device: VfioDevice,
+    ) -> Result<Self> {
+        let device = Arc::new(device);
+        device.reset();
+
+        let configuration = PciConfiguration::new(
+            0,
+            0,
+            PciClassCode::Other,
+            &PciVfioSubclass::VfioSubclass,
+            None,
+            PciHeaderType::Device,
+            0,
+            0,
+            None,
+        );
+
+        let vfio_pci_configuration = VfioPciConfig::new(Arc::clone(&device));
+
+        Ok(VfioPciDevice {
+            vm_fd: Arc::clone(vm_fd),
+            device,
+            configuration,
+            vfio_pci_configuration,
+            mmio_regions: Vec::new(),
+        })
+    }
+
+    /// Returns a copy of the region whose range `[start, start + length)` contains
+    /// `addr`, or `None` when no region matches.
+    fn find_region(&self, addr: u64) -> Option<MmioRegion> {
+        self.mmio_regions
+            .iter()
+            .find(|region| {
+                let start = region.start.raw_value();
+                // Compare the distance from `start` against the length instead of
+                // computing `start + length`, which could overflow.
+                addr >= start && addr - start < region.length
+            })
+            .cloned()
+    }
+}
+
+impl Drop for VfioPciDevice {
+    /// Removes the DMA mappings of the VFIO device; a failure is only logged.
+    fn drop(&mut self) {
+        if self.device.unset_dma_map().is_err() {
+            error!("failed to remove all guest memory regions from iommu table");
+        }
+    }
+}
+
+impl BusDevice for VfioPciDevice {
+    /// Forwards a bus read to `read_bar()`.
+    fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
+        self.read_bar(base, offset, data)
+    }
+
+    /// Forwards a bus write to `write_bar()`.
+    fn write(&mut self, base: u64, offset: u64, data: &[u8]) {
+        self.write_bar(base, offset, data)
+    }
+}
+
+// First BAR offset in the PCI config space.
+const PCI_CONFIG_BAR_OFFSET: u32 = 0x10; +// First BAR register index +const PCI_CONFIG_BAR0_INDEX: usize = 4; +// Capability register offset in the PCI config space. +const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34; +// IO BAR when first BAR bit is 1. +const PCI_CONFIG_IO_BAR: u32 = 0x1; +// Memory BAR flags (lower 4 bits). +const PCI_CONFIG_MEMORY_BAR_FLAG_MASK: u32 = 0xf; +// 64-bit memory bar flag. +const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4; +// PCI config register size (4 bytes). +const PCI_CONFIG_REGISTER_SIZE: usize = 4; +// Number of BARs for a PCI device +const BAR_NUMS: usize = 6; + +impl PciDevice for VfioPciDevice { + fn allocate_bars( + &mut self, + allocator: &mut SystemAllocator, + ) -> std::result::Result, PciDeviceError> + { + let mut ranges = Vec::new(); + let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX as u32; + + // Going through all regular regions to compute the BAR size. + // We're not saving the BAR address to restore it, because we + // are going to allocate a guest address for each BAR and write + // that new address back. + while bar_id < VFIO_PCI_ROM_REGION_INDEX { + let mut lsb_size: u32 = 0xffff_ffff; + let mut msb_size = 0; + let mut region_size: u64; + let bar_addr: GuestAddress; + + // Read the BAR size (Starts by all 1s to the BAR) + let bar_offset = PCI_CONFIG_BAR_OFFSET + bar_id * 4; + + self.vfio_pci_configuration + .write_config_dword(lsb_size, bar_offset); + lsb_size = self.vfio_pci_configuration.read_config_dword(bar_offset); + + // We've just read the BAR size back. Or at least its LSB. + let lsb_flag = lsb_size & PCI_CONFIG_MEMORY_BAR_FLAG_MASK; + + if lsb_size == 0 { + bar_id += 1; + continue; + } + + // Is this an IO BAR? + let io_bar = match lsb_flag & PCI_CONFIG_IO_BAR { + PCI_CONFIG_IO_BAR => true, + _ => false, + }; + + // Is this a 64-bit BAR? 
+ let is_64bit_bar = match lsb_flag & PCI_CONFIG_MEMORY_BAR_64BIT { + PCI_CONFIG_MEMORY_BAR_64BIT => true, + _ => false, + }; + + // By default, the region type is 32 bits memory BAR. + let mut region_type = PciBarRegionType::Memory32BitRegion; + + if io_bar { + // IO BAR + region_type = PciBarRegionType::IORegion; + + // Clear first bit. + lsb_size &= 0xffff_fffc; + + // Find the first bit that's set to 1. + let first_bit = lsb_size.trailing_zeros(); + region_size = 2u64.pow(first_bit); + // We need to allocate a guest PIO address range for that BAR. + bar_addr = allocator + .allocate_io_addresses(None, region_size, Some(0x4)) + .ok_or_else(|| PciDeviceError::IoAllocationFailed(region_size))?; + } else { + if is_64bit_bar { + // 64 bits Memory BAR + region_type = PciBarRegionType::Memory64BitRegion; + + msb_size = 0xffff_ffff; + let msb_bar_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4; + + self.vfio_pci_configuration + .write_config_dword(msb_bar_offset, msb_size); + + msb_size = self + .vfio_pci_configuration + .read_config_dword(msb_bar_offset); + } + + // Clear the first four bytes from our LSB. + lsb_size &= 0xffff_fff0; + + region_size = u64::from(msb_size); + region_size <<= 32; + region_size |= u64::from(lsb_size); + + // Find the first that's set to 1. + let first_bit = region_size.trailing_zeros(); + region_size = 2u64.pow(first_bit); + + // We need to allocate a guest MMIO address range for that BAR. + if is_64bit_bar { + bar_addr = allocator + .allocate_mmio_addresses(None, region_size, Some(0x1000)) + .ok_or_else(|| PciDeviceError::IoAllocationFailed(region_size))?; + } else { + bar_addr = allocator + .allocate_mmio_hole_addresses(None, region_size, Some(0x1000)) + .ok_or_else(|| PciDeviceError::IoAllocationFailed(region_size))?; + } + } + + // We can now build our BAR configuration block. 
+ let config = PciBarConfiguration::default() + .set_register_index(bar_id as usize) + .set_address(bar_addr.raw_value()) + .set_size(region_size) + .set_region_type(region_type); + + self.configuration + .add_pci_bar(&config) + .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?; + + ranges.push((bar_addr, region_size, region_type)); + self.mmio_regions.push(MmioRegion { + start: bar_addr, + length: region_size, + index: bar_id as u32, + }); + + bar_id += 1; + if is_64bit_bar { + bar_id += 1; + } + } + + if self.device.setup_dma_map().is_err() { + error!("failed to add all guest memory regions into iommu table"); + } + + Ok(ranges) + } + + fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) { + // When the guest wants to write to a BAR, we trap it into + // our local configuration space. We're not reprogramming + // VFIO device. + if reg_idx >= PCI_CONFIG_BAR0_INDEX && reg_idx < PCI_CONFIG_BAR0_INDEX + BAR_NUMS { + // We keep our local cache updated with the BARs. + // We'll read it back from there when the guest is asking + // for BARs (see read_config_register()). + return self + .configuration + .write_config_register(reg_idx, offset, data); + } + + let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64; + self.device + .region_write(VFIO_PCI_CONFIG_REGION_INDEX, data, reg + offset); + } + + fn read_config_register(&self, reg_idx: usize) -> u32 { + // When reading the BARs, we trap it and return what comes + // from our local configuration space. We want the guest to + // use that and not the VFIO device BARs as it does not map + // with the guest address space. + if reg_idx >= PCI_CONFIG_BAR0_INDEX && reg_idx < PCI_CONFIG_BAR0_INDEX + BAR_NUMS { + return self.configuration.read_reg(reg_idx); + } + + // The config register read comes from the VFIO device itself. 
+ self.vfio_pci_configuration + .read_config_dword((reg_idx * 4) as u32) + } + + fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) { + let addr = base + offset; + if let Some(region) = self.find_region(addr) { + let offset = addr - region.start.raw_value(); + self.device.region_read(region.index, data, offset); + } + } + + fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) { + let addr = base + offset; + if let Some(region) = self.find_region(addr) { + let offset = addr - region.start.raw_value(); + self.device.region_write(region.index, data, offset); + } + } +}