cloud-hypervisor/pci/src/vfio.rs
Jianyong Wu a718716831 vfio: fix vfio device fail to initialize issue for 64k page size
Currently, the vfio device fails to initialize because the msix-cap region
in the BAR is mapped as an RW region.

To resolve the initialization issue, this commit avoids mapping the
msix-cap region in the BAR. However, this solution introduces another
problem: aligning the msix table offset in the BAR to the page size
may cause it to overlap with the MMIO RW region, leading to reduced
performance. By enlarging the entire region in the BAR and relocating
the msix table to achieve page size alignment, this problem can be
overcome effectively.

Fixes: #5292
Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>
2023-06-19 10:29:23 +08:00


// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//
use crate::msi::{MsiConfigState, MSI_CONFIG_ID};
use crate::msix::MsixConfigState;
use crate::{
msi_num_enabled_vectors, BarReprogrammingParams, MsiCap, MsiConfig, MsixCap, MsixConfig,
PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciBdf, PciCapabilityId,
PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciExpressCapabilityId,
PciHeaderType, PciSubclass, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, PCI_CONFIGURATION_ID,
};
use anyhow::anyhow;
use byteorder::{ByteOrder, LittleEndian};
use hypervisor::HypervisorVmError;
use libc::{sysconf, _SC_PAGESIZE};
use std::any::Any;
use std::collections::{BTreeMap, HashMap};
use std::io;
use std::os::unix::io::AsRawFd;
use std::ptr::null_mut;
use std::sync::{Arc, Barrier, Mutex};
use thiserror::Error;
use versionize::{VersionMap, Versionize, VersionizeResult};
use versionize_derive::Versionize;
use vfio_bindings::bindings::vfio::*;
use vfio_ioctls::{
VfioContainer, VfioDevice, VfioIrq, VfioRegionInfoCap, VfioRegionSparseMmapArea,
};
use vm_allocator::page_size::{
align_page_size_down, align_page_size_up, is_4k_aligned, is_4k_multiple, is_page_size_aligned,
};
use vm_allocator::{AddressAllocator, SystemAllocator};
use vm_device::interrupt::{
InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
};
use vm_device::{BusDevice, Resource};
use vm_memory::{Address, GuestAddress, GuestUsize};
use vm_migration::{
Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, VersionMapped,
};
use vmm_sys_util::eventfd::EventFd;
pub(crate) const VFIO_COMMON_ID: &str = "vfio_common";
#[derive(Debug, Error)]
pub enum VfioPciError {
#[error("Failed to create user memory region: {0}")]
CreateUserMemoryRegion(#[source] HypervisorVmError),
#[error("Failed to DMA map: {0}")]
DmaMap(#[source] vfio_ioctls::VfioError),
#[error("Failed to DMA unmap: {0}")]
DmaUnmap(#[source] vfio_ioctls::VfioError),
#[error("Failed to enable INTx: {0}")]
EnableIntx(#[source] VfioError),
#[error("Failed to enable MSI: {0}")]
EnableMsi(#[source] VfioError),
#[error("Failed to enable MSI-x: {0}")]
EnableMsix(#[source] VfioError),
#[error("Failed to mmap the area")]
MmapArea,
#[error("Failed to notifier's eventfd")]
MissingNotifier,
#[error("Invalid region alignment")]
RegionAlignment,
#[error("Invalid region size")]
RegionSize,
#[error("Failed to retrieve MsiConfigState: {0}")]
RetrieveMsiConfigState(#[source] anyhow::Error),
#[error("Failed to retrieve MsixConfigState: {0}")]
RetrieveMsixConfigState(#[source] anyhow::Error),
#[error("Failed to retrieve PciConfigurationState: {0}")]
RetrievePciConfigurationState(#[source] anyhow::Error),
#[error("Failed to retrieve VfioCommonState: {0}")]
RetrieveVfioCommonState(#[source] anyhow::Error),
}
#[derive(Copy, Clone)]
enum PciVfioSubclass {
VfioSubclass = 0xff,
}
impl PciSubclass for PciVfioSubclass {
fn get_register_value(&self) -> u8 {
*self as u8
}
}
enum InterruptUpdateAction {
EnableMsi,
DisableMsi,
EnableMsix,
DisableMsix,
}
#[derive(Versionize)]
struct IntxState {
enabled: bool,
}
pub(crate) struct VfioIntx {
interrupt_source_group: Arc<dyn InterruptSourceGroup>,
enabled: bool,
}
#[derive(Versionize)]
struct MsiState {
cap: MsiCap,
cap_offset: u32,
}
pub(crate) struct VfioMsi {
pub(crate) cfg: MsiConfig,
cap_offset: u32,
interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}
impl VfioMsi {
fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
let old_enabled = self.cfg.enabled();
self.cfg.update(offset, data);
let new_enabled = self.cfg.enabled();
if !old_enabled && new_enabled {
return Some(InterruptUpdateAction::EnableMsi);
}
if old_enabled && !new_enabled {
return Some(InterruptUpdateAction::DisableMsi);
}
None
}
}
#[derive(Versionize)]
struct MsixState {
cap: MsixCap,
cap_offset: u32,
bdf: u32,
}
pub(crate) struct VfioMsix {
pub(crate) bar: MsixConfig,
cap: MsixCap,
cap_offset: u32,
interrupt_source_group: Arc<dyn InterruptSourceGroup>,
}
impl VfioMsix {
fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
let old_enabled = self.bar.enabled();
// Update "Message Control" word
if offset == 2 && data.len() == 2 {
self.bar.set_msg_ctl(LittleEndian::read_u16(data));
}
let new_enabled = self.bar.enabled();
if !old_enabled && new_enabled {
return Some(InterruptUpdateAction::EnableMsix);
}
if old_enabled && !new_enabled {
return Some(InterruptUpdateAction::DisableMsix);
}
None
}
fn table_accessed(&self, bar_index: u32, offset: u64) -> bool {
let table_offset: u64 = u64::from(self.cap.table_offset());
let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64);
let table_bir: u32 = self.cap.table_bir();
bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size
}
}
pub(crate) struct Interrupt {
pub(crate) intx: Option<VfioIntx>,
pub(crate) msi: Option<VfioMsi>,
pub(crate) msix: Option<VfioMsix>,
}
impl Interrupt {
fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
if let Some(ref mut msi) = &mut self.msi {
let action = msi.update(offset, data);
return action;
}
None
}
fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
if let Some(ref mut msix) = &mut self.msix {
let action = msix.update(offset, data);
return action;
}
None
}
fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> {
if let Some(msi) = &self.msi {
if offset >= u64::from(msi.cap_offset)
&& offset < u64::from(msi.cap_offset) + msi.cfg.size()
{
return Some((
PciCapabilityId::MessageSignalledInterrupts,
u64::from(msi.cap_offset),
));
}
}
if let Some(msix) = &self.msix {
if offset == u64::from(msix.cap_offset) {
return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset)));
}
}
None
}
fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool {
if let Some(msix) = &self.msix {
return msix.table_accessed(bar_index, offset);
}
false
}
fn msix_write_table(&mut self, offset: u64, data: &[u8]) {
if let Some(ref mut msix) = &mut self.msix {
let offset = offset - u64::from(msix.cap.table_offset());
msix.bar.write_table(offset, data)
}
}
fn msix_read_table(&self, offset: u64, data: &mut [u8]) {
if let Some(msix) = &self.msix {
let offset = offset - u64::from(msix.cap.table_offset());
msix.bar.read_table(offset, data)
}
}
pub(crate) fn intx_in_use(&self) -> bool {
if let Some(intx) = &self.intx {
return intx.enabled;
}
false
}
}
#[derive(Copy, Clone)]
pub struct UserMemoryRegion {
pub slot: u32,
pub start: u64,
pub size: u64,
pub host_addr: u64,
}
#[derive(Clone)]
pub struct MmioRegion {
pub start: GuestAddress,
pub length: GuestUsize,
pub(crate) type_: PciBarRegionType,
pub(crate) index: u32,
pub(crate) user_memory_regions: Vec<UserMemoryRegion>,
}
#[derive(Debug, Error)]
pub enum VfioError {
#[error("Kernel VFIO error: {0}")]
KernelVfio(#[source] vfio_ioctls::VfioError),
#[error("VFIO user error: {0}")]
VfioUser(#[source] vfio_user::Error),
}
pub(crate) trait Vfio: Send + Sync {
fn read_config_byte(&self, offset: u32) -> u8 {
let mut data: [u8; 1] = [0];
self.read_config(offset, &mut data);
data[0]
}
fn read_config_word(&self, offset: u32) -> u16 {
let mut data: [u8; 2] = [0, 0];
self.read_config(offset, &mut data);
u16::from_le_bytes(data)
}
fn read_config_dword(&self, offset: u32) -> u32 {
let mut data: [u8; 4] = [0, 0, 0, 0];
self.read_config(offset, &mut data);
u32::from_le_bytes(data)
}
fn write_config_dword(&self, offset: u32, buf: u32) {
let data: [u8; 4] = buf.to_le_bytes();
self.write_config(offset, &data)
}
fn read_config(&self, offset: u32, data: &mut [u8]) {
self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut());
}
fn write_config(&self, offset: u32, data: &[u8]) {
self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data)
}
fn enable_msi(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds)
}
fn disable_msi(&self) -> Result<(), VfioError> {
self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX)
}
fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds)
}
fn disable_msix(&self) -> Result<(), VfioError> {
self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX)
}
fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) {
unimplemented!()
}
fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) {
unimplemented!()
}
fn get_irq_info(&self, _irq_index: u32) -> Option<VfioIrq> {
unimplemented!()
}
fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
unimplemented!()
}
fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
unimplemented!()
}
fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
unimplemented!()
}
}
struct VfioDeviceWrapper {
device: Arc<VfioDevice>,
}
impl VfioDeviceWrapper {
fn new(device: Arc<VfioDevice>) -> Self {
Self { device }
}
}
impl Vfio for VfioDeviceWrapper {
fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) {
self.device.region_read(index, data, offset)
}
fn region_write(&self, index: u32, offset: u64, data: &[u8]) {
self.device.region_write(index, data, offset)
}
fn get_irq_info(&self, irq_index: u32) -> Option<VfioIrq> {
self.device.get_irq_info(irq_index).copied()
}
fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
self.device
.enable_irq(irq_index, event_fds)
.map_err(VfioError::KernelVfio)
}
fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> {
self.device
.disable_irq(irq_index)
.map_err(VfioError::KernelVfio)
}
fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> {
self.device
.unmask_irq(irq_index)
.map_err(VfioError::KernelVfio)
}
}
#[derive(Versionize)]
struct VfioCommonState {
intx_state: Option<IntxState>,
msi_state: Option<MsiState>,
msix_state: Option<MsixState>,
}
impl VersionMapped for VfioCommonState {}
pub(crate) struct ConfigPatch {
mask: u32,
patch: u32,
}
pub(crate) struct VfioCommon {
pub(crate) configuration: PciConfiguration,
pub(crate) mmio_regions: Vec<MmioRegion>,
pub(crate) interrupt: Interrupt,
pub(crate) msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
pub(crate) legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
pub(crate) vfio_wrapper: Arc<dyn Vfio>,
pub(crate) patches: HashMap<usize, ConfigPatch>,
}
impl VfioCommon {
pub(crate) fn new(
msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
vfio_wrapper: Arc<dyn Vfio>,
subclass: &dyn PciSubclass,
bdf: PciBdf,
snapshot: Option<Snapshot>,
) -> Result<Self, VfioPciError> {
let pci_configuration_state =
vm_migration::versioned_state_from_id(snapshot.as_ref(), PCI_CONFIGURATION_ID)
.map_err(|e| {
VfioPciError::RetrievePciConfigurationState(anyhow!(
"Failed to get PciConfigurationState from Snapshot: {}",
e
))
})?;
let configuration = PciConfiguration::new(
0,
0,
0,
PciClassCode::Other,
subclass,
None,
PciHeaderType::Device,
0,
0,
None,
pci_configuration_state,
);
let mut vfio_common = VfioCommon {
mmio_regions: Vec::new(),
configuration,
interrupt: Interrupt {
intx: None,
msi: None,
msix: None,
},
msi_interrupt_manager,
legacy_interrupt_group,
vfio_wrapper,
patches: HashMap::new(),
};
let state: Option<VfioCommonState> = snapshot
.as_ref()
.map(|s| s.to_versioned_state())
.transpose()
.map_err(|e| {
VfioPciError::RetrieveVfioCommonState(anyhow!(
"Failed to get VfioCommonState from Snapshot: {}",
e
))
})?;
let msi_state = vm_migration::versioned_state_from_id(snapshot.as_ref(), MSI_CONFIG_ID)
.map_err(|e| {
VfioPciError::RetrieveMsiConfigState(anyhow!(
"Failed to get MsiConfigState from Snapshot: {}",
e
))
})?;
let msix_state = vm_migration::versioned_state_from_id(snapshot.as_ref(), MSIX_CONFIG_ID)
.map_err(|e| {
VfioPciError::RetrieveMsixConfigState(anyhow!(
"Failed to get MsixConfigState from Snapshot: {}",
e
))
})?;
if let Some(state) = state.as_ref() {
vfio_common.set_state(state, msi_state, msix_state)?;
} else {
vfio_common.parse_capabilities(bdf);
vfio_common.initialize_legacy_interrupt()?;
}
Ok(vfio_common)
}
/// In case the msix table offset is not page size aligned, do some fixup to
/// achieve alignment, because we don't want the MMIO RW region and the
/// trapped region to overlap each other.
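/// As a worked example (hypothetical numbers, assuming a 64 KiB page size):
/// a 32 KiB BAR whose MSI-X table starts at offset 0x800 is grown to
/// max(32 KiB * 2, 64 KiB * 2) = 128 KiB, and the table is relocated to
/// offset 0x10000, the middle of the enlarged region, so the mmap-able and
/// trapped halves are both page size aligned.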
fn fixup_msix_region(&mut self, bar_id: u32, region_size: u64) -> u64 {
let msix = self.interrupt.msix.as_mut().unwrap();
let msix_cap = &mut msix.cap;
// Note: this assumes table_bir equals pba_bir.
let (table_offset, table_size) = msix_cap.table_range();
if is_page_size_aligned(table_offset) || msix_cap.table_bir() != bar_id {
return region_size;
}
let (pba_offset, pba_size) = msix_cap.pba_range();
let msix_sz = align_page_size_up(table_size + pba_size);
// Expand the region so that both the RW and the trapped regions are page
// size aligned
let size = std::cmp::max(region_size * 2, msix_sz * 2);
// Let the table start from the middle of the region
msix_cap.table_set_offset((size / 2) as u32);
msix_cap.pba_set_offset((size / 2 + pba_offset - table_offset) as u32);
size
}
pub(crate) fn allocate_bars(
&mut self,
allocator: &Arc<Mutex<SystemAllocator>>,
mmio_allocator: &mut AddressAllocator,
resources: Option<Vec<Resource>>,
) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
let mut bars = Vec::new();
let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX;
// Going through all regular regions to compute the BAR size.
// We're not saving the BAR address to restore it, because we
// are going to allocate a guest address for each BAR and write
// that new address back.
while bar_id < VFIO_PCI_CONFIG_REGION_INDEX {
let mut region_size: u64 = 0;
let mut region_type = PciBarRegionType::Memory32BitRegion;
let mut prefetchable = PciBarPrefetchable::NotPrefetchable;
let mut flags: u32 = 0;
let mut restored_bar_addr = None;
if let Some(resources) = &resources {
for resource in resources {
if let Resource::PciBar {
index,
base,
size,
type_,
..
} = resource
{
if *index == bar_id as usize {
restored_bar_addr = Some(GuestAddress(*base));
region_size = *size;
region_type = PciBarRegionType::from(*type_);
break;
}
}
}
if restored_bar_addr.is_none() {
bar_id += 1;
continue;
}
} else {
let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
(PCI_ROM_EXP_BAR_INDEX * 4) as u32
} else {
PCI_CONFIG_BAR_OFFSET + bar_id * 4
};
// First read flags
flags = self.vfio_wrapper.read_config_dword(bar_offset);
// Is this an IO BAR?
let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR)
} else {
false
};
// Is this a 64-bit BAR?
let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
matches!(
flags & PCI_CONFIG_MEMORY_BAR_64BIT,
PCI_CONFIG_MEMORY_BAR_64BIT
)
} else {
false
};
if matches!(
flags & PCI_CONFIG_BAR_PREFETCHABLE,
PCI_CONFIG_BAR_PREFETCHABLE
) {
prefetchable = PciBarPrefetchable::Prefetchable
};
// To get the size, write all 1s
self.vfio_wrapper
.write_config_dword(bar_offset, 0xffff_ffff);
// And read back BAR value. The device will write zeros for bits it doesn't care about
let mut lower = self.vfio_wrapper.read_config_dword(bar_offset);
if io_bar {
// Mask flag bits (lowest 2 for I/O bars)
lower &= !0b11;
// BAR is not enabled
if lower == 0 {
bar_id += 1;
continue;
}
// IO BAR
region_type = PciBarRegionType::IoRegion;
// Invert bits and add 1 to calculate size
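// e.g. if the masked read-back is 0xffff_ff00, then !0xffff_ff00 + 1 =
// 0x100, i.e. a 256 byte I/O BAR.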
region_size = (!lower + 1) as u64;
} else if is_64bit_bar {
// 64-bit memory BAR
region_type = PciBarRegionType::Memory64BitRegion;
// Query the size of the upper half of the 64-bit BAR
let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4;
self.vfio_wrapper
.write_config_dword(upper_offset, 0xffff_ffff);
let upper = self.vfio_wrapper.read_config_dword(upper_offset);
let mut combined_size = u64::from(upper) << 32 | u64::from(lower);
// Mask out flag bits (lowest 4 for memory bars)
combined_size &= !0b1111;
// BAR is not enabled
if combined_size == 0 {
bar_id += 1;
continue;
}
// Invert and add 1 to find the size
region_size = !combined_size + 1;
} else {
region_type = PciBarRegionType::Memory32BitRegion;
// Mask out flag bits (lowest 4 for memory bars)
lower &= !0b1111;
if lower == 0 {
bar_id += 1;
continue;
}
// Invert and add 1 to find the size
region_size = (!lower + 1) as u64;
}
}
let bar_addr = match region_type {
PciBarRegionType::IoRegion => {
#[cfg(target_arch = "aarch64")]
unimplemented!();
// The address needs to be 4-byte aligned.
#[cfg(not(target_arch = "aarch64"))]
allocator
.lock()
.unwrap()
.allocate_io_addresses(restored_bar_addr, region_size, Some(0x4))
.ok_or(PciDeviceError::IoAllocationFailed(region_size))?
}
PciBarRegionType::Memory32BitRegion => {
// BAR allocation must be naturally aligned
allocator
.lock()
.unwrap()
.allocate_mmio_hole_addresses(
restored_bar_addr,
region_size,
Some(region_size),
)
.ok_or(PciDeviceError::IoAllocationFailed(region_size))?
}
PciBarRegionType::Memory64BitRegion => {
// We need to do some fixup to keep the MMIO RW region and the msix cap
// region page size aligned.
region_size = self.fixup_msix_region(bar_id, region_size);
mmio_allocator
.allocate(
restored_bar_addr,
region_size,
// SAFETY: FFI call. Trivially safe.
Some(unsafe { sysconf(_SC_PAGESIZE) as GuestUsize }),
)
.ok_or(PciDeviceError::IoAllocationFailed(region_size))?
}
};
// We can now build our BAR configuration block.
let bar = PciBarConfiguration::default()
.set_index(bar_id as usize)
.set_address(bar_addr.raw_value())
.set_size(region_size)
.set_region_type(region_type)
.set_prefetchable(prefetchable);
if bar_id == VFIO_PCI_ROM_REGION_INDEX {
self.configuration
.add_pci_rom_bar(&bar, flags & 0x1)
.map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
} else {
self.configuration
.add_pci_bar(&bar)
.map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
}
bars.push(bar);
self.mmio_regions.push(MmioRegion {
start: bar_addr,
length: region_size,
type_: region_type,
index: bar_id,
user_memory_regions: Vec::new(),
});
bar_id += 1;
if region_type == PciBarRegionType::Memory64BitRegion {
bar_id += 1;
}
}
Ok(bars)
}
pub(crate) fn free_bars(
&mut self,
allocator: &mut SystemAllocator,
mmio_allocator: &mut AddressAllocator,
) -> Result<(), PciDeviceError> {
for region in self.mmio_regions.iter() {
match region.type_ {
PciBarRegionType::IoRegion => {
#[cfg(target_arch = "x86_64")]
allocator.free_io_addresses(region.start, region.length);
#[cfg(target_arch = "aarch64")]
error!("I/O region is not supported");
}
PciBarRegionType::Memory32BitRegion => {
allocator.free_mmio_hole_addresses(region.start, region.length);
}
PciBarRegionType::Memory64BitRegion => {
mmio_allocator.free(region.start, region.length);
}
}
}
Ok(())
}
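// The reads below follow the MSI-X capability layout: bytes 0-1 hold the
// capability ID and the next pointer, bytes 2-3 the Message Control word,
// bytes 4-7 the Table Offset/BIR and bytes 8-11 the PBA Offset/BIR.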
pub(crate) fn parse_msix_capabilities(&mut self, cap: u8) -> MsixCap {
let msg_ctl = self.vfio_wrapper.read_config_word((cap + 2).into());
let table = self.vfio_wrapper.read_config_dword((cap + 4).into());
let pba = self.vfio_wrapper.read_config_dword((cap + 8).into());
MsixCap {
msg_ctl,
table,
pba,
}
}
pub(crate) fn initialize_msix(
&mut self,
msix_cap: MsixCap,
cap_offset: u32,
bdf: PciBdf,
state: Option<MsixConfigState>,
) {
let interrupt_source_group = self
.msi_interrupt_manager
.create_group(MsiIrqGroupConfig {
base: 0,
count: msix_cap.table_size() as InterruptIndex,
})
.unwrap();
let msix_config = MsixConfig::new(
msix_cap.table_size(),
interrupt_source_group.clone(),
bdf.into(),
state,
)
.unwrap();
self.interrupt.msix = Some(VfioMsix {
bar: msix_config,
cap: msix_cap,
cap_offset,
interrupt_source_group,
});
}
pub(crate) fn parse_msi_capabilities(&mut self, cap: u8) -> u16 {
self.vfio_wrapper.read_config_word((cap + 2).into())
}
pub(crate) fn initialize_msi(
&mut self,
msg_ctl: u16,
cap_offset: u32,
state: Option<MsiConfigState>,
) {
let interrupt_source_group = self
.msi_interrupt_manager
.create_group(MsiIrqGroupConfig {
base: 0,
count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex,
})
.unwrap();
let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone(), state).unwrap();
self.interrupt.msi = Some(VfioMsi {
cfg: msi_config,
cap_offset,
interrupt_source_group,
});
}
pub(crate) fn get_msix_cap_idx(&self) -> Option<usize> {
let mut cap_next = self
.vfio_wrapper
.read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);
while cap_next != 0 {
let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into());
if PciCapabilityId::from(cap_id) == PciCapabilityId::MsiX {
return Some(cap_next as usize);
} else {
cap_next = self.vfio_wrapper.read_config_byte((cap_next + 1).into());
}
}
None
}
pub(crate) fn parse_capabilities(&mut self, bdf: PciBdf) {
let mut cap_next = self
.vfio_wrapper
.read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);
let mut pci_express_cap_found = false;
let mut power_management_cap_found = false;
while cap_next != 0 {
let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into());
match PciCapabilityId::from(cap_id) {
PciCapabilityId::MessageSignalledInterrupts => {
if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) {
if irq_info.count > 0 {
// Parse capability only if the VFIO device
// supports MSI.
let msg_ctl = self.parse_msi_capabilities(cap_next);
self.initialize_msi(msg_ctl, cap_next as u32, None);
}
}
}
PciCapabilityId::MsiX => {
if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX)
{
if irq_info.count > 0 {
// Parse capability only if the VFIO device
// supports MSI-X.
let msix_cap = self.parse_msix_capabilities(cap_next);
self.initialize_msix(msix_cap, cap_next as u32, bdf, None);
}
}
}
PciCapabilityId::PciExpress => pci_express_cap_found = true,
PciCapabilityId::PowerManagement => power_management_cap_found = true,
_ => {}
};
cap_next = self.vfio_wrapper.read_config_byte((cap_next + 1).into());
}
if pci_express_cap_found && power_management_cap_found {
self.parse_extended_capabilities();
}
}
fn parse_extended_capabilities(&mut self) {
let mut current_offset = PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET;
loop {
let ext_cap_hdr = self.vfio_wrapper.read_config_dword(current_offset);
let cap_id: u16 = (ext_cap_hdr & 0xffff) as u16;
let cap_next: u16 = ((ext_cap_hdr >> 20) & 0xfff) as u16;
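// The 32-bit extended capability header packs the capability ID in
// bits [15:0], a version number in bits [19:16] and the offset of the
// next capability in bits [31:20], hence the shift and mask above.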
match PciExpressCapabilityId::from(cap_id) {
PciExpressCapabilityId::AlternativeRoutingIdentificationIntepretation
| PciExpressCapabilityId::ResizeableBar
| PciExpressCapabilityId::SingleRootIoVirtualization => {
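// Hide these capabilities from the guest by patching the low 16 bits
// of the header (the capability ID) with the NullCapability ID, while
// leaving the version and next pointer untouched.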
let reg_idx = (current_offset / 4) as usize;
self.patches.insert(
reg_idx,
ConfigPatch {
mask: 0x0000_ffff,
patch: PciExpressCapabilityId::NullCapability as u32,
},
);
}
_ => {}
}
if cap_next == 0 {
break;
}
current_offset = cap_next.into();
}
}
pub(crate) fn enable_intx(&mut self) -> Result<(), VfioPciError> {
if let Some(intx) = &mut self.interrupt.intx {
if !intx.enabled {
if let Some(eventfd) = intx.interrupt_source_group.notifier(0) {
self.vfio_wrapper
.enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd])
.map_err(VfioPciError::EnableIntx)?;
intx.enabled = true;
} else {
return Err(VfioPciError::MissingNotifier);
}
}
}
Ok(())
}
pub(crate) fn disable_intx(&mut self) {
if let Some(intx) = &mut self.interrupt.intx {
if intx.enabled {
if let Err(e) = self.vfio_wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) {
error!("Could not disable INTx: {}", e);
} else {
intx.enabled = false;
}
}
}
}
pub(crate) fn enable_msi(&self) -> Result<(), VfioPciError> {
if let Some(msi) = &self.interrupt.msi {
let mut irq_fds: Vec<EventFd> = Vec::new();
for i in 0..msi.cfg.num_enabled_vectors() {
if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) {
irq_fds.push(eventfd);
} else {
return Err(VfioPciError::MissingNotifier);
}
}
self.vfio_wrapper
.enable_msi(irq_fds.iter().collect())
.map_err(VfioPciError::EnableMsi)?;
}
Ok(())
}
pub(crate) fn disable_msi(&self) {
if let Err(e) = self.vfio_wrapper.disable_msi() {
error!("Could not disable MSI: {}", e);
}
}
pub(crate) fn enable_msix(&self) -> Result<(), VfioPciError> {
if let Some(msix) = &self.interrupt.msix {
let mut irq_fds: Vec<EventFd> = Vec::new();
for i in 0..msix.bar.table_entries.len() {
if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) {
irq_fds.push(eventfd);
} else {
return Err(VfioPciError::MissingNotifier);
}
}
self.vfio_wrapper
.enable_msix(irq_fds.iter().collect())
.map_err(VfioPciError::EnableMsix)?;
}
Ok(())
}
pub(crate) fn disable_msix(&self) {
if let Err(e) = self.vfio_wrapper.disable_msix() {
error!("Could not disable MSI-X: {}", e);
}
}
pub(crate) fn initialize_legacy_interrupt(&mut self) -> Result<(), VfioPciError> {
if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) {
if irq_info.count == 0 {
// A count of 0 means the INTx IRQ is not supported, therefore
// it shouldn't be initialized.
return Ok(());
}
}
if let Some(interrupt_source_group) = self.legacy_interrupt_group.clone() {
self.interrupt.intx = Some(VfioIntx {
interrupt_source_group,
enabled: false,
});
self.enable_intx()?;
}
Ok(())
}
pub(crate) fn update_msi_capabilities(
&mut self,
offset: u64,
data: &[u8],
) -> Result<(), VfioPciError> {
match self.interrupt.update_msi(offset, data) {
Some(InterruptUpdateAction::EnableMsi) => {
// Disable INTx before we can enable MSI
self.disable_intx();
self.enable_msi()?;
}
Some(InterruptUpdateAction::DisableMsi) => {
// Fallback onto INTx when disabling MSI
self.disable_msi();
self.enable_intx()?;
}
_ => {}
}
Ok(())
}
pub(crate) fn update_msix_capabilities(
&mut self,
offset: u64,
data: &[u8],
) -> Result<(), VfioPciError> {
match self.interrupt.update_msix(offset, data) {
Some(InterruptUpdateAction::EnableMsix) => {
// Disable INTx before we can enable MSI-X
self.disable_intx();
self.enable_msix()?;
}
Some(InterruptUpdateAction::DisableMsix) => {
// Fallback onto INTx when disabling MSI-X
self.disable_msix();
self.enable_intx()?;
}
_ => {}
}
Ok(())
}
pub(crate) fn find_region(&self, addr: u64) -> Option<MmioRegion> {
for region in self.mmio_regions.iter() {
if addr >= region.start.raw_value()
&& addr < region.start.unchecked_add(region.length).raw_value()
{
return Some(region.clone());
}
}
None
}
pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
let addr = base + offset;
if let Some(region) = self.find_region(addr) {
let offset = addr - region.start.raw_value();
if self.interrupt.msix_table_accessed(region.index, offset) {
self.interrupt.msix_read_table(offset, data);
} else {
self.vfio_wrapper.region_read(region.index, offset, data);
}
}
// INTx EOI
// The guest reading from the BAR potentially means the interrupt has
// been received and can be acknowledged.
if self.interrupt.intx_in_use() {
if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
error!("Failed unmasking INTx IRQ: {}", e);
}
}
}
pub(crate) fn write_bar(
&mut self,
base: u64,
offset: u64,
data: &[u8],
) -> Option<Arc<Barrier>> {
let addr = base + offset;
if let Some(region) = self.find_region(addr) {
let offset = addr - region.start.raw_value();
// If the MSI-X table is written to, we need to update our cache.
if self.interrupt.msix_table_accessed(region.index, offset) {
self.interrupt.msix_write_table(offset, data);
} else {
self.vfio_wrapper.region_write(region.index, offset, data);
}
}
// INTx EOI
// The guest writing to the BAR potentially means the interrupt has
// been received and can be acknowledged.
if self.interrupt.intx_in_use() {
if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
error!("Failed unmasking INTx IRQ: {}", e);
}
}
None
}
pub(crate) fn write_config_register(
&mut self,
reg_idx: usize,
offset: u64,
data: &[u8],
) -> Option<Arc<Barrier>> {
// When the guest wants to write to a BAR, we trap it into
// our local configuration space. We're not reprogramming the
// VFIO device.
if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
|| reg_idx == PCI_ROM_EXP_BAR_INDEX
{
// We keep our local cache updated with the BARs.
// We'll read it back from there when the guest is asking
// for BARs (see read_config_register()).
self.configuration
.write_config_register(reg_idx, offset, data);
return None;
}
let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;
// If the MSI or MSI-X capabilities are accessed, we need to
// update our local cache accordingly.
// Depending on how the capabilities are modified, this could
// trigger a VFIO MSI or MSI-X toggle.
if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
let cap_offset: u64 = reg - cap_base + offset;
match cap_id {
PciCapabilityId::MessageSignalledInterrupts => {
if let Err(e) = self.update_msi_capabilities(cap_offset, data) {
error!("Could not update MSI capabilities: {}", e);
}
}
PciCapabilityId::MsiX => {
if let Err(e) = self.update_msix_capabilities(cap_offset, data) {
error!("Could not update MSI-X capabilities: {}", e);
}
}
_ => {}
}
}
// Make sure to write to the device's PCI config space after MSI/MSI-X
// interrupts have been enabled/disabled. In case of MSI, when the
// interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
// the MSI Enable bit in the MSI capability structure found in the PCI
// config space is disabled by default. That's why when the guest is
// enabling this bit, we first need to enable the MSI interrupts with
// VFIO through the VFIO_DEVICE_SET_IRQS ioctl, and only then can we
// write to the device region to update the MSI Enable bit.
self.vfio_wrapper.write_config((reg + offset) as u32, data);
None
}
pub(crate) fn read_config_register(&mut self, reg_idx: usize) -> u32 {
// When reading the BARs, we trap it and return what comes
// from our local configuration space. We want the guest to
// use that and not the VFIO device BARs, as those do not map
// into the guest address space.
if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(&reg_idx)
|| reg_idx == PCI_ROM_EXP_BAR_INDEX
{
return self.configuration.read_reg(reg_idx);
}
if let Some(id) = self.get_msix_cap_idx() {
let msix = self.interrupt.msix.as_mut().unwrap();
if reg_idx * 4 == id + 4 {
return msix.cap.table;
} else if reg_idx * 4 == id + 8 {
return msix.cap.pba;
}
}
// Since we don't support passing multi-function devices, we
// should mask the multi-function bit, bit 7 of the Header Type
// byte in register 3.
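// Register 3 packs Cache Line Size, Latency Timer, Header Type and
// BIST; bit 7 of the Header Type byte is bit 23 of the register, hence
// the 0xff7f_ffff mask.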
let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
0xff7f_ffff
} else {
0xffff_ffff
};
// The config register read comes from the VFIO device itself.
let mut value = self.vfio_wrapper.read_config_dword((reg_idx * 4) as u32) & mask;
if let Some(config_patch) = self.patches.get(&reg_idx) {
value = (value & !config_patch.mask) | config_patch.patch;
}
value
}
fn state(&self) -> VfioCommonState {
let intx_state = self.interrupt.intx.as_ref().map(|intx| IntxState {
enabled: intx.enabled,
});
let msi_state = self.interrupt.msi.as_ref().map(|msi| MsiState {
cap: msi.cfg.cap,
cap_offset: msi.cap_offset,
});
let msix_state = self.interrupt.msix.as_ref().map(|msix| MsixState {
cap: msix.cap,
cap_offset: msix.cap_offset,
bdf: msix.bar.devid,
});
VfioCommonState {
intx_state,
msi_state,
msix_state,
}
}
fn set_state(
&mut self,
state: &VfioCommonState,
msi_state: Option<MsiConfigState>,
msix_state: Option<MsixConfigState>,
) -> Result<(), VfioPciError> {
if let (Some(intx), Some(interrupt_source_group)) =
(&state.intx_state, self.legacy_interrupt_group.clone())
{
self.interrupt.intx = Some(VfioIntx {
interrupt_source_group,
enabled: false,
});
if intx.enabled {
self.enable_intx()?;
}
}
if let Some(msi) = &state.msi_state {
self.initialize_msi(msi.cap.msg_ctl, msi.cap_offset, msi_state);
}
if let Some(msix) = &state.msix_state {
self.initialize_msix(msix.cap, msix.cap_offset, msix.bdf.into(), msix_state);
}
Ok(())
}
}
impl Pausable for VfioCommon {}
impl Snapshottable for VfioCommon {
fn id(&self) -> String {
String::from(VFIO_COMMON_ID)
}
fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
let mut vfio_common_snapshot = Snapshot::new_from_versioned_state(&self.state())?;
// Snapshot PciConfiguration
vfio_common_snapshot.add_snapshot(self.configuration.id(), self.configuration.snapshot()?);
// Snapshot MSI
if let Some(msi) = &mut self.interrupt.msi {
vfio_common_snapshot.add_snapshot(msi.cfg.id(), msi.cfg.snapshot()?);
}
// Snapshot MSI-X
if let Some(msix) = &mut self.interrupt.msix {
vfio_common_snapshot.add_snapshot(msix.bar.id(), msix.bar.snapshot()?);
}
Ok(vfio_common_snapshot)
}
}
/// VfioPciDevice represents a VFIO PCI device.
/// This structure implements the BusDevice and PciDevice traits.
///
/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
/// which then gets added to the PCI bus.
pub struct VfioPciDevice {
id: String,
vm: Arc<dyn hypervisor::Vm>,
device: Arc<VfioDevice>,
container: Arc<VfioContainer>,
common: VfioCommon,
iommu_attached: bool,
memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
}
impl VfioPciDevice {
/// Constructs a new Vfio Pci device for the given Vfio device
#[allow(clippy::too_many_arguments)]
pub fn new(
id: String,
vm: &Arc<dyn hypervisor::Vm>,
device: VfioDevice,
container: Arc<VfioContainer>,
msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
iommu_attached: bool,
bdf: PciBdf,
memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
snapshot: Option<Snapshot>,
) -> Result<Self, VfioPciError> {
let device = Arc::new(device);
device.reset();
let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device));
let common = VfioCommon::new(
msi_interrupt_manager,
legacy_interrupt_group,
Arc::new(vfio_wrapper) as Arc<dyn Vfio>,
&PciVfioSubclass::VfioSubclass,
bdf,
vm_migration::snapshot_from_id(snapshot.as_ref(), VFIO_COMMON_ID),
)?;
let vfio_pci_device = VfioPciDevice {
id,
vm: vm.clone(),
device,
container,
common,
iommu_attached,
memory_slot,
};
Ok(vfio_pci_device)
}
pub fn iommu_attached(&self) -> bool {
self.iommu_attached
}
fn generate_sparse_areas(
caps: &[VfioRegionInfoCap],
region_index: u32,
region_start: u64,
region_size: u64,
vfio_msix: Option<&VfioMsix>,
) -> Result<Vec<VfioRegionSparseMmapArea>, VfioPciError> {
for cap in caps {
match cap {
VfioRegionInfoCap::SparseMmap(sparse_mmap) => return Ok(sparse_mmap.areas.clone()),
VfioRegionInfoCap::MsixMappable => {
if !is_4k_aligned(region_start) {
error!(
"Region start address 0x{:x} must be at least aligned on 4KiB",
region_start
);
return Err(VfioPciError::RegionAlignment);
}
if !is_4k_multiple(region_size) {
error!(
"Region size 0x{:x} must be at least a multiple of 4KiB",
region_size
);
return Err(VfioPciError::RegionSize);
}
// In case the region contains the MSI-X vectors table or
// the MSI-X PBA table, we must calculate the subregions
// around them, leading to a list of sparse areas.
// We want to make sure we will still trap MMIO accesses
// to these MSI-X specific ranges. If these ranges are not page size
// aligned, we achieve this by enlarging them.
//
// Using a BTreeMap, as the list provided through its iterator is sorted
// by key. This ensures proper split of the whole region.
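// As an illustration (hypothetical layout, assuming a 4 KiB page size
// and page-aligned table bounds): a 0x4000 byte region with the MSI-X
// table at [0x1000, 0x2000) splits into two mmap-able sparse areas,
// [0x0, 0x1000) and [0x2000, 0x4000), leaving the table itself trapped.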
let mut inter_ranges = BTreeMap::new();
if let Some(msix) = vfio_msix {
if region_index == msix.cap.table_bir() {
let (offset, size) = msix.cap.table_range();
let offset = align_page_size_down(offset);
let size = align_page_size_up(size);
inter_ranges.insert(offset, size);
}
if region_index == msix.cap.pba_bir() {
let (offset, size) = msix.cap.pba_range();
let offset = align_page_size_down(offset);
let size = align_page_size_up(size);
inter_ranges.insert(offset, size);
}
}
let mut sparse_areas = Vec::new();
let mut current_offset = 0;
for (range_offset, range_size) in inter_ranges {
if range_offset > current_offset {
sparse_areas.push(VfioRegionSparseMmapArea {
offset: current_offset,
size: range_offset - current_offset,
});
}
current_offset = align_page_size_down(range_offset + range_size);
}
if region_size > current_offset {
sparse_areas.push(VfioRegionSparseMmapArea {
offset: current_offset,
size: region_size - current_offset,
});
}
return Ok(sparse_areas);
}
_ => {}
}
}
// In case no relevant capabilities have been found, create a single
// sparse area corresponding to the entire MMIO region.
Ok(vec![VfioRegionSparseMmapArea {
offset: 0,
size: region_size,
}])
}
/// Map MMIO regions into the guest, and avoid VM exits when the guest tries
/// to reach those regions.
///
/// This uses `self.vm` to register the VFIO MMIO regions as user memory
/// regions, and `self.memory_slot` to allocate a slot for each of them.
pub fn map_mmio_regions(&mut self) -> Result<(), VfioPciError> {
let fd = self.device.as_raw_fd();
for region in self.common.mmio_regions.iter_mut() {
let region_flags = self.device.get_region_flags(region.index);
if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
let mut prot = 0;
if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
prot |= libc::PROT_READ;
}
if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
prot |= libc::PROT_WRITE;
}
// Retrieve the list of capabilities found on the region
let caps = if region_flags & VFIO_REGION_INFO_FLAG_CAPS != 0 {
self.device.get_region_caps(region.index)
} else {
Vec::new()
};
// Don't try to mmap the region if it contains the MSI-X table or
// the MSI-X PBA subregion and we couldn't find MSIX_MAPPABLE in
// the list of supported capabilities.
if let Some(msix) = self.common.interrupt.msix.as_ref() {
if (region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir())
&& !caps.contains(&VfioRegionInfoCap::MsixMappable)
{
continue;
}
}
let mmap_size = self.device.get_region_size(region.index);
let mmap_offset = self.device.get_region_offset(region.index);
let sparse_areas = Self::generate_sparse_areas(
&caps,
region.index,
region.start.0,
mmap_size,
self.common.interrupt.msix.as_ref(),
)?;
for area in sparse_areas.iter() {
// SAFETY: FFI call with correct arguments
let host_addr = unsafe {
libc::mmap(
null_mut(),
area.size as usize,
prot,
libc::MAP_SHARED,
fd,
mmap_offset as libc::off_t + area.offset as libc::off_t,
)
};
if host_addr == libc::MAP_FAILED {
error!(
"Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}",
area.offset,
area.size,
std::io::Error::last_os_error()
);
return Err(VfioPciError::MmapArea);
}
if !is_page_size_aligned(area.size) || !is_page_size_aligned(area.offset) {
warn!(
"Could not mmap sparse area that is not page size aligned (offset = 0x{:x}, size = 0x{:x})",
area.offset,
area.size,
);
return Ok(());
}
let user_memory_region = UserMemoryRegion {
slot: (self.memory_slot)(),
start: region.start.0 + area.offset,
size: area.size,
host_addr: host_addr as u64,
};
region.user_memory_regions.push(user_memory_region);
let mem_region = self.vm.make_user_memory_region(
user_memory_region.slot,
user_memory_region.start,
user_memory_region.size,
user_memory_region.host_addr,
false,
false,
);
self.vm
.create_user_memory_region(mem_region)
.map_err(VfioPciError::CreateUserMemoryRegion)?;
}
}
}
Ok(())
}
pub fn unmap_mmio_regions(&mut self) {
for region in self.common.mmio_regions.iter() {
for user_memory_region in region.user_memory_regions.iter() {
// Remove region
let r = self.vm.make_user_memory_region(
user_memory_region.slot,
user_memory_region.start,
user_memory_region.size,
user_memory_region.host_addr,
false,
false,
);
if let Err(e) = self.vm.remove_user_memory_region(r) {
error!("Could not remove the userspace memory region: {}", e);
}
// SAFETY: FFI call with correct arguments
let ret = unsafe {
libc::munmap(
user_memory_region.host_addr as *mut libc::c_void,
user_memory_region.size as usize,
)
};
if ret != 0 {
error!(
"Could not unmap region {}, error:{}",
region.index,
io::Error::last_os_error()
);
}
}
}
}
pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> {
if !self.iommu_attached {
self.container
.vfio_dma_map(iova, size, user_addr)
.map_err(VfioPciError::DmaMap)?;
}
Ok(())
}
pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> {
if !self.iommu_attached {
self.container
.vfio_dma_unmap(iova, size)
.map_err(VfioPciError::DmaUnmap)?;
}
Ok(())
}
pub fn mmio_regions(&self) -> Vec<MmioRegion> {
self.common.mmio_regions.clone()
}
}
impl Drop for VfioPciDevice {
fn drop(&mut self) {
self.unmap_mmio_regions();
if let Some(msix) = &self.common.interrupt.msix {
if msix.bar.enabled() {
self.common.disable_msix();
}
}
if let Some(msi) = &self.common.interrupt.msi {
if msi.cfg.enabled() {
self.common.disable_msi()
}
}
if self.common.interrupt.intx_in_use() {
self.common.disable_intx();
}
}
}
impl BusDevice for VfioPciDevice {
fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
self.read_bar(base, offset, data)
}
fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
self.write_bar(base, offset, data)
}
}
// First BAR offset in the PCI config space.
const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
// Capability register offset in the PCI config space.
const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
// Extended capabilities register offset in the PCI config space.
const PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET: u32 = 0x100;
// A BAR is an IO BAR when its lowest bit is 1.
const PCI_CONFIG_IO_BAR: u32 = 0x1;
// 64-bit memory bar flag.
const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
// Prefetchable BAR bit
const PCI_CONFIG_BAR_PREFETCHABLE: u32 = 0x8;
// PCI config register size (4 bytes).
const PCI_CONFIG_REGISTER_SIZE: usize = 4;
// Number of BARs for a PCI device
const BAR_NUMS: usize = 6;
// PCI Header Type register index
const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
// First BAR register index
const PCI_CONFIG_BAR0_INDEX: usize = 4;
// PCI ROM expansion BAR register index
const PCI_ROM_EXP_BAR_INDEX: usize = 12;
impl PciDevice for VfioPciDevice {
fn allocate_bars(
&mut self,
allocator: &Arc<Mutex<SystemAllocator>>,
mmio_allocator: &mut AddressAllocator,
resources: Option<Vec<Resource>>,
) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
self.common
.allocate_bars(allocator, mmio_allocator, resources)
}
fn free_bars(
&mut self,
allocator: &mut SystemAllocator,
mmio_allocator: &mut AddressAllocator,
) -> Result<(), PciDeviceError> {
self.common.free_bars(allocator, mmio_allocator)
}
fn write_config_register(
&mut self,
reg_idx: usize,
offset: u64,
data: &[u8],
) -> Option<Arc<Barrier>> {
self.common.write_config_register(reg_idx, offset, data)
}
fn read_config_register(&mut self, reg_idx: usize) -> u32 {
self.common.read_config_register(reg_idx)
}
fn detect_bar_reprogramming(
&mut self,
reg_idx: usize,
data: &[u8],
) -> Option<BarReprogrammingParams> {
self.common
.configuration
.detect_bar_reprogramming(reg_idx, data)
}
fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
self.common.read_bar(base, offset, data)
}
fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
self.common.write_bar(base, offset, data)
}
fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> {
for region in self.common.mmio_regions.iter_mut() {
if region.start.raw_value() == old_base {
region.start = GuestAddress(new_base);
for user_memory_region in region.user_memory_regions.iter_mut() {
// Remove old region
let old_mem_region = self.vm.make_user_memory_region(
user_memory_region.slot,
user_memory_region.start,
user_memory_region.size,
user_memory_region.host_addr,
false,
false,
);
self.vm
.remove_user_memory_region(old_mem_region)
.map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
// Update the user memory region with the correct start address.
if new_base > old_base {
user_memory_region.start += new_base - old_base;
} else {
user_memory_region.start -= old_base - new_base;
}
// Insert new region
let new_mem_region = self.vm.make_user_memory_region(
user_memory_region.slot,
user_memory_region.start,
user_memory_region.size,
user_memory_region.host_addr,
false,
false,
);
self.vm
.create_user_memory_region(new_mem_region)
.map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
}
}
}
Ok(())
}
fn as_any(&mut self) -> &mut dyn Any {
self
}
fn id(&self) -> Option<String> {
Some(self.id.clone())
}
}
impl Pausable for VfioPciDevice {}
impl Snapshottable for VfioPciDevice {
fn id(&self) -> String {
self.id.clone()
}
fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
let mut vfio_pci_dev_snapshot = Snapshot::default();
// Snapshot VfioCommon
vfio_pci_dev_snapshot.add_snapshot(self.common.id(), self.common.snapshot()?);
Ok(vfio_pci_dev_snapshot)
}
}
impl Transportable for VfioPciDevice {}
impl Migratable for VfioPciDevice {}