mirror of
https://github.com/cloud-hypervisor/cloud-hypervisor.git
synced 2025-01-18 18:45:19 +00:00
f0c1f8d079
For all VFIO devices, map all non-emulated MMIO regions to the vfio container to allow PCIe P2P between all VFIO devices on the virtual machine. This is required for a wide variety of advanced GPU workloads such as GPUDirect P2P (DMA between two GPUs), GPUDirect RDMA (DMA between a GPU and an IB device). Signed-off-by: Thomas Barrett <tbarrett@crusoeenergy.com>
1840 lines
62 KiB
Rust
1840 lines
62 KiB
Rust
// Copyright © 2019 Intel Corporation
|
|
//
|
|
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
|
|
//
|
|
|
|
use crate::msi::{MsiConfigState, MSI_CONFIG_ID};
|
|
use crate::msix::MsixConfigState;
|
|
use crate::{
|
|
msi_num_enabled_vectors, BarReprogrammingParams, MsiCap, MsiConfig, MsixCap, MsixConfig,
|
|
PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciBdf, PciCapabilityId,
|
|
PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciExpressCapabilityId,
|
|
PciHeaderType, PciSubclass, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, PCI_CONFIGURATION_ID,
|
|
};
|
|
use anyhow::anyhow;
|
|
use byteorder::{ByteOrder, LittleEndian};
|
|
use hypervisor::HypervisorVmError;
|
|
use libc::{sysconf, _SC_PAGESIZE};
|
|
use std::any::Any;
|
|
use std::collections::{BTreeMap, HashMap};
|
|
use std::io;
|
|
use std::os::unix::io::AsRawFd;
|
|
use std::ptr::null_mut;
|
|
use std::sync::{Arc, Barrier, Mutex};
|
|
use thiserror::Error;
|
|
use versionize::{VersionMap, Versionize, VersionizeResult};
|
|
use versionize_derive::Versionize;
|
|
use vfio_bindings::bindings::vfio::*;
|
|
use vfio_ioctls::{
|
|
VfioContainer, VfioDevice, VfioIrq, VfioRegionInfoCap, VfioRegionSparseMmapArea,
|
|
};
|
|
use vm_allocator::page_size::{
|
|
align_page_size_down, align_page_size_up, is_4k_aligned, is_4k_multiple, is_page_size_aligned,
|
|
};
|
|
use vm_allocator::{AddressAllocator, SystemAllocator};
|
|
use vm_device::interrupt::{
|
|
InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig,
|
|
};
|
|
use vm_device::{BusDevice, Resource};
|
|
use vm_memory::{Address, GuestAddress, GuestUsize};
|
|
use vm_migration::{
|
|
Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, VersionMapped,
|
|
};
|
|
use vmm_sys_util::eventfd::EventFd;
|
|
|
|
/// String identifier associated with the common VFIO device state.
// NOTE(review): presumably the snapshot id for `VfioCommon`; its use is not
// visible in this part of the file — confirm.
pub(crate) const VFIO_COMMON_ID: &str = "vfio_common";
|
|
|
|
#[derive(Debug, Error)]
|
|
pub enum VfioPciError {
|
|
#[error("Failed to create user memory region: {0}")]
|
|
CreateUserMemoryRegion(#[source] HypervisorVmError),
|
|
#[error("Failed to DMA map: {0}")]
|
|
DmaMap(#[source] vfio_ioctls::VfioError),
|
|
#[error("Failed to DMA unmap: {0}")]
|
|
DmaUnmap(#[source] vfio_ioctls::VfioError),
|
|
#[error("Failed to enable INTx: {0}")]
|
|
EnableIntx(#[source] VfioError),
|
|
#[error("Failed to enable MSI: {0}")]
|
|
EnableMsi(#[source] VfioError),
|
|
#[error("Failed to enable MSI-x: {0}")]
|
|
EnableMsix(#[source] VfioError),
|
|
#[error("Failed to mmap the area")]
|
|
MmapArea,
|
|
#[error("Failed to notifier's eventfd")]
|
|
MissingNotifier,
|
|
#[error("Invalid region alignment")]
|
|
RegionAlignment,
|
|
#[error("Invalid region size")]
|
|
RegionSize,
|
|
#[error("Failed to retrieve MsiConfigState: {0}")]
|
|
RetrieveMsiConfigState(#[source] anyhow::Error),
|
|
#[error("Failed to retrieve MsixConfigState: {0}")]
|
|
RetrieveMsixConfigState(#[source] anyhow::Error),
|
|
#[error("Failed to retrieve PciConfigurationState: {0}")]
|
|
RetrievePciConfigurationState(#[source] anyhow::Error),
|
|
#[error("Failed to retrieve VfioCommonState: {0}")]
|
|
RetrieveVfioCommonState(#[source] anyhow::Error),
|
|
}
|
|
|
|
#[derive(Copy, Clone)]
|
|
enum PciVfioSubclass {
|
|
VfioSubclass = 0xff,
|
|
}
|
|
|
|
impl PciSubclass for PciVfioSubclass {
|
|
fn get_register_value(&self) -> u8 {
|
|
*self as u8
|
|
}
|
|
}
|
|
|
|
/// Action to take after a guest write changed the enabled state of the MSI
/// or MSI-X capability.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum InterruptUpdateAction {
    /// MSI went from disabled to enabled.
    EnableMsi,
    /// MSI went from enabled to disabled.
    DisableMsi,
    /// MSI-X went from disabled to enabled.
    EnableMsix,
    /// MSI-X went from enabled to disabled.
    DisableMsix,
}
|
|
|
|
#[derive(Versionize)]
|
|
struct IntxState {
|
|
enabled: bool,
|
|
}
|
|
|
|
pub(crate) struct VfioIntx {
|
|
interrupt_source_group: Arc<dyn InterruptSourceGroup>,
|
|
enabled: bool,
|
|
}
|
|
|
|
#[derive(Versionize)]
|
|
struct MsiState {
|
|
cap: MsiCap,
|
|
cap_offset: u32,
|
|
}
|
|
|
|
pub(crate) struct VfioMsi {
|
|
pub(crate) cfg: MsiConfig,
|
|
cap_offset: u32,
|
|
interrupt_source_group: Arc<dyn InterruptSourceGroup>,
|
|
}
|
|
|
|
impl VfioMsi {
|
|
fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
|
|
let old_enabled = self.cfg.enabled();
|
|
|
|
self.cfg.update(offset, data);
|
|
|
|
let new_enabled = self.cfg.enabled();
|
|
|
|
if !old_enabled && new_enabled {
|
|
return Some(InterruptUpdateAction::EnableMsi);
|
|
}
|
|
|
|
if old_enabled && !new_enabled {
|
|
return Some(InterruptUpdateAction::DisableMsi);
|
|
}
|
|
|
|
None
|
|
}
|
|
}
|
|
|
|
#[derive(Versionize)]
|
|
struct MsixState {
|
|
cap: MsixCap,
|
|
cap_offset: u32,
|
|
bdf: u32,
|
|
}
|
|
|
|
pub(crate) struct VfioMsix {
|
|
pub(crate) bar: MsixConfig,
|
|
cap: MsixCap,
|
|
cap_offset: u32,
|
|
interrupt_source_group: Arc<dyn InterruptSourceGroup>,
|
|
}
|
|
|
|
impl VfioMsix {
|
|
fn update(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
|
|
let old_enabled = self.bar.enabled();
|
|
|
|
// Update "Message Control" word
|
|
if offset == 2 && data.len() == 2 {
|
|
self.bar.set_msg_ctl(LittleEndian::read_u16(data));
|
|
}
|
|
|
|
let new_enabled = self.bar.enabled();
|
|
|
|
if !old_enabled && new_enabled {
|
|
return Some(InterruptUpdateAction::EnableMsix);
|
|
}
|
|
|
|
if old_enabled && !new_enabled {
|
|
return Some(InterruptUpdateAction::DisableMsix);
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
fn table_accessed(&self, bar_index: u32, offset: u64) -> bool {
|
|
let table_offset: u64 = u64::from(self.cap.table_offset());
|
|
let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64);
|
|
let table_bir: u32 = self.cap.table_bir();
|
|
|
|
bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size
|
|
}
|
|
}
|
|
|
|
pub(crate) struct Interrupt {
|
|
pub(crate) intx: Option<VfioIntx>,
|
|
pub(crate) msi: Option<VfioMsi>,
|
|
pub(crate) msix: Option<VfioMsix>,
|
|
}
|
|
|
|
impl Interrupt {
|
|
fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
|
|
if let Some(ref mut msi) = &mut self.msi {
|
|
let action = msi.update(offset, data);
|
|
return action;
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option<InterruptUpdateAction> {
|
|
if let Some(ref mut msix) = &mut self.msix {
|
|
let action = msix.update(offset, data);
|
|
return action;
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> {
|
|
if let Some(msi) = &self.msi {
|
|
if offset >= u64::from(msi.cap_offset)
|
|
&& offset < u64::from(msi.cap_offset) + msi.cfg.size()
|
|
{
|
|
return Some((
|
|
PciCapabilityId::MessageSignalledInterrupts,
|
|
u64::from(msi.cap_offset),
|
|
));
|
|
}
|
|
}
|
|
|
|
if let Some(msix) = &self.msix {
|
|
if offset == u64::from(msix.cap_offset) {
|
|
return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset)));
|
|
}
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool {
|
|
if let Some(msix) = &self.msix {
|
|
return msix.table_accessed(bar_index, offset);
|
|
}
|
|
|
|
false
|
|
}
|
|
|
|
fn msix_write_table(&mut self, offset: u64, data: &[u8]) {
|
|
if let Some(ref mut msix) = &mut self.msix {
|
|
let offset = offset - u64::from(msix.cap.table_offset());
|
|
msix.bar.write_table(offset, data)
|
|
}
|
|
}
|
|
|
|
fn msix_read_table(&self, offset: u64, data: &mut [u8]) {
|
|
if let Some(msix) = &self.msix {
|
|
let offset = offset - u64::from(msix.cap.table_offset());
|
|
msix.bar.read_table(offset, data)
|
|
}
|
|
}
|
|
|
|
pub(crate) fn intx_in_use(&self) -> bool {
|
|
if let Some(intx) = &self.intx {
|
|
return intx.enabled;
|
|
}
|
|
|
|
false
|
|
}
|
|
}
|
|
|
|
/// Description of one user memory region backing part of an MMIO region.
// NOTE(review): field meanings below are inferred from the names; the code
// that fills them in is not visible in this part of the file — confirm.
#[derive(Debug, Copy, Clone)]
pub struct UserMemoryRegion {
    /// Memory slot number.
    pub slot: u32,
    /// Start address of the region (presumably a guest address).
    pub start: u64,
    /// Size of the region in bytes.
    pub size: u64,
    /// Host address backing the region.
    pub host_addr: u64,
}
|
|
|
|
#[derive(Clone)]
|
|
pub struct MmioRegion {
|
|
pub start: GuestAddress,
|
|
pub length: GuestUsize,
|
|
pub(crate) type_: PciBarRegionType,
|
|
pub(crate) index: u32,
|
|
pub(crate) user_memory_regions: Vec<UserMemoryRegion>,
|
|
}
|
|
#[derive(Debug, Error)]
|
|
pub enum VfioError {
|
|
#[error("Kernel VFIO error: {0}")]
|
|
KernelVfio(#[source] vfio_ioctls::VfioError),
|
|
#[error("VFIO user error: {0}")]
|
|
VfioUser(#[source] vfio_user::Error),
|
|
}
|
|
|
|
pub(crate) trait Vfio: Send + Sync {
|
|
fn read_config_byte(&self, offset: u32) -> u8 {
|
|
let mut data: [u8; 1] = [0];
|
|
self.read_config(offset, &mut data);
|
|
data[0]
|
|
}
|
|
|
|
fn read_config_word(&self, offset: u32) -> u16 {
|
|
let mut data: [u8; 2] = [0, 0];
|
|
self.read_config(offset, &mut data);
|
|
u16::from_le_bytes(data)
|
|
}
|
|
|
|
fn read_config_dword(&self, offset: u32) -> u32 {
|
|
let mut data: [u8; 4] = [0, 0, 0, 0];
|
|
self.read_config(offset, &mut data);
|
|
u32::from_le_bytes(data)
|
|
}
|
|
|
|
fn write_config_dword(&self, offset: u32, buf: u32) {
|
|
let data: [u8; 4] = buf.to_le_bytes();
|
|
self.write_config(offset, &data)
|
|
}
|
|
|
|
fn read_config(&self, offset: u32, data: &mut [u8]) {
|
|
self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut());
|
|
}
|
|
|
|
fn write_config(&self, offset: u32, data: &[u8]) {
|
|
self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data)
|
|
}
|
|
|
|
fn enable_msi(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
|
|
self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds)
|
|
}
|
|
|
|
fn disable_msi(&self) -> Result<(), VfioError> {
|
|
self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX)
|
|
}
|
|
|
|
fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> {
|
|
self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds)
|
|
}
|
|
|
|
fn disable_msix(&self) -> Result<(), VfioError> {
|
|
self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX)
|
|
}
|
|
|
|
fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) {
|
|
unimplemented!()
|
|
}
|
|
|
|
fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) {
|
|
unimplemented!()
|
|
}
|
|
|
|
fn get_irq_info(&self, _irq_index: u32) -> Option<VfioIrq> {
|
|
unimplemented!()
|
|
}
|
|
|
|
fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
|
|
unimplemented!()
|
|
}
|
|
|
|
fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
|
|
unimplemented!()
|
|
}
|
|
|
|
fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> {
|
|
unimplemented!()
|
|
}
|
|
}
|
|
|
|
struct VfioDeviceWrapper {
|
|
device: Arc<VfioDevice>,
|
|
}
|
|
|
|
impl VfioDeviceWrapper {
|
|
fn new(device: Arc<VfioDevice>) -> Self {
|
|
Self { device }
|
|
}
|
|
}
|
|
|
|
impl Vfio for VfioDeviceWrapper {
|
|
fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) {
|
|
self.device.region_read(index, data, offset)
|
|
}
|
|
|
|
fn region_write(&self, index: u32, offset: u64, data: &[u8]) {
|
|
self.device.region_write(index, data, offset)
|
|
}
|
|
|
|
fn get_irq_info(&self, irq_index: u32) -> Option<VfioIrq> {
|
|
self.device.get_irq_info(irq_index).copied()
|
|
}
|
|
|
|
fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> {
|
|
self.device
|
|
.enable_irq(irq_index, event_fds)
|
|
.map_err(VfioError::KernelVfio)
|
|
}
|
|
|
|
fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> {
|
|
self.device
|
|
.disable_irq(irq_index)
|
|
.map_err(VfioError::KernelVfio)
|
|
}
|
|
|
|
fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> {
|
|
self.device
|
|
.unmask_irq(irq_index)
|
|
.map_err(VfioError::KernelVfio)
|
|
}
|
|
}
|
|
|
|
#[derive(Versionize)]
|
|
struct VfioCommonState {
|
|
intx_state: Option<IntxState>,
|
|
msi_state: Option<MsiState>,
|
|
msix_state: Option<MsixState>,
|
|
}
|
|
|
|
impl VersionMapped for VfioCommonState {}
|
|
|
|
/// Patch recorded for one configuration space register (see
/// `VfioCommon::patches`, keyed by register index).
// NOTE(review): the code applying the patch is not visible in this part of
// the file; presumably bits selected by `mask` are replaced with `patch`
// when the guest reads the register — confirm.
pub(crate) struct ConfigPatch {
    /// Bits of the register covered by the patch.
    mask: u32,
    /// Value associated with the masked bits.
    patch: u32,
}
|
|
|
|
pub(crate) struct VfioCommon {
|
|
pub(crate) configuration: PciConfiguration,
|
|
pub(crate) mmio_regions: Vec<MmioRegion>,
|
|
pub(crate) interrupt: Interrupt,
|
|
pub(crate) msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
|
|
pub(crate) legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
|
|
pub(crate) vfio_wrapper: Arc<dyn Vfio>,
|
|
pub(crate) patches: HashMap<usize, ConfigPatch>,
|
|
}
|
|
|
|
impl VfioCommon {
|
|
pub(crate) fn new(
|
|
msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
|
|
legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
|
|
vfio_wrapper: Arc<dyn Vfio>,
|
|
subclass: &dyn PciSubclass,
|
|
bdf: PciBdf,
|
|
snapshot: Option<Snapshot>,
|
|
) -> Result<Self, VfioPciError> {
|
|
let pci_configuration_state =
|
|
vm_migration::versioned_state_from_id(snapshot.as_ref(), PCI_CONFIGURATION_ID)
|
|
.map_err(|e| {
|
|
VfioPciError::RetrievePciConfigurationState(anyhow!(
|
|
"Failed to get PciConfigurationState from Snapshot: {}",
|
|
e
|
|
))
|
|
})?;
|
|
|
|
let configuration = PciConfiguration::new(
|
|
0,
|
|
0,
|
|
0,
|
|
PciClassCode::Other,
|
|
subclass,
|
|
None,
|
|
PciHeaderType::Device,
|
|
0,
|
|
0,
|
|
None,
|
|
pci_configuration_state,
|
|
);
|
|
|
|
let mut vfio_common = VfioCommon {
|
|
mmio_regions: Vec::new(),
|
|
configuration,
|
|
interrupt: Interrupt {
|
|
intx: None,
|
|
msi: None,
|
|
msix: None,
|
|
},
|
|
msi_interrupt_manager,
|
|
legacy_interrupt_group,
|
|
vfio_wrapper,
|
|
patches: HashMap::new(),
|
|
};
|
|
|
|
let state: Option<VfioCommonState> = snapshot
|
|
.as_ref()
|
|
.map(|s| s.to_versioned_state())
|
|
.transpose()
|
|
.map_err(|e| {
|
|
VfioPciError::RetrieveVfioCommonState(anyhow!(
|
|
"Failed to get VfioCommonState from Snapshot: {}",
|
|
e
|
|
))
|
|
})?;
|
|
let msi_state = vm_migration::versioned_state_from_id(snapshot.as_ref(), MSI_CONFIG_ID)
|
|
.map_err(|e| {
|
|
VfioPciError::RetrieveMsiConfigState(anyhow!(
|
|
"Failed to get MsiConfigState from Snapshot: {}",
|
|
e
|
|
))
|
|
})?;
|
|
let msix_state = vm_migration::versioned_state_from_id(snapshot.as_ref(), MSIX_CONFIG_ID)
|
|
.map_err(|e| {
|
|
VfioPciError::RetrieveMsixConfigState(anyhow!(
|
|
"Failed to get MsixConfigState from Snapshot: {}",
|
|
e
|
|
))
|
|
})?;
|
|
|
|
if let Some(state) = state.as_ref() {
|
|
vfio_common.set_state(state, msi_state, msix_state)?;
|
|
} else {
|
|
vfio_common.parse_capabilities(bdf);
|
|
vfio_common.initialize_legacy_interrupt()?;
|
|
}
|
|
|
|
Ok(vfio_common)
|
|
}
|
|
|
|
/// In case msix table offset is not page size aligned, we need do some fixup to achieve it.
|
|
/// Because we don't want the MMIO RW region and trap region overlap each other.
|
|
fn fixup_msix_region(&mut self, bar_id: u32, region_size: u64) -> u64 {
|
|
if let Some(msix) = self.interrupt.msix.as_mut() {
|
|
let msix_cap = &mut msix.cap;
|
|
|
|
// Suppose table_bir equals to pba_bir here. Am I right?
|
|
let (table_offset, table_size) = msix_cap.table_range();
|
|
if is_page_size_aligned(table_offset) || msix_cap.table_bir() != bar_id {
|
|
return region_size;
|
|
}
|
|
|
|
let (pba_offset, pba_size) = msix_cap.pba_range();
|
|
let msix_sz = align_page_size_up(table_size + pba_size);
|
|
// Expand region to hold RW and trap region which both page size aligned
|
|
let size = std::cmp::max(region_size * 2, msix_sz * 2);
|
|
// let table starts from the middle of the region
|
|
msix_cap.table_set_offset((size / 2) as u32);
|
|
msix_cap.pba_set_offset((size / 2 + pba_offset - table_offset) as u32);
|
|
|
|
size
|
|
} else {
|
|
// MSI-X not supported for this device
|
|
region_size
|
|
}
|
|
}
|
|
|
|
// The `allocator` argument is unused on `aarch64`
|
|
#[allow(unused_variables)]
|
|
pub(crate) fn allocate_bars(
|
|
&mut self,
|
|
allocator: &Arc<Mutex<SystemAllocator>>,
|
|
mmio32_allocator: &mut AddressAllocator,
|
|
mmio64_allocator: &mut AddressAllocator,
|
|
resources: Option<Vec<Resource>>,
|
|
) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
|
|
let mut bars = Vec::new();
|
|
let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX;
|
|
|
|
// Going through all regular regions to compute the BAR size.
|
|
// We're not saving the BAR address to restore it, because we
|
|
// are going to allocate a guest address for each BAR and write
|
|
// that new address back.
|
|
while bar_id < VFIO_PCI_CONFIG_REGION_INDEX {
|
|
let mut region_size: u64 = 0;
|
|
let mut region_type = PciBarRegionType::Memory32BitRegion;
|
|
let mut prefetchable = PciBarPrefetchable::NotPrefetchable;
|
|
let mut flags: u32 = 0;
|
|
|
|
let mut restored_bar_addr = None;
|
|
if let Some(resources) = &resources {
|
|
for resource in resources {
|
|
if let Resource::PciBar {
|
|
index,
|
|
base,
|
|
size,
|
|
type_,
|
|
..
|
|
} = resource
|
|
{
|
|
if *index == bar_id as usize {
|
|
restored_bar_addr = Some(GuestAddress(*base));
|
|
region_size = *size;
|
|
region_type = PciBarRegionType::from(*type_);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if restored_bar_addr.is_none() {
|
|
bar_id += 1;
|
|
continue;
|
|
}
|
|
} else {
|
|
let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX {
|
|
(PCI_ROM_EXP_BAR_INDEX * 4) as u32
|
|
} else {
|
|
PCI_CONFIG_BAR_OFFSET + bar_id * 4
|
|
};
|
|
|
|
// First read flags
|
|
flags = self.vfio_wrapper.read_config_dword(bar_offset);
|
|
|
|
// Is this an IO BAR?
|
|
let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
|
|
matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR)
|
|
} else {
|
|
false
|
|
};
|
|
|
|
// Is this a 64-bit BAR?
|
|
let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX {
|
|
matches!(
|
|
flags & PCI_CONFIG_MEMORY_BAR_64BIT,
|
|
PCI_CONFIG_MEMORY_BAR_64BIT
|
|
)
|
|
} else {
|
|
false
|
|
};
|
|
|
|
if matches!(
|
|
flags & PCI_CONFIG_BAR_PREFETCHABLE,
|
|
PCI_CONFIG_BAR_PREFETCHABLE
|
|
) {
|
|
prefetchable = PciBarPrefetchable::Prefetchable
|
|
};
|
|
|
|
// To get size write all 1s
|
|
self.vfio_wrapper
|
|
.write_config_dword(bar_offset, 0xffff_ffff);
|
|
|
|
// And read back BAR value. The device will write zeros for bits it doesn't care about
|
|
let mut lower = self.vfio_wrapper.read_config_dword(bar_offset);
|
|
|
|
if io_bar {
|
|
// Mask flag bits (lowest 2 for I/O bars)
|
|
lower &= !0b11;
|
|
|
|
// BAR is not enabled
|
|
if lower == 0 {
|
|
bar_id += 1;
|
|
continue;
|
|
}
|
|
|
|
// IO BAR
|
|
region_type = PciBarRegionType::IoRegion;
|
|
|
|
// Invert bits and add 1 to calculate size
|
|
region_size = (!lower + 1) as u64;
|
|
} else if is_64bit_bar {
|
|
// 64 bits Memory BAR
|
|
region_type = PciBarRegionType::Memory64BitRegion;
|
|
|
|
// Query size of upper BAR of 64-bit BAR
|
|
let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4;
|
|
self.vfio_wrapper
|
|
.write_config_dword(upper_offset, 0xffff_ffff);
|
|
let upper = self.vfio_wrapper.read_config_dword(upper_offset);
|
|
|
|
let mut combined_size = u64::from(upper) << 32 | u64::from(lower);
|
|
|
|
// Mask out flag bits (lowest 4 for memory bars)
|
|
combined_size &= !0b1111;
|
|
|
|
// BAR is not enabled
|
|
if combined_size == 0 {
|
|
bar_id += 1;
|
|
continue;
|
|
}
|
|
|
|
// Invert and add 1 to to find size
|
|
region_size = !combined_size + 1;
|
|
} else {
|
|
region_type = PciBarRegionType::Memory32BitRegion;
|
|
|
|
// Mask out flag bits (lowest 4 for memory bars)
|
|
lower &= !0b1111;
|
|
|
|
if lower == 0 {
|
|
bar_id += 1;
|
|
continue;
|
|
}
|
|
|
|
// Invert and add 1 to to find size
|
|
region_size = (!lower + 1) as u64;
|
|
}
|
|
}
|
|
|
|
let bar_addr = match region_type {
|
|
PciBarRegionType::IoRegion => {
|
|
#[cfg(target_arch = "aarch64")]
|
|
unimplemented!();
|
|
|
|
// The address needs to be 4 bytes aligned.
|
|
#[cfg(not(target_arch = "aarch64"))]
|
|
allocator
|
|
.lock()
|
|
.unwrap()
|
|
.allocate_io_addresses(restored_bar_addr, region_size, Some(0x4))
|
|
.ok_or(PciDeviceError::IoAllocationFailed(region_size))?
|
|
}
|
|
PciBarRegionType::Memory32BitRegion => {
|
|
// BAR allocation must be naturally aligned
|
|
mmio32_allocator
|
|
.allocate(restored_bar_addr, region_size, Some(region_size))
|
|
.ok_or(PciDeviceError::IoAllocationFailed(region_size))?
|
|
}
|
|
PciBarRegionType::Memory64BitRegion => {
|
|
// We need do some fixup to keep MMIO RW region and msix cap region page size
|
|
// aligned.
|
|
region_size = self.fixup_msix_region(bar_id, region_size);
|
|
mmio64_allocator
|
|
.allocate(
|
|
restored_bar_addr,
|
|
region_size,
|
|
// SAFETY: FFI call. Trivially safe.
|
|
Some(unsafe { sysconf(_SC_PAGESIZE) as GuestUsize }),
|
|
)
|
|
.ok_or(PciDeviceError::IoAllocationFailed(region_size))?
|
|
}
|
|
};
|
|
|
|
// We can now build our BAR configuration block.
|
|
let bar = PciBarConfiguration::default()
|
|
.set_index(bar_id as usize)
|
|
.set_address(bar_addr.raw_value())
|
|
.set_size(region_size)
|
|
.set_region_type(region_type)
|
|
.set_prefetchable(prefetchable);
|
|
|
|
if bar_id == VFIO_PCI_ROM_REGION_INDEX {
|
|
self.configuration
|
|
.add_pci_rom_bar(&bar, flags & 0x1)
|
|
.map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
|
|
} else {
|
|
self.configuration
|
|
.add_pci_bar(&bar)
|
|
.map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?;
|
|
}
|
|
|
|
bars.push(bar);
|
|
self.mmio_regions.push(MmioRegion {
|
|
start: bar_addr,
|
|
length: region_size,
|
|
type_: region_type,
|
|
index: bar_id,
|
|
user_memory_regions: Vec::new(),
|
|
});
|
|
|
|
bar_id += 1;
|
|
if region_type == PciBarRegionType::Memory64BitRegion {
|
|
bar_id += 1;
|
|
}
|
|
}
|
|
|
|
Ok(bars)
|
|
}
|
|
|
|
// The `allocator` argument is unused on `aarch64`
|
|
#[allow(unused_variables)]
|
|
pub(crate) fn free_bars(
|
|
&mut self,
|
|
allocator: &mut SystemAllocator,
|
|
mmio32_allocator: &mut AddressAllocator,
|
|
mmio64_allocator: &mut AddressAllocator,
|
|
) -> Result<(), PciDeviceError> {
|
|
for region in self.mmio_regions.iter() {
|
|
match region.type_ {
|
|
PciBarRegionType::IoRegion => {
|
|
#[cfg(target_arch = "x86_64")]
|
|
allocator.free_io_addresses(region.start, region.length);
|
|
#[cfg(target_arch = "aarch64")]
|
|
error!("I/O region is not supported");
|
|
}
|
|
PciBarRegionType::Memory32BitRegion => {
|
|
mmio32_allocator.free(region.start, region.length);
|
|
}
|
|
PciBarRegionType::Memory64BitRegion => {
|
|
mmio64_allocator.free(region.start, region.length);
|
|
}
|
|
}
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
pub(crate) fn parse_msix_capabilities(&mut self, cap: u8) -> MsixCap {
|
|
let msg_ctl = self.vfio_wrapper.read_config_word((cap + 2).into());
|
|
|
|
let table = self.vfio_wrapper.read_config_dword((cap + 4).into());
|
|
|
|
let pba = self.vfio_wrapper.read_config_dword((cap + 8).into());
|
|
|
|
MsixCap {
|
|
msg_ctl,
|
|
table,
|
|
pba,
|
|
}
|
|
}
|
|
|
|
pub(crate) fn initialize_msix(
|
|
&mut self,
|
|
msix_cap: MsixCap,
|
|
cap_offset: u32,
|
|
bdf: PciBdf,
|
|
state: Option<MsixConfigState>,
|
|
) {
|
|
let interrupt_source_group = self
|
|
.msi_interrupt_manager
|
|
.create_group(MsiIrqGroupConfig {
|
|
base: 0,
|
|
count: msix_cap.table_size() as InterruptIndex,
|
|
})
|
|
.unwrap();
|
|
|
|
let msix_config = MsixConfig::new(
|
|
msix_cap.table_size(),
|
|
interrupt_source_group.clone(),
|
|
bdf.into(),
|
|
state,
|
|
)
|
|
.unwrap();
|
|
|
|
self.interrupt.msix = Some(VfioMsix {
|
|
bar: msix_config,
|
|
cap: msix_cap,
|
|
cap_offset,
|
|
interrupt_source_group,
|
|
});
|
|
}
|
|
|
|
pub(crate) fn parse_msi_capabilities(&mut self, cap: u8) -> u16 {
|
|
self.vfio_wrapper.read_config_word((cap + 2).into())
|
|
}
|
|
|
|
pub(crate) fn initialize_msi(
|
|
&mut self,
|
|
msg_ctl: u16,
|
|
cap_offset: u32,
|
|
state: Option<MsiConfigState>,
|
|
) {
|
|
let interrupt_source_group = self
|
|
.msi_interrupt_manager
|
|
.create_group(MsiIrqGroupConfig {
|
|
base: 0,
|
|
count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex,
|
|
})
|
|
.unwrap();
|
|
|
|
let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone(), state).unwrap();
|
|
|
|
self.interrupt.msi = Some(VfioMsi {
|
|
cfg: msi_config,
|
|
cap_offset,
|
|
interrupt_source_group,
|
|
});
|
|
}
|
|
|
|
pub(crate) fn get_msix_cap_idx(&self) -> Option<usize> {
|
|
let mut cap_next = self
|
|
.vfio_wrapper
|
|
.read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);
|
|
|
|
while cap_next != 0 {
|
|
let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into());
|
|
if PciCapabilityId::from(cap_id) == PciCapabilityId::MsiX {
|
|
return Some(cap_next as usize);
|
|
} else {
|
|
cap_next = self.vfio_wrapper.read_config_byte((cap_next + 1).into());
|
|
}
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
pub(crate) fn parse_capabilities(&mut self, bdf: PciBdf) {
|
|
let mut cap_next = self
|
|
.vfio_wrapper
|
|
.read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET);
|
|
|
|
let mut pci_express_cap_found = false;
|
|
let mut power_management_cap_found = false;
|
|
|
|
while cap_next != 0 {
|
|
let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into());
|
|
|
|
match PciCapabilityId::from(cap_id) {
|
|
PciCapabilityId::MessageSignalledInterrupts => {
|
|
if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) {
|
|
if irq_info.count > 0 {
|
|
// Parse capability only if the VFIO device
|
|
// supports MSI.
|
|
let msg_ctl = self.parse_msi_capabilities(cap_next);
|
|
self.initialize_msi(msg_ctl, cap_next as u32, None);
|
|
}
|
|
}
|
|
}
|
|
PciCapabilityId::MsiX => {
|
|
if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX)
|
|
{
|
|
if irq_info.count > 0 {
|
|
// Parse capability only if the VFIO device
|
|
// supports MSI-X.
|
|
let msix_cap = self.parse_msix_capabilities(cap_next);
|
|
self.initialize_msix(msix_cap, cap_next as u32, bdf, None);
|
|
}
|
|
}
|
|
}
|
|
PciCapabilityId::PciExpress => pci_express_cap_found = true,
|
|
PciCapabilityId::PowerManagement => power_management_cap_found = true,
|
|
_ => {}
|
|
};
|
|
|
|
cap_next = self.vfio_wrapper.read_config_byte((cap_next + 1).into());
|
|
}
|
|
|
|
if pci_express_cap_found && power_management_cap_found {
|
|
self.parse_extended_capabilities();
|
|
}
|
|
}
|
|
|
|
fn parse_extended_capabilities(&mut self) {
|
|
let mut current_offset = PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET;
|
|
|
|
loop {
|
|
let ext_cap_hdr = self.vfio_wrapper.read_config_dword(current_offset);
|
|
|
|
let cap_id: u16 = (ext_cap_hdr & 0xffff) as u16;
|
|
let cap_next: u16 = ((ext_cap_hdr >> 20) & 0xfff) as u16;
|
|
|
|
match PciExpressCapabilityId::from(cap_id) {
|
|
PciExpressCapabilityId::AlternativeRoutingIdentificationInterpretation
|
|
| PciExpressCapabilityId::ResizeableBar
|
|
| PciExpressCapabilityId::SingleRootIoVirtualization => {
|
|
let reg_idx = (current_offset / 4) as usize;
|
|
self.patches.insert(
|
|
reg_idx,
|
|
ConfigPatch {
|
|
mask: 0x0000_ffff,
|
|
patch: PciExpressCapabilityId::NullCapability as u32,
|
|
},
|
|
);
|
|
}
|
|
_ => {}
|
|
}
|
|
|
|
if cap_next == 0 {
|
|
break;
|
|
}
|
|
|
|
current_offset = cap_next.into();
|
|
}
|
|
}
|
|
|
|
pub(crate) fn enable_intx(&mut self) -> Result<(), VfioPciError> {
|
|
if let Some(intx) = &mut self.interrupt.intx {
|
|
if !intx.enabled {
|
|
if let Some(eventfd) = intx.interrupt_source_group.notifier(0) {
|
|
self.vfio_wrapper
|
|
.enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd])
|
|
.map_err(VfioPciError::EnableIntx)?;
|
|
|
|
intx.enabled = true;
|
|
} else {
|
|
return Err(VfioPciError::MissingNotifier);
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
pub(crate) fn disable_intx(&mut self) {
|
|
if let Some(intx) = &mut self.interrupt.intx {
|
|
if intx.enabled {
|
|
if let Err(e) = self.vfio_wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) {
|
|
error!("Could not disable INTx: {}", e);
|
|
} else {
|
|
intx.enabled = false;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
pub(crate) fn enable_msi(&self) -> Result<(), VfioPciError> {
|
|
if let Some(msi) = &self.interrupt.msi {
|
|
let mut irq_fds: Vec<EventFd> = Vec::new();
|
|
for i in 0..msi.cfg.num_enabled_vectors() {
|
|
if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) {
|
|
irq_fds.push(eventfd);
|
|
} else {
|
|
return Err(VfioPciError::MissingNotifier);
|
|
}
|
|
}
|
|
|
|
self.vfio_wrapper
|
|
.enable_msi(irq_fds.iter().collect())
|
|
.map_err(VfioPciError::EnableMsi)?;
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
pub(crate) fn disable_msi(&self) {
|
|
if let Err(e) = self.vfio_wrapper.disable_msi() {
|
|
error!("Could not disable MSI: {}", e);
|
|
}
|
|
}
|
|
|
|
pub(crate) fn enable_msix(&self) -> Result<(), VfioPciError> {
|
|
if let Some(msix) = &self.interrupt.msix {
|
|
let mut irq_fds: Vec<EventFd> = Vec::new();
|
|
for i in 0..msix.bar.table_entries.len() {
|
|
if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) {
|
|
irq_fds.push(eventfd);
|
|
} else {
|
|
return Err(VfioPciError::MissingNotifier);
|
|
}
|
|
}
|
|
|
|
self.vfio_wrapper
|
|
.enable_msix(irq_fds.iter().collect())
|
|
.map_err(VfioPciError::EnableMsix)?;
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
pub(crate) fn disable_msix(&self) {
|
|
if let Err(e) = self.vfio_wrapper.disable_msix() {
|
|
error!("Could not disable MSI-X: {}", e);
|
|
}
|
|
}
|
|
|
|
pub(crate) fn initialize_legacy_interrupt(&mut self) -> Result<(), VfioPciError> {
|
|
if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) {
|
|
if irq_info.count == 0 {
|
|
// A count of 0 means the INTx IRQ is not supported, therefore
|
|
// it shouldn't be initialized.
|
|
return Ok(());
|
|
}
|
|
}
|
|
|
|
if let Some(interrupt_source_group) = self.legacy_interrupt_group.clone() {
|
|
self.interrupt.intx = Some(VfioIntx {
|
|
interrupt_source_group,
|
|
enabled: false,
|
|
});
|
|
|
|
self.enable_intx()?;
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
pub(crate) fn update_msi_capabilities(
|
|
&mut self,
|
|
offset: u64,
|
|
data: &[u8],
|
|
) -> Result<(), VfioPciError> {
|
|
match self.interrupt.update_msi(offset, data) {
|
|
Some(InterruptUpdateAction::EnableMsi) => {
|
|
// Disable INTx before we can enable MSI
|
|
self.disable_intx();
|
|
self.enable_msi()?;
|
|
}
|
|
Some(InterruptUpdateAction::DisableMsi) => {
|
|
// Fallback onto INTx when disabling MSI
|
|
self.disable_msi();
|
|
self.enable_intx()?;
|
|
}
|
|
_ => {}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
pub(crate) fn update_msix_capabilities(
|
|
&mut self,
|
|
offset: u64,
|
|
data: &[u8],
|
|
) -> Result<(), VfioPciError> {
|
|
match self.interrupt.update_msix(offset, data) {
|
|
Some(InterruptUpdateAction::EnableMsix) => {
|
|
// Disable INTx before we can enable MSI-X
|
|
self.disable_intx();
|
|
self.enable_msix()?;
|
|
}
|
|
Some(InterruptUpdateAction::DisableMsix) => {
|
|
// Fallback onto INTx when disabling MSI-X
|
|
self.disable_msix();
|
|
self.enable_intx()?;
|
|
}
|
|
_ => {}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Returns a clone of the first MMIO region whose range
/// `[start, start + length)` contains `addr`, if any.
pub(crate) fn find_region(&self, addr: u64) -> Option<MmioRegion> {
    self.mmio_regions
        .iter()
        .find(|region| {
            addr >= region.start.raw_value()
                && addr < region.start.unchecked_add(region.length).raw_value()
        })
        .cloned()
}
|
|
|
|
pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
|
|
let addr = base + offset;
|
|
if let Some(region) = self.find_region(addr) {
|
|
let offset = addr - region.start.raw_value();
|
|
|
|
if self.interrupt.msix_table_accessed(region.index, offset) {
|
|
self.interrupt.msix_read_table(offset, data);
|
|
} else {
|
|
self.vfio_wrapper.region_read(region.index, offset, data);
|
|
}
|
|
}
|
|
|
|
// INTx EOI
|
|
// The guest reading from the BAR potentially means the interrupt has
|
|
// been received and can be acknowledged.
|
|
if self.interrupt.intx_in_use() {
|
|
if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
|
|
error!("Failed unmasking INTx IRQ: {}", e);
|
|
}
|
|
}
|
|
}
|
|
|
|
pub(crate) fn write_bar(
|
|
&mut self,
|
|
base: u64,
|
|
offset: u64,
|
|
data: &[u8],
|
|
) -> Option<Arc<Barrier>> {
|
|
let addr = base + offset;
|
|
if let Some(region) = self.find_region(addr) {
|
|
let offset = addr - region.start.raw_value();
|
|
|
|
// If the MSI-X table is written to, we need to update our cache.
|
|
if self.interrupt.msix_table_accessed(region.index, offset) {
|
|
self.interrupt.msix_write_table(offset, data);
|
|
} else {
|
|
self.vfio_wrapper.region_write(region.index, offset, data);
|
|
}
|
|
}
|
|
|
|
// INTx EOI
|
|
// The guest writing to the BAR potentially means the interrupt has
|
|
// been received and can be acknowledged.
|
|
if self.interrupt.intx_in_use() {
|
|
if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) {
|
|
error!("Failed unmasking INTx IRQ: {}", e);
|
|
}
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
pub(crate) fn write_config_register(
|
|
&mut self,
|
|
reg_idx: usize,
|
|
offset: u64,
|
|
data: &[u8],
|
|
) -> Option<Arc<Barrier>> {
|
|
// When the guest wants to write to a BAR, we trap it into
|
|
// our local configuration space. We're not reprogramming
|
|
// VFIO device.
|
|
if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(®_idx)
|
|
|| reg_idx == PCI_ROM_EXP_BAR_INDEX
|
|
{
|
|
// We keep our local cache updated with the BARs.
|
|
// We'll read it back from there when the guest is asking
|
|
// for BARs (see read_config_register()).
|
|
self.configuration
|
|
.write_config_register(reg_idx, offset, data);
|
|
return None;
|
|
}
|
|
|
|
let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64;
|
|
|
|
// If the MSI or MSI-X capabilities are accessed, we need to
|
|
// update our local cache accordingly.
|
|
// Depending on how the capabilities are modified, this could
|
|
// trigger a VFIO MSI or MSI-X toggle.
|
|
if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) {
|
|
let cap_offset: u64 = reg - cap_base + offset;
|
|
match cap_id {
|
|
PciCapabilityId::MessageSignalledInterrupts => {
|
|
if let Err(e) = self.update_msi_capabilities(cap_offset, data) {
|
|
error!("Could not update MSI capabilities: {}", e);
|
|
}
|
|
}
|
|
PciCapabilityId::MsiX => {
|
|
if let Err(e) = self.update_msix_capabilities(cap_offset, data) {
|
|
error!("Could not update MSI-X capabilities: {}", e);
|
|
}
|
|
}
|
|
_ => {}
|
|
}
|
|
}
|
|
|
|
// Make sure to write to the device's PCI config space after MSI/MSI-X
|
|
// interrupts have been enabled/disabled. In case of MSI, when the
|
|
// interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS),
|
|
// the MSI Enable bit in the MSI capability structure found in the PCI
|
|
// config space is disabled by default. That's why when the guest is
|
|
// enabling this bit, we first need to enable the MSI interrupts with
|
|
// VFIO through VFIO_DEVICE_SET_IRQS ioctl, and only after we can write
|
|
// to the device region to update the MSI Enable bit.
|
|
self.vfio_wrapper.write_config((reg + offset) as u32, data);
|
|
|
|
None
|
|
}
|
|
|
|
pub(crate) fn read_config_register(&mut self, reg_idx: usize) -> u32 {
|
|
// When reading the BARs, we trap it and return what comes
|
|
// from our local configuration space. We want the guest to
|
|
// use that and not the VFIO device BARs as it does not map
|
|
// with the guest address space.
|
|
if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(®_idx)
|
|
|| reg_idx == PCI_ROM_EXP_BAR_INDEX
|
|
{
|
|
return self.configuration.read_reg(reg_idx);
|
|
}
|
|
|
|
if let Some(id) = self.get_msix_cap_idx() {
|
|
let msix = self.interrupt.msix.as_mut().unwrap();
|
|
if reg_idx * 4 == id + 4 {
|
|
return msix.cap.table;
|
|
} else if reg_idx * 4 == id + 8 {
|
|
return msix.cap.pba;
|
|
}
|
|
}
|
|
|
|
// Since we don't support passing multi-functions devices, we should
|
|
// mask the multi-function bit, bit 7 of the Header Type byte on the
|
|
// register 3.
|
|
let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX {
|
|
0xff7f_ffff
|
|
} else {
|
|
0xffff_ffff
|
|
};
|
|
|
|
// The config register read comes from the VFIO device itself.
|
|
let mut value = self.vfio_wrapper.read_config_dword((reg_idx * 4) as u32) & mask;
|
|
|
|
if let Some(config_patch) = self.patches.get(®_idx) {
|
|
value = (value & !config_patch.mask) | config_patch.patch;
|
|
}
|
|
|
|
value
|
|
}
|
|
|
|
fn state(&self) -> VfioCommonState {
|
|
let intx_state = self.interrupt.intx.as_ref().map(|intx| IntxState {
|
|
enabled: intx.enabled,
|
|
});
|
|
|
|
let msi_state = self.interrupt.msi.as_ref().map(|msi| MsiState {
|
|
cap: msi.cfg.cap,
|
|
cap_offset: msi.cap_offset,
|
|
});
|
|
|
|
let msix_state = self.interrupt.msix.as_ref().map(|msix| MsixState {
|
|
cap: msix.cap,
|
|
cap_offset: msix.cap_offset,
|
|
bdf: msix.bar.devid,
|
|
});
|
|
|
|
VfioCommonState {
|
|
intx_state,
|
|
msi_state,
|
|
msix_state,
|
|
}
|
|
}
|
|
|
|
fn set_state(
|
|
&mut self,
|
|
state: &VfioCommonState,
|
|
msi_state: Option<MsiConfigState>,
|
|
msix_state: Option<MsixConfigState>,
|
|
) -> Result<(), VfioPciError> {
|
|
if let (Some(intx), Some(interrupt_source_group)) =
|
|
(&state.intx_state, self.legacy_interrupt_group.clone())
|
|
{
|
|
self.interrupt.intx = Some(VfioIntx {
|
|
interrupt_source_group,
|
|
enabled: false,
|
|
});
|
|
|
|
if intx.enabled {
|
|
self.enable_intx()?;
|
|
}
|
|
}
|
|
|
|
if let Some(msi) = &state.msi_state {
|
|
self.initialize_msi(msi.cap.msg_ctl, msi.cap_offset, msi_state);
|
|
}
|
|
|
|
if let Some(msix) = &state.msix_state {
|
|
self.initialize_msix(msix.cap, msix.cap_offset, msix.bdf.into(), msix_state);
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
// No `Pausable` method is overridden for `VfioCommon`; the empty body means
// the trait's provided behavior is used as-is.
impl Pausable for VfioCommon {}
|
|
|
|
impl Snapshottable for VfioCommon {
|
|
fn id(&self) -> String {
|
|
String::from(VFIO_COMMON_ID)
|
|
}
|
|
|
|
fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
|
|
let mut vfio_common_snapshot = Snapshot::new_from_versioned_state(&self.state())?;
|
|
|
|
// Snapshot PciConfiguration
|
|
vfio_common_snapshot.add_snapshot(self.configuration.id(), self.configuration.snapshot()?);
|
|
|
|
// Snapshot MSI
|
|
if let Some(msi) = &mut self.interrupt.msi {
|
|
vfio_common_snapshot.add_snapshot(msi.cfg.id(), msi.cfg.snapshot()?);
|
|
}
|
|
|
|
// Snapshot MSI-X
|
|
if let Some(msix) = &mut self.interrupt.msix {
|
|
vfio_common_snapshot.add_snapshot(msix.bar.id(), msix.bar.snapshot()?);
|
|
}
|
|
|
|
Ok(vfio_common_snapshot)
|
|
}
|
|
}
|
|
|
|
/// VfioPciDevice represents a VFIO PCI device.
|
|
/// This structure implements the BusDevice and PciDevice traits.
|
|
///
|
|
/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device.
|
|
/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice,
|
|
/// which then gets added to the PCI bus.
|
|
pub struct VfioPciDevice {
|
|
id: String,
|
|
vm: Arc<dyn hypervisor::Vm>,
|
|
device: Arc<VfioDevice>,
|
|
container: Arc<VfioContainer>,
|
|
common: VfioCommon,
|
|
iommu_attached: bool,
|
|
memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
|
|
}
|
|
|
|
impl VfioPciDevice {
|
|
/// Constructs a new Vfio Pci device for the given Vfio device
|
|
#[allow(clippy::too_many_arguments)]
|
|
pub fn new(
|
|
id: String,
|
|
vm: &Arc<dyn hypervisor::Vm>,
|
|
device: VfioDevice,
|
|
container: Arc<VfioContainer>,
|
|
msi_interrupt_manager: Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>,
|
|
legacy_interrupt_group: Option<Arc<dyn InterruptSourceGroup>>,
|
|
iommu_attached: bool,
|
|
bdf: PciBdf,
|
|
memory_slot: Arc<dyn Fn() -> u32 + Send + Sync>,
|
|
snapshot: Option<Snapshot>,
|
|
) -> Result<Self, VfioPciError> {
|
|
let device = Arc::new(device);
|
|
device.reset();
|
|
|
|
let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device));
|
|
|
|
let common = VfioCommon::new(
|
|
msi_interrupt_manager,
|
|
legacy_interrupt_group,
|
|
Arc::new(vfio_wrapper) as Arc<dyn Vfio>,
|
|
&PciVfioSubclass::VfioSubclass,
|
|
bdf,
|
|
vm_migration::snapshot_from_id(snapshot.as_ref(), VFIO_COMMON_ID),
|
|
)?;
|
|
|
|
let vfio_pci_device = VfioPciDevice {
|
|
id,
|
|
vm: vm.clone(),
|
|
device,
|
|
container,
|
|
common,
|
|
iommu_attached,
|
|
memory_slot,
|
|
};
|
|
|
|
Ok(vfio_pci_device)
|
|
}
|
|
|
|
pub fn iommu_attached(&self) -> bool {
|
|
self.iommu_attached
|
|
}
|
|
|
|
fn generate_sparse_areas(
|
|
caps: &[VfioRegionInfoCap],
|
|
region_index: u32,
|
|
region_start: u64,
|
|
region_size: u64,
|
|
vfio_msix: Option<&VfioMsix>,
|
|
) -> Result<Vec<VfioRegionSparseMmapArea>, VfioPciError> {
|
|
for cap in caps {
|
|
match cap {
|
|
VfioRegionInfoCap::SparseMmap(sparse_mmap) => return Ok(sparse_mmap.areas.clone()),
|
|
VfioRegionInfoCap::MsixMappable => {
|
|
if !is_4k_aligned(region_start) {
|
|
error!(
|
|
"Region start address 0x{:x} must be at least aligned on 4KiB",
|
|
region_start
|
|
);
|
|
return Err(VfioPciError::RegionAlignment);
|
|
}
|
|
if !is_4k_multiple(region_size) {
|
|
error!(
|
|
"Region size 0x{:x} must be at least a multiple of 4KiB",
|
|
region_size
|
|
);
|
|
return Err(VfioPciError::RegionSize);
|
|
}
|
|
|
|
// In case the region contains the MSI-X vectors table or
|
|
// the MSI-X PBA table, we must calculate the subregions
|
|
// around them, leading to a list of sparse areas.
|
|
// We want to make sure we will still trap MMIO accesses
|
|
// to these MSI-X specific ranges. If these region don't align
|
|
// with pagesize, we can achieve it by enlarging its range.
|
|
//
|
|
// Using a BtreeMap as the list provided through the iterator is sorted
|
|
// by key. This ensures proper split of the whole region.
|
|
let mut inter_ranges = BTreeMap::new();
|
|
if let Some(msix) = vfio_msix {
|
|
if region_index == msix.cap.table_bir() {
|
|
let (offset, size) = msix.cap.table_range();
|
|
let offset = align_page_size_down(offset);
|
|
let size = align_page_size_up(size);
|
|
inter_ranges.insert(offset, size);
|
|
}
|
|
if region_index == msix.cap.pba_bir() {
|
|
let (offset, size) = msix.cap.pba_range();
|
|
let offset = align_page_size_down(offset);
|
|
let size = align_page_size_up(size);
|
|
inter_ranges.insert(offset, size);
|
|
}
|
|
}
|
|
|
|
let mut sparse_areas = Vec::new();
|
|
let mut current_offset = 0;
|
|
for (range_offset, range_size) in inter_ranges {
|
|
if range_offset > current_offset {
|
|
sparse_areas.push(VfioRegionSparseMmapArea {
|
|
offset: current_offset,
|
|
size: range_offset - current_offset,
|
|
});
|
|
}
|
|
current_offset = align_page_size_down(range_offset + range_size);
|
|
}
|
|
|
|
if region_size > current_offset {
|
|
sparse_areas.push(VfioRegionSparseMmapArea {
|
|
offset: current_offset,
|
|
size: region_size - current_offset,
|
|
});
|
|
}
|
|
|
|
return Ok(sparse_areas);
|
|
}
|
|
_ => {}
|
|
}
|
|
}
|
|
|
|
// In case no relevant capabilities have been found, create a single
|
|
// sparse area corresponding to the entire MMIO region.
|
|
Ok(vec![VfioRegionSparseMmapArea {
|
|
offset: 0,
|
|
size: region_size,
|
|
}])
|
|
}
|
|
|
|
/// Map MMIO regions into the guest, and avoid VM exits when the guest tries
|
|
/// to reach those regions.
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `vm` - The VM object. It is used to set the VFIO MMIO regions
|
|
/// as user memory regions.
|
|
/// * `mem_slot` - The closure to return a memory slot.
|
|
pub fn map_mmio_regions(&mut self) -> Result<(), VfioPciError> {
|
|
let fd = self.device.as_raw_fd();
|
|
|
|
for region in self.common.mmio_regions.iter_mut() {
|
|
let region_flags = self.device.get_region_flags(region.index);
|
|
if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
|
|
let mut prot = 0;
|
|
if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
|
|
prot |= libc::PROT_READ;
|
|
}
|
|
if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
|
|
prot |= libc::PROT_WRITE;
|
|
}
|
|
|
|
// Retrieve the list of capabilities found on the region
|
|
let caps = if region_flags & VFIO_REGION_INFO_FLAG_CAPS != 0 {
|
|
self.device.get_region_caps(region.index)
|
|
} else {
|
|
Vec::new()
|
|
};
|
|
|
|
// Don't try to mmap the region if it contains MSI-X table or
|
|
// MSI-X PBA subregion, and if we couldn't find MSIX_MAPPABLE
|
|
// in the list of supported capabilities.
|
|
if let Some(msix) = self.common.interrupt.msix.as_ref() {
|
|
if (region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir())
|
|
&& !caps.contains(&VfioRegionInfoCap::MsixMappable)
|
|
{
|
|
continue;
|
|
}
|
|
}
|
|
|
|
let mmap_size = self.device.get_region_size(region.index);
|
|
let mmap_offset = self.device.get_region_offset(region.index);
|
|
|
|
let sparse_areas = Self::generate_sparse_areas(
|
|
&caps,
|
|
region.index,
|
|
region.start.0,
|
|
mmap_size,
|
|
self.common.interrupt.msix.as_ref(),
|
|
)?;
|
|
|
|
for area in sparse_areas.iter() {
|
|
// SAFETY: FFI call with correct arguments
|
|
let host_addr = unsafe {
|
|
libc::mmap(
|
|
null_mut(),
|
|
area.size as usize,
|
|
prot,
|
|
libc::MAP_SHARED,
|
|
fd,
|
|
mmap_offset as libc::off_t + area.offset as libc::off_t,
|
|
)
|
|
};
|
|
|
|
if host_addr == libc::MAP_FAILED {
|
|
error!(
|
|
"Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}",
|
|
area.offset,
|
|
area.size,
|
|
std::io::Error::last_os_error()
|
|
);
|
|
return Err(VfioPciError::MmapArea);
|
|
}
|
|
|
|
if !is_page_size_aligned(area.size) || !is_page_size_aligned(area.offset) {
|
|
warn!(
|
|
"Could not mmap sparse area that is not page size aligned (offset = 0x{:x}, size = 0x{:x})",
|
|
area.offset,
|
|
area.size,
|
|
);
|
|
return Ok(());
|
|
}
|
|
|
|
let user_memory_region = UserMemoryRegion {
|
|
slot: (self.memory_slot)(),
|
|
start: region.start.0 + area.offset,
|
|
size: area.size,
|
|
host_addr: host_addr as u64,
|
|
};
|
|
|
|
region.user_memory_regions.push(user_memory_region);
|
|
|
|
let mem_region = self.vm.make_user_memory_region(
|
|
user_memory_region.slot,
|
|
user_memory_region.start,
|
|
user_memory_region.size,
|
|
user_memory_region.host_addr,
|
|
false,
|
|
false,
|
|
);
|
|
|
|
self.vm
|
|
.create_user_memory_region(mem_region)
|
|
.map_err(VfioPciError::CreateUserMemoryRegion)?;
|
|
|
|
if !self.iommu_attached {
|
|
self.container
|
|
.vfio_dma_map(
|
|
user_memory_region.start,
|
|
user_memory_region.size,
|
|
user_memory_region.host_addr,
|
|
)
|
|
.map_err(VfioPciError::DmaMap)?;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
pub fn unmap_mmio_regions(&mut self) {
|
|
for region in self.common.mmio_regions.iter() {
|
|
for user_memory_region in region.user_memory_regions.iter() {
|
|
// Unmap from vfio container
|
|
if !self.iommu_attached {
|
|
if let Err(e) = self
|
|
.container
|
|
.vfio_dma_unmap(user_memory_region.start, user_memory_region.size)
|
|
{
|
|
error!("Could not unmap mmio region from vfio container: {}", e);
|
|
}
|
|
}
|
|
|
|
// Remove region
|
|
let r = self.vm.make_user_memory_region(
|
|
user_memory_region.slot,
|
|
user_memory_region.start,
|
|
user_memory_region.size,
|
|
user_memory_region.host_addr,
|
|
false,
|
|
false,
|
|
);
|
|
|
|
if let Err(e) = self.vm.remove_user_memory_region(r) {
|
|
error!("Could not remove the userspace memory region: {}", e);
|
|
}
|
|
|
|
// SAFETY: FFI call with correct arguments
|
|
let ret = unsafe {
|
|
libc::munmap(
|
|
user_memory_region.host_addr as *mut libc::c_void,
|
|
user_memory_region.size as usize,
|
|
)
|
|
};
|
|
if ret != 0 {
|
|
error!(
|
|
"Could not unmap region {}, error:{}",
|
|
region.index,
|
|
io::Error::last_os_error()
|
|
);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> {
|
|
if !self.iommu_attached {
|
|
self.container
|
|
.vfio_dma_map(iova, size, user_addr)
|
|
.map_err(VfioPciError::DmaMap)?;
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> {
|
|
if !self.iommu_attached {
|
|
self.container
|
|
.vfio_dma_unmap(iova, size)
|
|
.map_err(VfioPciError::DmaUnmap)?;
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Returns a copy of the MMIO regions currently tracked for this device.
pub fn mmio_regions(&self) -> Vec<MmioRegion> {
    self.common.mmio_regions.to_vec()
}
|
|
}
|
|
|
|
impl Drop for VfioPciDevice {
|
|
fn drop(&mut self) {
|
|
self.unmap_mmio_regions();
|
|
|
|
if let Some(msix) = &self.common.interrupt.msix {
|
|
if msix.bar.enabled() {
|
|
self.common.disable_msix();
|
|
}
|
|
}
|
|
|
|
if let Some(msi) = &self.common.interrupt.msi {
|
|
if msi.cfg.enabled() {
|
|
self.common.disable_msi()
|
|
}
|
|
}
|
|
|
|
if self.common.interrupt.intx_in_use() {
|
|
self.common.disable_intx();
|
|
}
|
|
}
|
|
}
|
|
|
|
impl BusDevice for VfioPciDevice {
|
|
fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {
|
|
self.read_bar(base, offset, data)
|
|
}
|
|
|
|
fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
|
|
self.write_bar(base, offset, data)
|
|
}
|
|
}
|
|
|
|
/// Byte offset of the first BAR in the PCI config space.
const PCI_CONFIG_BAR_OFFSET: u32 = 0x10;
/// Byte offset of the capability register in the PCI config space.
const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34;
/// Byte offset of the extended capabilities register in the PCI config space.
const PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET: u32 = 0x100;
/// A BAR is an IO BAR when its first bit is 1.
const PCI_CONFIG_IO_BAR: u32 = 0x1;
/// Flag marking a 64-bit memory BAR.
const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4;
/// Bit marking a prefetchable BAR.
const PCI_CONFIG_BAR_PREFETCHABLE: u32 = 0x8;
/// Size in bytes of one PCI config register.
const PCI_CONFIG_REGISTER_SIZE: usize = 4;
/// Number of BARs of a PCI device.
const BAR_NUMS: usize = 6;
/// Index of the register holding the PCI Header Type.
const PCI_HEADER_TYPE_REG_INDEX: usize = 3;
/// Index of the first BAR register.
const PCI_CONFIG_BAR0_INDEX: usize = 4;
/// Index of the PCI expansion ROM BAR register.
const PCI_ROM_EXP_BAR_INDEX: usize = 12;
|
|
|
|
impl PciDevice for VfioPciDevice {
|
|
fn allocate_bars(
|
|
&mut self,
|
|
allocator: &Arc<Mutex<SystemAllocator>>,
|
|
mmio32_allocator: &mut AddressAllocator,
|
|
mmio64_allocator: &mut AddressAllocator,
|
|
resources: Option<Vec<Resource>>,
|
|
) -> Result<Vec<PciBarConfiguration>, PciDeviceError> {
|
|
self.common
|
|
.allocate_bars(allocator, mmio32_allocator, mmio64_allocator, resources)
|
|
}
|
|
|
|
fn free_bars(
|
|
&mut self,
|
|
allocator: &mut SystemAllocator,
|
|
mmio32_allocator: &mut AddressAllocator,
|
|
mmio64_allocator: &mut AddressAllocator,
|
|
) -> Result<(), PciDeviceError> {
|
|
self.common
|
|
.free_bars(allocator, mmio32_allocator, mmio64_allocator)
|
|
}
|
|
|
|
fn write_config_register(
|
|
&mut self,
|
|
reg_idx: usize,
|
|
offset: u64,
|
|
data: &[u8],
|
|
) -> Option<Arc<Barrier>> {
|
|
self.common.write_config_register(reg_idx, offset, data)
|
|
}
|
|
|
|
fn read_config_register(&mut self, reg_idx: usize) -> u32 {
|
|
self.common.read_config_register(reg_idx)
|
|
}
|
|
|
|
fn detect_bar_reprogramming(
|
|
&mut self,
|
|
reg_idx: usize,
|
|
data: &[u8],
|
|
) -> Option<BarReprogrammingParams> {
|
|
self.common
|
|
.configuration
|
|
.detect_bar_reprogramming(reg_idx, data)
|
|
}
|
|
|
|
fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) {
|
|
self.common.read_bar(base, offset, data)
|
|
}
|
|
|
|
fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
|
|
self.common.write_bar(base, offset, data)
|
|
}
|
|
|
|
fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> {
|
|
for region in self.common.mmio_regions.iter_mut() {
|
|
if region.start.raw_value() == old_base {
|
|
region.start = GuestAddress(new_base);
|
|
|
|
for user_memory_region in region.user_memory_regions.iter_mut() {
|
|
// Remove old region
|
|
let old_mem_region = self.vm.make_user_memory_region(
|
|
user_memory_region.slot,
|
|
user_memory_region.start,
|
|
user_memory_region.size,
|
|
user_memory_region.host_addr,
|
|
false,
|
|
false,
|
|
);
|
|
|
|
self.vm
|
|
.remove_user_memory_region(old_mem_region)
|
|
.map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
|
|
|
|
// Update the user memory region with the correct start address.
|
|
if new_base > old_base {
|
|
user_memory_region.start += new_base - old_base;
|
|
} else {
|
|
user_memory_region.start -= old_base - new_base;
|
|
}
|
|
|
|
// Insert new region
|
|
let new_mem_region = self.vm.make_user_memory_region(
|
|
user_memory_region.slot,
|
|
user_memory_region.start,
|
|
user_memory_region.size,
|
|
user_memory_region.host_addr,
|
|
false,
|
|
false,
|
|
);
|
|
|
|
self.vm
|
|
.create_user_memory_region(new_mem_region)
|
|
.map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
fn as_any(&mut self) -> &mut dyn Any {
|
|
self
|
|
}
|
|
|
|
fn id(&self) -> Option<String> {
|
|
Some(self.id.clone())
|
|
}
|
|
}
|
|
|
|
// No `Pausable` method is overridden for `VfioPciDevice`; the empty body
// means the trait's provided behavior is used as-is.
impl Pausable for VfioPciDevice {}
|
|
|
|
impl Snapshottable for VfioPciDevice {
|
|
fn id(&self) -> String {
|
|
self.id.clone()
|
|
}
|
|
|
|
fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
|
|
let mut vfio_pci_dev_snapshot = Snapshot::default();
|
|
|
|
// Snapshot VfioCommon
|
|
vfio_pci_dev_snapshot.add_snapshot(self.common.id(), self.common.snapshot()?);
|
|
|
|
Ok(vfio_pci_dev_snapshot)
|
|
}
|
|
}
|
|
// No `Transportable` method is overridden for `VfioPciDevice`.
impl Transportable for VfioPciDevice {}
|
|
// No `Migratable` method is overridden for `VfioPciDevice`.
impl Migratable for VfioPciDevice {}
|