vmm: Create a single VFIO container

For most use cases, there is no need to create multiple VFIO containers
as it causes unwanted behaviors. Especially when passing multiple
devices from the same IOMMU group, we need to use the same container so
that it can properly list the groups that have already been opened. The
correct logic was already there in vfio-ioctls, but it was incorrectly
used from our VMM implementation.

For the special case where we put a VFIO device behind a vIOMMU, we must
create one container per device, as we need to control the DMA mappings
per device, which is performed at the container level. Because we must
keep one container per device, the vIOMMU use case prevents multiple
devices attached to the same IOMMU group from being passed through to the VM.
But this is a limitation that we are fine with, especially since the
vIOMMU doesn't allow multiple devices to share the same group from a
guest perspective.

Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
This commit is contained in:
Sebastien Boeuf 2021-09-14 11:02:10 +02:00 committed by Bo Chen
parent a5d77cdee9
commit a6040d7a30
2 changed files with 110 additions and 128 deletions

View File

@ -25,7 +25,6 @@ use crate::{VirtioInterrupt, VirtioInterruptType};
use anyhow::anyhow; use anyhow::anyhow;
use libc::EFD_NONBLOCK; use libc::EFD_NONBLOCK;
use seccompiler::SeccompAction; use seccompiler::SeccompAction;
use std::collections::BTreeMap;
use std::io; use std::io;
use std::mem::size_of; use std::mem::size_of;
use std::os::unix::io::{AsRawFd, RawFd}; use std::os::unix::io::{AsRawFd, RawFd};
@ -428,7 +427,7 @@ struct MemEpollHandler {
kill_evt: EventFd, kill_evt: EventFd,
pause_evt: EventFd, pause_evt: EventFd,
hugepages: bool, hugepages: bool,
dma_mapping_handlers: Arc<Mutex<BTreeMap<u32, Arc<dyn ExternalDmaMapping>>>>, dma_mapping_handler: Option<Arc<dyn ExternalDmaMapping>>,
} }
impl MemEpollHandler { impl MemEpollHandler {
@ -505,11 +504,10 @@ impl MemEpollHandler {
.unwrap() .unwrap()
.set_range(first_block_index, nb_blocks, plug); .set_range(first_block_index, nb_blocks, plug);
let handlers = self.dma_mapping_handlers.lock().unwrap();
if plug { if plug {
let mut gpa = addr; let mut gpa = addr;
for _ in 0..nb_blocks { for _ in 0..nb_blocks {
for (_, handler) in handlers.iter() { if let Some(handler) = &self.dma_mapping_handler {
if let Err(e) = handler.map(gpa, gpa, config.block_size) { if let Err(e) = handler.map(gpa, gpa, config.block_size) {
error!( error!(
"failed DMA mapping addr 0x{:x} size 0x{:x}: {}", "failed DMA mapping addr 0x{:x} size 0x{:x}: {}",
@ -524,7 +522,7 @@ impl MemEpollHandler {
config.plugged_size += size; config.plugged_size += size;
} else { } else {
for (_, handler) in handlers.iter() { if let Some(handler) = &self.dma_mapping_handler {
if let Err(e) = handler.unmap(addr, size) { if let Err(e) = handler.unmap(addr, size) {
error!( error!(
"failed DMA unmapping addr 0x{:x} size 0x{:x}: {}", "failed DMA unmapping addr 0x{:x} size 0x{:x}: {}",
@ -549,11 +547,10 @@ impl MemEpollHandler {
// Remaining plugged blocks are unmapped. // Remaining plugged blocks are unmapped.
if config.plugged_size > 0 { if config.plugged_size > 0 {
let handlers = self.dma_mapping_handlers.lock().unwrap();
for (idx, plugged) in self.blocks_state.lock().unwrap().inner().iter().enumerate() { for (idx, plugged) in self.blocks_state.lock().unwrap().inner().iter().enumerate() {
if *plugged { if *plugged {
let gpa = config.addr + (idx as u64 * config.block_size); let gpa = config.addr + (idx as u64 * config.block_size);
for (_, handler) in handlers.iter() { if let Some(handler) = &self.dma_mapping_handler {
if let Err(e) = handler.unmap(gpa, config.block_size) { if let Err(e) = handler.unmap(gpa, config.block_size) {
error!( error!(
"failed DMA unmapping addr 0x{:x} size 0x{:x}: {}", "failed DMA unmapping addr 0x{:x} size 0x{:x}: {}",
@ -744,7 +741,7 @@ pub struct Mem {
config: Arc<Mutex<VirtioMemConfig>>, config: Arc<Mutex<VirtioMemConfig>>,
seccomp_action: SeccompAction, seccomp_action: SeccompAction,
hugepages: bool, hugepages: bool,
dma_mapping_handlers: Arc<Mutex<BTreeMap<u32, Arc<dyn ExternalDmaMapping>>>>, dma_mapping_handler: Option<Arc<dyn ExternalDmaMapping>>,
blocks_state: Arc<Mutex<BlocksState>>, blocks_state: Arc<Mutex<BlocksState>>,
exit_evt: EventFd, exit_evt: EventFd,
} }
@ -832,7 +829,7 @@ impl Mem {
config: Arc::new(Mutex::new(config)), config: Arc::new(Mutex::new(config)),
seccomp_action, seccomp_action,
hugepages, hugepages,
dma_mapping_handlers: Arc::new(Mutex::new(BTreeMap::new())), dma_mapping_handler: None,
blocks_state: Arc::new(Mutex::new(BlocksState(vec![ blocks_state: Arc::new(Mutex::new(BlocksState(vec![
false; false;
(config.region_size / config.block_size) (config.region_size / config.block_size)
@ -844,7 +841,6 @@ impl Mem {
pub fn add_dma_mapping_handler( pub fn add_dma_mapping_handler(
&mut self, &mut self,
device_id: u32,
handler: Arc<dyn ExternalDmaMapping>, handler: Arc<dyn ExternalDmaMapping>,
) -> result::Result<(), Error> { ) -> result::Result<(), Error> {
let config = self.config.lock().unwrap(); let config = self.config.lock().unwrap();
@ -860,34 +856,7 @@ impl Mem {
} }
} }
self.dma_mapping_handlers self.dma_mapping_handler = Some(handler);
.lock()
.unwrap()
.insert(device_id, handler);
Ok(())
}
pub fn remove_dma_mapping_handler(&mut self, device_id: u32) -> result::Result<(), Error> {
let handler = self
.dma_mapping_handlers
.lock()
.unwrap()
.remove(&device_id)
.ok_or(Error::InvalidDmaMappingHandler)?;
let config = self.config.lock().unwrap();
if config.plugged_size > 0 {
for (idx, plugged) in self.blocks_state.lock().unwrap().inner().iter().enumerate() {
if *plugged {
let gpa = config.addr + (idx as u64 * config.block_size);
handler
.unmap(gpa, config.block_size)
.map_err(Error::DmaUnmap)?;
}
}
}
Ok(()) Ok(())
} }
@ -946,7 +915,7 @@ impl VirtioDevice for Mem {
kill_evt, kill_evt,
pause_evt, pause_evt,
hugepages: self.hugepages, hugepages: self.hugepages,
dma_mapping_handlers: Arc::clone(&self.dma_mapping_handlers), dma_mapping_handler: self.dma_mapping_handler.clone(),
}; };
handler handler

View File

@ -265,7 +265,7 @@ pub enum DeviceManagerError {
VfioMapRegion(pci::VfioPciError), VfioMapRegion(pci::VfioPciError),
/// Failed to DMA map VFIO device. /// Failed to DMA map VFIO device.
VfioDmaMap(pci::VfioPciError), VfioDmaMap(vfio_ioctls::VfioError),
/// Failed to DMA unmap VFIO device. /// Failed to DMA unmap VFIO device.
VfioDmaUnmap(pci::VfioPciError), VfioDmaUnmap(pci::VfioPciError),
@ -370,7 +370,7 @@ pub enum DeviceManagerError {
VirtioMemRangeAllocation, VirtioMemRangeAllocation,
/// Failed updating guest memory for VFIO PCI device. /// Failed updating guest memory for VFIO PCI device.
UpdateMemoryForVfioPciDevice(pci::VfioPciError), UpdateMemoryForVfioPciDevice(vfio_ioctls::VfioError),
/// Trying to use a directory for pmem but no size specified /// Trying to use a directory for pmem but no size specified
PmemWithDirectorySizeMissing, PmemWithDirectorySizeMissing,
@ -402,6 +402,9 @@ pub enum DeviceManagerError {
/// Missing virtio-balloon, can't proceed as expected. /// Missing virtio-balloon, can't proceed as expected.
MissingVirtioBalloon, MissingVirtioBalloon,
/// Missing virtual IOMMU device
MissingVirtualIommu,
/// Failed to do power button notification /// Failed to do power button notification
PowerButtonNotification(io::Error), PowerButtonNotification(io::Error),
@ -841,6 +844,11 @@ pub struct DeviceManager {
// Passthrough device handle // Passthrough device handle
passthrough_device: Option<Arc<dyn hypervisor::Device>>, passthrough_device: Option<Arc<dyn hypervisor::Device>>,
// VFIO container
// Only one container can be created, therefore it is stored as part of the
// DeviceManager to be reused.
vfio_container: Option<Arc<VfioContainer>>,
// Paravirtualized IOMMU // Paravirtualized IOMMU
iommu_device: Option<Arc<Mutex<virtio_devices::Iommu>>>, iommu_device: Option<Arc<Mutex<virtio_devices::Iommu>>>,
@ -961,6 +969,7 @@ impl DeviceManager {
msi_interrupt_manager, msi_interrupt_manager,
legacy_interrupt_manager: None, legacy_interrupt_manager: None,
passthrough_device: None, passthrough_device: None,
vfio_container: None,
iommu_device: None, iommu_device: None,
iommu_attached_devices: None, iommu_attached_devices: None,
pci_devices_up: 0, pci_devices_up: 0,
@ -2790,28 +2799,12 @@ impl DeviceManager {
self.add_vfio_device(pci, device_cfg) self.add_vfio_device(pci, device_cfg)
} }
fn add_vfio_device( fn create_vfio_container(&self) -> DeviceManagerResult<Arc<VfioContainer>> {
&mut self,
pci: &mut PciBus,
device_cfg: &mut DeviceConfig,
) -> DeviceManagerResult<(u32, String)> {
let passthrough_device = self let passthrough_device = self
.passthrough_device .passthrough_device
.as_ref() .as_ref()
.ok_or(DeviceManagerError::NoDevicePassthroughSupport)?; .ok_or(DeviceManagerError::NoDevicePassthroughSupport)?;
// We need to shift the device id since the 3 first bits
// are dedicated to the PCI function, and we know we don't
// do multifunction. Also, because we only support one PCI
// bus, the bus 0, we don't need to add anything to the
// global device ID.
let pci_device_bdf = pci
.next_device_id()
.map_err(DeviceManagerError::NextPciDeviceId)?
<< 3;
let memory = self.memory_manager.lock().unwrap().guest_memory();
// Safe because we know the RawFd is valid. // Safe because we know the RawFd is valid.
// //
// This dup() is mandatory to be able to give full ownership of the // This dup() is mandatory to be able to give full ownership of the
@ -2830,31 +2823,94 @@ impl DeviceManager {
// 1. When running on KVM or MSHV, passthrough_device wraps around DeviceFd. // 1. When running on KVM or MSHV, passthrough_device wraps around DeviceFd.
// 2. The conversion here extracts the raw fd and then turns the raw fd into a DeviceFd // 2. The conversion here extracts the raw fd and then turns the raw fd into a DeviceFd
// of the same (correct) type. // of the same (correct) type.
let vfio_container = Arc::new( Ok(Arc::new(
VfioContainer::new(Arc::new(unsafe { DeviceFd::from_raw_fd(dup_device_fd) })) VfioContainer::new(Arc::new(unsafe { DeviceFd::from_raw_fd(dup_device_fd) }))
.map_err(DeviceManagerError::VfioCreate)?, .map_err(DeviceManagerError::VfioCreate)?,
); ))
}
let vfio_device = VfioDevice::new(&device_cfg.path, Arc::clone(&vfio_container)) fn add_vfio_device(
.map_err(DeviceManagerError::VfioCreate)?; &mut self,
pci: &mut PciBus,
device_cfg: &mut DeviceConfig,
) -> DeviceManagerResult<(u32, String)> {
// We need to shift the device id since the 3 first bits
// are dedicated to the PCI function, and we know we don't
// do multifunction. Also, because we only support one PCI
// bus, the bus 0, we don't need to add anything to the
// global device ID.
let pci_device_bdf = pci
.next_device_id()
.map_err(DeviceManagerError::NextPciDeviceId)?
<< 3;
let mut needs_dma_mapping = false;
// Here we create a new VFIO container for two reasons. Either this is
// the first VFIO device, meaning we need a new VFIO container, which
// will be shared with other VFIO devices. Or the new VFIO device is
// attached to a vIOMMU, meaning we must create a dedicated VFIO
// container. In the vIOMMU use case, we can't let all devices under
// the same VFIO container since we couldn't map/unmap memory for each
// device. That's simply because the map/unmap operations happen at the
// VFIO container level.
let vfio_container = if device_cfg.iommu {
let vfio_container = self.create_vfio_container()?;
let vfio_mapping = Arc::new(VfioDmaMapping::new(
Arc::clone(&vfio_container),
Arc::new(self.memory_manager.lock().unwrap().guest_memory()),
));
let vfio_mapping = Arc::new(VfioDmaMapping::new(
Arc::clone(&vfio_container),
Arc::new(memory),
));
if device_cfg.iommu {
if let Some(iommu) = &self.iommu_device { if let Some(iommu) = &self.iommu_device {
iommu iommu
.lock() .lock()
.unwrap() .unwrap()
.add_external_mapping(pci_device_bdf, vfio_mapping); .add_external_mapping(pci_device_bdf, vfio_mapping);
} else {
return Err(DeviceManagerError::MissingVirtualIommu);
} }
vfio_container
} else if let Some(vfio_container) = &self.vfio_container {
Arc::clone(vfio_container)
} else { } else {
let vfio_container = self.create_vfio_container()?;
needs_dma_mapping = true;
self.vfio_container = Some(Arc::clone(&vfio_container));
vfio_container
};
let vfio_device = VfioDevice::new(&device_cfg.path, Arc::clone(&vfio_container))
.map_err(DeviceManagerError::VfioCreate)?;
if needs_dma_mapping {
// Register DMA mapping in IOMMU.
// Do not register virtio-mem regions, as they are handled directly by
// virtio-mem device itself.
for (_, zone) in self.memory_manager.lock().unwrap().memory_zones().iter() {
for region in zone.regions() {
vfio_container
.vfio_dma_map(
region.start_addr().raw_value(),
region.len() as u64,
region.as_ptr() as u64,
)
.map_err(DeviceManagerError::VfioDmaMap)?;
}
}
let vfio_mapping = Arc::new(VfioDmaMapping::new(
Arc::clone(&vfio_container),
Arc::new(self.memory_manager.lock().unwrap().guest_memory()),
));
for virtio_mem_device in self.virtio_mem_devices.iter() { for virtio_mem_device in self.virtio_mem_devices.iter() {
virtio_mem_device virtio_mem_device
.lock() .lock()
.unwrap() .unwrap()
.add_dma_mapping_handler(pci_device_bdf, vfio_mapping.clone()) .add_dma_mapping_handler(vfio_mapping.clone())
.map_err(DeviceManagerError::AddDmaMappingHandlerVirtioMem)?; .map_err(DeviceManagerError::AddDmaMappingHandlerVirtioMem)?;
} }
} }
@ -2910,21 +2966,6 @@ impl DeviceManager {
}); });
} }
// Register DMA mapping in IOMMU.
// Do not register virtio-mem regions, as they are handled directly by
// virtio-mem device itself.
for (_, zone) in self.memory_manager.lock().unwrap().memory_zones().iter() {
for region in zone.regions() {
vfio_pci_device
.dma_map(
region.start_addr().raw_value(),
region.len() as u64,
region.as_ptr() as u64,
)
.map_err(DeviceManagerError::VfioDmaMap)?;
}
}
let vfio_pci_device = Arc::new(Mutex::new(vfio_pci_device)); let vfio_pci_device = Arc::new(Mutex::new(vfio_pci_device));
self.add_pci_device( self.add_pci_device(
@ -3266,8 +3307,19 @@ impl DeviceManager {
.map_err(DeviceManagerError::UpdateMemoryForVirtioDevice)?; .map_err(DeviceManagerError::UpdateMemoryForVirtioDevice)?;
} }
#[allow(clippy::single_match)]
// Take care of updating the memory for VFIO PCI devices. // Take care of updating the memory for VFIO PCI devices.
if let Some(vfio_container) = &self.vfio_container {
vfio_container
.vfio_dma_map(
new_region.start_addr().raw_value(),
new_region.len() as u64,
new_region.as_ptr() as u64,
)
.map_err(DeviceManagerError::UpdateMemoryForVfioPciDevice)?;
}
#[allow(clippy::single_match)]
// Take care of updating the memory for vfio-user devices.
{ {
let device_tree = self.device_tree.lock().unwrap(); let device_tree = self.device_tree.lock().unwrap();
for pci_device_node in device_tree.pci_devices() { for pci_device_node in device_tree.pci_devices() {
@ -3276,18 +3328,6 @@ impl DeviceManager {
.as_ref() .as_ref()
.ok_or(DeviceManagerError::MissingPciDevice)? .ok_or(DeviceManagerError::MissingPciDevice)?
{ {
#[cfg(feature = "kvm")]
PciDeviceHandle::Vfio(vfio_pci_device) => {
vfio_pci_device
.lock()
.unwrap()
.dma_map(
new_region.start_addr().raw_value(),
new_region.len() as u64,
new_region.as_ptr() as u64,
)
.map_err(DeviceManagerError::UpdateMemoryForVfioPciDevice)?;
}
PciDeviceHandle::VfioUser(vfio_user_pci_device) => { PciDeviceHandle::VfioUser(vfio_user_pci_device) => {
vfio_user_pci_device vfio_user_pci_device
.lock() .lock()
@ -3466,38 +3506,11 @@ impl DeviceManager {
.pci_device_handle .pci_device_handle
.ok_or(DeviceManagerError::MissingPciDevice)?; .ok_or(DeviceManagerError::MissingPciDevice)?;
let (pci_device, bus_device, virtio_device) = match pci_device_handle { let (pci_device, bus_device, virtio_device) = match pci_device_handle {
PciDeviceHandle::Vfio(vfio_pci_device) => { PciDeviceHandle::Vfio(vfio_pci_device) => (
{ Arc::clone(&vfio_pci_device) as Arc<Mutex<dyn PciDevice>>,
// Unregister DMA mapping in IOMMU. Arc::clone(&vfio_pci_device) as Arc<Mutex<dyn BusDevice>>,
// Do not unregister the virtio-mem region, as it is None as Option<VirtioDeviceArc>,
// directly handled by the virtio-mem device. ),
let dev = vfio_pci_device.lock().unwrap();
for (_, zone) in self.memory_manager.lock().unwrap().memory_zones().iter() {
for region in zone.regions() {
dev.dma_unmap(region.start_addr().raw_value(), region.len() as u64)
.map_err(DeviceManagerError::VfioDmaUnmap)?;
}
}
// Unregister the VFIO mapping handler from all virtio-mem
// devices.
if !dev.iommu_attached() {
for virtio_mem_device in self.virtio_mem_devices.iter() {
virtio_mem_device
.lock()
.unwrap()
.remove_dma_mapping_handler(pci_device_bdf)
.map_err(DeviceManagerError::RemoveDmaMappingHandlerVirtioMem)?;
}
}
}
(
Arc::clone(&vfio_pci_device) as Arc<Mutex<dyn PciDevice>>,
Arc::clone(&vfio_pci_device) as Arc<Mutex<dyn BusDevice>>,
None as Option<VirtioDeviceArc>,
)
}
PciDeviceHandle::Virtio(virtio_pci_device) => { PciDeviceHandle::Virtio(virtio_pci_device) => {
let bar_addr = virtio_pci_device.lock().unwrap().config_bar_addr(); let bar_addr = virtio_pci_device.lock().unwrap().config_bar_addr();
for (event, addr) in virtio_pci_device.lock().unwrap().ioeventfds(bar_addr) { for (event, addr) in virtio_pci_device.lock().unwrap().ioeventfds(bar_addr) {