vmm: add pci_segment mmio aperture configs

When using multiple PCI segments, the 32-bit and 64-bit mmio
apertures are split equally among the segments. Add an option
to configure the 'weight' of each segment. For example, a PCI
segment with a `mmio32_aperture_weight` of 2 will be allocated twice
as much 32-bit mmio space as a segment with the default weight of 1.

Signed-off-by: Thomas Barrett <tbarrett@crusoeenergy.com>
This commit is contained in:
Thomas Barrett 2024-04-14 19:50:45 +00:00 committed by Bo Chen
parent a84bc06874
commit e7e856d8ac
8 changed files with 334 additions and 16 deletions

View File

@ -123,3 +123,30 @@ $ ls /sys/kernel/iommu_groups/22/devices/
This means these two devices are under the same IOMMU group 22. In such a case,
it is important to bind both devices to VFIO and pass them both through to the
VM, otherwise this could cause some functional and security issues.
### Advanced Configuration Options
Some VFIO devices have a 32-bit mmio BAR. When using many such devices, it is
possible to exhaust the 32-bit mmio space available on a PCI segment. The
following example shows a device with a 16 MiB 32-bit mmio BAR.
```
lspci -s 0000:01:00.0 -v
0000:01:00.0 3D controller: NVIDIA Corporation Device 26b9 (rev a1)
[...]
Memory at f9000000 (32-bit, non-prefetchable) [size=16M]
Memory at 46000000000 (64-bit, prefetchable) [size=64G]
Memory at 48040000000 (64-bit, prefetchable) [size=32M]
[...]
```
When using multiple PCI segments, the 32-bit mmio address space available to
be allocated to VFIO devices is split equally between all PCI segments by
default. This can be tuned with the `--pci-segment` flag, which assigns each
segment a relative weight (1 when not specified). The following example
demonstrates a guest with two PCI segments. 2/3 of the 32-bit mmio address
space is available for use by devices on PCI segment 0 and 1/3 of the 32-bit
mmio address space is available for use by devices on PCI segment 1. The
64-bit mmio address space can be weighted the same way with
`mmio64_aperture_weight`.
```
--platform num_pci_segments=2
--pci-segment pci_segment=0,mmio32_aperture_weight=2
--pci-segment pci_segment=1,mmio32_aperture_weight=1
```

View File

@ -187,6 +187,7 @@ impl RequestHandler for StubApiRequestHandler {
watchdog: false, watchdog: false,
#[cfg(feature = "guest_debug")] #[cfg(feature = "guest_debug")]
gdb: false, gdb: false,
pci_segments: None,
platform: None, platform: None,
tpm: None, tpm: None,
preserved_fds: None, preserved_fds: None,

View File

@ -355,6 +355,13 @@ fn create_app(default_vcpus: String, default_memory: String, default_rng: String
.num_args(1..) .num_args(1..)
.group("vm-config"), .group("vm-config"),
) )
.arg(
Arg::new("pci-segment")
.long("pci-segment")
.help(config::PciSegmentConfig::SYNTAX)
.num_args(1..)
.group("vm-config"),
)
.arg( .arg(
Arg::new("watchdog") Arg::new("watchdog")
.long("watchdog") .long("watchdog")
@ -934,6 +941,7 @@ mod unit_tests {
watchdog: false, watchdog: false,
#[cfg(feature = "guest_debug")] #[cfg(feature = "guest_debug")]
gdb: false, gdb: false,
pci_segments: None,
platform: None, platform: None,
tpm: None, tpm: None,
preserved_fds: None, preserved_fds: None,

View File

@ -618,6 +618,10 @@ components:
pvpanic: pvpanic:
type: boolean type: boolean
default: false default: false
pci_segments:
type: array
items:
$ref: "#/components/schemas/PciSegmentConfig"
platform: platform:
$ref: "#/components/schemas/PlatformConfig" $ref: "#/components/schemas/PlatformConfig"
tpm: tpm:
@ -683,6 +687,21 @@ components:
features: features:
$ref: "#/components/schemas/CpuFeatures" $ref: "#/components/schemas/CpuFeatures"
# Per-PCI-segment settings controlling how the mmio apertures are shared
# between segments (CLI counterpart: `--pci-segment`).
PciSegmentConfig:
required:
- pci_segment
type: object
properties:
# Id of the PCI segment being configured; validation rejects ids that are
# not below the platform's number of PCI segments.
pci_segment:
type: integer
format: int16
# Relative share of the 32-bit mmio aperture for this segment. Defaults to 1
# when omitted; a value of 0 is rejected by validation.
mmio32_aperture_weight:
type: integer
format: int32
# Relative share of the 64-bit mmio aperture for this segment. Defaults to 1
# when omitted; a value of 0 is rejected by validation.
mmio64_aperture_weight:
type: integer
format: int32
PlatformConfig: PlatformConfig:
type: object type: object
properties: properties:

View File

@ -98,6 +98,8 @@ pub enum Error {
ParseUserDevice(OptionParserError), ParseUserDevice(OptionParserError),
/// Missing socket for userspace device /// Missing socket for userspace device
ParseUserDeviceSocketMissing, ParseUserDeviceSocketMissing,
/// Error parsing pci segment options
ParsePciSegment(OptionParserError),
/// Failed parsing platform parameters /// Failed parsing platform parameters
ParsePlatform(OptionParserError), ParsePlatform(OptionParserError),
/// Failed parsing vDPA device /// Failed parsing vDPA device
@ -170,6 +172,8 @@ pub enum ValidationError {
InvalidNumPciSegments(u16), InvalidNumPciSegments(u16),
/// Invalid PCI segment id /// Invalid PCI segment id
InvalidPciSegment(u16), InvalidPciSegment(u16),
/// Invalid PCI segment aperture weight
InvalidPciSegmentApertureWeight(u32),
/// Balloon too big /// Balloon too big
BalloonLargerThanRam(u64, u64), BalloonLargerThanRam(u64, u64),
/// On a IOMMU segment but not behind IOMMU /// On a IOMMU segment but not behind IOMMU
@ -279,6 +283,9 @@ impl fmt::Display for ValidationError {
InvalidPciSegment(pci_segment) => { InvalidPciSegment(pci_segment) => {
write!(f, "Invalid PCI segment id: {pci_segment}") write!(f, "Invalid PCI segment id: {pci_segment}")
} }
InvalidPciSegmentApertureWeight(aperture_weight) => {
write!(f, "Invalid PCI segment aperture weight: {aperture_weight}")
}
BalloonLargerThanRam(balloon_size, ram_size) => { BalloonLargerThanRam(balloon_size, ram_size) => {
write!( write!(
f, f,
@ -395,6 +402,7 @@ impl fmt::Display for Error {
ParseTdx(o) => write!(f, "Error parsing --tdx: {o}"), ParseTdx(o) => write!(f, "Error parsing --tdx: {o}"),
#[cfg(feature = "tdx")] #[cfg(feature = "tdx")]
FirmwarePathMissing => write!(f, "TDX firmware missing"), FirmwarePathMissing => write!(f, "TDX firmware missing"),
ParsePciSegment(o) => write!(f, "Error parsing --pci-segment: {o}"),
ParsePlatform(o) => write!(f, "Error parsing --platform: {o}"), ParsePlatform(o) => write!(f, "Error parsing --platform: {o}"),
ParseVdpa(o) => write!(f, "Error parsing --vdpa: {o}"), ParseVdpa(o) => write!(f, "Error parsing --vdpa: {o}"),
ParseVdpaPathMissing => write!(f, "Error parsing --vdpa: path missing"), ParseVdpaPathMissing => write!(f, "Error parsing --vdpa: path missing"),
@ -444,6 +452,7 @@ pub struct VmParams<'a> {
pub watchdog: bool, pub watchdog: bool,
#[cfg(feature = "guest_debug")] #[cfg(feature = "guest_debug")]
pub gdb: bool, pub gdb: bool,
pub pci_segments: Option<Vec<&'a str>>,
pub platform: Option<&'a str>, pub platform: Option<&'a str>,
pub tpm: Option<&'a str>, pub tpm: Option<&'a str>,
#[cfg(feature = "igvm")] #[cfg(feature = "igvm")]
@ -504,6 +513,9 @@ impl<'a> VmParams<'a> {
.get_many::<String>("numa") .get_many::<String>("numa")
.map(|x| x.map(|y| y as &str).collect()); .map(|x| x.map(|y| y as &str).collect());
let watchdog = args.get_flag("watchdog"); let watchdog = args.get_flag("watchdog");
let pci_segments: Option<Vec<&str>> = args
.get_many::<String>("pci-segment")
.map(|x| x.map(|y| y as &str).collect());
let platform = args.get_one::<String>("platform").map(|x| x as &str); let platform = args.get_one::<String>("platform").map(|x| x as &str);
#[cfg(feature = "guest_debug")] #[cfg(feature = "guest_debug")]
let gdb = args.contains_id("gdb"); let gdb = args.contains_id("gdb");
@ -542,6 +554,7 @@ impl<'a> VmParams<'a> {
watchdog, watchdog,
#[cfg(feature = "guest_debug")] #[cfg(feature = "guest_debug")]
gdb, gdb,
pci_segments,
platform, platform,
tpm, tpm,
#[cfg(feature = "igvm")] #[cfg(feature = "igvm")]
@ -678,6 +691,64 @@ impl CpusConfig {
} }
} }
impl PciSegmentConfig {
pub const SYNTAX: &'static str = "PCI Segment parameters \
\"pci_segment=<segment_id>,mmio32_aperture_weight=<scale>,mmio64_aperture_weight=<scale>\"";
pub fn parse(disk: &str) -> Result<Self> {
let mut parser = OptionParser::new();
parser
.add("mmio32_aperture_weight")
.add("mmio64_aperture_weight")
.add("pci_segment");
parser.parse(disk).map_err(Error::ParsePciSegment)?;
let pci_segment = parser
.convert("pci_segment")
.map_err(Error::ParsePciSegment)?
.unwrap_or_default();
let mmio32_aperture_weight = parser
.convert("mmio32_aperture_weight")
.map_err(Error::ParsePciSegment)?
.unwrap_or(DEFAULT_PCI_SEGMENT_APERTURE_WEIGHT);
let mmio64_aperture_weight = parser
.convert("mmio64_aperture_weight")
.map_err(Error::ParsePciSegment)?
.unwrap_or(DEFAULT_PCI_SEGMENT_APERTURE_WEIGHT);
Ok(PciSegmentConfig {
pci_segment,
mmio32_aperture_weight,
mmio64_aperture_weight,
})
}
pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> {
let num_pci_segments = match &vm_config.platform {
Some(platform_config) => platform_config.num_pci_segments,
None => 1,
};
if self.pci_segment >= num_pci_segments {
return Err(ValidationError::InvalidPciSegment(self.pci_segment));
}
if self.mmio32_aperture_weight == 0 {
return Err(ValidationError::InvalidPciSegmentApertureWeight(
self.mmio32_aperture_weight,
));
}
if self.mmio64_aperture_weight == 0 {
return Err(ValidationError::InvalidPciSegmentApertureWeight(
self.mmio64_aperture_weight,
));
}
Ok(())
}
}
impl PlatformConfig { impl PlatformConfig {
pub fn parse(platform: &str) -> Result<Self> { pub fn parse(platform: &str) -> Result<Self> {
let mut parser = OptionParser::new(); let mut parser = OptionParser::new();
@ -2449,6 +2520,12 @@ impl VmConfig {
} }
} }
if let Some(pci_segments) = &self.pci_segments {
for pci_segment in pci_segments {
pci_segment.validate(self)?;
}
}
self.platform.as_ref().map(|p| p.validate()).transpose()?; self.platform.as_ref().map(|p| p.validate()).transpose()?;
self.iommu |= self self.iommu |= self
.platform .platform
@ -2557,6 +2634,16 @@ impl VmConfig {
vsock = Some(vsock_config); vsock = Some(vsock_config);
} }
let mut pci_segments: Option<Vec<PciSegmentConfig>> = None;
if let Some(pci_segment_list) = &vm_params.pci_segments {
let mut pci_segment_config_list = Vec::new();
for item in pci_segment_list.iter() {
let pci_segment_config = PciSegmentConfig::parse(item)?;
pci_segment_config_list.push(pci_segment_config);
}
pci_segments = Some(pci_segment_config_list);
}
let platform = vm_params.platform.map(PlatformConfig::parse).transpose()?; let platform = vm_params.platform.map(PlatformConfig::parse).transpose()?;
#[cfg(target_arch = "x86_64")] #[cfg(target_arch = "x86_64")]
@ -2643,6 +2730,7 @@ impl VmConfig {
watchdog: vm_params.watchdog, watchdog: vm_params.watchdog,
#[cfg(feature = "guest_debug")] #[cfg(feature = "guest_debug")]
gdb, gdb,
pci_segments,
platform, platform,
tpm, tpm,
preserved_fds: None, preserved_fds: None,
@ -2764,6 +2852,7 @@ impl Clone for VmConfig {
#[cfg(target_arch = "x86_64")] #[cfg(target_arch = "x86_64")]
sgx_epc: self.sgx_epc.clone(), sgx_epc: self.sgx_epc.clone(),
numa: self.numa.clone(), numa: self.numa.clone(),
pci_segments: self.pci_segments.clone(),
platform: self.platform.clone(), platform: self.platform.clone(),
tpm: self.tpm.clone(), tpm: self.tpm.clone(),
preserved_fds: self preserved_fds: self
@ -2960,6 +3049,46 @@ mod tests {
Ok(()) Ok(())
} }
#[test]
fn test_pci_segment_parsing() -> Result<()> {
    // Expected configuration for segment 0 with the given 32-bit and 64-bit
    // aperture weights.
    let segment0 = |mmio32_aperture_weight: u32, mmio64_aperture_weight: u32| PciSegmentConfig {
        pci_segment: 0,
        mmio32_aperture_weight,
        mmio64_aperture_weight,
    };

    // Omitted weights fall back to the default of 1.
    assert_eq!(PciSegmentConfig::parse("pci_segment=0")?, segment0(1, 1));

    // Explicit weights equal to the default.
    assert_eq!(
        PciSegmentConfig::parse("pci_segment=0,mmio32_aperture_weight=1,mmio64_aperture_weight=1")?,
        segment0(1, 1)
    );

    // Only the 32-bit weight overridden.
    assert_eq!(
        PciSegmentConfig::parse("pci_segment=0,mmio32_aperture_weight=2")?,
        segment0(2, 1)
    );

    // Only the 64-bit weight overridden.
    assert_eq!(
        PciSegmentConfig::parse("pci_segment=0,mmio64_aperture_weight=2")?,
        segment0(1, 2)
    );

    Ok(())
}
fn disk_fixture() -> DiskConfig { fn disk_fixture() -> DiskConfig {
DiskConfig { DiskConfig {
path: Some(PathBuf::from("/path/to_file")), path: Some(PathBuf::from("/path/to_file")),
@ -3536,6 +3665,7 @@ mod tests {
watchdog: false, watchdog: false,
#[cfg(feature = "guest_debug")] #[cfg(feature = "guest_debug")]
gdb: false, gdb: false,
pci_segments: None,
platform: None, platform: None,
tpm: None, tpm: None,
preserved_fds: None, preserved_fds: None,
@ -3946,6 +4076,28 @@ mod tests {
Err(ValidationError::PciSegmentReused(1, 0, 1)) Err(ValidationError::PciSegmentReused(1, 0, 1))
); );
let mut invalid_config = valid_config.clone();
invalid_config.pci_segments = Some(vec![PciSegmentConfig {
pci_segment: 0,
mmio32_aperture_weight: 1,
mmio64_aperture_weight: 0,
}]);
assert_eq!(
invalid_config.validate(),
Err(ValidationError::InvalidPciSegmentApertureWeight(0))
);
let mut invalid_config = valid_config.clone();
invalid_config.pci_segments = Some(vec![PciSegmentConfig {
pci_segment: 0,
mmio32_aperture_weight: 0,
mmio64_aperture_weight: 1,
}]);
assert_eq!(
invalid_config.validate(),
Err(ValidationError::InvalidPciSegmentApertureWeight(0))
);
let mut invalid_config = valid_config.clone(); let mut invalid_config = valid_config.clone();
invalid_config.numa = Some(vec![ invalid_config.numa = Some(vec![
NumaConfig { NumaConfig {

View File

@ -22,6 +22,7 @@ use crate::pci_segment::PciSegment;
use crate::seccomp_filters::{get_seccomp_filter, Thread}; use crate::seccomp_filters::{get_seccomp_filter, Thread};
use crate::serial_manager::{Error as SerialManagerError, SerialManager}; use crate::serial_manager::{Error as SerialManagerError, SerialManager};
use crate::sigwinch_listener::start_sigwinch_listener; use crate::sigwinch_listener::start_sigwinch_listener;
use crate::vm_config::DEFAULT_PCI_SEGMENT_APERTURE_WEIGHT;
use crate::GuestRegionMmap; use crate::GuestRegionMmap;
use crate::PciDeviceInfo; use crate::PciDeviceInfo;
use crate::{device_node, DEVICE_MANAGER_SNAPSHOT_ID}; use crate::{device_node, DEVICE_MANAGER_SNAPSHOT_ID};
@ -969,6 +970,34 @@ pub struct DeviceManager {
mmio_regions: Arc<Mutex<Vec<MmioRegion>>>, mmio_regions: Arc<Mutex<Vec<MmioRegion>>>,
} }
/// Splits an mmio address range into one address allocator per PCI segment,
/// giving each segment a share proportional to its weight.
///
/// The range size is computed as `end - start + 1`. It is divided into
/// `sum(weights)` equally sized units, each a multiple of `alignment`, and
/// segment `n` receives `weights[n]` consecutive units. Segments are laid out
/// back to back starting at `start`, in segment order. Any remainder left by
/// the rounding stays unallocated at the end of the range.
///
/// # Parameters
///
/// * `start` / `end` - bounds of the address range to split.
/// * `num_pci_segments` - number of allocators to create.
/// * `weights` - relative share of each segment, indexed by segment id.
/// * `alignment` - granularity of every segment's start address and size.
///
/// # Panics
///
/// Panics if `weights` has fewer than `num_pci_segments` entries, if
/// `alignment` or the sum of the weights is zero, or if an allocator cannot
/// be created for a segment (for example because the weights are so large
/// that a single unit rounds down to zero bytes).
fn create_mmio_allocators(
    start: u64,
    end: u64,
    num_pci_segments: u16,
    weights: Vec<u32>,
    alignment: u64,
) -> Vec<Arc<Mutex<AddressAllocator>>> {
    // Weights are user supplied `u32` values; summing them in `u32` can
    // overflow (panic in debug builds, wrap-around in release), so widen
    // every weight to `u64` before accumulating.
    let total_weight: u64 = weights.iter().copied().map(u64::from).sum();

    // Start each PCI segment mmio range on an aligned boundary.
    // Dividing step by step yields the same value as dividing by
    // `alignment * total_weight` but cannot overflow on the multiplication.
    let pci_segment_mmio_size = (end - start + 1) / alignment / total_weight * alignment;

    let mut mmio_allocators = Vec::with_capacity(usize::from(num_pci_segments));
    // Sum of the weights of all segments placed so far, i.e. the number of
    // units that precede the current segment.
    let mut preceding_weight = 0u64;
    for &weight in &weights[..usize::from(num_pci_segments)] {
        let weight = u64::from(weight);
        let mmio_start = start + preceding_weight * pci_segment_mmio_size;
        let mmio_size = pci_segment_mmio_size * weight;
        let allocator = AddressAllocator::new(GuestAddress(mmio_start), mmio_size)
            .expect("PCI segment mmio aperture must be a valid, non-empty address range");
        mmio_allocators.push(Arc::new(Mutex::new(allocator)));
        preceding_weight += weight;
    }

    mmio_allocators
}
impl DeviceManager { impl DeviceManager {
#[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)]
pub fn new( pub fn new(
@ -1009,22 +1038,16 @@ impl DeviceManager {
1 1
}; };
let create_mmio_allocators = |start, end, num_pci_segments, alignment| { let mut mmio32_aperture_weights: Vec<u32> =
// Start each PCI segment mmio range on an aligned boundary std::iter::repeat(DEFAULT_PCI_SEGMENT_APERTURE_WEIGHT)
let pci_segment_mmio_size = .take(num_pci_segments.into())
(end - start + 1) / (alignment * num_pci_segments as u64) * alignment; .collect();
if let Some(pci_segments) = &config.lock().unwrap().pci_segments {
let mut mmio_allocators = vec![]; for pci_segment in pci_segments.iter() {
for i in 0..num_pci_segments as u64 { mmio32_aperture_weights[pci_segment.pci_segment as usize] =
let mmio_start = start + i * pci_segment_mmio_size; pci_segment.mmio32_aperture_weight
let allocator = Arc::new(Mutex::new( }
AddressAllocator::new(GuestAddress(mmio_start), pci_segment_mmio_size).unwrap(),
));
mmio_allocators.push(allocator)
} }
mmio_allocators
};
let start_of_mmio32_area = layout::MEM_32BIT_DEVICES_START.0; let start_of_mmio32_area = layout::MEM_32BIT_DEVICES_START.0;
let end_of_mmio32_area = layout::MEM_32BIT_DEVICES_START.0 + layout::MEM_32BIT_DEVICES_SIZE; let end_of_mmio32_area = layout::MEM_32BIT_DEVICES_START.0 + layout::MEM_32BIT_DEVICES_SIZE;
@ -1032,15 +1055,28 @@ impl DeviceManager {
start_of_mmio32_area, start_of_mmio32_area,
end_of_mmio32_area, end_of_mmio32_area,
num_pci_segments, num_pci_segments,
mmio32_aperture_weights,
4 << 10, 4 << 10,
); );
let mut mmio64_aperture_weights: Vec<u32> =
std::iter::repeat(DEFAULT_PCI_SEGMENT_APERTURE_WEIGHT)
.take(num_pci_segments.into())
.collect();
if let Some(pci_segments) = &config.lock().unwrap().pci_segments {
for pci_segment in pci_segments.iter() {
mmio64_aperture_weights[pci_segment.pci_segment as usize] =
pci_segment.mmio64_aperture_weight
}
}
let start_of_mmio64_area = memory_manager.lock().unwrap().start_of_device_area().0; let start_of_mmio64_area = memory_manager.lock().unwrap().start_of_device_area().0;
let end_of_mmio64_area = memory_manager.lock().unwrap().end_of_device_area().0; let end_of_mmio64_area = memory_manager.lock().unwrap().end_of_device_area().0;
let pci_mmio64_allocators = create_mmio_allocators( let pci_mmio64_allocators = create_mmio_allocators(
start_of_mmio64_area, start_of_mmio64_area,
end_of_mmio64_area, end_of_mmio64_area,
num_pci_segments, num_pci_segments,
mmio64_aperture_weights,
4 << 30, 4 << 30,
); );
@ -4997,3 +5033,60 @@ impl Drop for DeviceManager {
} }
} }
} }
#[cfg(test)]
mod tests {
    use super::*;

    /// Creates the allocators for the address range shared by all cases and
    /// returns every allocator's `(base, end)` pair in segment order.
    fn segment_bounds(
        num_pci_segments: u16,
        weights: Vec<u32>,
    ) -> Vec<(vm_memory::GuestAddress, vm_memory::GuestAddress)> {
        create_mmio_allocators(0x100000, 0x400000, num_pci_segments, weights, 4 << 10)
            .iter()
            .map(|allocator| {
                let allocator = allocator.lock().unwrap();
                (allocator.base(), allocator.end())
            })
            .collect()
    }

    #[test]
    fn test_create_mmio_allocators() {
        // A single segment receives the whole (aligned) range.
        assert_eq!(
            segment_bounds(1, vec![1]),
            vec![(
                vm_memory::GuestAddress(0x100000),
                vm_memory::GuestAddress(0x3fffff)
            )]
        );

        // Two equally weighted segments split the range in half.
        assert_eq!(
            segment_bounds(2, vec![1, 1]),
            vec![
                (
                    vm_memory::GuestAddress(0x100000),
                    vm_memory::GuestAddress(0x27ffff)
                ),
                (
                    vm_memory::GuestAddress(0x280000),
                    vm_memory::GuestAddress(0x3fffff)
                ),
            ]
        );

        // A weight of 2 gives the first segment twice the space of the second.
        assert_eq!(
            segment_bounds(2, vec![2, 1]),
            vec![
                (
                    vm_memory::GuestAddress(0x100000),
                    vm_memory::GuestAddress(0x2fffff)
                ),
                (
                    vm_memory::GuestAddress(0x300000),
                    vm_memory::GuestAddress(0x3fffff)
                ),
            ]
        );
    }
}

View File

@ -2151,6 +2151,7 @@ mod unit_tests {
watchdog: false, watchdog: false,
#[cfg(feature = "guest_debug")] #[cfg(feature = "guest_debug")]
gdb: false, gdb: false,
pci_segments: None,
platform: None, platform: None,
tpm: None, tpm: None,
preserved_fds: None, preserved_fds: None,

View File

@ -94,6 +94,22 @@ pub struct PlatformConfig {
pub sev_snp: bool, pub sev_snp: bool,
} }
/// Aperture weight applied to a PCI segment when none is configured.
pub const DEFAULT_PCI_SEGMENT_APERTURE_WEIGHT: u32 = 1;
/// Returns [`DEFAULT_PCI_SEGMENT_APERTURE_WEIGHT`]; referenced by name from
/// the `#[serde(default = "...")]` attributes on the aperture weight fields.
fn default_pci_segment_aperture_weight() -> u32 {
DEFAULT_PCI_SEGMENT_APERTURE_WEIGHT
}
/// Per-segment PCI configuration describing how large a share of the mmio
/// apertures a segment receives relative to the other segments.
#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)]
pub struct PciSegmentConfig {
/// Id of the PCI segment these settings apply to (0 when omitted).
#[serde(default)]
pub pci_segment: u16,
/// Relative weight of this segment's 32-bit mmio aperture; defaults to
/// `DEFAULT_PCI_SEGMENT_APERTURE_WEIGHT` when omitted.
#[serde(default = "default_pci_segment_aperture_weight")]
pub mmio32_aperture_weight: u32,
/// Relative weight of this segment's 64-bit mmio aperture; defaults to
/// `DEFAULT_PCI_SEGMENT_APERTURE_WEIGHT` when omitted.
#[serde(default = "default_pci_segment_aperture_weight")]
pub mmio64_aperture_weight: u32,
}
#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)]
pub struct MemoryZoneConfig { pub struct MemoryZoneConfig {
pub id: String, pub id: String,
@ -620,6 +636,7 @@ pub struct VmConfig {
#[cfg(feature = "guest_debug")] #[cfg(feature = "guest_debug")]
#[serde(default)] #[serde(default)]
pub gdb: bool, pub gdb: bool,
pub pci_segments: Option<Vec<PciSegmentConfig>>,
pub platform: Option<PlatformConfig>, pub platform: Option<PlatformConfig>,
pub tpm: Option<TpmConfig>, pub tpm: Option<TpmConfig>,
// Preserved FDs are the ones that share the same life-time as its holding // Preserved FDs are the ones that share the same life-time as its holding