vmm: Allow assignment of PCI segments to NUMA node

Signed-off-by: Thomas Barrett <tbarrett@crusoeenergy.com>
Thomas Barrett 2023-10-15 19:36:34 -07:00 committed by Bo Chen
parent 0b4c153d4d
commit 3029fbeafd
9 changed files with 245 additions and 22 deletions

View File

@ -108,6 +108,7 @@ pub struct NumaNode {
pub memory_regions: Vec<Arc<GuestRegionMmap>>,
pub hotplug_regions: Vec<Arc<GuestRegionMmap>>,
pub cpus: Vec<u8>,
pub pci_segments: Vec<u16>,
pub distances: BTreeMap<u32, u8>,
pub memory_zones: Vec<String>,
#[cfg(target_arch = "x86_64")]

View File

@ -572,7 +572,16 @@ _Example_
### PCI bus
Cloud Hypervisor supports only one PCI bus, which is why it has been tied to
the NUMA node 0 by default. It is the user's responsibility to organize the NUMA
nodes correctly so that the vCPUs and guest RAM which should be located on the same
NUMA node as the PCI bus end up on NUMA node 0.
Cloud Hypervisor supports guests with one or more PCI segments. The default PCI segment always
has affinity to NUMA node 0. By default, all other PCI segments also have affinity to NUMA node 0.
The user may configure the NUMA affinity for any additional PCI segments; the configured affinity
is advertised to the guest through each PCI segment's ACPI `_PXM` method.
_Example_
```
--platform num_pci_segments=2
--memory-zone size=16G,host_numa_node=0,id=mem0
--memory-zone size=16G,host_numa_node=1,id=mem1
--numa guest_numa_id=0,memory_zones=mem0,pci_segments=[0]
--numa guest_numa_id=1,memory_zones=mem1,pci_segments=[1]
```

View File

@ -2947,6 +2947,92 @@ mod common_parallel {
handle_child_output(r, &output);
}
#[test]
fn test_pci_multiple_segments_numa_node() {
let focal = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string());
let guest = Guest::new(Box::new(focal));
let api_socket = temp_api_path(&guest.tmp_dir);
#[cfg(target_arch = "x86_64")]
let kernel_path = direct_kernel_boot_path();
#[cfg(target_arch = "aarch64")]
let kernel_path = edk2_path();
// Prepare another disk file for the virtio-disk device
let test_disk_path = String::from(
guest
.tmp_dir
.as_path()
.join("test-disk.raw")
.to_str()
.unwrap(),
);
assert!(
exec_host_command_status(format!("truncate {test_disk_path} -s 4M").as_str()).success()
);
assert!(exec_host_command_status(format!("mkfs.ext4 {test_disk_path}").as_str()).success());
const TEST_DISK_NODE: u16 = 1;
let mut child = GuestCommand::new(&guest)
.args(["--platform", "num_pci_segments=2"])
.args(["--cpus", "boot=2"])
.args(["--memory", "size=0"])
.args([
"--memory-zone",
"id=mem0,size=256M",
"--memory-zone",
"id=mem1,size=256M",
])
.args([
"--numa",
"guest_numa_id=0,cpus=[0],memory_zones=mem0,pci_segments=[0]",
"--numa",
"guest_numa_id=1,cpus=[1],memory_zones=mem1,pci_segments=[1]",
])
.args(["--kernel", kernel_path.to_str().unwrap()])
.args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE])
.args(["--api-socket", &api_socket])
.capture_output()
.args([
"--disk",
format!(
"path={}",
guest.disk_config.disk(DiskType::OperatingSystem).unwrap()
)
.as_str(),
"--disk",
format!(
"path={}",
guest.disk_config.disk(DiskType::CloudInit).unwrap()
)
.as_str(),
"--disk",
format!("path={test_disk_path},pci_segment={TEST_DISK_NODE}").as_str(),
])
.default_net()
.spawn()
.unwrap();
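// The extra disk is the third virtio-blk device (vdc); its parent PCI device should report the NUMA node of the segment it was placed on.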
let cmd = "cat /sys/block/vdc/device/../numa_node";
let r = std::panic::catch_unwind(|| {
guest.wait_vm_boot(None).unwrap();
assert_eq!(
guest
.ssh_command(cmd)
.unwrap()
.trim()
.parse::<u16>()
.unwrap_or_default(),
TEST_DISK_NODE
);
});
let _ = child.kill();
let output = child.wait_with_output().unwrap();
handle_child_output(r, &output);
}
#[test]
fn test_direct_kernel_boot() {
let focal = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string());

View File

@ -1080,6 +1080,11 @@ components:
type: array
items:
type: string
pci_segments:
type: array
items:
type: integer
format: int32
VmResize:
type: object

View File

@ -176,6 +176,10 @@ pub enum ValidationError {
DuplicateDevicePath(String),
/// Provided MTU is lower than what the VIRTIO specification expects
InvalidMtu(u16),
/// PCI segment is reused across NUMA nodes
PciSegmentReused(u16, u32, u32),
/// Default PCI segment is assigned to a NUMA node other than 0.
DefaultPciSegmentInvalidNode(u32),
}
type ValidationResult<T> = std::result::Result<T, ValidationError>;
@ -288,6 +292,15 @@ impl fmt::Display for ValidationError {
"Provided MTU {mtu} is lower than 1280 (expected by VIRTIO specification)"
)
}
PciSegmentReused(pci_segment, u1, u2) => {
write!(
f,
"PCI segment: {pci_segment} belongs to multiple NUMA nodes {u1} and {u2}"
)
}
DefaultPciSegmentInvalidNode(u1) => {
write!(f, "Default PCI segment assigned to non-zero NUMA node {u1}")
}
}
}
}
@ -1619,7 +1632,9 @@ impl NumaConfig {
.add("cpus")
.add("distances")
.add("memory_zones")
.add("sgx_epc_sections");
.add("sgx_epc_sections")
.add("pci_segments");
parser.parse(numa).map_err(Error::ParseNuma)?;
let guest_numa_id = parser
@ -1650,7 +1665,10 @@ impl NumaConfig {
.convert::<StringList>("sgx_epc_sections")
.map_err(Error::ParseNuma)?
.map(|v| v.0);
let pci_segments = parser
.convert::<IntegerList>("pci_segments")
.map_err(Error::ParseNuma)?
.map(|v| v.0.iter().map(|e| *e as u16).collect());
Ok(NumaConfig {
guest_numa_id,
cpus,
@ -1658,6 +1676,7 @@ impl NumaConfig {
memory_zones,
#[cfg(target_arch = "x86_64")]
sgx_epc_sections,
pci_segments,
})
}
}
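For reference, the new key can be exercised in isolation with a unit test along these lines. This is only a sketch, not part of the change; it assumes `NumaConfig::parse` keeps its existing `&str` signature and the bracketed list syntax used for `--numa` values, and the module and test names are hypothetical.

```rust
#[cfg(test)]
mod numa_pci_segments_sketch {
    use super::*;

    // Hypothetical test showing how pci_segments round-trips through NumaConfig::parse.
    #[test]
    fn parse_numa_pci_segments() {
        let numa0 = NumaConfig::parse("guest_numa_id=0,memory_zones=mem0,pci_segments=[0]").unwrap();
        assert_eq!(numa0.guest_numa_id, 0);
        assert_eq!(numa0.pci_segments, Some(vec![0]));

        let numa1 = NumaConfig::parse("guest_numa_id=1,memory_zones=mem1,pci_segments=[1]").unwrap();
        assert_eq!(numa1.pci_segments, Some(vec![1]));
    }
}
```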
@ -1925,10 +1944,16 @@ impl VmConfig {
Self::validate_identifier(&mut id_list, &vsock.id)?;
}
let num_pci_segments = match &self.platform {
Some(platform_config) => platform_config.num_pci_segments,
None => 1,
};
if let Some(numa) = &self.numa {
let mut used_numa_node_memory_zones = HashMap::new();
let mut used_pci_segments = HashMap::new();
for numa_node in numa.iter() {
for memory_zone in numa_node.memory_zones.clone().unwrap().iter() {
if let Some(memory_zones) = numa_node.memory_zones.clone() {
for memory_zone in memory_zones.iter() {
if !used_numa_node_memory_zones.contains_key(memory_zone) {
used_numa_node_memory_zones
.insert(memory_zone.to_string(), numa_node.guest_numa_id);
@ -1941,6 +1966,29 @@ impl VmConfig {
}
}
}
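// Validate PCI segment affinity: every referenced segment must exist, segment 0 must stay on NUMA node 0, and a segment may belong to only one NUMA node.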
if let Some(pci_segments) = numa_node.pci_segments.clone() {
for pci_segment in pci_segments.iter() {
if *pci_segment >= num_pci_segments {
return Err(ValidationError::InvalidPciSegment(*pci_segment));
}
if *pci_segment == 0 && numa_node.guest_numa_id != 0 {
return Err(ValidationError::DefaultPciSegmentInvalidNode(
numa_node.guest_numa_id,
));
}
if !used_pci_segments.contains_key(pci_segment) {
used_pci_segments.insert(*pci_segment, numa_node.guest_numa_id);
} else {
return Err(ValidationError::PciSegmentReused(
*pci_segment,
*used_pci_segments.get(pci_segment).unwrap(),
numa_node.guest_numa_id,
));
}
}
}
}
}
if let Some(zones) = &self.memory.zones {
@ -3304,6 +3352,63 @@ mod tests {
Err(ValidationError::IommuNotSupportedOnSegment(1))
);
let mut invalid_config = valid_config.clone();
invalid_config.platform = Some(PlatformConfig {
num_pci_segments: 2,
..Default::default()
});
invalid_config.numa = Some(vec![
NumaConfig {
guest_numa_id: 0,
pci_segments: Some(vec![1]),
..Default::default()
},
NumaConfig {
guest_numa_id: 1,
pci_segments: Some(vec![1]),
..Default::default()
},
]);
assert_eq!(
invalid_config.validate(),
Err(ValidationError::PciSegmentReused(1, 0, 1))
);
let mut invalid_config = valid_config.clone();
invalid_config.numa = Some(vec![
NumaConfig {
guest_numa_id: 0,
..Default::default()
},
NumaConfig {
guest_numa_id: 1,
pci_segments: Some(vec![0]),
..Default::default()
},
]);
assert_eq!(
invalid_config.validate(),
Err(ValidationError::DefaultPciSegmentInvalidNode(1))
);
let mut invalid_config = valid_config.clone();
invalid_config.numa = Some(vec![
NumaConfig {
guest_numa_id: 0,
pci_segments: Some(vec![0]),
..Default::default()
},
NumaConfig {
guest_numa_id: 1,
pci_segments: Some(vec![1]),
..Default::default()
},
]);
assert_eq!(
invalid_config.validate(),
Err(ValidationError::InvalidPciSegment(1))
);
let mut still_valid_config = valid_config.clone();
still_valid_config.devices = Some(vec![
DeviceConfig {

View File

@ -1065,6 +1065,7 @@ impl DeviceManager {
for i in 1..num_pci_segments as usize {
pci_segments.push(PciSegment::new(
i as u16,
numa_node_id_from_pci_segment_id(&numa_nodes, i as u16),
&address_manager,
Arc::clone(&address_manager.pci_mmio_allocators[i]),
&pci_irq_slots,
@ -4343,6 +4344,16 @@ fn numa_node_id_from_memory_zone_id(numa_nodes: &NumaNodes, memory_zone_id: &str
None
}
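// Returns the NUMA node a PCI segment was assigned to, falling back to node 0 when no affinity was configured.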
fn numa_node_id_from_pci_segment_id(numa_nodes: &NumaNodes, pci_segment_id: u16) -> u32 {
for (numa_node_id, numa_node) in numa_nodes.iter() {
if numa_node.pci_segments.contains(&pci_segment_id) {
return *numa_node_id;
}
}
0
}
struct TpmDevice {}
impl Aml for TpmDevice {

View File

@ -25,6 +25,7 @@ pub(crate) struct PciSegment {
pub(crate) pci_bus: Arc<Mutex<PciBus>>,
pub(crate) pci_config_mmio: Arc<Mutex<PciConfigMmio>>,
pub(crate) mmio_config_address: u64,
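// NUMA node reported to the guest for this segment through the ACPI _PXM method.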
pub(crate) proximity_domain: u32,
#[cfg(target_arch = "x86_64")]
pub(crate) pci_config_io: Option<Arc<Mutex<PciConfigIo>>>,
@ -46,6 +47,7 @@ pub(crate) struct PciSegment {
impl PciSegment {
pub(crate) fn new(
id: u16,
numa_node: u32,
address_manager: &Arc<AddressManager>,
allocator: Arc<Mutex<AddressAllocator>>,
pci_irq_slots: &[u8; 32],
@ -77,6 +79,7 @@ impl PciSegment {
pci_bus,
pci_config_mmio,
mmio_config_address,
proximity_domain: numa_node,
pci_devices_up: 0,
pci_devices_down: 0,
#[cfg(target_arch = "x86_64")]
@ -100,7 +103,7 @@ impl PciSegment {
allocator: Arc<Mutex<AddressAllocator>>,
pci_irq_slots: &[u8; 32],
) -> DeviceManagerResult<PciSegment> {
let mut segment = Self::new(0, address_manager, allocator, pci_irq_slots)?;
let mut segment = Self::new(0, 0, address_manager, allocator, pci_irq_slots)?;
let pci_config_io = Arc::new(Mutex::new(PciConfigIo::new(Arc::clone(&segment.pci_bus))));
address_manager
@ -123,7 +126,7 @@ impl PciSegment {
allocator: Arc<Mutex<AddressAllocator>>,
pci_irq_slots: &[u8; 32],
) -> DeviceManagerResult<PciSegment> {
Self::new(0, address_manager, allocator, pci_irq_slots)
Self::new(0, 0, address_manager, allocator, pci_irq_slots)
}
pub(crate) fn next_device_bdf(&self) -> DeviceManagerResult<PciBdf> {
@ -329,10 +332,7 @@ impl Aml for PciSegment {
let supp = aml::Name::new("SUPP".into(), &aml::ZERO);
pci_dsdt_inner_data.push(&supp);
// Since Cloud Hypervisor supports only one PCI bus, it can be tied
// to the NUMA node 0. It's up to the user to organize the NUMA nodes
// so that the PCI bus relates to the expected vCPUs and guest RAM.
let proximity_domain = 0u32;
let proximity_domain = self.proximity_domain;
let pxm_return = aml::Return::new(&proximity_domain);
let pxm = aml::Method::new("_PXM".into(), 0, false, vec![&pxm_return]);
pci_dsdt_inner_data.push(&pxm);

View File

@ -701,6 +701,10 @@ impl Vm {
node.cpus.extend(cpus);
}
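// Record the PCI segments with affinity to this node; DeviceManager uses this to set each segment's proximity domain.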
if let Some(pci_segments) = &config.pci_segments {
node.pci_segments.extend(pci_segments);
}
if let Some(distances) = &config.distances {
for distance in distances.iter() {
let dest = distance.destination;

View File

@ -538,6 +538,8 @@ pub struct NumaConfig {
#[cfg(target_arch = "x86_64")]
#[serde(default)]
pub sgx_epc_sections: Option<Vec<String>>,
#[serde(default)]
pub pci_segments: Option<Vec<u16>>,
}
#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)]