diff --git a/arch/src/lib.rs b/arch/src/lib.rs
index 5de2247ea..3e570efc5 100644
--- a/arch/src/lib.rs
+++ b/arch/src/lib.rs
@@ -108,6 +108,7 @@ pub struct NumaNode {
     pub memory_regions: Vec<Arc<GuestRegionMmap>>,
     pub hotplug_regions: Vec<Arc<GuestRegionMmap>>,
     pub cpus: Vec<u8>,
+    pub pci_segments: Vec<u16>,
     pub distances: BTreeMap<u32, u8>,
     pub memory_zones: Vec<String>,
     #[cfg(target_arch = "x86_64")]
diff --git a/docs/memory.md b/docs/memory.md
index 3d420cc94..901a57509 100644
--- a/docs/memory.md
+++ b/docs/memory.md
@@ -572,7 +572,16 @@ _Example_
 
 ### PCI bus
 
-Cloud Hypervisor supports only one PCI bus, which is why it has been tied to
-the NUMA node 0 by default. It is the user responsibility to organize the NUMA
-nodes correctly so that vCPUs and guest RAM which should be located on the same
-NUMA node as the PCI bus end up on the NUMA node 0.
+Cloud Hypervisor supports guests with one or more PCI segments. The default PCI segment always
+has affinity to NUMA node 0. By default, all other PCI segments also have affinity to NUMA node 0.
+The user may configure the NUMA affinity for any additional PCI segments.
+
+_Example_
+
+```
+--platform num_pci_segments=2
+--memory-zone size=16G,host_numa_node=0,id=mem0
+--memory-zone size=16G,host_numa_node=1,id=mem1
+--numa guest_numa_id=0,memory_zones=mem0,pci_segments=[0]
+--numa guest_numa_id=1,memory_zones=mem1,pci_segments=[1]
+```
diff --git a/tests/integration.rs b/tests/integration.rs
index 577930cce..68418fb2c 100644
--- a/tests/integration.rs
+++ b/tests/integration.rs
@@ -2947,6 +2947,92 @@ mod common_parallel {
         handle_child_output(r, &output);
     }
 
+    #[test]
+    fn test_pci_multiple_segments_numa_node() {
+        let focal = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string());
+        let guest = Guest::new(Box::new(focal));
+        let api_socket = temp_api_path(&guest.tmp_dir);
+        #[cfg(target_arch = "x86_64")]
+        let kernel_path = direct_kernel_boot_path();
+        #[cfg(target_arch = "aarch64")]
+        let kernel_path = edk2_path();
+
+        // Prepare another disk file for the virtio-disk device
+        let test_disk_path = String::from(
+            guest
+                .tmp_dir
+                .as_path()
+                .join("test-disk.raw")
+                .to_str()
+                .unwrap(),
+        );
+        assert!(
+            exec_host_command_status(format!("truncate {test_disk_path} -s 4M").as_str()).success()
+        );
+        assert!(exec_host_command_status(format!("mkfs.ext4 {test_disk_path}").as_str()).success());
+
+        const TEST_DISK_NODE: u16 = 1;
+
+        let mut child = GuestCommand::new(&guest)
+            .args(["--platform", "num_pci_segments=2"])
+            .args(["--cpus", "boot=2"])
+            .args(["--memory", "size=0"])
+            .args([
+                "--memory-zone",
+                "id=mem0,size=256M",
+                "--memory-zone",
+                "id=mem1,size=256M",
+            ])
+            .args([
+                "--numa",
+                "guest_numa_id=0,cpus=[0],memory_zones=mem0,pci_segments=[0]",
+                "--numa",
+                "guest_numa_id=1,cpus=[1],memory_zones=mem1,pci_segments=[1]",
+            ])
+            .args(["--kernel", kernel_path.to_str().unwrap()])
+            .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE])
+            .args(["--api-socket", &api_socket])
+            .capture_output()
+            .args([
+                "--disk",
+                format!(
+                    "path={}",
+                    guest.disk_config.disk(DiskType::OperatingSystem).unwrap()
+                )
+                .as_str(),
+                "--disk",
+                format!(
+                    "path={}",
+                    guest.disk_config.disk(DiskType::CloudInit).unwrap()
+                )
+                .as_str(),
+                "--disk",
+                format!("path={test_disk_path},pci_segment={TEST_DISK_NODE}").as_str(),
+            ])
+            .default_net()
+            .spawn()
+            .unwrap();
+
+        let cmd = "cat /sys/block/vdc/device/../numa_node";
+
+        let r = std::panic::catch_unwind(|| {
+            guest.wait_vm_boot(None).unwrap();
+
+            assert_eq!(
+                guest
+                    .ssh_command(cmd)
+                    .unwrap()
+                    .trim()
+                    .parse::<u16>()
+                    .unwrap_or_default(),
+                TEST_DISK_NODE
+            );
+        });
+
+        let _ = child.kill();
+        let output = child.wait_with_output().unwrap();
+
+        handle_child_output(r, &output);
+    }
+
     #[test]
     fn test_direct_kernel_boot() {
         let focal = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string());
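The integration test above checks device-to-node affinity from inside the guest via sysfs. As a standalone illustration (not part of this patch), the same check could be written as a small guest-side Rust program; the device name `vdc` is assumed to match the third virtio disk from the test:

```rust
use std::fs;

// Guest-side sketch of the test's SSH command:
// `cat /sys/block/<dev>/device/../numa_node`.
fn block_device_numa_node(dev: &str) -> std::io::Result<i64> {
    let raw = fs::read_to_string(format!("/sys/block/{dev}/device/../numa_node"))?;
    // The kernel reports -1 when a device has no known NUMA affinity.
    Ok(raw.trim().parse::<i64>().unwrap_or(-1))
}

fn main() -> std::io::Result<()> {
    // With pci_segment=1 bound to guest NUMA node 1, this prints 1.
    println!("vdc => NUMA node {}", block_device_numa_node("vdc")?);
    Ok(())
}
```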
diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml
index cf2b1a0a3..fa855da61 100644
--- a/vmm/src/api/openapi/cloud-hypervisor.yaml
+++ b/vmm/src/api/openapi/cloud-hypervisor.yaml
@@ -1080,6 +1080,11 @@ components:
           type: array
           items:
             type: string
+        pci_segments:
+          type: array
+          items:
+            type: integer
+            format: int32
 
     VmResize:
       type: object
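For REST API users, the new field appears in the `numa` section of the VM config. A hedged sketch of a request fragment built with `serde_json` (field names follow the YAML schema above; the rest of the `VmConfig` body is omitted):

```rust
use serde_json::json;

fn main() {
    // Hypothetical `numa` fragment of a vm.create request body; the
    // `pci_segments` arrays are the only part specific to this change.
    let numa = json!([
        { "guest_numa_id": 0, "memory_zones": ["mem0"], "pci_segments": [0] },
        { "guest_numa_id": 1, "memory_zones": ["mem1"], "pci_segments": [1] }
    ]);
    println!("{}", serde_json::to_string_pretty(&numa).unwrap());
}
```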
diff --git a/vmm/src/config.rs b/vmm/src/config.rs
index a20df87aa..93b1f67bb 100644
--- a/vmm/src/config.rs
+++ b/vmm/src/config.rs
@@ -176,6 +176,10 @@ pub enum ValidationError {
     DuplicateDevicePath(String),
     /// Provided MTU is lower than what the VIRTIO specification expects
     InvalidMtu(u16),
+    /// PCI segment is reused across NUMA nodes
+    PciSegmentReused(u16, u32, u32),
+    /// Default PCI segment is assigned to a NUMA node other than 0
+    DefaultPciSegmentInvalidNode(u32),
 }
 
 type ValidationResult<T> = std::result::Result<T, ValidationError>;
@@ -288,6 +292,15 @@ impl fmt::Display for ValidationError {
                     "Provided MTU {mtu} is lower than 1280 (expected by VIRTIO specification)"
                 )
             }
+            PciSegmentReused(pci_segment, u1, u2) => {
+                write!(
+                    f,
+                    "PCI segment {pci_segment} belongs to multiple NUMA nodes {u1} and {u2}"
+                )
+            }
+            DefaultPciSegmentInvalidNode(u1) => {
+                write!(f, "Default PCI segment assigned to non-zero NUMA node {u1}")
+            }
         }
     }
 }
@@ -1619,7 +1632,9 @@ impl NumaConfig {
             .add("cpus")
             .add("distances")
             .add("memory_zones")
-            .add("sgx_epc_sections");
+            .add("sgx_epc_sections")
+            .add("pci_segments");
+
         parser.parse(numa).map_err(Error::ParseNuma)?;
 
         let guest_numa_id = parser
@@ -1650,7 +1665,10 @@ impl NumaConfig {
             .convert::<StringList>("sgx_epc_sections")
             .map_err(Error::ParseNuma)?
             .map(|v| v.0);
-
+        let pci_segments = parser
+            .convert::<IntegerList>("pci_segments")
+            .map_err(Error::ParseNuma)?
+            .map(|v| v.0.iter().map(|e| *e as u16).collect());
         Ok(NumaConfig {
             guest_numa_id,
             cpus,
@@ -1658,6 +1676,7 @@ impl NumaConfig {
             memory_zones,
             #[cfg(target_arch = "x86_64")]
             sgx_epc_sections,
+            pci_segments,
         })
     }
 }
@@ -1925,19 +1944,48 @@ impl VmConfig {
             Self::validate_identifier(&mut id_list, &vsock.id)?;
         }
 
+        let num_pci_segments = match &self.platform {
+            Some(platform_config) => platform_config.num_pci_segments,
+            None => 1,
+        };
         if let Some(numa) = &self.numa {
             let mut used_numa_node_memory_zones = HashMap::new();
+            let mut used_pci_segments = HashMap::new();
             for numa_node in numa.iter() {
-                for memory_zone in numa_node.memory_zones.clone().unwrap().iter() {
-                    if !used_numa_node_memory_zones.contains_key(memory_zone) {
-                        used_numa_node_memory_zones
-                            .insert(memory_zone.to_string(), numa_node.guest_numa_id);
-                    } else {
-                        return Err(ValidationError::MemoryZoneReused(
-                            memory_zone.to_string(),
-                            *used_numa_node_memory_zones.get(memory_zone).unwrap(),
-                            numa_node.guest_numa_id,
-                        ));
+                if let Some(memory_zones) = numa_node.memory_zones.clone() {
+                    for memory_zone in memory_zones.iter() {
+                        if !used_numa_node_memory_zones.contains_key(memory_zone) {
+                            used_numa_node_memory_zones
+                                .insert(memory_zone.to_string(), numa_node.guest_numa_id);
+                        } else {
+                            return Err(ValidationError::MemoryZoneReused(
+                                memory_zone.to_string(),
+                                *used_numa_node_memory_zones.get(memory_zone).unwrap(),
+                                numa_node.guest_numa_id,
+                            ));
+                        }
+                    }
+                }
+
+                if let Some(pci_segments) = numa_node.pci_segments.clone() {
+                    for pci_segment in pci_segments.iter() {
+                        if *pci_segment >= num_pci_segments {
+                            return Err(ValidationError::InvalidPciSegment(*pci_segment));
+                        }
+                        if *pci_segment == 0 && numa_node.guest_numa_id != 0 {
+                            return Err(ValidationError::DefaultPciSegmentInvalidNode(
+                                numa_node.guest_numa_id,
+                            ));
+                        }
+                        if !used_pci_segments.contains_key(pci_segment) {
+                            used_pci_segments.insert(*pci_segment, numa_node.guest_numa_id);
+                        } else {
+                            return Err(ValidationError::PciSegmentReused(
+                                *pci_segment,
+                                *used_pci_segments.get(pci_segment).unwrap(),
+                                numa_node.guest_numa_id,
+                            ));
+                        }
+                    }
+                }
             }
@@ -3304,6 +3352,63 @@ mod tests {
             Err(ValidationError::IommuNotSupportedOnSegment(1))
         );
 
+        let mut invalid_config = valid_config.clone();
+        invalid_config.platform = Some(PlatformConfig {
+            num_pci_segments: 2,
+            ..Default::default()
+        });
+        invalid_config.numa = Some(vec![
+            NumaConfig {
+                guest_numa_id: 0,
+                pci_segments: Some(vec![1]),
+                ..Default::default()
+            },
+            NumaConfig {
+                guest_numa_id: 1,
+                pci_segments: Some(vec![1]),
+                ..Default::default()
+            },
+        ]);
+        assert_eq!(
+            invalid_config.validate(),
+            Err(ValidationError::PciSegmentReused(1, 0, 1))
+        );
+
+        let mut invalid_config = valid_config.clone();
+        invalid_config.numa = Some(vec![
+            NumaConfig {
+                guest_numa_id: 0,
+                ..Default::default()
+            },
+            NumaConfig {
+                guest_numa_id: 1,
+                pci_segments: Some(vec![0]),
+                ..Default::default()
+            },
+        ]);
+        assert_eq!(
+            invalid_config.validate(),
+            Err(ValidationError::DefaultPciSegmentInvalidNode(1))
+        );
+
+        let mut invalid_config = valid_config.clone();
+        invalid_config.numa = Some(vec![
+            NumaConfig {
+                guest_numa_id: 0,
+                pci_segments: Some(vec![0]),
+                ..Default::default()
+            },
+            NumaConfig {
+                guest_numa_id: 1,
+                pci_segments: Some(vec![1]),
+                ..Default::default()
+            },
+        ]);
+        assert_eq!(
+            invalid_config.validate(),
+            Err(ValidationError::InvalidPciSegment(1))
+        );
+
         let mut still_valid_config = valid_config.clone();
         still_valid_config.devices = Some(vec![
             DeviceConfig {
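The validation added above enforces three rules: every segment must be below `num_pci_segments`, segment 0 may only live on NUMA node 0, and no segment may be claimed by two nodes. A distilled standalone sketch, with simplified types standing in for `NumaConfig` and `ValidationError`:

```rust
use std::collections::HashMap;

// Standalone sketch of the pci_segments validation; (guest_numa_id, segments)
// tuples stand in for NumaConfig, and String for ValidationError.
fn validate_numa_pci_segments(
    num_pci_segments: u16,
    nodes: &[(u32, Vec<u16>)],
) -> Result<(), String> {
    let mut used: HashMap<u16, u32> = HashMap::new();
    for (node_id, segments) in nodes {
        for &seg in segments {
            if seg >= num_pci_segments {
                return Err(format!("invalid PCI segment {seg}"));
            }
            if seg == 0 && *node_id != 0 {
                return Err(format!("default PCI segment on non-zero node {node_id}"));
            }
            if let Some(prev) = used.insert(seg, *node_id) {
                return Err(format!("PCI segment {seg} on nodes {prev} and {node_id}"));
            }
        }
    }
    Ok(())
}

fn main() {
    // Segment 1 claimed by both nodes: rejected, mirroring the
    // PciSegmentReused unit test above.
    assert!(validate_numa_pci_segments(2, &[(0, vec![1]), (1, vec![1])]).is_err());
    assert!(validate_numa_pci_segments(2, &[(0, vec![0]), (1, vec![1])]).is_ok());
}
```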
diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs
index b2b483b7d..17c227131 100644
--- a/vmm/src/device_manager.rs
+++ b/vmm/src/device_manager.rs
@@ -1065,6 +1065,7 @@ impl DeviceManager {
         for i in 1..num_pci_segments as usize {
             pci_segments.push(PciSegment::new(
                 i as u16,
+                numa_node_id_from_pci_segment_id(&numa_nodes, i as u16),
                 &address_manager,
                 Arc::clone(&address_manager.pci_mmio_allocators[i]),
                 &pci_irq_slots,
@@ -4343,6 +4344,16 @@ fn numa_node_id_from_memory_zone_id(numa_nodes: &NumaNodes, memory_zone_id: &str) -> Option<u32> {
     None
 }
 
+fn numa_node_id_from_pci_segment_id(numa_nodes: &NumaNodes, pci_segment_id: u16) -> u32 {
+    for (numa_node_id, numa_node) in numa_nodes.iter() {
+        if numa_node.pci_segments.contains(&pci_segment_id) {
+            return *numa_node_id;
+        }
+    }
+
+    0
+}
+
 struct TpmDevice {}
 
 impl Aml for TpmDevice {
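`numa_node_id_from_pci_segment_id()` is a linear reverse lookup that falls back to node 0 for unassigned segments. A self-contained equivalent, assuming a stripped-down `NumaNode` with only the field the lookup touches (`NumaNodes` is a `BTreeMap<u32, NumaNode>` in the real code):

```rust
use std::collections::BTreeMap;

struct NumaNode {
    pci_segments: Vec<u16>,
}

// Map a PCI segment back to the guest NUMA node that claimed it.
// Validation guarantees at most one node owns a given segment.
fn numa_node_id_from_pci_segment_id(nodes: &BTreeMap<u32, NumaNode>, segment: u16) -> u32 {
    nodes
        .iter()
        .find(|(_, node)| node.pci_segments.contains(&segment))
        .map(|(id, _)| *id)
        .unwrap_or(0)
}

fn main() {
    let mut nodes = BTreeMap::new();
    nodes.insert(0, NumaNode { pci_segments: vec![0] });
    nodes.insert(1, NumaNode { pci_segments: vec![1] });
    assert_eq!(numa_node_id_from_pci_segment_id(&nodes, 1), 1);
    // An unassigned segment falls back to the default node 0.
    assert_eq!(numa_node_id_from_pci_segment_id(&nodes, 5), 0);
}
```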
diff --git a/vmm/src/pci_segment.rs b/vmm/src/pci_segment.rs
index 6dd81bdf9..3ae51f471 100644
--- a/vmm/src/pci_segment.rs
+++ b/vmm/src/pci_segment.rs
@@ -25,6 +25,7 @@ pub(crate) struct PciSegment {
     pub(crate) pci_bus: Arc<Mutex<PciBus>>,
     pub(crate) pci_config_mmio: Arc<Mutex<PciConfigMmio>>,
     pub(crate) mmio_config_address: u64,
+    pub(crate) proximity_domain: u32,
 
     #[cfg(target_arch = "x86_64")]
     pub(crate) pci_config_io: Option<Arc<Mutex<PciConfigIo>>>,
@@ -46,6 +47,7 @@ pub(crate) struct PciSegment {
 impl PciSegment {
     pub(crate) fn new(
         id: u16,
+        numa_node: u32,
         address_manager: &Arc<AddressManager>,
         allocator: Arc<Mutex<AddressAllocator>>,
         pci_irq_slots: &[u8; 32],
@@ -77,6 +79,7 @@ impl PciSegment {
             pci_bus,
             pci_config_mmio,
             mmio_config_address,
+            proximity_domain: numa_node,
             pci_devices_up: 0,
             pci_devices_down: 0,
             #[cfg(target_arch = "x86_64")]
@@ -100,7 +103,7 @@ impl PciSegment {
         allocator: Arc<Mutex<AddressAllocator>>,
         pci_irq_slots: &[u8; 32],
     ) -> DeviceManagerResult<Self> {
-        let mut segment = Self::new(0, address_manager, allocator, pci_irq_slots)?;
+        let mut segment = Self::new(0, 0, address_manager, allocator, pci_irq_slots)?;
         let pci_config_io = Arc::new(Mutex::new(PciConfigIo::new(Arc::clone(&segment.pci_bus))));
 
         address_manager
@@ -123,7 +126,7 @@ impl PciSegment {
         allocator: Arc<Mutex<AddressAllocator>>,
         pci_irq_slots: &[u8; 32],
     ) -> DeviceManagerResult<Self> {
-        Self::new(0, address_manager, allocator, pci_irq_slots)
+        Self::new(0, 0, address_manager, allocator, pci_irq_slots)
     }
 
     pub(crate) fn next_device_bdf(&self) -> DeviceManagerResult<PciBdf> {
@@ -329,10 +332,7 @@ impl Aml for PciSegment {
         let supp = aml::Name::new("SUPP".into(), &aml::ZERO);
         pci_dsdt_inner_data.push(&supp);
 
-        // Since Cloud Hypervisor supports only one PCI bus, it can be tied
-        // to the NUMA node 0. It's up to the user to organize the NUMA nodes
-        // so that the PCI bus relates to the expected vCPUs and guest RAM.
-        let proximity_domain = 0u32;
+        let proximity_domain = self.proximity_domain;
         let pxm_return = aml::Return::new(&proximity_domain);
         let pxm = aml::Method::new("_PXM".into(), 0, false, vec![&pxm_return]);
         pci_dsdt_inner_data.push(&pxm);
diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs
index 74f7f9dd0..85f22f161 100644
--- a/vmm/src/vm.rs
+++ b/vmm/src/vm.rs
@@ -701,6 +701,10 @@ impl Vm {
                     node.cpus.extend(cpus);
                 }
 
+                if let Some(pci_segments) = &config.pci_segments {
+                    node.pci_segments.extend(pci_segments);
+                }
+
                 if let Some(distances) = &config.distances {
                     for distance in distances.iter() {
                         let dest = distance.destination;
diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs
index a115aca77..6d73795c3 100644
--- a/vmm/src/vm_config.rs
+++ b/vmm/src/vm_config.rs
@@ -538,6 +538,8 @@ pub struct NumaConfig {
     #[cfg(target_arch = "x86_64")]
     #[serde(default)]
     pub sgx_epc_sections: Option<Vec<String>>,
+    #[serde(default)]
+    pub pci_segments: Option<Vec<u16>>,
 }
 
 #[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)]
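Since `pci_segments` is an `Option` with `#[serde(default)]`, configurations written before this change deserialize unchanged. A minimal sketch with a stand-in struct (assumes `serde` with the derive feature and `serde_json` as dependencies):

```rust
use serde::{Deserialize, Serialize};

// Stand-in for the NumaConfig change in vm_config.rs.
#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)]
struct NumaConfig {
    guest_numa_id: u32,
    #[serde(default)]
    pci_segments: Option<Vec<u16>>,
}

fn main() {
    // Present: parsed into Some(...).
    let with: NumaConfig =
        serde_json::from_str(r#"{"guest_numa_id":1,"pci_segments":[1]}"#).unwrap();
    assert_eq!(with.pci_segments, Some(vec![1]));

    // Absent: deserializes to None, so pre-existing configs keep working.
    let without: NumaConfig = serde_json::from_str(r#"{"guest_numa_id":0}"#).unwrap();
    assert_eq!(without.pci_segments, None);
}
```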