mirror of
https://github.com/cloud-hypervisor/cloud-hypervisor.git
synced 2025-01-21 20:15:21 +00:00
vmm: Allow assignment of PCI segments to NUMA node
Signed-off-by: Thomas Barrett <tbarrett@crusoeenergy.com>
This commit is contained in:
parent
0b4c153d4d
commit
3029fbeafd
@ -108,6 +108,7 @@ pub struct NumaNode {
|
||||
pub memory_regions: Vec<Arc<GuestRegionMmap>>,
|
||||
pub hotplug_regions: Vec<Arc<GuestRegionMmap>>,
|
||||
pub cpus: Vec<u8>,
|
||||
pub pci_segments: Vec<u16>,
|
||||
pub distances: BTreeMap<u32, u8>,
|
||||
pub memory_zones: Vec<String>,
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
|
@ -572,7 +572,16 @@ _Example_
|
||||
|
||||
### PCI bus
|
||||
|
||||
Cloud Hypervisor supports only one PCI bus, which is why it has been tied to
|
||||
the NUMA node 0 by default. It is the user responsibility to organize the NUMA
|
||||
nodes correctly so that vCPUs and guest RAM which should be located on the same
|
||||
NUMA node as the PCI bus end up on the NUMA node 0.
|
||||
Cloud Hypervisor supports guests with one or more PCI segments. The default PCI segment always
|
||||
has affinity to NUMA node 0. Be default, all other PCI segments have afffinity to NUMA node 0.
|
||||
The user may configure the NUMA affinity for any additional PCI segments.
|
||||
|
||||
_Example_
|
||||
|
||||
```
|
||||
--platform num_pci_segments=2
|
||||
--memory-zone size=16G,host_numa_node=0,id=mem0
|
||||
--memory-zone size=16G,host_numa_node=1,id=mem1
|
||||
--numa guest_numa_id=0,memory_zones=mem0,pci_segments=[0]
|
||||
--numa guest_numa_id=1,memory_zones=mem1,pci_segments=[1]
|
||||
```
|
||||
|
@ -2947,6 +2947,92 @@ mod common_parallel {
|
||||
handle_child_output(r, &output);
|
||||
}
|
||||
|
||||
fn test_pci_multiple_segments_numa_node() {
|
||||
let focal = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string());
|
||||
let guest = Guest::new(Box::new(focal));
|
||||
let api_socket = temp_api_path(&guest.tmp_dir);
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
let kernel_path = direct_kernel_boot_path();
|
||||
#[cfg(target_arch = "aarch64")]
|
||||
let kernel_path = edk2_path();
|
||||
|
||||
// Prepare another disk file for the virtio-disk device
|
||||
let test_disk_path = String::from(
|
||||
guest
|
||||
.tmp_dir
|
||||
.as_path()
|
||||
.join("test-disk.raw")
|
||||
.to_str()
|
||||
.unwrap(),
|
||||
);
|
||||
assert!(
|
||||
exec_host_command_status(format!("truncate {test_disk_path} -s 4M").as_str()).success()
|
||||
);
|
||||
assert!(exec_host_command_status(format!("mkfs.ext4 {test_disk_path}").as_str()).success());
|
||||
const TEST_DISK_NODE: u16 = 1;
|
||||
|
||||
let mut child = GuestCommand::new(&guest)
|
||||
.args(["--platform", "num_pci_segments=2"])
|
||||
.args(["--cpus", "boot=2"])
|
||||
.args(["--memory", "size=0"])
|
||||
.args([
|
||||
"--memory-zone",
|
||||
"id=mem0,size=256M",
|
||||
"--memory-zone",
|
||||
"id=mem1,size=256M",
|
||||
])
|
||||
.args([
|
||||
"--numa",
|
||||
"guest_numa_id=0,cpus=[0],memory_zones=mem0,pci_segments=[0]",
|
||||
"--numa",
|
||||
"guest_numa_id=1,cpus=[1],memory_zones=mem1,pci_segments=[1]",
|
||||
])
|
||||
.args(["--kernel", kernel_path.to_str().unwrap()])
|
||||
.args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE])
|
||||
.args(["--api-socket", &api_socket])
|
||||
.capture_output()
|
||||
.args([
|
||||
"--disk",
|
||||
format!(
|
||||
"path={}",
|
||||
guest.disk_config.disk(DiskType::OperatingSystem).unwrap()
|
||||
)
|
||||
.as_str(),
|
||||
"--disk",
|
||||
format!(
|
||||
"path={}",
|
||||
guest.disk_config.disk(DiskType::CloudInit).unwrap()
|
||||
)
|
||||
.as_str(),
|
||||
"--disk",
|
||||
format!("path={test_disk_path},pci_segment={TEST_DISK_NODE}").as_str(),
|
||||
])
|
||||
.default_net()
|
||||
.spawn()
|
||||
.unwrap();
|
||||
|
||||
let cmd = "cat /sys/block/vdc/device/../numa_node";
|
||||
|
||||
let r = std::panic::catch_unwind(|| {
|
||||
guest.wait_vm_boot(None).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
guest
|
||||
.ssh_command(cmd)
|
||||
.unwrap()
|
||||
.trim()
|
||||
.parse::<u16>()
|
||||
.unwrap_or_default(),
|
||||
TEST_DISK_NODE
|
||||
);
|
||||
});
|
||||
|
||||
let _ = child.kill();
|
||||
let output = child.wait_with_output().unwrap();
|
||||
|
||||
handle_child_output(r, &output);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_direct_kernel_boot() {
|
||||
let focal = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string());
|
||||
|
@ -1080,6 +1080,11 @@ components:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
pci_segments:
|
||||
type: array
|
||||
items:
|
||||
type: integer
|
||||
format: int32
|
||||
|
||||
VmResize:
|
||||
type: object
|
||||
|
@ -176,6 +176,10 @@ pub enum ValidationError {
|
||||
DuplicateDevicePath(String),
|
||||
/// Provided MTU is lower than what the VIRTIO specification expects
|
||||
InvalidMtu(u16),
|
||||
/// PCI segment is reused across NUMA nodes
|
||||
PciSegmentReused(u16, u32, u32),
|
||||
/// Default PCI segment is assigned to NUMA node other than 0.
|
||||
DefaultPciSegmentInvalidNode(u32),
|
||||
}
|
||||
|
||||
type ValidationResult<T> = std::result::Result<T, ValidationError>;
|
||||
@ -288,6 +292,15 @@ impl fmt::Display for ValidationError {
|
||||
"Provided MTU {mtu} is lower than 1280 (expected by VIRTIO specification)"
|
||||
)
|
||||
}
|
||||
PciSegmentReused(pci_segment, u1, u2) => {
|
||||
write!(
|
||||
f,
|
||||
"PCI segment: {pci_segment} belongs to multiple NUMA nodes {u1} and {u2}"
|
||||
)
|
||||
}
|
||||
DefaultPciSegmentInvalidNode(u1) => {
|
||||
write!(f, "Default PCI segment assigned to non-zero NUMA node {u1}")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1619,7 +1632,9 @@ impl NumaConfig {
|
||||
.add("cpus")
|
||||
.add("distances")
|
||||
.add("memory_zones")
|
||||
.add("sgx_epc_sections");
|
||||
.add("sgx_epc_sections")
|
||||
.add("pci_segments");
|
||||
|
||||
parser.parse(numa).map_err(Error::ParseNuma)?;
|
||||
|
||||
let guest_numa_id = parser
|
||||
@ -1650,7 +1665,10 @@ impl NumaConfig {
|
||||
.convert::<StringList>("sgx_epc_sections")
|
||||
.map_err(Error::ParseNuma)?
|
||||
.map(|v| v.0);
|
||||
|
||||
let pci_segments = parser
|
||||
.convert::<IntegerList>("pci_segments")
|
||||
.map_err(Error::ParseNuma)?
|
||||
.map(|v| v.0.iter().map(|e| *e as u16).collect());
|
||||
Ok(NumaConfig {
|
||||
guest_numa_id,
|
||||
cpus,
|
||||
@ -1658,6 +1676,7 @@ impl NumaConfig {
|
||||
memory_zones,
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
sgx_epc_sections,
|
||||
pci_segments,
|
||||
})
|
||||
}
|
||||
}
|
||||
@ -1925,19 +1944,48 @@ impl VmConfig {
|
||||
Self::validate_identifier(&mut id_list, &vsock.id)?;
|
||||
}
|
||||
|
||||
let num_pci_segments = match &self.platform {
|
||||
Some(platform_config) => platform_config.num_pci_segments,
|
||||
None => 1,
|
||||
};
|
||||
if let Some(numa) = &self.numa {
|
||||
let mut used_numa_node_memory_zones = HashMap::new();
|
||||
let mut used_pci_segments = HashMap::new();
|
||||
for numa_node in numa.iter() {
|
||||
for memory_zone in numa_node.memory_zones.clone().unwrap().iter() {
|
||||
if !used_numa_node_memory_zones.contains_key(memory_zone) {
|
||||
used_numa_node_memory_zones
|
||||
.insert(memory_zone.to_string(), numa_node.guest_numa_id);
|
||||
} else {
|
||||
return Err(ValidationError::MemoryZoneReused(
|
||||
memory_zone.to_string(),
|
||||
*used_numa_node_memory_zones.get(memory_zone).unwrap(),
|
||||
numa_node.guest_numa_id,
|
||||
));
|
||||
if let Some(memory_zones) = numa_node.memory_zones.clone() {
|
||||
for memory_zone in memory_zones.iter() {
|
||||
if !used_numa_node_memory_zones.contains_key(memory_zone) {
|
||||
used_numa_node_memory_zones
|
||||
.insert(memory_zone.to_string(), numa_node.guest_numa_id);
|
||||
} else {
|
||||
return Err(ValidationError::MemoryZoneReused(
|
||||
memory_zone.to_string(),
|
||||
*used_numa_node_memory_zones.get(memory_zone).unwrap(),
|
||||
numa_node.guest_numa_id,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(pci_segments) = numa_node.pci_segments.clone() {
|
||||
for pci_segment in pci_segments.iter() {
|
||||
if *pci_segment >= num_pci_segments {
|
||||
return Err(ValidationError::InvalidPciSegment(*pci_segment));
|
||||
}
|
||||
if *pci_segment == 0 && numa_node.guest_numa_id != 0 {
|
||||
return Err(ValidationError::DefaultPciSegmentInvalidNode(
|
||||
numa_node.guest_numa_id,
|
||||
));
|
||||
}
|
||||
if !used_pci_segments.contains_key(pci_segment) {
|
||||
used_pci_segments.insert(*pci_segment, numa_node.guest_numa_id);
|
||||
} else {
|
||||
return Err(ValidationError::PciSegmentReused(
|
||||
*pci_segment,
|
||||
*used_pci_segments.get(pci_segment).unwrap(),
|
||||
numa_node.guest_numa_id,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -3304,6 +3352,63 @@ mod tests {
|
||||
Err(ValidationError::IommuNotSupportedOnSegment(1))
|
||||
);
|
||||
|
||||
let mut invalid_config = valid_config.clone();
|
||||
invalid_config.platform = Some(PlatformConfig {
|
||||
num_pci_segments: 2,
|
||||
..Default::default()
|
||||
});
|
||||
invalid_config.numa = Some(vec![
|
||||
NumaConfig {
|
||||
guest_numa_id: 0,
|
||||
pci_segments: Some(vec![1]),
|
||||
..Default::default()
|
||||
},
|
||||
NumaConfig {
|
||||
guest_numa_id: 1,
|
||||
pci_segments: Some(vec![1]),
|
||||
..Default::default()
|
||||
},
|
||||
]);
|
||||
assert_eq!(
|
||||
invalid_config.validate(),
|
||||
Err(ValidationError::PciSegmentReused(1, 0, 1))
|
||||
);
|
||||
|
||||
let mut invalid_config = valid_config.clone();
|
||||
invalid_config.numa = Some(vec![
|
||||
NumaConfig {
|
||||
guest_numa_id: 0,
|
||||
..Default::default()
|
||||
},
|
||||
NumaConfig {
|
||||
guest_numa_id: 1,
|
||||
pci_segments: Some(vec![0]),
|
||||
..Default::default()
|
||||
},
|
||||
]);
|
||||
assert_eq!(
|
||||
invalid_config.validate(),
|
||||
Err(ValidationError::DefaultPciSegmentInvalidNode(1))
|
||||
);
|
||||
|
||||
let mut invalid_config = valid_config.clone();
|
||||
invalid_config.numa = Some(vec![
|
||||
NumaConfig {
|
||||
guest_numa_id: 0,
|
||||
pci_segments: Some(vec![0]),
|
||||
..Default::default()
|
||||
},
|
||||
NumaConfig {
|
||||
guest_numa_id: 1,
|
||||
pci_segments: Some(vec![1]),
|
||||
..Default::default()
|
||||
},
|
||||
]);
|
||||
assert_eq!(
|
||||
invalid_config.validate(),
|
||||
Err(ValidationError::InvalidPciSegment(1))
|
||||
);
|
||||
|
||||
let mut still_valid_config = valid_config.clone();
|
||||
still_valid_config.devices = Some(vec![
|
||||
DeviceConfig {
|
||||
|
@ -1065,6 +1065,7 @@ impl DeviceManager {
|
||||
for i in 1..num_pci_segments as usize {
|
||||
pci_segments.push(PciSegment::new(
|
||||
i as u16,
|
||||
numa_node_id_from_pci_segment_id(&numa_nodes, i as u16),
|
||||
&address_manager,
|
||||
Arc::clone(&address_manager.pci_mmio_allocators[i]),
|
||||
&pci_irq_slots,
|
||||
@ -4343,6 +4344,16 @@ fn numa_node_id_from_memory_zone_id(numa_nodes: &NumaNodes, memory_zone_id: &str
|
||||
None
|
||||
}
|
||||
|
||||
fn numa_node_id_from_pci_segment_id(numa_nodes: &NumaNodes, pci_segment_id: u16) -> u32 {
|
||||
for (numa_node_id, numa_node) in numa_nodes.iter() {
|
||||
if numa_node.pci_segments.contains(&pci_segment_id) {
|
||||
return *numa_node_id;
|
||||
}
|
||||
}
|
||||
|
||||
0
|
||||
}
|
||||
|
||||
struct TpmDevice {}
|
||||
|
||||
impl Aml for TpmDevice {
|
||||
|
@ -25,6 +25,7 @@ pub(crate) struct PciSegment {
|
||||
pub(crate) pci_bus: Arc<Mutex<PciBus>>,
|
||||
pub(crate) pci_config_mmio: Arc<Mutex<PciConfigMmio>>,
|
||||
pub(crate) mmio_config_address: u64,
|
||||
pub(crate) proximity_domain: u32,
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
pub(crate) pci_config_io: Option<Arc<Mutex<PciConfigIo>>>,
|
||||
@ -46,6 +47,7 @@ pub(crate) struct PciSegment {
|
||||
impl PciSegment {
|
||||
pub(crate) fn new(
|
||||
id: u16,
|
||||
numa_node: u32,
|
||||
address_manager: &Arc<AddressManager>,
|
||||
allocator: Arc<Mutex<AddressAllocator>>,
|
||||
pci_irq_slots: &[u8; 32],
|
||||
@ -77,6 +79,7 @@ impl PciSegment {
|
||||
pci_bus,
|
||||
pci_config_mmio,
|
||||
mmio_config_address,
|
||||
proximity_domain: numa_node,
|
||||
pci_devices_up: 0,
|
||||
pci_devices_down: 0,
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
@ -100,7 +103,7 @@ impl PciSegment {
|
||||
allocator: Arc<Mutex<AddressAllocator>>,
|
||||
pci_irq_slots: &[u8; 32],
|
||||
) -> DeviceManagerResult<PciSegment> {
|
||||
let mut segment = Self::new(0, address_manager, allocator, pci_irq_slots)?;
|
||||
let mut segment = Self::new(0, 0, address_manager, allocator, pci_irq_slots)?;
|
||||
let pci_config_io = Arc::new(Mutex::new(PciConfigIo::new(Arc::clone(&segment.pci_bus))));
|
||||
|
||||
address_manager
|
||||
@ -123,7 +126,7 @@ impl PciSegment {
|
||||
allocator: Arc<Mutex<AddressAllocator>>,
|
||||
pci_irq_slots: &[u8; 32],
|
||||
) -> DeviceManagerResult<PciSegment> {
|
||||
Self::new(0, address_manager, allocator, pci_irq_slots)
|
||||
Self::new(0, 0, address_manager, allocator, pci_irq_slots)
|
||||
}
|
||||
|
||||
pub(crate) fn next_device_bdf(&self) -> DeviceManagerResult<PciBdf> {
|
||||
@ -329,10 +332,7 @@ impl Aml for PciSegment {
|
||||
let supp = aml::Name::new("SUPP".into(), &aml::ZERO);
|
||||
pci_dsdt_inner_data.push(&supp);
|
||||
|
||||
// Since Cloud Hypervisor supports only one PCI bus, it can be tied
|
||||
// to the NUMA node 0. It's up to the user to organize the NUMA nodes
|
||||
// so that the PCI bus relates to the expected vCPUs and guest RAM.
|
||||
let proximity_domain = 0u32;
|
||||
let proximity_domain = self.proximity_domain;
|
||||
let pxm_return = aml::Return::new(&proximity_domain);
|
||||
let pxm = aml::Method::new("_PXM".into(), 0, false, vec![&pxm_return]);
|
||||
pci_dsdt_inner_data.push(&pxm);
|
||||
|
@ -701,6 +701,10 @@ impl Vm {
|
||||
node.cpus.extend(cpus);
|
||||
}
|
||||
|
||||
if let Some(pci_segments) = &config.pci_segments {
|
||||
node.pci_segments.extend(pci_segments);
|
||||
}
|
||||
|
||||
if let Some(distances) = &config.distances {
|
||||
for distance in distances.iter() {
|
||||
let dest = distance.destination;
|
||||
|
@ -538,6 +538,8 @@ pub struct NumaConfig {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[serde(default)]
|
||||
pub sgx_epc_sections: Option<Vec<String>>,
|
||||
#[serde(default)]
|
||||
pub pci_segments: Option<Vec<u16>>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)]
|
||||
|
Loading…
x
Reference in New Issue
Block a user