arch: x86_64: handle npot CPU topology

This PR addresses a bug in which the cpu topology of a guest
with non power-of-two number of cores is incorrect. For example,
in some contexts, a virtual machine with 2 sockets and 12 cores
per socket will incorrectly report that 16 cores are on socket 1
and 8 cores are on socket 2. In other cases, common topology
enumeration software such as hwloc will crash.

The root of the problem was the way that cloud-hypervisor generates
apic_id. On x86_64, the (x2) apic_id embeds information about cpu
topology. The cpuid instruction is primarily used to discover the
number of sockets, dies, cores, threads, etc. Using this information,
the (x2) apic_id is masked to determine which {core, die, socket} the
cpu is on. When the cpu topology counts are not powers of two
(e.g. a 12-core machine), this requires non-contiguous (x2) apic_ids.

Signed-off-by: Thomas Barrett <tbarrett@crusoeenergy.com>
This commit is contained in:
Thomas Barrett 2023-12-22 03:38:13 +00:00 committed by Rob Bradford
parent a2ceb00fbc
commit 5c0b66529a
6 changed files with 164 additions and 34 deletions

View File

@ -221,6 +221,26 @@ impl From<Error> for super::Error {
}
}
/// Compute the x2APIC ID for a vCPU from its sequential `cpu_id`.
///
/// On x86_64 the x2APIC ID encodes the CPU topology: the thread, core and
/// die indices are packed into contiguous bit fields whose widths are
/// `ceil(log2(count))` of the respective counts. When a count is not a
/// power of two, the resulting IDs are non-contiguous, which is what
/// topology-enumeration software (cpuid leaves 0xb/0x1f) expects.
///
/// `topology` is `(threads_per_core, cores_per_die, dies_per_package)`;
/// when `None`, `cpu_id` is used directly as the x2APIC ID.
pub fn get_x2apic_id(cpu_id: u32, topology: Option<(u8, u8, u8)>) -> u32 {
    if let Some((threads_per_core, cores_per_die, dies_per_package)) = topology {
        // Widen to u32 before multiplying: products such as
        // threads_per_core * cores_per_die can exceed u8::MAX.
        let threads = threads_per_core as u32;
        let cores = cores_per_die as u32;
        let dies = dies_per_package as u32;

        // Field width = ceil(log2(count)); zero when count is 1.
        let thread_mask_width = u8::BITS - (threads_per_core - 1).leading_zeros();
        let core_mask_width = u8::BITS - (cores_per_die - 1).leading_zeros();
        let die_mask_width = u8::BITS - (dies_per_package - 1).leading_zeros();

        let thread_id = cpu_id % threads;
        let core_id = cpu_id / threads % cores;
        let die_id = cpu_id / (threads * cores) % dies;
        let socket_id = cpu_id / (threads * cores * dies);

        return thread_id
            | (core_id << thread_mask_width)
            | (die_id << (thread_mask_width + core_mask_width))
            | (socket_id << (thread_mask_width + core_mask_width + die_mask_width));
    }
    cpu_id
}
#[derive(Copy, Clone, Debug)]
pub enum CpuidReg {
EAX,
@ -777,18 +797,14 @@ pub fn configure_vcpu(
cpu_vendor: CpuVendor,
topology: Option<(u8, u8, u8)>,
) -> super::Result<()> {
let x2apic_id = get_x2apic_id(id as u32, topology);
// Per vCPU CPUID changes; common are handled via generate_common_cpuid()
let mut cpuid = cpuid;
CpuidPatch::set_cpuid_reg(&mut cpuid, 0xb, None, CpuidReg::EDX, u32::from(id));
CpuidPatch::set_cpuid_reg(&mut cpuid, 0x1f, None, CpuidReg::EDX, u32::from(id));
CpuidPatch::set_cpuid_reg(&mut cpuid, 0xb, None, CpuidReg::EDX, x2apic_id);
CpuidPatch::set_cpuid_reg(&mut cpuid, 0x1f, None, CpuidReg::EDX, x2apic_id);
if matches!(cpu_vendor, CpuVendor::AMD) {
CpuidPatch::set_cpuid_reg(
&mut cpuid,
0x8000_001e,
Some(0),
CpuidReg::EAX,
u32::from(id),
);
CpuidPatch::set_cpuid_reg(&mut cpuid, 0x8000_001e, Some(0), CpuidReg::EAX, x2apic_id);
}
if let Some(t) = topology {
@ -799,7 +815,7 @@ pub fn configure_vcpu(
// SAFETY: get host cpuid when eax=1
let mut cpu_ebx = unsafe { core::arch::x86_64::__cpuid(1) }.ebx;
cpu_ebx &= 0xffffff;
cpu_ebx |= (id as u32) << 24;
cpu_ebx |= x2apic_id << 24;
CpuidPatch::set_cpuid_reg(&mut cpuid, 0x1, None, CpuidReg::EBX, cpu_ebx);
// The TSC frequency CPUID leaf should not be included when running with HyperV emulation
@ -896,6 +912,7 @@ pub fn configure_system(
serial_number: Option<&str>,
uuid: Option<&str>,
oem_strings: Option<&[&str]>,
topology: Option<(u8, u8, u8)>,
) -> super::Result<()> {
// Write EBDA address to location where ACPICA expects to find it
guest_mem
@ -908,7 +925,7 @@ pub fn configure_system(
// Place the MP table after the SMIOS table aligned to 16 bytes
let offset = GuestAddress(layout::SMBIOS_START).unchecked_add(size);
let offset = GuestAddress((offset.0 + 16) & !0xf);
mptable::setup_mptable(offset, guest_mem, _num_cpus).map_err(Error::MpTableSetup)?;
mptable::setup_mptable(offset, guest_mem, _num_cpus, topology).map_err(Error::MpTableSetup)?;
// Check that the RAM is not smaller than the RSDP start address
if let Some(rsdp_addr) = rsdp_addr {
@ -1228,6 +1245,11 @@ fn update_cpuid_topology(
cpu_vendor: CpuVendor,
id: u8,
) {
let x2apic_id = get_x2apic_id(
id as u32,
Some((threads_per_core, cores_per_die, dies_per_package)),
);
let thread_width = 8 - (threads_per_core - 1).leading_zeros();
let core_width = (8 - (cores_per_die - 1).leading_zeros()) + thread_width;
let die_width = (8 - (dies_per_package - 1).leading_zeros()) + core_width;
@ -1290,7 +1312,7 @@ fn update_cpuid_topology(
0x8000_001e,
Some(0),
CpuidReg::EBX,
((threads_per_core as u32 - 1) << 8) | (id as u32 & 0xff),
((threads_per_core as u32 - 1) << 8) | (x2apic_id & 0xff),
);
CpuidPatch::set_cpuid_reg(
cpuid,
@ -1313,9 +1335,7 @@ fn update_cpuid_topology(
0x0000_0001,
Some(0),
CpuidReg::EBX,
((id as u32) << 24)
| (8 << 8)
| (((cores_per_die * threads_per_core) as u32) << 16),
(x2apic_id << 24) | (8 << 8) | (((cores_per_die * threads_per_core) as u32) << 16),
);
let cpuid_patches = vec![
// Patch tsc deadline timer bit
@ -1420,6 +1440,7 @@ mod tests {
None,
None,
None,
None,
);
assert!(config_err.is_err());
@ -1442,6 +1463,7 @@ mod tests {
None,
None,
None,
None,
)
.unwrap();
@ -1469,6 +1491,7 @@ mod tests {
None,
None,
None,
None,
)
.unwrap();
@ -1482,6 +1505,7 @@ mod tests {
None,
None,
None,
None,
)
.unwrap();
}
@ -1510,4 +1534,25 @@ mod tests {
assert_eq!(format!("{memmap:?}"), format!("{expected_memmap:?}"));
}
#[test]
fn test_get_x2apic_id() {
    // Topology: 2 threads/core, 3 cores/die, 1 die/package.
    // With 3 cores the core field is 2 bits wide, so x2APIC IDs become
    // non-contiguous once the core index wraps (cpu_id 6 maps to 8).
    let cases = [(0, 0), (1, 1), (2, 2), (6, 8), (7, 9), (8, 10)];
    for (cpu_id, expected) in cases {
        assert_eq!(get_x2apic_id(cpu_id, Some((2, 3, 1))), expected);
    }
}
}

View File

@ -6,7 +6,7 @@
// found in the LICENSE-BSD-3-Clause file.
use crate::layout::{APIC_START, HIGH_RAM_START, IOAPIC_START};
use crate::x86_64::mpspec;
use crate::x86_64::{get_x2apic_id, mpspec};
use crate::GuestMemoryMmap;
use libc::c_char;
use std::mem;
@ -125,9 +125,18 @@ fn compute_mp_size(num_cpus: u8) -> usize {
}
/// Performs setup of the MP table for the given `num_cpus`.
pub fn setup_mptable(offset: GuestAddress, mem: &GuestMemoryMmap, num_cpus: u8) -> Result<()> {
if num_cpus as u32 > MAX_SUPPORTED_CPUS {
return Err(Error::TooManyCpus);
pub fn setup_mptable(
offset: GuestAddress,
mem: &GuestMemoryMmap,
num_cpus: u8,
topology: Option<(u8, u8, u8)>,
) -> Result<()> {
if num_cpus > 0 {
let cpu_id_max = num_cpus - 1;
let x2apic_id_max = get_x2apic_id(cpu_id_max.into(), topology);
if x2apic_id_max >= MAX_SUPPORTED_CPUS {
return Err(Error::TooManyCpus);
}
}
// Used to keep track of the next base pointer into the MP table.
@ -141,7 +150,7 @@ pub fn setup_mptable(offset: GuestAddress, mem: &GuestMemoryMmap, num_cpus: u8)
}
let mut checksum: u8 = 0;
let ioapicid: u8 = num_cpus + 1;
let ioapicid: u8 = MAX_SUPPORTED_CPUS as u8 + 1;
// The checked_add here ensures the all of the following base_mp.unchecked_add's will be without
// overflow.
@ -179,7 +188,7 @@ pub fn setup_mptable(offset: GuestAddress, mem: &GuestMemoryMmap, num_cpus: u8)
for cpu_id in 0..num_cpus {
let mut mpc_cpu = MpcCpuWrapper(mpspec::mpc_cpu::default());
mpc_cpu.0.type_ = mpspec::MP_PROCESSOR as u8;
mpc_cpu.0.apicid = cpu_id;
mpc_cpu.0.apicid = get_x2apic_id(cpu_id as u32, topology) as u8;
mpc_cpu.0.apicver = APIC_VERSION;
mpc_cpu.0.cpuflag = mpspec::CPU_ENABLED as u8
| if cpu_id == 0 {
@ -312,7 +321,7 @@ mod tests {
let mem =
GuestMemoryMmap::from_ranges(&[(MPTABLE_START, compute_mp_size(num_cpus))]).unwrap();
setup_mptable(MPTABLE_START, &mem, num_cpus).unwrap();
setup_mptable(MPTABLE_START, &mem, num_cpus, None).unwrap();
}
#[test]
@ -321,7 +330,7 @@ mod tests {
let mem = GuestMemoryMmap::from_ranges(&[(MPTABLE_START, compute_mp_size(num_cpus) - 1)])
.unwrap();
assert!(setup_mptable(MPTABLE_START, &mem, num_cpus).is_err());
assert!(setup_mptable(MPTABLE_START, &mem, num_cpus, None).is_err());
}
#[test]
@ -330,7 +339,7 @@ mod tests {
let mem =
GuestMemoryMmap::from_ranges(&[(MPTABLE_START, compute_mp_size(num_cpus))]).unwrap();
setup_mptable(MPTABLE_START, &mem, num_cpus).unwrap();
setup_mptable(MPTABLE_START, &mem, num_cpus, None).unwrap();
let mpf_intel: MpfIntelWrapper = mem.read_obj(MPTABLE_START).unwrap();
@ -346,7 +355,7 @@ mod tests {
let mem =
GuestMemoryMmap::from_ranges(&[(MPTABLE_START, compute_mp_size(num_cpus))]).unwrap();
setup_mptable(MPTABLE_START, &mem, num_cpus).unwrap();
setup_mptable(MPTABLE_START, &mem, num_cpus, None).unwrap();
let mpf_intel: MpfIntelWrapper = mem.read_obj(MPTABLE_START).unwrap();
let mpc_offset = GuestAddress(mpf_intel.0.physptr as GuestUsize);
@ -384,7 +393,7 @@ mod tests {
.unwrap();
for i in 0..MAX_SUPPORTED_CPUS as u8 {
setup_mptable(MPTABLE_START, &mem, i).unwrap();
setup_mptable(MPTABLE_START, &mem, i, None).unwrap();
let mpf_intel: MpfIntelWrapper = mem.read_obj(MPTABLE_START).unwrap();
let mpc_offset = GuestAddress(mpf_intel.0.physptr as GuestUsize);
@ -417,7 +426,7 @@ mod tests {
let mem =
GuestMemoryMmap::from_ranges(&[(MPTABLE_START, compute_mp_size(cpus as u8))]).unwrap();
let result = setup_mptable(MPTABLE_START, &mem, cpus as u8);
let result = setup_mptable(MPTABLE_START, &mem, cpus as u8, None);
assert!(result.is_err());
}
}

View File

@ -1035,6 +1035,40 @@ fn test_cpu_topology(threads_per_core: u8, cores_per_package: u8, packages: u8,
.unwrap_or(0),
packages
);
#[cfg(target_arch = "x86_64")]
{
let mut cpu_id = 0;
for package_id in 0..packages {
for core_id in 0..cores_per_package {
for _ in 0..threads_per_core {
assert_eq!(
guest
.ssh_command(&format!("cat /sys/devices/system/cpu/cpu{cpu_id}/topology/physical_package_id"))
.unwrap()
.trim()
.parse::<u8>()
.unwrap_or(0),
package_id
);
assert_eq!(
guest
.ssh_command(&format!(
"cat /sys/devices/system/cpu/cpu{cpu_id}/topology/core_id"
))
.unwrap()
.trim()
.parse::<u8>()
.unwrap_or(0),
core_id
);
cpu_id += 1;
}
}
}
}
});
let _ = child.kill();

View File

@ -277,7 +277,10 @@ fn create_tpm2_table() -> Sdt {
tpm
}
fn create_srat_table(numa_nodes: &NumaNodes) -> Sdt {
fn create_srat_table(
numa_nodes: &NumaNodes,
#[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>,
) -> Sdt {
let mut srat = Sdt::new(*b"SRAT", 36, 3, *b"CLOUDH", *b"CHSRAT ", 1);
// SRAT reserved 12 bytes
srat.append_slice(&[0u8; 12]);
@ -316,6 +319,9 @@ fn create_srat_table(numa_nodes: &NumaNodes) -> Sdt {
}
for cpu in &node.cpus {
#[cfg(target_arch = "x86_64")]
let x2apic_id = arch::x86_64::get_x2apic_id(*cpu as u32, topology);
#[cfg(target_arch = "aarch64")]
let x2apic_id = *cpu as u32;
// Flags
@ -752,8 +758,14 @@ pub fn create_acpi_tables(
// SRAT and SLIT
// Only created if the NUMA nodes list is not empty.
if !numa_nodes.is_empty() {
#[cfg(target_arch = "x86_64")]
let topology = cpu_manager.lock().unwrap().get_vcpu_topology();
// SRAT
let srat = create_srat_table(numa_nodes);
let srat = create_srat_table(
numa_nodes,
#[cfg(target_arch = "x86_64")]
topology,
);
let srat_offset = prev_tbl_off.checked_add(prev_tbl_len).unwrap();
guest_mem
.write_slice(srat.as_slice(), srat_offset)
@ -851,8 +863,15 @@ pub fn create_acpi_tables_tdx(
// SRAT and SLIT
// Only created if the NUMA nodes list is not empty.
if !numa_nodes.is_empty() {
#[cfg(target_arch = "x86_64")]
let topology = cpu_manager.lock().unwrap().get_vcpu_topology();
// SRAT
tables.push(create_srat_table(numa_nodes));
tables.push(create_srat_table(
numa_nodes,
#[cfg(target_arch = "x86_64")]
topology,
));
// SLIT
tables.push(create_slit_table(numa_nodes));

View File

@ -31,6 +31,8 @@ use acpi_tables::{aml, sdt::Sdt, Aml};
use anyhow::anyhow;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use arch::aarch64::regs;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::get_x2apic_id;
use arch::EntryPoint;
use arch::NumaNodes;
#[cfg(target_arch = "aarch64")]
@ -331,12 +333,13 @@ impl Vcpu {
/// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0)
pub fn new(
id: u8,
apic_id: u8,
vm: &Arc<dyn hypervisor::Vm>,
vm_ops: Option<Arc<dyn VmOps>>,
#[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor,
) -> Result<Self> {
let vcpu = vm
.create_vcpu(id, vm_ops)
.create_vcpu(apic_id, vm_ops)
.map_err(|e| Error::VcpuCreate(e.into()))?;
// Initially the cpuid per vCPU is the one supported by this VM.
Ok(Vcpu {
@ -757,8 +760,16 @@ impl CpuManager {
fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> {
info!("Creating vCPU: cpu_id = {}", cpu_id);
#[cfg(target_arch = "x86_64")]
let topology = self.get_vcpu_topology();
#[cfg(target_arch = "x86_64")]
let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology);
#[cfg(target_arch = "aarch64")]
let x2apic_id = cpu_id as u32;
let mut vcpu = Vcpu::new(
cpu_id,
x2apic_id as u8,
&self.vm,
Some(self.vm_ops.clone()),
#[cfg(target_arch = "x86_64")]
@ -1334,7 +1345,6 @@ impl CpuManager {
.collect()
}
#[cfg(target_arch = "aarch64")]
pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
self.config
.topology
@ -1353,11 +1363,13 @@ impl CpuManager {
madt.write(36, arch::layout::APIC_START.0);
for cpu in 0..self.config.max_vcpus {
let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology());
let lapic = LocalX2Apic {
r#type: acpi::ACPI_X2APIC_PROCESSOR,
length: 16,
processor_id: cpu.into(),
apic_id: cpu.into(),
apic_id: x2apic_id,
flags: if cpu < self.config.boot_vcpus {
1 << MADT_CPU_ENABLE_FLAG
} else {
@ -1808,6 +1820,8 @@ struct Cpu {
cpu_id: u8,
proximity_domain: u32,
dynamic: bool,
#[cfg(target_arch = "x86_64")]
topology: Option<(u8, u8, u8)>,
}
#[cfg(target_arch = "x86_64")]
@ -1819,11 +1833,13 @@ const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;
impl Cpu {
#[cfg(target_arch = "x86_64")]
fn generate_mat(&self) -> Vec<u8> {
let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology);
let lapic = LocalX2Apic {
r#type: crate::acpi::ACPI_X2APIC_PROCESSOR,
length: 16,
processor_id: self.cpu_id.into(),
apic_id: self.cpu_id.into(),
apic_id: x2apic_id,
flags: 1 << MADT_CPU_ENABLE_FLAG,
_reserved: 0,
};
@ -2126,6 +2142,8 @@ impl Aml for CpuManager {
};
let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods];
#[cfg(target_arch = "x86_64")]
let topology = self.get_vcpu_topology();
let mut cpu_devices = Vec::new();
for cpu_id in 0..self.config.max_vcpus {
let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
@ -2133,6 +2151,8 @@ impl Aml for CpuManager {
cpu_id,
proximity_domain,
dynamic: self.dynamic,
#[cfg(target_arch = "x86_64")]
topology,
};
cpu_devices.push(cpu_device);

View File

@ -1175,6 +1175,8 @@ impl Vm {
.as_deref()
.map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());
let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
arch::configure_system(
&mem,
arch::layout::CMDLINE_START,
@ -1185,6 +1187,7 @@ impl Vm {
serial_number.as_deref(),
uuid.as_deref(),
oem_strings.as_deref(),
topology,
)
.map_err(Error::ConfigureSystem)?;
Ok(())