From 932c8c971366da2504544664a238076b9c49167e Mon Sep 17 00:00:00 2001
From: Sebastien Boeuf <sebastien.boeuf@intel.com>
Date: Wed, 10 Nov 2021 10:43:52 +0100
Subject: [PATCH] vmm: Add CPU affinity support

With the introduction of a new `affinity` option to the `cpus`
parameter, Cloud Hypervisor now lets the user choose the set of
host CPUs on which each vCPU is allowed to run.

This is useful for achieving CPU pinning, as well as for making
sure the VM runs on a specific NUMA node.

Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
---
 src/main.rs                               |  1 +
 vmm/src/api/openapi/cloud-hypervisor.yaml | 16 +++++++-
 vmm/src/config.rs                         | 24 +++++++++++-
 vmm/src/cpu.rs                            | 47 ++++++++++++++++++++++-
 vmm/src/seccomp_filters.rs                |  1 +
 5 files changed, 85 insertions(+), 4 deletions(-)
diff --git a/src/main.rs b/src/main.rs
index 4051b0df9..89ef302d1 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -647,6 +647,7 @@ mod unit_tests {
                     topology: None,
                     kvm_hyperv: false,
                     max_phys_bits: 46,
+                    affinity: None,
                 },
                 memory: MemoryConfig {
                     size: 536_870_912,
diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml
index 9d1d93982..163d811ed 100644
--- a/vmm/src/api/openapi/cloud-hypervisor.yaml
+++ b/vmm/src/api/openapi/cloud-hypervisor.yaml
@@ -480,6 +480,16 @@ components:
           default: false
       description: Virtual machine configuration
 
+    CpuAffinity:
+      type: object
+      properties:
+        vcpu:
+          type: integer
+        host_cpus:
+          type: array
+          items:
+            type: integer
+
     CpuTopology:
       type: object
       properties:
@@ -507,9 +517,13 @@ components:
           default: 1
           type: integer
         topology:
-            $ref: '#/components/schemas/CpuTopology'
+          $ref: '#/components/schemas/CpuTopology'
         max_phys_bits:
           type: integer
+        affinity:
+          type: array
+          items:
+            $ref: '#/components/schemas/CpuAffinity'
 
     MemoryZoneConfig:
       required:
diff --git a/vmm/src/config.rs b/vmm/src/config.rs
index ee3026884..aa0f3efe0 100644
--- a/vmm/src/config.rs
+++ b/vmm/src/config.rs
@@ -398,6 +398,12 @@ impl FromStr for HotplugMethod {
     }
 }
 
+#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)]
+pub struct CpuAffinity {
+    pub vcpu: u8,
+    pub host_cpus: Vec<u8>,
+}
+
 pub enum CpuTopologyParseError {
     InvalidValue(String),
 }
@@ -453,6 +459,8 @@ pub struct CpusConfig {
     pub kvm_hyperv: bool,
     #[serde(default = "default_cpuconfig_max_phys_bits")]
     pub max_phys_bits: u8,
+    #[serde(default)]
+    pub affinity: Option<Vec<CpuAffinity>>,
 }
 
 impl CpusConfig {
@@ -463,7 +471,8 @@ impl CpusConfig {
             .add("max")
             .add("topology")
             .add("kvm_hyperv")
-            .add("max_phys_bits");
+            .add("max_phys_bits")
+            .add("affinity");
         parser.parse(cpus).map_err(Error::ParseCpus)?;
 
         let boot_vcpus: u8 = parser
@@ -484,6 +493,17 @@ impl CpusConfig {
             .convert::<u8>("max_phys_bits")
             .map_err(Error::ParseCpus)?
             .unwrap_or(DEFAULT_MAX_PHYS_BITS);
+        let affinity = parser
+            .convert::<Tuple<u8, Vec<u8>>>("affinity")
+            .map_err(Error::ParseCpus)?
+            .map(|v| {
+                v.0.iter()
+                    .map(|(e1, e2)| CpuAffinity {
+                        vcpu: *e1,
+                        host_cpus: e2.clone(),
+                    })
+                    .collect()
+            });
 
         Ok(CpusConfig {
             boot_vcpus,
@@ -491,6 +511,7 @@ impl CpusConfig {
             topology,
             kvm_hyperv,
             max_phys_bits,
+            affinity,
         })
     }
 }
@@ -503,6 +524,7 @@ impl Default for CpusConfig {
             topology: None,
             kvm_hyperv: false,
             max_phys_bits: DEFAULT_MAX_PHYS_BITS,
+            affinity: None,
         }
     }
 }
diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs
index 4f9fe9a20..b10ef03f8 100644
--- a/vmm/src/cpu.rs
+++ b/vmm/src/cpu.rs
@@ -33,7 +33,6 @@ use hypervisor::CpuId;
 use hypervisor::{vm::VmmOps, CpuState, HypervisorCpuError, VmExit};
 use libc::{c_void, siginfo_t};
 use seccompiler::{apply_filter, SeccompAction};
-#[cfg(feature = "acpi")]
 use std::collections::BTreeMap;
 use std::os::unix::thread::JoinHandleExt;
 use std::sync::atomic::{AtomicBool, Ordering};
@@ -104,6 +103,9 @@ pub enum Error {
 
     #[cfg(feature = "tdx")]
     InitializeTdx(hypervisor::HypervisorCpuError),
+
+    /// Failed scheduling the thread on the expected CPU set.
+    ScheduleCpuSet,
 }
 pub type Result<T> = result::Result<T, Error>;
 
@@ -393,6 +395,7 @@ pub struct CpuManager {
     acpi_address: GuestAddress,
     #[cfg(feature = "acpi")]
     proximity_domain_per_cpu: BTreeMap<u8, u32>,
+    affinity: BTreeMap<u8, Vec<u8>>,
 }
 
 const CPU_ENABLE_FLAG: usize = 0;
@@ -591,6 +594,15 @@ impl CpuManager {
         .into_iter()
         .collect();
 
+        let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
+            cpu_affinity
+                .iter()
+                .map(|a| (a.vcpu, a.host_cpus.clone()))
+                .collect()
+        } else {
+            BTreeMap::new()
+        };
+
         let cpu_manager = Arc::new(Mutex::new(CpuManager {
             config: config.clone(),
             interrupt_controller: device_manager.interrupt_controller().clone(),
@@ -611,6 +623,7 @@ impl CpuManager {
             acpi_address,
             #[cfg(feature = "acpi")]
             proximity_domain_per_cpu,
+            affinity,
         }));
 
         #[cfg(feature = "acpi")]
@@ -713,7 +726,15 @@ impl CpuManager {
             .clone();
         let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();
 
-        info!("Starting vCPU: cpu_id = {}", cpu_id);
+        // Prepare the CPU set the current vCPU is expected to run on.
+        let cpuset = self.affinity.get(&cpu_id).map(|host_cpus| {
+            let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
+            unsafe { libc::CPU_ZERO(&mut cpuset) };
+            for host_cpu in host_cpus {
+                unsafe { libc::CPU_SET(*host_cpu as usize, &mut cpuset) };
+            }
+            cpuset
+        });
 
         // Retrieve seccomp filter for vcpu thread
         let vcpu_seccomp_filter = get_seccomp_filter(&self.seccomp_action, Thread::Vcpu)
@@ -722,10 +743,32 @@ impl CpuManager {
         #[cfg(target_arch = "x86_64")]
         let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();
 
+        info!("Starting vCPU: cpu_id = {}", cpu_id);
+
         let handle = Some(
             thread::Builder::new()
                 .name(format!("vcpu{}", cpu_id))
                 .spawn(move || {
+                    // Schedule the thread to run on the expected CPU set
+                    if let Some(cpuset) = cpuset.as_ref() {
+                        let ret = unsafe {
+                            libc::sched_setaffinity(
+                                0,
+                                std::mem::size_of::<libc::cpu_set_t>(),
+                                cpuset as *const libc::cpu_set_t,
+                            )
+                        };
+
+                        if ret != 0 {
+                            error!(
+                                "Failed scheduling the vCPU {} on the expected CPU set: {}",
+                                cpu_id,
+                                io::Error::last_os_error()
+                            );
+                            return;
+                        }
+                    }
+
                     // Apply seccomp filter for vcpu thread.
                     if !vcpu_seccomp_filter.is_empty() {
                         if let Err(e) =
diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs
index 7119d03d7..8738a3c30 100644
--- a/vmm/src/seccomp_filters.rs
+++ b/vmm/src/seccomp_filters.rs
@@ -492,6 +492,7 @@ fn vmm_thread_rules() -> Result<Vec<(i64, Vec<SeccompRule>)>, BackendError> {
         (libc::SYS_rt_sigprocmask, vec![]),
         (libc::SYS_rt_sigreturn, vec![]),
         (libc::SYS_sched_getaffinity, vec![]),
+        (libc::SYS_sched_setaffinity, vec![]),
         (libc::SYS_sendmsg, vec![]),
         (libc::SYS_sendto, vec![]),
         (libc::SYS_set_robust_list, vec![]),