diff --git a/tests/integration.rs b/tests/integration.rs index 66ec28a79..3f1e7c8ad 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -2217,7 +2217,12 @@ mod tests { let mut cmd = GuestCommand::new(&guest); cmd.args(&["--cpus", "boot=1"]) .args(&["--memory", "size=0"]) - .args(&["--memory-zone", "size=1G", "size=3G,file=/dev/shm"]) + .args(&[ + "--memory-zone", + "size=1G", + "size=3G,file=/dev/shm", + "size=1G,host_numa_node=0", + ]) .args(&["--kernel", guest.fw_path.as_str()]) .default_disks() .default_net(); diff --git a/vmm/src/memory_manager.rs b/vmm/src/memory_manager.rs index 7cc08f787..a536f05da 100644 --- a/vmm/src/memory_manager.rs +++ b/vmm/src/memory_manager.rs @@ -23,6 +23,7 @@ use std::convert::TryInto; use std::ffi; use std::fs::{File, OpenOptions}; use std::io; +use std::ops::Deref; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; use std::path::PathBuf; use std::result; @@ -47,6 +48,11 @@ const X86_64_IRQ_BASE: u32 = 5; const HOTPLUG_COUNT: usize = 8; +// Memory policy constants +const MPOL_BIND: u32 = 2; +const MPOL_MF_STRICT: u32 = 1 << 0; +const MPOL_MF_MOVE: u32 = 1 << 1; + #[derive(Default)] struct HotPlugState { base: u64, @@ -165,6 +171,13 @@ pub enum Error { /// Forbidden operation. Impossible to resize guest memory if it is /// backed by user defined memory regions. InvalidResizeWithMemoryZones, + + /// It's invalid to try applying a NUMA policy to a memory zone that is + /// memory mapped with MAP_SHARED. + InvalidSharedMemoryZoneWithHostNuma, + + /// Failed applying NUMA memory policy. + ApplyNumaPolicy(io::Error), } const ENABLE_FLAG: usize = 0; @@ -314,6 +327,7 @@ impl MemoryManager { prefault, zone.shared, zone.hugepages, + zone.host_numa_node, &ext_regions, )?); @@ -387,6 +401,14 @@ impl MemoryManager { let mut total_ram_size: u64 = 0; for zone in zones.iter() { total_ram_size += zone.size; + + if zone.shared && zone.host_numa_node.is_some() { + error!( + "Invalid to set host NUMA policy for a memory zone \ + mapped as 'shared'" + ); + return Err(Error::InvalidSharedMemoryZoneWithHostNuma); + } } (total_ram_size, zones) @@ -431,6 +453,7 @@ impl MemoryManager { false, config.shared, config.hugepages, + None, &None, )?); } @@ -594,6 +617,33 @@ impl MemoryManager { } } + fn mbind( + addr: *mut u8, + len: u64, + mode: u32, + nodemask: Vec, + maxnode: u64, + flags: u32, + ) -> Result<(), io::Error> { + let res = unsafe { + libc::syscall( + libc::SYS_mbind, + addr as *mut libc::c_void, + len, + mode, + nodemask.as_ptr(), + maxnode, + flags, + ) + }; + + if res < 0 { + Err(io::Error::last_os_error()) + } else { + Ok(()) + } + } + #[allow(clippy::too_many_arguments)] fn create_ram_region( file: &Option, @@ -603,6 +653,7 @@ impl MemoryManager { prefault: bool, shared: bool, hugepages: bool, + host_numa_node: Option, ext_regions: &Option>, ) -> Result, Error> { let mut backing_file: Option = file.clone(); @@ -716,6 +767,38 @@ impl MemoryManager { .unwrap(); } + // Apply NUMA policy if needed. + if let Some(node) = host_numa_node { + let addr = region.deref().as_ptr(); + let len = region.deref().size() as u64; + let mode = MPOL_BIND; + let mut nodemask: Vec = Vec::new(); + let flags = MPOL_MF_STRICT | MPOL_MF_MOVE; + + // Linux is kind of buggy in the way it interprets maxnode as it + // will cut off the last node. That's why we have to add 1 to what + // we would consider as the proper maxnode value. + let maxnode = node + 1 + 1; + + // Allocate the right size for the vector. + nodemask.resize((node as usize / 64) + 1, 0); + + // Fill the global bitmask through the nodemask vector. + let idx = (node / 64) as usize; + let shift = node % 64; + nodemask[idx] |= 1u64 << shift; + + // Policies are enforced by using MPOL_MF_MOVE flag as it will + // force the kernel to move all pages that might have been already + // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is + // used to throw an error if MPOL_MF_MOVE didn't succeed. + // MPOL_BIND is the selected mode as it specifies a strict policy + // that restricts memory allocation to the nodes specified in the + // nodemask. + Self::mbind(addr, len, mode, nodemask, maxnode, flags) + .map_err(Error::ApplyNumaPolicy)?; + } + Ok(Arc::new(region)) } @@ -783,6 +866,7 @@ impl MemoryManager { false, self.shared, self.hugepages, + None, &None, )?; diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index 670f333c7..558f43c75 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -289,6 +289,7 @@ fn vmm_thread_rules() -> Result, Error> { allow_syscall(libc::SYS_listen), allow_syscall(libc::SYS_lseek), allow_syscall(libc::SYS_madvise), + allow_syscall(libc::SYS_mbind), allow_syscall(libc::SYS_memfd_create), allow_syscall(libc::SYS_mmap), allow_syscall(libc::SYS_mprotect),