vmm: prefault memory in parallel to optimize boot time

On guests with large amounts of memory, using the `prefault` option can lead to a very long boot time. This commit implements the strategy taken by QEMU to prefault memory in parallel using multiple threads, decreasing the time to allocate memory for large guests by an order of magnitude or more.

For example, this commit reduces the time to allocate memory for a guest configured with 704 GiB of memory on 1 NUMA node using 1 GiB hugepages from 81.44134669s to just 6.865287881s.

Signed-off-by: Sean Banko <sbanko@crusoeenergy.com>
2025-01-22 04:25:21 +00:00 · 2024-01-29 10:46:25 -08:00 · 2024-01-29 10:46:25 -08:00 · 7633d47293
commit 7633d47293
parent 939cc348ed
1 changed files with 115 additions and 19 deletions
--- a/vmm/src/memory_manager.rs
+++ b/vmm/src/memory_manager.rs
@ -21,6 +21,7 @@ use arch::RegionType;
 use devices::ioapic;
 #[cfg(target_arch = "aarch64")]
 use hypervisor::HypervisorVmError;
+use libc::_SC_NPROCESSORS_ONLN;
 #[cfg(target_arch = "x86_64")]
 use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE};
 use serde::{Deserialize, Serialize};
@ -28,7 +29,6 @@ use serde::{Deserialize, Serialize};
 use std::collections::BTreeMap;
 use std::collections::HashMap;
 use std::convert::TryInto;
-use std::ffi;
 use std::fs::{File, OpenOptions};
 use std::io::{self};
 use std::ops::{BitAnd, Deref, Not, Sub};
@ -38,6 +38,7 @@ use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
 use std::path::PathBuf;
 use std::result;
 use std::sync::{Arc, Barrier, Mutex};
+use std::{ffi, thread};
 use tracer::trace_scoped;
 use versionize::{VersionMap, Versionize, VersionizeResult};
 use versionize_derive::Versionize;
@ -80,6 +81,8 @@ const MPOL_MF_MOVE: u32 = 1 << 1;
 // Reserve 1 MiB for platform MMIO devices (e.g. ACPI control devices)
 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20;

+// Upper bound on the number of threads used to prefault a memory region in parallel.
+const MAX_PREFAULT_THREAD_COUNT: usize = 16;
+
 #[derive(Clone, Default, Serialize, Deserialize, Versionize)]
 struct HotPlugState {
    base: u64,
@ -1420,10 +1423,6 @@ impl MemoryManager {
            None
        };

-        if prefault {
-            mmap_flags |= libc::MAP_POPULATE;
-        }
-
        let region = GuestRegionMmap::new(
            MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags)
                .map_err(Error::GuestMemoryRegion)?,
@ -1431,20 +1430,6 @@ impl MemoryManager {
        )
        .map_err(Error::GuestMemory)?;

-        if region.file_offset().is_none() && thp {
-            info!(
-                "Anonymous mapping at 0x{:x} (size = 0x{:x})",
-                region.as_ptr() as u64,
-                size
-            );
-            // SAFETY: FFI call with correct arguments
-            let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) };
-            if ret != 0 {
-                let e = io::Error::last_os_error();
-                warn!("Failed to mark pages as THP eligible: {}", e);
-            }
-        }
-
        // Apply NUMA policy if needed.
        if let Some(node) = host_numa_node {
            let addr = region.deref().as_ptr();
@ -1477,9 +1462,120 @@ impl MemoryManager {
                .map_err(Error::ApplyNumaPolicy)?;
        }

+        // Prefault the region if needed, in parallel.
+        if prefault {
+            let page_size =
+                Self::get_prefault_align_size(backing_file, hugepages, hugepage_size)? as usize;
+
+            if !is_aligned(size, page_size) {
+                warn!(
+                    "Prefaulting memory size {} misaligned with page size {}",
+                    size, page_size
+                );
+            }
+
+            let num_pages = size / page_size;
+
+            let num_threads = Self::get_prefault_num_threads(page_size, num_pages);
+
+            let pages_per_thread = num_pages / num_threads;
+            let remainder = num_pages % num_threads;
+
+            let barrier = Arc::new(Barrier::new(num_threads));
+            thread::scope(|s| {
+                let r = &region;
+                for i in 0..num_threads {
+                    let barrier = Arc::clone(&barrier);
+                    s.spawn(move || {
+                        // Wait until all threads have been spawned to avoid contention
+                        // over mmap_sem between thread stack allocation and page faulting.
+                        barrier.wait();
+                        let pages = pages_per_thread + if i < remainder { 1 } else { 0 };
+                        let offset =
+                            page_size * ((i * pages_per_thread) + std::cmp::min(i, remainder));
+                        // SAFETY: FFI call with correct arguments
+                        let ret = unsafe {
+                            let addr = r.as_ptr().add(offset);
+                            libc::madvise(addr as _, pages * page_size, libc::MADV_POPULATE_WRITE)
+                        };
+                        if ret != 0 {
+                            let e = io::Error::last_os_error();
+                            warn!("Failed to prefault pages: {}", e);
+                        }
+                    });
+                }
+            });
+        }
+
+        if region.file_offset().is_none() && thp {
+            info!(
+                "Anonymous mapping at 0x{:x} (size = 0x{:x})",
+                region.as_ptr() as u64,
+                size
+            );
+            // SAFETY: FFI call with correct arguments
+            let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) };
+            if ret != 0 {
+                let e = io::Error::last_os_error();
+                warn!("Failed to mark pages as THP eligible: {}", e);
+            }
+        }
+
        Ok(Arc::new(region))
    }

+    // Same logic as `memory_zone_get_align_size`, but driven by the individual
+    // settings instead of a `zone`.
+    //
+    // Returns the page granularity to use when prefaulting:
+    //  - the normal page size when `hugepages` is disabled,
+    //  - `hugepage_size` when it is given explicitly,
+    //  - otherwise the larger of the normal page size and the `statfs` block size of
+    //    `backing_file` (or of `/dev/hugepages` when there is no backing file).
+    fn get_prefault_align_size(
+        backing_file: &Option<PathBuf>,
+        hugepages: bool,
+        hugepage_size: Option<u64>,
+    ) -> Result<u64, Error> {
+        // SAFETY: FFI call. Trivially safe.
+        let base_page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };
+
+        if !hugepages {
+            return Ok(base_page_size);
+        }
+
+        if let Some(explicit_size) = hugepage_size {
+            return Ok(explicit_size);
+        }
+
+        // Hugepages are requested without an explicit size: ask the filesystem.
+        // With a backing file, `statfs` is called on that file; if its block size is
+        // larger than a normal page the file is taken to live on hugetlbfs and the
+        // block size is used. Without one, `/dev/hugepages` is queried to obtain the
+        // default hugepage size.
+        let statfs_path = match backing_file {
+            Some(pathbuf) => pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)?,
+            None => "/dev/hugepages",
+        };
+        let fs_block_size = statfs_get_bsize(statfs_path)?;
+
+        // Never go below the normal page size.
+        Ok(std::cmp::max(base_page_size, fs_block_size))
+    }
+
+    // Chooses how many threads to use for prefaulting `num_pages` pages of
+    // `page_size` bytes each.
+    //
+    // The result is the smallest of:
+    //  - the number of online processors, capped at `MAX_PREFAULT_THREAD_COUNT`
+    //    (a single thread is assumed if `sysconf` does not report a positive count),
+    //  - the number of pages to prefault,
+    //  - the number of whole 64 MiB chunks in the region,
+    // and is never less than 1, so callers can safely divide by it.
+    fn get_prefault_num_threads(page_size: usize, num_pages: usize) -> usize {
+        // Minimum amount of memory that justifies a dedicated thread (64 MiB).
+        const MIN_BYTES_PER_THREAD: usize = 64 << 20;
+
+        // Do not create more threads than processors available.
+        // SAFETY: FFI call. Trivially safe.
+        let procs = unsafe { libc::sysconf(_SC_NPROCESSORS_ONLN) };
+        let cpu_limit = if procs > 0 {
+            std::cmp::min(procs as usize, MAX_PREFAULT_THREAD_COUNT)
+        } else {
+            1
+        };
+
+        // Do not create threads to allocate less than 64 MiB of memory.
+        // Saturate instead of overflowing on absurdly large inputs.
+        let size_limit = page_size.saturating_mul(num_pages) / MIN_BYTES_PER_THREAD;
+
+        // Do not create more threads than pages being allocated, and always keep at
+        // least one thread even for an empty or tiny region.
+        cpu_limit.min(num_pages).min(size_limit).max(1)
+    }
+
    // Update the GuestMemoryMmap with the new range
    fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> {
        let guest_memory = self