Mirror of https://github.com/cloud-hypervisor/cloud-hypervisor.git
vmm: prefault memory in parallel to optimize boot time
On guests with large amounts of memory, using the `prefault` option can
lead to a very long boot time. This commit implements the strategy taken
by QEMU to prefault memory in parallel using multiple threads, decreasing
the time to allocate memory for large guests by an order of magnitude or
more. For example, this commit reduces the time to allocate memory for a
guest configured with 704 GiB of memory on 1 NUMA node using 1 GiB
hugepages from 81.44134669s to just 6.865287881s.

Signed-off-by: Sean Banko <sbanko@crusoeenergy.com>
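For context, prefaulting is requested through the guest's memory configuration; a setup matching the numbers above would look roughly like `--memory size=704G,hugepages=on,hugepage_size=1G,prefault=on` (parameter spellings per cloud-hypervisor's `--memory` syntax; treat the exact invocation as illustrative).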
Parent: 939cc348ed
Commit: 7633d47293
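Before the diff: a minimal standalone sketch of the technique the commit applies to each guest region, assuming the `libc` crate and a Linux kernel >= 5.14 (for MADV_POPULATE_WRITE). The function name and plain-pointer interface are illustrative, not cloud-hypervisor API. Each worker populates its own slice of the mapping with madvise(MADV_POPULATE_WRITE) once a barrier confirms all threads exist:

use std::sync::{Arc, Barrier};
use std::thread;

/// Fault in `size` bytes at `addr` using `num_threads` workers (>= 1).
/// `addr` must point at a writable mapping at least `size` bytes long,
/// and `size` should be a multiple of `page_size`.
fn prefault_parallel(addr: *mut u8, size: usize, page_size: usize, num_threads: usize) {
    let num_pages = size / page_size;
    let pages_per_thread = num_pages / num_threads;
    let remainder = num_pages % num_threads;
    let barrier = Arc::new(Barrier::new(num_threads));
    let base = addr as usize; // raw pointers are not Send; a usize is
    thread::scope(|s| {
        for i in 0..num_threads {
            let barrier = Arc::clone(&barrier);
            s.spawn(move || {
                // Start faulting only once every worker has been spawned, so
                // thread-stack setup and page faults don't contend on mmap_sem.
                barrier.wait();
                // The first `remainder` workers take one extra page each.
                let pages = pages_per_thread + usize::from(i < remainder);
                let offset = page_size * (i * pages_per_thread + i.min(remainder));
                // SAFETY: [base + offset, base + offset + pages * page_size)
                // stays within the caller's mapping.
                let ret = unsafe {
                    libc::madvise(
                        (base + offset) as *mut libc::c_void,
                        pages * page_size,
                        libc::MADV_POPULATE_WRITE,
                    )
                };
                if ret != 0 {
                    eprintln!("prefault failed: {}", std::io::Error::last_os_error());
                }
            });
        }
    });
}

Unlike MAP_POPULATE, which populates the whole mapping in one thread at mmap time, MADV_POPULATE_WRITE can be issued per-range from many threads, which is what makes the parallel split possible.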
@@ -21,6 +21,7 @@ use arch::RegionType;
 use devices::ioapic;
 #[cfg(target_arch = "aarch64")]
 use hypervisor::HypervisorVmError;
+use libc::_SC_NPROCESSORS_ONLN;
 #[cfg(target_arch = "x86_64")]
 use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE};
 use serde::{Deserialize, Serialize};
@@ -28,7 +29,6 @@ use serde::{Deserialize, Serialize};
 use std::collections::BTreeMap;
 use std::collections::HashMap;
 use std::convert::TryInto;
-use std::ffi;
 use std::fs::{File, OpenOptions};
 use std::io::{self};
 use std::ops::{BitAnd, Deref, Not, Sub};
@@ -38,6 +38,7 @@ use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
 use std::path::PathBuf;
 use std::result;
 use std::sync::{Arc, Barrier, Mutex};
+use std::{ffi, thread};
 use tracer::trace_scoped;
 use versionize::{VersionMap, Versionize, VersionizeResult};
 use versionize_derive::Versionize;
@@ -80,6 +81,8 @@ const MPOL_MF_MOVE: u32 = 1 << 1;
 // Reserve 1 MiB for platform MMIO devices (e.g. ACPI control devices)
 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20;
 
+const MAX_PREFAULT_THREAD_COUNT: usize = 16;
+
 #[derive(Clone, Default, Serialize, Deserialize, Versionize)]
 struct HotPlugState {
     base: u64,
@@ -1420,10 +1423,6 @@ impl MemoryManager {
             None
         };
 
-        if prefault {
-            mmap_flags |= libc::MAP_POPULATE;
-        }
-
         let region = GuestRegionMmap::new(
             MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags)
                 .map_err(Error::GuestMemoryRegion)?,
@@ -1431,20 +1430,6 @@ impl MemoryManager {
         )
         .map_err(Error::GuestMemory)?;
 
-        if region.file_offset().is_none() && thp {
-            info!(
-                "Anonymous mapping at 0x{:x} (size = 0x{:x})",
-                region.as_ptr() as u64,
-                size
-            );
-            // SAFETY: FFI call with correct arguments
-            let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) };
-            if ret != 0 {
-                let e = io::Error::last_os_error();
-                warn!("Failed to mark pages as THP eligible: {}", e);
-            }
-        }
-
         // Apply NUMA policy if needed.
         if let Some(node) = host_numa_node {
             let addr = region.deref().as_ptr();
@@ -1477,9 +1462,120 @@ impl MemoryManager {
             .map_err(Error::ApplyNumaPolicy)?;
         }
 
+        // Prefault the region if needed, in parallel.
+        if prefault {
+            let page_size =
+                Self::get_prefault_align_size(backing_file, hugepages, hugepage_size)? as usize;
+
+            if !is_aligned(size, page_size) {
+                warn!(
+                    "Prefaulting memory size {} misaligned with page size {}",
+                    size, page_size
+                );
+            }
+
+            let num_pages = size / page_size;
+
+            let num_threads = Self::get_prefault_num_threads(page_size, num_pages);
+
+            let pages_per_thread = num_pages / num_threads;
+            let remainder = num_pages % num_threads;
+
+            let barrier = Arc::new(Barrier::new(num_threads));
+            thread::scope(|s| {
+                let r = &region;
+                for i in 0..num_threads {
+                    let barrier = Arc::clone(&barrier);
+                    s.spawn(move || {
+                        // Wait until all threads have been spawned to avoid contention
+                        // over mmap_sem between thread stack allocation and page faulting.
+                        barrier.wait();
+                        let pages = pages_per_thread + if i < remainder { 1 } else { 0 };
+                        let offset =
+                            page_size * ((i * pages_per_thread) + std::cmp::min(i, remainder));
+                        // SAFETY: FFI call with correct arguments
+                        let ret = unsafe {
+                            let addr = r.as_ptr().add(offset);
+                            libc::madvise(addr as _, pages * page_size, libc::MADV_POPULATE_WRITE)
+                        };
+                        if ret != 0 {
+                            let e = io::Error::last_os_error();
+                            warn!("Failed to prefault pages: {}", e);
+                        }
+                    });
+                }
+            });
+        }
+
+        if region.file_offset().is_none() && thp {
+            info!(
+                "Anonymous mapping at 0x{:x} (size = 0x{:x})",
+                region.as_ptr() as u64,
+                size
+            );
+            // SAFETY: FFI call with correct arguments
+            let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) };
+            if ret != 0 {
+                let e = io::Error::last_os_error();
+                warn!("Failed to mark pages as THP eligible: {}", e);
+            }
+        }
+
         Ok(Arc::new(region))
     }
 
+    // Duplicate of `memory_zone_get_align_size` that does not require a `zone`
+    fn get_prefault_align_size(
+        backing_file: &Option<PathBuf>,
+        hugepages: bool,
+        hugepage_size: Option<u64>,
+    ) -> Result<u64, Error> {
+        // SAFETY: FFI call. Trivially safe.
+        let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };
+        match (hugepages, hugepage_size, backing_file) {
+            (false, _, _) => Ok(page_size),
+            (true, Some(hugepage_size), _) => Ok(hugepage_size),
+            (true, None, _) => {
+                // There are two scenarios here:
+                //  - `hugepages` is enabled but `hugepage_size` is not specified:
+                //    Call `statfs` for `/dev/hugepages` for getting the default size of hugepage
+                //  - The backing file is specified:
+                //    Call `statfs` for the file and get its `f_bsize`. If the value is larger than the page
+                //    size of normal page, just use the `f_bsize` because the file is in a hugetlbfs. If the
+                //    value is less than or equal to the page size, just use the page size.
+                let path = backing_file
+                    .as_ref()
+                    .map_or(Ok("/dev/hugepages"), |pathbuf| {
+                        pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)
+                    })?;
+                let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?);
+                Ok(align_size)
+            }
+        }
+    }
+
+    fn get_prefault_num_threads(page_size: usize, num_pages: usize) -> usize {
+        let mut n: usize = 1;
+
+        // Do not create more threads than processors available.
+        // SAFETY: FFI call. Trivially safe.
+        let procs = unsafe { libc::sysconf(_SC_NPROCESSORS_ONLN) };
+        if procs > 0 {
+            n = std::cmp::min(procs as usize, MAX_PREFAULT_THREAD_COUNT);
+        }
+
+        // Do not create more threads than pages being allocated.
+        n = std::cmp::min(n, num_pages);
+
+        // Do not create threads to allocate less than 64 MiB of memory.
+        n = std::cmp::min(
+            n,
+            std::cmp::max(1, page_size * num_pages / (64 * (1 << 26))),
+        );
+
+        n
+    }
+
     // Update the GuestMemoryMmap with the new range
     fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> {
         let guest_memory = self
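Worked example of the thread-count math above, using the configuration from the commit message and assuming a host with at least 16 online CPUs: page_size = 1 GiB = 2^30 and num_pages = 704, so the processor and page caps give n = min(16, 704) = 16, and the final cap leaves min(16, max(1, 704 * 2^30 / (64 * 2^26))) = min(16, 176) = 16. Each of the 16 workers then prefaults 704 / 16 = 44 pages (44 GiB) with a single madvise(MADV_POPULATE_WRITE) call.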