Mirror of https://github.com/cloud-hypervisor/cloud-hypervisor.git (synced 2024-12-22 05:35:20 +00:00)
block: Set an option to pin virtio block threads to host cpus
Currently the only way to set the affinity of virtio block threads is to boot the VM, search for the TID of each virtio block thread, and then set the affinity manually. This commit adds an option to pin virtio block queues to specific host CPUs (similar to pinning vCPUs to host CPUs). A queue_affinity option has been added to the --disk flag in the CLI to specify a mapping of queue indices to host CPUs.

Signed-off-by: acarp <acarp@crusoeenergy.com>
parent ee0cf3a715
commit 035c4b20fb
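For illustration, the new option would be used on the command line roughly as follows. This is a hypothetical invocation: the kernel, cmdline and disk paths are placeholders, while the queue_affinity value reuses the <queue_index>@[<host_cpus>] list syntax exercised by the new integration test below.

    cloud-hypervisor \
        --cpus boot=4 \
        --memory size=512M \
        --kernel <vmlinux> \
        --cmdline "<kernel command line>" \
        --disk "path=<os.img>" \
               "path=<data.img>,num_queues=4,queue_affinity=[0@[0,2],1@[1,3],2@[1],3@[3]]"

Here queue 0 of the second disk is pinned to host CPUs 0 and 2, queue 1 to CPUs 1 and 3, queue 2 to CPU 1 and queue 3 to CPU 3, which is the same mapping the integration test asserts via taskset.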
@@ -21,6 +21,7 @@ use virtio_devices::{Block, VirtioDevice, VirtioInterrupt, VirtioInterruptType};
 use virtio_queue::{Queue, QueueT};
 use vm_memory::{bitmap::AtomicBitmap, Bytes, GuestAddress, GuestMemoryAtomic};
 use vmm_sys_util::eventfd::{EventFd, EFD_NONBLOCK};
+use std::collections::BTreeMap;

 type GuestMemoryMmap = vm_memory::GuestMemoryMmap<AtomicBitmap>;

@@ -49,6 +50,7 @@ fuzz_target!(|bytes| {
     let shm = memfd_create(&ffi::CString::new("fuzz").unwrap(), 0).unwrap();
     let disk_file: File = unsafe { File::from_raw_fd(shm) };
     let qcow_disk = Box::new(RawFileDiskSync::new(disk_file)) as Box<dyn DiskFile>;
+    let queue_affinity = BTreeMap::new();
     let mut block = Block::new(
         "tmp".to_owned(),
         qcow_disk,
@@ -62,6 +64,7 @@ fuzz_target!(|bytes| {
         None,
         EventFd::new(EFD_NONBLOCK).unwrap(),
         None,
+        queue_affinity,
     )
     .unwrap();

@@ -2575,7 +2575,63 @@ mod common_parallel {

         let _ = child.kill();
         let output = child.wait_with_output().unwrap();
         handle_child_output(r, &output);
     }

+    #[test]
+    fn test_virtio_queue_affinity() {
+        let focal = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string());
+        let guest = Guest::new(Box::new(focal));
+
+        // We need the host to have at least 4 CPUs if we want to be able
+        // to run this test.
+        let host_cpus_count = exec_host_command_output("nproc");
+        assert!(
+            String::from_utf8_lossy(&host_cpus_count.stdout)
+                .trim()
+                .parse::<u16>()
+                .unwrap_or(0)
+                >= 4
+        );
+
+        let mut child = GuestCommand::new(&guest)
+            .args(["--cpus", "boot=4"])
+            .args(["--memory", "size=512M"])
+            .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()])
+            .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE])
+            .args([
+                "--disk",
+                format!(
+                    "path={}",
+                    guest.disk_config.disk(DiskType::OperatingSystem).unwrap()
+                )
+                .as_str(),
+                format!(
+                    "path={},num_queues=4,queue_affinity=[0@[0,2],1@[1,3],2@[1],3@[3]]",
+                    guest.disk_config.disk(DiskType::CloudInit).unwrap()
+                )
+                .as_str(),
+            ])
+            .default_net()
+            .capture_output()
+            .spawn()
+            .unwrap();
+
+        let r = std::panic::catch_unwind(|| {
+            guest.wait_vm_boot(None).unwrap();
+            let pid = child.id();
+            let taskset_q0 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q0 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str());
+            assert_eq!(String::from_utf8_lossy(&taskset_q0.stdout).trim(), "0,2");
+            let taskset_q1 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q1 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str());
+            assert_eq!(String::from_utf8_lossy(&taskset_q1.stdout).trim(), "1,3");
+            let taskset_q2 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q2 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str());
+            assert_eq!(String::from_utf8_lossy(&taskset_q2.stdout).trim(), "1");
+            let taskset_q3 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q3 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str());
+            assert_eq!(String::from_utf8_lossy(&taskset_q3.stdout).trim(), "3");
+        });
+
+        let _ = child.kill();
+        let output = child.wait_with_output().unwrap();
+        handle_child_output(r, &output);
+    }

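Outside the test harness the same check can be done by hand. A rough sketch, assuming the disk id is disk1 (so its queue threads show up under names like disk1_q0, which is what the test greps for) and <PID> is the cloud-hypervisor process id:

    ps -T -p <PID> | grep disk1_q0   # locate the queue 0 thread; its TID is in the second column
    taskset -pc <TID>                # print the host CPUs this thread is allowed to run on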
@@ -25,6 +25,7 @@ use block::{
 use rate_limiter::group::{RateLimiterGroup, RateLimiterGroupHandle};
 use rate_limiter::TokenType;
 use seccompiler::SeccompAction;
+use std::collections::BTreeMap;
 use std::collections::VecDeque;
 use std::io;
 use std::num::Wrapping;
@@ -134,6 +135,7 @@ struct BlockEpollHandler {
     rate_limiter: Option<RateLimiterGroupHandle>,
     access_platform: Option<Arc<dyn AccessPlatform>>,
     read_only: bool,
+    host_cpus: Option<Vec<usize>>,
 }

 impl BlockEpollHandler {
@@ -408,6 +410,41 @@ impl BlockEpollHandler {
         })
     }

+    fn set_queue_thread_affinity(&self) {
+        // Prepare the CPU set the current queue thread is expected to run onto.
+        let cpuset = self.host_cpus.as_ref().map(|host_cpus| {
+            // SAFETY: all zeros is a valid pattern
+            let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
+            // SAFETY: FFI call, trivially safe
+            unsafe { libc::CPU_ZERO(&mut cpuset) };
+            for host_cpu in host_cpus {
+                // SAFETY: FFI call, trivially safe
+                unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) };
+            }
+            cpuset
+        });
+
+        // Schedule the thread to run on the expected CPU set
+        if let Some(cpuset) = cpuset.as_ref() {
+            // SAFETY: FFI call with correct arguments
+            let ret = unsafe {
+                libc::sched_setaffinity(
+                    0,
+                    std::mem::size_of::<libc::cpu_set_t>(),
+                    cpuset as *const libc::cpu_set_t,
+                )
+            };
+
+            if ret != 0 {
+                error!(
+                    "Failed scheduling the virtqueue thread {} on the expected CPU set: {}",
+                    self.queue_index,
+                    io::Error::last_os_error()
+                )
+            }
+        }
+    }
+
     fn run(
         &mut self,
         paused: Arc<AtomicBool>,
@@ -419,6 +456,7 @@ impl BlockEpollHandler {
         if let Some(rate_limiter) = &self.rate_limiter {
             helper.add_event(rate_limiter.as_raw_fd(), RATE_LIMITER_EVENT)?;
         }
+        self.set_queue_thread_affinity();
         helper.run(paused, paused_sync, self)?;

         Ok(())
@@ -511,6 +549,7 @@ pub struct Block {
     exit_evt: EventFd,
     read_only: bool,
     serial: Vec<u8>,
+    queue_affinity: BTreeMap<u16, Vec<usize>>,
 }

 #[derive(Versionize)]
@@ -540,6 +579,7 @@ impl Block {
         rate_limiter: Option<Arc<RateLimiterGroup>>,
         exit_evt: EventFd,
         state: Option<BlockState>,
+        queue_affinity: BTreeMap<u16, Vec<usize>>,
     ) -> io::Result<Self> {
         let (disk_nsectors, avail_features, acked_features, config, paused) =
             if let Some(state) = state {
@@ -643,6 +683,7 @@ impl Block {
             exit_evt,
             read_only,
             serial,
+            queue_affinity,
         })
     }

@@ -746,9 +787,10 @@ impl VirtioDevice for Block {
             let (_, queue, queue_evt) = queues.remove(0);
             let queue_size = queue.size();
             let (kill_evt, pause_evt) = self.common.dup_eventfds();
+            let queue_idx = i as u16;

             let mut handler = BlockEpollHandler {
-                queue_index: i as u16,
+                queue_index: queue_idx,
                 queue,
                 mem: mem.clone(),
                 disk_image: self
@@ -778,6 +820,7 @@ impl VirtioDevice for Block {
                     .unwrap(),
                 access_platform: self.common.access_platform.clone(),
                 read_only: self.read_only,
+                host_cpus: self.queue_affinity.get(&queue_idx).cloned(),
             };

             let paused = self.common.paused.clone();
@@ -96,6 +96,7 @@ fn virtio_block_thread_rules() -> Vec<(i64, Vec<SeccompRule>)> {
         (libc::SYS_pwritev, vec![]),
         (libc::SYS_pwrite64, vec![]),
         (libc::SYS_sched_getaffinity, vec![]),
+        (libc::SYS_sched_setaffinity, vec![]),
         (libc::SYS_set_robust_list, vec![]),
         (libc::SYS_timerfd_settime, vec![]),
     ]
@@ -829,6 +829,19 @@ components:
         rate_limiter_config:
           $ref: "#/components/schemas/RateLimiterConfig"

+    VirtQueueAffinity:
+      required:
+        - queue_index
+        - host_cpus
+      type: object
+      properties:
+        queue_index:
+          type: integer
+        host_cpus:
+          type: array
+          items:
+            type: integer
+
     DiskConfig:
       required:
         - path
@@ -867,6 +880,10 @@ components:
           type: string
         rate_limit_group:
           type: string
+        affinity:
+          type: array
+          items:
+            $ref: "#/components/schemas/VirtQueueAffinity"

     NetConfig:
       type: object
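The same mapping can presumably also be supplied through the HTTP API via the affinity field of DiskConfig defined above. A minimal sketch, assuming the usual vm.add-disk endpoint and an API socket at /tmp/ch.sock (both illustrative, as are the path and queue values):

    curl --unix-socket /tmp/ch.sock -i \
        -X PUT 'http://localhost/api/v1/vm.add-disk' \
        -H 'Content-Type: application/json' \
        -d '{
              "path": "<data.img>",
              "num_queues": 2,
              "affinity": [
                {"queue_index": 0, "host_cpus": [0, 2]},
                {"queue_index": 1, "host_cpus": [1, 3]}
              ]
            }'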
@@ -1001,7 +1001,8 @@ impl DiskConfig {
         vhost_user=on|off,socket=<vhost_user_socket_path>,\
         bw_size=<bytes>,bw_one_time_burst=<bytes>,bw_refill_time=<ms>,\
         ops_size=<io_ops>,ops_one_time_burst=<io_ops>,ops_refill_time=<ms>,\
-        id=<device_id>,pci_segment=<segment_id>,rate_limit_group=<group_id>\"";
+        id=<device_id>,pci_segment=<segment_id>,rate_limit_group=<group_id>,\
+        queue_affinity=<list_of_queue_indices_with_their_associated_cpuset>\"";

     pub fn parse(disk: &str) -> Result<Self> {
         let mut parser = OptionParser::new();
@@ -1025,7 +1026,8 @@ impl DiskConfig {
             .add("_disable_aio")
             .add("pci_segment")
             .add("serial")
-            .add("rate_limit_group");
+            .add("rate_limit_group")
+            .add("queue_affinity");
         parser.parse(disk).map_err(Error::ParseDisk)?;

         let path = parser.get("path").map(PathBuf::from);
@@ -1099,6 +1101,17 @@ impl DiskConfig {
             .map_err(Error::ParseDisk)?
             .unwrap_or_default();
         let serial = parser.get("serial");
+        let queue_affinity = parser
+            .convert::<Tuple<u16, Vec<usize>>>("queue_affinity")
+            .map_err(Error::ParseDisk)?
+            .map(|v| {
+                v.0.iter()
+                    .map(|(e1, e2)| VirtQueueAffinity {
+                        queue_index: *e1,
+                        host_cpus: e2.clone(),
+                    })
+                    .collect()
+            });
         let bw_tb_config = if bw_size != 0 && bw_refill_time != 0 {
             Some(TokenBucketConfig {
                 size: bw_size,
@@ -1142,6 +1155,7 @@ impl DiskConfig {
             disable_aio,
             pci_segment,
             serial,
+            queue_affinity,
         })
     }

@@ -2922,6 +2936,7 @@ mod tests {
             rate_limiter_config: None,
             pci_segment: 0,
             serial: None,
+            queue_affinity: None,
         }
     }

@@ -2992,6 +3007,30 @@ mod tests {
                 ..disk_fixture()
             }
         );
+        assert_eq!(
+            DiskConfig::parse("path=/path/to_file,queue_affinity=[0@[1],1@[2],2@[3,4],3@[5-8]]")?,
+            DiskConfig {
+                queue_affinity: Some(vec![
+                    VirtQueueAffinity {
+                        queue_index: 0,
+                        host_cpus: vec![1],
+                    },
+                    VirtQueueAffinity {
+                        queue_index: 1,
+                        host_cpus: vec![2],
+                    },
+                    VirtQueueAffinity {
+                        queue_index: 2,
+                        host_cpus: vec![3, 4],
+                    },
+                    VirtQueueAffinity {
+                        queue_index: 3,
+                        host_cpus: vec![5, 6, 7, 8],
+                    }
+                ]),
+                ..disk_fixture()
+            }
+        );
         Ok(())
     }

@@ -64,7 +64,7 @@ use pci::{
 use rate_limiter::group::RateLimiterGroup;
 use seccompiler::SeccompAction;
 use serde::{Deserialize, Serialize};
-use std::collections::{BTreeSet, HashMap};
+use std::collections::{BTreeMap, BTreeSet, HashMap};
 use std::fs::{read_link, File, OpenOptions};
 use std::io::{self, stdout, Seek, SeekFrom};
 use std::mem::zeroed;
@@ -2512,6 +2512,15 @@ impl DeviceManager {
             None
         };

+        let queue_affinity = if let Some(queue_affinity) = disk_cfg.queue_affinity.as_ref() {
+            queue_affinity
+                .iter()
+                .map(|a| (a.queue_index, a.host_cpus.clone()))
+                .collect()
+        } else {
+            BTreeMap::new()
+        };
+
         let virtio_block = Arc::new(Mutex::new(
             virtio_devices::Block::new(
                 id.clone(),
@@ -2535,6 +2544,7 @@ impl DeviceManager {
                     .map(|s| s.to_versioned_state())
                     .transpose()
                     .map_err(DeviceManagerError::RestoreGetState)?,
+                queue_affinity,
             )
             .map_err(DeviceManagerError::CreateVirtioBlock)?,
         ));
@@ -187,6 +187,12 @@ pub struct RateLimiterGroupConfig {
     pub rate_limiter_config: RateLimiterConfig,
 }

+#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)]
+pub struct VirtQueueAffinity {
+    pub queue_index: u16,
+    pub host_cpus: Vec<usize>,
+}
+
 #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)]
 pub struct DiskConfig {
     pub path: Option<PathBuf>,
@@ -219,6 +225,8 @@ pub struct DiskConfig {
     pub pci_segment: u16,
     #[serde(default)]
     pub serial: Option<String>,
+    #[serde(default)]
+    pub queue_affinity: Option<Vec<VirtQueueAffinity>>,
 }

 pub const DEFAULT_DISK_NUM_QUEUES: usize = 1;