cloud-hypervisor/vhost_user_block/src/lib.rs
Sergio Lopez 5c06b7f862 vhost_user_block: Implement optional static polling
Actively polling the virtqueue significantly reduces the latency of
each I/O operation, at the expense of using more CPU time. This
features is specially useful when using low-latency devices (SSD,
NVMe) as the backend.

This change implements static polling. When a request arrives after
being idle, vhost_user_block will keep checking the virtqueue for new
requests, until POLL_QUEUE_US (50us) has passed without finding one.

POLL_QUEUE_US is defined to be 50us, based on the current latency of
enterprise SSDs (< 30us) and the overhead of the emulation.

This feature is enabled by default, and can be disabled by using the
"poll_queue" parameter of "block-backend".

This is a test using null_blk as a backend for the image, with the
following parameters:

 - null_blk gb=20 nr_devices=1 irqmode=2 completion_nsec=0 no_sched=1

With "poll_queue=false":

fio --ioengine=sync --bs=4k --rw randread --name randread --direct=1
--filename=/dev/vdb --time_based --runtime=10

randread: (g=0): rw=randread, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 4096B-4096B, ioengine=sync, iodepth=1
fio-3.14
Starting 1 process
Jobs: 1 (f=1): [r(1)][100.0%][r=169MiB/s][r=43.2k IOPS][eta 00m:00s]
randread: (groupid=0, jobs=1): err= 0: pid=433: Tue Feb 18 11:12:59 2020
  read: IOPS=43.2k, BW=169MiB/s (177MB/s)(1688MiB/10001msec)
    clat (usec): min=17, max=836, avg=21.64, stdev= 3.81
     lat (usec): min=17, max=836, avg=21.77, stdev= 3.81
    clat percentiles (nsec):
     |  1.00th=[19328],  5.00th=[19840], 10.00th=[20352], 20.00th=[21120],
     | 30.00th=[21376], 40.00th=[21376], 50.00th=[21376], 60.00th=[21632],
     | 70.00th=[21632], 80.00th=[21888], 90.00th=[22144], 95.00th=[22912],
     | 99.00th=[28544], 99.50th=[30336], 99.90th=[39168], 99.95th=[42752],
     | 99.99th=[71168]
   bw (  KiB/s): min=168440, max=188496, per=100.00%, avg=172912.00, stdev=3975.63, samples=19
   iops        : min=42110, max=47124, avg=43228.00, stdev=993.91, samples=19
  lat (usec)   : 20=5.90%, 50=94.08%, 100=0.02%, 250=0.01%, 500=0.01%
  lat (usec)   : 750=0.01%, 1000=0.01%
  cpu          : usr=10.35%, sys=25.82%, ctx=432417, majf=0, minf=10
  IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwts: total=432220,0,0,0 short=0,0,0,0 dropped=0,0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=1

Run status group 0 (all jobs):
   READ: bw=169MiB/s (177MB/s), 169MiB/s-169MiB/s (177MB/s-177MB/s), io=1688MiB (1770MB), run=10001-10001msec

Disk stats (read/write):
  vdb: ios=427867/0, merge=0/0, ticks=7346/0, in_queue=0, util=99.04%

With "poll_queue=true" (default):

fio --ioengine=sync --bs=4k --rw randread --name randread --direct=1
--filename=/dev/vdb --time_based --runtime=10

randread: (g=0): rw=randread, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 4096B-4096B, ioengine=sync, iodepth=1
fio-3.14
Starting 1 process
Jobs: 1 (f=1): [r(1)][100.0%][r=260MiB/s][r=66.7k IOPS][eta 00m:00s]
randread: (groupid=0, jobs=1): err= 0: pid=422: Tue Feb 18 11:14:47 2020
  read: IOPS=68.5k, BW=267MiB/s (280MB/s)(2674MiB/10001msec)
    clat (usec): min=10, max=966, avg=13.60, stdev= 3.49
     lat (usec): min=10, max=966, avg=13.70, stdev= 3.50
    clat percentiles (nsec):
     |  1.00th=[11200],  5.00th=[11968], 10.00th=[11968], 20.00th=[12224],
     | 30.00th=[12992], 40.00th=[13504], 50.00th=[13760], 60.00th=[13888],
     | 70.00th=[14016], 80.00th=[14144], 90.00th=[14272], 95.00th=[14656],
     | 99.00th=[20352], 99.50th=[23936], 99.90th=[35072], 99.95th=[36096],
     | 99.99th=[47872]
   bw (  KiB/s): min=265456, max=296456, per=100.00%, avg=274229.05, stdev=13048.14, samples=19
   iops        : min=66364, max=74114, avg=68557.26, stdev=3262.03, samples=19
  lat (usec)   : 20=98.84%, 50=1.15%, 100=0.01%, 250=0.01%, 500=0.01%
  lat (usec)   : 750=0.01%, 1000=0.01%
  cpu          : usr=8.24%, sys=21.15%, ctx=684669, majf=0, minf=10
  IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwts: total=684611,0,0,0 short=0,0,0,0 dropped=0,0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=1

Run status group 0 (all jobs):
   READ: bw=267MiB/s (280MB/s), 267MiB/s-267MiB/s (280MB/s-280MB/s), io=2674MiB (2804MB), run=10001-10001msec

Disk stats (read/write):
  vdb: ios=677855/0, merge=0/0, ticks=7026/0, in_queue=0, util=99.04%

Signed-off-by: Sergio Lopez <slp@redhat.com>
2020-02-19 17:13:47 +00:00

452 lines
14 KiB
Rust

// Copyright 2019 Red Hat, Inc. All Rights Reserved.
//
// Portions Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Portions Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
//
// SPDX-License-Identifier: (Apache-2.0 AND BSD-3-Clause)
extern crate log;
extern crate vhost_rs;
extern crate vhost_user_backend;
extern crate vm_virtio;
use epoll;
use libc::EFD_NONBLOCK;
use log::*;
use qcow::{self, ImageType, QcowFile};
use std::fs::File;
use std::fs::OpenOptions;
use std::io::Read;
use std::io::{Seek, SeekFrom, Write};
use std::mem;
use std::num::Wrapping;
use std::os::unix::fs::OpenOptionsExt;
use std::path::PathBuf;
use std::process;
use std::slice;
use std::sync::{Arc, RwLock};
use std::time::Instant;
use std::vec::Vec;
use std::{convert, error, fmt, io};
use vhost_rs::vhost_user::message::*;
use vhost_user_backend::{VhostUserBackend, VhostUserDaemon, Vring, VringWorker};
use virtio_bindings::bindings::virtio_blk::*;
use virtio_bindings::bindings::virtio_ring::VIRTIO_RING_F_EVENT_IDX;
use vm_memory::{Bytes, GuestMemoryError, GuestMemoryMmap};
use vm_virtio::block::{build_disk_image_id, Request};
use vmm_sys_util::eventfd::EventFd;
const QUEUE_SIZE: usize = 1024;
const SECTOR_SHIFT: u8 = 9;
const SECTOR_SIZE: u64 = (0x01 as u64) << SECTOR_SHIFT;
const BLK_SIZE: u32 = 512;
// Current (2020) enterprise SSDs have a latency lower than 30us.
// Polling for 50us should be enough to cover for the device latency
// and the overhead of the emulation layer.
const POLL_QUEUE_US: u128 = 50;
trait DiskFile: Read + Seek + Write + Send + Sync {}
impl<D: Read + Seek + Write + Send + Sync> DiskFile for D {}
pub type Result<T> = std::result::Result<T, Error>;
pub type VhostUserBackendResult<T> = std::result::Result<T, std::io::Error>;
#[derive(Debug)]
pub enum Error {
/// Failed to detect image type.
DetectImageType,
/// Bad memory address.
GuestMemory(GuestMemoryError),
/// Can't open image file.
OpenImage,
/// Failed to parse direct parameter.
ParseDirectParam,
/// Failed to parse image parameter.
ParseImageParam,
/// Failed to parse sock parameter.
ParseSockParam,
/// Failed to parse readonly parameter.
ParseReadOnlyParam,
/// Failed parsing fs number of queues parameter.
ParseBlkNumQueuesParam(std::num::ParseIntError),
/// Failed to parse the poll_queue parameter.
ParsePollQueueParam,
/// Failed to handle event other than input event.
HandleEventNotEpollIn,
/// Failed to create kill eventfd
CreateKillEventFd(io::Error),
/// Failed to handle unknown event.
HandleEventUnknownEvent,
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "vhost_user_block_error: {:?}", self)
}
}
impl error::Error for Error {}
impl convert::From<Error> for io::Error {
fn from(e: Error) -> Self {
io::Error::new(io::ErrorKind::Other, e)
}
}
pub struct VhostUserBlkBackend {
mem: Option<GuestMemoryMmap>,
vring_worker: Option<Arc<VringWorker>>,
disk_image: Box<dyn DiskFile>,
disk_image_id: Vec<u8>,
disk_nsectors: u64,
config: virtio_blk_config,
rdonly: bool,
poll_queue: bool,
event_idx: bool,
kill_evt: EventFd,
}
impl VhostUserBlkBackend {
pub fn new(
image_path: String,
num_queues: usize,
rdonly: bool,
direct: bool,
poll_queue: bool,
) -> Result<Self> {
let mut options = OpenOptions::new();
options.read(true);
options.write(!rdonly);
if direct {
options.custom_flags(libc::O_DIRECT);
}
let image: File = options.open(&image_path).unwrap();
let mut raw_img: vm_virtio::RawFile = vm_virtio::RawFile::new(image, direct);
let image_id = build_disk_image_id(&PathBuf::from(&image_path));
let image_type = qcow::detect_image_type(&mut raw_img).unwrap();
let mut image = match image_type {
ImageType::Raw => Box::new(raw_img) as Box<dyn DiskFile>,
ImageType::Qcow2 => Box::new(QcowFile::from(raw_img).unwrap()) as Box<dyn DiskFile>,
};
let nsectors = (image.seek(SeekFrom::End(0)).unwrap() as u64) / SECTOR_SIZE;
let mut config = virtio_blk_config::default();
config.capacity = nsectors;
config.blk_size = BLK_SIZE;
config.size_max = 65535;
config.seg_max = 128 - 2;
config.min_io_size = 1;
config.opt_io_size = 1;
config.num_queues = num_queues as u16;
config.wce = 1;
Ok(VhostUserBlkBackend {
mem: None,
vring_worker: None,
disk_image: image,
disk_image_id: image_id,
disk_nsectors: nsectors,
config,
rdonly,
poll_queue,
event_idx: false,
kill_evt: EventFd::new(EFD_NONBLOCK).map_err(Error::CreateKillEventFd)?,
})
}
pub fn process_queue(&mut self, vring: &mut Vring) -> bool {
let mut used_any = false;
let mem = match self.mem.as_ref() {
Some(m) => m,
None => return false,
};
while let Some(head) = vring.mut_queue().iter(mem).next() {
debug!("got an element in the queue");
let len;
match Request::parse(&head, mem) {
Ok(request) => {
debug!("element is a valid request");
let status = match request.execute(
&mut self.disk_image,
self.disk_nsectors,
mem,
&self.disk_image_id,
) {
Ok(l) => {
len = l;
VIRTIO_BLK_S_OK
}
Err(e) => {
len = 1;
e.status()
}
};
mem.write_obj(status, request.status_addr).unwrap();
}
Err(err) => {
error!("failed to parse available descriptor chain: {:?}", err);
len = 0;
}
}
if self.event_idx {
if let Some(used_idx) = vring.mut_queue().add_used(mem, head.index, len) {
let used_event = vring.mut_queue().get_used_event(mem);
if vring.needs_notification(Wrapping(used_idx), used_event) {
debug!("signalling queue");
vring.signal_used_queue().unwrap();
} else {
debug!("omitting signal (event_idx)");
}
used_any = true;
}
} else {
debug!("signalling queue");
vring.mut_queue().add_used(mem, head.index, len);
vring.signal_used_queue().unwrap();
used_any = true;
}
}
used_any
}
pub fn set_vring_worker(&mut self, vring_worker: Option<Arc<VringWorker>>) {
self.vring_worker = vring_worker;
}
}
impl VhostUserBackend for VhostUserBlkBackend {
fn num_queues(&self) -> usize {
self.config.num_queues as usize
}
fn max_queue_size(&self) -> usize {
QUEUE_SIZE
}
fn features(&self) -> u64 {
let mut avail_features = 1 << VIRTIO_BLK_F_MQ
| 1 << VIRTIO_BLK_F_CONFIG_WCE
| 1 << VIRTIO_RING_F_EVENT_IDX
| 1 << VIRTIO_F_VERSION_1
| VhostUserVirtioFeatures::PROTOCOL_FEATURES.bits();
if self.rdonly {
avail_features |= 1 << VIRTIO_BLK_F_RO;
}
avail_features
}
fn protocol_features(&self) -> VhostUserProtocolFeatures {
VhostUserProtocolFeatures::CONFIG
}
fn set_event_idx(&mut self, enabled: bool) {
self.event_idx = enabled;
}
fn update_memory(&mut self, mem: GuestMemoryMmap) -> VhostUserBackendResult<()> {
self.mem = Some(mem);
Ok(())
}
fn handle_event(
&mut self,
device_event: u16,
evset: epoll::Events,
vrings: &[Arc<RwLock<Vring>>],
) -> VhostUserBackendResult<bool> {
if evset != epoll::Events::EPOLLIN {
return Err(Error::HandleEventNotEpollIn.into());
}
debug!("event received: {:?}", device_event);
match device_event {
q if device_event < self.config.num_queues => {
let mut vring = vrings[q as usize].write().unwrap();
if self.poll_queue {
// Actively poll the queue until POLL_QUEUE_US has passed
// without seeing a new request.
let mut now = Instant::now();
loop {
if self.process_queue(&mut vring) {
now = Instant::now();
} else if now.elapsed().as_micros() > POLL_QUEUE_US {
break;
}
}
}
if self.event_idx {
// vm-virtio's Queue implementation only checks avail_index
// once, so to properly support EVENT_IDX we need to keep
// calling process_queue() until it stops finding new
// requests on the queue.
loop {
vring
.mut_queue()
.update_avail_event(self.mem.as_ref().unwrap());
if !self.process_queue(&mut vring) {
break;
}
}
} else {
// Without EVENT_IDX, a single call is enough.
self.process_queue(&mut vring);
}
Ok(false)
}
_ => Err(Error::HandleEventUnknownEvent.into()),
}
}
fn get_config(&self, _offset: u32, _size: u32) -> Vec<u8> {
// self.config is a statically allocated virtio_blk_config
let buf = unsafe {
slice::from_raw_parts(
&self.config as *const virtio_blk_config as *const _,
mem::size_of::<virtio_blk_config>(),
)
};
buf.to_vec()
}
fn exit_event(&self) -> Option<(EventFd, Option<u16>)> {
Some((self.kill_evt.try_clone().unwrap(), None))
}
}
pub struct VhostUserBlkBackendConfig<'a> {
pub image: &'a str,
pub sock: &'a str,
pub num_queues: usize,
pub readonly: bool,
pub direct: bool,
pub poll_queue: bool,
}
impl<'a> VhostUserBlkBackendConfig<'a> {
pub fn parse(backend: &'a str) -> Result<Self> {
let params_list: Vec<&str> = backend.split(',').collect();
let mut image: &str = "";
let mut sock: &str = "";
let mut num_queues_str: &str = "";
let mut readonly: bool = false;
let mut direct: bool = false;
let mut poll_queue: bool = true;
for param in params_list.iter() {
if param.starts_with("image=") {
image = &param[6..];
} else if param.starts_with("sock=") {
sock = &param[5..];
} else if param.starts_with("num_queues=") {
num_queues_str = &param[11..];
} else if param.starts_with("readonly=") {
readonly = match param[9..].parse::<bool>() {
Ok(b) => b,
Err(_) => return Err(Error::ParseReadOnlyParam),
}
} else if param.starts_with("direct=") {
direct = match param[7..].parse::<bool>() {
Ok(b) => b,
Err(_) => return Err(Error::ParseDirectParam),
}
} else if param.starts_with("poll_queue=") {
poll_queue = match param[11..].parse::<bool>() {
Ok(b) => b,
Err(_) => return Err(Error::ParsePollQueueParam),
}
}
}
let mut num_queues: usize = 1;
if image.is_empty() {
return Err(Error::ParseImageParam);
}
if sock.is_empty() {
return Err(Error::ParseSockParam);
}
if !num_queues_str.is_empty() {
num_queues = num_queues_str
.parse()
.map_err(Error::ParseBlkNumQueuesParam)?;
}
Ok(VhostUserBlkBackendConfig {
image,
sock,
num_queues,
readonly,
direct,
poll_queue,
})
}
}
pub fn start_block_backend(backend_command: &str) {
let backend_config = match VhostUserBlkBackendConfig::parse(backend_command) {
Ok(config) => config,
Err(e) => {
println!("Failed parsing parameters {:?}", e);
process::exit(1);
}
};
let blk_backend = Arc::new(RwLock::new(
VhostUserBlkBackend::new(
backend_config.image.to_string(),
backend_config.num_queues,
backend_config.readonly,
backend_config.direct,
backend_config.poll_queue,
)
.unwrap(),
));
debug!("blk_backend is created!\n");
let name = "vhost-user-blk-backend";
let mut blk_daemon = VhostUserDaemon::new(
name.to_string(),
backend_config.sock.to_string(),
blk_backend.clone(),
)
.unwrap();
debug!("blk_daemon is created!\n");
let vring_worker = blk_daemon.get_vring_worker();
blk_backend
.write()
.unwrap()
.set_vring_worker(Some(vring_worker));
if let Err(e) = blk_daemon.start() {
error!(
"Failed to start daemon for vhost-user-block with error: {:?}\n",
e
);
process::exit(1);
}
if let Err(e) = blk_daemon.wait() {
error!("Error from the main thread: {:?}", e);
}
let kill_evt = &blk_backend.write().unwrap().kill_evt;
if let Err(e) = kill_evt.write(1) {
error!("Error shutting down worker thread: {:?}", e)
}
}