cloud-hypervisor/vm-virtio/src/block.rs
Rob Bradford 1151b07682 vm-virtio: block: Add support for resetting a block device
As it is necessary to return the interrupt EventFd and the queue EventFd
to the transport layer upon reset, the activate function has been
modified to clone these descriptors, as well as the underlying disk
image itself.

Signed-off-by: Rob Bradford <robert.bradford@intel.com>
2019-05-09 15:44:18 +02:00
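
The clone-and-keep pattern the commit describes, in a standalone sketch (using the same vmm_sys_util::EventFd API the file imports below):

    use libc::EFD_NONBLOCK;
    use vmm_sys_util::EventFd;

    fn main() -> std::io::Result<()> {
        // The device keeps the original EventFd; the worker thread receives a
        // clone that refers to the same underlying eventfd object, so the
        // original can later be handed back to the transport layer on reset.
        let original = EventFd::new(EFD_NONBLOCK)?;
        let clone = original.try_clone()?;
        clone.write(1)?;
        assert_eq!(original.read()?, 1);
        Ok(())
    }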

// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
use epoll;
use libc::EFD_NONBLOCK;
use std::cmp;
use std::fs::File;
use std::io::{self, Read, Seek, SeekFrom, Write};
use std::os::linux::fs::MetadataExt;
use std::os::unix::io::AsRawFd;
use std::result;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::thread;
use super::Error as DeviceError;
use super::{
ActivateError, ActivateResult, DescriptorChain, DeviceEventT, Queue, VirtioDevice,
VirtioDeviceType, INTERRUPT_STATUS_USED_RING,
};
use virtio_bindings::virtio_blk::*;
use vm_memory::{Bytes, GuestAddress, GuestMemory, GuestMemoryError, GuestMemoryMmap};
use vmm_sys_util::EventFd;
const CONFIG_SPACE_SIZE: usize = 8;
const SECTOR_SHIFT: u8 = 9;
pub const SECTOR_SIZE: u64 = 0x01u64 << SECTOR_SHIFT;
const QUEUE_SIZE: u16 = 256;
const NUM_QUEUES: usize = 1;
const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE];
// New descriptors are pending on the virtio queue.
const QUEUE_AVAIL_EVENT: DeviceEventT = 0;
// The device has been dropped.
pub const KILL_EVENT: DeviceEventT = 1;
// Number of DeviceEventT events supported by this implementation.
pub const BLOCK_EVENTS_COUNT: usize = 2;
#[derive(Debug)]
enum Error {
/// Guest gave us bad memory addresses.
GuestMemory(GuestMemoryError),
/// Guest gave us offsets that would have overflowed a usize.
CheckedOffset(GuestAddress, usize),
/// Guest gave us a write only descriptor that protocol says to read from.
UnexpectedWriteOnlyDescriptor,
/// Guest gave us a read only descriptor that protocol says to write to.
UnexpectedReadOnlyDescriptor,
/// Guest gave us too few descriptors in a descriptor chain.
DescriptorChainTooShort,
/// Guest gave us a descriptor that was too short to use.
DescriptorLengthTooSmall,
/// Getting a block's metadata fails for any reason.
GetFileMetadata,
/// The requested operation would cause a seek beyond disk end.
InvalidOffset,
}
#[derive(Debug)]
enum ExecuteError {
BadRequest(Error),
Flush(io::Error),
Read(GuestMemoryError),
Seek(io::Error),
Write(GuestMemoryError),
Unsupported(u32),
}
impl ExecuteError {
fn status(&self) -> u32 {
match *self {
ExecuteError::BadRequest(_) => VIRTIO_BLK_S_IOERR,
ExecuteError::Flush(_) => VIRTIO_BLK_S_IOERR,
ExecuteError::Read(_) => VIRTIO_BLK_S_IOERR,
ExecuteError::Seek(_) => VIRTIO_BLK_S_IOERR,
ExecuteError::Write(_) => VIRTIO_BLK_S_IOERR,
ExecuteError::Unsupported(_) => VIRTIO_BLK_S_UNSUPP,
}
}
}
#[derive(Clone, Copy, Debug, PartialEq)]
enum RequestType {
In,
Out,
Flush,
GetDeviceID,
Unsupported(u32),
}
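// A block request begins with a 16-byte, little-endian header laid out as
// { type: u32, reserved: u32, sector: u64 }. The helpers below read the type
// field (offset 0) and the sector field (offset 8) out of guest memory.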
fn request_type(
mem: &GuestMemoryMmap,
desc_addr: GuestAddress,
) -> result::Result<RequestType, Error> {
let type_ = mem.read_obj(desc_addr).map_err(Error::GuestMemory)?;
match type_ {
VIRTIO_BLK_T_IN => Ok(RequestType::In),
VIRTIO_BLK_T_OUT => Ok(RequestType::Out),
VIRTIO_BLK_T_FLUSH => Ok(RequestType::Flush),
VIRTIO_BLK_T_GET_ID => Ok(RequestType::GetDeviceID),
t => Ok(RequestType::Unsupported(t)),
}
}
fn sector(mem: &GuestMemoryMmap, desc_addr: GuestAddress) -> result::Result<u64, Error> {
const SECTOR_OFFSET: usize = 8;
let addr = match mem.checked_offset(desc_addr, SECTOR_OFFSET) {
Some(v) => v,
None => return Err(Error::CheckedOffset(desc_addr, SECTOR_OFFSET)),
};
mem.read_obj(addr).map_err(Error::GuestMemory)
}
fn build_device_id(disk_image: &File) -> result::Result<String, Error> {
let blk_metadata = match disk_image.metadata() {
Err(_) => return Err(Error::GetFileMetadata),
Ok(m) => m,
};
// This is how kvmtool does it.
let device_id = format!(
"{}{}{}",
blk_metadata.st_dev(),
blk_metadata.st_rdev(),
blk_metadata.st_ino()
    );
Ok(device_id)
}
fn build_disk_image_id(disk_image: &File) -> Vec<u8> {
let mut default_disk_image_id = vec![0; VIRTIO_BLK_ID_BYTES as usize];
match build_device_id(disk_image) {
Err(_) => {
warn!("Could not generate device id. We'll use a default.");
}
Ok(m) => {
// The kernel only knows to read a maximum of VIRTIO_BLK_ID_BYTES.
// This will also zero out any leftover bytes.
let disk_id = m.as_bytes();
let bytes_to_copy = cmp::min(disk_id.len(), VIRTIO_BLK_ID_BYTES as usize);
default_disk_image_id[..bytes_to_copy].clone_from_slice(&disk_id[..bytes_to_copy])
}
}
default_disk_image_id
}
struct Request {
request_type: RequestType,
sector: u64,
data_addr: GuestAddress,
data_len: u32,
status_addr: GuestAddress,
}
impl Request {
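    // Parses one request from a descriptor chain. Per the virtio-blk layout,
    // the chain is: a device-readable header descriptor, an optional data
    // descriptor (device-readable for writes, device-writable for reads and
    // GetDeviceID), and a final device-writable status descriptor of at
    // least one byte.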
fn parse(
avail_desc: &DescriptorChain,
mem: &GuestMemoryMmap,
) -> result::Result<Request, Error> {
// The head contains the request type which MUST be readable.
if avail_desc.is_write_only() {
return Err(Error::UnexpectedWriteOnlyDescriptor);
}
let mut req = Request {
request_type: request_type(&mem, avail_desc.addr)?,
sector: sector(&mem, avail_desc.addr)?,
data_addr: GuestAddress(0),
data_len: 0,
status_addr: GuestAddress(0),
};
let data_desc;
let status_desc;
let desc = avail_desc
.next_descriptor()
.ok_or(Error::DescriptorChainTooShort)?;
if !desc.has_next() {
status_desc = desc;
// Only flush requests are allowed to skip the data descriptor.
if req.request_type != RequestType::Flush {
return Err(Error::DescriptorChainTooShort);
}
} else {
data_desc = desc;
status_desc = data_desc
.next_descriptor()
.ok_or(Error::DescriptorChainTooShort)?;
if data_desc.is_write_only() && req.request_type == RequestType::Out {
return Err(Error::UnexpectedWriteOnlyDescriptor);
}
if !data_desc.is_write_only() && req.request_type == RequestType::In {
return Err(Error::UnexpectedReadOnlyDescriptor);
}
if !data_desc.is_write_only() && req.request_type == RequestType::GetDeviceID {
return Err(Error::UnexpectedReadOnlyDescriptor);
}
req.data_addr = data_desc.addr;
req.data_len = data_desc.len;
}
// The status MUST always be writable.
if !status_desc.is_write_only() {
return Err(Error::UnexpectedReadOnlyDescriptor);
}
if status_desc.len < 1 {
return Err(Error::DescriptorLengthTooSmall);
}
req.status_addr = status_desc.addr;
Ok(req)
}
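    // Executes the request against the disk, returning the used length to
    // report back to the guest (the transfer size for reads, zero otherwise).
    // The requested range is bounds-checked against the disk size first so a
    // request can never read or write past the end of the image.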
    fn execute<T: Seek + Read + Write>(
        &self,
        disk: &mut T,
        disk_nsectors: u64,
        mem: &GuestMemoryMmap,
        disk_id: &[u8],
    ) -> result::Result<u32, ExecuteError> {
let mut top: u64 = u64::from(self.data_len) / SECTOR_SIZE;
if u64::from(self.data_len) % SECTOR_SIZE != 0 {
top += 1;
}
top = top
.checked_add(self.sector)
.ok_or(ExecuteError::BadRequest(Error::InvalidOffset))?;
if top > disk_nsectors {
return Err(ExecuteError::BadRequest(Error::InvalidOffset));
}
disk.seek(SeekFrom::Start(self.sector << SECTOR_SHIFT))
.map_err(ExecuteError::Seek)?;
match self.request_type {
RequestType::In => {
mem.read_exact_from(self.data_addr, disk, self.data_len as usize)
.map_err(ExecuteError::Read)?;
return Ok(self.data_len);
}
RequestType::Out => {
mem.write_all_to(self.data_addr, disk, self.data_len as usize)
.map_err(ExecuteError::Write)?;
}
RequestType::Flush => match disk.flush() {
Ok(_) => {
return Ok(0);
}
Err(e) => return Err(ExecuteError::Flush(e)),
},
RequestType::GetDeviceID => {
if (self.data_len as usize) < disk_id.len() {
return Err(ExecuteError::BadRequest(Error::InvalidOffset));
}
                mem.write_slice(disk_id, self.data_addr)
                    .map_err(ExecuteError::Write)?;
}
RequestType::Unsupported(t) => return Err(ExecuteError::Unsupported(t)),
};
Ok(0)
}
}
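// Per-activation worker state: the virtqueues, the guest memory mapping, a
// clone of the disk image, and the EventFd used to inject interrupts. It is
// moved onto the worker thread spawned by activate().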
struct BlockEpollHandler {
queues: Vec<Queue>,
mem: GuestMemoryMmap,
disk_image: File,
disk_nsectors: u64,
interrupt_status: Arc<AtomicUsize>,
interrupt_evt: EventFd,
disk_image_id: Vec<u8>,
}
impl BlockEpollHandler {
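    // Drains the available ring of the given queue, executing each request
    // and recording its descriptor head and used length. The completed heads
    // are then put on the used ring. Returns true if at least one descriptor
    // was consumed, i.e. the guest needs to be notified.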
fn process_queue(&mut self, queue_index: usize) -> bool {
let queue = &mut self.queues[queue_index];
let mut used_desc_heads = [(0, 0); QUEUE_SIZE as usize];
let mut used_count = 0;
for avail_desc in queue.iter(&self.mem) {
let len;
match Request::parse(&avail_desc, &self.mem) {
Ok(request) => {
let status = match request.execute(
&mut self.disk_image,
self.disk_nsectors,
&self.mem,
&self.disk_image_id,
) {
Ok(l) => {
len = l;
VIRTIO_BLK_S_OK
}
Err(e) => {
error!("Failed to execute request: {:?}", e);
len = 1; // We need at least 1 byte for the status.
e.status()
}
};
// We use unwrap because the request parsing process already checked that the
// status_addr was valid.
self.mem.write_obj(status, request.status_addr).unwrap();
}
Err(e) => {
error!("Failed to parse available descriptor chain: {:?}", e);
len = 0;
}
}
used_desc_heads[used_count] = (avail_desc.index, len);
used_count += 1;
}
for &(desc_index, len) in &used_desc_heads[..used_count] {
queue.add_used(&self.mem, desc_index, len);
}
used_count > 0
}
fn signal_used_queue(&self) -> result::Result<(), DeviceError> {
self.interrupt_status
.fetch_or(INTERRUPT_STATUS_USED_RING as usize, Ordering::SeqCst);
self.interrupt_evt.write(1).map_err(|e| {
error!("Failed to signal used queue: {:?}", e);
DeviceError::FailedSignalingUsedQueue(e)
})
}
#[allow(dead_code)]
fn update_disk_image(&mut self, disk_image: File) -> result::Result<(), DeviceError> {
self.disk_image = disk_image;
self.disk_nsectors = self
.disk_image
.seek(SeekFrom::End(0))
.map_err(DeviceError::IoError)?
/ SECTOR_SIZE;
self.disk_image_id = build_disk_image_id(&self.disk_image);
Ok(())
}
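    // Worker thread body: registers the queue and kill EventFds on a single
    // epoll fd, then services QUEUE_AVAIL_EVENT until an error occurs or
    // KILL_EVENT asks the thread to stop.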
fn run(&mut self, queue_evt: EventFd, kill_evt: EventFd) -> result::Result<(), DeviceError> {
// Create the epoll file descriptor
let epoll_fd = epoll::create(true).map_err(DeviceError::EpollCreateFd)?;
// Add events
epoll::ctl(
epoll_fd,
epoll::ControlOptions::EPOLL_CTL_ADD,
queue_evt.as_raw_fd(),
epoll::Event::new(epoll::Events::EPOLLIN, u64::from(QUEUE_AVAIL_EVENT)),
)
.map_err(DeviceError::EpollCtl)?;
epoll::ctl(
epoll_fd,
epoll::ControlOptions::EPOLL_CTL_ADD,
kill_evt.as_raw_fd(),
epoll::Event::new(epoll::Events::EPOLLIN, u64::from(KILL_EVENT)),
)
.map_err(DeviceError::EpollCtl)?;
const EPOLL_EVENTS_LEN: usize = 100;
let mut events = vec![epoll::Event::new(epoll::Events::empty(), 0); EPOLL_EVENTS_LEN];
'epoll: loop {
let num_events =
epoll::wait(epoll_fd, -1, &mut events[..]).map_err(DeviceError::EpollWait)?;
for event in events.iter().take(num_events) {
let ev_type = event.data as u16;
match ev_type {
QUEUE_AVAIL_EVENT => {
if let Err(e) = queue_evt.read() {
error!("Failed to get queue event: {:?}", e);
break 'epoll;
} else if self.process_queue(0) {
if let Err(e) = self.signal_used_queue() {
error!("Failed to signal used queue: {:?}", e);
break 'epoll;
}
}
}
KILL_EVENT => {
debug!("KILL_EVENT received, stopping epoll loop");
break 'epoll;
}
_ => {
error!("Unknown event for virtio-block");
}
}
}
}
Ok(())
}
}
/// Virtio device for exposing block level read/write operations on a host file.
pub struct Block {
kill_evt: Option<EventFd>,
disk_image: Option<File>,
disk_nsectors: u64,
avail_features: u64,
acked_features: u64,
config_space: Vec<u8>,
queue_evt: Option<EventFd>,
interrupt_evt: Option<EventFd>,
}
pub fn build_config_space(disk_size: u64) -> Vec<u8> {
// We only support disk size, which uses the first two words of the configuration space.
// If the image is not a multiple of the sector size, the tail bits are not exposed.
// The config space is little endian.
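    // For example, a 1 MiB image holds 2048 (0x800) sectors, which serializes
    // to [0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00].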
let mut config = Vec::with_capacity(CONFIG_SPACE_SIZE);
let num_sectors = disk_size >> SECTOR_SHIFT;
for i in 0..8 {
config.push((num_sectors >> (8 * i)) as u8);
}
config
}
impl Block {
/// Create a new virtio block device that operates on the given file.
///
/// The given file must be seekable and sizable.
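    ///
    /// # Example
    ///
    /// A minimal sketch; the image path is hypothetical and the import
    /// assumes the crate re-exports `Block`.
    ///
    /// ```no_run
    /// # use vm_virtio::Block;
    /// use std::fs::OpenOptions;
    ///
    /// let image = OpenOptions::new()
    ///     .read(true)
    ///     .write(true)
    ///     .open("/path/to/disk.img")
    ///     .expect("failed to open disk image");
    /// let block = Block::new(image, /* is_disk_read_only */ false)
    ///     .expect("failed to create block device");
    /// ```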
pub fn new(mut disk_image: File, is_disk_read_only: bool) -> io::Result<Block> {
        let disk_size = disk_image.seek(SeekFrom::End(0))?;
if disk_size % SECTOR_SIZE != 0 {
warn!(
"Disk size {} is not a multiple of sector size {}; \
the remainder will not be visible to the guest.",
disk_size, SECTOR_SIZE
);
}
let mut avail_features = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_BLK_F_FLUSH);
if is_disk_read_only {
avail_features |= 1u64 << VIRTIO_BLK_F_RO;
        }
Ok(Block {
kill_evt: None,
disk_image: Some(disk_image),
disk_nsectors: disk_size / SECTOR_SIZE,
avail_features,
acked_features: 0u64,
config_space: build_config_space(disk_size),
queue_evt: None,
interrupt_evt: None,
})
}
}
impl Drop for Block {
fn drop(&mut self) {
if let Some(kill_evt) = self.kill_evt.take() {
// Ignore the result because there is nothing we can do about it.
let _ = kill_evt.write(1);
}
}
}
impl VirtioDevice for Block {
fn device_type(&self) -> u32 {
VirtioDeviceType::TYPE_BLOCK as u32
}
fn queue_max_sizes(&self) -> &[u16] {
QUEUE_SIZES
}
fn features(&self, page: u32) -> u32 {
match page {
// Get the lower 32-bits of the features bitfield.
0 => self.avail_features as u32,
// Get the upper 32-bits of the features bitfield.
1 => (self.avail_features >> 32) as u32,
_ => {
warn!("Received request for unknown features page.");
0u32
}
}
}
fn ack_features(&mut self, page: u32, value: u32) {
let mut v = match page {
0 => u64::from(value),
1 => u64::from(value) << 32,
_ => {
warn!("Cannot acknowledge unknown features page.");
0u64
}
};
// Check if the guest is ACK'ing a feature that we didn't claim to have.
let unrequested_features = v & !self.avail_features;
if unrequested_features != 0 {
warn!("Received acknowledge request for unknown feature.");
// Don't count these features as acked.
v &= !unrequested_features;
}
self.acked_features |= v;
}
fn read_config(&self, offset: u64, mut data: &mut [u8]) {
let config_len = self.config_space.len() as u64;
if offset >= config_len {
error!("Failed to read config space");
return;
}
        if let Some(end) = offset.checked_add(data.len() as u64) {
            // This write can't fail: offset was checked above and end is
            // clamped to config_len.
            data.write_all(&self.config_space[offset as usize..cmp::min(end, config_len) as usize])
                .unwrap();
        }
}
    fn write_config(&mut self, offset: u64, data: &[u8]) {
        let data_len = data.len() as u64;
        let config_len = self.config_space.len() as u64;
        if offset
            .checked_add(data_len)
            .map_or(true, |end| end > config_len)
        {
            error!("Failed to write config space");
            return;
        }
        let start = offset as usize;
        let end = start + data.len();
        // Only overwrite the requested range: copy_from_slice panics when the
        // source and destination lengths differ, so the destination slice must
        // be sized to `data` rather than to the whole tail of the config space.
        self.config_space[start..end].copy_from_slice(data);
    }
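    // Activation clones the disk image and both EventFds: the originals stay
    // on the device so reset() can hand them back to the transport layer,
    // while the clones move into the worker thread.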
fn activate(
&mut self,
mem: GuestMemoryMmap,
interrupt_evt: EventFd,
status: Arc<AtomicUsize>,
queues: Vec<Queue>,
mut queue_evts: Vec<EventFd>,
) -> ActivateResult {
if queues.len() != NUM_QUEUES || queue_evts.len() != NUM_QUEUES {
error!(
"Cannot perform activate. Expected {} queue(s), got {}",
NUM_QUEUES,
queues.len()
);
return Err(ActivateError::BadActivate);
}
let (self_kill_evt, kill_evt) =
match EventFd::new(EFD_NONBLOCK).and_then(|e| Ok((e.try_clone()?, e))) {
Ok(v) => v,
Err(e) => {
error!("failed creating kill EventFd pair: {}", e);
return Err(ActivateError::BadActivate);
}
};
self.kill_evt = Some(self_kill_evt);
if self.disk_image.is_some() {
let disk_image = self.disk_image.as_ref().unwrap().try_clone().map_err(|e| {
error!("failed to clone disk image: {}", e);
ActivateError::BadActivate
})?;
let disk_image_id = build_disk_image_id(&disk_image);
// Save the interrupt EventFD as we need to return it on reset
// but clone it to pass into the thread.
self.interrupt_evt = Some(interrupt_evt);
let interrupt_evt = self
.interrupt_evt
.as_ref()
.unwrap()
.try_clone()
.map_err(|e| {
error!("failed to clone interrupt EventFd: {}", e);
ActivateError::BadActivate
})?;
// Save the queue EventFD as we need to return it on reset
// but clone it to pass into the thread.
self.queue_evt = Some(queue_evts.remove(0));
let queue_evt = self.queue_evt.as_ref().unwrap().try_clone().map_err(|e| {
error!("failed to clone queue EventFd: {}", e);
ActivateError::BadActivate
})?;
let mut handler = BlockEpollHandler {
queues,
mem,
disk_image,
disk_nsectors: self.disk_nsectors,
interrupt_status: status,
interrupt_evt,
disk_image_id,
};
let worker_result = thread::Builder::new()
.name("virtio_blk".to_string())
.spawn(move || handler.run(queue_evt, kill_evt));
if let Err(e) = worker_result {
error!("failed to spawn virtio_blk worker: {}", e);
return Err(ActivateError::BadActivate);
}
return Ok(());
}
Err(ActivateError::BadActivate)
}
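    // Stops the worker thread and returns the interrupt EventFd plus the
    // per-queue EventFds to the transport layer so they can be reused on the
    // next activation.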
    fn reset(&mut self) -> Option<(EventFd, Vec<EventFd>)> {
        if let Some(kill_evt) = self.kill_evt.take() {
            // Ignore the result because there is nothing we can do about it.
            let _ = kill_evt.write(1);
        }
        // Return the interrupt and queue EventFds, if the device was activated.
        match (self.interrupt_evt.take(), self.queue_evt.take()) {
            (Some(interrupt_evt), Some(queue_evt)) => Some((interrupt_evt, vec![queue_evt])),
            _ => None,
        }
    }
}