cloud-hypervisor/vm-virtio/src/block.rs

689 lines
22 KiB
Rust
Raw Normal View History

// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
use epoll;
use libc::EFD_NONBLOCK;
use std::cmp;
use std::fs::File;
use std::io::{self, Read, Seek, SeekFrom, Write};
use std::os::linux::fs::MetadataExt;
use std::os::unix::io::AsRawFd;
use std::path::PathBuf;
use std::result;
use std::sync::Arc;
use std::thread;
use super::Error as DeviceError;
use super::{
ActivateError, ActivateResult, DescriptorChain, DeviceEventT, Queue, VirtioDevice,
VirtioDeviceType, VirtioInterruptType,
};
use crate::VirtioInterrupt;
use virtio_bindings::virtio_blk::*;
use vm_memory::{Bytes, GuestAddress, GuestMemory, GuestMemoryError, GuestMemoryMmap};
use vmm_sys_util::eventfd::EventFd;
const CONFIG_SPACE_SIZE: usize = 8;
const SECTOR_SHIFT: u8 = 9;
pub const SECTOR_SIZE: u64 = (0x01 as u64) << SECTOR_SHIFT;
const QUEUE_SIZE: u16 = 256;
const NUM_QUEUES: usize = 1;
const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE];
// New descriptors are pending on the virtio queue.
const QUEUE_AVAIL_EVENT: DeviceEventT = 0;
// The device has been dropped.
pub const KILL_EVENT: DeviceEventT = 1;
// Number of DeviceEventT events supported by this implementation.
pub const BLOCK_EVENTS_COUNT: usize = 2;
#[derive(Debug)]
enum Error {
/// Guest gave us bad memory addresses.
GuestMemory(GuestMemoryError),
/// Guest gave us offsets that would have overflowed a usize.
CheckedOffset(GuestAddress, usize),
/// Guest gave us a write only descriptor that protocol says to read from.
UnexpectedWriteOnlyDescriptor,
/// Guest gave us a read only descriptor that protocol says to write to.
UnexpectedReadOnlyDescriptor,
/// Guest gave us too few descriptors in a descriptor chain.
DescriptorChainTooShort,
/// Guest gave us a descriptor that was too short to use.
DescriptorLengthTooSmall,
/// Getting a block's metadata fails for any reason.
GetFileMetadata,
/// The requested operation would cause a seek beyond disk end.
InvalidOffset,
}
#[derive(Debug)]
enum ExecuteError {
BadRequest(Error),
Flush(io::Error),
Read(GuestMemoryError),
Seek(io::Error),
Write(GuestMemoryError),
Unsupported(u32),
}
impl ExecuteError {
fn status(&self) -> u32 {
match *self {
ExecuteError::BadRequest(_) => VIRTIO_BLK_S_IOERR,
ExecuteError::Flush(_) => VIRTIO_BLK_S_IOERR,
ExecuteError::Read(_) => VIRTIO_BLK_S_IOERR,
ExecuteError::Seek(_) => VIRTIO_BLK_S_IOERR,
ExecuteError::Write(_) => VIRTIO_BLK_S_IOERR,
ExecuteError::Unsupported(_) => VIRTIO_BLK_S_UNSUPP,
}
}
}
pub trait DiskFile: Read + Seek + Write + Clone {}
impl<D: Read + Seek + Write + Clone> DiskFile for D {}
pub struct RawFile {
file: File,
}
impl RawFile {
pub fn new(file: File) -> Self {
RawFile { file }
}
}
impl Read for RawFile {
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
self.file.read(buf)
}
}
impl Seek for RawFile {
fn seek(&mut self, pos: SeekFrom) -> std::io::Result<u64> {
self.file.seek(pos)
}
}
impl Write for RawFile {
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
self.file.write(buf)
}
fn flush(&mut self) -> std::io::Result<()> {
self.file.flush()
}
}
impl Clone for RawFile {
fn clone(&self) -> Self {
RawFile {
file: self.file.try_clone().expect("RawFile cloning failed"),
}
}
}
#[derive(Clone, Copy, Debug, PartialEq)]
enum RequestType {
In,
Out,
Flush,
GetDeviceID,
Unsupported(u32),
}
fn request_type(
mem: &GuestMemoryMmap,
desc_addr: GuestAddress,
) -> result::Result<RequestType, Error> {
let type_ = mem.read_obj(desc_addr).map_err(Error::GuestMemory)?;
match type_ {
VIRTIO_BLK_T_IN => Ok(RequestType::In),
VIRTIO_BLK_T_OUT => Ok(RequestType::Out),
VIRTIO_BLK_T_FLUSH => Ok(RequestType::Flush),
VIRTIO_BLK_T_GET_ID => Ok(RequestType::GetDeviceID),
t => Ok(RequestType::Unsupported(t)),
}
}
fn sector(mem: &GuestMemoryMmap, desc_addr: GuestAddress) -> result::Result<u64, Error> {
const SECTOR_OFFSET: usize = 8;
let addr = match mem.checked_offset(desc_addr, SECTOR_OFFSET) {
Some(v) => v,
None => return Err(Error::CheckedOffset(desc_addr, SECTOR_OFFSET)),
};
mem.read_obj(addr).map_err(Error::GuestMemory)
}
fn build_device_id(disk_path: &PathBuf) -> result::Result<String, Error> {
let blk_metadata = match disk_path.metadata() {
Err(_) => return Err(Error::GetFileMetadata),
Ok(m) => m,
};
// This is how kvmtool does it.
let device_id = format!(
"{}{}{}",
blk_metadata.st_dev(),
blk_metadata.st_rdev(),
blk_metadata.st_ino()
)
.to_owned();
Ok(device_id)
}
fn build_disk_image_id(disk_path: &PathBuf) -> Vec<u8> {
let mut default_disk_image_id = vec![0; VIRTIO_BLK_ID_BYTES as usize];
match build_device_id(disk_path) {
Err(_) => {
warn!("Could not generate device id. We'll use a default.");
}
Ok(m) => {
// The kernel only knows to read a maximum of VIRTIO_BLK_ID_BYTES.
// This will also zero out any leftover bytes.
let disk_id = m.as_bytes();
let bytes_to_copy = cmp::min(disk_id.len(), VIRTIO_BLK_ID_BYTES as usize);
default_disk_image_id[..bytes_to_copy].clone_from_slice(&disk_id[..bytes_to_copy])
}
}
default_disk_image_id
}
struct Request {
request_type: RequestType,
sector: u64,
data_addr: GuestAddress,
data_len: u32,
status_addr: GuestAddress,
}
impl Request {
fn parse(
avail_desc: &DescriptorChain,
mem: &GuestMemoryMmap,
) -> result::Result<Request, Error> {
// The head contains the request type which MUST be readable.
if avail_desc.is_write_only() {
return Err(Error::UnexpectedWriteOnlyDescriptor);
}
let mut req = Request {
request_type: request_type(&mem, avail_desc.addr)?,
sector: sector(&mem, avail_desc.addr)?,
data_addr: GuestAddress(0),
data_len: 0,
status_addr: GuestAddress(0),
};
let data_desc;
let status_desc;
let desc = avail_desc
.next_descriptor()
.ok_or(Error::DescriptorChainTooShort)?;
if !desc.has_next() {
status_desc = desc;
// Only flush requests are allowed to skip the data descriptor.
if req.request_type != RequestType::Flush {
return Err(Error::DescriptorChainTooShort);
}
} else {
data_desc = desc;
status_desc = data_desc
.next_descriptor()
.ok_or(Error::DescriptorChainTooShort)?;
if data_desc.is_write_only() && req.request_type == RequestType::Out {
return Err(Error::UnexpectedWriteOnlyDescriptor);
}
if !data_desc.is_write_only() && req.request_type == RequestType::In {
return Err(Error::UnexpectedReadOnlyDescriptor);
}
if !data_desc.is_write_only() && req.request_type == RequestType::GetDeviceID {
return Err(Error::UnexpectedReadOnlyDescriptor);
}
req.data_addr = data_desc.addr;
req.data_len = data_desc.len;
}
// The status MUST always be writable.
if !status_desc.is_write_only() {
return Err(Error::UnexpectedReadOnlyDescriptor);
}
if status_desc.len < 1 {
return Err(Error::DescriptorLengthTooSmall);
}
req.status_addr = status_desc.addr;
Ok(req)
}
#[allow(clippy::ptr_arg)]
fn execute<T: Seek + Read + Write>(
&self,
disk: &mut T,
disk_nsectors: u64,
mem: &GuestMemoryMmap,
disk_id: &Vec<u8>,
) -> result::Result<u32, ExecuteError> {
let mut top: u64 = u64::from(self.data_len) / SECTOR_SIZE;
if u64::from(self.data_len) % SECTOR_SIZE != 0 {
top += 1;
}
top = top
.checked_add(self.sector)
.ok_or(ExecuteError::BadRequest(Error::InvalidOffset))?;
if top > disk_nsectors {
return Err(ExecuteError::BadRequest(Error::InvalidOffset));
}
disk.seek(SeekFrom::Start(self.sector << SECTOR_SHIFT))
.map_err(ExecuteError::Seek)?;
match self.request_type {
RequestType::In => {
mem.read_exact_from(self.data_addr, disk, self.data_len as usize)
.map_err(ExecuteError::Read)?;
return Ok(self.data_len);
}
RequestType::Out => {
mem.write_all_to(self.data_addr, disk, self.data_len as usize)
.map_err(ExecuteError::Write)?;
}
RequestType::Flush => match disk.flush() {
Ok(_) => {
return Ok(0);
}
Err(e) => return Err(ExecuteError::Flush(e)),
},
RequestType::GetDeviceID => {
if (self.data_len as usize) < disk_id.len() {
return Err(ExecuteError::BadRequest(Error::InvalidOffset));
}
mem.write_slice(&disk_id.as_slice(), self.data_addr)
.map_err(ExecuteError::Write)?;
}
RequestType::Unsupported(t) => return Err(ExecuteError::Unsupported(t)),
};
Ok(0)
}
}
struct BlockEpollHandler<T: DiskFile> {
queues: Vec<Queue>,
mem: GuestMemoryMmap,
disk_image: T,
disk_nsectors: u64,
interrupt_cb: Arc<VirtioInterrupt>,
disk_image_id: Vec<u8>,
}
impl<T: DiskFile> BlockEpollHandler<T> {
fn process_queue(&mut self, queue_index: usize) -> bool {
let queue = &mut self.queues[queue_index];
let mut used_desc_heads = [(0, 0); QUEUE_SIZE as usize];
let mut used_count = 0;
for avail_desc in queue.iter(&self.mem) {
let len;
match Request::parse(&avail_desc, &self.mem) {
Ok(request) => {
let status = match request.execute(
&mut self.disk_image,
self.disk_nsectors,
&self.mem,
&self.disk_image_id,
) {
Ok(l) => {
len = l;
VIRTIO_BLK_S_OK
}
Err(e) => {
error!("Failed to execute request: {:?}", e);
len = 1; // We need at least 1 byte for the status.
e.status()
}
};
// We use unwrap because the request parsing process already checked that the
// status_addr was valid.
self.mem.write_obj(status, request.status_addr).unwrap();
}
Err(e) => {
error!("Failed to parse available descriptor chain: {:?}", e);
len = 0;
}
}
used_desc_heads[used_count] = (avail_desc.index, len);
used_count += 1;
}
for &(desc_index, len) in &used_desc_heads[..used_count] {
queue.add_used(&self.mem, desc_index, len);
}
used_count > 0
}
fn signal_used_queue(&self, queue_index: usize) -> result::Result<(), DeviceError> {
(self.interrupt_cb)(&VirtioInterruptType::Queue, Some(&self.queues[queue_index])).map_err(
|e| {
error!("Failed to signal used queue: {:?}", e);
DeviceError::FailedSignalingUsedQueue(e)
},
)
}
#[allow(dead_code)]
fn update_disk_image(
&mut self,
disk_image: T,
disk_path: &PathBuf,
) -> result::Result<(), DeviceError> {
self.disk_image = disk_image;
self.disk_nsectors = self
.disk_image
.seek(SeekFrom::End(0))
.map_err(DeviceError::IoError)?
/ SECTOR_SIZE;
self.disk_image_id = build_disk_image_id(disk_path);
Ok(())
}
fn run(&mut self, queue_evt: EventFd, kill_evt: EventFd) -> result::Result<(), DeviceError> {
// Create the epoll file descriptor
let epoll_fd = epoll::create(true).map_err(DeviceError::EpollCreateFd)?;
// Add events
epoll::ctl(
epoll_fd,
epoll::ControlOptions::EPOLL_CTL_ADD,
queue_evt.as_raw_fd(),
epoll::Event::new(epoll::Events::EPOLLIN, u64::from(QUEUE_AVAIL_EVENT)),
)
.map_err(DeviceError::EpollCtl)?;
epoll::ctl(
epoll_fd,
epoll::ControlOptions::EPOLL_CTL_ADD,
kill_evt.as_raw_fd(),
epoll::Event::new(epoll::Events::EPOLLIN, u64::from(KILL_EVENT)),
)
.map_err(DeviceError::EpollCtl)?;
const EPOLL_EVENTS_LEN: usize = 100;
let mut events = vec![epoll::Event::new(epoll::Events::empty(), 0); EPOLL_EVENTS_LEN];
'epoll: loop {
let num_events = match epoll::wait(epoll_fd, -1, &mut events[..]) {
Ok(res) => res,
Err(e) => {
if e.kind() == io::ErrorKind::Interrupted {
// It's well defined from the epoll_wait() syscall
// documentation that the epoll loop can be interrupted
// before any of the requested events occurred or the
// timeout expired. In both those cases, epoll_wait()
// returns an error of type EINTR, but this should not
// be considered as a regular error. Instead it is more
// appropriate to retry, by calling into epoll_wait().
continue;
}
return Err(DeviceError::EpollWait(e));
}
};
for event in events.iter().take(num_events) {
let ev_type = event.data as u16;
match ev_type {
QUEUE_AVAIL_EVENT => {
if let Err(e) = queue_evt.read() {
error!("Failed to get queue event: {:?}", e);
break 'epoll;
} else if self.process_queue(0) {
if let Err(e) = self.signal_used_queue(0) {
error!("Failed to signal used queue: {:?}", e);
break 'epoll;
}
}
}
KILL_EVENT => {
debug!("KILL_EVENT received, stopping epoll loop");
break 'epoll;
}
_ => {
error!("Unknown event for virtio-block");
}
}
}
}
Ok(())
}
}
/// Virtio device for exposing block level read/write operations on a host file.
pub struct Block<T: DiskFile> {
kill_evt: Option<EventFd>,
disk_image: Option<T>,
disk_path: PathBuf,
disk_nsectors: u64,
avail_features: u64,
acked_features: u64,
config_space: Vec<u8>,
queue_evt: Option<EventFd>,
interrupt_cb: Option<Arc<VirtioInterrupt>>,
}
pub fn build_config_space(disk_size: u64) -> Vec<u8> {
// We only support disk size, which uses the first two words of the configuration space.
// If the image is not a multiple of the sector size, the tail bits are not exposed.
// The config space is little endian.
let mut config = Vec::with_capacity(CONFIG_SPACE_SIZE);
let num_sectors = disk_size >> SECTOR_SHIFT;
for i in 0..8 {
config.push((num_sectors >> (8 * i)) as u8);
}
config
}
impl<T: DiskFile> Block<T> {
/// Create a new virtio block device that operates on the given file.
///
/// The given file must be seekable and sizable.
pub fn new(
mut disk_image: T,
disk_path: PathBuf,
is_disk_read_only: bool,
) -> io::Result<Block<T>> {
let disk_size = disk_image.seek(SeekFrom::End(0))? as u64;
if disk_size % SECTOR_SIZE != 0 {
warn!(
"Disk size {} is not a multiple of sector size {}; \
the remainder will not be visible to the guest.",
disk_size, SECTOR_SIZE
);
}
let mut avail_features = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_BLK_F_FLUSH);
if is_disk_read_only {
avail_features |= 1u64 << VIRTIO_BLK_F_RO;
};
Ok(Block {
kill_evt: None,
disk_image: Some(disk_image),
disk_path,
disk_nsectors: disk_size / SECTOR_SIZE,
avail_features,
acked_features: 0u64,
config_space: build_config_space(disk_size),
queue_evt: None,
interrupt_cb: None,
})
}
}
impl<T: DiskFile> Drop for Block<T> {
fn drop(&mut self) {
if let Some(kill_evt) = self.kill_evt.take() {
// Ignore the result because there is nothing we can do about it.
let _ = kill_evt.write(1);
}
}
}
impl<T: 'static + DiskFile + Send> VirtioDevice for Block<T> {
fn device_type(&self) -> u32 {
VirtioDeviceType::TYPE_BLOCK as u32
}
fn queue_max_sizes(&self) -> &[u16] {
QUEUE_SIZES
}
fn features(&self, page: u32) -> u32 {
match page {
// Get the lower 32-bits of the features bitfield.
0 => self.avail_features as u32,
// Get the upper 32-bits of the features bitfield.
1 => (self.avail_features >> 32) as u32,
_ => {
warn!("Received request for unknown features page.");
0u32
}
}
}
fn ack_features(&mut self, page: u32, value: u32) {
let mut v = match page {
0 => u64::from(value),
1 => u64::from(value) << 32,
_ => {
warn!("Cannot acknowledge unknown features page.");
0u64
}
};
// Check if the guest is ACK'ing a feature that we didn't claim to have.
let unrequested_features = v & !self.avail_features;
if unrequested_features != 0 {
warn!("Received acknowledge request for unknown feature.");
// Don't count these features as acked.
v &= !unrequested_features;
}
self.acked_features |= v;
}
fn read_config(&self, offset: u64, mut data: &mut [u8]) {
let config_len = self.config_space.len() as u64;
if offset >= config_len {
error!("Failed to read config space");
return;
}
if let Some(end) = offset.checked_add(data.len() as u64) {
// This write can't fail, offset and end are checked against config_len.
data.write_all(&self.config_space[offset as usize..cmp::min(end, config_len) as usize])
.unwrap();
}
}
fn write_config(&mut self, offset: u64, data: &[u8]) {
let data_len = data.len() as u64;
let config_len = self.config_space.len() as u64;
if offset + data_len > config_len {
error!("Failed to write config space");
return;
}
let (_, right) = self.config_space.split_at_mut(offset as usize);
right.copy_from_slice(&data[..]);
}
fn activate(
&mut self,
mem: GuestMemoryMmap,
interrupt_cb: Arc<VirtioInterrupt>,
queues: Vec<Queue>,
mut queue_evts: Vec<EventFd>,
) -> ActivateResult {
if queues.len() != NUM_QUEUES || queue_evts.len() != NUM_QUEUES {
error!(
"Cannot perform activate. Expected {} queue(s), got {}",
NUM_QUEUES,
queues.len()
);
return Err(ActivateError::BadActivate);
}
let (self_kill_evt, kill_evt) =
match EventFd::new(EFD_NONBLOCK).and_then(|e| Ok((e.try_clone()?, e))) {
Ok(v) => v,
Err(e) => {
error!("failed creating kill EventFd pair: {}", e);
return Err(ActivateError::BadActivate);
}
};
self.kill_evt = Some(self_kill_evt);
if let Some(disk_image) = self.disk_image.clone() {
let disk_image_id = build_disk_image_id(&self.disk_path);
// Save the interrupt EventFD as we need to return it on reset
// but clone it to pass into the thread.
self.interrupt_cb = Some(interrupt_cb);
let interrupt_cb = self.interrupt_cb.as_ref().unwrap().clone();
// Save the queue EventFD as we need to return it on reset
// but clone it to pass into the thread.
self.queue_evt = Some(queue_evts.remove(0));
let queue_evt = self.queue_evt.as_ref().unwrap().try_clone().map_err(|e| {
error!("failed to clone queue EventFd: {}", e);
ActivateError::BadActivate
})?;
let mut handler = BlockEpollHandler {
queues,
mem,
disk_image,
disk_nsectors: self.disk_nsectors,
interrupt_cb,
disk_image_id,
};
let worker_result = thread::Builder::new()
.name("virtio_blk".to_string())
.spawn(move || handler.run(queue_evt, kill_evt));
if let Err(e) = worker_result {
error!("failed to spawn virtio_blk worker: {}", e);
return Err(ActivateError::BadActivate);
}
return Ok(());
}
Err(ActivateError::BadActivate)
}
fn reset(&mut self) -> Option<(Arc<VirtioInterrupt>, Vec<EventFd>)> {
if let Some(kill_evt) = self.kill_evt.take() {
// Ignore the result because there is nothing we can do about it.
let _ = kill_evt.write(1);
}
// Return the interrupt and queue EventFDs
Some((
self.interrupt_cb.take().unwrap(),
vec![self.queue_evt.take().unwrap()],
))
}
}