// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. // // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE-BSD-3-Clause file. // // Copyright © 2019 Intel Corporation // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause use epoll; use libc::EFD_NONBLOCK; use std::cmp; use std::fs::File; use std::io::{self, Read, Seek, SeekFrom, Write}; use std::os::linux::fs::MetadataExt; use std::os::unix::io::AsRawFd; use std::path::PathBuf; use std::result; use std::sync::{Arc, RwLock}; use std::thread; use super::Error as DeviceError; use super::{ ActivateError, ActivateResult, DescriptorChain, DeviceEventT, Queue, VirtioDevice, VirtioDeviceType, VirtioInterruptType, }; use crate::VirtioInterrupt; use virtio_bindings::virtio_blk::*; use vm_memory::{Bytes, GuestAddress, GuestMemory, GuestMemoryError, GuestMemoryMmap}; use vmm_sys_util::eventfd::EventFd; const CONFIG_SPACE_SIZE: usize = 8; const SECTOR_SHIFT: u8 = 9; pub const SECTOR_SIZE: u64 = (0x01 as u64) << SECTOR_SHIFT; const QUEUE_SIZE: u16 = 256; const NUM_QUEUES: usize = 1; const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE]; // New descriptors are pending on the virtio queue. const QUEUE_AVAIL_EVENT: DeviceEventT = 0; // The device has been dropped. pub const KILL_EVENT: DeviceEventT = 1; // Number of DeviceEventT events supported by this implementation. pub const BLOCK_EVENTS_COUNT: usize = 2; #[derive(Debug)] enum Error { /// Guest gave us bad memory addresses. GuestMemory(GuestMemoryError), /// Guest gave us offsets that would have overflowed a usize. CheckedOffset(GuestAddress, usize), /// Guest gave us a write only descriptor that protocol says to read from. UnexpectedWriteOnlyDescriptor, /// Guest gave us a read only descriptor that protocol says to write to. UnexpectedReadOnlyDescriptor, /// Guest gave us too few descriptors in a descriptor chain. DescriptorChainTooShort, /// Guest gave us a descriptor that was too short to use. DescriptorLengthTooSmall, /// Getting a block's metadata fails for any reason. GetFileMetadata, /// The requested operation would cause a seek beyond disk end. InvalidOffset, } #[derive(Debug)] enum ExecuteError { BadRequest(Error), Flush(io::Error), Read(GuestMemoryError), Seek(io::Error), Write(GuestMemoryError), Unsupported(u32), } impl ExecuteError { fn status(&self) -> u32 { match *self { ExecuteError::BadRequest(_) => VIRTIO_BLK_S_IOERR, ExecuteError::Flush(_) => VIRTIO_BLK_S_IOERR, ExecuteError::Read(_) => VIRTIO_BLK_S_IOERR, ExecuteError::Seek(_) => VIRTIO_BLK_S_IOERR, ExecuteError::Write(_) => VIRTIO_BLK_S_IOERR, ExecuteError::Unsupported(_) => VIRTIO_BLK_S_UNSUPP, } } } pub trait DiskFile: Read + Seek + Write + Clone {} impl DiskFile for D {} pub struct RawFile { file: File, } impl RawFile { pub fn new(file: File) -> Self { RawFile { file } } } impl Read for RawFile { fn read(&mut self, buf: &mut [u8]) -> std::io::Result { self.file.read(buf) } } impl Seek for RawFile { fn seek(&mut self, pos: SeekFrom) -> std::io::Result { self.file.seek(pos) } } impl Write for RawFile { fn write(&mut self, buf: &[u8]) -> std::io::Result { self.file.write(buf) } fn flush(&mut self) -> std::io::Result<()> { self.file.flush() } } impl Clone for RawFile { fn clone(&self) -> Self { RawFile { file: self.file.try_clone().expect("RawFile cloning failed"), } } } #[derive(Clone, Copy, Debug, PartialEq)] enum RequestType { In, Out, Flush, GetDeviceID, Unsupported(u32), } fn request_type( mem: &GuestMemoryMmap, desc_addr: GuestAddress, ) -> result::Result { let type_ = mem.read_obj(desc_addr).map_err(Error::GuestMemory)?; match type_ { VIRTIO_BLK_T_IN => Ok(RequestType::In), VIRTIO_BLK_T_OUT => Ok(RequestType::Out), VIRTIO_BLK_T_FLUSH => Ok(RequestType::Flush), VIRTIO_BLK_T_GET_ID => Ok(RequestType::GetDeviceID), t => Ok(RequestType::Unsupported(t)), } } fn sector(mem: &GuestMemoryMmap, desc_addr: GuestAddress) -> result::Result { const SECTOR_OFFSET: usize = 8; let addr = match mem.checked_offset(desc_addr, SECTOR_OFFSET) { Some(v) => v, None => return Err(Error::CheckedOffset(desc_addr, SECTOR_OFFSET)), }; mem.read_obj(addr).map_err(Error::GuestMemory) } fn build_device_id(disk_path: &PathBuf) -> result::Result { let blk_metadata = match disk_path.metadata() { Err(_) => return Err(Error::GetFileMetadata), Ok(m) => m, }; // This is how kvmtool does it. let device_id = format!( "{}{}{}", blk_metadata.st_dev(), blk_metadata.st_rdev(), blk_metadata.st_ino() ) .to_owned(); Ok(device_id) } fn build_disk_image_id(disk_path: &PathBuf) -> Vec { let mut default_disk_image_id = vec![0; VIRTIO_BLK_ID_BYTES as usize]; match build_device_id(disk_path) { Err(_) => { warn!("Could not generate device id. We'll use a default."); } Ok(m) => { // The kernel only knows to read a maximum of VIRTIO_BLK_ID_BYTES. // This will also zero out any leftover bytes. let disk_id = m.as_bytes(); let bytes_to_copy = cmp::min(disk_id.len(), VIRTIO_BLK_ID_BYTES as usize); default_disk_image_id[..bytes_to_copy].clone_from_slice(&disk_id[..bytes_to_copy]) } } default_disk_image_id } struct Request { request_type: RequestType, sector: u64, data_addr: GuestAddress, data_len: u32, status_addr: GuestAddress, } impl Request { fn parse( avail_desc: &DescriptorChain, mem: &GuestMemoryMmap, ) -> result::Result { // The head contains the request type which MUST be readable. if avail_desc.is_write_only() { return Err(Error::UnexpectedWriteOnlyDescriptor); } let mut req = Request { request_type: request_type(&mem, avail_desc.addr)?, sector: sector(&mem, avail_desc.addr)?, data_addr: GuestAddress(0), data_len: 0, status_addr: GuestAddress(0), }; let data_desc; let status_desc; let desc = avail_desc .next_descriptor() .ok_or(Error::DescriptorChainTooShort)?; if !desc.has_next() { status_desc = desc; // Only flush requests are allowed to skip the data descriptor. if req.request_type != RequestType::Flush { return Err(Error::DescriptorChainTooShort); } } else { data_desc = desc; status_desc = data_desc .next_descriptor() .ok_or(Error::DescriptorChainTooShort)?; if data_desc.is_write_only() && req.request_type == RequestType::Out { return Err(Error::UnexpectedWriteOnlyDescriptor); } if !data_desc.is_write_only() && req.request_type == RequestType::In { return Err(Error::UnexpectedReadOnlyDescriptor); } if !data_desc.is_write_only() && req.request_type == RequestType::GetDeviceID { return Err(Error::UnexpectedReadOnlyDescriptor); } req.data_addr = data_desc.addr; req.data_len = data_desc.len; } // The status MUST always be writable. if !status_desc.is_write_only() { return Err(Error::UnexpectedReadOnlyDescriptor); } if status_desc.len < 1 { return Err(Error::DescriptorLengthTooSmall); } req.status_addr = status_desc.addr; Ok(req) } #[allow(clippy::ptr_arg)] fn execute( &self, disk: &mut T, disk_nsectors: u64, mem: &GuestMemoryMmap, disk_id: &Vec, ) -> result::Result { let mut top: u64 = u64::from(self.data_len) / SECTOR_SIZE; if u64::from(self.data_len) % SECTOR_SIZE != 0 { top += 1; } top = top .checked_add(self.sector) .ok_or(ExecuteError::BadRequest(Error::InvalidOffset))?; if top > disk_nsectors { return Err(ExecuteError::BadRequest(Error::InvalidOffset)); } disk.seek(SeekFrom::Start(self.sector << SECTOR_SHIFT)) .map_err(ExecuteError::Seek)?; match self.request_type { RequestType::In => { mem.read_exact_from(self.data_addr, disk, self.data_len as usize) .map_err(ExecuteError::Read)?; return Ok(self.data_len); } RequestType::Out => { mem.write_all_to(self.data_addr, disk, self.data_len as usize) .map_err(ExecuteError::Write)?; } RequestType::Flush => match disk.flush() { Ok(_) => { return Ok(0); } Err(e) => return Err(ExecuteError::Flush(e)), }, RequestType::GetDeviceID => { if (self.data_len as usize) < disk_id.len() { return Err(ExecuteError::BadRequest(Error::InvalidOffset)); } mem.write_slice(&disk_id.as_slice(), self.data_addr) .map_err(ExecuteError::Write)?; } RequestType::Unsupported(t) => return Err(ExecuteError::Unsupported(t)), }; Ok(0) } } struct BlockEpollHandler { queues: Vec, mem: Arc>, disk_image: T, disk_nsectors: u64, interrupt_cb: Arc, disk_image_id: Vec, } impl BlockEpollHandler { fn process_queue(&mut self, queue_index: usize) -> bool { let queue = &mut self.queues[queue_index]; let mut used_desc_heads = [(0, 0); QUEUE_SIZE as usize]; let mut used_count = 0; let mem = self.mem.read().unwrap(); for avail_desc in queue.iter(&mem) { let len; match Request::parse(&avail_desc, &mem) { Ok(request) => { let status = match request.execute( &mut self.disk_image, self.disk_nsectors, &mem, &self.disk_image_id, ) { Ok(l) => { len = l; VIRTIO_BLK_S_OK } Err(e) => { error!("Failed to execute request: {:?}", e); len = 1; // We need at least 1 byte for the status. e.status() } }; // We use unwrap because the request parsing process already checked that the // status_addr was valid. mem.write_obj(status, request.status_addr).unwrap(); } Err(e) => { error!("Failed to parse available descriptor chain: {:?}", e); len = 0; } } used_desc_heads[used_count] = (avail_desc.index, len); used_count += 1; } for &(desc_index, len) in &used_desc_heads[..used_count] { queue.add_used(&mem, desc_index, len); } used_count > 0 } fn signal_used_queue(&self, queue_index: usize) -> result::Result<(), DeviceError> { (self.interrupt_cb)(&VirtioInterruptType::Queue, Some(&self.queues[queue_index])).map_err( |e| { error!("Failed to signal used queue: {:?}", e); DeviceError::FailedSignalingUsedQueue(e) }, ) } #[allow(dead_code)] fn update_disk_image( &mut self, disk_image: T, disk_path: &PathBuf, ) -> result::Result<(), DeviceError> { self.disk_image = disk_image; self.disk_nsectors = self .disk_image .seek(SeekFrom::End(0)) .map_err(DeviceError::IoError)? / SECTOR_SIZE; self.disk_image_id = build_disk_image_id(disk_path); Ok(()) } fn run(&mut self, queue_evt: EventFd, kill_evt: EventFd) -> result::Result<(), DeviceError> { // Create the epoll file descriptor let epoll_fd = epoll::create(true).map_err(DeviceError::EpollCreateFd)?; // Add events epoll::ctl( epoll_fd, epoll::ControlOptions::EPOLL_CTL_ADD, queue_evt.as_raw_fd(), epoll::Event::new(epoll::Events::EPOLLIN, u64::from(QUEUE_AVAIL_EVENT)), ) .map_err(DeviceError::EpollCtl)?; epoll::ctl( epoll_fd, epoll::ControlOptions::EPOLL_CTL_ADD, kill_evt.as_raw_fd(), epoll::Event::new(epoll::Events::EPOLLIN, u64::from(KILL_EVENT)), ) .map_err(DeviceError::EpollCtl)?; const EPOLL_EVENTS_LEN: usize = 100; let mut events = vec![epoll::Event::new(epoll::Events::empty(), 0); EPOLL_EVENTS_LEN]; 'epoll: loop { let num_events = match epoll::wait(epoll_fd, -1, &mut events[..]) { Ok(res) => res, Err(e) => { if e.kind() == io::ErrorKind::Interrupted { // It's well defined from the epoll_wait() syscall // documentation that the epoll loop can be interrupted // before any of the requested events occurred or the // timeout expired. In both those cases, epoll_wait() // returns an error of type EINTR, but this should not // be considered as a regular error. Instead it is more // appropriate to retry, by calling into epoll_wait(). continue; } return Err(DeviceError::EpollWait(e)); } }; for event in events.iter().take(num_events) { let ev_type = event.data as u16; match ev_type { QUEUE_AVAIL_EVENT => { if let Err(e) = queue_evt.read() { error!("Failed to get queue event: {:?}", e); break 'epoll; } else if self.process_queue(0) { if let Err(e) = self.signal_used_queue(0) { error!("Failed to signal used queue: {:?}", e); break 'epoll; } } } KILL_EVENT => { debug!("KILL_EVENT received, stopping epoll loop"); break 'epoll; } _ => { error!("Unknown event for virtio-block"); } } } } Ok(()) } } /// Virtio device for exposing block level read/write operations on a host file. pub struct Block { kill_evt: Option, disk_image: Option, disk_path: PathBuf, disk_nsectors: u64, avail_features: u64, acked_features: u64, config_space: Vec, queue_evt: Option, interrupt_cb: Option>, } pub fn build_config_space(disk_size: u64) -> Vec { // We only support disk size, which uses the first two words of the configuration space. // If the image is not a multiple of the sector size, the tail bits are not exposed. // The config space is little endian. let mut config = Vec::with_capacity(CONFIG_SPACE_SIZE); let num_sectors = disk_size >> SECTOR_SHIFT; for i in 0..8 { config.push((num_sectors >> (8 * i)) as u8); } config } impl Block { /// Create a new virtio block device that operates on the given file. /// /// The given file must be seekable and sizable. pub fn new( mut disk_image: T, disk_path: PathBuf, is_disk_read_only: bool, ) -> io::Result> { let disk_size = disk_image.seek(SeekFrom::End(0))? as u64; if disk_size % SECTOR_SIZE != 0 { warn!( "Disk size {} is not a multiple of sector size {}; \ the remainder will not be visible to the guest.", disk_size, SECTOR_SIZE ); } let mut avail_features = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_BLK_F_FLUSH); if is_disk_read_only { avail_features |= 1u64 << VIRTIO_BLK_F_RO; }; Ok(Block { kill_evt: None, disk_image: Some(disk_image), disk_path, disk_nsectors: disk_size / SECTOR_SIZE, avail_features, acked_features: 0u64, config_space: build_config_space(disk_size), queue_evt: None, interrupt_cb: None, }) } } impl Drop for Block { fn drop(&mut self) { if let Some(kill_evt) = self.kill_evt.take() { // Ignore the result because there is nothing we can do about it. let _ = kill_evt.write(1); } } } impl VirtioDevice for Block { fn device_type(&self) -> u32 { VirtioDeviceType::TYPE_BLOCK as u32 } fn queue_max_sizes(&self) -> &[u16] { QUEUE_SIZES } fn features(&self, page: u32) -> u32 { match page { // Get the lower 32-bits of the features bitfield. 0 => self.avail_features as u32, // Get the upper 32-bits of the features bitfield. 1 => (self.avail_features >> 32) as u32, _ => { warn!("Received request for unknown features page."); 0u32 } } } fn ack_features(&mut self, page: u32, value: u32) { let mut v = match page { 0 => u64::from(value), 1 => u64::from(value) << 32, _ => { warn!("Cannot acknowledge unknown features page."); 0u64 } }; // Check if the guest is ACK'ing a feature that we didn't claim to have. let unrequested_features = v & !self.avail_features; if unrequested_features != 0 { warn!("Received acknowledge request for unknown feature."); // Don't count these features as acked. v &= !unrequested_features; } self.acked_features |= v; } fn read_config(&self, offset: u64, mut data: &mut [u8]) { let config_len = self.config_space.len() as u64; if offset >= config_len { error!("Failed to read config space"); return; } if let Some(end) = offset.checked_add(data.len() as u64) { // This write can't fail, offset and end are checked against config_len. data.write_all(&self.config_space[offset as usize..cmp::min(end, config_len) as usize]) .unwrap(); } } fn write_config(&mut self, offset: u64, data: &[u8]) { let data_len = data.len() as u64; let config_len = self.config_space.len() as u64; if offset + data_len > config_len { error!("Failed to write config space"); return; } let (_, right) = self.config_space.split_at_mut(offset as usize); right.copy_from_slice(&data[..]); } fn activate( &mut self, mem: Arc>, interrupt_cb: Arc, queues: Vec, mut queue_evts: Vec, ) -> ActivateResult { if queues.len() != NUM_QUEUES || queue_evts.len() != NUM_QUEUES { error!( "Cannot perform activate. Expected {} queue(s), got {}", NUM_QUEUES, queues.len() ); return Err(ActivateError::BadActivate); } let (self_kill_evt, kill_evt) = match EventFd::new(EFD_NONBLOCK).and_then(|e| Ok((e.try_clone()?, e))) { Ok(v) => v, Err(e) => { error!("failed creating kill EventFd pair: {}", e); return Err(ActivateError::BadActivate); } }; self.kill_evt = Some(self_kill_evt); if let Some(disk_image) = self.disk_image.clone() { let disk_image_id = build_disk_image_id(&self.disk_path); // Save the interrupt EventFD as we need to return it on reset // but clone it to pass into the thread. self.interrupt_cb = Some(interrupt_cb); let interrupt_cb = self.interrupt_cb.as_ref().unwrap().clone(); // Save the queue EventFD as we need to return it on reset // but clone it to pass into the thread. self.queue_evt = Some(queue_evts.remove(0)); let queue_evt = self.queue_evt.as_ref().unwrap().try_clone().map_err(|e| { error!("failed to clone queue EventFd: {}", e); ActivateError::BadActivate })?; let mut handler = BlockEpollHandler { queues, mem, disk_image, disk_nsectors: self.disk_nsectors, interrupt_cb, disk_image_id, }; let worker_result = thread::Builder::new() .name("virtio_blk".to_string()) .spawn(move || handler.run(queue_evt, kill_evt)); if let Err(e) = worker_result { error!("failed to spawn virtio_blk worker: {}", e); return Err(ActivateError::BadActivate); } return Ok(()); } Err(ActivateError::BadActivate) } fn reset(&mut self) -> Option<(Arc, Vec)> { if let Some(kill_evt) = self.kill_evt.take() { // Ignore the result because there is nothing we can do about it. let _ = kill_evt.write(1); } // Return the interrupt and queue EventFDs Some(( self.interrupt_cb.take().unwrap(), vec![self.queue_evt.take().unwrap()], )) } }