From c5a656c9dc660b9a8157d98ab0e7d41e35206c8d Mon Sep 17 00:00:00 2001
From: Sergio Lopez
Date: Mon, 13 Jan 2020 14:10:51 +0100
Subject: [PATCH] vm-virtio: block: Add support for alignment restrictions

Doing I/O on an image opened with O_DIRECT requires adhering to
certain restrictions, requiring the following elements to be aligned:

 - Address of the source/destination memory buffer.
 - File offset.
 - Length of the data to be read/written.

The actual alignment value depends on various elements, and according
to open(2) "(...) there is currently no filesystem-independent
interface for an application to discover these restrictions (...)".

To discover such value, we iterate through a list of alignments
(currently, 512 and 4096) calling pread() with each one and checking
if the operation succeeded.

We also extend RawFile so it can be used as a backend for QcowFile,
so the latter can be easily adapted to support O_DIRECT too.

Signed-off-by: Sergio Lopez
---
 src/bin/vhost_user_blk.rs |   4 +-
 vm-virtio/src/block.rs    | 349 ++++++++++++++++++++++++++++++++++++--
 vmm/src/device_manager.rs |   2 +-
 3 files changed, 340 insertions(+), 15 deletions(-)

diff --git a/src/bin/vhost_user_blk.rs b/src/bin/vhost_user_blk.rs
index c570095cc..d0737815a 100644
--- a/src/bin/vhost_user_blk.rs
+++ b/src/bin/vhost_user_blk.rs
@@ -87,7 +87,9 @@ impl VhostUserBlkBackend {
         let image_id = build_disk_image_id(&PathBuf::from(&image_path));
         let image_type = qcow::detect_image_type(&raw_img).unwrap();
         let mut image = match image_type {
-            ImageType::Raw => Box::new(vm_virtio::RawFile::new(raw_img)) as Box<dyn DiskFile>,
+            ImageType::Raw => {
+                Box::new(vm_virtio::RawFile::new(raw_img, false)) as Box<dyn DiskFile>
+            }
             ImageType::Qcow2 => Box::new(QcowFile::from(raw_img).unwrap()) as Box<dyn DiskFile>,
         };
 
diff --git a/vm-virtio/src/block.rs b/vm-virtio/src/block.rs
index e3149a0b8..fdd839a69 100755
--- a/vm-virtio/src/block.rs
+++ b/vm-virtio/src/block.rs
@@ -16,21 +16,24 @@ use super::{
 use crate::VirtioInterrupt;
 use arc_swap::ArcSwap;
 use epoll;
-use libc::EFD_NONBLOCK;
+use libc::{c_void, EFD_NONBLOCK};
+use std::alloc::{alloc_zeroed, dealloc, handle_alloc_error, Layout};
 use std::cmp;
-use std::fs::File;
+use std::convert::TryInto;
+use std::fs::{File, Metadata};
 use std::io::{self, Read, Seek, SeekFrom, Write};
 use std::os::linux::fs::MetadataExt;
-use std::os::unix::io::AsRawFd;
+use std::os::unix::io::{AsRawFd, RawFd};
 use std::path::PathBuf;
 use std::result;
+use std::slice;
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::Arc;
 use std::thread;
 use virtio_bindings::bindings::virtio_blk::*;
 use vm_device::{Migratable, MigratableError, Pausable, Snapshotable};
 use vm_memory::{Bytes, GuestAddress, GuestMemory, GuestMemoryError, GuestMemoryMmap};
-use vmm_sys_util::eventfd::EventFd;
+use vmm_sys_util::{eventfd::EventFd, seek_hole::SeekHole, write_zeroes::PunchHole};
 
 const CONFIG_SPACE_SIZE: usize = 8;
 const SECTOR_SHIFT: u8 = 9;
@@ -94,31 +97,304 @@ impl ExecuteError {
 pub trait DiskFile: Read + Seek + Write + Clone {}
 impl<D: Read + Seek + Write + Clone> DiskFile for D {}
 
+/// Raw disk image backend that can honour the alignment restrictions
+/// imposed by O_DIRECT, bouncing unaligned requests through an aligned
+/// temporary buffer.
+#[derive(Debug)]
 pub struct RawFile {
     file: File,
+    // Required alignment in bytes, or 0 when there are no restrictions.
+    alignment: usize,
+    // Cached copy of the file offset, kept in sync on read/write/seek.
+    position: u64,
+}
+
+// Candidate alignments probed, in order, when O_DIRECT is in use.
+const BLK_ALIGNMENTS: [usize; 2] = [512, 4096];
+
+/// Returns true if a pread() of `alignment` bytes at offset `alignment`
+/// into an `alignment`-aligned buffer succeeds on `fd`.
+fn is_valid_alignment(fd: RawFd, alignment: usize) -> bool {
+    let layout = Layout::from_size_align(alignment, alignment).unwrap();
+    // SAFETY: from_size_align() rejects a zero alignment, so size is non-zero.
+    let ptr = unsafe { alloc_zeroed(layout) };
+    if ptr.is_null() {
+        handle_alloc_error(layout);
+    }
+
+    // SAFETY: `ptr` points to `alignment` writable bytes owned by us.
+    let ret = unsafe {
+        ::libc::pread(
+            fd,
+            ptr as *mut c_void,
+            alignment,
+            alignment.try_into().unwrap(),
+        )
+    };
+
+    // SAFETY: `ptr` was allocated above with this same `layout`.
+    unsafe { dealloc(ptr, layout) };
+
+    ret >= 0
 }
 
 impl RawFile {
-    pub fn new(file: File) -> Self {
-        RawFile { file }
+    /// Wraps `file`. When `direct_io` is true the required alignment is
+    /// probed with the values in `BLK_ALIGNMENTS`; otherwise (or if no
+    /// candidate works) no alignment restrictions are assumed.
+    pub fn new(file: File, direct_io: bool) -> Self {
+        // Assume no alignment restrictions if we aren't using O_DIRECT.
+        let mut alignment = 0;
+        if direct_io {
+            for align in &BLK_ALIGNMENTS {
+                if is_valid_alignment(file.as_raw_fd(), *align) {
+                    alignment = *align;
+                    break;
+                }
+            }
+        }
+        RawFile {
+            file,
+            alignment,
+            position: 0,
+        }
+    }
+
+    /// Rounds `offset` up to the nearest multiple of the (non-zero) alignment.
+    fn round_up(&self, offset: u64) -> u64 {
+        let align: u64 = self.alignment.try_into().unwrap();
+        ((offset + align - 1) / align) * align
+    }
+
+    /// Rounds `offset` down to the nearest multiple of the (non-zero) alignment.
+    fn round_down(&self, offset: u64) -> u64 {
+        let align: u64 = self.alignment.try_into().unwrap();
+        (offset / align) * align
+    }
+
+    /// Returns true if `buf` can be handed to the kernel as-is: either
+    /// there are no restrictions, or the current position, the buffer
+    /// address and the buffer length are all multiples of the alignment.
+    fn is_aligned(&self, buf: &[u8]) -> bool {
+        if self.alignment == 0 {
+            return true;
+        }
+
+        let align64: u64 = self.alignment.try_into().unwrap();
+
+        (self.position % align64 == 0)
+            && ((buf.as_ptr() as usize) % self.alignment == 0)
+            && (buf.len() % self.alignment == 0)
+    }
+
+    pub fn set_len(&self, size: u64) -> std::io::Result<()> {
+        self.file.set_len(size)
+    }
+
+    pub fn metadata(&self) -> std::io::Result<Metadata> {
+        self.file.metadata()
+    }
+
+    /// Duplicates the underlying file handle, keeping the same alignment
+    /// and cached position.
+    pub fn try_clone(&self) -> std::io::Result<RawFile> {
+        Ok(RawFile {
+            file: self.file.try_clone()?,
+            alignment: self.alignment,
+            position: self.position,
+        })
+    }
+
+    pub fn sync_all(&self) -> std::io::Result<()> {
+        self.file.sync_all()
+    }
+
+    pub fn sync_data(&self) -> std::io::Result<()> {
+        self.file.sync_data()
     }
 }
 
 impl Read for RawFile {
     fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
-        self.file.read(buf)
-    }
-}
+        if self.is_aligned(buf) {
+            match self.file.read(buf) {
+                Ok(r) => {
+                    self.position = self.position.checked_add(r.try_into().unwrap()).unwrap();
+                    Ok(r)
+                }
+                Err(e) => Err(e),
+            }
+        } else {
+            let buf_len: usize = buf.len();
+            if buf_len == 0 {
+                return Ok(0);
+            }
+
+            // Bounce the request through an aligned temporary buffer that
+            // covers every aligned block touched by [position, position + len).
+            let rounded_pos: u64 = self.round_down(self.position);
+            let file_offset: usize = self
+                .position
+                .checked_sub(rounded_pos)
+                .unwrap()
+                .try_into()
+                .unwrap();
+            let rounded_len: usize = self
+                .round_up(
+                    file_offset
+                        .checked_add(buf_len)
+                        .unwrap()
+                        .try_into()
+                        .unwrap(),
+                )
+                .try_into()
+                .unwrap();
 
-impl Seek for RawFile {
-    fn seek(&mut self, pos: SeekFrom) -> std::io::Result<u64> {
-        self.file.seek(pos)
+            let layout = Layout::from_size_align(rounded_len, self.alignment).unwrap();
+            // SAFETY: `rounded_len` is non-zero because `buf_len` is.
+            let tmp_ptr = unsafe { alloc_zeroed(layout) };
+            if tmp_ptr.is_null() {
+                handle_alloc_error(layout);
+            }
+            // SAFETY: `tmp_ptr` is non-null and points to `rounded_len`
+            // zero-initialized bytes that nothing else references.
+            let tmp_buf = unsafe { slice::from_raw_parts_mut(tmp_ptr, rounded_len) };
+
+            // This can eventually be replaced with read_at once its interface
+            // has been stabilized.
+            let ret = unsafe {
+                ::libc::pread64(
+                    self.file.as_raw_fd(),
+                    tmp_buf.as_mut_ptr() as *mut c_void,
+                    tmp_buf.len(),
+                    rounded_pos.try_into().unwrap(),
+                )
+            };
+            if ret < 0 {
+                // Capture errno before dealloc() gets a chance to clobber it.
+                let err = io::Error::last_os_error();
+                unsafe { dealloc(tmp_ptr, layout) };
+                return Err(err);
+            }
+
+            let read: usize = ret.try_into().unwrap();
+            if read < file_offset {
+                unsafe { dealloc(tmp_ptr, layout) };
+                return Ok(0);
+            }
+
+            let mut to_copy = read - file_offset;
+            if to_copy > buf_len {
+                to_copy = buf_len;
+            }
+
+            // Only hand back the bytes that were actually read from the file.
+            buf[..to_copy].copy_from_slice(&tmp_buf[file_offset..(file_offset + to_copy)]);
+            unsafe { dealloc(tmp_ptr, layout) };
+
+            self.seek(SeekFrom::Current(to_copy.try_into().unwrap()))?;
+            Ok(to_copy)
+        }
     }
 }
 
 impl Write for RawFile {
     fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
-        self.file.write(buf)
+        if self.is_aligned(buf) {
+            match self.file.write(buf) {
+                Ok(r) => {
+                    self.position = self.position.checked_add(r.try_into().unwrap()).unwrap();
+                    Ok(r)
+                }
+                Err(e) => Err(e),
+            }
+        } else {
+            let buf_len: usize = buf.len();
+            if buf_len == 0 {
+                return Ok(0);
+            }
+
+            // Read-modify-write: fetch the aligned blocks covering the
+            // request, patch in `buf`, and write the whole range back.
+            let rounded_pos: u64 = self.round_down(self.position);
+            let file_offset: usize = self
+                .position
+                .checked_sub(rounded_pos)
+                .unwrap()
+                .try_into()
+                .unwrap();
+            let rounded_len: usize = self
+                .round_up(
+                    file_offset
+                        .checked_add(buf_len)
+                        .unwrap()
+                        .try_into()
+                        .unwrap(),
+                )
+                .try_into()
+                .unwrap();
+
+            let layout = Layout::from_size_align(rounded_len, self.alignment).unwrap();
+            // SAFETY: `rounded_len` is non-zero because `buf_len` is.
+            let tmp_ptr = unsafe { alloc_zeroed(layout) };
+            if tmp_ptr.is_null() {
+                handle_alloc_error(layout);
+            }
+            // SAFETY: `tmp_ptr` is non-null and points to `rounded_len`
+            // zero-initialized bytes that nothing else references.
+            let tmp_buf = unsafe { slice::from_raw_parts_mut(tmp_ptr, rounded_len) };
+
+            // This can eventually be replaced with read_at once its interface
+            // has been stabilized.
+            let ret = unsafe {
+                ::libc::pread64(
+                    self.file.as_raw_fd(),
+                    tmp_buf.as_mut_ptr() as *mut c_void,
+                    tmp_buf.len(),
+                    rounded_pos.try_into().unwrap(),
+                )
+            };
+            if ret < 0 {
+                // Capture errno before dealloc() gets a chance to clobber it.
+                let err = io::Error::last_os_error();
+                unsafe { dealloc(tmp_ptr, layout) };
+                return Err(err);
+            }
+
+            tmp_buf[file_offset..(file_offset + buf_len)].copy_from_slice(buf);
+
+            // This can eventually be replaced with write_at once its interface
+            // has been stabilized.
+            let ret = unsafe {
+                ::libc::pwrite64(
+                    self.file.as_raw_fd(),
+                    tmp_buf.as_ptr() as *const c_void,
+                    tmp_buf.len(),
+                    rounded_pos.try_into().unwrap(),
+                )
+            };
+            // Capture errno before dealloc() gets a chance to clobber it.
+            let write_res = if ret < 0 {
+                Err(io::Error::last_os_error())
+            } else {
+                Ok(ret)
+            };
+
+            unsafe { dealloc(tmp_ptr, layout) };
+
+            let written: usize = write_res?.try_into().unwrap();
+            if written < file_offset {
+                Ok(0)
+            } else {
+                let mut to_seek = written - file_offset;
+                if to_seek > buf_len {
+                    to_seek = buf_len;
+                }
+
+                self.seek(SeekFrom::Current(to_seek.try_into().unwrap()))?;
+                Ok(to_seek)
+            }
+        }
     }
 
     fn flush(&mut self) -> std::io::Result<()> {
@@ -126,10 +402,57 @@ impl Write for RawFile {
     }
 }
 
+impl Seek for RawFile {
+    fn seek(&mut self, newpos: SeekFrom) -> std::io::Result<u64> {
+        // Keep the cached position in sync with the real file offset.
+        match self.file.seek(newpos) {
+            Ok(pos) => {
+                self.position = pos;
+                Ok(pos)
+            }
+            Err(e) => Err(e),
+        }
+    }
+}
+
+impl PunchHole for RawFile {
+    fn punch_hole(&mut self, offset: u64, length: u64) -> std::io::Result<()> {
+        self.file.punch_hole(offset, length)
+    }
+}
+
+impl SeekHole for RawFile {
+    fn seek_hole(&mut self, offset: u64) -> std::io::Result<Option<u64>> {
+        match self.file.seek_hole(offset) {
+            Ok(pos) => {
+                if let Some(p) = pos {
+                    self.position = p;
+                }
+                Ok(pos)
+            }
+            Err(e) => Err(e),
+        }
+    }
+
+    fn seek_data(&mut self, offset: u64) -> std::io::Result<Option<u64>> {
+        match self.file.seek_data(offset) {
+            Ok(pos) => {
+                if let Some(p) = pos {
+                    self.position = p;
+                }
+                Ok(pos)
+            }
+            Err(e) => Err(e),
+        }
+    }
+}
+
 impl Clone for RawFile {
     fn clone(&self) -> Self {
         RawFile {
             file: self.file.try_clone().expect("RawFile cloning failed"),
+            alignment: self.alignment,
+            position: self.position,
         }
     }
 }
diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs
index 063c69d4b..d6bc5730a 100644
--- a/vmm/src/device_manager.rs
+++ b/vmm/src/device_manager.rs
@@ -937,7 +937,7 @@ impl DeviceManager {
                     .map_err(DeviceManagerError::DetectImageType)?;
                 match image_type {
                     ImageType::Raw => {
-                        let raw_img = vm_virtio::RawFile::new(raw_img);
+                        let raw_img = vm_virtio::RawFile::new(raw_img, false);
                         let dev = vm_virtio::Block::new(
                             raw_img,
                             disk_cfg.path.clone(),