// Copyright 2019 Intel Corporation. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Portions Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the THIRD-PARTY file.

/// This is the `VirtioDevice` implementation for our vsock device. It handles the virtio-level
/// device logic: feature negotiation, device configuration, and device activation.
/// The run-time device logic (i.e. event-driven data handling) is implemented by
/// `VsockEpollHandler`.
///
/// We aim to conform to the VirtIO v1.1 spec:
/// https://docs.oasis-open.org/virtio/virtio/v1.1/virtio-v1.1.html
///
/// The vsock device has two input parameters: a CID to identify the device, and a `VsockBackend`
/// to use for offloading vsock traffic.
///
/// Upon its activation, the vsock device creates its `VsockEpollHandler`, passes it the
/// event-interested file descriptors, and spawns a worker thread in which the handler registers
/// these descriptors with its own epoll instance. Going forward, the handler will get notified
/// whenever an event occurs on the just-registered FDs:
/// - an RX queue FD;
/// - a TX queue FD;
/// - an event queue FD;
/// - a backend FD; and
/// - a kill FD, written to when the device is dropped.
///
use epoll;
use libc::EFD_NONBLOCK;
use std;
use std::io;
use std::os::unix::io::AsRawFd;
use std::result;
use std::sync::{Arc, RwLock};
use std::thread;

use super::{VsockBackend, VsockPacket};
use crate::Error as DeviceError;
use crate::VirtioInterrupt;
use crate::{
    ActivateError, ActivateResult, DeviceEventT, Queue, VirtioDevice, VirtioDeviceType,
    VirtioInterruptType, VIRTIO_F_IN_ORDER, VIRTIO_F_VERSION_1,
};

use byteorder::{ByteOrder, LittleEndian};
use vm_memory::GuestMemoryMmap;
use vmm_sys_util::eventfd::EventFd;

/// Maximum size advertised for each virtqueue; it also bounds the per-pass bookkeeping arrays
/// used while walking the RX and TX queues.
const QUEUE_SIZE: u16 = 256;
/// Number of virtqueues the device expects on activation (RX, TX and event).
const NUM_QUEUES: usize = 3;
/// Per-queue maximum sizes, as returned by `queue_max_sizes()`.
const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE; NUM_QUEUES];

/// New descriptors are pending on the rx queue.
pub const RX_QUEUE_EVENT: DeviceEventT = 0;
/// New descriptors are pending on the tx queue.
pub const TX_QUEUE_EVENT: DeviceEventT = 1;
/// New descriptors are pending on the event queue.
pub const EVT_QUEUE_EVENT: DeviceEventT = 2;
/// Notification coming from the backend.
pub const BACKEND_EVENT: DeviceEventT = 3;
/// The device has been dropped.
pub const KILL_EVENT: DeviceEventT = 4;
/// Number of distinct event tokens above; used to size the buffer handed to `epoll::wait()`.
pub const EVENTS_LEN: usize = 5;

/// The `VsockEpollHandler` implements the runtime logic of our vsock device:
/// 1. Respond to TX queue events by wrapping virtio buffers into `VsockPacket`s, then sending those
///    packets to the `VsockBackend`;
/// 2. Forward backend FD event notifications to the `VsockBackend`;
/// 3. Fetch incoming packets from the `VsockBackend` and place them into the virtio RX queue;
/// 4. Whenever we have processed some virtio buffers (either TX or RX), let the driver know by
///    raising our assigned IRQ.
///
/// In a nutshell, the `VsockEpollHandler` logic looks like this:
/// - on TX queue event:
///   - fetch all packets from the TX queue and send them to the backend; then
///   - if the backend has queued up any incoming packets, fetch them into any available RX buffers.
/// - on RX queue event: /// - fetch any incoming packets, queued up by the backend, into newly available RX buffers. /// - on backend event: /// - forward the event to the backend; then /// - again, attempt to fetch any incoming packets queued by the backend into virtio RX buffers. /// pub struct VsockEpollHandler { pub mem: Arc>, pub queues: Vec, pub queue_evts: Vec, pub kill_evt: EventFd, pub interrupt_cb: Arc, pub backend: B, } impl VsockEpollHandler where B: VsockBackend, { /// Signal the guest driver that we've used some virtio buffers that it had previously made /// available. /// fn signal_used_queue(&self, queue: &Queue) -> result::Result<(), DeviceError> { debug!("vsock: raising IRQ"); (self.interrupt_cb)(&VirtioInterruptType::Queue, Some(queue)).map_err(|e| { error!("Failed to signal used queue: {:?}", e); DeviceError::FailedSignalingUsedQueue(e) }) } /// Walk the driver-provided RX queue buffers and attempt to fill them up with any data that we /// have pending. /// fn process_rx(&mut self) -> result::Result<(), DeviceError> { debug!("vsock: epoll_handler::process_rx()"); let mut used_desc_heads = [(0, 0); QUEUE_SIZE as usize]; let mut used_count = 0; let mem = self.mem.read().unwrap(); for avail_desc in self.queues[0].iter(&mem) { let used_len = match VsockPacket::from_rx_virtq_head(&avail_desc) { Ok(mut pkt) => { if self.backend.recv_pkt(&mut pkt).is_ok() { pkt.hdr().len() as u32 + pkt.len() } else { // We are using a consuming iterator over the virtio buffers, so, if we can't // fill in this buffer, we'll need to undo the last iterator step. 
self.queues[0].go_to_previous_position(); break; } } Err(e) => { warn!("vsock: RX queue error: {:?}", e); 0 } }; used_desc_heads[used_count] = (avail_desc.index, used_len); used_count += 1; } for &(desc_index, len) in &used_desc_heads[..used_count] { self.queues[0].add_used(&mem, desc_index, len); } if used_count > 0 { self.signal_used_queue(&self.queues[0]) } else { Ok(()) } } /// Walk the driver-provided TX queue buffers, package them up as vsock packets, and send them to /// the backend for processing. /// fn process_tx(&mut self) -> result::Result<(), DeviceError> { debug!("vsock: epoll_handler::process_tx()"); let mut used_desc_heads = [(0, 0); QUEUE_SIZE as usize]; let mut used_count = 0; let mem = self.mem.read().unwrap(); for avail_desc in self.queues[1].iter(&mem) { let pkt = match VsockPacket::from_tx_virtq_head(&avail_desc) { Ok(pkt) => pkt, Err(e) => { error!("vsock: error reading TX packet: {:?}", e); used_desc_heads[used_count] = (avail_desc.index, 0); used_count += 1; continue; } }; if self.backend.send_pkt(&pkt).is_err() { self.queues[1].go_to_previous_position(); break; } used_desc_heads[used_count] = (avail_desc.index, 0); used_count += 1; } for &(desc_index, len) in &used_desc_heads[..used_count] { self.queues[1].add_used(&mem, desc_index, len); } if used_count > 0 { self.signal_used_queue(&self.queues[1]) } else { Ok(()) } } fn run(&mut self) -> result::Result<(), DeviceError> { // Create the epoll file descriptor let epoll_fd = epoll::create(true).map_err(DeviceError::EpollCreateFd)?; // Add events epoll::ctl( epoll_fd, epoll::ControlOptions::EPOLL_CTL_ADD, self.queue_evts[0].as_raw_fd(), epoll::Event::new(epoll::Events::EPOLLIN, u64::from(RX_QUEUE_EVENT)), ) .map_err(DeviceError::EpollCtl)?; epoll::ctl( epoll_fd, epoll::ControlOptions::EPOLL_CTL_ADD, self.queue_evts[1].as_raw_fd(), epoll::Event::new(epoll::Events::EPOLLIN, u64::from(TX_QUEUE_EVENT)), ) .map_err(DeviceError::EpollCtl)?; epoll::ctl( epoll_fd, 
epoll::ControlOptions::EPOLL_CTL_ADD, self.queue_evts[2].as_raw_fd(), epoll::Event::new(epoll::Events::EPOLLIN, u64::from(EVT_QUEUE_EVENT)), ) .map_err(DeviceError::EpollCtl)?; epoll::ctl( epoll_fd, epoll::ControlOptions::EPOLL_CTL_ADD, self.backend.get_polled_fd(), epoll::Event::new(self.backend.get_polled_evset(), u64::from(BACKEND_EVENT)), ) .map_err(DeviceError::EpollCtl)?; epoll::ctl( epoll_fd, epoll::ControlOptions::EPOLL_CTL_ADD, self.kill_evt.as_raw_fd(), epoll::Event::new(epoll::Events::EPOLLIN, u64::from(KILL_EVENT)), ) .map_err(DeviceError::EpollCtl)?; let mut events = vec![epoll::Event::new(epoll::Events::empty(), 0); EVENTS_LEN]; 'epoll: loop { let num_events = match epoll::wait(epoll_fd, -1, &mut events[..]) { Ok(res) => res, Err(e) => { if e.kind() == io::ErrorKind::Interrupted { // It's well defined from the epoll_wait() syscall // documentation that the epoll loop can be interrupted // before any of the requested events occurred or the // timeout expired. In both those cases, epoll_wait() // returns an error of type EINTR, but this should not // be considered as a regular error. Instead it is more // appropriate to retry, by calling into epoll_wait(). continue; } return Err(DeviceError::EpollWait(e)); } }; for event in events.iter().take(num_events) { let evset = match epoll::Events::from_bits(event.events) { Some(evset) => evset, None => { let evbits = event.events; warn!("epoll: ignoring unknown event set: 0x{:x}", evbits); continue; } }; let ev_type = event.data as DeviceEventT; if self.handle_event(ev_type, evset)? 
{ break 'epoll; } } } Ok(()) } pub fn handle_event( &mut self, device_event: DeviceEventT, evset: epoll::Events, ) -> Result { match device_event { RX_QUEUE_EVENT => { debug!("vsock: RX queue event"); if let Err(e) = self.queue_evts[0].read() { error!("Failed to get RX queue event: {:?}", e); return Err(DeviceError::FailedReadingQueue { event_type: "rx queue event", underlying: e, }); } else if self.backend.has_pending_rx() { self.process_rx()?; } } TX_QUEUE_EVENT => { debug!("vsock: TX queue event"); if let Err(e) = self.queue_evts[1].read() { error!("Failed to get TX queue event: {:?}", e); return Err(DeviceError::FailedReadingQueue { event_type: "tx queue event", underlying: e, }); } else { self.process_tx()?; // The backend may have queued up responses to the packets we sent during TX queue // processing. If that happened, we need to fetch those responses and place them // into RX buffers. if self.backend.has_pending_rx() { self.process_rx()?; } } } EVT_QUEUE_EVENT => { debug!("vsock: EVT queue event"); if let Err(e) = self.queue_evts[2].read() { error!("Failed to get EVT queue event: {:?}", e); return Err(DeviceError::FailedReadingQueue { event_type: "evt queue event", underlying: e, }); } } BACKEND_EVENT => { debug!("vsock: backend event"); self.backend.notify(evset); // After the backend has been kicked, it might've freed up some resources, so we // can attempt to send it more data to process. // In particular, if `self.backend.send_pkt()` halted the TX queue processing (by // reurning an error) at some point in the past, now is the time to try walking the // TX queue again. self.process_tx()?; if self.backend.has_pending_rx() { self.process_rx()?; } } KILL_EVENT => { debug!("KILL_EVENT received, stopping epoll loop"); return Ok(true); } other => { error!("Unknown event for virtio-vsock"); return Err(DeviceError::UnknownEvent { device: "vsock", event: other, }); } } Ok(false) } } /// Virtio device exposing virtual socket to the guest. 
pub struct Vsock { cid: u64, backend: Option, kill_evt: Option, avail_features: u64, acked_features: u64, } impl Vsock where B: VsockBackend, { /// Create a new virtio-vsock device with the given VM CID and vsock /// backend. pub fn new(cid: u64, backend: B) -> io::Result> { let avail_features = 1u64 << VIRTIO_F_VERSION_1 | 1u64 << VIRTIO_F_IN_ORDER; Ok(Vsock { cid, backend: Some(backend), kill_evt: None, avail_features, acked_features: 0u64, }) } } impl Drop for Vsock where B: VsockBackend, { fn drop(&mut self) { if let Some(kill_evt) = self.kill_evt.take() { // Ignore the result because there is nothing we can do about it. let _ = kill_evt.write(1); } } } impl VirtioDevice for Vsock where B: VsockBackend + 'static, { fn device_type(&self) -> u32 { VirtioDeviceType::TYPE_VSOCK as u32 } fn queue_max_sizes(&self) -> &[u16] { QUEUE_SIZES } fn features(&self, page: u32) -> u32 { match page { // Get the lower 32-bits of the features bitfield. 0 => self.avail_features as u32, // Get the upper 32-bits of the features bitfield. 1 => (self.avail_features >> 32) as u32, _ => { warn!("Received request for unknown features page."); 0u32 } } } fn ack_features(&mut self, page: u32, value: u32) { let mut v = match page { 0 => u64::from(value), 1 => u64::from(value) << 32, _ => { warn!("Cannot acknowledge unknown features page."); 0u64 } }; // Check if the guest is ACK'ing a feature that we didn't claim to have. let unrequested_features = v & !self.avail_features; if unrequested_features != 0 { warn!("Received acknowledge request for unknown feature."); // Don't count these features as acked. 
v &= !unrequested_features; } self.acked_features |= v; } fn read_config(&self, offset: u64, data: &mut [u8]) { match offset { 0 if data.len() == 8 => LittleEndian::write_u64(data, self.cid), 0 if data.len() == 4 => LittleEndian::write_u32(data, (self.cid & 0xffff_ffff) as u32), 4 if data.len() == 4 => { LittleEndian::write_u32(data, ((self.cid >> 32) & 0xffff_ffff) as u32) } _ => warn!( "vsock: virtio-vsock received invalid read request of {} bytes at offset {}", data.len(), offset ), } } fn write_config(&mut self, offset: u64, data: &[u8]) { warn!( "vsock: guest driver attempted to write device config (offset={:x}, len={:x})", offset, data.len() ); } fn activate( &mut self, mem: Arc>, interrupt_cb: Arc, queues: Vec, queue_evts: Vec, ) -> ActivateResult { if queues.len() != NUM_QUEUES || queue_evts.len() != NUM_QUEUES { error!( "Cannot perform activate. Expected {} queue(s), got {}", NUM_QUEUES, queues.len() ); return Err(ActivateError::BadActivate); } let (self_kill_evt, kill_evt) = match EventFd::new(EFD_NONBLOCK).and_then(|e| Ok((e.try_clone()?, e))) { Ok(v) => v, Err(e) => { error!("failed creating kill EventFd pair: {}", e); return Err(ActivateError::BadActivate); } }; self.kill_evt = Some(self_kill_evt); let mut handler = VsockEpollHandler { mem, queues, queue_evts, kill_evt, interrupt_cb, backend: self.backend.take().unwrap(), }; let worker_result = thread::Builder::new() .name("virtio_vsock".to_string()) .spawn(move || handler.run()); if let Err(e) = worker_result { error!("failed to spawn virtio_vsock worker: {}", e); return Err(ActivateError::BadActivate);; } Ok(()) } }