// Copyright 2019 The Chromium OS Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. use super::fs_cache_req_handler::FsCacheReqHandler; use crate::filesystem::{ Context, DirEntry, Entry, FileSystem, FsOptions, GetxattrReply, ListxattrReply, OpenOptions, SetattrValid, ZeroCopyReader, ZeroCopyWriter, }; use crate::fuse; use crate::multikey::MultikeyBTreeMap; use std::collections::btree_map; use std::collections::BTreeMap; use std::ffi::{CStr, CString}; use std::fs::File; use std::io; use std::mem::{self, size_of, MaybeUninit}; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; use std::str::FromStr; use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::sync::{Arc, RwLock}; use std::time::Duration; use vm_memory::ByteValued; const CURRENT_DIR_CSTR: &[u8] = b".\0"; const PARENT_DIR_CSTR: &[u8] = b"..\0"; const EMPTY_CSTR: &[u8] = b"\0"; const PROC_CSTR: &[u8] = b"/proc/self/fd\0"; type Inode = u64; type Handle = u64; #[derive(Clone, Copy, PartialOrd, Ord, PartialEq, Eq)] struct InodeAltKey { ino: libc::ino64_t, dev: libc::dev_t, } struct InodeData { inode: Inode, // Most of these aren't actually files but ¯\_(ツ)_/¯. file: File, refcount: AtomicU64, } struct HandleData { inode: Inode, file: RwLock, } #[repr(C, packed)] #[derive(Clone, Copy, Debug, Default)] struct LinuxDirent64 { d_ino: libc::ino64_t, d_off: libc::off64_t, d_reclen: libc::c_ushort, d_ty: libc::c_uchar, } unsafe impl ByteValued for LinuxDirent64 {} macro_rules! scoped_cred { ($name:ident, $ty:ty, $syscall_nr:expr) => { #[derive(Debug)] struct $name; impl $name { // Changes the effective uid/gid of the current thread to `val`. Changes // the thread's credentials back to root when the returned struct is dropped. fn new(val: $ty) -> io::Result> { if val == 0 { // Nothing to do since we are already uid 0. return Ok(None); } // We want credential changes to be per-thread because otherwise // we might interfere with operations being carried out on other // threads with different uids/gids. However, posix requires that // all threads in a process share the same credentials. To do this // libc uses signals to ensure that when one thread changes its // credentials the other threads do the same thing. // // So instead we invoke the syscall directly in order to get around // this limitation. Another option is to use the setfsuid and // setfsgid systems calls. However since those calls have no way to // return an error, it's preferable to do this instead. // This call is safe because it doesn't modify any memory and we // check the return value. let res = unsafe { libc::syscall($syscall_nr, -1, val, -1) }; if res == 0 { Ok(Some($name)) } else { Err(io::Error::last_os_error()) } } } impl Drop for $name { fn drop(&mut self) { let res = unsafe { libc::syscall($syscall_nr, -1, 0, -1) }; if res < 0 { error!( "failed to change credentials back to root: {}", io::Error::last_os_error(), ); } } } }; } scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid); scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid); fn set_creds( uid: libc::uid_t, gid: libc::gid_t, ) -> io::Result<(Option, Option)> { // We have to change the gid before we change the uid because if we change the uid first then we // lose the capability to change the gid. However changing back can happen in any order. ScopedGid::new(gid).and_then(|gid| Ok((ScopedUid::new(uid)?, gid))) } fn ebadf() -> io::Error { io::Error::from_raw_os_error(libc::EBADF) } fn stat(f: &File) -> io::Result { let mut st = MaybeUninit::::zeroed(); // Safe because this is a constant value and a valid C string. let pathname = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; // Safe because the kernel will only write data in `st` and we check the return // value. let res = unsafe { libc::fstatat64( f.as_raw_fd(), pathname.as_ptr(), st.as_mut_ptr(), libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW, ) }; if res >= 0 { // Safe because the kernel guarantees that the struct is now fully initialized. Ok(unsafe { st.assume_init() }) } else { Err(io::Error::last_os_error()) } } /// The caching policy that the file system should report to the FUSE client. By default the FUSE /// protocol uses close-to-open consistency. This means that any cached contents of the file are /// invalidated the next time that file is opened. #[derive(Debug, Clone)] pub enum CachePolicy { /// The client should never cache file data and all I/O should be directly forwarded to the /// server. This policy must be selected when file contents may change without the knowledge of /// the FUSE client (i.e., the file system does not have exclusive access to the directory). Never, /// The client is free to choose when and how to cache file data. This is the default policy and /// uses close-to-open consistency as described in the enum documentation. Auto, /// The client should always cache file data. This means that the FUSE client will not /// invalidate any cached data that was returned by the file system the last time the file was /// opened. This policy should only be selected when the file system has exclusive access to the /// directory. Always, } impl FromStr for CachePolicy { type Err = &'static str; fn from_str(s: &str) -> Result { match s { "never" | "Never" | "NEVER" => Ok(CachePolicy::Never), "auto" | "Auto" | "AUTO" => Ok(CachePolicy::Auto), "always" | "Always" | "ALWAYS" => Ok(CachePolicy::Always), _ => Err("invalid cache policy"), } } } impl Default for CachePolicy { fn default() -> Self { CachePolicy::Auto } } /// Options that configure the behavior of the file system. #[derive(Debug, Clone)] pub struct Config { /// How long the FUSE client should consider directory entries to be valid. If the contents of a /// directory can only be modified by the FUSE client (i.e., the file system has exclusive /// access), then this should be a large value. /// /// The default value for this option is 5 seconds. pub entry_timeout: Duration, /// How long the FUSE client should consider file and directory attributes to be valid. If the /// attributes of a file or directory can only be modified by the FUSE client (i.e., the file /// system has exclusive access), then this should be set to a large value. /// /// The default value for this option is 5 seconds. pub attr_timeout: Duration, /// The caching policy the file system should use. See the documentation of `CachePolicy` for /// more details. pub cache_policy: CachePolicy, /// Whether the file system should enabled writeback caching. This can improve performance as it /// allows the FUSE client to cache and coalesce multiple writes before sending them to the file /// system. However, enabling this option can increase the risk of data corruption if the file /// contents can change without the knowledge of the FUSE client (i.e., the server does **NOT** /// have exclusive access). Additionally, the file system should have read access to all files /// in the directory it is serving as the FUSE client may send read requests even for files /// opened with `O_WRONLY`. /// /// Therefore callers should only enable this option when they can guarantee that: 1) the file /// system has exclusive access to the directory and 2) the file system has read permissions for /// all files in that directory. /// /// The default value for this option is `false`. pub writeback: bool, /// The path of the root directory. /// /// The default is `/`. pub root_dir: String, /// Whether the file system should support Extended Attributes (xattr). Enabling this feature may /// have a significant impact on performance, especially on write parallelism. This is the result /// of FUSE attempting to remove the special file privileges after each write request. /// /// The default value for this options is `false`. pub xattr: bool, /// Optional file descriptor for /proc/self/fd. Callers can obtain a file descriptor and pass it /// here, so there's no need to open it in PassthroughFs::new(). This is specially useful for /// sandboxing. /// /// The default is `None`. pub proc_sfd_rawfd: Option, } impl Default for Config { fn default() -> Self { Config { entry_timeout: Duration::from_secs(5), attr_timeout: Duration::from_secs(5), cache_policy: Default::default(), writeback: false, root_dir: String::from("/"), xattr: false, proc_sfd_rawfd: None, } } } /// A file system that simply "passes through" all requests it receives to the underlying file /// system. To keep the implementation simple it servers the contents of its root directory. Users /// that wish to serve only a specific directory should set up the environment so that that /// directory ends up as the root of the file system process. One way to accomplish this is via a /// combination of mount namespaces and the pivot_root system call. pub struct PassthroughFs { // File descriptors for various points in the file system tree. These fds are always opened with // the `O_PATH` option so they cannot be used for reading or writing any data. See the // documentation of the `O_PATH` flag in `open(2)` for more details on what one can and cannot // do with an fd opened with this flag. inodes: RwLock>>, next_inode: AtomicU64, // File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be // used for reading and writing data. handles: RwLock>>, next_handle: AtomicU64, // File descriptor pointing to the `/proc/self/fd` directory. This is used to convert an fd from // `inodes` into one that can go into `handles`. This is accomplished by reading the // `/proc/self/fd/{}` symlink. We keep an open fd here in case the file system tree that we are // meant to be serving doesn't have access to `/proc/self/fd`. proc_self_fd: File, // Whether writeback caching is enabled for this directory. This will only be true when // `cfg.writeback` is true and `init` was called with `FsOptions::WRITEBACK_CACHE`. writeback: AtomicBool, cfg: Config, } impl PassthroughFs { pub fn new(cfg: Config) -> io::Result { let fd = if let Some(fd) = cfg.proc_sfd_rawfd { fd } else { // Safe because this is a constant value and a valid C string. let proc_cstr = unsafe { CStr::from_bytes_with_nul_unchecked(PROC_CSTR) }; // Safe because this doesn't modify any memory and we check the return value. let fd = unsafe { libc::openat( libc::AT_FDCWD, proc_cstr.as_ptr(), libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC, ) }; if fd < 0 { return Err(io::Error::last_os_error()); } fd }; // Safe because we just opened this fd or it was provided by our caller. let proc_self_fd = unsafe { File::from_raw_fd(fd) }; Ok(PassthroughFs { inodes: RwLock::new(MultikeyBTreeMap::new()), next_inode: AtomicU64::new(fuse::ROOT_ID + 1), handles: RwLock::new(BTreeMap::new()), next_handle: AtomicU64::new(0), proc_self_fd, writeback: AtomicBool::new(false), cfg, }) } pub fn keep_fds(&self) -> Vec { vec![self.proc_self_fd.as_raw_fd()] } fn open_inode(&self, inode: Inode, mut flags: i32) -> io::Result { let data = self .inodes .read() .unwrap() .get(&inode) .map(Arc::clone) .ok_or_else(ebadf)?; let pathname = CString::new(format!("{}", data.file.as_raw_fd())) .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; // When writeback caching is enabled, the kernel may send read requests even if the // userspace program opened the file write-only. So we need to ensure that we have opened // the file for reading as well as writing. let writeback = self.writeback.load(Ordering::Relaxed); if writeback && flags & libc::O_ACCMODE == libc::O_WRONLY { flags &= !libc::O_ACCMODE; flags |= libc::O_RDWR; } // When writeback caching is enabled the kernel is responsible for handling `O_APPEND`. // However, this breaks atomicity as the file may have changed on disk, invalidating the // cached copy of the data in the kernel and the offset that the kernel thinks is the end of // the file. Just allow this for now as it is the user's responsibility to enable writeback // caching only for directories that are not shared. It also means that we need to clear the // `O_APPEND` flag. if writeback && flags & libc::O_APPEND != 0 { flags &= !libc::O_APPEND; } // Safe because this doesn't modify any memory and we check the return value. We don't // really check `flags` because if the kernel can't handle poorly specified flags then we // have much bigger problems. Also, clear the `O_NOFOLLOW` flag if it is set since we need // to follow the `/proc/self/fd` symlink to get the file. let fd = unsafe { libc::openat( self.proc_self_fd.as_raw_fd(), pathname.as_ptr(), (flags | libc::O_CLOEXEC) & (!libc::O_NOFOLLOW), ) }; if fd < 0 { return Err(io::Error::last_os_error()); } // Safe because we just opened this fd. Ok(unsafe { File::from_raw_fd(fd) }) } fn do_lookup(&self, parent: Inode, name: &CStr) -> io::Result { let p = self .inodes .read() .unwrap() .get(&parent) .map(Arc::clone) .ok_or_else(ebadf)?; // Safe because this doesn't modify any memory and we check the return value. let fd = unsafe { libc::openat( p.file.as_raw_fd(), name.as_ptr(), libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC, ) }; if fd < 0 { return Err(io::Error::last_os_error()); } // Safe because we just opened this fd. let f = unsafe { File::from_raw_fd(fd) }; let st = stat(&f)?; let altkey = InodeAltKey { ino: st.st_ino, dev: st.st_dev, }; let data = self.inodes.read().unwrap().get_alt(&altkey).map(Arc::clone); let inode = if let Some(data) = data { // Matches with the release store in `forget`. data.refcount.fetch_add(1, Ordering::Acquire); data.inode } else { // There is a possible race here where 2 threads end up adding the same file // into the inode list. However, since each of those will get a unique Inode // value and unique file descriptors this shouldn't be that much of a problem. let inode = self.next_inode.fetch_add(1, Ordering::Relaxed); self.inodes.write().unwrap().insert( inode, InodeAltKey { ino: st.st_ino, dev: st.st_dev, }, Arc::new(InodeData { inode, file: f, refcount: AtomicU64::new(1), }), ); inode }; Ok(Entry { inode, generation: 0, attr: st, attr_timeout: self.cfg.attr_timeout, entry_timeout: self.cfg.entry_timeout, }) } fn do_readdir( &self, inode: Inode, handle: Handle, size: u32, offset: u64, mut add_entry: F, ) -> io::Result<()> where F: FnMut(DirEntry) -> io::Result, { if size == 0 { return Ok(()); } let data = self .handles .read() .unwrap() .get(&handle) .filter(|hd| hd.inode == inode) .map(Arc::clone) .ok_or_else(ebadf)?; let mut buf = vec![0; size as usize]; { // Since we are going to work with the kernel offset, we have to acquire the file lock // for both the `lseek64` and `getdents64` syscalls to ensure that no other thread // changes the kernel offset while we are using it. let dir = data.file.write().unwrap(); // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::lseek64(dir.as_raw_fd(), offset as libc::off64_t, libc::SEEK_SET) }; if res < 0 { return Err(io::Error::last_os_error()); } // Safe because the kernel guarantees that it will only write to `buf` and we check the // return value. let res = unsafe { libc::syscall( libc::SYS_getdents64, dir.as_raw_fd(), buf.as_mut_ptr() as *mut LinuxDirent64, size as libc::c_int, ) }; if res < 0 { return Err(io::Error::last_os_error()); } buf.resize(res as usize, 0); // Explicitly drop the lock so that it's not held while we fill in the fuse buffer. mem::drop(dir); } let mut rem = &buf[..]; while !rem.is_empty() { // We only use debug asserts here because these values are coming from the kernel and we // trust them implicitly. debug_assert!( rem.len() >= size_of::(), "not enough space left in `rem`" ); let (front, back) = rem.split_at(size_of::()); let dirent64 = LinuxDirent64::from_slice(front).expect("unable to get LinuxDirent64 from slice"); let namelen = dirent64.d_reclen as usize - size_of::(); debug_assert!(namelen <= back.len(), "back is smaller than `namelen`"); let name = &back[..namelen]; let res = if name.starts_with(CURRENT_DIR_CSTR) || name.starts_with(PARENT_DIR_CSTR) { // We don't want to report the "." and ".." entries. However, returning `Ok(0)` will // break the loop so return `Ok` with a non-zero value instead. Ok(1) } else { add_entry(DirEntry { ino: dirent64.d_ino, offset: dirent64.d_off as u64, type_: u32::from(dirent64.d_ty), name, }) }; debug_assert!( rem.len() >= dirent64.d_reclen as usize, "rem is smaller than `d_reclen`" ); match res { Ok(0) => break, Ok(_) => rem = &rem[dirent64.d_reclen as usize..], Err(e) => return Err(e), } } Ok(()) } fn do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option, OpenOptions)> { let file = RwLock::new(self.open_inode(inode, flags as i32)?); let handle = self.next_handle.fetch_add(1, Ordering::Relaxed); let data = HandleData { inode, file }; self.handles.write().unwrap().insert(handle, Arc::new(data)); let mut opts = OpenOptions::empty(); match self.cfg.cache_policy { // We only set the direct I/O option on files. CachePolicy::Never => opts.set( OpenOptions::DIRECT_IO, flags & (libc::O_DIRECTORY as u32) == 0, ), CachePolicy::Always => { if flags & (libc::O_DIRECTORY as u32) == 0 { opts |= OpenOptions::KEEP_CACHE; } else { opts |= OpenOptions::CACHE_DIR; } } _ => {} }; Ok((Some(handle), opts)) } fn do_release(&self, inode: Inode, handle: Handle) -> io::Result<()> { let mut handles = self.handles.write().unwrap(); if let btree_map::Entry::Occupied(e) = handles.entry(handle) { if e.get().inode == inode { // We don't need to close the file here because that will happen automatically when // the last `Arc` is dropped. e.remove(); return Ok(()); } } Err(ebadf()) } fn do_getattr(&self, inode: Inode) -> io::Result<(libc::stat64, Duration)> { let data = self .inodes .read() .unwrap() .get(&inode) .map(Arc::clone) .ok_or_else(ebadf)?; let st = stat(&data.file)?; Ok((st, self.cfg.attr_timeout)) } fn do_unlink(&self, parent: Inode, name: &CStr, flags: libc::c_int) -> io::Result<()> { let data = self .inodes .read() .unwrap() .get(&parent) .map(Arc::clone) .ok_or_else(ebadf)?; // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::unlinkat(data.file.as_raw_fd(), name.as_ptr(), flags) }; if res == 0 { Ok(()) } else { Err(io::Error::last_os_error()) } } } fn forget_one( inodes: &mut MultikeyBTreeMap>, inode: Inode, count: u64, ) { if let Some(data) = inodes.get(&inode) { // Acquiring the write lock on the inode map prevents new lookups from incrementing the // refcount but there is the possibility that a previous lookup already acquired a // reference to the inode data and is in the process of updating the refcount so we need // to loop here until we can decrement successfully. loop { let refcount = data.refcount.load(Ordering::Relaxed); // Saturating sub because it doesn't make sense for a refcount to go below zero and // we don't want misbehaving clients to cause integer overflow. let new_count = refcount.saturating_sub(count); // Synchronizes with the acquire load in `do_lookup`. if data .refcount .compare_and_swap(refcount, new_count, Ordering::Release) == refcount { if new_count == 0 { // We just removed the last refcount for this inode. There's no need for an // acquire fence here because we hold a write lock on the inode map and any // thread that is waiting to do a forget on the same inode will have to wait // until we release the lock. So there's is no other release store for us to // synchronize with before deleting the entry. inodes.remove(&inode); } break; } } } } impl FileSystem for PassthroughFs { type Inode = Inode; type Handle = Handle; fn init(&self, capable: FsOptions) -> io::Result { let root = CString::new(self.cfg.root_dir.as_str()).expect("CString::new failed"); // Safe because this doesn't modify any memory and we check the return value. // We use `O_PATH` because we just want this for traversing the directory tree // and not for actually reading the contents. let fd = unsafe { libc::openat( libc::AT_FDCWD, root.as_ptr(), libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC, ) }; if fd < 0 { return Err(io::Error::last_os_error()); } // Safe because we just opened this fd above. let f = unsafe { File::from_raw_fd(fd) }; let st = stat(&f)?; // Safe because this doesn't modify any memory and there is no need to check the return // value because this system call always succeeds. We need to clear the umask here because // we want the client to be able to set all the bits in the mode. unsafe { libc::umask(0o000) }; let mut inodes = self.inodes.write().unwrap(); // Not sure why the root inode gets a refcount of 2 but that's what libfuse does. inodes.insert( fuse::ROOT_ID, InodeAltKey { ino: st.st_ino, dev: st.st_dev, }, Arc::new(InodeData { inode: fuse::ROOT_ID, file: f, refcount: AtomicU64::new(2), }), ); let mut opts = FsOptions::DO_READDIRPLUS | FsOptions::READDIRPLUS_AUTO; if self.cfg.writeback && capable.contains(FsOptions::WRITEBACK_CACHE) { opts |= FsOptions::WRITEBACK_CACHE; self.writeback.store(true, Ordering::Relaxed); } Ok(opts) } fn destroy(&self) { self.handles.write().unwrap().clear(); self.inodes.write().unwrap().clear(); } fn statfs(&self, _ctx: Context, inode: Inode) -> io::Result { let data = self .inodes .read() .unwrap() .get(&inode) .map(Arc::clone) .ok_or_else(ebadf)?; let mut out = MaybeUninit::::zeroed(); // Safe because this will only modify `out` and we check the return value. let res = unsafe { libc::fstatvfs64(data.file.as_raw_fd(), out.as_mut_ptr()) }; if res == 0 { // Safe because the kernel guarantees that `out` has been initialized. Ok(unsafe { out.assume_init() }) } else { Err(io::Error::last_os_error()) } } fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result { self.do_lookup(parent, name) } fn forget(&self, _ctx: Context, inode: Inode, count: u64) { let mut inodes = self.inodes.write().unwrap(); forget_one(&mut inodes, inode, count) } fn batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>) { let mut inodes = self.inodes.write().unwrap(); for (inode, count) in requests { forget_one(&mut inodes, inode, count) } } fn opendir( &self, _ctx: Context, inode: Inode, flags: u32, ) -> io::Result<(Option, OpenOptions)> { self.do_open(inode, flags | (libc::O_DIRECTORY as u32)) } fn releasedir( &self, _ctx: Context, inode: Inode, _flags: u32, handle: Handle, ) -> io::Result<()> { self.do_release(inode, handle) } fn mkdir( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, umask: u32, ) -> io::Result { let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; let data = self .inodes .read() .unwrap() .get(&parent) .map(Arc::clone) .ok_or_else(ebadf)?; // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::mkdirat(data.file.as_raw_fd(), name.as_ptr(), mode & !umask) }; if res == 0 { self.do_lookup(parent, name) } else { Err(io::Error::last_os_error()) } } fn rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> { self.do_unlink(parent, name, libc::AT_REMOVEDIR) } fn readdir( &self, _ctx: Context, inode: Inode, handle: Handle, size: u32, offset: u64, add_entry: F, ) -> io::Result<()> where F: FnMut(DirEntry) -> io::Result, { self.do_readdir(inode, handle, size, offset, add_entry) } fn readdirplus( &self, _ctx: Context, inode: Inode, handle: Handle, size: u32, offset: u64, mut add_entry: F, ) -> io::Result<()> where F: FnMut(DirEntry, Entry) -> io::Result, { self.do_readdir(inode, handle, size, offset, |dir_entry| { // Safe because the kernel guarantees that the buffer is nul-terminated. Additionally, // the kernel will pad the name with '\0' bytes up to 8-byte alignment and there's no // way for us to know exactly how many padding bytes there are. This would cause // `CStr::from_bytes_with_nul` to return an error because it would think there are // interior '\0' bytes. We trust the kernel to provide us with properly formatted data // so we'll just skip the checks here. let name = unsafe { CStr::from_bytes_with_nul_unchecked(dir_entry.name) }; let entry = self.do_lookup(inode, name)?; add_entry(dir_entry, entry) }) } fn open( &self, _ctx: Context, inode: Inode, flags: u32, ) -> io::Result<(Option, OpenOptions)> { self.do_open(inode, flags) } fn release( &self, _ctx: Context, inode: Inode, _flags: u32, handle: Handle, _flush: bool, _flock_release: bool, _lock_owner: Option, ) -> io::Result<()> { self.do_release(inode, handle) } fn create( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, flags: u32, umask: u32, ) -> io::Result<(Entry, Option, OpenOptions)> { let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; let data = self .inodes .read() .unwrap() .get(&parent) .map(Arc::clone) .ok_or_else(ebadf)?; // Safe because this doesn't modify any memory and we check the return value. We don't // really check `flags` because if the kernel can't handle poorly specified flags then we // have much bigger problems. let fd = unsafe { libc::openat( data.file.as_raw_fd(), name.as_ptr(), flags as i32 | libc::O_CREAT | libc::O_CLOEXEC | libc::O_NOFOLLOW, mode & !(umask & 0o777), ) }; if fd < 0 { return Err(io::Error::last_os_error()); } // Safe because we just opened this fd. let file = RwLock::new(unsafe { File::from_raw_fd(fd) }); let entry = self.do_lookup(parent, name)?; let handle = self.next_handle.fetch_add(1, Ordering::Relaxed); let data = HandleData { inode: entry.inode, file, }; self.handles.write().unwrap().insert(handle, Arc::new(data)); let mut opts = OpenOptions::empty(); match self.cfg.cache_policy { CachePolicy::Never => opts |= OpenOptions::DIRECT_IO, CachePolicy::Always => opts |= OpenOptions::KEEP_CACHE, _ => {} }; Ok((entry, Some(handle), opts)) } fn unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> { self.do_unlink(parent, name, 0) } fn setupmapping( &self, _ctx: Context, inode: Inode, _handle: Handle, foffset: u64, len: u64, flags: u64, moffset: u64, vu_req: &mut T, ) -> io::Result<()> { debug!( "setupmapping: ino {:?} foffset {} len {} flags {} moffset {}", inode, foffset, len, flags, moffset ); let open_flags = if (flags & fuse::SetupmappingFlags::WRITE.bits()) != 0 { libc::O_RDWR } else { libc::O_RDONLY }; let file = self.open_inode(inode, open_flags as i32)?; (*vu_req).map(foffset, moffset, len, flags, file.as_raw_fd()) } fn removemapping( &self, _ctx: Context, requests: Vec, vu_req: &mut T, ) -> io::Result<()> { (*vu_req).unmap(requests) } fn read( &self, _ctx: Context, inode: Inode, handle: Handle, mut w: W, size: u32, offset: u64, _lock_owner: Option, _flags: u32, ) -> io::Result { let data = self .handles .read() .unwrap() .get(&handle) .filter(|hd| hd.inode == inode) .map(Arc::clone) .ok_or_else(ebadf)?; // This is safe because write_from uses preadv64, so the underlying file descriptor // offset is not affected by this operation. let mut f = data.file.read().unwrap().try_clone().unwrap(); w.write_from(&mut f, size as usize, offset) } fn write( &self, ctx: Context, inode: Inode, handle: Handle, mut r: R, size: u32, offset: u64, _lock_owner: Option, _delayed_write: bool, kill_priv: bool, _flags: u32, ) -> io::Result { if kill_priv { // We need to change credentials during a write so that the kernel will remove setuid // or setgid bits from the file if it was written to by someone other than the owner. let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; } let data = self .handles .read() .unwrap() .get(&handle) .filter(|hd| hd.inode == inode) .map(Arc::clone) .ok_or_else(ebadf)?; // This is safe because read_to uses pwritev64, so the underlying file descriptor // offset is not affected by this operation. let mut f = data.file.read().unwrap().try_clone().unwrap(); r.read_to(&mut f, size as usize, offset) } fn getattr( &self, _ctx: Context, inode: Inode, _handle: Option, ) -> io::Result<(libc::stat64, Duration)> { self.do_getattr(inode) } fn setattr( &self, _ctx: Context, inode: Inode, attr: libc::stat64, handle: Option, valid: SetattrValid, ) -> io::Result<(libc::stat64, Duration)> { let inode_data = self .inodes .read() .unwrap() .get(&inode) .map(Arc::clone) .ok_or_else(ebadf)?; enum Data { Handle(Arc, RawFd), ProcPath(CString), } // If we have a handle then use it otherwise get a new fd from the inode. let data = if let Some(handle) = handle { let hd = self .handles .read() .unwrap() .get(&handle) .filter(|hd| hd.inode == inode) .map(Arc::clone) .ok_or_else(ebadf)?; let fd = hd.file.write().unwrap().as_raw_fd(); Data::Handle(hd, fd) } else { let pathname = CString::new(format!("{}", inode_data.file.as_raw_fd())) .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; Data::ProcPath(pathname) }; if valid.contains(SetattrValid::MODE) { // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { match data { Data::Handle(_, fd) => libc::fchmod(fd, attr.st_mode), Data::ProcPath(ref p) => { libc::fchmodat(self.proc_self_fd.as_raw_fd(), p.as_ptr(), attr.st_mode, 0) } } }; if res < 0 { return Err(io::Error::last_os_error()); } } if valid.intersects(SetattrValid::UID | SetattrValid::GID) { let uid = if valid.contains(SetattrValid::UID) { attr.st_uid } else { // Cannot use -1 here because these are unsigned values. ::std::u32::MAX }; let gid = if valid.contains(SetattrValid::GID) { attr.st_gid } else { // Cannot use -1 here because these are unsigned values. ::std::u32::MAX }; // Safe because this is a constant value and a valid C string. let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::fchownat( inode_data.file.as_raw_fd(), empty.as_ptr(), uid, gid, libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW, ) }; if res < 0 { return Err(io::Error::last_os_error()); } } if valid.contains(SetattrValid::SIZE) { // Safe because this doesn't modify any memory and we check the return value. let res = match data { Data::Handle(_, fd) => unsafe { libc::ftruncate(fd, attr.st_size) }, _ => { // There is no `ftruncateat` so we need to get a new fd and truncate it. let f = self.open_inode(inode, libc::O_NONBLOCK | libc::O_RDWR)?; unsafe { libc::ftruncate(f.as_raw_fd(), attr.st_size) } } }; if res < 0 { return Err(io::Error::last_os_error()); } } if valid.intersects(SetattrValid::ATIME | SetattrValid::MTIME) { let mut tvs = [ libc::timespec { tv_sec: 0, tv_nsec: libc::UTIME_OMIT, }, libc::timespec { tv_sec: 0, tv_nsec: libc::UTIME_OMIT, }, ]; if valid.contains(SetattrValid::ATIME_NOW) { tvs[0].tv_nsec = libc::UTIME_NOW; } else if valid.contains(SetattrValid::ATIME) { tvs[0].tv_sec = attr.st_atime; tvs[0].tv_nsec = attr.st_atime_nsec; } if valid.contains(SetattrValid::MTIME_NOW) { tvs[1].tv_nsec = libc::UTIME_NOW; } else if valid.contains(SetattrValid::MTIME) { tvs[1].tv_sec = attr.st_mtime; tvs[1].tv_nsec = attr.st_mtime_nsec; } // Safe because this doesn't modify any memory and we check the return value. let res = match data { Data::Handle(_, fd) => unsafe { libc::futimens(fd, tvs.as_ptr()) }, Data::ProcPath(ref p) => unsafe { libc::utimensat(self.proc_self_fd.as_raw_fd(), p.as_ptr(), tvs.as_ptr(), 0) }, }; if res < 0 { return Err(io::Error::last_os_error()); } } self.do_getattr(inode) } fn rename( &self, _ctx: Context, olddir: Inode, oldname: &CStr, newdir: Inode, newname: &CStr, flags: u32, ) -> io::Result<()> { let old_inode = self .inodes .read() .unwrap() .get(&olddir) .map(Arc::clone) .ok_or_else(ebadf)?; let new_inode = self .inodes .read() .unwrap() .get(&newdir) .map(Arc::clone) .ok_or_else(ebadf)?; // Safe because this doesn't modify any memory and we check the return value. // TODO: Switch to libc::renameat2 once https://github.com/rust-lang/libc/pull/1508 lands // and we have glibc 2.28. let res = unsafe { libc::syscall( libc::SYS_renameat2, old_inode.file.as_raw_fd(), oldname.as_ptr(), new_inode.file.as_raw_fd(), newname.as_ptr(), flags, ) }; if res == 0 { Ok(()) } else { Err(io::Error::last_os_error()) } } fn mknod( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, rdev: u32, umask: u32, ) -> io::Result { let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; let data = self .inodes .read() .unwrap() .get(&parent) .map(Arc::clone) .ok_or_else(ebadf)?; // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::mknodat( data.file.as_raw_fd(), name.as_ptr(), (mode & !umask) as libc::mode_t, u64::from(rdev), ) }; if res < 0 { Err(io::Error::last_os_error()) } else { self.do_lookup(parent, name) } } fn link( &self, _ctx: Context, inode: Inode, newparent: Inode, newname: &CStr, ) -> io::Result { let data = self .inodes .read() .unwrap() .get(&inode) .map(Arc::clone) .ok_or_else(ebadf)?; let new_inode = self .inodes .read() .unwrap() .get(&newparent) .map(Arc::clone) .ok_or_else(ebadf)?; // Safe because this is a constant value and a valid C string. let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::linkat( data.file.as_raw_fd(), empty.as_ptr(), new_inode.file.as_raw_fd(), newname.as_ptr(), libc::AT_EMPTY_PATH, ) }; if res == 0 { self.do_lookup(newparent, newname) } else { Err(io::Error::last_os_error()) } } fn symlink( &self, ctx: Context, linkname: &CStr, parent: Inode, name: &CStr, ) -> io::Result { let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?; let data = self .inodes .read() .unwrap() .get(&parent) .map(Arc::clone) .ok_or_else(ebadf)?; // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::symlinkat(linkname.as_ptr(), data.file.as_raw_fd(), name.as_ptr()) }; if res == 0 { self.do_lookup(parent, name) } else { Err(io::Error::last_os_error()) } } fn readlink(&self, _ctx: Context, inode: Inode) -> io::Result> { let data = self .inodes .read() .unwrap() .get(&inode) .map(Arc::clone) .ok_or_else(ebadf)?; let mut buf = vec![0; libc::PATH_MAX as usize]; // Safe because this is a constant value and a valid C string. let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) }; // Safe because this will only modify the contents of `buf` and we check the return value. let res = unsafe { libc::readlinkat( data.file.as_raw_fd(), empty.as_ptr(), buf.as_mut_ptr() as *mut libc::c_char, buf.len(), ) }; if res < 0 { return Err(io::Error::last_os_error()); } buf.resize(res as usize, 0); Ok(buf) } fn flush( &self, _ctx: Context, inode: Inode, handle: Handle, _lock_owner: u64, ) -> io::Result<()> { let data = self .handles .read() .unwrap() .get(&handle) .filter(|hd| hd.inode == inode) .map(Arc::clone) .ok_or_else(ebadf)?; // Since this method is called whenever an fd is closed in the client, we can emulate that // behavior by doing the same thing (dup-ing the fd and then immediately closing it). Safe // because this doesn't modify any memory and we check the return values. unsafe { let newfd = libc::dup(data.file.write().unwrap().as_raw_fd()); if newfd < 0 { return Err(io::Error::last_os_error()); } if libc::close(newfd) < 0 { Err(io::Error::last_os_error()) } else { Ok(()) } } } fn fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()> { let data = self .handles .read() .unwrap() .get(&handle) .filter(|hd| hd.inode == inode) .map(Arc::clone) .ok_or_else(ebadf)?; let fd = data.file.write().unwrap().as_raw_fd(); // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { if datasync { libc::fdatasync(fd) } else { libc::fsync(fd) } }; if res == 0 { Ok(()) } else { Err(io::Error::last_os_error()) } } fn fsyncdir( &self, ctx: Context, inode: Inode, datasync: bool, handle: Handle, ) -> io::Result<()> { self.fsync(ctx, inode, datasync, handle) } fn access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()> { let data = self .inodes .read() .unwrap() .get(&inode) .map(Arc::clone) .ok_or_else(ebadf)?; let st = stat(&data.file)?; let mode = mask as i32 & (libc::R_OK | libc::W_OK | libc::X_OK); if mode == libc::F_OK { // The file exists since we were able to call `stat(2)` on it. return Ok(()); } if (mode & libc::R_OK) != 0 && ctx.uid != 0 && (st.st_uid != ctx.uid || st.st_mode & 0o400 == 0) && (st.st_gid != ctx.gid || st.st_mode & 0o040 == 0) && st.st_mode & 0o004 == 0 { return Err(io::Error::from_raw_os_error(libc::EACCES)); } if (mode & libc::W_OK) != 0 && ctx.uid != 0 && (st.st_uid != ctx.uid || st.st_mode & 0o200 == 0) && (st.st_gid != ctx.gid || st.st_mode & 0o020 == 0) && st.st_mode & 0o002 == 0 { return Err(io::Error::from_raw_os_error(libc::EACCES)); } // root can only execute something if it is executable by one of the owner, the group, or // everyone. if (mode & libc::X_OK) != 0 && (ctx.uid != 0 || st.st_mode & 0o111 == 0) && (st.st_uid != ctx.uid || st.st_mode & 0o100 == 0) && (st.st_gid != ctx.gid || st.st_mode & 0o010 == 0) && st.st_mode & 0o001 == 0 { return Err(io::Error::from_raw_os_error(libc::EACCES)); } Ok(()) } fn setxattr( &self, _ctx: Context, inode: Inode, name: &CStr, value: &[u8], flags: u32, ) -> io::Result<()> { if !self.cfg.xattr { return Err(io::Error::from_raw_os_error(libc::ENOSYS)); } // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we // need to get a new fd. let file = self.open_inode(inode, libc::O_RDONLY | libc::O_NONBLOCK)?; // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::fsetxattr( file.as_raw_fd(), name.as_ptr(), value.as_ptr() as *const libc::c_void, value.len(), flags as libc::c_int, ) }; if res == 0 { Ok(()) } else { Err(io::Error::last_os_error()) } } fn getxattr( &self, _ctx: Context, inode: Inode, name: &CStr, size: u32, ) -> io::Result { if !self.cfg.xattr { return Err(io::Error::from_raw_os_error(libc::ENOSYS)); } // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we // need to get a new fd. let file = self.open_inode(inode, libc::O_RDONLY | libc::O_NONBLOCK)?; let mut buf = vec![0; size as usize]; // Safe because this will only modify the contents of `buf`. let res = unsafe { libc::fgetxattr( file.as_raw_fd(), name.as_ptr(), buf.as_mut_ptr() as *mut libc::c_void, size as libc::size_t, ) }; if res < 0 { return Err(io::Error::last_os_error()); } if size == 0 { Ok(GetxattrReply::Count(res as u32)) } else { buf.resize(res as usize, 0); Ok(GetxattrReply::Value(buf)) } } fn listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result { if !self.cfg.xattr { return Err(io::Error::from_raw_os_error(libc::ENOSYS)); } // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we // need to get a new fd. let file = self.open_inode(inode, libc::O_RDONLY | libc::O_NONBLOCK)?; let mut buf = vec![0; size as usize]; // Safe because this will only modify the contents of `buf`. let res = unsafe { libc::flistxattr( file.as_raw_fd(), buf.as_mut_ptr() as *mut libc::c_char, size as libc::size_t, ) }; if res < 0 { return Err(io::Error::last_os_error()); } if size == 0 { Ok(ListxattrReply::Count(res as u32)) } else { buf.resize(res as usize, 0); Ok(ListxattrReply::Names(buf)) } } fn removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()> { if !self.cfg.xattr { return Err(io::Error::from_raw_os_error(libc::ENOSYS)); } // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we // need to get a new fd. let file = self.open_inode(inode, libc::O_RDONLY | libc::O_NONBLOCK)?; // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::fremovexattr(file.as_raw_fd(), name.as_ptr()) }; if res == 0 { Ok(()) } else { Err(io::Error::last_os_error()) } } fn fallocate( &self, _ctx: Context, inode: Inode, handle: Handle, mode: u32, offset: u64, length: u64, ) -> io::Result<()> { let data = self .handles .read() .unwrap() .get(&handle) .filter(|hd| hd.inode == inode) .map(Arc::clone) .ok_or_else(ebadf)?; let fd = data.file.write().unwrap().as_raw_fd(); // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::fallocate64( fd, mode as libc::c_int, offset as libc::off64_t, length as libc::off64_t, ) }; if res == 0 { Ok(()) } else { Err(io::Error::last_os_error()) } } fn lseek( &self, _ctx: Context, inode: Inode, handle: Handle, offset: u64, whence: u32, ) -> io::Result { let data = self .handles .read() .unwrap() .get(&handle) .filter(|hd| hd.inode == inode) .map(Arc::clone) .ok_or_else(ebadf)?; let fd = data.file.write().unwrap().as_raw_fd(); // Safe because this doesn't modify any memory and we check the return value. let res = unsafe { libc::lseek(fd, offset as libc::off64_t, whence as libc::c_int) }; if res < 0 { Err(io::Error::last_os_error()) } else { Ok(res as u64) } } fn copyfilerange( &self, _ctx: Context, inode_in: Inode, handle_in: Handle, offset_in: u64, inode_out: Inode, handle_out: Handle, offset_out: u64, len: u64, flags: u64, ) -> io::Result { let data_in = self .handles .read() .unwrap() .get(&handle_in) .filter(|hd| hd.inode == inode_in) .map(Arc::clone) .ok_or_else(ebadf)?; // Take just a read lock as we're not going to alter the file descriptor offset. let fd_in = data_in.file.read().unwrap().as_raw_fd(); let data_out = self .handles .read() .unwrap() .get(&handle_out) .filter(|hd| hd.inode == inode_out) .map(Arc::clone) .ok_or_else(ebadf)?; // Take just a read lock as we're not going to alter the file descriptor offset. let fd_out = data_out.file.read().unwrap().as_raw_fd(); // Safe because this will only modify `offset_in` and `offset_out` and we check // the return value. let res = unsafe { libc::syscall( libc::SYS_copy_file_range, fd_in, &mut (offset_in as i64) as &mut _ as *mut _, fd_out, &mut (offset_out as i64) as &mut _ as *mut _, len, flags, ) }; if res < 0 { Err(io::Error::last_os_error()) } else { Ok(res as usize) } } }