devices/virtio/fs/
passthrough.rs

1// Copyright 2019 The ChromiumOS Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use std::borrow::Cow;
6use std::cell::RefCell;
7use std::cmp;
8use std::collections::btree_map;
9use std::collections::BTreeMap;
10use std::ffi::CStr;
11use std::ffi::CString;
12#[cfg(feature = "fs_runtime_ugid_map")]
13use std::ffi::OsStr;
14use std::fs::File;
15use std::io;
16use std::mem;
17use std::mem::size_of;
18use std::mem::MaybeUninit;
19use std::os::raw::c_int;
20use std::os::raw::c_long;
21#[cfg(feature = "fs_runtime_ugid_map")]
22use std::os::unix::ffi::OsStrExt;
23#[cfg(feature = "fs_runtime_ugid_map")]
24use std::path::Path;
25use std::ptr;
26use std::ptr::addr_of;
27use std::ptr::addr_of_mut;
28use std::sync::atomic::AtomicBool;
29use std::sync::atomic::AtomicU64;
30use std::sync::atomic::Ordering;
31use std::sync::Arc;
32use std::sync::MutexGuard;
33use std::sync::RwLock;
34use std::time::Duration;
35
36#[cfg(feature = "arc_quota")]
37use base::debug;
38use base::error;
39use base::ioctl_ior_nr;
40use base::ioctl_iow_nr;
41use base::ioctl_iowr_nr;
42use base::ioctl_with_mut_ptr;
43use base::ioctl_with_ptr;
44use base::open_how;
45use base::openat2;
46use base::syscall;
47use base::unix::FileFlags;
48use base::warn;
49use base::AsRawDescriptor;
50use base::FromRawDescriptor;
51use base::IntoRawDescriptor;
52use base::IoctlNr;
53use base::Protection;
54use base::RawDescriptor;
55use fuse::filesystem::Context;
56use fuse::filesystem::DirectoryIterator;
57use fuse::filesystem::Entry;
58use fuse::filesystem::FileSystem;
59use fuse::filesystem::FsOptions;
60use fuse::filesystem::GetxattrReply;
61use fuse::filesystem::IoctlFlags;
62use fuse::filesystem::IoctlReply;
63use fuse::filesystem::ListxattrReply;
64use fuse::filesystem::OpenOptions;
65use fuse::filesystem::RemoveMappingOne;
66use fuse::filesystem::SetattrValid;
67use fuse::filesystem::ZeroCopyReader;
68use fuse::filesystem::ZeroCopyWriter;
69use fuse::filesystem::ROOT_ID;
70use fuse::sys::WRITE_KILL_PRIV;
71use fuse::Mapper;
72#[cfg(feature = "arc_quota")]
73use protobuf::Message;
74use sync::Mutex;
75#[cfg(feature = "arc_quota")]
76use system_api::client::OrgChromiumSpaced;
77#[cfg(feature = "arc_quota")]
78use system_api::spaced::SetProjectIdReply;
79#[cfg(feature = "arc_quota")]
80use system_api::spaced::SetProjectInheritanceFlagReply;
81use zerocopy::FromBytes;
82use zerocopy::FromZeros;
83use zerocopy::Immutable;
84use zerocopy::IntoBytes;
85use zerocopy::KnownLayout;
86
87use crate::virtio::fs::allowlist::PathAllowlist;
88#[cfg(feature = "arc_quota")]
89use crate::virtio::fs::arc_ioctl::FsPathXattrDataBuffer;
90#[cfg(feature = "arc_quota")]
91use crate::virtio::fs::arc_ioctl::FsPermissionDataBuffer;
92#[cfg(feature = "arc_quota")]
93use crate::virtio::fs::arc_ioctl::XattrData;
94use crate::virtio::fs::caps::Capability;
95use crate::virtio::fs::caps::Caps;
96use crate::virtio::fs::caps::Set as CapSet;
97use crate::virtio::fs::caps::Value as CapValue;
98use crate::virtio::fs::config::CachePolicy;
99use crate::virtio::fs::config::Config;
100#[cfg(feature = "fs_permission_translation")]
101use crate::virtio::fs::config::PermissionData;
102use crate::virtio::fs::expiring_map::ExpiringMap;
103use crate::virtio::fs::multikey::MultikeyBTreeMap;
104use crate::virtio::fs::read_dir::ReadDir;
105
// RESOLVE_* constants are missing in libc crate for some targets (e.g. Android).
// Define them here as they are stable Linux kernel API constants.
// NOTE(review): these look like bit values for the `resolve` field of `open_how` (see
// `safe_openat2()`); confirm the values against the kernel's `linux/openat2.h`.
const RESOLVE_NO_MAGICLINKS: u64 = 0x02;
const RESOLVE_NO_SYMLINKS: u64 = 0x04;
const RESOLVE_IN_ROOT: u64 = 0x10;

// Empty path; `stat()` passes it together with `AT_EMPTY_PATH`.
const EMPTY_CSTR: &CStr = c"";
// Path that `PassthroughFs::new()` opens to obtain its `/proc` descriptor.
const PROC_CSTR: &CStr = c"/proc";
// NOTE(review): presumably a fallback SELinux label; its use is outside this part of the file.
const UNLABELED_CSTR: &CStr = c"unlabeled";

// Extended attribute names/prefixes as raw bytes.
// NOTE(review): how these are matched or rewritten is not visible in this part of the file.
const USER_VIRTIOFS_XATTR: &[u8] = b"user.virtiofs.";
const SECURITY_XATTR: &[u8] = b"security.";
const SELINUX_XATTR: &[u8] = b"security.selinux";

// Lengths of the key descriptor in `fscrypt_policy_v1` and of the key identifier in
// `fscrypt_policy_v2`.
const FSCRYPT_KEY_DESCRIPTOR_SIZE: usize = 8;
const FSCRYPT_KEY_IDENTIFIER_SIZE: usize = 16;

// NOTE(review): presumably the inode flag that makes new files inherit the directory's project
// ID; confirm against the kernel headers.
#[cfg(feature = "arc_quota")]
const FS_PROJINHERIT_FL: c_int = 0x20000000;

// 25 seconds is the default timeout for dbus-send.
#[cfg(feature = "arc_quota")]
const DEFAULT_DBUS_TIMEOUT: Duration = Duration::from_secs(25);
129
/// Thin wrapper that forwards a VirtioFS call to `cros_tracing::trace_event!()`.
///
/// Takes the device tag, the event name and at least one further argument. The expansion
/// passes `VirtioFs` first, then the event name, then the tag, then the remaining arguments.
macro_rules! fs_trace {
    ($fs_tag:expr, $event:expr, $($detail:expr),+) => {
        cros_tracing::trace_event!(VirtioFs, $event, $fs_tag, $($detail),*)
    };
}
136
/// Version 1 fscrypt policy, laid out with C representation.
///
/// NOTE(review): presumably mirrors the kernel's `struct fscrypt_policy_v1`; confirm the field
/// order and sizes against `linux/fscrypt.h`. The underscore-prefixed fields appear to exist
/// only to give the struct its full size and layout.
#[repr(C)]
#[derive(Clone, Copy, FromBytes, Immutable, IntoBytes, KnownLayout)]
struct fscrypt_policy_v1 {
    _version: u8,
    _contents_encryption_mode: u8,
    _filenames_encryption_mode: u8,
    _flags: u8,
    _master_key_descriptor: [u8; FSCRYPT_KEY_DESCRIPTOR_SIZE],
}
146
/// Version 2 fscrypt policy, laid out with C representation.
///
/// NOTE(review): presumably mirrors the kernel's `struct fscrypt_policy_v2`; confirm the field
/// order and sizes against `linux/fscrypt.h`.
#[repr(C)]
#[derive(Clone, Copy, FromBytes, Immutable, IntoBytes, KnownLayout)]
struct fscrypt_policy_v2 {
    _version: u8,
    _contents_encryption_mode: u8,
    _filenames_encryption_mode: u8,
    _flags: u8,
    // Four reserved bytes between the header fields and the key identifier.
    __reserved: [u8; 4],
    master_key_identifier: [u8; FSCRYPT_KEY_IDENTIFIER_SIZE],
}
157
/// Storage large enough for either policy version.
///
/// All three members start at offset zero, and each policy struct begins with a one-byte
/// version field, so `_version` overlays the version byte of whichever policy is stored.
#[repr(C)]
#[derive(Copy, Clone, FromBytes, Immutable, KnownLayout)]
union fscrypt_policy {
    _version: u8,
    _v1: fscrypt_policy_v1,
    _v2: fscrypt_policy_v2,
}
165
/// A policy buffer prefixed by its size.
///
/// NOTE(review): presumably the argument of `FS_IOC_GET_ENCRYPTION_POLICY_EX`; the call site is
/// outside this part of the file. The `From` impl for `&[u8]` relies on `policy_size` never
/// exceeding `size_of::<fscrypt_policy>()`.
#[repr(C)]
#[derive(Copy, Clone, FromBytes, Immutable, KnownLayout)]
struct fscrypt_get_policy_ex_arg {
    policy_size: u64,       /* input/output */
    policy: fscrypt_policy, /* output */
}
172
173impl From<&fscrypt_get_policy_ex_arg> for &[u8] {
174    fn from(value: &fscrypt_get_policy_ex_arg) -> Self {
175        assert!(value.policy_size <= size_of::<fscrypt_policy>() as u64);
176        let data_raw: *const fscrypt_get_policy_ex_arg = value;
177        // SAFETY: the length of the output slice is asserted to be within the struct it points to
178        unsafe {
179            std::slice::from_raw_parts(
180                data_raw.cast(),
181                value.policy_size as usize + size_of::<u64>(),
182            )
183        }
184    }
185}
186
// The argument type is declared as nine bytes rather than `fscrypt_get_policy_ex_arg`.
// NOTE(review): presumably this matches the kernel header, which encodes the request with a
// `__u8[9]` argument (the u64 size plus the one-byte version); confirm against
// `linux/fscrypt.h`.
ioctl_iowr_nr!(FS_IOC_GET_ENCRYPTION_POLICY_EX, 'f' as u32, 22, [u8; 9]);
188
/// Argument of the `FS_IOC_FSGETXATTR` and `FS_IOC_FSSETXATTR` requests declared below.
///
/// NOTE(review): presumably mirrors the kernel's `struct fsxattr`; confirm the layout against
/// `linux/fs.h`.
#[repr(C)]
#[derive(Clone, Copy, FromBytes, Immutable, IntoBytes, KnownLayout)]
struct fsxattr {
    fsx_xflags: u32,     /* xflags field value (get/set) */
    fsx_extsize: u32,    /* extsize field value (get/set) */
    fsx_nextents: u32,   /* nextents field value (get) */
    fsx_projid: u32,     /* project identifier (get/set) */
    fsx_cowextsize: u32, /* CoW extsize field value (get/set) */
    fsx_pad: [u8; 8],
}

// Read and write requests that carry a `fsxattr`.
ioctl_ior_nr!(FS_IOC_FSGETXATTR, 'X' as u32, 31, fsxattr);
ioctl_iow_nr!(FS_IOC_FSSETXATTR, 'X' as u32, 32, fsxattr);

// The get/set flags requests are declared three times with the same type and number but
// different argument widths: `c_long`, `u32` and `u64`.
ioctl_ior_nr!(FS_IOC_GETFLAGS, 'f' as u32, 1, c_long);
ioctl_iow_nr!(FS_IOC_SETFLAGS, 'f' as u32, 2, c_long);

ioctl_ior_nr!(FS_IOC32_GETFLAGS, 'f' as u32, 1, u32);
ioctl_iow_nr!(FS_IOC32_SETFLAGS, 'f' as u32, 2, u32);

ioctl_ior_nr!(FS_IOC64_GETFLAGS, 'f' as u32, 1, u64);
ioctl_iow_nr!(FS_IOC64_SETFLAGS, 'f' as u32, 2, u64);

// NOTE(review): both requests below use type 'f' and number 1, the same pair as the GETFLAGS
// requests above. Presumably the resulting request values still differ through the direction
// and argument size; confirm that no two of them collide.
#[cfg(feature = "arc_quota")]
ioctl_iow_nr!(FS_IOC_SETPERMISSION, 'f' as u32, 1, FsPermissionDataBuffer);
#[cfg(feature = "arc_quota")]
ioctl_iow_nr!(FS_IOC_SETPATHXATTR, 'f' as u32, 1, FsPathXattrDataBuffer);
216
/// Argument of the `FS_IOC_ENABLE_VERITY` request declared below.
///
/// `salt_ptr` and `sig_ptr` are stored as plain 64-bit integers, paired with `salt_size` and
/// `sig_size`.
/// NOTE(review): presumably mirrors the kernel's `struct fsverity_enable_arg`; confirm the
/// layout against `linux/fsverity.h`.
#[repr(C)]
#[derive(Clone, Copy, FromBytes, Immutable, IntoBytes, KnownLayout)]
struct fsverity_enable_arg {
    _version: u32,
    _hash_algorithm: u32,
    _block_size: u32,
    salt_size: u32,
    salt_ptr: u64,
    sig_size: u32,
    __reserved1: u32,
    sig_ptr: u64,
    __reserved2: [u64; 11],
}

/// Fixed-size header of the argument of the `FS_IOC_MEASURE_VERITY` request declared below.
///
/// The C definition ends in a flexible `digest[]` array, which is not represented here; only
/// the two header fields are part of this struct.
#[repr(C)]
#[derive(Clone, Copy, FromBytes, Immutable, IntoBytes, KnownLayout)]
struct fsverity_digest {
    _digest_algorithm: u16,
    digest_size: u16,
    // __u8 digest[];
}

// Request numbers for the two fs-verity operations.
ioctl_iow_nr!(FS_IOC_ENABLE_VERITY, 'f' as u32, 133, fsverity_enable_arg);
ioctl_iowr_nr!(FS_IOC_MEASURE_VERITY, 'f' as u32, 134, fsverity_digest);
241
/// Identifier of an entry in `PassthroughFs::inodes`.
pub type Inode = u64;
/// Identifier of an entry in `PassthroughFs::handles`.
type Handle = u64;

/// Secondary key of the inode map: the pair of inode number and device number.
///
/// NOTE(review): presumably filled from the `st_ino` / `st_dev` fields of a host `stat64`;
/// the construction site is outside this part of the file.
#[derive(Clone, Copy, Debug, PartialOrd, Ord, PartialEq, Eq)]
struct InodeAltKey {
    ino: libc::ino64_t,
    dev: libc::dev_t,
}
250
251#[derive(PartialEq, Eq, Debug)]
252enum FileType {
253    Regular,
254    Directory,
255    Other,
256}
257
258impl From<libc::mode_t> for FileType {
259    fn from(mode: libc::mode_t) -> Self {
260        match mode & libc::S_IFMT {
261            libc::S_IFREG => FileType::Regular,
262            libc::S_IFDIR => FileType::Directory,
263            _ => FileType::Other,
264        }
265    }
266}
267
268#[derive(Debug)]
269struct OpenedFile {
270    file: Option<File>,
271    open_flags: libc::c_int,
272}
273
274impl AsRawDescriptor for OpenedFile {
275    fn as_raw_descriptor(&self) -> RawDescriptor {
276        self.file().as_raw_descriptor()
277    }
278}
279
280impl OpenedFile {
281    fn new(file: File, open_flags: libc::c_int) -> Self {
282        OpenedFile {
283            file: Some(file),
284            open_flags,
285        }
286    }
287
288    fn file(&self) -> &File {
289        self.file.as_ref().expect("must have a file")
290    }
291
292    fn file_mut(&mut self) -> &mut File {
293        self.file.as_mut().expect("must have a file")
294    }
295
296    /// Leaks the file descriptor and makes the struct unusable.
297    ///
298    /// This is an optimization to speed up dropping `OpenedFile` instances, which is useful
299    /// during an abrupt shutdown. Instead of properly closing the file descriptor, which
300    /// involves a syscall, this function effectively forgets the file descriptor, relying on the
301    /// OS to clean it up when the process terminates.
302    fn leak_fd(&mut self) {
303        let f = self.file.take().expect("must have a file");
304        let _ = f.into_raw_descriptor();
305    }
306}
307
308#[derive(Debug)]
309struct InodeData {
310    inode: Inode,
311    // (File, open_flags)
312    file: Mutex<OpenedFile>,
313    refcount: AtomicU64,
314    filetype: FileType,
315    path: String,
316    // This needs to be atomic because we need to set it through a shared reference.
317    unsafe_leak_fd: AtomicBool,
318}
319
320impl AsRawDescriptor for InodeData {
321    fn as_raw_descriptor(&self) -> RawDescriptor {
322        self.file.lock().as_raw_descriptor()
323    }
324}
325
326impl Drop for InodeData {
327    /// If `unsafe_leak_fd` is set, this `drop` implementation will "leak" the file descriptor.
328    /// This is an optimization to speed up the cleanup process, based on the
329    /// assumption that the OS will handle the cleanup of file descriptors after the process
330    /// terminates. This is only okay if the process is guaranteed to terminate immediately
331    /// after the `PassthroughFs` instance is dropped.
332    fn drop(&mut self) {
333        if self.unsafe_leak_fd.load(Ordering::Relaxed) {
334            self.file.get_mut().leak_fd();
335        }
336    }
337}
338
339impl InodeData {
340    fn set_unsafe_leak_fd(&self) {
341        self.unsafe_leak_fd.store(true, Ordering::Relaxed);
342    }
343}
344
345#[derive(Debug)]
346struct HandleData {
347    inode: Inode,
348    file: Mutex<OpenedFile>,
349
350    unsafe_leak_fd: AtomicBool,
351}
352
353impl AsRawDescriptor for HandleData {
354    fn as_raw_descriptor(&self) -> RawDescriptor {
355        self.file.lock().as_raw_descriptor()
356    }
357}
358
359impl Drop for HandleData {
360    /// If `unsafe_leak_fd` is set, this `drop` implementation will "leak" the file descriptor by
361    /// forgetting it. This is an optimization to speed up the cleanup process, based on the
362    /// assumption that the OS will handle the cleanup of file descriptors after the process
363    // terminates. This is only safe if the process is guaranteed to terminate immediately
364    /// after the `PassthroughFs` instance is dropped.
365    fn drop(&mut self) {
366        if self.unsafe_leak_fd.load(Ordering::Relaxed) {
367            self.file.get_mut().leak_fd();
368        }
369    }
370}
371
372impl HandleData {
373    fn set_unsafe_leak_fd(&self) {
374        self.unsafe_leak_fd.store(true, Ordering::Relaxed);
375    }
376}
377
378macro_rules! scoped_cred {
379    ($name:ident, $ty:ty, $syscall_nr:expr) => {
380        #[derive(Debug)]
381        struct $name {
382            old: $ty,
383        }
384
385        impl $name {
386            // Changes the effective uid/gid of the current thread to `val`. Changes the thread's
387            // credentials back to `old` when the returned struct is dropped.
388            fn new(val: $ty, old: $ty) -> io::Result<Option<$name>> {
389                if val == old {
390                    // Nothing to do since we already have the correct value.
391                    return Ok(None);
392                }
393
394                // We want credential changes to be per-thread because otherwise
395                // we might interfere with operations being carried out on other
396                // threads with different uids/gids.  However, posix requires that
397                // all threads in a process share the same credentials.  To do this
398                // libc uses signals to ensure that when one thread changes its
399                // credentials the other threads do the same thing.
400                //
401                // So instead we invoke the syscall directly in order to get around
402                // this limitation.  Another option is to use the setfsuid and
403                // setfsgid systems calls.   However since those calls have no way to
404                // return an error, it's preferable to do this instead.
405
406                // SAFETY: this call is safe because it doesn't modify any memory and we
407                // check the return value.
408                let res = unsafe { libc::syscall($syscall_nr, -1, val, -1) };
409                if res == 0 {
410                    Ok(Some($name { old }))
411                } else {
412                    Err(io::Error::last_os_error())
413                }
414            }
415        }
416
417        impl Drop for $name {
418            fn drop(&mut self) {
419                // SAFETY: trivially safe
420                let res = unsafe { libc::syscall($syscall_nr, -1, self.old, -1) };
421                if res < 0 {
422                    error!(
423                        "failed to change credentials back to {}: {}",
424                        self.old,
425                        io::Error::last_os_error(),
426                    );
427                }
428            }
429        }
430    };
431}
432scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid);
433scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid);
434
// Syscall numbers used to initialize the thread-local credentials below.
const SYS_GETEUID: libc::c_long = libc::SYS_geteuid;
const SYS_GETEGID: libc::c_long = libc::SYS_getegid;

// Effective uid/gid of the current thread, fetched once per thread by a raw syscall on first
// access. `set_creds()` reads these as the values to switch back to.
thread_local! {
    // SAFETY: both calls take no parameters and only return an integer value. The kernel also
    // guarantees that they can never fail.
    static THREAD_EUID: libc::uid_t = unsafe { libc::syscall(SYS_GETEUID) as libc::uid_t };
    // SAFETY: both calls take no parameters and only return an integer value. The kernel also
    // guarantees that they can never fail.
    static THREAD_EGID: libc::gid_t = unsafe { libc::syscall(SYS_GETEGID) as libc::gid_t };
}
446
447fn set_creds(
448    uid: libc::uid_t,
449    gid: libc::gid_t,
450) -> io::Result<(Option<ScopedUid>, Option<ScopedGid>)> {
451    let olduid = THREAD_EUID.with(|uid| *uid);
452    let oldgid = THREAD_EGID.with(|gid| *gid);
453
454    // We have to change the gid before we change the uid because if we change the uid first then we
455    // lose the capability to change the gid.  However changing back can happen in any order.
456    ScopedGid::new(gid, oldgid).and_then(|gid| Ok((ScopedUid::new(uid, olduid)?, gid)))
457}
458
459thread_local!(static THREAD_FSCREATE: RefCell<Option<File>> = const { RefCell::new(None) });
460
461// Opens and returns a write-only handle to /proc/thread-self/attr/fscreate. Panics if it fails to
462// open the file.
463fn open_fscreate(proc: &File) -> File {
464    let fscreate = c"thread-self/attr/fscreate";
465
466    // SAFETY: this doesn't modify any memory and we check the return value.
467    let raw_descriptor = unsafe {
468        libc::openat(
469            proc.as_raw_descriptor(),
470            fscreate.as_ptr(),
471            libc::O_CLOEXEC | libc::O_WRONLY,
472        )
473    };
474
475    // We don't expect this to fail and we're not in a position to return an error here so just
476    // panic.
477    if raw_descriptor < 0 {
478        panic!(
479            "Failed to open /proc/thread-self/attr/fscreate: {}",
480            io::Error::last_os_error()
481        );
482    }
483
484    // SAFETY: safe because we just opened this descriptor.
485    unsafe { File::from_raw_descriptor(raw_descriptor) }
486}
487
488struct ScopedSecurityContext;
489
490impl ScopedSecurityContext {
491    fn new(proc: &File, ctx: &CStr) -> io::Result<ScopedSecurityContext> {
492        THREAD_FSCREATE.with(|thread_fscreate| {
493            let mut fscreate = thread_fscreate.borrow_mut();
494            let file = fscreate.get_or_insert_with(|| open_fscreate(proc));
495            // SAFETY: this doesn't modify any memory and we check the return value.
496            let ret = unsafe {
497                libc::write(
498                    file.as_raw_descriptor(),
499                    ctx.as_ptr() as *const libc::c_void,
500                    ctx.to_bytes_with_nul().len(),
501                )
502            };
503            if ret < 0 {
504                Err(io::Error::last_os_error())
505            } else {
506                Ok(ScopedSecurityContext)
507            }
508        })
509    }
510}
511
512impl Drop for ScopedSecurityContext {
513    fn drop(&mut self) {
514        THREAD_FSCREATE.with(|thread_fscreate| {
515            // expect is safe here because the thread local would have been initialized by the call
516            // to `new` above.
517            let fscreate = thread_fscreate.borrow();
518            let file = fscreate
519                .as_ref()
520                .expect("Uninitialized thread-local when dropping ScopedSecurityContext");
521
522            // SAFETY: this doesn't modify any memory and we check the return value.
523            let ret = unsafe { libc::write(file.as_raw_descriptor(), ptr::null(), 0) };
524
525            if ret < 0 {
526                warn!(
527                    "Failed to restore security context: {}",
528                    io::Error::last_os_error()
529                );
530            }
531        })
532    }
533}
534
535struct ScopedUmask {
536    old: libc::mode_t,
537    mask: libc::mode_t,
538}
539
540impl ScopedUmask {
541    fn new(mask: libc::mode_t) -> ScopedUmask {
542        ScopedUmask {
543            // SAFETY: this doesn't modify any memory and always succeeds.
544            old: unsafe { libc::umask(mask) },
545            mask,
546        }
547    }
548}
549
550impl Drop for ScopedUmask {
551    fn drop(&mut self) {
552        // SAFETY: this doesn't modify any memory and always succeeds.
553        let previous = unsafe { libc::umask(self.old) };
554        debug_assert_eq!(
555            previous, self.mask,
556            "umask changed while holding ScopedUmask"
557        );
558    }
559}
560
561struct ScopedFsetid(Caps);
562impl Drop for ScopedFsetid {
563    fn drop(&mut self) {
564        if let Err(e) = raise_cap_fsetid(&mut self.0) {
565            error!(
566                "Failed to restore CAP_FSETID: {}.  Some operations may be broken.",
567                e
568            )
569        }
570    }
571}
572
573fn raise_cap_fsetid(c: &mut Caps) -> io::Result<()> {
574    c.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Set)?;
575    c.apply()
576}
577
578// Drops CAP_FSETID from the effective set for the current thread and returns an RAII guard that
579// adds the capability back when it is dropped.
580fn drop_cap_fsetid() -> io::Result<ScopedFsetid> {
581    let mut caps = Caps::for_current_thread()?;
582    caps.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Clear)?;
583    caps.apply()?;
584    Ok(ScopedFsetid(caps))
585}
586
/// Builds an `io::Error` carrying the OS error code `EBADF`.
fn ebadf() -> io::Error {
    io::Error::from_raw_os_error(libc::EBADF)
}

/// Builds an `io::Error` carrying the OS error code `EEXIST`.
fn eexist() -> io::Error {
    io::Error::from_raw_os_error(libc::EEXIST)
}
594
595fn stat<F: AsRawDescriptor + ?Sized>(f: &F) -> io::Result<libc::stat64> {
596    let mut st: MaybeUninit<libc::stat64> = MaybeUninit::<libc::stat64>::zeroed();
597
598    // SAFETY: the kernel will only write data in `st` and we check the return value.
599    syscall!(unsafe {
600        libc::fstatat64(
601            f.as_raw_descriptor(),
602            EMPTY_CSTR.as_ptr(),
603            st.as_mut_ptr(),
604            libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
605        )
606    })?;
607
608    // SAFETY: the kernel guarantees that the struct is now fully initialized.
609    Ok(unsafe { st.assume_init() })
610}
611
612fn validate_path_component(name: &CStr) -> io::Result<()> {
613    let bytes = name.to_bytes();
614    if bytes == b".." || (bytes.contains(&b'/') && bytes != b"/") {
615        return Err(io::Error::from_raw_os_error(libc::EINVAL));
616    }
617    Ok(())
618}
619
620/// A safe wrapper around `openat2` with a fallback to `openat64` for backward compatibility.
621///
622/// It attempts to use `openat2` to leverage secure path resolution flags (like `RESOLVE_IN_ROOT`).
623/// If `openat2` is not supported by the kernel (returns `ENOSYS`, e.g. on kernels older than 5.6),
624/// it falls back to standard path resolution using `openat64` to allow operation on older
625/// platforms.
626fn safe_openat2<D: AsRawDescriptor>(
627    dir: &D,
628    name: &CStr,
629    flags: libc::c_int,
630    mode: Option<libc::mode_t>,
631    resolve: u64,
632) -> io::Result<File> {
633    let mut how = open_how {
634        flags: flags as u64,
635        resolve,
636        ..Default::default()
637    };
638    if let Some(m) = mode {
639        how.mode = (m & 0o7777) as u64;
640    }
641
642    let res = openat2(dir, name, &how);
643    match res {
644        Ok(file) => Ok(file),
645        Err(e) if e.errno() == libc::ENOSYS => {
646            // Fallback to openat64 if openat2 is not supported.
647            let fd = if let Some(m) = mode {
648                // SAFETY: openat64 doesn't modify any memory and we check the return value.
649                syscall!(unsafe {
650                    libc::openat64(dir.as_raw_descriptor(), name.as_ptr(), flags, m)
651                })
652            } else {
653                // SAFETY: openat64 doesn't modify any memory and we check the return value.
654                syscall!(unsafe { libc::openat64(dir.as_raw_descriptor(), name.as_ptr(), flags) })
655            }?;
656            // SAFETY: safe because we own the fd.
657            Ok(unsafe { File::from_raw_descriptor(fd) })
658        }
659        Err(e) => Err(e.into()),
660    }
661}
662
/// Returns whether `project_id` lies in one of the project ID ranges used by Android.
///
/// The ranges are taken from android_filesystem_config.h in the Android codebase.
#[cfg(feature = "arc_quota")]
fn is_android_project_id(project_id: u32) -> bool {
    // Project IDs reserved for Android files on external storage: 100 IDs starting at
    // PROJECT_ID_EXT_DEFAULT (1000).
    const ANDROID_FILES_FIRST: u32 = 1000;
    const ANDROID_FILES_LAST: u32 = 1099;
    // Project IDs reserved for Android apps. The range starts at PROJECT_ID_EXT_DATA_START.
    // Its end differs before and after T; the end used by T (PROJECT_ID_APP_CACHE_END) is
    // taken here because it is the larger one.
    const ANDROID_APPS_FIRST: u32 = 20000;
    const ANDROID_APPS_LAST: u32 = 69999;

    matches!(
        project_id,
        ANDROID_FILES_FIRST..=ANDROID_FILES_LAST | ANDROID_APPS_FIRST..=ANDROID_APPS_LAST
    )
}
681
682/// Per-directory cache for `PassthroughFs::ascii_casefold_lookup()`.
683///
684/// The key of the underlying `BTreeMap` is a lower-cased file name in the direcoty.
685/// The value is the case-sensitive file name stored in the host file system.
686/// We assume that if PassthroughFs has exclusive access to the filesystem, this cache exhaustively
687///  covers all file names that exist within the directory.
688/// So every `PassthroughFs`'s handler that adds or removes files in the directory is expected to
689/// update this cache.
690struct CasefoldCache(BTreeMap<Vec<u8>, CString>);
691
692impl CasefoldCache {
693    fn new(dir: &InodeData) -> io::Result<Self> {
694        let mut mp = BTreeMap::new();
695
696        let mut buf = [0u8; 1024];
697        let mut offset = 0;
698        loop {
699            let mut read_dir = ReadDir::new(dir, offset, &mut buf[..])?;
700            if read_dir.remaining() == 0 {
701                break;
702            }
703
704            while let Some(entry) = read_dir.next() {
705                offset = entry.offset as libc::off64_t;
706                let entry_name = entry.name;
707                mp.insert(
708                    entry_name.to_bytes().to_ascii_lowercase(),
709                    entry_name.to_owned(),
710                );
711            }
712        }
713        Ok(Self(mp))
714    }
715
716    fn insert(&mut self, name: &CStr) {
717        let lower_case = name.to_bytes().to_ascii_lowercase();
718        self.0.insert(lower_case, name.into());
719    }
720
721    fn lookup(&self, name: &[u8]) -> Option<CString> {
722        let lower = name.to_ascii_lowercase();
723        self.0.get(&lower).cloned()
724    }
725
726    fn remove(&mut self, name: &CStr) {
727        let lower_case = name.to_bytes().to_ascii_lowercase();
728        self.0.remove(&lower_case);
729    }
730}
731
732/// Time expiring mapping from an inode of a directory to `CasefoldCache` for the directory.
733/// Each entry will be expired after `timeout`.
734/// When ascii_casefold is disabled, this struct does nothing.
735struct ExpiringCasefoldLookupCaches {
736    inner: ExpiringMap<Inode, CasefoldCache>,
737}
738
739impl ExpiringCasefoldLookupCaches {
740    fn new(timeout: Duration) -> Self {
741        Self {
742            inner: ExpiringMap::new(timeout),
743        }
744    }
745
746    fn insert(&mut self, parent: Inode, name: &CStr) {
747        if let Some(dir_cache) = self.inner.get_mut(&parent) {
748            dir_cache.insert(name);
749        }
750    }
751
752    fn remove(&mut self, parent: Inode, name: &CStr) {
753        if let Some(dir_cache) = self.inner.get_mut(&parent) {
754            dir_cache.remove(name);
755        }
756    }
757
758    fn forget(&mut self, parent: Inode) {
759        self.inner.remove(&parent);
760    }
761
762    /// Get `CasefoldCache` for the given directory.
763    /// If the cache doesn't exist, generate it by fetching directory information with
764    /// `getdents64()`.
765    fn get(&mut self, parent: &InodeData) -> io::Result<&CasefoldCache> {
766        self.inner
767            .get_or_insert_with(&parent.inode, || CasefoldCache::new(parent))
768    }
769
770    #[cfg(test)]
771    fn exists_in_cache(&mut self, parent: Inode, name: &CStr) -> bool {
772        if let Some(dir_cache) = self.inner.get(&parent) {
773            dir_cache.lookup(name.to_bytes()).is_some()
774        } else {
775            false
776        }
777    }
778}
779
#[cfg(feature = "fs_permission_translation")]
impl PermissionData {
    /// Returns whether `path` starts with this entry's `perm_path`.
    ///
    /// NOTE(review): assuming `perm_path` is a string, this is a plain prefix test, so a
    /// `perm_path` of `/a/b` also matches `/a/bc`. Confirm whether matching on whole path
    /// components is intended.
    pub(crate) fn need_set_permission(&self, path: &str) -> bool {
        path.starts_with(&self.perm_path)
    }
}
786
/// A file system that simply "passes through" all requests it receives to the underlying file
/// system. To keep the implementation simple it serves the contents of its root directory. Users
/// that wish to serve only a specific directory should set up the environment so that the
/// directory ends up as the root of the file system process. One way to accomplish this is via a
/// combination of mount namespaces and the pivot_root system call.
///
/// # Safety
///
/// The `Drop` implementation for this struct intentionally leaks all open file
/// descriptors. It is **critical** that an instance of `PassthroughFs` is
/// only dropped immediately prior to process termination. Failure to uphold
/// this invariant **will** result in resource leaks. This is a deliberate
/// performance optimization for abrupt shutdowns, where we let the OS
/// handle resource cleanup.
pub struct PassthroughFs {
    // Mutex that must be acquired before executing a process-wide operation such as fchdir.
    process_lock: Mutex<()>,
    // virtio-fs tag that the guest uses when mounting. This is only used for debugging
    // when tracing is enabled.
    tag: String,

    // File descriptors for various points in the file system tree.
    inodes: Mutex<MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>>,
    next_inode: AtomicU64,

    // File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be
    // used for reading and writing data.
    handles: Mutex<BTreeMap<Handle, Arc<HandleData>>>,
    next_handle: AtomicU64,

    // File descriptor pointing to the `/proc` directory. This is used to convert an fd from
    // `inodes` into one that can go into `handles`. This is accomplished by reading the
    // `self/fd/{}` symlink. We keep an open fd here in case the file system tree that we are meant
    // to be serving doesn't have access to `/proc`.
    proc: File,

    // Whether writeback caching is enabled for this directory. This will only be true when
    // `cfg.writeback` is true and `init` was called with `FsOptions::WRITEBACK_CACHE`.
    writeback: AtomicBool,

    // Whether zero message opens are supported by the kernel driver.
    zero_message_open: AtomicBool,

    // Whether zero message opendir is supported by the kernel driver.
    zero_message_opendir: AtomicBool,

    // Used to communicate with other processes using D-Bus.
    #[cfg(feature = "arc_quota")]
    dbus_connection: Option<Mutex<dbus::blocking::Connection>>,
    #[cfg(feature = "arc_quota")]
    dbus_fd: Option<std::os::unix::io::RawFd>,

    // Time-expiring cache for `ascii_casefold_lookup()`.
    // The key is an inode of a directory, and the value is a cache for the directory.
    // Each value will be expired `cfg.timeout` after it's created.
    //
    // TODO(b/267748212): Instead of per-device Mutex, we might want to have per-directory Mutex
    // if we use PassthroughFs in multi-threaded environments.
    expiring_casefold_lookup_caches: Option<Mutex<ExpiringCasefoldLookupCaches>>,

    // Paths and corresponding permission settings set by the `crosvm_client_fs_permission_set`
    // API.
    #[cfg(feature = "fs_permission_translation")]
    permission_paths: RwLock<Vec<PermissionData>>,

    // Paths and corresponding xattr settings set by the `crosvm_client_fs_xattr_set` API.
    #[cfg(feature = "arc_quota")]
    xattr_paths: RwLock<Vec<XattrData>>,

    cfg: Config,

    // Set the root directory when pivot root isn't enabled for jailed process.
    //
    // virtio-fs typically uses mount namespaces and pivot_root for file system isolation,
    // making the jailed process's root directory "/".
    //
    // However, Android's security model prevents crosvm from having the necessary SYS_ADMIN
    // capability for mount namespaces and pivot_root. This lack of isolation means that
    // root_dir defaults to the path provided via "--shared-dir".
    root_dir: String,
    // NOTE(review): presumably restricts which paths may be accessed; its use is outside this
    // part of the file.
    allowlist: Option<Arc<RwLock<PathAllowlist>>>,
}
868
869impl std::fmt::Debug for PassthroughFs {
870    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
871        f.debug_struct("PassthroughFs")
872            .field("tag", &self.tag)
873            .field("next_inode", &self.next_inode)
874            .field("next_handle", &self.next_handle)
875            .field("proc", &self.proc)
876            .field("writeback", &self.writeback)
877            .field("zero_message_open", &self.zero_message_open)
878            .field("zero_message_opendir", &self.zero_message_opendir)
879            .field("cfg", &self.cfg)
880            .finish()
881    }
882}
883
884impl PassthroughFs {
885    pub fn new(tag: &str, cfg: Config) -> io::Result<PassthroughFs> {
886        // SAFETY: this doesn't modify any memory and we check the return value.
887        let raw_descriptor = syscall!(unsafe {
888            libc::openat64(
889                libc::AT_FDCWD,
890                PROC_CSTR.as_ptr(),
891                libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC,
892            )
893        })?;
894
895        // Privileged UIDs can use D-Bus to perform some operations.
896        #[cfg(feature = "arc_quota")]
897        let (dbus_connection, dbus_fd) = if cfg.privileged_quota_uids.is_empty() {
898            (None, None)
899        } else {
900            let mut channel = dbus::channel::Channel::get_private(dbus::channel::BusType::System)
901                .map_err(io::Error::other)?;
902            channel.set_watch_enabled(true);
903            let dbus_fd = channel.watch().fd;
904            channel.set_watch_enabled(false);
905            (
906                Some(Mutex::new(dbus::blocking::Connection::from(channel))),
907                Some(dbus_fd),
908            )
909        };
910
911        // SAFETY: safe because we just opened this descriptor.
912        let proc = unsafe { File::from_raw_descriptor(raw_descriptor) };
913
914        let expiring_casefold_lookup_caches = if cfg.ascii_casefold {
915            Some(Mutex::new(ExpiringCasefoldLookupCaches::new(cfg.timeout)))
916        } else {
917            None
918        };
919
920        #[allow(unused_mut)]
921        let mut passthroughfs = PassthroughFs {
922            process_lock: Mutex::new(()),
923            tag: tag.to_string(),
924            inodes: Mutex::new(MultikeyBTreeMap::new()),
925            next_inode: AtomicU64::new(ROOT_ID + 1),
926
927            handles: Mutex::new(BTreeMap::new()),
928            next_handle: AtomicU64::new(1),
929
930            proc,
931
932            writeback: AtomicBool::new(false),
933            zero_message_open: AtomicBool::new(false),
934            zero_message_opendir: AtomicBool::new(false),
935
936            #[cfg(feature = "arc_quota")]
937            dbus_connection,
938            #[cfg(feature = "arc_quota")]
939            dbus_fd,
940            expiring_casefold_lookup_caches,
941            #[cfg(feature = "fs_permission_translation")]
942            permission_paths: RwLock::new(Vec::new()),
943            #[cfg(feature = "arc_quota")]
944            xattr_paths: RwLock::new(Vec::new()),
945            cfg,
946            root_dir: "/".to_string(),
947            allowlist: None,
948        };
949
950        #[cfg(feature = "fs_runtime_ugid_map")]
951        passthroughfs.set_permission_path();
952
953        cros_tracing::trace_simple_print!(
954            VirtioFs,
955            "New PassthroughFS initialized: {:?}",
956            passthroughfs
957        );
958        Ok(passthroughfs)
959    }
960
961    pub fn set_allowlist(&mut self, allowlist: Option<Arc<RwLock<PathAllowlist>>>) {
962        self.allowlist = allowlist;
963    }
964
965    fn is_path_accessible(&self, path: &str) -> bool {
966        self.allowlist
967            .as_ref()
968            .map(|al| al.read().unwrap().is_accessible(path))
969            .unwrap_or(true)
970    }
971
972    /// Validates and resolves a write path component against the dynamic path allowlist.
973    ///
974    /// This function enforces security boundaries by performing the following checks:
975    /// 1. Checks if the `name` component does not contain malicious patterns (e.g., `..` or `/`).
976    /// 2. Converts the `name` from a `CStr` to a UTF-8 string slice, failing with `EILSEQ` if
977    ///    invalid.
978    /// 3. Checks if the allowlist authorizes write access for the resolved path.
979    ///
980    /// # Errors
981    ///
982    /// Returns an `io::Error` if any validation or authorization check fails:
983    /// * `EINVAL` - If `name` contains `..` or `/`.
984    /// * `EILSEQ` - If `name` is not valid UTF-8.
985    /// * `EACCES` - If the dynamic allowlist does not grant full write access to the resolved path.
986    fn authorize_write_path(&self, parent_path: &str, name: &CStr) -> io::Result<String> {
987        validate_path_component(name)?;
988        let name_str = name
989            .to_str()
990            .map_err(|_| io::Error::from_raw_os_error(libc::EILSEQ))?;
991        let path = if parent_path.is_empty() || parent_path == "/" {
992            format!("/{name_str}")
993        } else {
994            format!("{parent_path}/{name_str}")
995        };
996        let is_writable = self
997            .allowlist
998            .as_ref()
999            .map(|al| al.read().unwrap().is_writable(&path))
1000            .unwrap_or(true);
1001        if !is_writable {
1002            return Err(io::Error::from_raw_os_error(libc::EACCES));
1003        }
1004        Ok(path)
1005    }
1006
1007    #[cfg(feature = "fs_runtime_ugid_map")]
1008    fn set_permission_path(&mut self) {
1009        if !self.cfg.ugid_map.is_empty() {
1010            let mut write_lock = self
1011                .permission_paths
1012                .write()
1013                .expect("Failed to acquire write lock on permission_paths");
1014            *write_lock = self.cfg.ugid_map.clone();
1015        }
1016    }
1017
1018    pub fn set_root_dir(&mut self, shared_dir: String) -> io::Result<()> {
1019        let canonicalized_root = match std::fs::canonicalize(shared_dir) {
1020            Ok(path) => path,
1021            Err(e) => {
1022                return Err(io::Error::new(
1023                    io::ErrorKind::InvalidInput,
1024                    format!("Failed to canonicalize root_dir: {e}"),
1025                ));
1026            }
1027        };
1028        self.root_dir = canonicalized_root.to_string_lossy().to_string();
1029        Ok(())
1030    }
1031
1032    pub fn cfg(&self) -> &Config {
1033        &self.cfg
1034    }
1035
1036    pub fn keep_rds(&self) -> Vec<RawDescriptor> {
1037        #[cfg_attr(not(feature = "arc_quota"), allow(unused_mut))]
1038        let mut keep_rds = vec![self.proc.as_raw_descriptor()];
1039        #[cfg(feature = "arc_quota")]
1040        if let Some(fd) = self.dbus_fd {
1041            keep_rds.push(fd);
1042        }
1043        keep_rds
1044    }
1045
1046    fn rewrite_xattr_name<'xattr>(&self, name: &'xattr CStr) -> Cow<'xattr, CStr> {
1047        if !self.cfg.rewrite_security_xattrs {
1048            return Cow::Borrowed(name);
1049        }
1050
1051        // Does not include nul-terminator.
1052        let buf = name.to_bytes();
1053        if !buf.starts_with(SECURITY_XATTR) || buf == SELINUX_XATTR {
1054            return Cow::Borrowed(name);
1055        }
1056
1057        let mut newname = USER_VIRTIOFS_XATTR.to_vec();
1058        newname.extend_from_slice(buf);
1059
1060        // The unwrap is safe here because the prefix doesn't contain any interior nul-bytes and the
1061        // to_bytes() call above will not return a byte slice with any interior nul-bytes either.
1062        Cow::Owned(CString::new(newname).expect("Failed to re-write xattr name"))
1063    }
1064
1065    fn find_inode(&self, inode: Inode) -> io::Result<Arc<InodeData>> {
1066        self.inodes.lock().get(&inode).cloned().ok_or_else(ebadf)
1067    }
1068
1069    fn find_handle(&self, handle: Handle, inode: Inode) -> io::Result<Arc<HandleData>> {
1070        self.handles
1071            .lock()
1072            .get(&handle)
1073            .filter(|hd| hd.inode == inode)
1074            .cloned()
1075            .ok_or_else(ebadf)
1076    }
1077
1078    fn open_fd(&self, fd: RawDescriptor, flags: i32) -> io::Result<File> {
1079        let pathname = CString::new(format!("self/fd/{fd}"))
1080            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1081
1082        // SAFETY: this doesn't modify any memory and we check the return value. We don't really
1083        // check `flags` because if the kernel can't handle poorly specified flags then we have
1084        // much bigger problems. Also, clear the `O_NOFOLLOW` flag if it is set since we need
1085        // to follow the `/proc/self/fd` symlink to get the file.
1086        let raw_descriptor = syscall!(unsafe {
1087            libc::openat64(
1088                self.proc.as_raw_descriptor(),
1089                pathname.as_ptr(),
1090                (flags | libc::O_CLOEXEC) & !(libc::O_NOFOLLOW | libc::O_DIRECT),
1091            )
1092        })?;
1093
1094        // SAFETY: safe because we just opened this descriptor.
1095        Ok(unsafe { File::from_raw_descriptor(raw_descriptor) })
1096    }
1097
1098    /// Modifies the provided open flags based on the writeback caching configuration.
1099    /// Return the updated open flags.
1100    fn update_open_flags(&self, mut flags: i32) -> i32 {
1101        // When writeback caching is enabled, the kernel may send read requests even if the
1102        // userspace program opened the file write-only. So we need to ensure that we have opened
1103        // the file for reading as well as writing.
1104        let writeback = self.writeback.load(Ordering::Relaxed);
1105        if writeback && flags & libc::O_ACCMODE == libc::O_WRONLY {
1106            flags &= !libc::O_ACCMODE;
1107            flags |= libc::O_RDWR;
1108        }
1109
1110        // When writeback caching is enabled the kernel is responsible for handling `O_APPEND`.
1111        // However, this breaks atomicity as the file may have changed on disk, invalidating the
1112        // cached copy of the data in the kernel and the offset that the kernel thinks is the end of
1113        // the file. Just allow this for now as it is the user's responsibility to enable writeback
1114        // caching only for directories that are not shared. It also means that we need to clear the
1115        // `O_APPEND` flag.
1116        if writeback && flags & libc::O_APPEND != 0 {
1117            flags &= !libc::O_APPEND;
1118        }
1119
1120        flags
1121    }
1122
1123    fn open_inode(&self, inode: &InodeData, mut flags: i32) -> io::Result<File> {
1124        // handle writeback caching cases
1125        flags = self.update_open_flags(flags);
1126
1127        self.open_fd(inode.as_raw_descriptor(), flags)
1128    }
1129
1130    // Increases the inode refcount and returns the inode.
1131    fn increase_inode_refcount(&self, inode_data: &InodeData) -> Inode {
1132        // Matches with the release store in `forget`.
1133        inode_data.refcount.fetch_add(1, Ordering::Acquire);
1134        inode_data.inode
1135    }
1136
1137    // Creates a new entry for `f` or increases the refcount of the existing entry for `f`.
1138    // The inodes mutex lock must not be already taken by the same thread otherwise this
1139    // will deadlock.
1140    fn add_entry(
1141        &self,
1142        f: File,
1143        #[cfg_attr(not(feature = "fs_permission_translation"), allow(unused_mut))]
1144        mut st: libc::stat64,
1145        open_flags: libc::c_int,
1146        path: String,
1147    ) -> Entry {
1148        #[cfg(feature = "arc_quota")]
1149        self.set_permission(&mut st, &path);
1150        #[cfg(feature = "fs_runtime_ugid_map")]
1151        self.set_ugid_permission(&mut st, &path);
1152        let mut inodes = self.inodes.lock();
1153
1154        let altkey = InodeAltKey {
1155            ino: st.st_ino,
1156            dev: st.st_dev,
1157        };
1158
1159        let inode = if let Some(data) = inodes.get_alt(&altkey) {
1160            self.increase_inode_refcount(data)
1161        } else {
1162            let inode = self.next_inode.fetch_add(1, Ordering::Relaxed);
1163            inodes.insert(
1164                inode,
1165                altkey,
1166                Arc::new(InodeData {
1167                    inode,
1168                    file: Mutex::new(OpenedFile::new(f, open_flags)),
1169                    refcount: AtomicU64::new(1),
1170                    filetype: st.st_mode.into(),
1171                    path,
1172                    unsafe_leak_fd: AtomicBool::new(false),
1173                }),
1174            );
1175
1176            inode
1177        };
1178
1179        Entry {
1180            inode,
1181            generation: 0,
1182            attr: st,
1183            // We use the same timeout for the attribute and the entry.
1184            attr_timeout: self.cfg.timeout,
1185            entry_timeout: self.cfg.timeout,
1186        }
1187    }
1188
1189    /// Acquires lock of `expiring_casefold_lookup_caches` if `ascii_casefold` is enabled.
1190    fn lock_casefold_lookup_caches(&self) -> Option<MutexGuard<'_, ExpiringCasefoldLookupCaches>> {
1191        self.expiring_casefold_lookup_caches
1192            .as_ref()
1193            .map(|c| c.lock())
1194    }
1195
1196    // Returns an actual case-sensitive file name that matches with the given `name`.
1197    // Returns `Ok(None)` if no file matches with the give `name`.
1198    // This function will panic if casefold is not enabled.
1199    fn get_case_unfolded_name(
1200        &self,
1201        parent: &InodeData,
1202        name: &[u8],
1203    ) -> io::Result<Option<CString>> {
1204        let mut caches = self
1205            .lock_casefold_lookup_caches()
1206            .expect("casefold must be enabled");
1207        let dir_cache = caches.get(parent)?;
1208        Ok(dir_cache.lookup(name))
1209    }
1210
1211    // Performs an ascii case insensitive lookup.
1212    fn ascii_casefold_lookup(&self, parent: &InodeData, name: &[u8]) -> io::Result<Entry> {
1213        match self.get_case_unfolded_name(parent, name)? {
1214            None => Err(io::Error::from_raw_os_error(libc::ENOENT)),
1215            Some(actual_name) => self.do_lookup(parent, &actual_name),
1216        }
1217    }
1218
1219    #[cfg(test)]
1220    fn exists_in_casefold_cache(&self, parent: Inode, name: &CStr) -> bool {
1221        let mut cache = self
1222            .lock_casefold_lookup_caches()
1223            .expect("casefold must be enabled");
1224        cache.exists_in_cache(parent, name)
1225    }
1226
1227    fn do_lookup(&self, parent: &InodeData, name: &CStr) -> io::Result<Entry> {
1228        let path_file = safe_openat2(
1229            parent,
1230            name,
1231            libc::O_PATH | libc::O_CLOEXEC | libc::O_NOFOLLOW,
1232            None,
1233            RESOLVE_IN_ROOT | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS,
1234        )?;
1235
1236        #[allow(unused_mut)]
1237        let mut st = stat(&path_file)?;
1238
1239        let altkey = InodeAltKey {
1240            ino: st.st_ino,
1241            dev: st.st_dev,
1242        };
1243
1244        let path = format!(
1245            "{}/{}",
1246            parent.path.clone(),
1247            name.to_str().unwrap_or("<non UTF-8 str>")
1248        );
1249
1250        // Check if we already have an entry before opening a new file.
1251        if let Some(data) = self.inodes.lock().get_alt(&altkey) {
1252            // Return the same inode with the reference counter increased.
1253            #[cfg(feature = "arc_quota")]
1254            self.set_permission(&mut st, &path);
1255            #[cfg(feature = "fs_runtime_ugid_map")]
1256            self.set_ugid_permission(&mut st, &path);
1257            return Ok(Entry {
1258                inode: self.increase_inode_refcount(data),
1259                generation: 0,
1260                attr: st,
1261                // We use the same timeout for the attribute and the entry.
1262                attr_timeout: self.cfg.timeout,
1263                entry_timeout: self.cfg.timeout,
1264            });
1265        }
1266
1267        // Now we need to get a file descriptor that can be used for operations
1268        // that don't support O_PATH. We try to open it with O_RDONLY or O_DIRECTORY
1269        // first.
1270        let mut flags = libc::O_RDONLY | libc::O_CLOEXEC;
1271        match FileType::from(st.st_mode) {
1272            FileType::Regular => {}
1273            FileType::Directory => flags |= libc::O_DIRECTORY,
1274            FileType::Other => flags |= libc::O_PATH,
1275        };
1276
1277        // We use /proc/self/fd/{path_fd} to open the file again with full permissions.
1278        // This is safe because we resolved the path securely above.
1279        let pathname = CString::new(format!("self/fd/{}", path_file.as_raw_descriptor()))
1280            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1281
1282        // SAFETY: this doesn't modify any memory and we check the return value.
1283        let fd = match syscall!(unsafe {
1284            libc::openat64(self.proc.as_raw_descriptor(), pathname.as_ptr(), flags)
1285        }) {
1286            Ok(fd) => fd,
1287            Err(e) if e.errno() == libc::EACCES => {
1288                // Fall back to O_PATH if we can't read it.
1289                flags |= libc::O_PATH;
1290                // SAFETY: this doesn't modify any memory and we check the return value.
1291                syscall!(unsafe {
1292                    libc::openat64(self.proc.as_raw_descriptor(), pathname.as_ptr(), flags)
1293                })?
1294            }
1295            Err(e) => return Err(e.into()),
1296        };
1297
1298        // SAFETY: safe because we own the fd.
1299        let f = unsafe { File::from_raw_descriptor(fd) };
1300        flags |= libc::O_NOFOLLOW;
1301        Ok(self.add_entry(f, st, flags, path))
1302    }
1303
1304    fn get_cache_open_options(&self, flags: u32) -> OpenOptions {
1305        let mut opts = OpenOptions::empty();
1306        match self.cfg.cache_policy {
1307            // We only set the direct I/O option on files.
1308            CachePolicy::Never => opts.set(
1309                OpenOptions::DIRECT_IO,
1310                flags & (libc::O_DIRECTORY as u32) == 0,
1311            ),
1312            CachePolicy::Always => {
1313                opts |= if flags & (libc::O_DIRECTORY as u32) == 0 {
1314                    OpenOptions::KEEP_CACHE
1315                } else {
1316                    OpenOptions::CACHE_DIR
1317                }
1318            }
1319            _ => {}
1320        };
1321        opts
1322    }
1323
1324    // Performs lookup using original name first, if it fails and ascii_casefold is enabled,
1325    // it tries to unfold the name and do lookup again.
1326    fn do_lookup_with_casefold_fallback(
1327        &self,
1328        parent: &InodeData,
1329        name: &CStr,
1330    ) -> io::Result<Entry> {
1331        let mut res = self.do_lookup(parent, name);
1332        // If `ascii_casefold` is enabled, fallback to `ascii_casefold_lookup()`.
1333        if res.is_err() && self.cfg.ascii_casefold {
1334            res = self.ascii_casefold_lookup(parent, name.to_bytes());
1335        }
1336        res
1337    }
1338
1339    fn do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)> {
1340        let inode_data = self.find_inode(inode)?;
1341
1342        let file = self.open_inode(&inode_data, flags as i32)?;
1343
1344        let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
1345        let data = HandleData {
1346            inode,
1347            file: Mutex::new(OpenedFile::new(file, flags as i32)),
1348            unsafe_leak_fd: AtomicBool::new(false),
1349        };
1350
1351        self.handles.lock().insert(handle, Arc::new(data));
1352
1353        let opts = self.get_cache_open_options(flags);
1354
1355        Ok((Some(handle), opts))
1356    }
1357
1358    fn do_open_at(
1359        &self,
1360        parent_data: Arc<InodeData>,
1361        name: &CStr,
1362        inode: Inode,
1363        flags: u32,
1364    ) -> io::Result<(Option<Handle>, OpenOptions)> {
1365        let open_flags = self.update_open_flags(flags as i32);
1366
1367        let fd_open = syscall!(
1368            // SAFETY: return value is checked.
1369            unsafe {
1370                libc::openat64(
1371                    parent_data.as_raw_descriptor(),
1372                    name.as_ptr(),
1373                    (open_flags | libc::O_CLOEXEC) & !(libc::O_NOFOLLOW | libc::O_DIRECT),
1374                )
1375            }
1376        )?;
1377
1378        // SAFETY: fd_open is valid
1379        let file_open = unsafe { File::from_raw_descriptor(fd_open) };
1380        let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
1381        let data = HandleData {
1382            inode,
1383            file: Mutex::new(OpenedFile::new(file_open, open_flags)),
1384            unsafe_leak_fd: AtomicBool::new(false),
1385        };
1386
1387        self.handles.lock().insert(handle, Arc::new(data));
1388
1389        let opts = self.get_cache_open_options(open_flags as u32);
1390        Ok((Some(handle), opts))
1391    }
1392
1393    fn do_release(&self, inode: Inode, handle: Handle) -> io::Result<()> {
1394        let mut handles = self.handles.lock();
1395
1396        if let btree_map::Entry::Occupied(e) = handles.entry(handle) {
1397            if e.get().inode == inode {
1398                // We don't need to close the file here because that will happen automatically when
1399                // the last `Arc` is dropped.
1400                e.remove();
1401                return Ok(());
1402            }
1403        }
1404
1405        Err(ebadf())
1406    }
1407
1408    fn do_getattr(&self, inode: &InodeData) -> io::Result<(libc::stat64, Duration)> {
1409        #[allow(unused_mut)]
1410        let mut st = stat(inode)?;
1411
1412        #[cfg(feature = "arc_quota")]
1413        self.set_permission(&mut st, &inode.path);
1414        #[cfg(feature = "fs_runtime_ugid_map")]
1415        self.set_ugid_permission(&mut st, &inode.path);
1416        Ok((st, self.cfg.timeout))
1417    }
1418
1419    fn do_unlink(&self, parent: &InodeData, name: &CStr, flags: libc::c_int) -> io::Result<()> {
1420        if name.to_bytes().contains(&b'/') {
1421            return Err(io::Error::from_raw_os_error(libc::EINVAL));
1422        }
1423        // SAFETY: this doesn't modify any memory and we check the return value.
1424        syscall!(unsafe { libc::unlinkat(parent.as_raw_descriptor(), name.as_ptr(), flags) })?;
1425        Ok(())
1426    }
1427
1428    fn do_fsync<F: AsRawDescriptor>(&self, file: &F, datasync: bool) -> io::Result<()> {
1429        // SAFETY: this doesn't modify any memory and we check the return value.
1430        syscall!(unsafe {
1431            if datasync {
1432                libc::fdatasync(file.as_raw_descriptor())
1433            } else {
1434                libc::fsync(file.as_raw_descriptor())
1435            }
1436        })?;
1437
1438        Ok(())
1439    }
1440
1441    // Changes the CWD to `self.proc`, runs `f`, and then changes the CWD back to the root
1442    // directory. This effectively emulates an *at syscall starting at /proc, which is useful when
1443    // there is no *at syscall available. Panics if any of the fchdir calls fail or if there is no
1444    // root inode.
1445    //
1446    // NOTE: this method acquires an `self`-wide lock. If any locks are acquired in `f`, care must
1447    // be taken to avoid the risk of deadlocks.
1448    fn with_proc_chdir<F, T>(&self, f: F) -> T
1449    where
1450        F: FnOnce() -> T,
1451    {
1452        let root = self.find_inode(ROOT_ID).expect("failed to find root inode");
1453
1454        // Acquire a lock for `fchdir`.
1455        let _proc_lock = self.process_lock.lock();
1456        // SAFETY: this doesn't modify any memory and we check the return value. Since the
1457        // fchdir should never fail we just use debug_asserts.
1458        let proc_cwd = unsafe { libc::fchdir(self.proc.as_raw_descriptor()) };
1459        debug_assert_eq!(
1460            proc_cwd,
1461            0,
1462            "failed to fchdir to /proc: {}",
1463            io::Error::last_os_error()
1464        );
1465
1466        let res = f();
1467
1468        // SAFETY: this doesn't modify any memory and we check the return value. Since the
1469        // fchdir should never fail we just use debug_asserts.
1470        let root_cwd = unsafe { libc::fchdir(root.as_raw_descriptor()) };
1471        debug_assert_eq!(
1472            root_cwd,
1473            0,
1474            "failed to fchdir back to root directory: {}",
1475            io::Error::last_os_error()
1476        );
1477
1478        res
1479    }
1480
1481    fn do_getxattr(&self, inode: &InodeData, name: &CStr, value: &mut [u8]) -> io::Result<usize> {
1482        let file = inode.file.lock();
1483        let o_path_file = (file.open_flags & libc::O_PATH) != 0;
1484        let res = if o_path_file {
1485            // For FDs opened with `O_PATH`, we cannot call `fgetxattr` normally. Instead we
1486            // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
1487            //  and then setting the CWD back to the root directory.
1488            let path = CString::new(format!("self/fd/{}", file.as_raw_descriptor()))
1489                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1490
1491            // SAFETY: this will only modify `value` and we check the return value.
1492            self.with_proc_chdir(|| unsafe {
1493                libc::getxattr(
1494                    path.as_ptr(),
1495                    name.as_ptr(),
1496                    value.as_mut_ptr() as *mut libc::c_void,
1497                    value.len() as libc::size_t,
1498                )
1499            })
1500        } else {
1501            // For regular files and directories, we can just use fgetxattr.
1502            // SAFETY: this will only write to `value` and we check the return value.
1503            unsafe {
1504                libc::fgetxattr(
1505                    file.as_raw_descriptor(),
1506                    name.as_ptr(),
1507                    value.as_mut_ptr() as *mut libc::c_void,
1508                    value.len() as libc::size_t,
1509                )
1510            }
1511        };
1512
1513        if res < 0 {
1514            Err(io::Error::last_os_error())
1515        } else {
1516            Ok(res as usize)
1517        }
1518    }
1519
1520    fn get_encryption_policy_ex<R: io::Read>(
1521        &self,
1522        inode: Inode,
1523        handle: Handle,
1524        mut r: R,
1525    ) -> io::Result<IoctlReply> {
1526        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1527            self.find_inode(inode)?
1528        } else {
1529            self.find_handle(handle, inode)?
1530        };
1531
1532        // SAFETY: this struct only has integer fields and any value is valid.
1533        let mut arg = unsafe { MaybeUninit::<fscrypt_get_policy_ex_arg>::zeroed().assume_init() };
1534        r.read_exact(arg.policy_size.as_mut_bytes())?;
1535
1536        let policy_size = cmp::min(arg.policy_size, size_of::<fscrypt_policy>() as u64);
1537        arg.policy_size = policy_size;
1538
1539        let res =
1540            // SAFETY: the kernel will only write to `arg` and we check the return value.
1541            unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GET_ENCRYPTION_POLICY_EX, &mut arg) };
1542        if res < 0 {
1543            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1544        } else {
1545            let len = size_of::<u64>() + arg.policy_size as usize;
1546            Ok(IoctlReply::Done(Ok(<&[u8]>::from(&arg)[..len].to_vec())))
1547        }
1548    }
1549
1550    fn get_fsxattr(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
1551        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1552            self.find_inode(inode)?
1553        } else {
1554            self.find_handle(handle, inode)?
1555        };
1556
1557        let mut buf = MaybeUninit::<fsxattr>::zeroed();
1558
1559        // SAFETY: the kernel will only write to `buf` and we check the return value.
1560        let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR, buf.as_mut_ptr()) };
1561        if res < 0 {
1562            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1563        } else {
1564            // SAFETY: the kernel guarantees that the policy is now initialized.
1565            let xattr = unsafe { buf.assume_init() };
1566            Ok(IoctlReply::Done(Ok(xattr.as_bytes().to_vec())))
1567        }
1568    }
1569
1570    fn set_fsxattr<R: io::Read>(
1571        &self,
1572        #[cfg_attr(not(feature = "arc_quota"), allow(unused_variables))] ctx: Context,
1573        inode: Inode,
1574        handle: Handle,
1575        mut r: R,
1576    ) -> io::Result<IoctlReply> {
1577        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1578            self.find_inode(inode)?
1579        } else {
1580            self.find_handle(handle, inode)?
1581        };
1582
1583        let mut in_attr = fsxattr::new_zeroed();
1584        r.read_exact(in_attr.as_mut_bytes())?;
1585
1586        #[cfg(feature = "arc_quota")]
1587        let st = stat(&*data)?;
1588
1589        #[cfg(feature = "arc_quota")]
1590        let ctx_uid = self.lookup_host_uid(&ctx, inode);
1591
1592        // Changing quota project ID requires CAP_FOWNER or being file owner.
1593        // Here we use privileged_quota_uids because we cannot perform a CAP_FOWNER check.
1594        #[cfg(feature = "arc_quota")]
1595        if ctx_uid == st.st_uid || self.cfg.privileged_quota_uids.contains(&ctx_uid) {
1596            // Get the current fsxattr.
1597            let mut buf = MaybeUninit::<fsxattr>::zeroed();
1598            // SAFETY: the kernel will only write to `buf` and we check the return value.
1599            let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR, buf.as_mut_ptr()) };
1600            if res < 0 {
1601                return Ok(IoctlReply::Done(Err(io::Error::last_os_error())));
1602            }
1603            // SAFETY: the kernel guarantees that the policy is now initialized.
1604            let current_attr = unsafe { buf.assume_init() };
1605
1606            // Project ID cannot be changed inside a user namespace.
1607            // Use Spaced to avoid this restriction.
1608            if current_attr.fsx_projid != in_attr.fsx_projid {
1609                let connection = self.dbus_connection.as_ref().unwrap().lock();
1610                let proxy = connection.with_proxy(
1611                    "org.chromium.Spaced",
1612                    "/org/chromium/Spaced",
1613                    DEFAULT_DBUS_TIMEOUT,
1614                );
1615                let project_id = in_attr.fsx_projid;
1616                if !is_android_project_id(project_id) {
1617                    return Err(io::Error::from_raw_os_error(libc::EINVAL));
1618                }
1619                let file_clone = base::SafeDescriptor::try_from(&*data)?;
1620                match proxy.set_project_id(file_clone.into(), project_id) {
1621                    Ok(r) => {
1622                        let r = SetProjectIdReply::parse_from_bytes(&r)
1623                            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1624                        if !r.success {
1625                            return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1626                                r.error,
1627                            ))));
1628                        }
1629                    }
1630                    Err(e) => {
1631                        return Err(io::Error::other(e));
1632                    }
1633                };
1634            }
1635        }
1636
1637        //  SAFETY: this doesn't modify any memory and we check the return value.
1638        let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_FSSETXATTR, &in_attr) };
1639        if res < 0 {
1640            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1641        } else {
1642            Ok(IoctlReply::Done(Ok(Vec::new())))
1643        }
1644    }
1645
1646    fn get_flags(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
1647        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1648            self.find_inode(inode)?
1649        } else {
1650            self.find_handle(handle, inode)?
1651        };
1652
1653        // The ioctl encoding is a long but the parameter is actually an int.
1654        let mut flags: c_int = 0;
1655
1656        // SAFETY: the kernel will only write to `flags` and we check the return value.
1657        let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GETFLAGS, &mut flags) };
1658        if res < 0 {
1659            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1660        } else {
1661            Ok(IoctlReply::Done(Ok(flags.to_ne_bytes().to_vec())))
1662        }
1663    }
1664
1665    fn set_flags<R: io::Read>(
1666        &self,
1667        #[cfg_attr(not(feature = "arc_quota"), allow(unused_variables))] ctx: Context,
1668        inode: Inode,
1669        handle: Handle,
1670        mut r: R,
1671    ) -> io::Result<IoctlReply> {
1672        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1673            self.find_inode(inode)?
1674        } else {
1675            self.find_handle(handle, inode)?
1676        };
1677
1678        // The ioctl encoding is a long but the parameter is actually an int.
1679        let mut in_flags: c_int = 0;
1680        r.read_exact(in_flags.as_mut_bytes())?;
1681
1682        #[cfg(feature = "arc_quota")]
1683        let st = stat(&*data)?;
1684
1685        #[cfg(feature = "arc_quota")]
1686        let ctx_uid = self.lookup_host_uid(&ctx, inode);
1687
1688        // Only privleged uid can perform FS_IOC_SETFLAGS through cryptohome.
1689        #[cfg(feature = "arc_quota")]
1690        if ctx_uid == st.st_uid || self.cfg.privileged_quota_uids.contains(&ctx_uid) {
1691            // Get the current flag.
1692            let mut buf = MaybeUninit::<c_int>::zeroed();
1693            // SAFETY: the kernel will only write to `buf` and we check the return value.
1694            let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GETFLAGS, buf.as_mut_ptr()) };
1695            if res < 0 {
1696                return Ok(IoctlReply::Done(Err(io::Error::last_os_error())));
1697            }
1698            // SAFETY: the kernel guarantees that the policy is now initialized.
1699            let current_flags = unsafe { buf.assume_init() };
1700
1701            // Project inheritance flag cannot be changed inside a user namespace.
1702            // Use Spaced to avoid this restriction.
1703            if (in_flags & FS_PROJINHERIT_FL) != (current_flags & FS_PROJINHERIT_FL) {
1704                let connection = self.dbus_connection.as_ref().unwrap().lock();
1705                let proxy = connection.with_proxy(
1706                    "org.chromium.Spaced",
1707                    "/org/chromium/Spaced",
1708                    DEFAULT_DBUS_TIMEOUT,
1709                );
1710                // If the input flags contain FS_PROJINHERIT_FL, then it is a set. Otherwise it is a
1711                // reset.
1712                let enable = (in_flags & FS_PROJINHERIT_FL) == FS_PROJINHERIT_FL;
1713                let file_clone = base::SafeDescriptor::try_from(&*data)?;
1714                match proxy.set_project_inheritance_flag(file_clone.into(), enable) {
1715                    Ok(r) => {
1716                        let r = SetProjectInheritanceFlagReply::parse_from_bytes(&r)
1717                            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1718                        if !r.success {
1719                            return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1720                                r.error,
1721                            ))));
1722                        }
1723                    }
1724                    Err(e) => {
1725                        return Err(io::Error::other(e));
1726                    }
1727                };
1728            }
1729        }
1730
1731        // SAFETY: this doesn't modify any memory and we check the return value.
1732        let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_SETFLAGS, &in_flags) };
1733        if res < 0 {
1734            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1735        } else {
1736            Ok(IoctlReply::Done(Ok(Vec::new())))
1737        }
1738    }
1739
1740    fn enable_verity<R: io::Read>(
1741        &self,
1742        inode: Inode,
1743        handle: Handle,
1744        mut r: R,
1745    ) -> io::Result<IoctlReply> {
1746        let inode_data = self.find_inode(inode)?;
1747
1748        // These match the return codes from `fsverity_ioctl_enable` in the kernel.
1749        match inode_data.filetype {
1750            FileType::Regular => {}
1751            FileType::Directory => return Err(io::Error::from_raw_os_error(libc::EISDIR)),
1752            FileType::Other => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
1753        }
1754
1755        {
1756            // We cannot enable verity while holding a writable fd so get a new one, if necessary.
1757            let mut file = inode_data.file.lock();
1758            let mut flags = file.open_flags;
1759            match flags & libc::O_ACCMODE {
1760                libc::O_WRONLY | libc::O_RDWR => {
1761                    flags &= !libc::O_ACCMODE;
1762                    flags |= libc::O_RDONLY;
1763
1764                    // We need to get a read-only handle for this file.
1765                    let newfile = self.open_fd(file.as_raw_descriptor(), libc::O_RDONLY)?;
1766                    *file = OpenedFile::new(newfile, flags);
1767                }
1768                libc::O_RDONLY => {}
1769                _ => panic!("Unexpected flags: {flags:#x}"),
1770            }
1771        }
1772
1773        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1774            inode_data
1775        } else {
1776            let data = self.find_handle(handle, inode)?;
1777
1778            {
1779                // We can't enable verity while holding a writable fd. We don't know whether the
1780                // file was opened for writing so check it here. We don't expect
1781                // this to be a frequent operation so the extra latency should be
1782                // fine.
1783                let mut file = data.file.lock();
1784                let flags = FileFlags::from_file(&*file).map_err(io::Error::from)?;
1785                match flags {
1786                    FileFlags::ReadWrite | FileFlags::Write => {
1787                        // We need to get a read-only handle for this file.
1788                        *file = OpenedFile::new(
1789                            self.open_fd(file.as_raw_descriptor(), libc::O_RDONLY)?,
1790                            libc::O_RDONLY,
1791                        );
1792                    }
1793                    FileFlags::Read => {}
1794                }
1795            }
1796
1797            data
1798        };
1799
1800        let mut arg = fsverity_enable_arg::new_zeroed();
1801        r.read_exact(arg.as_mut_bytes())?;
1802
1803        let mut salt;
1804        if arg.salt_size > 0 {
1805            if arg.salt_size > self.max_buffer_size() {
1806                return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1807                    libc::ENOMEM,
1808                ))));
1809            }
1810            salt = vec![0; arg.salt_size as usize];
1811            r.read_exact(&mut salt)?;
1812            arg.salt_ptr = salt.as_ptr() as usize as u64;
1813        } else {
1814            arg.salt_ptr = 0;
1815        }
1816
1817        let mut sig;
1818        if arg.sig_size > 0 {
1819            if arg.sig_size > self.max_buffer_size() {
1820                return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1821                    libc::ENOMEM,
1822                ))));
1823            }
1824            sig = vec![0; arg.sig_size as usize];
1825            r.read_exact(&mut sig)?;
1826            arg.sig_ptr = sig.as_ptr() as usize as u64;
1827        } else {
1828            arg.sig_ptr = 0;
1829        }
1830
1831        // SAFETY: this doesn't modify any memory and we check the return value.
1832        let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_ENABLE_VERITY, &arg) };
1833        if res < 0 {
1834            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1835        } else {
1836            Ok(IoctlReply::Done(Ok(Vec::new())))
1837        }
1838    }
1839
1840    fn measure_verity<R: io::Read>(
1841        &self,
1842        inode: Inode,
1843        handle: Handle,
1844        mut r: R,
1845        out_size: u32,
1846    ) -> io::Result<IoctlReply> {
1847        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1848            self.find_inode(inode)?
1849        } else {
1850            self.find_handle(handle, inode)?
1851        };
1852
1853        let mut digest = fsverity_digest::new_zeroed();
1854        r.read_exact(digest.as_mut_bytes())?;
1855
1856        // Taken from fs/verity/fsverity_private.h.
1857        const FS_VERITY_MAX_DIGEST_SIZE: u16 = 64;
1858
1859        // This digest size is what the fsverity command line utility uses.
1860        const DIGEST_SIZE: u16 = FS_VERITY_MAX_DIGEST_SIZE * 2 + 1;
1861        const BUFLEN: usize = size_of::<fsverity_digest>() + DIGEST_SIZE as usize;
1862        const ROUNDED_LEN: usize = BUFLEN.div_ceil(size_of::<fsverity_digest>());
1863
1864        // Make sure we get a properly aligned allocation.
1865        let mut buf = [MaybeUninit::<fsverity_digest>::uninit(); ROUNDED_LEN];
1866
1867        // SAFETY: we are only writing data and not reading uninitialized memory.
1868        unsafe {
1869            // TODO: Replace with `MaybeUninit::slice_as_mut_ptr` once it is stabilized.
1870            addr_of_mut!((*(buf.as_mut_ptr() as *mut fsverity_digest)).digest_size)
1871                .write(DIGEST_SIZE)
1872        };
1873
1874        // SAFETY: this will only modify `buf` and we check the return value.
1875        let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_MEASURE_VERITY, buf.as_mut_ptr()) };
1876        if res < 0 {
1877            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1878        } else {
1879            let digest_size =
1880                // SAFETY: this value was initialized by us already and then overwritten by the kernel.
1881                // TODO: Replace with `MaybeUninit::slice_as_ptr` once it is stabilized.
1882                unsafe { addr_of!((*(buf.as_ptr() as *const fsverity_digest)).digest_size).read() };
1883            let outlen = size_of::<fsverity_digest>() as u32 + u32::from(digest_size);
1884
1885            // The kernel guarantees this but it doesn't hurt to be paranoid.
1886            debug_assert!(outlen <= (ROUNDED_LEN * size_of::<fsverity_digest>()) as u32);
1887            if digest.digest_size < digest_size || out_size < outlen {
1888                return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1889                    libc::EOVERFLOW,
1890                ))));
1891            }
1892
1893            let buf: [MaybeUninit<u8>; ROUNDED_LEN * size_of::<fsverity_digest>()] =
1894                // SAFETY: any bit pattern is valid for `MaybeUninit<u8>` and `fsverity_digest`
1895                // doesn't contain any references.
1896                unsafe { mem::transmute(buf) };
1897
1898            let buf =
1899                // SAFETY: Casting to `*const [u8]` is safe because the kernel guarantees that the
1900                // first `outlen` bytes of `buf` are initialized and `MaybeUninit<u8>` is guaranteed
1901                // to have the same layout as `u8`.
1902                // TODO: Replace with `MaybeUninit::slice_assume_init_ref` once it is stabilized.
1903                unsafe { &*(&buf[..outlen as usize] as *const [MaybeUninit<u8>] as *const [u8]) };
1904            Ok(IoctlReply::Done(Ok(buf.to_vec())))
1905        }
1906    }
1907}
1908
#[cfg(feature = "fs_runtime_ugid_map")]
impl PassthroughFs {
    /// Returns whether `perm_data` is the entry that governs `path`.
    ///
    /// The root path is governed only by the `"/"` entry. Any other path is governed by a
    /// non-`"/"` entry for which `need_set_permission` returns `true`.
    fn ugid_entry_applies(perm_data: &PermissionData, path: &str, is_root_path: bool) -> bool {
        let is_root_entry = perm_data.perm_path == "/";
        if is_root_path {
            is_root_entry
        } else {
            !is_root_entry && perm_data.need_set_permission(path)
        }
    }

    /// Applies the first registered entry that governs `path` to `st`.
    ///
    /// Returns `true` if an entry was applied and `false` if none matched, in which case `st` is
    /// left untouched.
    fn find_and_set_ugid_permission(
        &self,
        st: &mut libc::stat64,
        path: &str,
        is_root_path: bool,
    ) -> bool {
        let paths = self
            .permission_paths
            .read()
            .expect("acquire permission_paths read lock");
        match paths
            .iter()
            .find(|pd| Self::ugid_entry_applies(pd, path, is_root_path))
        {
            Some(perm_data) => {
                self.set_permission_from_data(st, perm_data);
                true
            }
            None => false,
        }
    }

    /// Overwrites the owner and permission bits of `st` with the values from `perm_data`.
    ///
    /// The file-type bits of `st_mode` are preserved; the permission bits become `0o777` with
    /// the entry's `umask` cleared.
    fn set_permission_from_data(&self, st: &mut libc::stat64, perm_data: &PermissionData) {
        let file_type_bits = st.st_mode & libc::S_IFMT;
        let permission_bits = 0o777 & !perm_data.umask;
        st.st_uid = perm_data.guest_uid;
        st.st_gid = perm_data.guest_gid;
        st.st_mode = file_type_bits | permission_bits;
    }

    /// Set permission according to path.
    ///
    /// An empty `path` denotes the root. When no entry governs `path` directly, the `"/"` entry
    /// is applied as a fallback if one is registered.
    fn set_ugid_permission(&self, st: &mut libc::stat64, path: &str) {
        if self.find_and_set_ugid_permission(st, path, path.is_empty()) {
            return;
        }

        let paths = self
            .permission_paths
            .read()
            .expect("acquire permission_paths read lock");
        if let Some(root_entry) = paths.iter().find(|pd| pd.perm_path == "/") {
            self.set_permission_from_data(st, root_entry);
        }
    }

    /// Set host uid/gid to configured value according to path.
    ///
    /// The path is formed from the parent's path and `name`; a `name` that is not valid UTF-8 is
    /// replaced by a placeholder component.
    fn change_ugid_creds(&self, ctx: &Context, parent_data: &InodeData, name: &CStr) -> (u32, u32) {
        let child_name = name.to_str().unwrap_or("<non UTF-8 str>");
        let path = format!("{}/{}", parent_data.path, child_name);

        self.change_ugid_creds_for_path(ctx, &path)
    }

    /// Set host uid/gid to configured value according to path.
    ///
    /// Resolution order: the entry that governs `path`, then the `"/"` entry, then the caller's
    /// own credentials from `ctx`.
    fn change_ugid_creds_for_path(&self, ctx: &Context, path: &str) -> (u32, u32) {
        if let Some(creds) = self.find_ugid_creds_for_path(path, path.is_empty()) {
            return creds;
        }

        self.permission_paths
            .read()
            .expect("acquire permission_paths read lock")
            .iter()
            .find(|pd| pd.perm_path == "/")
            .map(|pd| (pd.host_uid, pd.host_gid))
            .unwrap_or((ctx.uid, ctx.gid))
    }

    /// Returns the host uid/gid of the first registered entry that governs `path`, if any.
    fn find_ugid_creds_for_path(&self, path: &str, is_root_path: bool) -> Option<(u32, u32)> {
        self.permission_paths
            .read()
            .expect("acquire permission_paths read lock")
            .iter()
            .find(|pd| Self::ugid_entry_applies(pd, path, is_root_path))
            .map(|pd| (pd.host_uid, pd.host_gid))
    }
}
2010
#[cfg(feature = "arc_quota")]
impl PassthroughFs {
    /// Converts the NUL-terminated prefix of `buf` into an owned `String`.
    ///
    /// Bytes after the first NUL are ignored and invalid UTF-8 is replaced lossily. If `buf`
    /// contains no NUL byte the failure is logged and `EINVAL` is returned.
    fn string_from_u8_slice(&self, buf: &[u8]) -> io::Result<String> {
        CStr::from_bytes_until_nul(buf)
            .map(|s| s.to_string_lossy().into_owned())
            .map_err(|e| {
                error!("fail to convert u8 slice to string: {}", e);
                io::Error::from_raw_os_error(libc::EINVAL)
            })
    }

    /// Set permission according to path.
    ///
    /// For every registered entry that matches `path`, the owner of `st` is replaced by the
    /// entry's guest uid/gid and the permission bits become `0o777` with the entry's `umask`
    /// cleared; the file-type bits are preserved.
    fn set_permission(&self, st: &mut libc::stat64, path: &str) {
        let paths = self
            .permission_paths
            .read()
            .expect("acquire permission_paths read lock");
        // NOTE(review): every matching entry is applied in iteration order, so the last match
        // wins here, whereas `change_creds_for_path` and `lookup_host_uid` stop at the first
        // match. Confirm which is intended if overlapping registrations are possible.
        for perm_data in paths.iter().filter(|pd| pd.need_set_permission(path)) {
            st.st_uid = perm_data.guest_uid;
            st.st_gid = perm_data.guest_gid;
            st.st_mode = (st.st_mode & libc::S_IFMT) | (0o777 & !perm_data.umask);
        }
    }

    /// Set host uid/gid to configured value according to path.
    ///
    /// The path is formed from the parent's path and `name`; a `name` that is not valid UTF-8 is
    /// replaced by a placeholder component.
    fn change_creds(&self, ctx: &Context, parent_data: &InodeData, name: &CStr) -> (u32, u32) {
        let child_name = name.to_str().unwrap_or("<non UTF-8 str>");
        let path = format!("{}/{}", parent_data.path, child_name);

        self.change_creds_for_path(ctx, &path)
    }

    /// Set host uid/gid to configured value according to path.
    ///
    /// Returns the host uid/gid of the first registered entry that matches `path`, or the
    /// caller's own credentials from `ctx` when none matches.
    fn change_creds_for_path(&self, ctx: &Context, path: &str) -> (u32, u32) {
        self.permission_paths
            .read()
            .expect("acquire permission_paths read lock")
            .iter()
            .find(|pd| pd.need_set_permission(path))
            .map(|pd| (pd.host_uid, pd.host_gid))
            .unwrap_or((ctx.uid, ctx.gid))
    }

    /// Reads one `FsPermissionDataBuffer` from `r` and converts it into a `PermissionData`.
    ///
    /// # Errors
    ///
    /// Fails if `r` cannot supply the whole buffer, or with `EINVAL` if the path is not
    /// NUL-terminated or does not start with `/`.
    fn read_permission_data<R: io::Read>(&self, mut r: R) -> io::Result<PermissionData> {
        let mut fs_permission_data = FsPermissionDataBuffer::new_zeroed();
        r.read_exact(fs_permission_data.as_mut_bytes())?;

        let perm_path = self.string_from_u8_slice(&fs_permission_data.perm_path)?;
        if !perm_path.starts_with('/') {
            error!("FS_IOC_SETPERMISSION: perm path must start with '/'");
            return Err(io::Error::from_raw_os_error(libc::EINVAL));
        }
        Ok(PermissionData {
            guest_uid: fs_permission_data.guest_uid,
            guest_gid: fs_permission_data.guest_gid,
            host_uid: fs_permission_data.host_uid,
            host_gid: fs_permission_data.host_gid,
            umask: fs_permission_data.umask,
            perm_path,
        })
    }

    /// Sets uid/gid/umask for all files and directories under a specific path.
    ///
    /// This ioctl does not correspond to any upstream FUSE feature. It is used for arcvm.
    /// It associates the specified path with the provided uid, gid, and umask values within the
    /// filesystem metadata.
    ///
    /// During subsequent lookup operations, the stored uid/gid/umask values are retrieved and
    /// applied to all files and directories found under the registered path. Before sending
    /// file stat information to the client, the uid and gid are substituted by `guest_uid` and
    /// `guest_gid` if the file falls under the registered path. The file mode is masked by the
    /// umask.
    ///
    /// When the guest creates a file within the specified path, the file gid/uid stat in host
    /// will be overwritten to `host_uid` and `host_gid` values.
    ///
    /// This functionality enables dynamic configuration of ownership and permissions for a
    /// specific directory hierarchy within the filesystem.
    ///
    /// At most `max_dynamic_perm` entries can be registered; further requests are answered with
    /// `EPERM`.
    ///
    /// # Notes
    /// - This method affects all existing and future files under the registered path.
    /// - The original file ownership and permissions are overridden by the provided values.
    /// - The registered path should not be renamed.
    /// - Refer go/remove-mount-passthrough-fuse for more design details.
    fn set_permission_by_path<R: io::Read>(&self, r: R) -> IoctlReply {
        let limit = self.cfg.max_dynamic_perm;
        let limit_exceeded = || {
            error!(
                "FS_IOC_SETPERMISSION exceeds limits of max_dynamic_perm: {}",
                limit
            );
            IoctlReply::Done(Err(io::Error::from_raw_os_error(libc::EPERM)))
        };

        // Cheap early rejection so that a full table fails before the request is parsed.
        if self
            .permission_paths
            .read()
            .expect("acquire permission_paths read lock")
            .len()
            >= limit
        {
            return limit_exceeded();
        }

        let perm_data = match self.read_permission_data(r) {
            Ok(data) => data,
            Err(e) => {
                error!("fail to read permission data: {}", e);
                return IoctlReply::Done(Err(e));
            }
        };

        let mut paths = self
            .permission_paths
            .write()
            .expect("acquire permission_paths write lock");
        // The read lock was released above, so another request may have filled the table in the
        // meantime. Check again now that we hold the write lock so the limit cannot be exceeded.
        if paths.len() >= limit {
            return limit_exceeded();
        }
        paths.push(perm_data);

        IoctlReply::Done(Ok(Vec::new()))
    }

    /// Returns the predefined xattr value registered for `path` and `name`, if any.
    fn get_xattr_by_path(&self, path: &str, name: &str) -> Option<String> {
        self.xattr_paths
            .read()
            .expect("acquire xattr_paths read lock")
            .iter()
            .find(|data| data.need_set_guest_xattr(path, name))
            .map(|data| data.xattr_value.clone())
    }

    /// Returns `true` when a predefined xattr value is registered for `path` and `name`.
    fn skip_host_set_xattr(&self, path: &str, name: &str) -> bool {
        self.get_xattr_by_path(path, name).is_some()
    }

    /// Reads one `FsPathXattrDataBuffer` from `r` and converts it into an `XattrData`.
    ///
    /// # Errors
    ///
    /// Fails if `r` cannot supply the whole buffer, or with `EINVAL` if any of the strings is
    /// not NUL-terminated or the path does not start with `/`.
    fn read_xattr_data<R: io::Read>(&self, mut r: R) -> io::Result<XattrData> {
        let mut fs_path_xattr_data = FsPathXattrDataBuffer::new_zeroed();
        r.read_exact(fs_path_xattr_data.as_mut_bytes())?;

        let xattr_path = self.string_from_u8_slice(&fs_path_xattr_data.path)?;
        if !xattr_path.starts_with('/') {
            error!("FS_IOC_SETPATHXATTR: xattr path must start with '/'");
            return Err(io::Error::from_raw_os_error(libc::EINVAL));
        }
        let xattr_name = self.string_from_u8_slice(&fs_path_xattr_data.xattr_name)?;
        let xattr_value = self.string_from_u8_slice(&fs_path_xattr_data.xattr_value)?;

        Ok(XattrData {
            xattr_path,
            xattr_name,
            xattr_value,
        })
    }

    /// Sets xattr value for all files and directories under a specific path.
    ///
    /// This ioctl does not correspond to any upstream FUSE feature. It is used for arcvm.
    /// It associates the specified path and xattr name with a value.
    ///
    /// When the getxattr is called for the specified path and name, the predefined
    /// value is returned.
    ///
    /// At most `max_dynamic_xattr` entries can be registered; further requests are answered with
    /// `EPERM`.
    ///
    /// # Notes
    /// - This method affects all existing and future files under the registered path.
    /// - The SECURITY_CONTEXT feature will be disabled if this ioctl is enabled.
    /// - The registered path should not be renamed.
    /// - Refer go/remove-mount-passthrough-fuse for more design details.
    fn set_xattr_by_path<R: io::Read>(&self, r: R) -> IoctlReply {
        let limit = self.cfg.max_dynamic_xattr;
        let limit_exceeded = || {
            error!(
                "FS_IOC_SETPATHXATTR exceeds limits of max_dynamic_xattr: {}",
                limit
            );
            IoctlReply::Done(Err(io::Error::from_raw_os_error(libc::EPERM)))
        };

        // Cheap early rejection so that a full table fails before the request is parsed.
        if self
            .xattr_paths
            .read()
            .expect("acquire xattr_paths read lock")
            .len()
            >= limit
        {
            return limit_exceeded();
        }

        let xattr_data = match self.read_xattr_data(r) {
            Ok(data) => data,
            Err(e) => {
                error!("fail to read xattr data: {}", e);
                return IoctlReply::Done(Err(e));
            }
        };

        let mut paths = self
            .xattr_paths
            .write()
            .expect("acquire xattr_paths write lock");
        // The read lock was released above, so another request may have filled the table in the
        // meantime. Check again now that we hold the write lock so the limit cannot be exceeded.
        if paths.len() >= limit {
            return limit_exceeded();
        }
        paths.push(xattr_data);

        IoctlReply::Done(Ok(Vec::new()))
    }

    /// Answers a getxattr request, preferring a predefined value over the host's.
    ///
    /// When a value is registered for the inode's path and `name`, it is copied into `buf` and
    /// its length is returned, or `ERANGE` if `buf` is too small. Otherwise the request is
    /// forwarded to `do_getxattr`.
    fn do_getxattr_with_filter(
        &self,
        data: Arc<InodeData>,
        name: Cow<CStr>,
        buf: &mut [u8],
    ) -> io::Result<usize> {
        let Some(predefined_xattr) = self.get_xattr_by_path(&data.path, &name.to_string_lossy())
        else {
            return self.do_getxattr(&data, &name, &mut buf[..]);
        };

        let value = predefined_xattr.into_bytes();
        if value.len() > buf.len() {
            return Err(io::Error::from_raw_os_error(libc::ERANGE));
        }
        buf[..value.len()].copy_from_slice(&value);
        Ok(value.len())
    }

    /// Looks up the host uid according to the path of file that inode is referring to.
    ///
    /// Returns the `host_uid` of the first registered entry that matches the inode's path. Falls
    /// back to `ctx.uid` when the inode is unknown or no entry matches.
    fn lookup_host_uid(&self, ctx: &Context, inode: Inode) -> u32 {
        let Ok(inode_data) = self.find_inode(inode) else {
            return ctx.uid;
        };

        self.permission_paths
            .read()
            .expect("acquire permission_paths read lock")
            .iter()
            .find(|pd| pd.need_set_permission(&inode_data.path))
            .map_or(ctx.uid, |pd| pd.host_uid)
    }
}
2255
2256/// Decrements the refcount of the inode.
2257/// Returns `true` if the refcount became 0.
2258fn forget_one(
2259    inodes: &mut MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>,
2260    inode: Inode,
2261    count: u64,
2262) -> bool {
2263    if let Some(data) = inodes.get(&inode) {
2264        // Acquiring the write lock on the inode map prevents new lookups from incrementing the
2265        // refcount but there is the possibility that a previous lookup already acquired a
2266        // reference to the inode data and is in the process of updating the refcount so we need
2267        // to loop here until we can decrement successfully.
2268        loop {
2269            let refcount = data.refcount.load(Ordering::Relaxed);
2270
2271            // Saturating sub because it doesn't make sense for a refcount to go below zero and
2272            // we don't want misbehaving clients to cause integer overflow.
2273            let new_count = refcount.saturating_sub(count);
2274
2275            // Synchronizes with the acquire load in `do_lookup`.
2276            if data
2277                .refcount
2278                .compare_exchange_weak(refcount, new_count, Ordering::Release, Ordering::Relaxed)
2279                .is_ok()
2280            {
2281                if new_count == 0 {
2282                    // We just removed the last refcount for this inode. There's no need for an
2283                    // acquire fence here because we hold a write lock on the inode map and any
2284                    // thread that is waiting to do a forget on the same inode will have to wait
2285                    // until we release the lock. So there's is no other release store for us to
2286                    // synchronize with before deleting the entry.
2287                    inodes.remove(&inode);
2288                    return true;
2289                }
2290                break;
2291            }
2292        }
2293    }
2294    false
2295}
2296
2297// Strips any `user.virtiofs.` prefix from `buf`. If buf contains one or more nul-bytes, each
2298// nul-byte-separated slice is treated as a C string and the prefix is stripped from each one.
2299fn strip_xattr_prefix(buf: &mut Vec<u8>) {
2300    fn next_cstr(b: &[u8], start: usize) -> Option<&[u8]> {
2301        if start >= b.len() {
2302            return None;
2303        }
2304
2305        let end = b[start..]
2306            .iter()
2307            .position(|&c| c == b'\0')
2308            .map(|p| start + p + 1)
2309            .unwrap_or(b.len());
2310
2311        Some(&b[start..end])
2312    }
2313
2314    let mut pos = 0;
2315    while let Some(name) = next_cstr(buf, pos) {
2316        if !name.starts_with(USER_VIRTIOFS_XATTR) {
2317            pos += name.len();
2318            continue;
2319        }
2320
2321        let newlen = name.len() - USER_VIRTIOFS_XATTR.len();
2322        buf.drain(pos..pos + USER_VIRTIOFS_XATTR.len());
2323        pos += newlen;
2324    }
2325}
2326
2327impl Drop for PassthroughFs {
2328    /// The `Drop` implementation for this struct intentionally leaks all open file descriptors.
2329    /// It sets the `unsafe_leak_fd` flag on all `InodeData` and `HandleData` instances, which
2330    /// causes their `drop` implementations to forget the underlying `File` objects.
2331    ///
2332    /// This is a deliberate performance optimization for abrupt shutdowns. It relies on the
2333    /// operating system to clean up the file descriptors when the process terminates. It is
2334    /// **critical** that an instance of `PassthroughFs` is only dropped immediately prior to
2335    /// process termination.
2336    fn drop(&mut self) {
2337        let inodes = self.inodes.lock();
2338        inodes.apply(|v| {
2339            v.set_unsafe_leak_fd();
2340        });
2341        let handles = self.handles.lock();
2342        handles.values().for_each(|v| v.set_unsafe_leak_fd());
2343    }
2344}
2345
2346impl FileSystem for PassthroughFs {
2347    type Inode = Inode;
2348    type Handle = Handle;
2349    type DirIter = ReadDir<Box<[u8]>>;
2350
2351    fn init(&self, capable: FsOptions) -> io::Result<FsOptions> {
2352        let root = CString::new(self.root_dir.clone())
2353            .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
2354
2355        let flags = libc::O_DIRECTORY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
2356        // SAFETY: this doesn't modify any memory and we check the return value.
2357        let raw_descriptor = unsafe { libc::openat64(libc::AT_FDCWD, root.as_ptr(), flags) };
2358        if raw_descriptor < 0 {
2359            return Err(io::Error::last_os_error());
2360        }
2361
2362        // SAFETY: safe because we just opened this descriptor above.
2363        let f = unsafe { File::from_raw_descriptor(raw_descriptor) };
2364
2365        let st = stat(&f)?;
2366
2367        // SAFETY: this doesn't modify any memory and there is no need to check the return
2368        // value because this system call always succeeds. We need to clear the umask here because
2369        // we want the client to be able to set all the bits in the mode.
2370        unsafe { libc::umask(0o000) };
2371
2372        let mut inodes = self.inodes.lock();
2373
2374        // Not sure why the root inode gets a refcount of 2 but that's what libfuse does.
2375        inodes.insert(
2376            ROOT_ID,
2377            InodeAltKey {
2378                ino: st.st_ino,
2379                dev: st.st_dev,
2380            },
2381            Arc::new(InodeData {
2382                inode: ROOT_ID,
2383                file: Mutex::new(OpenedFile::new(f, flags)),
2384                refcount: AtomicU64::new(2),
2385                filetype: st.st_mode.into(),
2386                path: "".to_string(),
2387                unsafe_leak_fd: AtomicBool::new(false),
2388            }),
2389        );
2390
2391        let mut opts = FsOptions::DO_READDIRPLUS
2392            | FsOptions::READDIRPLUS_AUTO
2393            | FsOptions::EXPORT_SUPPORT
2394            | FsOptions::DONT_MASK
2395            | FsOptions::CACHE_SYMLINKS;
2396
2397        // Device using dynamic xattr feature will have different security context in
2398        // host and guests. The SECURITY_CONTEXT feature should not be enabled in the
2399        // device.
2400        if self.cfg.max_dynamic_xattr == 0 && self.cfg.security_ctx {
2401            opts |= FsOptions::SECURITY_CONTEXT;
2402        }
2403
2404        if self.cfg.posix_acl {
2405            opts |= FsOptions::POSIX_ACL;
2406        }
2407        if self.cfg.writeback && capable.contains(FsOptions::WRITEBACK_CACHE) {
2408            opts |= FsOptions::WRITEBACK_CACHE;
2409            self.writeback.store(true, Ordering::Relaxed);
2410        }
2411        if self.cfg.cache_policy == CachePolicy::Always {
2412            if capable.contains(FsOptions::ZERO_MESSAGE_OPEN) {
2413                opts |= FsOptions::ZERO_MESSAGE_OPEN;
2414                self.zero_message_open.store(true, Ordering::Relaxed);
2415            }
2416            if capable.contains(FsOptions::ZERO_MESSAGE_OPENDIR) {
2417                opts |= FsOptions::ZERO_MESSAGE_OPENDIR;
2418                self.zero_message_opendir.store(true, Ordering::Relaxed);
2419            }
2420        }
2421        Ok(opts)
2422    }
2423
2424    fn destroy(&self) {
2425        cros_tracing::trace_simple_print!(VirtioFs, "{:?}: destroy", self);
2426        self.handles.lock().clear();
2427        self.inodes.lock().clear();
2428    }
2429
2430    fn statfs(&self, _ctx: Context, inode: Inode) -> io::Result<libc::statvfs64> {
2431        let _trace = fs_trace!(self.tag, "statfs", inode);
2432        let data = self.find_inode(inode)?;
2433
2434        let mut out = MaybeUninit::<libc::statvfs64>::zeroed();
2435
2436        // SAFETY: this will only modify `out` and we check the return value.
2437        syscall!(unsafe { libc::fstatvfs64(data.as_raw_descriptor(), out.as_mut_ptr()) })?;
2438
2439        // SAFETY: the kernel guarantees that `out` has been initialized.
2440        Ok(unsafe { out.assume_init() })
2441    }
2442
2443    fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<Entry> {
2444        validate_path_component(name)?;
2445        let data = self.find_inode(parent)?;
2446        #[allow(unused_variables)]
2447        let path = format!(
2448            "{}/{}",
2449            data.path,
2450            name.to_str().unwrap_or("<non UTF-8 path>")
2451        );
2452        let _trace = fs_trace!(self.tag, "lookup", parent, path);
2453
2454        if !self.is_path_accessible(&path) {
2455            return Err(io::Error::from_raw_os_error(libc::ENOENT));
2456        }
2457
2458        let mut res = self.do_lookup_with_casefold_fallback(&data, name);
2459
2460        // FUSE takes a inode=0 as a request to do negative dentry cache.
2461        // So, if `negative_timeout` is set, return success with the timeout value and inode=0 as a
2462        // response.
2463        if let Err(e) = &res {
2464            if e.kind() == std::io::ErrorKind::NotFound && !self.cfg.negative_timeout.is_zero() {
2465                res = Ok(Entry::new_negative(self.cfg.negative_timeout));
2466            }
2467        }
2468
2469        res
2470    }
2471
2472    fn forget(&self, _ctx: Context, inode: Inode, count: u64) {
2473        let _trace = fs_trace!(self.tag, "forget", inode, count);
2474        let mut inodes = self.inodes.lock();
2475        let caches = self.lock_casefold_lookup_caches();
2476        if forget_one(&mut inodes, inode, count) {
2477            if let Some(mut c) = caches {
2478                c.forget(inode);
2479            }
2480        }
2481    }
2482
2483    fn batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>) {
2484        let mut inodes = self.inodes.lock();
2485        let mut caches = self.lock_casefold_lookup_caches();
2486        for (inode, count) in requests {
2487            if forget_one(&mut inodes, inode, count) {
2488                if let Some(c) = caches.as_mut() {
2489                    c.forget(inode);
2490                }
2491            }
2492        }
2493    }
2494
2495    fn opendir(
2496        &self,
2497        _ctx: Context,
2498        inode: Inode,
2499        flags: u32,
2500    ) -> io::Result<(Option<Handle>, OpenOptions)> {
2501        let _trace = fs_trace!(self.tag, "opendir", inode, flags);
2502        if self.zero_message_opendir.load(Ordering::Relaxed) {
2503            Err(io::Error::from_raw_os_error(libc::ENOSYS))
2504        } else {
2505            self.do_open(inode, flags | (libc::O_DIRECTORY as u32))
2506        }
2507    }
2508
2509    fn releasedir(
2510        &self,
2511        _ctx: Context,
2512        inode: Inode,
2513        _flags: u32,
2514        handle: Handle,
2515    ) -> io::Result<()> {
2516        let _trace = fs_trace!(self.tag, "releasedir", inode, handle);
2517        if self.zero_message_opendir.load(Ordering::Relaxed) {
2518            Ok(())
2519        } else {
2520            self.do_release(inode, handle)
2521        }
2522    }
2523
2524    fn mkdir(
2525        &self,
2526        ctx: Context,
2527        parent: Inode,
2528        name: &CStr,
2529        mode: u32,
2530        umask: u32,
2531        security_ctx: Option<&CStr>,
2532    ) -> io::Result<Entry> {
2533        let _trace = fs_trace!(self.tag, "mkdir", parent, name, mode, umask, security_ctx);
2534        let data = self.find_inode(parent)?;
2535        self.authorize_write_path(&data.path, name)?;
2536
2537        let _ctx = security_ctx
2538            .filter(|ctx| *ctx != UNLABELED_CSTR)
2539            .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2540            .transpose()?;
2541
2542        #[allow(unused_variables)]
2543        #[cfg(feature = "arc_quota")]
2544        let (uid, gid) = self.change_creds(&ctx, &data, name);
2545        #[cfg(feature = "fs_runtime_ugid_map")]
2546        let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
2547        #[cfg(not(feature = "fs_permission_translation"))]
2548        let (uid, gid) = (ctx.uid, ctx.gid);
2549
2550        let (_uid, _gid) = set_creds(uid, gid)?;
2551        {
2552            let casefold_cache = self.lock_casefold_lookup_caches();
2553            let _scoped_umask = ScopedUmask::new(umask);
2554
2555            // SAFETY: this doesn't modify any memory and we check the return value.
2556            syscall!(unsafe { libc::mkdirat(data.as_raw_descriptor(), name.as_ptr(), mode) })?;
2557            if let Some(mut c) = casefold_cache {
2558                c.insert(data.inode, name);
2559            }
2560        }
2561        self.do_lookup(&data, name)
2562    }
2563
2564    fn rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
2565        let _trace = fs_trace!(self.tag, "rmdir", parent, name);
2566        let data = self.find_inode(parent)?;
2567        self.authorize_write_path(&data.path, name)?;
2568        let casefold_cache = self.lock_casefold_lookup_caches();
2569        // TODO(b/278691962): If ascii_casefold is enabled, we need to call
2570        // `get_case_unfolded_name()` to get the actual name to be unlinked.
2571        self.do_unlink(&data, name, libc::AT_REMOVEDIR)?;
2572        if let Some(mut c) = casefold_cache {
2573            c.remove(data.inode, name);
2574        }
2575        Ok(())
2576    }
2577
2578    fn readdir(
2579        &self,
2580        _ctx: Context,
2581        inode: Inode,
2582        handle: Handle,
2583        size: u32,
2584        offset: u64,
2585    ) -> io::Result<Self::DirIter> {
2586        let _trace = fs_trace!(self.tag, "readdir", inode, handle, size, offset);
2587        let buf = vec![0; size as usize].into_boxed_slice();
2588
2589        // Identify the absolute path of the directory being read.
2590        // This path is required to construct full paths for each directory entry
2591        // during filtering.
2592        let (parent_path, mut read_dir) = if self.zero_message_opendir.load(Ordering::Relaxed) {
2593            let data = self.find_inode(inode)?;
2594            let path = data.path.clone();
2595            let dir_guard = data.file.lock();
2596            let read_dir = ReadDir::new(&*dir_guard, offset as libc::off64_t, buf)?;
2597            (path, read_dir)
2598        } else {
2599            let data = self.find_handle(handle, inode)?;
2600            let inode_data = self.find_inode(data.inode)?;
2601            let path = inode_data.path.clone();
2602            let dir_guard = data.file.lock();
2603            let read_dir = ReadDir::new(&*dir_guard, offset as libc::off64_t, buf)?;
2604            (path, read_dir)
2605        };
2606
2607        // If an allowlist is configured, inject the allowlist context into the ReadDir
2608        // iterator to enable directory entry filtering (hiding unauthorized files).
2609        if let Some(allowlist) = &self.allowlist {
2610            let allowlist_guard = allowlist
2611                .read()
2612                .expect("failed to acquire read lock on allowlist");
2613            let filter = allowlist_guard.get_read_dir_filter(&parent_path);
2614            read_dir = read_dir.with_filter(filter);
2615        }
2616
2617        Ok(read_dir)
2618    }
2619
2620    fn open(
2621        &self,
2622        _ctx: Context,
2623        inode: Inode,
2624        flags: u32,
2625    ) -> io::Result<(Option<Handle>, OpenOptions)> {
2626        if self.zero_message_open.load(Ordering::Relaxed) {
2627            let _trace = fs_trace!(self.tag, "open (zero-message)", inode, flags);
2628            Err(io::Error::from_raw_os_error(libc::ENOSYS))
2629        } else {
2630            let _trace = fs_trace!(self.tag, "open", inode, flags);
2631            self.do_open(inode, flags)
2632        }
2633    }
2634
2635    fn release(
2636        &self,
2637        _ctx: Context,
2638        inode: Inode,
2639        _flags: u32,
2640        handle: Handle,
2641        _flush: bool,
2642        _flock_release: bool,
2643        _lock_owner: Option<u64>,
2644    ) -> io::Result<()> {
2645        if self.zero_message_open.load(Ordering::Relaxed) {
2646            let _trace = fs_trace!(self.tag, "release (zero-message)", inode, handle);
2647            Ok(())
2648        } else {
2649            let _trace = fs_trace!(self.tag, "release", inode, handle);
2650            self.do_release(inode, handle)
2651        }
2652    }
2653
2654    fn chromeos_tmpfile(
2655        &self,
2656        ctx: Context,
2657        parent: Self::Inode,
2658        mode: u32,
2659        umask: u32,
2660        security_ctx: Option<&CStr>,
2661    ) -> io::Result<Entry> {
2662        let _trace = fs_trace!(
2663            self.tag,
2664            "chromeos_tempfile",
2665            parent,
2666            mode,
2667            umask,
2668            security_ctx
2669        );
2670        let data = self.find_inode(parent)?;
2671
2672        let _ctx = security_ctx
2673            .filter(|ctx| *ctx != UNLABELED_CSTR)
2674            .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2675            .transpose()?;
2676
2677        let tmpflags = libc::O_RDWR | libc::O_TMPFILE | libc::O_CLOEXEC | libc::O_NOFOLLOW;
2678
2679        let current_dir = c".";
2680
2681        #[allow(unused_variables)]
2682        #[cfg(feature = "arc_quota")]
2683        let (uid, gid) = self.change_creds(&ctx, &data, current_dir);
2684        #[cfg(feature = "fs_runtime_ugid_map")]
2685        let (uid, gid) = self.change_ugid_creds(&ctx, &data, current_dir);
2686        #[cfg(not(feature = "fs_permission_translation"))]
2687        let (uid, gid) = (ctx.uid, ctx.gid);
2688
2689        let (_uid, _gid) = set_creds(uid, gid)?;
2690
2691        let fd = {
2692            let _scoped_umask = ScopedUmask::new(umask);
2693
2694            // SAFETY: this doesn't modify any memory and we check the return value.
2695            syscall!(unsafe {
2696                libc::openat64(
2697                    data.as_raw_descriptor(),
2698                    current_dir.as_ptr(),
2699                    tmpflags,
2700                    mode,
2701                )
2702            })?
2703        };
2704        // No need to add casefold_cache becuase we created an anonymous file.
2705
2706        // SAFETY: safe because we just opened this fd.
2707        let tmpfile = unsafe { File::from_raw_descriptor(fd) };
2708        let st = stat(&tmpfile)?;
2709        let path = format!(
2710            "{}/{}",
2711            data.path.clone(),
2712            current_dir.to_str().unwrap_or("<non UTF-8 str>")
2713        );
2714        Ok(self.add_entry(tmpfile, st, tmpflags, path))
2715    }
2716
    /// Opens `name` under `parent` with `O_CREAT` added to the client's flags and registers the
    /// resulting file as an inode entry.
    ///
    /// The target must pass `authorize_write_path`. The open happens while the thread
    /// credentials from `set_creds`, the requested `umask` and (unless it is absent or equal to
    /// `UNLABELED_CSTR`) the given security context are in effect.
    ///
    /// Returns the entry together with a handle and open options. When the `zero_message_open`
    /// flag is set no handle is opened and `OpenOptions::KEEP_CACHE` is returned; otherwise a
    /// handle is opened through `do_open_at`. If that second open fails, the entry that was
    /// just added is forgotten again before the error is returned.
    fn create(
        &self,
        ctx: Context,
        parent: Inode,
        name: &CStr,
        mode: u32,
        flags: u32,
        umask: u32,
        security_ctx: Option<&CStr>,
    ) -> io::Result<(Entry, Option<Handle>, OpenOptions)> {
        let _trace = fs_trace!(
            self.tag,
            "create",
            parent,
            name,
            mode,
            flags,
            umask,
            security_ctx
        );
        let data = self.find_inode(parent)?;
        // The authorized path is reused below as the path recorded for the new entry.
        let path = self.authorize_write_path(&data.path, name)?;

        // Scoped guard: lives until the end of this function.
        let _ctx = security_ctx
            .filter(|ctx| *ctx != UNLABELED_CSTR)
            .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
            .transpose()?;

        // Which uid/gid to use depends on the enabled permission-translation feature.
        #[allow(unused_variables)]
        #[cfg(feature = "arc_quota")]
        let (uid, gid) = self.change_creds(&ctx, &data, name);
        #[cfg(feature = "fs_runtime_ugid_map")]
        let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
        #[cfg(not(feature = "fs_permission_translation"))]
        let (uid, gid) = (ctx.uid, ctx.gid);

        // These guards stay alive for the rest of the function, including the second open.
        let (_uid, _gid) = set_creds(uid, gid)?;

        let flags = self.update_open_flags(flags as i32);
        // Mask out O_DIRECT. Also mask out O_PATH because we need to return a readable/writable
        // file descriptor for create.
        let create_flags = (flags | libc::O_CREAT | libc::O_CLOEXEC | libc::O_NOFOLLOW)
            & !(libc::O_DIRECT | libc::O_PATH);

        let file = {
            // The umask guard and the casefold cache lock only cover the open itself.
            let _scoped_umask = ScopedUmask::new(umask);
            let casefold_cache = self.lock_casefold_lookup_caches();

            let file = safe_openat2(
                &data,
                name,
                create_flags,
                Some(mode),
                RESOLVE_IN_ROOT | RESOLVE_NO_MAGICLINKS,
            )?;
            if let Some(mut c) = casefold_cache {
                c.insert(parent, name);
            }
            file
        };

        let st = stat(&file)?;
        let entry = self.add_entry(file, st, create_flags, path);

        let (handle, opts) = if self.zero_message_open.load(Ordering::Relaxed) {
            (None, OpenOptions::KEEP_CACHE)
        } else {
            // The file exists by now, so the creation-only flags are cleared for this open.
            self.do_open_at(
                data,
                name,
                entry.inode,
                flags as u32 & !((libc::O_CREAT | libc::O_EXCL | libc::O_NOCTTY) as u32),
            )
            .inspect_err(|_e| {
                // Don't leak the entry.
                self.forget(ctx, entry.inode, 1);
            })?
        };
        Ok((entry, handle, opts))
    }
2797
2798    fn unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
2799        let _trace = fs_trace!(self.tag, "unlink", parent, name);
2800        let data = self.find_inode(parent)?;
2801        self.authorize_write_path(&data.path, name)?;
2802        let casefold_cache = self.lock_casefold_lookup_caches();
2803        // TODO(b/278691962): If ascii_casefold is enabled, we need to call
2804        // `get_case_unfolded_name()` to get the actual name to be unlinked.
2805        self.do_unlink(&data, name, 0)?;
2806        if let Some(mut c) = casefold_cache {
2807            c.remove(data.inode, name);
2808        }
2809        Ok(())
2810    }
2811
2812    fn read<W: io::Write + ZeroCopyWriter>(
2813        &self,
2814        _ctx: Context,
2815        inode: Inode,
2816        handle: Handle,
2817        mut w: W,
2818        size: u32,
2819        offset: u64,
2820        _lock_owner: Option<u64>,
2821        _flags: u32,
2822    ) -> io::Result<usize> {
2823        if self.zero_message_open.load(Ordering::Relaxed) {
2824            let _trace = fs_trace!(self.tag, "read (zero-message)", inode, handle, size, offset);
2825            let data = self.find_inode(inode)?;
2826
2827            let mut file = data.file.lock();
2828            let mut flags = file.open_flags;
2829            match flags & libc::O_ACCMODE {
2830                libc::O_WRONLY => {
2831                    flags &= !libc::O_WRONLY;
2832                    flags |= libc::O_RDWR;
2833
2834                    // We need to get a readable handle for this file.
2835                    let newfile = self.open_fd(file.as_raw_descriptor(), libc::O_RDWR)?;
2836                    *file = OpenedFile::new(newfile, flags);
2837                }
2838                libc::O_RDONLY | libc::O_RDWR => {}
2839                _ => panic!("Unexpected flags: {flags:#x}"),
2840            }
2841
2842            w.write_from(file.file_mut(), size as usize, offset)
2843        } else {
2844            let _trace = fs_trace!(self.tag, "read", inode, handle, size, offset);
2845            let data = self.find_handle(handle, inode)?;
2846
2847            let mut f = data.file.lock();
2848            w.write_from(f.file_mut(), size as usize, offset)
2849        }
2850    }
2851
2852    fn write<R: io::Read + ZeroCopyReader>(
2853        &self,
2854        _ctx: Context,
2855        inode: Inode,
2856        handle: Handle,
2857        mut r: R,
2858        size: u32,
2859        offset: u64,
2860        _lock_owner: Option<u64>,
2861        _delayed_write: bool,
2862        flags: u32,
2863    ) -> io::Result<usize> {
2864        // When the WRITE_KILL_PRIV flag is set, drop CAP_FSETID so that the kernel will
2865        // automatically clear the setuid and setgid bits for us.
2866        let _fsetid = if flags & WRITE_KILL_PRIV != 0 {
2867            Some(drop_cap_fsetid()?)
2868        } else {
2869            None
2870        };
2871
2872        if self.zero_message_open.load(Ordering::Relaxed) {
2873            let _trace = fs_trace!(
2874                self.tag,
2875                "write (zero-message)",
2876                inode,
2877                handle,
2878                size,
2879                offset
2880            );
2881
2882            let data = self.find_inode(inode)?;
2883
2884            let mut file = data.file.lock();
2885            let mut flags = file.open_flags;
2886            match flags & libc::O_ACCMODE {
2887                libc::O_RDONLY => {
2888                    flags &= !libc::O_RDONLY;
2889                    flags |= libc::O_RDWR;
2890
2891                    // We need to get a writable handle for this file.
2892                    let newfile = self.open_fd(file.as_raw_descriptor(), libc::O_RDWR)?;
2893                    *file = OpenedFile::new(newfile, flags);
2894                }
2895                libc::O_WRONLY | libc::O_RDWR => {}
2896                _ => panic!("Unexpected flags: {flags:#x}"),
2897            }
2898
2899            r.read_to(file.file_mut(), size as usize, offset)
2900        } else {
2901            let _trace = fs_trace!(self.tag, "write", inode, handle, size, offset);
2902
2903            let data = self.find_handle(handle, inode)?;
2904
2905            let mut f = data.file.lock();
2906            r.read_to(f.file_mut(), size as usize, offset)
2907        }
2908    }
2909
2910    fn getattr(
2911        &self,
2912        _ctx: Context,
2913        inode: Inode,
2914        _handle: Option<Handle>,
2915    ) -> io::Result<(libc::stat64, Duration)> {
2916        let _trace = fs_trace!(self.tag, "getattr", inode, _handle);
2917
2918        let data = self.find_inode(inode)?;
2919        self.do_getattr(&data)
2920    }
2921
    /// Applies the attribute changes selected by `valid` to `inode` and returns the resulting
    /// attributes from `do_getattr`.
    ///
    /// Mode, size and timestamps are changed through the file behind `handle` when a non-zero
    /// handle is supplied. Otherwise mode and timestamps go through the `self/fd/<n>` path
    /// relative to `self.proc`, and truncation goes through a freshly opened descriptor.
    /// Ownership is always changed through the inode's own descriptor.
    ///
    /// The changes are applied in the order mode, owner/group, size, timestamps. The first
    /// failing step returns its error, and the steps before it are not rolled back.
    fn setattr(
        &self,
        _ctx: Context,
        inode: Inode,
        attr: libc::stat64,
        handle: Option<Handle>,
        valid: SetattrValid,
    ) -> io::Result<(libc::stat64, Duration)> {
        let _trace = fs_trace!(self.tag, "setattr", inode, handle);
        let inode_data = self.find_inode(inode)?;

        // How the target file is addressed: a locked, already-open file, or a path to the
        // inode's descriptor that is resolved relative to `self.proc`.
        enum Data<'a> {
            Handle(MutexGuard<'a, OpenedFile>),
            ProcPath(CString),
        }

        // If we have a handle then use it otherwise get a new fd from the inode.
        // `hd` is declared outside the `if` so that it outlives the guard stored in `data`.
        let hd;
        let data = if let Some(handle) = handle.filter(|&h| h != 0) {
            hd = self.find_handle(handle, inode)?;
            Data::Handle(hd.file.lock())
        } else {
            let pathname = CString::new(format!("self/fd/{}", inode_data.as_raw_descriptor()))
                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
            Data::ProcPath(pathname)
        };

        if valid.contains(SetattrValid::MODE) {
            // SAFETY: this doesn't modify any memory and we check the return value.
            syscall!(unsafe {
                match data {
                    Data::Handle(ref fd) => libc::fchmod(fd.as_raw_descriptor(), attr.st_mode),
                    Data::ProcPath(ref p) => {
                        libc::fchmodat(self.proc.as_raw_descriptor(), p.as_ptr(), attr.st_mode, 0)
                    }
                }
            })?;
        }

        if valid.intersects(SetattrValid::UID | SetattrValid::GID) {
            // An id that was not requested is passed as `u32::MAX`, the unsigned form of -1.
            let uid = if valid.contains(SetattrValid::UID) {
                attr.st_uid
            } else {
                // Cannot use -1 here because these are unsigned values.
                u32::MAX
            };
            let gid = if valid.contains(SetattrValid::GID) {
                attr.st_gid
            } else {
                // Cannot use -1 here because these are unsigned values.
                u32::MAX
            };

            // The empty path with AT_EMPTY_PATH makes the call operate on the inode's
            // descriptor itself.
            // SAFETY: this doesn't modify any memory and we check the return value.
            syscall!(unsafe {
                libc::fchownat(
                    inode_data.as_raw_descriptor(),
                    EMPTY_CSTR.as_ptr(),
                    uid,
                    gid,
                    libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
                )
            })?;
        }

        if valid.contains(SetattrValid::SIZE) {
            syscall!(match data {
                Data::Handle(ref fd) => {
                    // SAFETY: this doesn't modify any memory and we check the return value.
                    unsafe { libc::ftruncate64(fd.as_raw_descriptor(), attr.st_size) }
                }
                _ => {
                    // There is no `ftruncateat` so we need to get a new fd and truncate it.
                    let f = self.open_inode(&inode_data, libc::O_NONBLOCK | libc::O_RDWR)?;
                    // SAFETY: this doesn't modify any memory and we check the return value.
                    unsafe { libc::ftruncate64(f.as_raw_descriptor(), attr.st_size) }
                }
            })?;
        }

        if valid.intersects(SetattrValid::ATIME | SetattrValid::MTIME) {
            // Index 0 is the access time and index 1 the modification time. Both start as
            // UTIME_OMIT, so a timestamp that was not requested is left untouched.
            let mut tvs = [
                libc::timespec {
                    tv_sec: 0,
                    tv_nsec: libc::UTIME_OMIT,
                },
                libc::timespec {
                    tv_sec: 0,
                    tv_nsec: libc::UTIME_OMIT,
                },
            ];

            // The `*_NOW` flags take precedence over the explicit values in `attr`.
            if valid.contains(SetattrValid::ATIME_NOW) {
                tvs[0].tv_nsec = libc::UTIME_NOW;
            } else if valid.contains(SetattrValid::ATIME) {
                tvs[0].tv_sec = attr.st_atime;
                tvs[0].tv_nsec = attr.st_atime_nsec;
            }

            if valid.contains(SetattrValid::MTIME_NOW) {
                tvs[1].tv_nsec = libc::UTIME_NOW;
            } else if valid.contains(SetattrValid::MTIME) {
                tvs[1].tv_sec = attr.st_mtime;
                tvs[1].tv_nsec = attr.st_mtime_nsec;
            }

            // SAFETY: this doesn't modify any memory and we check the return value.
            syscall!(unsafe {
                match data {
                    Data::Handle(ref fd) => libc::futimens(fd.as_raw_descriptor(), tvs.as_ptr()),
                    Data::ProcPath(ref p) => {
                        libc::utimensat(self.proc.as_raw_descriptor(), p.as_ptr(), tvs.as_ptr(), 0)
                    }
                }
            })?;
        }

        // Report the attributes as they are after all requested changes.
        self.do_getattr(&inode_data)
    }
3041
3042    fn rename(
3043        &self,
3044        _ctx: Context,
3045        olddir: Inode,
3046        oldname: &CStr,
3047        newdir: Inode,
3048        newname: &CStr,
3049        flags: u32,
3050    ) -> io::Result<()> {
3051        let _trace = fs_trace!(self.tag, "rename", olddir, oldname, newdir, newname, flags);
3052        let old_inode = self.find_inode(olddir)?;
3053        let new_inode = self.find_inode(newdir)?;
3054        // Both old and new names must be writable.
3055        self.authorize_write_path(&old_inode.path, oldname)?;
3056        self.authorize_write_path(&new_inode.path, newname)?;
3057        {
3058            let casefold_cache = self.lock_casefold_lookup_caches();
3059
3060            // SAFETY: this doesn't modify any memory and we check the return value.
3061            // TODO: Switch to libc::renameat2 once https://github.com/rust-lang/libc/pull/1508 lands
3062            // and we have glibc 2.28.
3063            syscall!(unsafe {
3064                libc::syscall(
3065                    libc::SYS_renameat2,
3066                    old_inode.as_raw_descriptor(),
3067                    oldname.as_ptr(),
3068                    new_inode.as_raw_descriptor(),
3069                    newname.as_ptr(),
3070                    flags,
3071                )
3072            })?;
3073            if let Some(mut c) = casefold_cache {
3074                c.remove(olddir, oldname);
3075                c.insert(newdir, newname);
3076            }
3077        }
3078
3079        Ok(())
3080    }
3081
3082    fn mknod(
3083        &self,
3084        ctx: Context,
3085        parent: Inode,
3086        name: &CStr,
3087        mode: u32,
3088        rdev: u32,
3089        umask: u32,
3090        security_ctx: Option<&CStr>,
3091    ) -> io::Result<Entry> {
3092        let _trace = fs_trace!(
3093            self.tag,
3094            "mknod",
3095            parent,
3096            name,
3097            mode,
3098            rdev,
3099            umask,
3100            security_ctx
3101        );
3102        let data = self.find_inode(parent)?;
3103        self.authorize_write_path(&data.path, name)?;
3104
3105        let _ctx = security_ctx
3106            .filter(|ctx| *ctx != UNLABELED_CSTR)
3107            .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
3108            .transpose()?;
3109
3110        #[allow(unused_variables)]
3111        #[cfg(feature = "arc_quota")]
3112        let (uid, gid) = self.change_creds(&ctx, &data, name);
3113        #[cfg(feature = "fs_runtime_ugid_map")]
3114        let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
3115        #[cfg(not(feature = "fs_permission_translation"))]
3116        let (uid, gid) = (ctx.uid, ctx.gid);
3117
3118        let (_uid, _gid) = set_creds(uid, gid)?;
3119        {
3120            let _scoped_umask = ScopedUmask::new(umask);
3121            let casefold_cache = self.lock_casefold_lookup_caches();
3122
3123            // SAFETY: this doesn't modify any memory and we check the return value.
3124            syscall!(unsafe {
3125                libc::mknodat(
3126                    data.as_raw_descriptor(),
3127                    name.as_ptr(),
3128                    mode as libc::mode_t,
3129                    rdev as libc::dev_t,
3130                )
3131            })?;
3132            if let Some(mut c) = casefold_cache {
3133                c.insert(parent, name);
3134            }
3135        }
3136
3137        self.do_lookup(&data, name)
3138    }
3139
3140    fn link(
3141        &self,
3142        _ctx: Context,
3143        inode: Inode,
3144        newparent: Inode,
3145        newname: &CStr,
3146    ) -> io::Result<Entry> {
3147        let _trace = fs_trace!(self.tag, "link", inode, newparent, newname);
3148        let data = self.find_inode(inode)?;
3149        let new_inode = self.find_inode(newparent)?;
3150        self.authorize_write_path(&new_inode.path, newname)?;
3151
3152        let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
3153            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
3154
3155        {
3156            let casefold_cache = self.lock_casefold_lookup_caches();
3157            // SAFETY: this doesn't modify any memory and we check the return value.
3158            syscall!(unsafe {
3159                libc::linkat(
3160                    self.proc.as_raw_descriptor(),
3161                    path.as_ptr(),
3162                    new_inode.as_raw_descriptor(),
3163                    newname.as_ptr(),
3164                    libc::AT_SYMLINK_FOLLOW,
3165                )
3166            })?;
3167            if let Some(mut c) = casefold_cache {
3168                c.insert(newparent, newname);
3169            }
3170        }
3171
3172        self.do_lookup(&new_inode, newname)
3173    }
3174
3175    fn symlink(
3176        &self,
3177        ctx: Context,
3178        linkname: &CStr,
3179        parent: Inode,
3180        name: &CStr,
3181        security_ctx: Option<&CStr>,
3182    ) -> io::Result<Entry> {
3183        let _trace = fs_trace!(self.tag, "symlink", parent, linkname, name, security_ctx);
3184        let data = self.find_inode(parent)?;
3185        self.authorize_write_path(&data.path, name)?;
3186
3187        let _ctx = security_ctx
3188            .filter(|ctx| *ctx != UNLABELED_CSTR)
3189            .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
3190            .transpose()?;
3191
3192        #[allow(unused_variables)]
3193        #[cfg(feature = "arc_quota")]
3194        let (uid, gid) = self.change_creds(&ctx, &data, name);
3195        #[cfg(feature = "fs_runtime_ugid_map")]
3196        let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
3197        #[cfg(not(feature = "fs_permission_translation"))]
3198        let (uid, gid) = (ctx.uid, ctx.gid);
3199
3200        let (_uid, _gid) = set_creds(uid, gid)?;
3201        {
3202            let casefold_cache = self.lock_casefold_lookup_caches();
3203            // SAFETY: this doesn't modify any memory and we check the return value.
3204            syscall!(unsafe {
3205                libc::symlinkat(linkname.as_ptr(), data.as_raw_descriptor(), name.as_ptr())
3206            })?;
3207            if let Some(mut c) = casefold_cache {
3208                c.insert(parent, name);
3209            }
3210        }
3211
3212        self.do_lookup(&data, name)
3213    }
3214
3215    fn readlink(&self, _ctx: Context, inode: Inode) -> io::Result<Vec<u8>> {
3216        let _trace = fs_trace!(self.tag, "readlink", inode);
3217        let data = self.find_inode(inode)?;
3218
3219        let mut buf = vec![0; libc::PATH_MAX as usize];
3220
3221        // SAFETY: this will only modify the contents of `buf` and we check the return value.
3222        let res = syscall!(unsafe {
3223            libc::readlinkat(
3224                data.as_raw_descriptor(),
3225                EMPTY_CSTR.as_ptr(),
3226                buf.as_mut_ptr() as *mut libc::c_char,
3227                buf.len(),
3228            )
3229        })?;
3230
3231        buf.resize(res as usize, 0);
3232
3233        #[cfg(feature = "fs_runtime_ugid_map")]
3234        {
3235            let link_target = Path::new(OsStr::from_bytes(&buf[..res as usize]));
3236            if !link_target.starts_with(&self.root_dir) {
3237                return Err(io::Error::new(
3238                    io::ErrorKind::InvalidInput,
3239                    "Symbolic link points outside of root_dir",
3240                ));
3241            }
3242        }
3243        Ok(buf)
3244    }
3245
3246    fn flush(
3247        &self,
3248        _ctx: Context,
3249        inode: Inode,
3250        handle: Handle,
3251        _lock_owner: u64,
3252    ) -> io::Result<()> {
3253        let _trace = fs_trace!(self.tag, "flush", inode, handle);
3254        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
3255            self.find_inode(inode)?
3256        } else {
3257            self.find_handle(handle, inode)?
3258        };
3259
3260        // SAFETY:
3261        // Since this method is called whenever an fd is closed in the client, we can emulate that
3262        // behavior by doing the same thing (dup-ing the fd and then immediately closing it). Safe
3263        // because this doesn't modify any memory and we check the return values.
3264        unsafe {
3265            let newfd = syscall!(libc::fcntl(
3266                data.as_raw_descriptor(),
3267                libc::F_DUPFD_CLOEXEC,
3268                0
3269            ))?;
3270
3271            syscall!(libc::close(newfd))?;
3272        }
3273        Ok(())
3274    }
3275
3276    fn fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()> {
3277        if self.zero_message_open.load(Ordering::Relaxed) {
3278            let _trace = fs_trace!(self.tag, "fsync (zero-message)", inode, datasync, handle);
3279            let data = self.find_inode(inode)?;
3280            self.do_fsync(&*data, datasync)
3281        } else {
3282            let _trace = fs_trace!(self.tag, "fsync", inode, datasync, handle);
3283            let data = self.find_handle(handle, inode)?;
3284
3285            let file = data.file.lock();
3286            self.do_fsync(&*file, datasync)
3287        }
3288    }
3289
3290    fn fsyncdir(
3291        &self,
3292        _ctx: Context,
3293        inode: Inode,
3294        datasync: bool,
3295        handle: Handle,
3296    ) -> io::Result<()> {
3297        if self.zero_message_opendir.load(Ordering::Relaxed) {
3298            let _trace = fs_trace!(self.tag, "fsyncdir (zero-message)", inode, datasync, handle);
3299            let data = self.find_inode(inode)?;
3300            self.do_fsync(&*data, datasync)
3301        } else {
3302            let _trace = fs_trace!(self.tag, "fsyncdir", inode, datasync, handle);
3303            let data = self.find_handle(handle, inode)?;
3304
3305            let file = data.file.lock();
3306            self.do_fsync(&*file, datasync)
3307        }
3308    }
3309
3310    fn access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()> {
3311        let _trace = fs_trace!(self.tag, "access", inode, mask);
3312        let data = self.find_inode(inode)?;
3313
3314        let st = stat(&*data)?;
3315        let mode = mask as i32 & (libc::R_OK | libc::W_OK | libc::X_OK);
3316
3317        if mode == libc::F_OK {
3318            // The file exists since we were able to call `stat(2)` on it.
3319            return Ok(());
3320        }
3321
3322        if (mode & libc::R_OK) != 0 {
3323            if ctx.uid != 0
3324                && (st.st_uid != ctx.uid || st.st_mode & 0o400 == 0)
3325                && (st.st_gid != ctx.gid || st.st_mode & 0o040 == 0)
3326                && st.st_mode & 0o004 == 0
3327            {
3328                return Err(io::Error::from_raw_os_error(libc::EACCES));
3329            }
3330        }
3331
3332        if (mode & libc::W_OK) != 0 {
3333            if ctx.uid != 0
3334                && (st.st_uid != ctx.uid || st.st_mode & 0o200 == 0)
3335                && (st.st_gid != ctx.gid || st.st_mode & 0o020 == 0)
3336                && st.st_mode & 0o002 == 0
3337            {
3338                return Err(io::Error::from_raw_os_error(libc::EACCES));
3339            }
3340        }
3341
3342        // root can only execute something if it is executable by one of the owner, the group, or
3343        // everyone.
3344        if (mode & libc::X_OK) != 0 {
3345            if (ctx.uid != 0 || st.st_mode & 0o111 == 0)
3346                && (st.st_uid != ctx.uid || st.st_mode & 0o100 == 0)
3347                && (st.st_gid != ctx.gid || st.st_mode & 0o010 == 0)
3348                && st.st_mode & 0o001 == 0
3349            {
3350                return Err(io::Error::from_raw_os_error(libc::EACCES));
3351            }
3352        }
3353
3354        Ok(())
3355    }
3356
3357    fn setxattr(
3358        &self,
3359        _ctx: Context,
3360        inode: Inode,
3361        name: &CStr,
3362        value: &[u8],
3363        flags: u32,
3364    ) -> io::Result<()> {
3365        let _trace = fs_trace!(self.tag, "setxattr", inode, name, flags);
3366        // We can't allow the VM to set this xattr because an unprivileged process may use it to set
3367        // a privileged xattr.
3368        if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
3369            return Err(io::Error::from_raw_os_error(libc::EPERM));
3370        }
3371
3372        let data = self.find_inode(inode)?;
3373        let name = self.rewrite_xattr_name(name);
3374
3375        #[cfg(feature = "arc_quota")]
3376        if self.skip_host_set_xattr(&data.path, &name.to_string_lossy()) {
3377            debug!(
3378                "ignore setxattr for path:{} xattr_name:{}",
3379                &data.path,
3380                &name.to_string_lossy()
3381            );
3382            return Ok(());
3383        }
3384
3385        let file = data.file.lock();
3386        let o_path_file = (file.open_flags & libc::O_PATH) != 0;
3387        if o_path_file {
3388            // For FDs opened with `O_PATH`, we cannot call `fsetxattr` normally. Instead we emulate
3389            // an _at syscall by changing the CWD to /proc, running the path based syscall, and then
3390            // setting the CWD back to the root directory.
3391            let path = CString::new(format!("self/fd/{}", file.as_raw_descriptor()))
3392                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
3393
3394            syscall!(self.with_proc_chdir(|| {
3395                // SAFETY: this doesn't modify any memory and we check the return value.
3396                unsafe {
3397                    libc::setxattr(
3398                        path.as_ptr(),
3399                        name.as_ptr(),
3400                        value.as_ptr() as *const libc::c_void,
3401                        value.len() as libc::size_t,
3402                        flags as c_int,
3403                    )
3404                }
3405            }))?;
3406        } else {
3407            syscall!(
3408                // For regular files and directories, we can just use fsetxattr.
3409                // SAFETY: this doesn't modify any memory and we check the return value.
3410                unsafe {
3411                    libc::fsetxattr(
3412                        file.as_raw_descriptor(),
3413                        name.as_ptr(),
3414                        value.as_ptr() as *const libc::c_void,
3415                        value.len() as libc::size_t,
3416                        flags as c_int,
3417                    )
3418                }
3419            )?;
3420        }
3421
3422        Ok(())
3423    }
3424
3425    fn getxattr(
3426        &self,
3427        _ctx: Context,
3428        inode: Inode,
3429        name: &CStr,
3430        size: u32,
3431    ) -> io::Result<GetxattrReply> {
3432        let _trace = fs_trace!(self.tag, "getxattr", inode, name, size);
3433        // We don't allow the VM to set this xattr so we also pretend there is no value associated
3434        // with it.
3435        if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
3436            return Err(io::Error::from_raw_os_error(libc::ENODATA));
3437        }
3438
3439        let data = self.find_inode(inode)?;
3440        let name = self.rewrite_xattr_name(name);
3441        let mut buf = vec![0u8; size as usize];
3442
3443        #[cfg(feature = "arc_quota")]
3444        let res = self.do_getxattr_with_filter(data, name, &mut buf)?;
3445
3446        #[cfg(not(feature = "arc_quota"))]
3447        let res = self.do_getxattr(&data, &name, &mut buf[..])?;
3448
3449        if size == 0 {
3450            Ok(GetxattrReply::Count(res as u32))
3451        } else {
3452            buf.truncate(res);
3453            Ok(GetxattrReply::Value(buf))
3454        }
3455    }
3456
3457    fn listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result<ListxattrReply> {
3458        let _trace = fs_trace!(self.tag, "listxattr", inode, size);
3459        let data = self.find_inode(inode)?;
3460
3461        let mut buf = vec![0u8; size as usize];
3462
3463        let file = data.file.lock();
3464        let o_path_file = (file.open_flags & libc::O_PATH) != 0;
3465        let res = if o_path_file {
3466            // For FDs opened with `O_PATH`, we cannot call `flistxattr` normally. Instead we
3467            // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
3468            // and then setting the CWD back to the root directory.
3469            let path = CString::new(format!("self/fd/{}", file.as_raw_descriptor()))
3470                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
3471
3472            // SAFETY: this will only modify `buf` and we check the return value.
3473            syscall!(self.with_proc_chdir(|| unsafe {
3474                libc::listxattr(
3475                    path.as_ptr(),
3476                    buf.as_mut_ptr() as *mut libc::c_char,
3477                    buf.len() as libc::size_t,
3478                )
3479            }))?
3480        } else {
3481            // For regular files and directories, we can just flistxattr.
3482            // SAFETY: this will only write to `buf` and we check the return value.
3483            syscall!(unsafe {
3484                libc::flistxattr(
3485                    file.as_raw_descriptor(),
3486                    buf.as_mut_ptr() as *mut libc::c_char,
3487                    buf.len() as libc::size_t,
3488                )
3489            })?
3490        };
3491
3492        if size == 0 {
3493            Ok(ListxattrReply::Count(res as u32))
3494        } else {
3495            buf.truncate(res as usize);
3496
3497            if self.cfg.rewrite_security_xattrs {
3498                strip_xattr_prefix(&mut buf);
3499            }
3500            Ok(ListxattrReply::Names(buf))
3501        }
3502    }
3503
3504    fn removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()> {
3505        let _trace = fs_trace!(self.tag, "removexattr", inode, name);
3506        // We don't allow the VM to set this xattr so we also pretend there is no value associated
3507        // with it.
3508        if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
3509            return Err(io::Error::from_raw_os_error(libc::ENODATA));
3510        }
3511
3512        let data = self.find_inode(inode)?;
3513        let name = self.rewrite_xattr_name(name);
3514
3515        let file = data.file.lock();
3516        let o_path_file = (file.open_flags & libc::O_PATH) != 0;
3517        if o_path_file {
3518            // For files opened with `O_PATH`, we cannot call `fremovexattr` normally. Instead we
3519            // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
3520            // and then setting the CWD back to the root directory.
3521            let path = CString::new(format!("self/fd/{}", file.as_raw_descriptor()))
3522                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
3523
3524            syscall!(self.with_proc_chdir(||
3525                    // SAFETY: this doesn't modify any memory and we check the return value.
3526                    unsafe { libc::removexattr(path.as_ptr(), name.as_ptr()) }))?;
3527        } else {
3528            // For regular files and directories, we can just use fremovexattr.
3529            syscall!(
3530                // SAFETY: this doesn't modify any memory and we check the return value.
3531                unsafe { libc::fremovexattr(file.as_raw_descriptor(), name.as_ptr()) }
3532            )?;
3533        }
3534
3535        Ok(())
3536    }
3537
3538    fn fallocate(
3539        &self,
3540        _ctx: Context,
3541        inode: Inode,
3542        handle: Handle,
3543        mode: u32,
3544        offset: u64,
3545        length: u64,
3546    ) -> io::Result<()> {
3547        let _trace = fs_trace!(self.tag, "fallocate", inode, handle, mode, offset, length);
3548
3549        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
3550            let data = self.find_inode(inode)?;
3551
3552            {
3553                // fallocate needs a writable fd
3554                let mut file = data.file.lock();
3555                let mut flags = file.open_flags;
3556                match flags & libc::O_ACCMODE {
3557                    libc::O_RDONLY => {
3558                        flags &= !libc::O_RDONLY;
3559                        flags |= libc::O_RDWR;
3560
3561                        // We need to get a writable handle for this file.
3562                        let newfile = self.open_fd(file.as_raw_descriptor(), libc::O_RDWR)?;
3563                        *file = OpenedFile::new(newfile, flags);
3564                    }
3565                    libc::O_WRONLY | libc::O_RDWR => {}
3566                    _ => panic!("Unexpected flags: {flags:#x}"),
3567                }
3568            }
3569
3570            data
3571        } else {
3572            self.find_handle(handle, inode)?
3573        };
3574
3575        let fd = data.as_raw_descriptor();
3576        // SAFETY: this doesn't modify any memory and we check the return value.
3577        syscall!(unsafe {
3578            libc::fallocate64(
3579                fd,
3580                mode as libc::c_int,
3581                offset as libc::off64_t,
3582                length as libc::off64_t,
3583            )
3584        })?;
3585
3586        Ok(())
3587    }
3588
3589    #[allow(clippy::unnecessary_cast)]
3590    fn ioctl<R: io::Read>(
3591        &self,
3592        ctx: Context,
3593        inode: Inode,
3594        handle: Handle,
3595        _flags: IoctlFlags,
3596        cmd: u32,
3597        _arg: u64,
3598        in_size: u32,
3599        out_size: u32,
3600        r: R,
3601    ) -> io::Result<IoctlReply> {
3602        let _trace = fs_trace!(self.tag, "ioctl", inode, handle, cmd, in_size, out_size);
3603
3604        match cmd as IoctlNr {
3605            FS_IOC_GET_ENCRYPTION_POLICY_EX => self.get_encryption_policy_ex(inode, handle, r),
3606            FS_IOC_FSGETXATTR => {
3607                if out_size < size_of::<fsxattr>() as u32 {
3608                    Err(io::Error::from_raw_os_error(libc::ENOMEM))
3609                } else {
3610                    self.get_fsxattr(inode, handle)
3611                }
3612            }
3613            FS_IOC_FSSETXATTR => {
3614                if in_size < size_of::<fsxattr>() as u32 {
3615                    Err(io::Error::from_raw_os_error(libc::EINVAL))
3616                } else {
3617                    self.set_fsxattr(ctx, inode, handle, r)
3618                }
3619            }
3620            FS_IOC32_GETFLAGS | FS_IOC64_GETFLAGS => {
3621                if out_size < size_of::<c_int>() as u32 {
3622                    Err(io::Error::from_raw_os_error(libc::ENOMEM))
3623                } else {
3624                    self.get_flags(inode, handle)
3625                }
3626            }
3627            FS_IOC32_SETFLAGS | FS_IOC64_SETFLAGS => {
3628                if in_size < size_of::<c_int>() as u32 {
3629                    Err(io::Error::from_raw_os_error(libc::ENOMEM))
3630                } else {
3631                    self.set_flags(ctx, inode, handle, r)
3632                }
3633            }
3634            FS_IOC_ENABLE_VERITY => {
3635                if in_size < size_of::<fsverity_enable_arg>() as u32 {
3636                    Err(io::Error::from_raw_os_error(libc::ENOMEM))
3637                } else {
3638                    self.enable_verity(inode, handle, r)
3639                }
3640            }
3641            FS_IOC_MEASURE_VERITY => {
3642                if in_size < size_of::<fsverity_digest>() as u32
3643                    || out_size < size_of::<fsverity_digest>() as u32
3644                {
3645                    Err(io::Error::from_raw_os_error(libc::ENOMEM))
3646                } else {
3647                    self.measure_verity(inode, handle, r, out_size)
3648                }
3649            }
3650            // The following is ARCVM-specific ioctl
3651            // Refer go/remove-mount-passthrough-fuse for more design details
3652            #[cfg(feature = "arc_quota")]
3653            FS_IOC_SETPERMISSION => {
3654                if in_size != size_of::<FsPermissionDataBuffer>() as u32 {
3655                    Err(io::Error::from_raw_os_error(libc::EINVAL))
3656                } else {
3657                    Ok(self.set_permission_by_path(r))
3658                }
3659            }
3660            #[cfg(feature = "arc_quota")]
3661            FS_IOC_SETPATHXATTR => {
3662                if in_size != size_of::<FsPathXattrDataBuffer>() as u32 {
3663                    Err(io::Error::from_raw_os_error(libc::EINVAL))
3664                } else {
3665                    Ok(self.set_xattr_by_path(r))
3666                }
3667            }
3668            _ => Err(io::Error::from_raw_os_error(libc::ENOTTY)),
3669        }
3670    }
3671
3672    fn copy_file_range(
3673        &self,
3674        ctx: Context,
3675        inode_src: Inode,
3676        handle_src: Handle,
3677        offset_src: u64,
3678        inode_dst: Inode,
3679        handle_dst: Handle,
3680        offset_dst: u64,
3681        length: u64,
3682        flags: u64,
3683    ) -> io::Result<usize> {
3684        let _trace = fs_trace!(
3685            self.tag,
3686            "copy_file_range",
3687            inode_src,
3688            handle_src,
3689            offset_src,
3690            inode_dst,
3691            handle_dst,
3692            offset_dst,
3693            length,
3694            flags
3695        );
3696        let dst_inode_data = self.find_inode(inode_dst)?;
3697
3698        #[allow(unused_variables)]
3699        #[cfg(feature = "arc_quota")]
3700        let (uid, gid) = self.change_creds_for_path(&ctx, &dst_inode_data.path);
3701        #[cfg(feature = "fs_runtime_ugid_map")]
3702        let (uid, gid) = self.change_ugid_creds_for_path(&ctx, &dst_inode_data.path);
3703        #[cfg(not(feature = "fs_permission_translation"))]
3704        let (uid, gid) = (ctx.uid, ctx.gid);
3705
3706        // We need to change credentials during a write so that the kernel will remove setuid or
3707        // setgid bits from the file if it was written to by someone other than the owner.
3708        let (_uid, _gid) = set_creds(uid, gid)?;
3709        let (src_data, dst_data): (Arc<dyn AsRawDescriptor>, Arc<dyn AsRawDescriptor>) =
3710            if self.zero_message_open.load(Ordering::Relaxed) {
3711                (self.find_inode(inode_src)?, dst_inode_data)
3712            } else {
3713                (
3714                    self.find_handle(handle_src, inode_src)?,
3715                    self.find_handle(handle_dst, inode_dst)?,
3716                )
3717            };
3718
3719        let src = src_data.as_raw_descriptor();
3720        let dst = dst_data.as_raw_descriptor();
3721
3722        Ok(syscall!(
3723            // SAFETY: this call is safe because it doesn't modify any memory and we
3724            // check the return value.
3725            unsafe {
3726                libc::syscall(
3727                    libc::SYS_copy_file_range,
3728                    src,
3729                    &offset_src,
3730                    dst,
3731                    &offset_dst,
3732                    length,
3733                    flags,
3734                )
3735            }
3736        )? as usize)
3737    }
3738
3739    fn set_up_mapping<M: Mapper>(
3740        &self,
3741        _ctx: Context,
3742        inode: Self::Inode,
3743        _handle: Self::Handle,
3744        file_offset: u64,
3745        mem_offset: u64,
3746        size: usize,
3747        prot: u32,
3748        mapper: M,
3749    ) -> io::Result<()> {
3750        let _trace = fs_trace!(
3751            self.tag,
3752            "set_up_mapping",
3753            inode,
3754            file_offset,
3755            mem_offset,
3756            size,
3757            prot
3758        );
3759        if !self.cfg.use_dax {
3760            return Err(io::Error::from_raw_os_error(libc::ENOSYS));
3761        }
3762
3763        let read = prot & libc::PROT_READ as u32 != 0;
3764        let write = prot & libc::PROT_WRITE as u32 != 0;
3765        let (mmap_flags, prot) = match (read, write) {
3766            (true, true) => (libc::O_RDWR, Protection::read_write()),
3767            (true, false) => (libc::O_RDONLY, Protection::read()),
3768            // Write-only is mapped to O_RDWR since mmap always requires an fd opened for reading.
3769            (false, true) => (libc::O_RDWR, Protection::write()),
3770            (false, false) => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
3771        };
3772
3773        let data = self.find_inode(inode)?;
3774
3775        if self.zero_message_open.load(Ordering::Relaxed) {
3776            let mut file = data.file.lock();
3777            let mut open_flags = file.open_flags;
3778            match (mmap_flags, open_flags & libc::O_ACCMODE) {
3779                (libc::O_RDONLY, libc::O_WRONLY)
3780                | (libc::O_RDWR, libc::O_RDONLY)
3781                | (libc::O_RDWR, libc::O_WRONLY) => {
3782                    // We have a read-only or write-only fd and we need to upgrade it.
3783                    open_flags &= !libc::O_ACCMODE;
3784                    open_flags |= libc::O_RDWR;
3785
3786                    let newfile = self.open_fd(file.as_raw_descriptor(), libc::O_RDWR)?;
3787                    *file = OpenedFile::new(newfile, open_flags);
3788                }
3789                (libc::O_RDONLY, libc::O_RDONLY)
3790                | (libc::O_RDONLY, libc::O_RDWR)
3791                | (libc::O_RDWR, libc::O_RDWR) => {}
3792                (m, o) => panic!("Unexpected combination of access flags: ({m:#x}, {o:#x})"),
3793            }
3794            mapper.map(mem_offset, size, file.file(), file_offset, prot)
3795        } else {
3796            let file = self.open_inode(&data, mmap_flags | libc::O_NONBLOCK)?;
3797            mapper.map(mem_offset, size, &file, file_offset, prot)
3798        }
3799    }
3800
3801    fn remove_mapping<M: Mapper>(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()> {
3802        let _trace = fs_trace!(self.tag, "remove_mapping", msgs);
3803        if !self.cfg.use_dax {
3804            return Err(io::Error::from_raw_os_error(libc::ENOSYS));
3805        }
3806
3807        for RemoveMappingOne { moffset, len } in msgs {
3808            mapper.unmap(*moffset, *len)?;
3809        }
3810        Ok(())
3811    }
3812
3813    fn atomic_open(
3814        &self,
3815        ctx: Context,
3816        parent: Self::Inode,
3817        name: &CStr,
3818        mode: u32,
3819        flags: u32,
3820        umask: u32,
3821        security_ctx: Option<&CStr>,
3822    ) -> io::Result<(Entry, Option<Self::Handle>, OpenOptions)> {
3823        validate_path_component(name)?;
3824        let _trace = fs_trace!(
3825            self.tag,
3826            "atomic_open",
3827            parent,
3828            name,
3829            mode,
3830            flags,
3831            umask,
3832            security_ctx
3833        );
3834        // Perform lookup but not create negative dentry
3835        let data = self.find_inode(parent)?;
3836
3837        #[allow(unused_variables)]
3838        #[cfg(feature = "arc_quota")]
3839        let (uid, gid) = self.change_creds(&ctx, &data, name);
3840        #[cfg(feature = "fs_runtime_ugid_map")]
3841        let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
3842        #[cfg(not(feature = "fs_permission_translation"))]
3843        let (uid, gid) = (ctx.uid, ctx.gid);
3844
3845        let (_uid, _gid) = set_creds(uid, gid)?;
3846
3847        // This lookup serves two purposes:
3848        // 1. If the O_CREATE flag is not set, it retrieves the d_entry for the file.
3849        // 2. If the O_CREATE flag is set, it checks whether the file exists.
3850        let res = self.do_lookup_with_casefold_fallback(&data, name);
3851
3852        if let Err(e) = res {
3853            if e.kind() == std::io::ErrorKind::NotFound && (flags as i32 & libc::O_CREAT) != 0 {
3854                // If the file did not exist & O_CREAT is set,
3855                // create file & set FILE_CREATED bits in open options
3856                let (entry, handler, mut opts) =
3857                    self.create(ctx, parent, name, mode, flags, umask, security_ctx)?;
3858                opts |= OpenOptions::FILE_CREATED;
3859                return Ok((entry, handler, opts));
3860            } else if e.kind() == std::io::ErrorKind::NotFound
3861                && !self.cfg.negative_timeout.is_zero()
3862            {
3863                return Ok((
3864                    Entry::new_negative(self.cfg.negative_timeout),
3865                    None,
3866                    OpenOptions::empty(),
3867                ));
3868            }
3869            return Err(e);
3870        }
3871
3872        // SAFETY: checked res is not error before
3873        let entry = res.unwrap();
3874
3875        if entry.attr.st_mode & libc::S_IFMT == libc::S_IFLNK {
3876            return Ok((entry, None, OpenOptions::empty()));
3877        }
3878
3879        if (flags as i32 & (libc::O_CREAT | libc::O_EXCL)) == (libc::O_CREAT | libc::O_EXCL) {
3880            return Err(eexist());
3881        }
3882
3883        let (handler, opts) = if self.zero_message_open.load(Ordering::Relaxed) {
3884            (None, OpenOptions::KEEP_CACHE)
3885        } else {
3886            let (handler, opts) = self.do_open(entry.inode, flags)?;
3887            (handler, opts)
3888        };
3889        Ok((entry, handler, opts))
3890    }
3891}
3892
3893#[cfg(test)]
3894mod tests {
3895    use std::path::Path;
3896
3897    use named_lock::NamedLock;
3898    use tempfile::TempDir;
3899
3900    use super::*;
3901    #[cfg(feature = "arc_quota")]
3902    use crate::virtio::fs::arc_ioctl::FS_IOCTL_PATH_MAX_LEN;
3903    #[cfg(feature = "arc_quota")]
3904    use crate::virtio::fs::arc_ioctl::FS_IOCTL_XATTR_NAME_MAX_LEN;
3905    #[cfg(feature = "arc_quota")]
3906    use crate::virtio::fs::arc_ioctl::FS_IOCTL_XATTR_VALUE_MAX_LEN;
3907
3908    const UNITTEST_LOCK_NAME: &str = "passthroughfs_unittest_lock";
3909
3910    #[test]
3911    fn test_passthrough_fs_allowlist() {
3912        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
3913        let _guard = lock.lock().expect("acquire named lock");
3914
3915        let temp_dir = TempDir::new().unwrap();
3916        create_test_data(
3917            &temp_dir,
3918            &["allowed", "blocked"],
3919            &["allowed/a.txt", "blocked/b.txt"],
3920        );
3921
3922        let cfg = Default::default();
3923        let mut fs = PassthroughFs::new("tag", cfg).unwrap();
3924
3925        let capable = FsOptions::empty();
3926        fs.init(capable).unwrap();
3927
3928        let allowlist = Arc::new(RwLock::new(PathAllowlist::new()));
3929        fs.set_allowlist(Some(allowlist.clone()));
3930
3931        let allowed_path = temp_dir
3932            .path()
3933            .join("allowed")
3934            .to_string_lossy()
3935            .into_owned();
3936        allowlist.write().unwrap().add_path(allowed_path);
3937
3938        // 1. Verify Lookups of allowed paths succeed
3939        assert!(lookup(&fs, &temp_dir.path().join("allowed")).is_ok());
3940        assert!(lookup(&fs, &temp_dir.path().join("allowed/a.txt")).is_ok());
3941
3942        // 2. Verify Lookups of blocked paths fail with NotFound (ENOENT)
3943        let blocked_err = lookup(&fs, &temp_dir.path().join("blocked"))
3944            .expect_err("blocked directory must not be accessible");
3945        assert_eq!(blocked_err.kind(), io::ErrorKind::NotFound);
3946
3947        let blocked_file_err = lookup(&fs, &temp_dir.path().join("blocked/b.txt"))
3948            .expect_err("blocked file must not be accessible");
3949        assert_eq!(blocked_file_err.kind(), io::ErrorKind::NotFound);
3950
3951        // 3. Verify Write/Creation of allowed paths succeed
3952        assert!(create(&fs, &temp_dir.path().join("allowed/new_file.txt")).is_ok());
3953
3954        // 4. Verify Write/Creation inside a blocked directory fails with NotFound (because the
3955        //    parent directory cannot be resolved/looked up)
3956        let blocked_dir_write_err = create(&fs, &temp_dir.path().join("blocked/new_file.txt"))
3957            .expect_err("parent directory must not be lookupable");
3958        assert_eq!(blocked_dir_write_err.kind(), io::ErrorKind::NotFound);
3959
3960        // 5. Verify Write/Creation directly in the ancestor directory (root/temp_dir) fails with
3961        //    PermissionDenied (EACCES)
3962        // (the parent directory lookup succeeds because ancestors are allowed, but the write itself
3963        // is blocked)
3964        let ancestor_write_err = create(&fs, &temp_dir.path().join("new_file_in_ancestor.txt"))
3965            .expect_err("ancestor directory must not be writable");
3966        assert_eq!(ancestor_write_err.kind(), io::ErrorKind::PermissionDenied);
3967    }
3968
3969    #[test]
3970    fn test_passthrough_fs_revocation() {
3971        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
3972        let _guard = lock.lock().expect("acquire named lock");
3973
3974        let temp_dir = TempDir::new().unwrap();
3975        create_test_data(&temp_dir, &["allowed"], &["allowed/a.txt"]);
3976
3977        // Write some data to the file
3978        let file_path = temp_dir.path().join("allowed/a.txt");
3979        std::fs::write(&file_path, b"hello revocation").unwrap();
3980
3981        let cfg = Default::default();
3982        let mut fs = PassthroughFs::new("tag", cfg).unwrap();
3983
3984        let capable = FsOptions::empty();
3985        fs.init(capable).unwrap();
3986
3987        let allowlist = Arc::new(RwLock::new(PathAllowlist::new()));
3988        fs.set_allowlist(Some(allowlist.clone()));
3989
3990        let allowed_path = temp_dir
3991            .path()
3992            .join("allowed")
3993            .to_string_lossy()
3994            .into_owned();
3995        allowlist.write().unwrap().add_path(allowed_path.clone());
3996
3997        // 1. Lookup & Open allowed file to get Inode and Handle
3998        let inode = lookup(&fs, &file_path).expect("lookup failed");
3999        let ctx = get_context();
4000        let (handle, _) = fs
4001            .open(ctx, inode, libc::O_RDONLY as u32)
4002            .expect("open failed");
4003        let handle = handle.expect("no handle returned");
4004
4005        // 2. Remove path from allowlist
4006        allowlist.write().unwrap().remove_path(&allowed_path);
4007
4008        // 3. Verify that reading from the already open handle still succeeds
4009        struct DummyWriter {
4010            data: Vec<u8>,
4011        }
4012        impl io::Write for DummyWriter {
4013            fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
4014                self.data.extend_from_slice(buf);
4015                Ok(buf.len())
4016            }
4017            fn flush(&mut self) -> io::Result<()> {
4018                Ok(())
4019            }
4020        }
4021        impl ZeroCopyWriter for DummyWriter {
4022            fn write_from(&mut self, f: &mut File, count: usize, off: u64) -> io::Result<usize> {
4023                use std::os::unix::fs::FileExt;
4024                let mut buf = vec![0; count];
4025                let n = f.read_at(&mut buf, off)?;
4026                self.data.extend_from_slice(&buf[..n]);
4027                Ok(n)
4028            }
4029        }
4030
4031        let mut writer = DummyWriter { data: Vec::new() };
4032        let read_bytes = fs
4033            .read(
4034                ctx,
4035                inode,
4036                handle,
4037                &mut writer,
4038                16,
4039                0,
4040                None,
4041                libc::O_RDONLY as u32,
4042            )
4043            .expect("read failed");
4044
4045        assert_eq!(read_bytes, 16);
4046        assert_eq!(writer.data, b"hello revocation");
4047
4048        // 4. Verify that new lookup of the file now fails
4049        let lookup_res = lookup(&fs, &file_path);
4050        assert!(lookup_res.is_err());
4051        assert_eq!(lookup_res.unwrap_err().kind(), io::ErrorKind::NotFound);
4052    }
4053
4054    // Create an instance of `Context` with valid uid, gid, and pid.
4055    // The correct ids are necessary for test cases where new files are created.
4056    fn get_context() -> Context {
4057        // SAFETY: both calls take no parameters and only return an integer value. The kernel also
4058        // guarantees that they can never fail.
4059        let uid = unsafe { libc::syscall(SYS_GETEUID) as libc::uid_t };
4060        // SAFETY: both calls take no parameters and only return an integer value. The kernel also
4061        // guarantees that they can never fail.
4062        let gid = unsafe { libc::syscall(SYS_GETEGID) as libc::gid_t };
4063        let pid = std::process::id() as libc::pid_t;
4064        Context { uid, gid, pid }
4065    }
4066
4067    /// Creates the given directories and files under `temp_dir`.
4068    fn create_test_data(temp_dir: &TempDir, dirs: &[&str], files: &[&str]) {
4069        let path = temp_dir.path();
4070
4071        for d in dirs {
4072            std::fs::create_dir_all(path.join(d)).unwrap();
4073        }
4074
4075        for f in files {
4076            File::create(path.join(f)).unwrap();
4077        }
4078    }
4079
4080    /// Looks up the given `path` in `fs`.
4081    fn lookup(fs: &PassthroughFs, path: &Path) -> io::Result<Inode> {
4082        let mut inode = 1;
4083        let ctx = get_context();
4084        for name in path.iter() {
4085            let name = CString::new(name.to_str().unwrap()).unwrap();
4086            let ent = match fs.lookup(ctx, inode, &name) {
4087                Ok(ent) => ent,
4088                Err(e) => {
4089                    return Err(e);
4090                }
4091            };
4092            inode = ent.inode;
4093        }
4094        Ok(inode)
4095    }
4096
4097    /// Looks up the given `path` in `fs`.
4098    #[cfg(feature = "arc_quota")]
4099    fn lookup_ent(fs: &PassthroughFs, path: &Path) -> io::Result<Entry> {
4100        let mut inode = 1;
4101        let ctx = get_context();
4102        let mut entry = Entry::new_negative(Duration::from_secs(10));
4103        for name in path.iter() {
4104            let name = CString::new(name.to_str().unwrap()).unwrap();
4105            entry = match fs.lookup(ctx, inode, &name) {
4106                Ok(ent) => ent,
4107                Err(e) => {
4108                    return Err(e);
4109                }
4110            };
4111            inode = entry.inode;
4112        }
4113        Ok(entry)
4114    }
4115
4116    /// Creates a file at the given `path`.
4117    fn create(fs: &PassthroughFs, path: &Path) -> io::Result<Entry> {
4118        let parent = path.parent().unwrap();
4119        let filename = CString::new(path.file_name().unwrap().to_str().unwrap()).unwrap();
4120        let parent_inode = lookup(fs, parent)?;
4121        let ctx = get_context();
4122        let security_ctx = None;
4123        fs.create(
4124            ctx,
4125            parent_inode,
4126            &filename,
4127            0o666,
4128            libc::O_RDWR as u32,
4129            0,
4130            security_ctx,
4131        )
4132        .map(|(entry, _, _)| entry)
4133    }
4134
4135    /// Removes a file at the given `path`.
4136    fn unlink(fs: &PassthroughFs, path: &Path) -> io::Result<()> {
4137        let parent = path.parent().unwrap();
4138        let filename = CString::new(path.file_name().unwrap().to_str().unwrap()).unwrap();
4139        let parent_inode = lookup(fs, parent)?;
4140        let ctx = get_context();
4141        fs.unlink(ctx, parent_inode, &filename)
4142    }
4143
4144    /// Forgets cache.
4145    fn forget(fs: &PassthroughFs, path: &Path) -> io::Result<()> {
4146        let ctx = get_context();
4147        let inode = lookup(fs, path)?;
4148        // Pass `u64::MAX` to ensure that the refcount goes to 0 and we forget inode.
4149        fs.forget(ctx, inode, u64::MAX);
4150        Ok(())
4151    }
4152
4153    /// Looks up and open the given `path` in `fs`.
4154    fn atomic_open(
4155        fs: &PassthroughFs,
4156        path: &Path,
4157        mode: u32,
4158        flags: u32,
4159        umask: u32,
4160        security_ctx: Option<&CStr>,
4161    ) -> io::Result<(Entry, Option<Handle>, OpenOptions)> {
4162        let mut inode = 1;
4163        let ctx = get_context();
4164
4165        let path_vec: Vec<_> = path.iter().collect();
4166        let vec_len = path_vec.len();
4167
4168        // Do lookup before util (vec_len-1)-th pathname, this operation is to simulate
4169        // the behavior of VFS, since when VFS call atomic_open only at last look up.
4170        for name in &path_vec[0..vec_len - 1] {
4171            let name = CString::new(name.to_str().unwrap()).unwrap();
4172            let ent = fs.lookup(ctx, inode, &name)?;
4173            inode = ent.inode;
4174        }
4175
4176        let name = CString::new(path_vec[vec_len - 1].to_str().unwrap()).unwrap();
4177
4178        fs.atomic_open(ctx, inode, &name, mode, flags, umask, security_ctx)
4179    }
4180
4181    fn symlink(
4182        fs: &PassthroughFs,
4183        linkname: &Path,
4184        path: &Path,
4185        security_ctx: Option<&CStr>,
4186    ) -> io::Result<Entry> {
4187        let parent = path.parent().unwrap();
4188        let filename = CString::new(path.file_name().unwrap().to_str().unwrap()).unwrap();
4189        let parent_inode = lookup(fs, parent)?;
4190        let ctx = get_context();
4191        let linkname = CString::new(linkname.to_str().unwrap()).unwrap();
4192        fs.symlink(ctx, &linkname, parent_inode, &filename, security_ctx)
4193    }
4194
4195    // In this ioctl inode,handle,flags,arg and out_size is irrelavant, set to empty value.
4196    #[cfg(feature = "arc_quota")]
4197    fn fs_ioc_setpermission<R: io::Read>(
4198        fs: &PassthroughFs,
4199        in_size: u32,
4200        r: R,
4201    ) -> io::Result<IoctlReply> {
4202        let ctx = get_context();
4203        fs.ioctl(
4204            ctx,
4205            0,
4206            0,
4207            IoctlFlags::empty(),
4208            FS_IOC_SETPERMISSION as u32,
4209            0,
4210            in_size,
4211            0,
4212            r,
4213        )
4214    }
4215
4216    // In this ioctl inode,handle,flags,arg and out_size is irrelavant, set to empty value.
4217    #[cfg(feature = "arc_quota")]
4218    fn fs_ioc_setpathxattr<R: io::Read>(
4219        fs: &PassthroughFs,
4220        in_size: u32,
4221        r: R,
4222    ) -> io::Result<IoctlReply> {
4223        let ctx = get_context();
4224        fs.ioctl(
4225            ctx,
4226            0,
4227            0,
4228            IoctlFlags::empty(),
4229            FS_IOC_SETPATHXATTR as u32,
4230            0,
4231            in_size,
4232            0,
4233            r,
4234        )
4235    }
4236
4237    #[test]
4238    fn rewrite_xattr_names() {
4239        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4240        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4241        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4242        let _guard = lock.lock().expect("acquire named lock");
4243
4244        let cfg = Config {
4245            rewrite_security_xattrs: true,
4246            ..Default::default()
4247        };
4248
4249        let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
4250
4251        // Selinux shouldn't get overwritten.
4252        let selinux = c"security.selinux";
4253        assert_eq!(p.rewrite_xattr_name(selinux).to_bytes(), selinux.to_bytes());
4254
4255        // user, trusted, and system should not be changed either.
4256        let user = c"user.foobar";
4257        assert_eq!(p.rewrite_xattr_name(user).to_bytes(), user.to_bytes());
4258        let trusted = c"trusted.foobar";
4259        assert_eq!(p.rewrite_xattr_name(trusted).to_bytes(), trusted.to_bytes());
4260        let system = c"system.foobar";
4261        assert_eq!(p.rewrite_xattr_name(system).to_bytes(), system.to_bytes());
4262
4263        // sehash should be re-written.
4264        let sehash = c"security.sehash";
4265        assert_eq!(
4266            p.rewrite_xattr_name(sehash).to_bytes(),
4267            b"user.virtiofs.security.sehash"
4268        );
4269    }
4270
4271    #[test]
4272    fn strip_xattr_names() {
4273        let only_nuls = b"\0\0\0\0\0";
4274        let mut actual = only_nuls.to_vec();
4275        strip_xattr_prefix(&mut actual);
4276        assert_eq!(&actual[..], &only_nuls[..]);
4277
4278        let no_nuls = b"security.sehashuser.virtiofs";
4279        let mut actual = no_nuls.to_vec();
4280        strip_xattr_prefix(&mut actual);
4281        assert_eq!(&actual[..], &no_nuls[..]);
4282
4283        let empty = b"";
4284        let mut actual = empty.to_vec();
4285        strip_xattr_prefix(&mut actual);
4286        assert_eq!(&actual[..], &empty[..]);
4287
4288        let no_strippable_names = b"security.selinux\0user.foobar\0system.test\0";
4289        let mut actual = no_strippable_names.to_vec();
4290        strip_xattr_prefix(&mut actual);
4291        assert_eq!(&actual[..], &no_strippable_names[..]);
4292
4293        let only_strippable_names = b"user.virtiofs.security.sehash\0user.virtiofs.security.wat\0";
4294        let mut actual = only_strippable_names.to_vec();
4295        strip_xattr_prefix(&mut actual);
4296        assert_eq!(&actual[..], b"security.sehash\0security.wat\0");
4297
4298        let mixed_names = b"user.virtiofs.security.sehash\0security.selinux\0user.virtiofs.security.wat\0user.foobar\0";
4299        let mut actual = mixed_names.to_vec();
4300        strip_xattr_prefix(&mut actual);
4301        let expected = b"security.sehash\0security.selinux\0security.wat\0user.foobar\0";
4302        assert_eq!(&actual[..], &expected[..]);
4303
4304        let no_nul_with_prefix = b"user.virtiofs.security.sehash";
4305        let mut actual = no_nul_with_prefix.to_vec();
4306        strip_xattr_prefix(&mut actual);
4307        assert_eq!(&actual[..], b"security.sehash");
4308    }
4309
4310    #[test]
4311    fn lookup_files() {
4312        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4313        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4314        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4315        let _guard = lock.lock().expect("acquire named lock");
4316
4317        let temp_dir = TempDir::new().unwrap();
4318        create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt"]);
4319
4320        let cfg = Default::default();
4321        let fs = PassthroughFs::new("tag", cfg).unwrap();
4322
4323        let capable = FsOptions::empty();
4324        fs.init(capable).unwrap();
4325
4326        assert!(lookup(&fs, &temp_dir.path().join("a.txt")).is_ok());
4327        assert!(lookup(&fs, &temp_dir.path().join("dir")).is_ok());
4328        assert!(lookup(&fs, &temp_dir.path().join("dir/b.txt")).is_ok());
4329
4330        assert_eq!(
4331            lookup(&fs, &temp_dir.path().join("nonexistent-file"))
4332                .expect_err("file must not exist")
4333                .kind(),
4334            io::ErrorKind::NotFound
4335        );
4336        // "A.txt" is different from "a.txt".
4337        assert_eq!(
4338            lookup(&fs, &temp_dir.path().join("A.txt"))
4339                .expect_err("file must not exist")
4340                .kind(),
4341            io::ErrorKind::NotFound
4342        );
4343    }
4344
4345    #[test]
4346    fn lookup_files_ascii_casefold() {
4347        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4348        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4349        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4350        let _guard = lock.lock().expect("acquire named lock");
4351
4352        let temp_dir = TempDir::new().unwrap();
4353        create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt"]);
4354
4355        let cfg = Config {
4356            ascii_casefold: true,
4357            ..Default::default()
4358        };
4359        let fs = PassthroughFs::new("tag", cfg).unwrap();
4360
4361        let capable = FsOptions::empty();
4362        fs.init(capable).unwrap();
4363
4364        // Ensure that "A.txt" is equated with "a.txt".
4365        let a_inode = lookup(&fs, &temp_dir.path().join("a.txt")).expect("a.txt must be found");
4366        assert_eq!(
4367            lookup(&fs, &temp_dir.path().join("A.txt")).expect("A.txt must exist"),
4368            a_inode
4369        );
4370
4371        let dir_inode = lookup(&fs, &temp_dir.path().join("dir")).expect("dir must be found");
4372        assert_eq!(
4373            lookup(&fs, &temp_dir.path().join("DiR")).expect("DiR must exist"),
4374            dir_inode
4375        );
4376
4377        let b_inode =
4378            lookup(&fs, &temp_dir.path().join("dir/b.txt")).expect("dir/b.txt must be found");
4379        assert_eq!(
4380            lookup(&fs, &temp_dir.path().join("dIr/B.TxT")).expect("dIr/B.TxT must exist"),
4381            b_inode
4382        );
4383
4384        assert_eq!(
4385            lookup(&fs, &temp_dir.path().join("nonexistent-file"))
4386                .expect_err("file must not exist")
4387                .kind(),
4388            io::ErrorKind::NotFound
4389        );
4390    }
4391
4392    fn test_create_and_remove(ascii_casefold: bool) {
4393        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4394        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4395        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4396        let _guard = lock.lock().expect("acquire named lock");
4397
4398        let temp_dir = TempDir::new().unwrap();
4399        let timeout = Duration::from_millis(10);
4400        let cfg = Config {
4401            timeout,
4402            cache_policy: CachePolicy::Auto,
4403            ascii_casefold,
4404            ..Default::default()
4405        };
4406        let fs = PassthroughFs::new("tag", cfg).unwrap();
4407
4408        let capable = FsOptions::empty();
4409        fs.init(capable).unwrap();
4410
4411        // Create a.txt and b.txt.
4412        let a_path = temp_dir.path().join("a.txt");
4413        let b_path = temp_dir.path().join("b.txt");
4414        let a_entry = create(&fs, &a_path).expect("create a.txt");
4415        let b_entry = create(&fs, &b_path).expect("create b.txt");
4416        assert_eq!(
4417            a_entry.inode,
4418            lookup(&fs, &a_path).expect("lookup a.txt"),
4419            "Created file 'a.txt' must be looked up"
4420        );
4421        assert_eq!(
4422            b_entry.inode,
4423            lookup(&fs, &b_path).expect("lookup b.txt"),
4424            "Created file 'b.txt' must be looked up"
4425        );
4426
4427        // Remove a.txt only
4428        unlink(&fs, &a_path).expect("Remove");
4429        assert_eq!(
4430            lookup(&fs, &a_path)
4431                .expect_err("file must not exist")
4432                .kind(),
4433            io::ErrorKind::NotFound,
4434            "a.txt must be removed"
4435        );
4436        // "A.TXT" must not be found regardless of whether casefold is enabled or not.
4437        let upper_a_path = temp_dir.path().join("A.TXT");
4438        assert_eq!(
4439            lookup(&fs, &upper_a_path)
4440                .expect_err("file must not exist")
4441                .kind(),
4442            io::ErrorKind::NotFound,
4443            "A.txt must be removed"
4444        );
4445
4446        // Check if the host file system doesn't have a.txt but does b.txt.
4447        assert!(!a_path.exists(), "a.txt must be removed");
4448        assert!(b_path.exists(), "b.txt must exist");
4449    }
4450
4451    #[test]
4452    fn create_and_remove() {
4453        test_create_and_remove(false /* casefold */);
4454    }
4455
4456    #[test]
4457    fn create_and_remove_casefold() {
4458        test_create_and_remove(true /* casefold */);
4459    }
4460
4461    fn test_create_and_forget(ascii_casefold: bool) {
4462        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4463        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4464        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4465        let _guard = lock.lock().expect("acquire named lock");
4466
4467        let temp_dir = TempDir::new().unwrap();
4468        let timeout = Duration::from_millis(10);
4469        let cfg = Config {
4470            timeout,
4471            cache_policy: CachePolicy::Auto,
4472            ascii_casefold,
4473            ..Default::default()
4474        };
4475        let fs = PassthroughFs::new("tag", cfg).unwrap();
4476
4477        let capable = FsOptions::empty();
4478        fs.init(capable).unwrap();
4479
4480        // Create a.txt.
4481        let a_path = temp_dir.path().join("a.txt");
4482        let a_entry = create(&fs, &a_path).expect("create a.txt");
4483        assert_eq!(
4484            a_entry.inode,
4485            lookup(&fs, &a_path).expect("lookup a.txt"),
4486            "Created file 'a.txt' must be looked up"
4487        );
4488
4489        // Forget a.txt's inode from PassthroughFs's internal cache.
4490        forget(&fs, &a_path).expect("forget a.txt");
4491
4492        if ascii_casefold {
4493            let upper_a_path = temp_dir.path().join("A.TXT");
4494            let new_a_inode = lookup(&fs, &upper_a_path).expect("lookup a.txt");
4495            assert_ne!(
4496                a_entry.inode, new_a_inode,
4497                "inode must be changed after forget()"
4498            );
4499            assert_eq!(
4500                new_a_inode,
4501                lookup(&fs, &a_path).expect("lookup a.txt"),
4502                "inode must be same for a.txt and A.TXT"
4503            );
4504        } else {
4505            assert_ne!(
4506                a_entry.inode,
4507                lookup(&fs, &a_path).expect("lookup a.txt"),
4508                "inode must be changed after forget()"
4509            );
4510        }
4511    }
4512
4513    #[test]
4514    fn create_and_forget() {
4515        test_create_and_forget(false /* ascii_casefold */);
4516    }
4517
4518    #[test]
4519    fn create_and_forget_casefold() {
4520        test_create_and_forget(true /* ascii_casefold */);
4521    }
4522
4523    #[test]
4524    fn casefold_lookup_cache() {
4525        let temp_dir = TempDir::new().unwrap();
4526        // Prepare `a.txt` before starting the test.
4527        create_test_data(&temp_dir, &[], &["a.txt"]);
4528
4529        let cfg = Config {
4530            ascii_casefold: true,
4531            ..Default::default()
4532        };
4533        let fs = PassthroughFs::new("tag", cfg).unwrap();
4534
4535        let capable = FsOptions::empty();
4536        fs.init(capable).unwrap();
4537
4538        let parent = lookup(&fs, temp_dir.path()).expect("lookup temp_dir");
4539
4540        // Since `a.txt` exists, "A.TXT" must exist.
4541        let large_a_path = temp_dir.path().join("A.TXT");
4542        // Looking up "A.TXT" must create a CasefoldCache entry.
4543        lookup(&fs, &large_a_path).expect("A.TXT must exist");
4544        assert!(fs.exists_in_casefold_cache(parent, &CString::new("A.TXT").unwrap()));
4545
4546        // Create b.txt.
4547        let b_path = temp_dir.path().join("b.txt");
4548        create(&fs, &b_path).expect("create b.txt");
4549        // Then, b.txt must exists in the cache.
4550        assert!(fs.exists_in_casefold_cache(parent, &CString::new("B.TXT").unwrap()));
4551        // When removing b.txt, it must be removed from the cache as well.
4552        unlink(&fs, &b_path).expect("remove b.txt");
4553        assert!(!fs.exists_in_casefold_cache(parent, &CString::new("B.TXT").unwrap()));
4554    }
4555
4556    #[test]
4557    fn lookup_negative_cache() {
4558        let temp_dir = TempDir::new().unwrap();
4559        // Prepare `a.txt` before starting the test.
4560        create_test_data(&temp_dir, &[], &[]);
4561
4562        let cfg = Config {
4563            negative_timeout: Duration::from_secs(5),
4564            ..Default::default()
4565        };
4566        let fs = PassthroughFs::new("tag", cfg).unwrap();
4567
4568        let capable = FsOptions::empty();
4569        fs.init(capable).unwrap();
4570
4571        let a_path = temp_dir.path().join("a.txt");
4572        // a.txt hasn't existed yet.
4573        // Since negative_timeout is enabled, success with inode=0 is expected.
4574        assert_eq!(
4575            0,
4576            lookup(&fs, &a_path).expect("lookup a.txt"),
4577            "Entry with inode=0 is expected for non-existing file 'a.txt'"
4578        );
4579        // Create a.txt
4580        let a_entry = create(&fs, &a_path).expect("create a.txt");
4581        assert_eq!(
4582            a_entry.inode,
4583            lookup(&fs, &a_path).expect("lookup a.txt"),
4584            "Created file 'a.txt' must be looked up"
4585        );
4586        // Remove a.txt
4587        unlink(&fs, &a_path).expect("Remove");
4588        assert_eq!(
4589            0,
4590            lookup(&fs, &a_path).expect("lookup a.txt"),
4591            "Entry with inode=0 is expected for the removed file 'a.txt'"
4592        );
4593    }
4594    #[test]
4595    fn test_atomic_open_existing_file() {
4596        atomic_open_existing_file(false);
4597    }
4598
4599    #[test]
4600    fn test_atomic_open_existing_file_zero_message() {
4601        atomic_open_existing_file(true);
4602    }
4603
4604    fn atomic_open_existing_file(zero_message_open: bool) {
4605        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4606        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4607        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4608        let _guard = lock.lock().expect("acquire named lock");
4609
4610        let temp_dir = TempDir::new().unwrap();
4611        create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt", "dir/c.txt"]);
4612
4613        let cache_policy = match zero_message_open {
4614            true => CachePolicy::Always,
4615            false => CachePolicy::Auto,
4616        };
4617
4618        let cfg = Config {
4619            cache_policy,
4620            ..Default::default()
4621        };
4622        let fs = PassthroughFs::new("tag", cfg).unwrap();
4623
4624        let capable = FsOptions::ZERO_MESSAGE_OPEN;
4625        fs.init(capable).unwrap();
4626
4627        // atomic_open with flag O_RDWR, should return positive dentry and file handler
4628        let res = atomic_open(
4629            &fs,
4630            &temp_dir.path().join("a.txt"),
4631            0o666,
4632            libc::O_RDWR as u32,
4633            0,
4634            None,
4635        );
4636        assert!(res.is_ok());
4637        let (entry, handler, open_options) = res.unwrap();
4638        assert_ne!(entry.inode, 0);
4639
4640        if zero_message_open {
4641            assert!(handler.is_none());
4642            assert_eq!(open_options, OpenOptions::KEEP_CACHE);
4643        } else {
4644            assert!(handler.is_some());
4645            assert_ne!(
4646                open_options & OpenOptions::FILE_CREATED,
4647                OpenOptions::FILE_CREATED
4648            );
4649        }
4650
4651        // atomic_open with flag O_RDWR |  O_CREATE, should return positive dentry and file handler
4652        let res = atomic_open(
4653            &fs,
4654            &temp_dir.path().join("dir/b.txt"),
4655            0o666,
4656            (libc::O_RDWR | libc::O_CREAT) as u32,
4657            0,
4658            None,
4659        );
4660        assert!(res.is_ok());
4661        let (entry, handler, open_options) = res.unwrap();
4662        assert_ne!(entry.inode, 0);
4663
4664        if zero_message_open {
4665            assert!(handler.is_none());
4666            assert_eq!(open_options, OpenOptions::KEEP_CACHE);
4667        } else {
4668            assert!(handler.is_some());
4669            assert_ne!(
4670                open_options & OpenOptions::FILE_CREATED,
4671                OpenOptions::FILE_CREATED
4672            );
4673        }
4674
4675        // atomic_open with flag O_RDWR | O_CREATE | O_EXCL, should return positive dentry and file
4676        // handler
4677        let res = atomic_open(
4678            &fs,
4679            &temp_dir.path().join("dir/c.txt"),
4680            0o666,
4681            (libc::O_RDWR | libc::O_CREAT | libc::O_EXCL) as u32,
4682            0,
4683            None,
4684        );
4685        assert!(res.is_err());
4686        let err_kind = res.unwrap_err().kind();
4687        assert_eq!(err_kind, io::ErrorKind::AlreadyExists);
4688    }
4689
4690    #[test]
4691    fn test_atomic_open_non_existing_file() {
4692        atomic_open_non_existing_file(false);
4693    }
4694
4695    #[test]
4696    fn test_atomic_open_non_existing_file_zero_message() {
4697        atomic_open_non_existing_file(true);
4698    }
4699
4700    fn atomic_open_non_existing_file(zero_message_open: bool) {
4701        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4702        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4703        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4704        let _guard = lock.lock().expect("acquire named lock");
4705
4706        let temp_dir = TempDir::new().unwrap();
4707
4708        let cache_policy = match zero_message_open {
4709            true => CachePolicy::Always,
4710            false => CachePolicy::Auto,
4711        };
4712
4713        let cfg = Config {
4714            cache_policy,
4715            ..Default::default()
4716        };
4717        let fs = PassthroughFs::new("tag", cfg).unwrap();
4718
4719        let capable = FsOptions::ZERO_MESSAGE_OPEN;
4720        fs.init(capable).unwrap();
4721
4722        // atomic_open with flag O_RDWR, should return NO_EXIST error
4723        let res = atomic_open(
4724            &fs,
4725            &temp_dir.path().join("a.txt"),
4726            0o666,
4727            libc::O_RDWR as u32,
4728            0,
4729            None,
4730        );
4731        assert!(res.is_err());
4732        let err_kind = res.unwrap_err().kind();
4733        assert_eq!(err_kind, io::ErrorKind::NotFound);
4734
4735        // atomic_open with flag O_RDWR | O_CREATE, should return positive dentry and file handler
4736        let res = atomic_open(
4737            &fs,
4738            &temp_dir.path().join("b.txt"),
4739            0o666,
4740            (libc::O_RDWR | libc::O_CREAT) as u32,
4741            0,
4742            None,
4743        );
4744        assert!(res.is_ok());
4745        let (entry, handler, open_options) = res.unwrap();
4746        assert_ne!(entry.inode, 0);
4747
4748        if zero_message_open {
4749            assert!(handler.is_none());
4750            assert_eq!(
4751                open_options & OpenOptions::KEEP_CACHE,
4752                OpenOptions::KEEP_CACHE
4753            );
4754        } else {
4755            assert!(handler.is_some());
4756        }
4757        assert_eq!(
4758            open_options & OpenOptions::FILE_CREATED,
4759            OpenOptions::FILE_CREATED
4760        );
4761    }
4762
4763    #[test]
4764    fn atomic_open_symbol_link() {
4765        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4766        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4767        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4768        let _guard = lock.lock().expect("acquire named lock");
4769
4770        let temp_dir = TempDir::new().unwrap();
4771        create_test_data(&temp_dir, &["dir"], &["a.txt"]);
4772
4773        let cfg = Default::default();
4774        let fs = PassthroughFs::new("tag", cfg).unwrap();
4775
4776        let capable = FsOptions::empty();
4777        fs.init(capable).unwrap();
4778
4779        // atomic open the link destination file
4780        let res_dst = atomic_open(
4781            &fs,
4782            &temp_dir.path().join("a.txt"),
4783            0o666,
4784            libc::O_RDWR as u32,
4785            0,
4786            None,
4787        );
4788        assert!(res_dst.is_ok());
4789        let (entry_dst, handler_dst, _) = res_dst.unwrap();
4790        assert_ne!(entry_dst.inode, 0);
4791        assert!(handler_dst.is_some());
4792
4793        // create depth 1 symbol link
4794        let sym1_res = symlink(
4795            &fs,
4796            &temp_dir.path().join("a.txt"),
4797            &temp_dir.path().join("blink"),
4798            None,
4799        );
4800        assert!(sym1_res.is_ok());
4801        let sym1_entry = sym1_res.unwrap();
4802        assert_ne!(sym1_entry.inode, 0);
4803
4804        // atomic_open symbol link, should return dentry with no handler
4805        let res = atomic_open(
4806            &fs,
4807            &temp_dir.path().join("blink"),
4808            0o666,
4809            libc::O_RDWR as u32,
4810            0,
4811            None,
4812        );
4813        assert!(res.is_ok());
4814        let (entry, handler, open_options) = res.unwrap();
4815        assert_eq!(entry.inode, sym1_entry.inode);
4816        assert!(handler.is_none());
4817        assert_eq!(open_options, OpenOptions::empty());
4818
4819        // delete link destination
4820        unlink(&fs, &temp_dir.path().join("a.txt")).expect("Remove");
4821        assert_eq!(
4822            lookup(&fs, &temp_dir.path().join("a.txt"))
4823                .expect_err("file must not exist")
4824                .kind(),
4825            io::ErrorKind::NotFound,
4826            "a.txt must be removed"
4827        );
4828
4829        // after link destination removed, should still return valid dentry
4830        let res = atomic_open(
4831            &fs,
4832            &temp_dir.path().join("blink"),
4833            0o666,
4834            libc::O_RDWR as u32,
4835            0,
4836            None,
4837        );
4838        assert!(res.is_ok());
4839        let (entry, handler, open_options) = res.unwrap();
4840        assert_eq!(entry.inode, sym1_entry.inode);
4841        assert!(handler.is_none());
4842        assert_eq!(open_options, OpenOptions::empty());
4843    }
4844
4845    #[test]
4846    #[cfg(feature = "arc_quota")]
4847    fn set_permission_ioctl_valid_data() {
4848        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4849        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4850        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4851        let _guard = lock.lock().expect("acquire named lock");
4852
4853        let cfg = Config {
4854            max_dynamic_perm: 1,
4855            ..Default::default()
4856        };
4857        let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
4858
4859        let perm_path_string = String::from("/test");
4860        let fs_permission_data_buffer = FsPermissionDataBuffer {
4861            guest_uid: 1,
4862            guest_gid: 2,
4863            host_uid: 3,
4864            host_gid: 4,
4865            umask: 5,
4866            pad: 0,
4867            perm_path: {
4868                let mut perm_path: [u8; FS_IOCTL_PATH_MAX_LEN] = [0; FS_IOCTL_PATH_MAX_LEN];
4869                perm_path[..perm_path_string.len()].copy_from_slice(perm_path_string.as_bytes());
4870                perm_path
4871            },
4872        };
4873        let r = std::io::Cursor::new(fs_permission_data_buffer.as_bytes());
4874
4875        let res = fs_ioc_setpermission(
4876            &p,
4877            mem::size_of_val(&fs_permission_data_buffer) as u32,
4878            r.clone(),
4879        )
4880        .expect("valid input should get IoctlReply");
4881        assert!(matches!(res, IoctlReply::Done(Ok(data)) if data.is_empty()));
4882
4883        let read_guard = p
4884            .permission_paths
4885            .read()
4886            .expect("read permission_paths failed");
4887        let permission_data = read_guard
4888            .first()
4889            .expect("permission path should not be empty");
4890
4891        // Check expected data item is added to permission_paths.
4892        let expected_data = PermissionData {
4893            guest_uid: 1,
4894            guest_gid: 2,
4895            host_uid: 3,
4896            host_gid: 4,
4897            umask: 5,
4898            perm_path: perm_path_string,
4899        };
4900        assert_eq!(*permission_data, expected_data);
4901
4902        // Second ioctl should not succeed since max_dynamic_perm is set to 1
4903        let res = fs_ioc_setpermission(
4904            &p,
4905            mem::size_of_val(&fs_permission_data_buffer) as u32,
4906            r.clone(),
4907        )
4908        .expect("valid input should get IoctlReply");
4909        assert!(
4910            matches!(res, IoctlReply::Done(Err(err)) if err.raw_os_error().is_some_and(|errno| {
4911                errno == libc::EPERM
4912            }))
4913        );
4914    }
4915
4916    #[test]
4917    #[cfg(feature = "arc_quota")]
4918    fn set_permission_ioctl_invalid_data() {
4919        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4920        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4921        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4922        let _guard = lock.lock().expect("acquire named lock");
4923
4924        let cfg = Config {
4925            max_dynamic_perm: 1,
4926            ..Default::default()
4927        };
4928        let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
4929
4930        // The perm_path is not valid since it does not start with /.
4931        let perm_path_string = String::from("test");
4932        let fs_permission_data_buffer = FsPermissionDataBuffer {
4933            guest_uid: 1,
4934            guest_gid: 2,
4935            host_uid: 3,
4936            host_gid: 4,
4937            umask: 5,
4938            pad: 0,
4939            perm_path: {
4940                let mut perm_path: [u8; FS_IOCTL_PATH_MAX_LEN] = [0; FS_IOCTL_PATH_MAX_LEN];
4941                perm_path[..perm_path_string.len()].copy_from_slice(perm_path_string.as_bytes());
4942                perm_path
4943            },
4944        };
4945
4946        let r = std::io::Cursor::new(fs_permission_data_buffer.as_bytes());
4947        // In this ioctl inode,handle,flags,arg and out_size is irrelavant, set to empty value.
4948        // This call is supposed to get EINVAL ioctlReply, since the perm_path is invalid.
4949        let res = fs_ioc_setpermission(&p, mem::size_of_val(&fs_permission_data_buffer) as u32, r)
4950            .expect("invalid perm_path should get IoctlReply");
4951        assert!(
4952            matches!(res, IoctlReply::Done(Err(err)) if err.raw_os_error().is_some_and(|errno| {
4953                errno == libc::EINVAL
4954            }))
4955        );
4956
4957        let fake_data_buffer: [u8; 128] = [0; 128];
4958        let r = std::io::Cursor::new(fake_data_buffer.as_bytes());
4959
4960        // This call is supposed to get EINVAL ioctlReply, since the in_size is not the size of
4961        // struct FsPermissionDataBuffer.
4962        let res = fs_ioc_setpermission(&p, mem::size_of_val(&fake_data_buffer) as u32, r)
4963            .expect_err("invalid in_size should get Error");
4964        assert!(res
4965            .raw_os_error()
4966            .is_some_and(|errno| { errno == libc::EINVAL }));
4967    }
4968
4969    #[test]
4970    #[cfg(feature = "arc_quota")]
4971    fn permission_data_path_matching() {
4972        let ctx = get_context();
4973        let temp_dir = TempDir::new().unwrap();
4974        // Prepare `a.txt` before starting the test.
4975        create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/a.txt"]);
4976
4977        let cfg = Config {
4978            max_dynamic_perm: 1,
4979            ..Default::default()
4980        };
4981        let fs = PassthroughFs::new("tag", cfg).unwrap();
4982
4983        let capable = FsOptions::empty();
4984        fs.init(capable).unwrap();
4985
4986        const BY_PATH_UID: u32 = 655360;
4987        const BY_PATH_GID: u32 = 655361;
4988        const BY_PATH_UMASK: u32 = 0o007;
4989
4990        let dir_path = temp_dir.path().join("dir");
4991        let permission_data = PermissionData {
4992            guest_uid: BY_PATH_UID,
4993            guest_gid: BY_PATH_GID,
4994            host_uid: ctx.uid,
4995            host_gid: ctx.gid,
4996            umask: BY_PATH_UMASK,
4997            perm_path: dir_path.to_string_lossy().into_owned(),
4998        };
4999        fs.permission_paths
5000            .write()
5001            .expect("permission_path lock must be acquired")
5002            .push(permission_data);
5003
5004        // a_path is the path with out set permission by path
5005        let a_path = temp_dir.path().join("a.txt");
5006        let in_dir_a_path = dir_path.join("a.txt");
5007
5008        // a.txt should not be set with guest_uid/guest_uid/umask by path
5009        let a_entry = lookup_ent(&fs, &a_path).expect("a.txt must exist");
5010        assert_ne!(a_entry.attr.st_uid, BY_PATH_UID);
5011        assert_ne!(a_entry.attr.st_gid, BY_PATH_GID);
5012
5013        // a.txt in dir should be set guest_uid/guest_uid/umask by path
5014        let in_dir_a_entry = lookup_ent(&fs, &in_dir_a_path).expect("dir/a.txt must exist");
5015        assert_eq!(in_dir_a_entry.attr.st_uid, BY_PATH_UID);
5016        assert_eq!(in_dir_a_entry.attr.st_gid, BY_PATH_GID);
5017        assert_eq!(in_dir_a_entry.attr.st_mode & 0o777, !BY_PATH_UMASK & 0o777);
5018
5019        // Create dir/b.txt.
5020        let in_dir_b_path = dir_path.join("b.txt");
5021        create(&fs, &in_dir_b_path).expect("create b.txt");
5022
5023        // newly created b.txt in dir should be set guest_uid/guest_uid/umask by path
5024        let in_dir_b_entry = lookup_ent(&fs, &in_dir_a_path).expect("dir/b.txt must exist");
5025        assert_eq!(in_dir_b_entry.attr.st_uid, BY_PATH_UID);
5026        assert_eq!(in_dir_b_entry.attr.st_gid, BY_PATH_GID);
5027        assert_eq!(in_dir_b_entry.attr.st_mode & 0o777, !BY_PATH_UMASK & 0o777);
5028    }
5029
5030    #[test]
5031    #[cfg(feature = "fs_permission_translation")]
5032    fn test_copy_file_range_path_mapping() {
5033        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
5034        let _guard = lock.lock().expect("acquire named lock");
5035
5036        let real_ctx = get_context();
5037        let temp_dir = TempDir::new().unwrap();
5038        let dir_path = temp_dir.path().join("dir");
5039        create_test_data(&temp_dir, &["dir"], &["src.txt", "dir/dst.txt"]);
5040
5041        let cfg = Config {
5042            ..Default::default()
5043        };
5044        let fs = PassthroughFs::new("tag", cfg).unwrap();
5045        fs.init(FsOptions::empty()).unwrap();
5046
5047        // Use a fake UID in the context that would normally fail set_creds()
5048        let mut fake_ctx = real_ctx;
5049        fake_ctx.uid = 9999;
5050        fake_ctx.gid = 9999;
5051
5052        // Create mapping: mapping the fake guest UID to the REAL host UID.
5053        // If the mapping works, copy_file_range will use real_ctx.uid and succeed.
5054        // If the mapping is ignored, it will use fake_ctx.uid (9999) and set_creds will fail with
5055        // EPERM.
5056        let permission_data = PermissionData {
5057            guest_uid: fake_ctx.uid,
5058            guest_gid: fake_ctx.gid,
5059            host_uid: real_ctx.uid,
5060            host_gid: real_ctx.gid,
5061            umask: 0,
5062            perm_path: dir_path.to_string_lossy().into_owned(),
5063        };
5064        fs.permission_paths.write().unwrap().push(permission_data);
5065
5066        let src_path = temp_dir.path().join("src.txt");
5067        let dst_path = dir_path.join("dst.txt");
5068
5069        std::fs::write(&src_path, b"hello world").unwrap();
5070
5071        let src_inode = lookup(&fs, &src_path).unwrap();
5072        let dst_inode = lookup(&fs, &dst_path).unwrap();
5073
5074        // Open files to get handles.
5075        // Note: we use real_ctx here to ensure file handles are opened successfully.
5076        // The copy_file_range call itself will use fake_ctx.
5077        let (src_handle, _) = fs
5078            .open(real_ctx, src_inode, libc::O_RDONLY as u32)
5079            .expect("open src");
5080        let (dst_handle, _) = fs
5081            .open(real_ctx, dst_inode, libc::O_WRONLY as u32)
5082            .expect("open dst");
5083
5084        let src_handle = src_handle.unwrap();
5085        let dst_handle = dst_handle.unwrap();
5086
5087        // Execute copy_file_range with fake_ctx.
5088        // This will only succeed if change_creds_for_path correctly translates 9999 -> real_uid.
5089        let result = fs.copy_file_range(
5090            fake_ctx, src_inode, src_handle, 0, dst_inode, dst_handle, 0, 5, 0,
5091        );
5092
5093        assert!(
5094            result.is_ok(),
5095            "copy_file_range failed: {:?}. Mapping might not be applied.",
5096            result.err()
5097        );
5098        assert_eq!(result.unwrap(), 5);
5099
5100        let content = std::fs::read(&dst_path).unwrap();
5101        assert_eq!(&content[0..5], b"hello");
5102    }
5103
5104    #[test]
5105    #[cfg(feature = "arc_quota")]
5106    fn set_path_xattr_ioctl_valid_data() {
5107        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
5108        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
5109        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
5110        let _guard = lock.lock().expect("acquire named lock");
5111
5112        let cfg: Config = Config {
5113            max_dynamic_xattr: 1,
5114            ..Default::default()
5115        };
5116        let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
5117
5118        let path_string = String::from("/test");
5119        let xattr_name_string = String::from("test_name");
5120        let xattr_value_string = String::from("test_value");
5121        let fs_path_xattr_data_buffer = FsPathXattrDataBuffer {
5122            path: {
5123                let mut path: [u8; FS_IOCTL_PATH_MAX_LEN] = [0; FS_IOCTL_PATH_MAX_LEN];
5124                path[..path_string.len()].copy_from_slice(path_string.as_bytes());
5125                path
5126            },
5127            xattr_name: {
5128                let mut xattr_name: [u8; FS_IOCTL_XATTR_NAME_MAX_LEN] =
5129                    [0; FS_IOCTL_XATTR_NAME_MAX_LEN];
5130                xattr_name[..xattr_name_string.len()].copy_from_slice(xattr_name_string.as_bytes());
5131                xattr_name
5132            },
5133            xattr_value: {
5134                let mut xattr_value: [u8; FS_IOCTL_XATTR_VALUE_MAX_LEN] =
5135                    [0; FS_IOCTL_XATTR_VALUE_MAX_LEN];
5136                xattr_value[..xattr_value_string.len()]
5137                    .copy_from_slice(xattr_value_string.as_bytes());
5138                xattr_value
5139            },
5140        };
5141        let r = std::io::Cursor::new(fs_path_xattr_data_buffer.as_bytes());
5142
5143        let res = fs_ioc_setpathxattr(
5144            &p,
5145            mem::size_of_val(&fs_path_xattr_data_buffer) as u32,
5146            r.clone(),
5147        )
5148        .expect("valid input should get IoctlReply");
5149        assert!(matches!(res, IoctlReply::Done(Ok(data)) if data.is_empty()));
5150
5151        let read_guard = p.xattr_paths.read().expect("read xattr_paths failed");
5152        let xattr_data = read_guard.first().expect("xattr_paths should not be empty");
5153
5154        // Check expected data item is added to permission_paths.
5155        let expected_data = XattrData {
5156            xattr_path: path_string,
5157            xattr_name: xattr_name_string,
5158            xattr_value: xattr_value_string,
5159        };
5160        assert_eq!(*xattr_data, expected_data);
5161
5162        // Second ioctl should not succeed since max_dynamic_perm is set to 1
5163        let res = fs_ioc_setpathxattr(
5164            &p,
5165            mem::size_of_val(&fs_path_xattr_data_buffer) as u32,
5166            r.clone(),
5167        )
5168        .expect("valid input should get IoctlReply");
5169        assert!(
5170            matches!(res, IoctlReply::Done(Err(err)) if err.raw_os_error().is_some_and(|errno| {
5171                errno == libc::EPERM
5172            }))
5173        );
5174    }
5175    #[test]
5176    #[cfg(feature = "arc_quota")]
5177    fn set_path_xattr_ioctl_invalid_data() {
5178        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
5179        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
5180        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
5181        let _guard = lock.lock().expect("acquire named lock");
5182
5183        let cfg: Config = Config {
5184            max_dynamic_xattr: 1,
5185            ..Default::default()
5186        };
5187        let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
5188
5189        let path_string = String::from("test");
5190        let xattr_name_string = String::from("test_name");
5191        let xattr_value_string = String::from("test_value");
5192        let fs_path_xattr_data_buffer = FsPathXattrDataBuffer {
5193            path: {
5194                let mut path: [u8; FS_IOCTL_PATH_MAX_LEN] = [0; FS_IOCTL_PATH_MAX_LEN];
5195                path[..path_string.len()].copy_from_slice(path_string.as_bytes());
5196                path
5197            },
5198            xattr_name: {
5199                let mut xattr_name: [u8; FS_IOCTL_XATTR_NAME_MAX_LEN] =
5200                    [0; FS_IOCTL_XATTR_NAME_MAX_LEN];
5201                xattr_name[..xattr_name_string.len()].copy_from_slice(xattr_name_string.as_bytes());
5202                xattr_name
5203            },
5204            xattr_value: {
5205                let mut xattr_value: [u8; FS_IOCTL_XATTR_VALUE_MAX_LEN] =
5206                    [0; FS_IOCTL_XATTR_VALUE_MAX_LEN];
5207                xattr_value[..xattr_value_string.len()]
5208                    .copy_from_slice(xattr_value_string.as_bytes());
5209                xattr_value
5210            },
5211        };
5212        let r = std::io::Cursor::new(fs_path_xattr_data_buffer.as_bytes());
5213
5214        // This call is supposed to get EINVAL ioctlReply, since the perm_path is invalid.
5215        let res = fs_ioc_setpathxattr(
5216            &p,
5217            mem::size_of_val(&fs_path_xattr_data_buffer) as u32,
5218            r.clone(),
5219        )
5220        .expect("valid input should get IoctlReply");
5221        assert!(
5222            matches!(res, IoctlReply::Done(Err(err)) if err.raw_os_error().is_some_and(|errno| {
5223                errno == libc::EINVAL
5224            }))
5225        );
5226
5227        let fake_data_buffer: [u8; 128] = [0; 128];
5228        let r = std::io::Cursor::new(fake_data_buffer.as_bytes());
5229        // This call is supposed to get EINVAL ioctlReply, since the in_size is not the size of
5230        // struct FsPathXattrDataBuffer.
5231        let res = fs_ioc_setpathxattr(&p, mem::size_of_val(&fake_data_buffer) as u32, r.clone())
5232            .expect_err("valid input should get IoctlReply");
5233        assert!(res
5234            .raw_os_error()
5235            .is_some_and(|errno| { errno == libc::EINVAL }));
5236    }
5237
5238    #[test]
5239    #[cfg(feature = "arc_quota")]
5240    fn xattr_data_path_matching() {
5241        let ctx = get_context();
5242        let temp_dir = TempDir::new().unwrap();
5243        // Prepare `a.txt` before starting the test.
5244        create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/a.txt"]);
5245
5246        let cfg = Config {
5247            max_dynamic_xattr: 1,
5248            ..Default::default()
5249        };
5250        let fs = PassthroughFs::new("tag", cfg).unwrap();
5251
5252        let capable = FsOptions::empty();
5253        fs.init(capable).unwrap();
5254
5255        let dir_path = temp_dir.path().join("dir");
5256        let xattr_name_string = String::from("test_name");
5257        let xattr_name_cstring = CString::new(xattr_name_string.clone()).expect("create c string");
5258        let xattr_value_string = String::from("test_value");
5259        let xattr_value_bytes = xattr_value_string.clone().into_bytes();
5260
5261        let xattr_data = XattrData {
5262            xattr_name: xattr_name_string,
5263            xattr_value: xattr_value_string,
5264            xattr_path: dir_path.to_string_lossy().into_owned(),
5265        };
5266        fs.xattr_paths
5267            .write()
5268            .expect("xattr_paths lock must be acquired")
5269            .push(xattr_data);
5270
5271        // a_path is the path with out set xattr by path
5272        let a_path: std::path::PathBuf = temp_dir.path().join("a.txt");
5273        let in_dir_a_path = dir_path.join("a.txt");
5274
5275        let a_node = lookup(&fs, a_path.as_path()).expect("lookup a node");
5276        // a.txt should not be set with xattr by path
5277        assert!(fs
5278            .getxattr(
5279                ctx,
5280                a_node,
5281                &xattr_name_cstring,
5282                xattr_value_bytes.len() as u32
5283            )
5284            .is_err());
5285
5286        let in_dir_a_node = lookup(&fs, in_dir_a_path.as_path()).expect("lookup in dir a node");
5287        // a.txt in dir should be set xattr by path
5288        let in_dir_a_reply = fs
5289            .getxattr(
5290                ctx,
5291                in_dir_a_node,
5292                &xattr_name_cstring,
5293                xattr_value_bytes.len() as u32,
5294            )
5295            .expect("Getxattr should success");
5296        assert!(matches!(in_dir_a_reply, GetxattrReply::Value(v) if v == xattr_value_bytes));
5297        // Create dir/b.txt.
5298        let in_dir_b_path = dir_path.join("b.txt");
5299        create(&fs, &in_dir_b_path).expect("create b.txt");
5300
5301        // newly created b.txt in dir should be set xattr by path
5302        let in_dir_b_node = lookup(&fs, in_dir_a_path.as_path()).expect("lookup in dir b node");
5303        let in_dir_b_reply = fs
5304            .getxattr(
5305                ctx,
5306                in_dir_b_node,
5307                &xattr_name_cstring,
5308                xattr_value_bytes.len() as u32,
5309            )
5310            .expect("Getxattr should success");
5311        assert!(matches!(in_dir_b_reply, GetxattrReply::Value(v) if v == xattr_value_bytes));
5312    }
5313
5314    /// Creates and open a new file by atomic_open with O_APPEND flag.
5315    /// We check O_APPEND is properly handled, depending on writeback cache is enabled or not.
5316    fn atomic_open_create_o_append(writeback: bool) {
5317        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
5318        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
5319        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
5320        let _guard = lock.lock().expect("acquire named lock");
5321
5322        let temp_dir = TempDir::new().unwrap();
5323
5324        let cfg = Config {
5325            cache_policy: CachePolicy::Always,
5326            writeback,
5327            ..Default::default()
5328        };
5329        let fs = PassthroughFs::new("tag", cfg).unwrap();
5330
5331        let capable = FsOptions::ZERO_MESSAGE_OPEN | FsOptions::WRITEBACK_CACHE;
5332        fs.init(capable).unwrap();
5333
5334        let (entry, _, _) = atomic_open(
5335            &fs,
5336            &temp_dir.path().join("a.txt"),
5337            0o666,
5338            (libc::O_RDWR | libc::O_CREAT | libc::O_APPEND) as u32,
5339            0,
5340            None,
5341        )
5342        .expect("atomic_open");
5343        assert_ne!(entry.inode, 0);
5344
5345        let inodes = fs.inodes.lock();
5346        let data = inodes.get(&entry.inode).unwrap();
5347        let flags = data.file.lock().open_flags;
5348        if writeback {
5349            // When writeback is enabled, O_APPEND must be handled by the guest kernel.
5350            // So, it must be cleared.
5351            assert_eq!(flags & libc::O_APPEND, 0);
5352        } else {
5353            // Without writeback cache, O_APPEND must not be cleared.
5354            assert_eq!(flags & libc::O_APPEND, libc::O_APPEND);
5355        }
5356    }
5357
5358    #[test]
5359    fn test_atomic_open_create_o_append_no_writeback() {
5360        atomic_open_create_o_append(false);
5361    }
5362
5363    #[test]
5364    fn test_atomic_open_create_o_append_writeback() {
5365        atomic_open_create_o_append(true);
5366    }
5367
5368    #[test]
5369    fn test_lookup_dotdot_escape() {
5370        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
5371        let _guard = lock.lock().expect("acquire named lock");
5372        let temp_dir = TempDir::new().unwrap();
5373        let root_path = temp_dir.path().join("root");
5374        std::fs::create_dir(&root_path).unwrap();
5375
5376        // Create a secret file in the parent of root
5377        let secret_file = temp_dir.path().join("secret.txt");
5378        std::fs::write(&secret_file, "top secret").unwrap();
5379
5380        let cfg = Config {
5381            ..Default::default()
5382        };
5383        let mut fs = PassthroughFs::new("tag", cfg).unwrap();
5384        fs.set_root_dir(root_path.to_str().unwrap().to_string())
5385            .unwrap();
5386        fs.init(FsOptions::empty()).unwrap();
5387        let ctx = get_context();
5388
5389        // 1. Lookup ".." from root (inode 1)
5390        let dotdot = c"..";
5391        let res = fs.lookup(ctx, 1, dotdot);
5392        assert!(res.is_err(), "Lookup .. should be blocked!");
5393    }
5394
5395    #[test]
5396    fn test_passthrough_fs_create_validation() {
5397        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
5398        let _guard = lock.lock().expect("acquire named lock");
5399
5400        let temp_dir = TempDir::new().unwrap();
5401        create_test_data(&temp_dir, &["allowed"], &[]);
5402
5403        let cfg = Default::default();
5404        let mut fs = PassthroughFs::new("tag", cfg).unwrap();
5405        fs.init(FsOptions::empty()).unwrap();
5406
5407        let allowlist = Arc::new(RwLock::new(PathAllowlist::new()));
5408        fs.set_allowlist(Some(allowlist.clone()));
5409
5410        allowlist.write().unwrap().add_path(
5411            temp_dir
5412                .path()
5413                .join("allowed")
5414                .to_string_lossy()
5415                .into_owned(),
5416        );
5417
5418        let allowed_inode = lookup(&fs, &temp_dir.path().join("allowed")).unwrap();
5419        let ctx = get_context();
5420
5421        // Creation with a name containing '..' should fail with EINVAL
5422        let invalid_name = CString::new("..").unwrap();
5423        let res = fs.create(
5424            ctx,
5425            allowed_inode,
5426            &invalid_name,
5427            0o666,
5428            libc::O_RDWR as u32,
5429            0,
5430            None,
5431        );
5432        assert!(res.is_err());
5433        assert_eq!(res.unwrap_err().raw_os_error().unwrap(), libc::EINVAL);
5434    }
5435
5436    #[test]
5437    fn test_passthrough_fs_rename_validation() {
5438        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
5439        let _guard = lock.lock().expect("acquire named lock");
5440
5441        let temp_dir = TempDir::new().unwrap();
5442        create_test_data(&temp_dir, &["allowed"], &["allowed/a.txt"]);
5443
5444        let cfg = Default::default();
5445        let fs = PassthroughFs::new("tag", cfg).unwrap();
5446        fs.init(FsOptions::empty()).unwrap();
5447
5448        let allowed_inode = lookup(&fs, &temp_dir.path().join("allowed")).unwrap();
5449        let ctx = get_context();
5450
5451        // Rename with '..' in newname should fail with EINVAL
5452        let oldname = CString::new("a.txt").unwrap();
5453        let invalid_newname = CString::new("../blocked.txt").unwrap();
5454
5455        let res = fs.rename(
5456            ctx,
5457            allowed_inode,
5458            &oldname,
5459            allowed_inode,
5460            &invalid_newname,
5461            0,
5462        );
5463        assert!(res.is_err());
5464        assert_eq!(res.unwrap_err().raw_os_error().unwrap(), libc::EINVAL);
5465    }
5466
5467    #[test]
5468    fn test_passthrough_fs_symlink_authorization() {
5469        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
5470        let _guard = lock.lock().expect("acquire named lock");
5471
5472        let temp_dir = TempDir::new().unwrap();
5473        create_test_data(&temp_dir, &["allowed"], &[]);
5474
5475        let cfg = Default::default();
5476        let mut fs = PassthroughFs::new("tag", cfg).unwrap();
5477        fs.init(FsOptions::empty()).unwrap();
5478
5479        let allowlist = Arc::new(RwLock::new(PathAllowlist::new()));
5480        fs.set_allowlist(Some(allowlist.clone()));
5481
5482        allowlist.write().unwrap().add_path(
5483            temp_dir
5484                .path()
5485                .join("allowed")
5486                .to_string_lossy()
5487                .into_owned(),
5488        );
5489
5490        // Get Inode of the shared root (which is an ancestor, hence not writable)
5491        let root_inode = lookup(&fs, temp_dir.path()).unwrap();
5492        let ctx = get_context();
5493
5494        // Symlink creation in the shared root (not writable) should fail with EACCES
5495        let linkname = CString::new("allowed/a.txt").unwrap();
5496        let symlink_name = CString::new("malicious_link").unwrap();
5497
5498        let res = fs.symlink(ctx, &linkname, root_inode, &symlink_name, None);
5499        assert!(res.is_err());
5500        assert_eq!(res.unwrap_err().raw_os_error().unwrap(), libc::EACCES);
5501    }
5502
5503    #[test]
5504    fn test_passthrough_fs_link_authorization() {
5505        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
5506        let _guard = lock.lock().expect("acquire named lock");
5507
5508        let temp_dir = TempDir::new().unwrap();
5509        create_test_data(&temp_dir, &["allowed"], &["allowed/a.txt"]);
5510
5511        let cfg = Default::default();
5512        let mut fs = PassthroughFs::new("tag", cfg).unwrap();
5513        fs.init(FsOptions::empty()).unwrap();
5514
5515        let allowlist = Arc::new(RwLock::new(PathAllowlist::new()));
5516        fs.set_allowlist(Some(allowlist.clone()));
5517
5518        allowlist.write().unwrap().add_path(
5519            temp_dir
5520                .path()
5521                .join("allowed")
5522                .to_string_lossy()
5523                .into_owned(),
5524        );
5525
5526        let file_inode = lookup(&fs, &temp_dir.path().join("allowed/a.txt")).unwrap();
5527        let root_inode = lookup(&fs, temp_dir.path()).unwrap(); // Directory not writable
5528        let ctx = get_context();
5529
5530        // Hardlink creation in a non-writable directory should fail with EACCES
5531        let link_name = CString::new("malicious_hardlink").unwrap();
5532        let res = fs.link(ctx, file_inode, root_inode, &link_name);
5533        assert!(res.is_err());
5534        assert_eq!(res.unwrap_err().raw_os_error().unwrap(), libc::EACCES);
5535    }
5536
5537    #[test]
5538    fn test_passthrough_fs_mknod_authorization() {
5539        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
5540        let _guard = lock.lock().expect("acquire named lock");
5541
5542        let temp_dir = TempDir::new().unwrap();
5543        create_test_data(&temp_dir, &["allowed"], &[]);
5544
5545        let cfg = Default::default();
5546        let mut fs = PassthroughFs::new("tag", cfg).unwrap();
5547        fs.init(FsOptions::empty()).unwrap();
5548
5549        let allowlist = Arc::new(RwLock::new(PathAllowlist::new()));
5550        fs.set_allowlist(Some(allowlist.clone()));
5551
5552        allowlist.write().unwrap().add_path(
5553            temp_dir
5554                .path()
5555                .join("allowed")
5556                .to_string_lossy()
5557                .into_owned(),
5558        );
5559
5560        let root_inode = lookup(&fs, temp_dir.path()).unwrap(); // Directory not writable
5561        let ctx = get_context();
5562
5563        // mknod creation in a non-writable directory should fail with EACCES
5564        let name = CString::new("malicious_fifo").unwrap();
5565        let res = fs.mknod(ctx, root_inode, &name, 0o666 | libc::S_IFIFO, 0, 0, None);
5566        assert!(res.is_err());
5567        assert_eq!(res.unwrap_err().raw_os_error().unwrap(), libc::EACCES);
5568    }
5569
5570    #[test]
5571    fn test_passthrough_fs_non_utf8_bypass() {
5572        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
5573        let _guard = lock.lock().expect("acquire named lock");
5574
5575        let temp_dir = TempDir::new().unwrap();
5576        let cfg = Default::default();
5577        let mut fs = PassthroughFs::new("tag", cfg).unwrap();
5578        fs.init(FsOptions::empty()).unwrap();
5579
5580        let allowlist = Arc::new(RwLock::new(PathAllowlist::new()));
5581        fs.set_allowlist(Some(allowlist.clone()));
5582
5583        // Register the fallback string path in allowlist
5584        let bypass_path = temp_dir
5585            .path()
5586            .join("<non UTF-8 path>")
5587            .to_string_lossy()
5588            .into_owned();
5589        allowlist.write().unwrap().add_path(bypass_path);
5590
5591        let root_inode = lookup(&fs, temp_dir.path()).unwrap();
5592        let ctx = get_context();
5593
5594        // SAFETY: The vector [0xff] does not contain any interior null bytes, which satisfies
5595        // CString's safety invariant.
5596        let non_utf8_name = unsafe { CString::from_vec_unchecked(vec![0xff]) };
5597        let res = fs.create(
5598            ctx,
5599            root_inode,
5600            &non_utf8_name,
5601            0o666,
5602            libc::O_RDWR as u32,
5603            0,
5604            None,
5605        );
5606
5607        // We expect it to be blocked by allowlist (EACCES) because the root directory itself is not
5608        // writable. However, due to the unwrap_or("<non UTF-8 path>") bug, it will match
5609        // the registered bypass path and bypass the allowlist check.
5610        assert!(res.is_err(), "Should fail because root is not writable");
5611        assert_eq!(
5612            res.unwrap_err().raw_os_error().unwrap(),
5613            libc::EILSEQ,
5614            "Should fail with EILSEQ for non-UTF8 name"
5615        );
5616    }
5617}