devices/virtio/fs/
passthrough.rs

1// Copyright 2019 The ChromiumOS Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use std::borrow::Cow;
6use std::cell::RefCell;
7use std::cmp;
8use std::collections::btree_map;
9use std::collections::BTreeMap;
10use std::ffi::CStr;
11use std::ffi::CString;
12#[cfg(feature = "fs_runtime_ugid_map")]
13use std::ffi::OsStr;
14use std::fs::File;
15use std::io;
16use std::mem;
17use std::mem::size_of;
18use std::mem::MaybeUninit;
19use std::os::raw::c_int;
20use std::os::raw::c_long;
21#[cfg(feature = "fs_runtime_ugid_map")]
22use std::os::unix::ffi::OsStrExt;
23#[cfg(feature = "fs_runtime_ugid_map")]
24use std::path::Path;
25use std::ptr;
26use std::ptr::addr_of;
27use std::ptr::addr_of_mut;
28use std::sync::atomic::AtomicBool;
29use std::sync::atomic::AtomicU64;
30use std::sync::atomic::Ordering;
31use std::sync::Arc;
32use std::sync::MutexGuard;
33#[cfg(feature = "fs_permission_translation")]
34use std::sync::RwLock;
35use std::time::Duration;
36
37#[cfg(feature = "arc_quota")]
38use base::debug;
39use base::error;
40use base::ioctl_ior_nr;
41use base::ioctl_iow_nr;
42use base::ioctl_iowr_nr;
43use base::ioctl_with_mut_ptr;
44use base::ioctl_with_ptr;
45use base::open_how;
46use base::openat2;
47use base::syscall;
48use base::unix::FileFlags;
49use base::warn;
50use base::AsRawDescriptor;
51use base::FromRawDescriptor;
52use base::IntoRawDescriptor;
53use base::IoctlNr;
54use base::Protection;
55use base::RawDescriptor;
56use fuse::filesystem::Context;
57use fuse::filesystem::DirectoryIterator;
58use fuse::filesystem::Entry;
59use fuse::filesystem::FileSystem;
60use fuse::filesystem::FsOptions;
61use fuse::filesystem::GetxattrReply;
62use fuse::filesystem::IoctlFlags;
63use fuse::filesystem::IoctlReply;
64use fuse::filesystem::ListxattrReply;
65use fuse::filesystem::OpenOptions;
66use fuse::filesystem::RemoveMappingOne;
67use fuse::filesystem::SetattrValid;
68use fuse::filesystem::ZeroCopyReader;
69use fuse::filesystem::ZeroCopyWriter;
70use fuse::filesystem::ROOT_ID;
71use fuse::sys::WRITE_KILL_PRIV;
72use fuse::Mapper;
73#[cfg(feature = "arc_quota")]
74use protobuf::Message;
75use sync::Mutex;
76#[cfg(feature = "arc_quota")]
77use system_api::client::OrgChromiumSpaced;
78#[cfg(feature = "arc_quota")]
79use system_api::spaced::SetProjectIdReply;
80#[cfg(feature = "arc_quota")]
81use system_api::spaced::SetProjectInheritanceFlagReply;
82use zerocopy::FromBytes;
83use zerocopy::FromZeros;
84use zerocopy::Immutable;
85use zerocopy::IntoBytes;
86use zerocopy::KnownLayout;
87
88#[cfg(feature = "arc_quota")]
89use crate::virtio::fs::arc_ioctl::FsPathXattrDataBuffer;
90#[cfg(feature = "arc_quota")]
91use crate::virtio::fs::arc_ioctl::FsPermissionDataBuffer;
92#[cfg(feature = "arc_quota")]
93use crate::virtio::fs::arc_ioctl::XattrData;
94use crate::virtio::fs::caps::Capability;
95use crate::virtio::fs::caps::Caps;
96use crate::virtio::fs::caps::Set as CapSet;
97use crate::virtio::fs::caps::Value as CapValue;
98use crate::virtio::fs::config::CachePolicy;
99use crate::virtio::fs::config::Config;
100#[cfg(feature = "fs_permission_translation")]
101use crate::virtio::fs::config::PermissionData;
102use crate::virtio::fs::expiring_map::ExpiringMap;
103use crate::virtio::fs::multikey::MultikeyBTreeMap;
104use crate::virtio::fs::read_dir::ReadDir;
105
// RESOLVE_* constants are missing in libc crate for some targets (e.g. Android).
// Define them here as they are stable Linux kernel API constants.
const RESOLVE_NO_MAGICLINKS: u64 = 0x02;
const RESOLVE_NO_SYMLINKS: u64 = 0x04;
const RESOLVE_IN_ROOT: u64 = 0x10;

// Empty path; passed together with `AT_EMPTY_PATH` so that a syscall operates on the
// descriptor itself (see `stat`).
const EMPTY_CSTR: &CStr = c"";
// Path that `PassthroughFs::new` opens to obtain its `/proc` descriptor.
const PROC_CSTR: &CStr = c"/proc";
const UNLABELED_CSTR: &CStr = c"unlabeled";

// Extended attribute names / name prefixes, kept as raw bytes.
const USER_VIRTIOFS_XATTR: &[u8] = b"user.virtiofs.";
const SECURITY_XATTR: &[u8] = b"security.";
const SELINUX_XATTR: &[u8] = b"security.selinux";

// Lengths of the key fields in `fscrypt_policy_v1` and `fscrypt_policy_v2` respectively.
const FSCRYPT_KEY_DESCRIPTOR_SIZE: usize = 8;
const FSCRYPT_KEY_IDENTIFIER_SIZE: usize = 16;

// NOTE(review): presumably the kernel's project-inheritance inode flag for use with the
// FS_IOC_{GET,SET}FLAGS ioctls -- confirm against the kernel headers.
#[cfg(feature = "arc_quota")]
const FS_PROJINHERIT_FL: c_int = 0x20000000;

// 25 seconds is the default timeout for dbus-send.
#[cfg(feature = "arc_quota")]
const DEFAULT_DBUS_TIMEOUT: Duration = Duration::from_secs(25);
129
/// Thin wrapper over `cros_tracing::trace_event!()` for VirtioFS calls.
///
/// Call it as `fs_trace!(tag, name, arg, ...)` with at least one trailing argument. The event
/// is always emitted in the `VirtioFs` category; note that `name` is forwarded ahead of `tag`.
macro_rules! fs_trace {
    ($fs_tag:expr, $event:expr, $($extra:expr),+) => {
        cros_tracing::trace_event!(VirtioFs, $event, $fs_tag, $($extra),*)
    };
}
136
/// Version 1 encryption policy record, laid out with C representation.
///
/// Every field is underscore-prefixed because nothing in this file reads it by name; the struct
/// exists to give `fscrypt_policy` the right size and layout.
/// NOTE(review): field order presumably mirrors the kernel's `struct fscrypt_policy_v1` --
/// confirm against `linux/fscrypt.h`.
#[repr(C)]
#[derive(Clone, Copy, FromBytes, Immutable, IntoBytes, KnownLayout)]
struct fscrypt_policy_v1 {
    _version: u8,
    _contents_encryption_mode: u8,
    _filenames_encryption_mode: u8,
    _flags: u8,
    // Key descriptor, `FSCRYPT_KEY_DESCRIPTOR_SIZE` bytes long.
    _master_key_descriptor: [u8; FSCRYPT_KEY_DESCRIPTOR_SIZE],
}
146
/// Version 2 encryption policy record, laid out with C representation.
///
/// Differs from `fscrypt_policy_v1` by four reserved bytes and a longer key field.
/// NOTE(review): field order presumably mirrors the kernel's `struct fscrypt_policy_v2` --
/// confirm against `linux/fscrypt.h`.
#[repr(C)]
#[derive(Clone, Copy, FromBytes, Immutable, IntoBytes, KnownLayout)]
struct fscrypt_policy_v2 {
    _version: u8,
    _contents_encryption_mode: u8,
    _filenames_encryption_mode: u8,
    _flags: u8,
    __reserved: [u8; 4],
    // Key identifier, `FSCRYPT_KEY_IDENTIFIER_SIZE` bytes long.
    master_key_identifier: [u8; FSCRYPT_KEY_IDENTIFIER_SIZE],
}
157
/// Storage large enough for either policy version.
///
/// All three members start at offset zero, so `_version` overlays the leading `_version` byte
/// of both `_v1` and `_v2`. The union's size bounds `policy_size` in
/// `fscrypt_get_policy_ex_arg`.
#[repr(C)]
#[derive(Copy, Clone, FromBytes, Immutable, KnownLayout)]
union fscrypt_policy {
    _version: u8,
    _v1: fscrypt_policy_v1,
    _v2: fscrypt_policy_v2,
}
165
/// Argument buffer for the `FS_IOC_GET_ENCRYPTION_POLICY_EX` ioctl.
///
/// `policy_size` is both an input and an output: it is the number of bytes of `policy` that
/// are meaningful. The `From<&fscrypt_get_policy_ex_arg> for &[u8]` conversion relies on it to
/// size the byte view it returns.
#[repr(C)]
#[derive(Copy, Clone, FromBytes, Immutable, KnownLayout)]
struct fscrypt_get_policy_ex_arg {
    policy_size: u64,       /* input/output */
    policy: fscrypt_policy, /* output */
}
172
173impl From<&fscrypt_get_policy_ex_arg> for &[u8] {
174    fn from(value: &fscrypt_get_policy_ex_arg) -> Self {
175        assert!(value.policy_size <= size_of::<fscrypt_policy>() as u64);
176        let data_raw: *const fscrypt_get_policy_ex_arg = value;
177        // SAFETY: the length of the output slice is asserted to be within the struct it points to
178        unsafe {
179            std::slice::from_raw_parts(
180                data_raw.cast(),
181                value.policy_size as usize + size_of::<u64>(),
182            )
183        }
184    }
185}
186
// Read/write ioctl number in the 'f' group, nr 22. The size encoded in the number is that of
// `[u8; 9]`, not of `fscrypt_get_policy_ex_arg`.
// NOTE(review): presumably this matches the kernel's `_IOWR('f', 22, __u8[9])` definition --
// confirm against `linux/fscrypt.h`.
ioctl_iowr_nr!(FS_IOC_GET_ENCRYPTION_POLICY_EX, 'f' as u32, 22, [u8; 9]);
188
/// Argument buffer for the `FS_IOC_FSGETXATTR` / `FS_IOC_FSSETXATTR` ioctls, laid out with C
/// representation. The per-field comments note which direction each field is used in.
#[repr(C)]
#[derive(Clone, Copy, FromBytes, Immutable, IntoBytes, KnownLayout)]
struct fsxattr {
    fsx_xflags: u32,     /* xflags field value (get/set) */
    fsx_extsize: u32,    /* extsize field value (get/set) */
    fsx_nextents: u32,   /* nextents field value (get) */
    fsx_projid: u32,     /* project identifier (get/set) */
    fsx_cowextsize: u32, /* CoW extsize field value (get/set) */
    fsx_pad: [u8; 8],
}
199
// Get/set ioctl numbers that take a `fsxattr` argument ('X' group, nr 31 and 32).
ioctl_ior_nr!(FS_IOC_FSGETXATTR, 'X' as u32, 31, fsxattr);
ioctl_iow_nr!(FS_IOC_FSSETXATTR, 'X' as u32, 32, fsxattr);

// Inode-flag get/set ioctl numbers ('f' group, nr 1 and 2). The three pairs below share the
// same group and nr and differ only in the argument size encoded into the number:
// `c_long`, `u32` and `u64`.
ioctl_ior_nr!(FS_IOC_GETFLAGS, 'f' as u32, 1, c_long);
ioctl_iow_nr!(FS_IOC_SETFLAGS, 'f' as u32, 2, c_long);

ioctl_ior_nr!(FS_IOC32_GETFLAGS, 'f' as u32, 1, u32);
ioctl_iow_nr!(FS_IOC32_SETFLAGS, 'f' as u32, 2, u32);

ioctl_ior_nr!(FS_IOC64_GETFLAGS, 'f' as u32, 1, u64);
ioctl_iow_nr!(FS_IOC64_SETFLAGS, 'f' as u32, 2, u64);

// Write ioctl numbers available only with the `arc_quota` feature. Both use group 'f' and
// nr 1, so they are told apart (from each other and from the *_GETFLAGS numbers above) only
// by direction and by the size of their argument type.
#[cfg(feature = "arc_quota")]
ioctl_iow_nr!(FS_IOC_SETPERMISSION, 'f' as u32, 1, FsPermissionDataBuffer);
#[cfg(feature = "arc_quota")]
ioctl_iow_nr!(FS_IOC_SETPATHXATTR, 'f' as u32, 1, FsPathXattrDataBuffer);
216
/// Argument buffer for the `FS_IOC_ENABLE_VERITY` ioctl, laid out with C representation.
///
/// `salt_ptr` / `sig_ptr` are carried as plain `u64` values, each paired with a `u32` size.
/// NOTE(review): presumably these are addresses of a salt and a signature buffer, as in the
/// kernel's `struct fsverity_enable_arg` -- confirm against `linux/fsverity.h`.
#[repr(C)]
#[derive(Clone, Copy, FromBytes, Immutable, IntoBytes, KnownLayout)]
struct fsverity_enable_arg {
    _version: u32,
    _hash_algorithm: u32,
    _block_size: u32,
    salt_size: u32,
    salt_ptr: u64,
    sig_size: u32,
    __reserved1: u32,
    sig_ptr: u64,
    __reserved2: [u64; 11],
}
230
/// Fixed-size header of the argument for the `FS_IOC_MEASURE_VERITY` ioctl.
///
/// Only the header is modelled here; as the trailing comment notes, the C definition ends in
/// a flexible `digest[]` array that is not part of this struct.
/// NOTE(review): presumably `digest_size` gives the length of those trailing bytes -- confirm.
#[repr(C)]
#[derive(Clone, Copy, FromBytes, Immutable, IntoBytes, KnownLayout)]
struct fsverity_digest {
    _digest_algorithm: u16,
    digest_size: u16,
    // __u8 digest[];
}
238
// fs-verity ioctl numbers ('f' group, nr 133 and 134): enabling is write-only, measuring is
// read/write. The size encoded for the latter is that of the `fsverity_digest` header only.
ioctl_iow_nr!(FS_IOC_ENABLE_VERITY, 'f' as u32, 133, fsverity_enable_arg);
ioctl_iowr_nr!(FS_IOC_MEASURE_VERITY, 'f' as u32, 134, fsverity_digest);
241
/// Inode number type; it is the primary key of `PassthroughFs::inodes`.
pub type Inode = u64;
// Handle number type; it is the key of `PassthroughFs::handles`.
type Handle = u64;
244
/// Alternate key for the `PassthroughFs::inodes` multi-key map: an inode number paired with a
/// device number.
#[derive(Clone, Copy, Debug, PartialOrd, Ord, PartialEq, Eq)]
struct InodeAltKey {
    ino: libc::ino64_t,
    dev: libc::dev_t,
}
250
251#[derive(PartialEq, Eq, Debug)]
252enum FileType {
253    Regular,
254    Directory,
255    Other,
256}
257
258impl From<libc::mode_t> for FileType {
259    fn from(mode: libc::mode_t) -> Self {
260        match mode & libc::S_IFMT {
261            libc::S_IFREG => FileType::Regular,
262            libc::S_IFDIR => FileType::Directory,
263            _ => FileType::Other,
264        }
265    }
266}
267
268#[derive(Debug)]
269struct OpenedFile {
270    file: Option<File>,
271    open_flags: libc::c_int,
272}
273
274impl AsRawDescriptor for OpenedFile {
275    fn as_raw_descriptor(&self) -> RawDescriptor {
276        self.file().as_raw_descriptor()
277    }
278}
279
280impl OpenedFile {
281    fn new(file: File, open_flags: libc::c_int) -> Self {
282        OpenedFile {
283            file: Some(file),
284            open_flags,
285        }
286    }
287
288    fn file(&self) -> &File {
289        self.file.as_ref().expect("must have a file")
290    }
291
292    fn file_mut(&mut self) -> &mut File {
293        self.file.as_mut().expect("must have a file")
294    }
295
296    /// Leaks the file descriptor and makes the struct unusable.
297    ///
298    /// This is an optimization to speed up dropping `OpenedFile` instances, which is useful
299    /// during an abrupt shutdown. Instead of properly closing the file descriptor, which
300    /// involves a syscall, this function effectively forgets the file descriptor, relying on the
301    /// OS to clean it up when the process terminates.
302    fn leak_fd(&mut self) {
303        let f = self.file.take().expect("must have a file");
304        let _ = f.into_raw_descriptor();
305    }
306}
307
308#[derive(Debug)]
309struct InodeData {
310    inode: Inode,
311    // (File, open_flags)
312    file: Mutex<OpenedFile>,
313    refcount: AtomicU64,
314    filetype: FileType,
315    path: String,
316    // This needs to be atomic because we need to set it through a shared reference.
317    unsafe_leak_fd: AtomicBool,
318}
319
320impl AsRawDescriptor for InodeData {
321    fn as_raw_descriptor(&self) -> RawDescriptor {
322        self.file.lock().as_raw_descriptor()
323    }
324}
325
326impl Drop for InodeData {
327    /// If `unsafe_leak_fd` is set, this `drop` implementation will "leak" the file descriptor.
328    /// This is an optimization to speed up the cleanup process, based on the
329    /// assumption that the OS will handle the cleanup of file descriptors after the process
330    /// terminates. This is only okay if the process is guaranteed to terminate immediately
331    /// after the `PassthroughFs` instance is dropped.
332    fn drop(&mut self) {
333        if self.unsafe_leak_fd.load(Ordering::Relaxed) {
334            self.file.get_mut().leak_fd();
335        }
336    }
337}
338
339impl InodeData {
340    fn set_unsafe_leak_fd(&self) {
341        self.unsafe_leak_fd.store(true, Ordering::Relaxed);
342    }
343}
344
345#[derive(Debug)]
346struct HandleData {
347    inode: Inode,
348    file: Mutex<OpenedFile>,
349
350    unsafe_leak_fd: AtomicBool,
351}
352
353impl AsRawDescriptor for HandleData {
354    fn as_raw_descriptor(&self) -> RawDescriptor {
355        self.file.lock().as_raw_descriptor()
356    }
357}
358
359impl Drop for HandleData {
360    /// If `unsafe_leak_fd` is set, this `drop` implementation will "leak" the file descriptor by
361    /// forgetting it. This is an optimization to speed up the cleanup process, based on the
362    /// assumption that the OS will handle the cleanup of file descriptors after the process
363    // terminates. This is only safe if the process is guaranteed to terminate immediately
364    /// after the `PassthroughFs` instance is dropped.
365    fn drop(&mut self) {
366        if self.unsafe_leak_fd.load(Ordering::Relaxed) {
367            self.file.get_mut().leak_fd();
368        }
369    }
370}
371
372impl HandleData {
373    fn set_unsafe_leak_fd(&self) {
374        self.unsafe_leak_fd.store(true, Ordering::Relaxed);
375    }
376}
377
378macro_rules! scoped_cred {
379    ($name:ident, $ty:ty, $syscall_nr:expr) => {
380        #[derive(Debug)]
381        struct $name {
382            old: $ty,
383        }
384
385        impl $name {
386            // Changes the effective uid/gid of the current thread to `val`. Changes the thread's
387            // credentials back to `old` when the returned struct is dropped.
388            fn new(val: $ty, old: $ty) -> io::Result<Option<$name>> {
389                if val == old {
390                    // Nothing to do since we already have the correct value.
391                    return Ok(None);
392                }
393
394                // We want credential changes to be per-thread because otherwise
395                // we might interfere with operations being carried out on other
396                // threads with different uids/gids.  However, posix requires that
397                // all threads in a process share the same credentials.  To do this
398                // libc uses signals to ensure that when one thread changes its
399                // credentials the other threads do the same thing.
400                //
401                // So instead we invoke the syscall directly in order to get around
402                // this limitation.  Another option is to use the setfsuid and
403                // setfsgid systems calls.   However since those calls have no way to
404                // return an error, it's preferable to do this instead.
405
406                // SAFETY: this call is safe because it doesn't modify any memory and we
407                // check the return value.
408                let res = unsafe { libc::syscall($syscall_nr, -1, val, -1) };
409                if res == 0 {
410                    Ok(Some($name { old }))
411                } else {
412                    Err(io::Error::last_os_error())
413                }
414            }
415        }
416
417        impl Drop for $name {
418            fn drop(&mut self) {
419                // SAFETY: trivially safe
420                let res = unsafe { libc::syscall($syscall_nr, -1, self.old, -1) };
421                if res < 0 {
422                    error!(
423                        "failed to change credentials back to {}: {}",
424                        self.old,
425                        io::Error::last_os_error(),
426                    );
427                }
428            }
429        }
430    };
431}
432scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid);
433scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid);
434
// Raw syscall numbers used below to query the calling thread's effective uid and gid.
const SYS_GETEUID: libc::c_long = libc::SYS_geteuid;
const SYS_GETEGID: libc::c_long = libc::SYS_getegid;

// Per-thread copies of the effective uid/gid, obtained through the raw syscalls above.
// `set_creds` reads them as the "old" credentials to switch back to.
thread_local! {
    // SAFETY: both calls take no parameters and only return an integer value. The kernel also
    // guarantees that they can never fail.
    static THREAD_EUID: libc::uid_t = unsafe { libc::syscall(SYS_GETEUID) as libc::uid_t };
    // SAFETY: both calls take no parameters and only return an integer value. The kernel also
    // guarantees that they can never fail.
    static THREAD_EGID: libc::gid_t = unsafe { libc::syscall(SYS_GETEGID) as libc::gid_t };
}
446
447fn set_creds(
448    uid: libc::uid_t,
449    gid: libc::gid_t,
450) -> io::Result<(Option<ScopedUid>, Option<ScopedGid>)> {
451    let olduid = THREAD_EUID.with(|uid| *uid);
452    let oldgid = THREAD_EGID.with(|gid| *gid);
453
454    // We have to change the gid before we change the uid because if we change the uid first then we
455    // lose the capability to change the gid.  However changing back can happen in any order.
456    ScopedGid::new(gid, oldgid).and_then(|gid| Ok((ScopedUid::new(uid, olduid)?, gid)))
457}
458
// Per-thread handle to `/proc/thread-self/attr/fscreate`. Starts out as `None` and is opened
// on first use by `ScopedSecurityContext::new`.
thread_local!(static THREAD_FSCREATE: RefCell<Option<File>> = const { RefCell::new(None) });
460
461// Opens and returns a write-only handle to /proc/thread-self/attr/fscreate. Panics if it fails to
462// open the file.
463fn open_fscreate(proc: &File) -> File {
464    let fscreate = c"thread-self/attr/fscreate";
465
466    // SAFETY: this doesn't modify any memory and we check the return value.
467    let raw_descriptor = unsafe {
468        libc::openat(
469            proc.as_raw_descriptor(),
470            fscreate.as_ptr(),
471            libc::O_CLOEXEC | libc::O_WRONLY,
472        )
473    };
474
475    // We don't expect this to fail and we're not in a position to return an error here so just
476    // panic.
477    if raw_descriptor < 0 {
478        panic!(
479            "Failed to open /proc/thread-self/attr/fscreate: {}",
480            io::Error::last_os_error()
481        );
482    }
483
484    // SAFETY: safe because we just opened this descriptor.
485    unsafe { File::from_raw_descriptor(raw_descriptor) }
486}
487
488struct ScopedSecurityContext;
489
490impl ScopedSecurityContext {
491    fn new(proc: &File, ctx: &CStr) -> io::Result<ScopedSecurityContext> {
492        THREAD_FSCREATE.with(|thread_fscreate| {
493            let mut fscreate = thread_fscreate.borrow_mut();
494            let file = fscreate.get_or_insert_with(|| open_fscreate(proc));
495            // SAFETY: this doesn't modify any memory and we check the return value.
496            let ret = unsafe {
497                libc::write(
498                    file.as_raw_descriptor(),
499                    ctx.as_ptr() as *const libc::c_void,
500                    ctx.to_bytes_with_nul().len(),
501                )
502            };
503            if ret < 0 {
504                Err(io::Error::last_os_error())
505            } else {
506                Ok(ScopedSecurityContext)
507            }
508        })
509    }
510}
511
512impl Drop for ScopedSecurityContext {
513    fn drop(&mut self) {
514        THREAD_FSCREATE.with(|thread_fscreate| {
515            // expect is safe here because the thread local would have been initialized by the call
516            // to `new` above.
517            let fscreate = thread_fscreate.borrow();
518            let file = fscreate
519                .as_ref()
520                .expect("Uninitialized thread-local when dropping ScopedSecurityContext");
521
522            // SAFETY: this doesn't modify any memory and we check the return value.
523            let ret = unsafe { libc::write(file.as_raw_descriptor(), ptr::null(), 0) };
524
525            if ret < 0 {
526                warn!(
527                    "Failed to restore security context: {}",
528                    io::Error::last_os_error()
529                );
530            }
531        })
532    }
533}
534
535struct ScopedUmask {
536    old: libc::mode_t,
537    mask: libc::mode_t,
538}
539
540impl ScopedUmask {
541    fn new(mask: libc::mode_t) -> ScopedUmask {
542        ScopedUmask {
543            // SAFETY: this doesn't modify any memory and always succeeds.
544            old: unsafe { libc::umask(mask) },
545            mask,
546        }
547    }
548}
549
550impl Drop for ScopedUmask {
551    fn drop(&mut self) {
552        // SAFETY: this doesn't modify any memory and always succeeds.
553        let previous = unsafe { libc::umask(self.old) };
554        debug_assert_eq!(
555            previous, self.mask,
556            "umask changed while holding ScopedUmask"
557        );
558    }
559}
560
561struct ScopedFsetid(Caps);
562impl Drop for ScopedFsetid {
563    fn drop(&mut self) {
564        if let Err(e) = raise_cap_fsetid(&mut self.0) {
565            error!(
566                "Failed to restore CAP_FSETID: {}.  Some operations may be broken.",
567                e
568            )
569        }
570    }
571}
572
573fn raise_cap_fsetid(c: &mut Caps) -> io::Result<()> {
574    c.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Set)?;
575    c.apply()
576}
577
578// Drops CAP_FSETID from the effective set for the current thread and returns an RAII guard that
579// adds the capability back when it is dropped.
580fn drop_cap_fsetid() -> io::Result<ScopedFsetid> {
581    let mut caps = Caps::for_current_thread()?;
582    caps.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Clear)?;
583    caps.apply()?;
584    Ok(ScopedFsetid(caps))
585}
586
587fn ebadf() -> io::Error {
588    io::Error::from_raw_os_error(libc::EBADF)
589}
590
591fn eexist() -> io::Error {
592    io::Error::from_raw_os_error(libc::EEXIST)
593}
594
595fn stat<F: AsRawDescriptor + ?Sized>(f: &F) -> io::Result<libc::stat64> {
596    let mut st: MaybeUninit<libc::stat64> = MaybeUninit::<libc::stat64>::zeroed();
597
598    // SAFETY: the kernel will only write data in `st` and we check the return value.
599    syscall!(unsafe {
600        libc::fstatat64(
601            f.as_raw_descriptor(),
602            EMPTY_CSTR.as_ptr(),
603            st.as_mut_ptr(),
604            libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
605        )
606    })?;
607
608    // SAFETY: the kernel guarantees that the struct is now fully initialized.
609    Ok(unsafe { st.assume_init() })
610}
611
612fn validate_path_component(name: &CStr) -> io::Result<()> {
613    let bytes = name.to_bytes();
614    if bytes == b".." || (bytes.contains(&b'/') && bytes != b"/") {
615        return Err(io::Error::from_raw_os_error(libc::EINVAL));
616    }
617    Ok(())
618}
619
620/// A safe wrapper around `openat2` with a fallback to `openat64` for backward compatibility.
621///
622/// It attempts to use `openat2` to leverage secure path resolution flags (like `RESOLVE_IN_ROOT`).
623/// If `openat2` is not supported by the kernel (returns `ENOSYS`, e.g. on kernels older than 5.6),
624/// it falls back to standard path resolution using `openat64` to allow operation on older
625/// platforms.
626fn safe_openat2<D: AsRawDescriptor>(
627    dir: &D,
628    name: &CStr,
629    flags: libc::c_int,
630    mode: Option<libc::mode_t>,
631    resolve: u64,
632) -> io::Result<File> {
633    let mut how = open_how {
634        flags: flags as u64,
635        resolve,
636        ..Default::default()
637    };
638    if let Some(m) = mode {
639        how.mode = (m & 0o7777) as u64;
640    }
641
642    let res = openat2(dir, name, &how);
643    match res {
644        Ok(file) => Ok(file),
645        Err(e) if e.errno() == libc::ENOSYS => {
646            // Fallback to openat64 if openat2 is not supported.
647            let fd = if let Some(m) = mode {
648                // SAFETY: openat64 doesn't modify any memory and we check the return value.
649                syscall!(unsafe {
650                    libc::openat64(dir.as_raw_descriptor(), name.as_ptr(), flags, m)
651                })
652            } else {
653                // SAFETY: openat64 doesn't modify any memory and we check the return value.
654                syscall!(unsafe { libc::openat64(dir.as_raw_descriptor(), name.as_ptr(), flags) })
655            }?;
656            // SAFETY: safe because we own the fd.
657            Ok(unsafe { File::from_raw_descriptor(fd) })
658        }
659        Err(e) => Err(e.into()),
660    }
661}
662
663#[cfg(feature = "arc_quota")]
/// Returns whether `project_id` lies in one of the project ID ranges used by Android.
///
/// The ranges are taken from android_filesystem_config.h in the Android codebase:
///
/// * `1000..=1099`: IDs reserved for Android files on external storage, i.e. 100 IDs starting
///   at PROJECT_ID_EXT_DEFAULT (1000).
/// * `20000..=69999`: IDs reserved for Android apps. The lower bound is
///   PROJECT_ID_EXT_DATA_START. The upper bound differs before and after T; the one of T
///   (PROJECT_ID_APP_CACHE_END) is used because it is larger.
fn is_android_project_id(project_id: u32) -> bool {
    matches!(project_id, 1000..=1099 | 20000..=69999)
}
681
682/// Per-directory cache for `PassthroughFs::ascii_casefold_lookup()`.
683///
684/// The key of the underlying `BTreeMap` is a lower-cased file name in the direcoty.
685/// The value is the case-sensitive file name stored in the host file system.
686/// We assume that if PassthroughFs has exclusive access to the filesystem, this cache exhaustively
687///  covers all file names that exist within the directory.
688/// So every `PassthroughFs`'s handler that adds or removes files in the directory is expected to
689/// update this cache.
690struct CasefoldCache(BTreeMap<Vec<u8>, CString>);
691
692impl CasefoldCache {
693    fn new(dir: &InodeData) -> io::Result<Self> {
694        let mut mp = BTreeMap::new();
695
696        let mut buf = [0u8; 1024];
697        let mut offset = 0;
698        loop {
699            let mut read_dir = ReadDir::new(dir, offset, &mut buf[..])?;
700            if read_dir.remaining() == 0 {
701                break;
702            }
703
704            while let Some(entry) = read_dir.next() {
705                offset = entry.offset as libc::off64_t;
706                let entry_name = entry.name;
707                mp.insert(
708                    entry_name.to_bytes().to_ascii_lowercase(),
709                    entry_name.to_owned(),
710                );
711            }
712        }
713        Ok(Self(mp))
714    }
715
716    fn insert(&mut self, name: &CStr) {
717        let lower_case = name.to_bytes().to_ascii_lowercase();
718        self.0.insert(lower_case, name.into());
719    }
720
721    fn lookup(&self, name: &[u8]) -> Option<CString> {
722        let lower = name.to_ascii_lowercase();
723        self.0.get(&lower).cloned()
724    }
725
726    fn remove(&mut self, name: &CStr) {
727        let lower_case = name.to_bytes().to_ascii_lowercase();
728        self.0.remove(&lower_case);
729    }
730}
731
732/// Time expiring mapping from an inode of a directory to `CasefoldCache` for the directory.
733/// Each entry will be expired after `timeout`.
734/// When ascii_casefold is disabled, this struct does nothing.
735struct ExpiringCasefoldLookupCaches {
736    inner: ExpiringMap<Inode, CasefoldCache>,
737}
738
739impl ExpiringCasefoldLookupCaches {
740    fn new(timeout: Duration) -> Self {
741        Self {
742            inner: ExpiringMap::new(timeout),
743        }
744    }
745
746    fn insert(&mut self, parent: Inode, name: &CStr) {
747        if let Some(dir_cache) = self.inner.get_mut(&parent) {
748            dir_cache.insert(name);
749        }
750    }
751
752    fn remove(&mut self, parent: Inode, name: &CStr) {
753        if let Some(dir_cache) = self.inner.get_mut(&parent) {
754            dir_cache.remove(name);
755        }
756    }
757
758    fn forget(&mut self, parent: Inode) {
759        self.inner.remove(&parent);
760    }
761
762    /// Get `CasefoldCache` for the given directory.
763    /// If the cache doesn't exist, generate it by fetching directory information with
764    /// `getdents64()`.
765    fn get(&mut self, parent: &InodeData) -> io::Result<&CasefoldCache> {
766        self.inner
767            .get_or_insert_with(&parent.inode, || CasefoldCache::new(parent))
768    }
769
770    #[cfg(test)]
771    fn exists_in_cache(&mut self, parent: Inode, name: &CStr) -> bool {
772        if let Some(dir_cache) = self.inner.get(&parent) {
773            dir_cache.lookup(name.to_bytes()).is_some()
774        } else {
775            false
776        }
777    }
778}
779
#[cfg(feature = "fs_permission_translation")]
impl PermissionData {
    /// Returns whether `path` starts with this entry's `perm_path`.
    ///
    /// NOTE(review): assuming `perm_path` is a string, this is a plain prefix test and not a
    /// path-component match, so `/a/bc` would also match a `perm_path` of `/a/b` -- confirm
    /// that this is intended.
    pub(crate) fn need_set_permission(&self, path: &str) -> bool {
        path.starts_with(&self.perm_path)
    }
}
786
/// A file system that simply "passes through" all requests it receives to the underlying file
/// system. To keep the implementation simple it serves the contents of its root directory. Users
/// that wish to serve only a specific directory should set up the environment so that that
/// directory ends up as the root of the file system process. One way to accomplish this is via a
/// combination of mount namespaces and the pivot_root system call.
///
/// # Safety
///
/// The `Drop` implementation for this struct intentionally leaks all open file
/// descriptors. It is **critical** that an instance of `PassthroughFs` is
/// only dropped immediately prior to process termination. Failure to uphold
/// this invariant **will** result in resource leaks. This is a deliberate
/// performance optimization for abrupt shutdowns, where we let the OS
/// handle resource cleanup.
pub struct PassthroughFs {
    // Mutex that must be acquired before executing a process-wide operation such as fchdir.
    process_lock: Mutex<()>,
    // virtio-fs tag that the guest uses when mounting. This is only used for debugging
    // when tracing is enabled.
    tag: String,

    // File descriptors for various points in the file system tree.
    inodes: Mutex<MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>>,
    next_inode: AtomicU64,

    // File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be
    // used for reading and writing data.
    handles: Mutex<BTreeMap<Handle, Arc<HandleData>>>,
    next_handle: AtomicU64,

    // File descriptor pointing to the `/proc` directory. This is used to convert an fd from
    // `inodes` into one that can go into `handles`. This is accomplished by reading the
    // `self/fd/{}` symlink. We keep an open fd here in case the file system tree that we are meant
    // to be serving doesn't have access to `/proc`.
    proc: File,

    // Whether writeback caching is enabled for this directory. This will only be true when
    // `cfg.writeback` is true and `init` was called with `FsOptions::WRITEBACK_CACHE`.
    writeback: AtomicBool,

    // Whether zero message opens are supported by the kernel driver.
    zero_message_open: AtomicBool,

    // Whether zero message opendir is supported by the kernel driver.
    zero_message_opendir: AtomicBool,

    // Used to communicate with other processes using D-Bus.
    #[cfg(feature = "arc_quota")]
    dbus_connection: Option<Mutex<dbus::blocking::Connection>>,
    // Watch fd of the D-Bus channel behind `dbus_connection`, captured in `new`.
    #[cfg(feature = "arc_quota")]
    dbus_fd: Option<std::os::unix::io::RawFd>,

    // Time-expiring cache for `ascii_casefold_lookup()`.
    // The key is an inode of a directory, and the value is a cache for the directory.
    // Each value will be expired `cfg.timeout` after it's created.
    //
    // TODO(b/267748212): Instead of per-device Mutex, we might want to have per-directory Mutex
    // if we use PassthroughFs in multi-threaded environments.
    expiring_casefold_lookup_caches: Option<Mutex<ExpiringCasefoldLookupCaches>>,

    // Paths and corresponding permission settings set by the `crosvm_client_fs_permission_set`
    // API.
    #[cfg(feature = "fs_permission_translation")]
    permission_paths: RwLock<Vec<PermissionData>>,

    // Paths and corresponding xattr settings set by the `crosvm_client_fs_xattr_set` API.
    #[cfg(feature = "arc_quota")]
    xattr_paths: RwLock<Vec<XattrData>>,

    cfg: Config,

    // Set the root directory when pivot root isn't enabled for jailed process.
    //
    // virtio-fs typically uses mount namespaces and pivot_root for file system isolation,
    // making the jailed process's root directory "/".
    //
    // However, Android's security model prevents crosvm from having the necessary SYS_ADMIN
    // capability for mount namespaces and pivot_root. This lack of isolation means that
    // root_dir defaults to the path provided via "--shared-dir".
    root_dir: String,
}
867
868impl std::fmt::Debug for PassthroughFs {
869    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
870        f.debug_struct("PassthroughFs")
871            .field("tag", &self.tag)
872            .field("next_inode", &self.next_inode)
873            .field("next_handle", &self.next_handle)
874            .field("proc", &self.proc)
875            .field("writeback", &self.writeback)
876            .field("zero_message_open", &self.zero_message_open)
877            .field("zero_message_opendir", &self.zero_message_opendir)
878            .field("cfg", &self.cfg)
879            .finish()
880    }
881}
882
883impl PassthroughFs {
884    pub fn new(tag: &str, cfg: Config) -> io::Result<PassthroughFs> {
885        // SAFETY: this doesn't modify any memory and we check the return value.
886        let raw_descriptor = syscall!(unsafe {
887            libc::openat64(
888                libc::AT_FDCWD,
889                PROC_CSTR.as_ptr(),
890                libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC,
891            )
892        })?;
893
894        // Privileged UIDs can use D-Bus to perform some operations.
895        #[cfg(feature = "arc_quota")]
896        let (dbus_connection, dbus_fd) = if cfg.privileged_quota_uids.is_empty() {
897            (None, None)
898        } else {
899            let mut channel = dbus::channel::Channel::get_private(dbus::channel::BusType::System)
900                .map_err(io::Error::other)?;
901            channel.set_watch_enabled(true);
902            let dbus_fd = channel.watch().fd;
903            channel.set_watch_enabled(false);
904            (
905                Some(Mutex::new(dbus::blocking::Connection::from(channel))),
906                Some(dbus_fd),
907            )
908        };
909
910        // SAFETY: safe because we just opened this descriptor.
911        let proc = unsafe { File::from_raw_descriptor(raw_descriptor) };
912
913        let expiring_casefold_lookup_caches = if cfg.ascii_casefold {
914            Some(Mutex::new(ExpiringCasefoldLookupCaches::new(cfg.timeout)))
915        } else {
916            None
917        };
918
919        #[allow(unused_mut)]
920        let mut passthroughfs = PassthroughFs {
921            process_lock: Mutex::new(()),
922            tag: tag.to_string(),
923            inodes: Mutex::new(MultikeyBTreeMap::new()),
924            next_inode: AtomicU64::new(ROOT_ID + 1),
925
926            handles: Mutex::new(BTreeMap::new()),
927            next_handle: AtomicU64::new(1),
928
929            proc,
930
931            writeback: AtomicBool::new(false),
932            zero_message_open: AtomicBool::new(false),
933            zero_message_opendir: AtomicBool::new(false),
934
935            #[cfg(feature = "arc_quota")]
936            dbus_connection,
937            #[cfg(feature = "arc_quota")]
938            dbus_fd,
939            expiring_casefold_lookup_caches,
940            #[cfg(feature = "fs_permission_translation")]
941            permission_paths: RwLock::new(Vec::new()),
942            #[cfg(feature = "arc_quota")]
943            xattr_paths: RwLock::new(Vec::new()),
944            cfg,
945            root_dir: "/".to_string(),
946        };
947
948        #[cfg(feature = "fs_runtime_ugid_map")]
949        passthroughfs.set_permission_path();
950
951        cros_tracing::trace_simple_print!(
952            VirtioFs,
953            "New PassthroughFS initialized: {:?}",
954            passthroughfs
955        );
956        Ok(passthroughfs)
957    }
958
959    #[cfg(feature = "fs_runtime_ugid_map")]
960    fn set_permission_path(&mut self) {
961        if !self.cfg.ugid_map.is_empty() {
962            let mut write_lock = self
963                .permission_paths
964                .write()
965                .expect("Failed to acquire write lock on permission_paths");
966            *write_lock = self.cfg.ugid_map.clone();
967        }
968    }
969
970    pub fn set_root_dir(&mut self, shared_dir: String) -> io::Result<()> {
971        let canonicalized_root = match std::fs::canonicalize(shared_dir) {
972            Ok(path) => path,
973            Err(e) => {
974                return Err(io::Error::new(
975                    io::ErrorKind::InvalidInput,
976                    format!("Failed to canonicalize root_dir: {e}"),
977                ));
978            }
979        };
980        self.root_dir = canonicalized_root.to_string_lossy().to_string();
981        Ok(())
982    }
983
984    pub fn cfg(&self) -> &Config {
985        &self.cfg
986    }
987
988    pub fn keep_rds(&self) -> Vec<RawDescriptor> {
989        #[cfg_attr(not(feature = "arc_quota"), allow(unused_mut))]
990        let mut keep_rds = vec![self.proc.as_raw_descriptor()];
991        #[cfg(feature = "arc_quota")]
992        if let Some(fd) = self.dbus_fd {
993            keep_rds.push(fd);
994        }
995        keep_rds
996    }
997
998    fn rewrite_xattr_name<'xattr>(&self, name: &'xattr CStr) -> Cow<'xattr, CStr> {
999        if !self.cfg.rewrite_security_xattrs {
1000            return Cow::Borrowed(name);
1001        }
1002
1003        // Does not include nul-terminator.
1004        let buf = name.to_bytes();
1005        if !buf.starts_with(SECURITY_XATTR) || buf == SELINUX_XATTR {
1006            return Cow::Borrowed(name);
1007        }
1008
1009        let mut newname = USER_VIRTIOFS_XATTR.to_vec();
1010        newname.extend_from_slice(buf);
1011
1012        // The unwrap is safe here because the prefix doesn't contain any interior nul-bytes and the
1013        // to_bytes() call above will not return a byte slice with any interior nul-bytes either.
1014        Cow::Owned(CString::new(newname).expect("Failed to re-write xattr name"))
1015    }
1016
1017    fn find_inode(&self, inode: Inode) -> io::Result<Arc<InodeData>> {
1018        self.inodes.lock().get(&inode).cloned().ok_or_else(ebadf)
1019    }
1020
1021    fn find_handle(&self, handle: Handle, inode: Inode) -> io::Result<Arc<HandleData>> {
1022        self.handles
1023            .lock()
1024            .get(&handle)
1025            .filter(|hd| hd.inode == inode)
1026            .cloned()
1027            .ok_or_else(ebadf)
1028    }
1029
1030    fn open_fd(&self, fd: RawDescriptor, flags: i32) -> io::Result<File> {
1031        let pathname = CString::new(format!("self/fd/{fd}"))
1032            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1033
1034        // SAFETY: this doesn't modify any memory and we check the return value. We don't really
1035        // check `flags` because if the kernel can't handle poorly specified flags then we have
1036        // much bigger problems. Also, clear the `O_NOFOLLOW` flag if it is set since we need
1037        // to follow the `/proc/self/fd` symlink to get the file.
1038        let raw_descriptor = syscall!(unsafe {
1039            libc::openat64(
1040                self.proc.as_raw_descriptor(),
1041                pathname.as_ptr(),
1042                (flags | libc::O_CLOEXEC) & !(libc::O_NOFOLLOW | libc::O_DIRECT),
1043            )
1044        })?;
1045
1046        // SAFETY: safe because we just opened this descriptor.
1047        Ok(unsafe { File::from_raw_descriptor(raw_descriptor) })
1048    }
1049
1050    /// Modifies the provided open flags based on the writeback caching configuration.
1051    /// Return the updated open flags.
1052    fn update_open_flags(&self, mut flags: i32) -> i32 {
1053        // When writeback caching is enabled, the kernel may send read requests even if the
1054        // userspace program opened the file write-only. So we need to ensure that we have opened
1055        // the file for reading as well as writing.
1056        let writeback = self.writeback.load(Ordering::Relaxed);
1057        if writeback && flags & libc::O_ACCMODE == libc::O_WRONLY {
1058            flags &= !libc::O_ACCMODE;
1059            flags |= libc::O_RDWR;
1060        }
1061
1062        // When writeback caching is enabled the kernel is responsible for handling `O_APPEND`.
1063        // However, this breaks atomicity as the file may have changed on disk, invalidating the
1064        // cached copy of the data in the kernel and the offset that the kernel thinks is the end of
1065        // the file. Just allow this for now as it is the user's responsibility to enable writeback
1066        // caching only for directories that are not shared. It also means that we need to clear the
1067        // `O_APPEND` flag.
1068        if writeback && flags & libc::O_APPEND != 0 {
1069            flags &= !libc::O_APPEND;
1070        }
1071
1072        flags
1073    }
1074
1075    fn open_inode(&self, inode: &InodeData, mut flags: i32) -> io::Result<File> {
1076        // handle writeback caching cases
1077        flags = self.update_open_flags(flags);
1078
1079        self.open_fd(inode.as_raw_descriptor(), flags)
1080    }
1081
1082    // Increases the inode refcount and returns the inode.
1083    fn increase_inode_refcount(&self, inode_data: &InodeData) -> Inode {
1084        // Matches with the release store in `forget`.
1085        inode_data.refcount.fetch_add(1, Ordering::Acquire);
1086        inode_data.inode
1087    }
1088
1089    // Creates a new entry for `f` or increases the refcount of the existing entry for `f`.
1090    // The inodes mutex lock must not be already taken by the same thread otherwise this
1091    // will deadlock.
1092    fn add_entry(
1093        &self,
1094        f: File,
1095        #[cfg_attr(not(feature = "fs_permission_translation"), allow(unused_mut))]
1096        mut st: libc::stat64,
1097        open_flags: libc::c_int,
1098        path: String,
1099    ) -> Entry {
1100        #[cfg(feature = "arc_quota")]
1101        self.set_permission(&mut st, &path);
1102        #[cfg(feature = "fs_runtime_ugid_map")]
1103        self.set_ugid_permission(&mut st, &path);
1104        let mut inodes = self.inodes.lock();
1105
1106        let altkey = InodeAltKey {
1107            ino: st.st_ino,
1108            dev: st.st_dev,
1109        };
1110
1111        let inode = if let Some(data) = inodes.get_alt(&altkey) {
1112            self.increase_inode_refcount(data)
1113        } else {
1114            let inode = self.next_inode.fetch_add(1, Ordering::Relaxed);
1115            inodes.insert(
1116                inode,
1117                altkey,
1118                Arc::new(InodeData {
1119                    inode,
1120                    file: Mutex::new(OpenedFile::new(f, open_flags)),
1121                    refcount: AtomicU64::new(1),
1122                    filetype: st.st_mode.into(),
1123                    path,
1124                    unsafe_leak_fd: AtomicBool::new(false),
1125                }),
1126            );
1127
1128            inode
1129        };
1130
1131        Entry {
1132            inode,
1133            generation: 0,
1134            attr: st,
1135            // We use the same timeout for the attribute and the entry.
1136            attr_timeout: self.cfg.timeout,
1137            entry_timeout: self.cfg.timeout,
1138        }
1139    }
1140
1141    /// Acquires lock of `expiring_casefold_lookup_caches` if `ascii_casefold` is enabled.
1142    fn lock_casefold_lookup_caches(&self) -> Option<MutexGuard<'_, ExpiringCasefoldLookupCaches>> {
1143        self.expiring_casefold_lookup_caches
1144            .as_ref()
1145            .map(|c| c.lock())
1146    }
1147
1148    // Returns an actual case-sensitive file name that matches with the given `name`.
1149    // Returns `Ok(None)` if no file matches with the give `name`.
1150    // This function will panic if casefold is not enabled.
1151    fn get_case_unfolded_name(
1152        &self,
1153        parent: &InodeData,
1154        name: &[u8],
1155    ) -> io::Result<Option<CString>> {
1156        let mut caches = self
1157            .lock_casefold_lookup_caches()
1158            .expect("casefold must be enabled");
1159        let dir_cache = caches.get(parent)?;
1160        Ok(dir_cache.lookup(name))
1161    }
1162
1163    // Performs an ascii case insensitive lookup.
1164    fn ascii_casefold_lookup(&self, parent: &InodeData, name: &[u8]) -> io::Result<Entry> {
1165        match self.get_case_unfolded_name(parent, name)? {
1166            None => Err(io::Error::from_raw_os_error(libc::ENOENT)),
1167            Some(actual_name) => self.do_lookup(parent, &actual_name),
1168        }
1169    }
1170
1171    #[cfg(test)]
1172    fn exists_in_casefold_cache(&self, parent: Inode, name: &CStr) -> bool {
1173        let mut cache = self
1174            .lock_casefold_lookup_caches()
1175            .expect("casefold must be enabled");
1176        cache.exists_in_cache(parent, name)
1177    }
1178
1179    fn do_lookup(&self, parent: &InodeData, name: &CStr) -> io::Result<Entry> {
1180        let path_file = safe_openat2(
1181            parent,
1182            name,
1183            libc::O_PATH | libc::O_CLOEXEC | libc::O_NOFOLLOW,
1184            None,
1185            RESOLVE_IN_ROOT | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS,
1186        )?;
1187
1188        #[allow(unused_mut)]
1189        let mut st = stat(&path_file)?;
1190
1191        let altkey = InodeAltKey {
1192            ino: st.st_ino,
1193            dev: st.st_dev,
1194        };
1195
1196        let path = format!(
1197            "{}/{}",
1198            parent.path.clone(),
1199            name.to_str().unwrap_or("<non UTF-8 str>")
1200        );
1201
1202        // Check if we already have an entry before opening a new file.
1203        if let Some(data) = self.inodes.lock().get_alt(&altkey) {
1204            // Return the same inode with the reference counter increased.
1205            #[cfg(feature = "arc_quota")]
1206            self.set_permission(&mut st, &path);
1207            #[cfg(feature = "fs_runtime_ugid_map")]
1208            self.set_ugid_permission(&mut st, &path);
1209            return Ok(Entry {
1210                inode: self.increase_inode_refcount(data),
1211                generation: 0,
1212                attr: st,
1213                // We use the same timeout for the attribute and the entry.
1214                attr_timeout: self.cfg.timeout,
1215                entry_timeout: self.cfg.timeout,
1216            });
1217        }
1218
1219        // Now we need to get a file descriptor that can be used for operations
1220        // that don't support O_PATH. We try to open it with O_RDONLY or O_DIRECTORY
1221        // first.
1222        let mut flags = libc::O_RDONLY | libc::O_CLOEXEC;
1223        match FileType::from(st.st_mode) {
1224            FileType::Regular => {}
1225            FileType::Directory => flags |= libc::O_DIRECTORY,
1226            FileType::Other => flags |= libc::O_PATH,
1227        };
1228
1229        // We use /proc/self/fd/{path_fd} to open the file again with full permissions.
1230        // This is safe because we resolved the path securely above.
1231        let pathname = CString::new(format!("self/fd/{}", path_file.as_raw_descriptor()))
1232            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1233
1234        // SAFETY: this doesn't modify any memory and we check the return value.
1235        let fd = match syscall!(unsafe {
1236            libc::openat64(self.proc.as_raw_descriptor(), pathname.as_ptr(), flags)
1237        }) {
1238            Ok(fd) => fd,
1239            Err(e) if e.errno() == libc::EACCES => {
1240                // Fall back to O_PATH if we can't read it.
1241                flags |= libc::O_PATH;
1242                // SAFETY: this doesn't modify any memory and we check the return value.
1243                syscall!(unsafe {
1244                    libc::openat64(self.proc.as_raw_descriptor(), pathname.as_ptr(), flags)
1245                })?
1246            }
1247            Err(e) => return Err(e.into()),
1248        };
1249
1250        // SAFETY: safe because we own the fd.
1251        let f = unsafe { File::from_raw_descriptor(fd) };
1252        flags |= libc::O_NOFOLLOW;
1253        Ok(self.add_entry(f, st, flags, path))
1254    }
1255
1256    fn get_cache_open_options(&self, flags: u32) -> OpenOptions {
1257        let mut opts = OpenOptions::empty();
1258        match self.cfg.cache_policy {
1259            // We only set the direct I/O option on files.
1260            CachePolicy::Never => opts.set(
1261                OpenOptions::DIRECT_IO,
1262                flags & (libc::O_DIRECTORY as u32) == 0,
1263            ),
1264            CachePolicy::Always => {
1265                opts |= if flags & (libc::O_DIRECTORY as u32) == 0 {
1266                    OpenOptions::KEEP_CACHE
1267                } else {
1268                    OpenOptions::CACHE_DIR
1269                }
1270            }
1271            _ => {}
1272        };
1273        opts
1274    }
1275
1276    // Performs lookup using original name first, if it fails and ascii_casefold is enabled,
1277    // it tries to unfold the name and do lookup again.
1278    fn do_lookup_with_casefold_fallback(
1279        &self,
1280        parent: &InodeData,
1281        name: &CStr,
1282    ) -> io::Result<Entry> {
1283        let mut res = self.do_lookup(parent, name);
1284        // If `ascii_casefold` is enabled, fallback to `ascii_casefold_lookup()`.
1285        if res.is_err() && self.cfg.ascii_casefold {
1286            res = self.ascii_casefold_lookup(parent, name.to_bytes());
1287        }
1288        res
1289    }
1290
1291    fn do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)> {
1292        let inode_data = self.find_inode(inode)?;
1293
1294        let file = self.open_inode(&inode_data, flags as i32)?;
1295
1296        let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
1297        let data = HandleData {
1298            inode,
1299            file: Mutex::new(OpenedFile::new(file, flags as i32)),
1300            unsafe_leak_fd: AtomicBool::new(false),
1301        };
1302
1303        self.handles.lock().insert(handle, Arc::new(data));
1304
1305        let opts = self.get_cache_open_options(flags);
1306
1307        Ok((Some(handle), opts))
1308    }
1309
1310    fn do_open_at(
1311        &self,
1312        parent_data: Arc<InodeData>,
1313        name: &CStr,
1314        inode: Inode,
1315        flags: u32,
1316    ) -> io::Result<(Option<Handle>, OpenOptions)> {
1317        let open_flags = self.update_open_flags(flags as i32);
1318
1319        let fd_open = syscall!(
1320            // SAFETY: return value is checked.
1321            unsafe {
1322                libc::openat64(
1323                    parent_data.as_raw_descriptor(),
1324                    name.as_ptr(),
1325                    (open_flags | libc::O_CLOEXEC) & !(libc::O_NOFOLLOW | libc::O_DIRECT),
1326                )
1327            }
1328        )?;
1329
1330        // SAFETY: fd_open is valid
1331        let file_open = unsafe { File::from_raw_descriptor(fd_open) };
1332        let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
1333        let data = HandleData {
1334            inode,
1335            file: Mutex::new(OpenedFile::new(file_open, open_flags)),
1336            unsafe_leak_fd: AtomicBool::new(false),
1337        };
1338
1339        self.handles.lock().insert(handle, Arc::new(data));
1340
1341        let opts = self.get_cache_open_options(open_flags as u32);
1342        Ok((Some(handle), opts))
1343    }
1344
1345    fn do_release(&self, inode: Inode, handle: Handle) -> io::Result<()> {
1346        let mut handles = self.handles.lock();
1347
1348        if let btree_map::Entry::Occupied(e) = handles.entry(handle) {
1349            if e.get().inode == inode {
1350                // We don't need to close the file here because that will happen automatically when
1351                // the last `Arc` is dropped.
1352                e.remove();
1353                return Ok(());
1354            }
1355        }
1356
1357        Err(ebadf())
1358    }
1359
1360    fn do_getattr(&self, inode: &InodeData) -> io::Result<(libc::stat64, Duration)> {
1361        #[allow(unused_mut)]
1362        let mut st = stat(inode)?;
1363
1364        #[cfg(feature = "arc_quota")]
1365        self.set_permission(&mut st, &inode.path);
1366        #[cfg(feature = "fs_runtime_ugid_map")]
1367        self.set_ugid_permission(&mut st, &inode.path);
1368        Ok((st, self.cfg.timeout))
1369    }
1370
1371    fn do_unlink(&self, parent: &InodeData, name: &CStr, flags: libc::c_int) -> io::Result<()> {
1372        if name.to_bytes().contains(&b'/') {
1373            return Err(io::Error::from_raw_os_error(libc::EINVAL));
1374        }
1375        // SAFETY: this doesn't modify any memory and we check the return value.
1376        syscall!(unsafe { libc::unlinkat(parent.as_raw_descriptor(), name.as_ptr(), flags) })?;
1377        Ok(())
1378    }
1379
1380    fn do_fsync<F: AsRawDescriptor>(&self, file: &F, datasync: bool) -> io::Result<()> {
1381        // SAFETY: this doesn't modify any memory and we check the return value.
1382        syscall!(unsafe {
1383            if datasync {
1384                libc::fdatasync(file.as_raw_descriptor())
1385            } else {
1386                libc::fsync(file.as_raw_descriptor())
1387            }
1388        })?;
1389
1390        Ok(())
1391    }
1392
1393    // Changes the CWD to `self.proc`, runs `f`, and then changes the CWD back to the root
1394    // directory. This effectively emulates an *at syscall starting at /proc, which is useful when
1395    // there is no *at syscall available. Panics if any of the fchdir calls fail or if there is no
1396    // root inode.
1397    //
1398    // NOTE: this method acquires an `self`-wide lock. If any locks are acquired in `f`, care must
1399    // be taken to avoid the risk of deadlocks.
1400    fn with_proc_chdir<F, T>(&self, f: F) -> T
1401    where
1402        F: FnOnce() -> T,
1403    {
1404        let root = self.find_inode(ROOT_ID).expect("failed to find root inode");
1405
1406        // Acquire a lock for `fchdir`.
1407        let _proc_lock = self.process_lock.lock();
1408        // SAFETY: this doesn't modify any memory and we check the return value. Since the
1409        // fchdir should never fail we just use debug_asserts.
1410        let proc_cwd = unsafe { libc::fchdir(self.proc.as_raw_descriptor()) };
1411        debug_assert_eq!(
1412            proc_cwd,
1413            0,
1414            "failed to fchdir to /proc: {}",
1415            io::Error::last_os_error()
1416        );
1417
1418        let res = f();
1419
1420        // SAFETY: this doesn't modify any memory and we check the return value. Since the
1421        // fchdir should never fail we just use debug_asserts.
1422        let root_cwd = unsafe { libc::fchdir(root.as_raw_descriptor()) };
1423        debug_assert_eq!(
1424            root_cwd,
1425            0,
1426            "failed to fchdir back to root directory: {}",
1427            io::Error::last_os_error()
1428        );
1429
1430        res
1431    }
1432
1433    fn do_getxattr(&self, inode: &InodeData, name: &CStr, value: &mut [u8]) -> io::Result<usize> {
1434        let file = inode.file.lock();
1435        let o_path_file = (file.open_flags & libc::O_PATH) != 0;
1436        let res = if o_path_file {
1437            // For FDs opened with `O_PATH`, we cannot call `fgetxattr` normally. Instead we
1438            // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
1439            //  and then setting the CWD back to the root directory.
1440            let path = CString::new(format!("self/fd/{}", file.as_raw_descriptor()))
1441                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1442
1443            // SAFETY: this will only modify `value` and we check the return value.
1444            self.with_proc_chdir(|| unsafe {
1445                libc::getxattr(
1446                    path.as_ptr(),
1447                    name.as_ptr(),
1448                    value.as_mut_ptr() as *mut libc::c_void,
1449                    value.len() as libc::size_t,
1450                )
1451            })
1452        } else {
1453            // For regular files and directories, we can just use fgetxattr.
1454            // SAFETY: this will only write to `value` and we check the return value.
1455            unsafe {
1456                libc::fgetxattr(
1457                    file.as_raw_descriptor(),
1458                    name.as_ptr(),
1459                    value.as_mut_ptr() as *mut libc::c_void,
1460                    value.len() as libc::size_t,
1461                )
1462            }
1463        };
1464
1465        if res < 0 {
1466            Err(io::Error::last_os_error())
1467        } else {
1468            Ok(res as usize)
1469        }
1470    }
1471
1472    fn get_encryption_policy_ex<R: io::Read>(
1473        &self,
1474        inode: Inode,
1475        handle: Handle,
1476        mut r: R,
1477    ) -> io::Result<IoctlReply> {
1478        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1479            self.find_inode(inode)?
1480        } else {
1481            self.find_handle(handle, inode)?
1482        };
1483
1484        // SAFETY: this struct only has integer fields and any value is valid.
1485        let mut arg = unsafe { MaybeUninit::<fscrypt_get_policy_ex_arg>::zeroed().assume_init() };
1486        r.read_exact(arg.policy_size.as_mut_bytes())?;
1487
1488        let policy_size = cmp::min(arg.policy_size, size_of::<fscrypt_policy>() as u64);
1489        arg.policy_size = policy_size;
1490
1491        let res =
1492            // SAFETY: the kernel will only write to `arg` and we check the return value.
1493            unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GET_ENCRYPTION_POLICY_EX, &mut arg) };
1494        if res < 0 {
1495            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1496        } else {
1497            let len = size_of::<u64>() + arg.policy_size as usize;
1498            Ok(IoctlReply::Done(Ok(<&[u8]>::from(&arg)[..len].to_vec())))
1499        }
1500    }
1501
1502    fn get_fsxattr(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
1503        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1504            self.find_inode(inode)?
1505        } else {
1506            self.find_handle(handle, inode)?
1507        };
1508
1509        let mut buf = MaybeUninit::<fsxattr>::zeroed();
1510
1511        // SAFETY: the kernel will only write to `buf` and we check the return value.
1512        let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR, buf.as_mut_ptr()) };
1513        if res < 0 {
1514            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1515        } else {
1516            // SAFETY: the kernel guarantees that the policy is now initialized.
1517            let xattr = unsafe { buf.assume_init() };
1518            Ok(IoctlReply::Done(Ok(xattr.as_bytes().to_vec())))
1519        }
1520    }
1521
1522    fn set_fsxattr<R: io::Read>(
1523        &self,
1524        #[cfg_attr(not(feature = "arc_quota"), allow(unused_variables))] ctx: Context,
1525        inode: Inode,
1526        handle: Handle,
1527        mut r: R,
1528    ) -> io::Result<IoctlReply> {
1529        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1530            self.find_inode(inode)?
1531        } else {
1532            self.find_handle(handle, inode)?
1533        };
1534
1535        let mut in_attr = fsxattr::new_zeroed();
1536        r.read_exact(in_attr.as_mut_bytes())?;
1537
1538        #[cfg(feature = "arc_quota")]
1539        let st = stat(&*data)?;
1540
1541        #[cfg(feature = "arc_quota")]
1542        let ctx_uid = self.lookup_host_uid(&ctx, inode);
1543
1544        // Changing quota project ID requires CAP_FOWNER or being file owner.
1545        // Here we use privileged_quota_uids because we cannot perform a CAP_FOWNER check.
1546        #[cfg(feature = "arc_quota")]
1547        if ctx_uid == st.st_uid || self.cfg.privileged_quota_uids.contains(&ctx_uid) {
1548            // Get the current fsxattr.
1549            let mut buf = MaybeUninit::<fsxattr>::zeroed();
1550            // SAFETY: the kernel will only write to `buf` and we check the return value.
1551            let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR, buf.as_mut_ptr()) };
1552            if res < 0 {
1553                return Ok(IoctlReply::Done(Err(io::Error::last_os_error())));
1554            }
1555            // SAFETY: the kernel guarantees that the policy is now initialized.
1556            let current_attr = unsafe { buf.assume_init() };
1557
1558            // Project ID cannot be changed inside a user namespace.
1559            // Use Spaced to avoid this restriction.
1560            if current_attr.fsx_projid != in_attr.fsx_projid {
1561                let connection = self.dbus_connection.as_ref().unwrap().lock();
1562                let proxy = connection.with_proxy(
1563                    "org.chromium.Spaced",
1564                    "/org/chromium/Spaced",
1565                    DEFAULT_DBUS_TIMEOUT,
1566                );
1567                let project_id = in_attr.fsx_projid;
1568                if !is_android_project_id(project_id) {
1569                    return Err(io::Error::from_raw_os_error(libc::EINVAL));
1570                }
1571                let file_clone = base::SafeDescriptor::try_from(&*data)?;
1572                match proxy.set_project_id(file_clone.into(), project_id) {
1573                    Ok(r) => {
1574                        let r = SetProjectIdReply::parse_from_bytes(&r)
1575                            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1576                        if !r.success {
1577                            return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1578                                r.error,
1579                            ))));
1580                        }
1581                    }
1582                    Err(e) => {
1583                        return Err(io::Error::other(e));
1584                    }
1585                };
1586            }
1587        }
1588
1589        //  SAFETY: this doesn't modify any memory and we check the return value.
1590        let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_FSSETXATTR, &in_attr) };
1591        if res < 0 {
1592            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1593        } else {
1594            Ok(IoctlReply::Done(Ok(Vec::new())))
1595        }
1596    }
1597
1598    fn get_flags(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
1599        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1600            self.find_inode(inode)?
1601        } else {
1602            self.find_handle(handle, inode)?
1603        };
1604
1605        // The ioctl encoding is a long but the parameter is actually an int.
1606        let mut flags: c_int = 0;
1607
1608        // SAFETY: the kernel will only write to `flags` and we check the return value.
1609        let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GETFLAGS, &mut flags) };
1610        if res < 0 {
1611            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1612        } else {
1613            Ok(IoctlReply::Done(Ok(flags.to_ne_bytes().to_vec())))
1614        }
1615    }
1616
1617    fn set_flags<R: io::Read>(
1618        &self,
1619        #[cfg_attr(not(feature = "arc_quota"), allow(unused_variables))] ctx: Context,
1620        inode: Inode,
1621        handle: Handle,
1622        mut r: R,
1623    ) -> io::Result<IoctlReply> {
1624        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1625            self.find_inode(inode)?
1626        } else {
1627            self.find_handle(handle, inode)?
1628        };
1629
1630        // The ioctl encoding is a long but the parameter is actually an int.
1631        let mut in_flags: c_int = 0;
1632        r.read_exact(in_flags.as_mut_bytes())?;
1633
1634        #[cfg(feature = "arc_quota")]
1635        let st = stat(&*data)?;
1636
1637        #[cfg(feature = "arc_quota")]
1638        let ctx_uid = self.lookup_host_uid(&ctx, inode);
1639
1640        // Only privleged uid can perform FS_IOC_SETFLAGS through cryptohome.
1641        #[cfg(feature = "arc_quota")]
1642        if ctx_uid == st.st_uid || self.cfg.privileged_quota_uids.contains(&ctx_uid) {
1643            // Get the current flag.
1644            let mut buf = MaybeUninit::<c_int>::zeroed();
1645            // SAFETY: the kernel will only write to `buf` and we check the return value.
1646            let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GETFLAGS, buf.as_mut_ptr()) };
1647            if res < 0 {
1648                return Ok(IoctlReply::Done(Err(io::Error::last_os_error())));
1649            }
1650            // SAFETY: the kernel guarantees that the policy is now initialized.
1651            let current_flags = unsafe { buf.assume_init() };
1652
1653            // Project inheritance flag cannot be changed inside a user namespace.
1654            // Use Spaced to avoid this restriction.
1655            if (in_flags & FS_PROJINHERIT_FL) != (current_flags & FS_PROJINHERIT_FL) {
1656                let connection = self.dbus_connection.as_ref().unwrap().lock();
1657                let proxy = connection.with_proxy(
1658                    "org.chromium.Spaced",
1659                    "/org/chromium/Spaced",
1660                    DEFAULT_DBUS_TIMEOUT,
1661                );
1662                // If the input flags contain FS_PROJINHERIT_FL, then it is a set. Otherwise it is a
1663                // reset.
1664                let enable = (in_flags & FS_PROJINHERIT_FL) == FS_PROJINHERIT_FL;
1665                let file_clone = base::SafeDescriptor::try_from(&*data)?;
1666                match proxy.set_project_inheritance_flag(file_clone.into(), enable) {
1667                    Ok(r) => {
1668                        let r = SetProjectInheritanceFlagReply::parse_from_bytes(&r)
1669                            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1670                        if !r.success {
1671                            return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1672                                r.error,
1673                            ))));
1674                        }
1675                    }
1676                    Err(e) => {
1677                        return Err(io::Error::other(e));
1678                    }
1679                };
1680            }
1681        }
1682
1683        // SAFETY: this doesn't modify any memory and we check the return value.
1684        let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_SETFLAGS, &in_flags) };
1685        if res < 0 {
1686            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1687        } else {
1688            Ok(IoctlReply::Done(Ok(Vec::new())))
1689        }
1690    }
1691
1692    fn enable_verity<R: io::Read>(
1693        &self,
1694        inode: Inode,
1695        handle: Handle,
1696        mut r: R,
1697    ) -> io::Result<IoctlReply> {
1698        let inode_data = self.find_inode(inode)?;
1699
1700        // These match the return codes from `fsverity_ioctl_enable` in the kernel.
1701        match inode_data.filetype {
1702            FileType::Regular => {}
1703            FileType::Directory => return Err(io::Error::from_raw_os_error(libc::EISDIR)),
1704            FileType::Other => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
1705        }
1706
1707        {
1708            // We cannot enable verity while holding a writable fd so get a new one, if necessary.
1709            let mut file = inode_data.file.lock();
1710            let mut flags = file.open_flags;
1711            match flags & libc::O_ACCMODE {
1712                libc::O_WRONLY | libc::O_RDWR => {
1713                    flags &= !libc::O_ACCMODE;
1714                    flags |= libc::O_RDONLY;
1715
1716                    // We need to get a read-only handle for this file.
1717                    let newfile = self.open_fd(file.as_raw_descriptor(), libc::O_RDONLY)?;
1718                    *file = OpenedFile::new(newfile, flags);
1719                }
1720                libc::O_RDONLY => {}
1721                _ => panic!("Unexpected flags: {flags:#x}"),
1722            }
1723        }
1724
1725        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1726            inode_data
1727        } else {
1728            let data = self.find_handle(handle, inode)?;
1729
1730            {
1731                // We can't enable verity while holding a writable fd. We don't know whether the
1732                // file was opened for writing so check it here. We don't expect
1733                // this to be a frequent operation so the extra latency should be
1734                // fine.
1735                let mut file = data.file.lock();
1736                let flags = FileFlags::from_file(&*file).map_err(io::Error::from)?;
1737                match flags {
1738                    FileFlags::ReadWrite | FileFlags::Write => {
1739                        // We need to get a read-only handle for this file.
1740                        *file = OpenedFile::new(
1741                            self.open_fd(file.as_raw_descriptor(), libc::O_RDONLY)?,
1742                            libc::O_RDONLY,
1743                        );
1744                    }
1745                    FileFlags::Read => {}
1746                }
1747            }
1748
1749            data
1750        };
1751
1752        let mut arg = fsverity_enable_arg::new_zeroed();
1753        r.read_exact(arg.as_mut_bytes())?;
1754
1755        let mut salt;
1756        if arg.salt_size > 0 {
1757            if arg.salt_size > self.max_buffer_size() {
1758                return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1759                    libc::ENOMEM,
1760                ))));
1761            }
1762            salt = vec![0; arg.salt_size as usize];
1763            r.read_exact(&mut salt)?;
1764            arg.salt_ptr = salt.as_ptr() as usize as u64;
1765        } else {
1766            arg.salt_ptr = 0;
1767        }
1768
1769        let mut sig;
1770        if arg.sig_size > 0 {
1771            if arg.sig_size > self.max_buffer_size() {
1772                return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1773                    libc::ENOMEM,
1774                ))));
1775            }
1776            sig = vec![0; arg.sig_size as usize];
1777            r.read_exact(&mut sig)?;
1778            arg.sig_ptr = sig.as_ptr() as usize as u64;
1779        } else {
1780            arg.sig_ptr = 0;
1781        }
1782
1783        // SAFETY: this doesn't modify any memory and we check the return value.
1784        let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_ENABLE_VERITY, &arg) };
1785        if res < 0 {
1786            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1787        } else {
1788            Ok(IoctlReply::Done(Ok(Vec::new())))
1789        }
1790    }
1791
1792    fn measure_verity<R: io::Read>(
1793        &self,
1794        inode: Inode,
1795        handle: Handle,
1796        mut r: R,
1797        out_size: u32,
1798    ) -> io::Result<IoctlReply> {
1799        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1800            self.find_inode(inode)?
1801        } else {
1802            self.find_handle(handle, inode)?
1803        };
1804
1805        let mut digest = fsverity_digest::new_zeroed();
1806        r.read_exact(digest.as_mut_bytes())?;
1807
1808        // Taken from fs/verity/fsverity_private.h.
1809        const FS_VERITY_MAX_DIGEST_SIZE: u16 = 64;
1810
1811        // This digest size is what the fsverity command line utility uses.
1812        const DIGEST_SIZE: u16 = FS_VERITY_MAX_DIGEST_SIZE * 2 + 1;
1813        const BUFLEN: usize = size_of::<fsverity_digest>() + DIGEST_SIZE as usize;
1814        const ROUNDED_LEN: usize = BUFLEN.div_ceil(size_of::<fsverity_digest>());
1815
1816        // Make sure we get a properly aligned allocation.
1817        let mut buf = [MaybeUninit::<fsverity_digest>::uninit(); ROUNDED_LEN];
1818
1819        // SAFETY: we are only writing data and not reading uninitialized memory.
1820        unsafe {
1821            // TODO: Replace with `MaybeUninit::slice_as_mut_ptr` once it is stabilized.
1822            addr_of_mut!((*(buf.as_mut_ptr() as *mut fsverity_digest)).digest_size)
1823                .write(DIGEST_SIZE)
1824        };
1825
1826        // SAFETY: this will only modify `buf` and we check the return value.
1827        let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_MEASURE_VERITY, buf.as_mut_ptr()) };
1828        if res < 0 {
1829            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1830        } else {
1831            let digest_size =
1832                // SAFETY: this value was initialized by us already and then overwritten by the kernel.
1833                // TODO: Replace with `MaybeUninit::slice_as_ptr` once it is stabilized.
1834                unsafe { addr_of!((*(buf.as_ptr() as *const fsverity_digest)).digest_size).read() };
1835            let outlen = size_of::<fsverity_digest>() as u32 + u32::from(digest_size);
1836
1837            // The kernel guarantees this but it doesn't hurt to be paranoid.
1838            debug_assert!(outlen <= (ROUNDED_LEN * size_of::<fsverity_digest>()) as u32);
1839            if digest.digest_size < digest_size || out_size < outlen {
1840                return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1841                    libc::EOVERFLOW,
1842                ))));
1843            }
1844
1845            let buf: [MaybeUninit<u8>; ROUNDED_LEN * size_of::<fsverity_digest>()] =
1846                // SAFETY: any bit pattern is valid for `MaybeUninit<u8>` and `fsverity_digest`
1847                // doesn't contain any references.
1848                unsafe { mem::transmute(buf) };
1849
1850            let buf =
1851                // SAFETY: Casting to `*const [u8]` is safe because the kernel guarantees that the
1852                // first `outlen` bytes of `buf` are initialized and `MaybeUninit<u8>` is guaranteed
1853                // to have the same layout as `u8`.
1854                // TODO: Replace with `MaybeUninit::slice_assume_init_ref` once it is stabilized.
1855                unsafe { &*(&buf[..outlen as usize] as *const [MaybeUninit<u8>] as *const [u8]) };
1856            Ok(IoctlReply::Done(Ok(buf.to_vec())))
1857        }
1858    }
1859}
1860
#[cfg(feature = "fs_runtime_ugid_map")]
impl PassthroughFs {
    /// Decides whether `perm_data` is the mapping entry that governs `path`.
    ///
    /// When `is_root_path` is set, only the entry registered for "/" applies. Any other path is
    /// governed by a non-"/" entry whose `need_set_permission` check accepts it.
    fn ugid_perm_applies(perm_data: &PermissionData, path: &str, is_root_path: bool) -> bool {
        if is_root_path {
            perm_data.perm_path == "/"
        } else {
            perm_data.perm_path != "/" && perm_data.need_set_permission(path)
        }
    }

    /// Applies the first mapping entry that governs `path` to `st`.
    ///
    /// Returns `true` if an entry applied and `st` was rewritten, `false` if none matched.
    ///
    /// # Panics
    /// Panics if the `permission_paths` lock is poisoned.
    fn find_and_set_ugid_permission(
        &self,
        st: &mut libc::stat64,
        path: &str,
        is_root_path: bool,
    ) -> bool {
        let perm_paths = self
            .permission_paths
            .read()
            .expect("acquire permission_paths read lock");

        match perm_paths
            .iter()
            .find(|pd| Self::ugid_perm_applies(pd, path, is_root_path))
        {
            Some(perm_data) => {
                self.set_permission_from_data(st, perm_data);
                true
            }
            None => false,
        }
    }

    /// Overwrites the owner in `st` with the guest uid/gid of `perm_data` and replaces the
    /// permission bits with `0o777` masked by the entry's umask. The file type bits are kept.
    fn set_permission_from_data(&self, st: &mut libc::stat64, perm_data: &PermissionData) {
        st.st_uid = perm_data.guest_uid;
        st.st_gid = perm_data.guest_gid;
        st.st_mode = (st.st_mode & libc::S_IFMT) | (0o777 & !perm_data.umask);
    }

    /// Set permission according to path.
    ///
    /// An empty `path` denotes the root. If no entry governs `path` directly, the entry
    /// registered for "/" (if any) is used as a fallback; otherwise `st` is left untouched.
    ///
    /// # Panics
    /// Panics if the `permission_paths` lock is poisoned.
    fn set_ugid_permission(&self, st: &mut libc::stat64, path: &str) {
        let is_root_path = path.is_empty();

        if self.find_and_set_ugid_permission(st, path, is_root_path) {
            return;
        }

        let perm_paths = self
            .permission_paths
            .read()
            .expect("acquire permission_paths read lock");
        if let Some(root_perm) = perm_paths.iter().find(|pd| pd.perm_path == "/") {
            self.set_permission_from_data(st, root_perm);
        }
    }

    /// Set host uid/gid to configured value according to path.
    ///
    /// The path is built as `<parent path>/<name>`. A `name` that is not valid UTF-8 is replaced
    /// by a placeholder before matching.
    fn change_ugid_creds(&self, ctx: &Context, parent_data: &InodeData, name: &CStr) -> (u32, u32) {
        // `format!` only borrows its arguments, so the parent path does not need to be cloned.
        let path = format!(
            "{}/{}",
            parent_data.path,
            name.to_str().unwrap_or("<non UTF-8 str>")
        );

        self.change_ugid_creds_for_path(ctx, &path)
    }

    /// Set host uid/gid to configured value according to path.
    ///
    /// Resolution order: the entry that governs `path`, then the entry registered for "/", and
    /// finally the uid/gid carried by `ctx`. An empty `path` denotes the root.
    ///
    /// # Panics
    /// Panics if the `permission_paths` lock is poisoned.
    fn change_ugid_creds_for_path(&self, ctx: &Context, path: &str) -> (u32, u32) {
        let is_root_path = path.is_empty();

        if let Some(creds) = self.find_ugid_creds_for_path(path, is_root_path) {
            return creds;
        }

        let perm_paths = self
            .permission_paths
            .read()
            .expect("acquire permission_paths read lock");
        perm_paths
            .iter()
            .find(|pd| pd.perm_path == "/")
            .map_or((ctx.uid, ctx.gid), |pd| (pd.host_uid, pd.host_gid))
    }

    /// Returns the host uid/gid of the first mapping entry that governs `path`, if any.
    ///
    /// # Panics
    /// Panics if the `permission_paths` lock is poisoned.
    fn find_ugid_creds_for_path(&self, path: &str, is_root_path: bool) -> Option<(u32, u32)> {
        let perm_paths = self
            .permission_paths
            .read()
            .expect("acquire permission_paths read lock");
        perm_paths
            .iter()
            .find(|pd| Self::ugid_perm_applies(pd, path, is_root_path))
            .map(|pd| (pd.host_uid, pd.host_gid))
    }
}
1962
#[cfg(feature = "arc_quota")]
impl PassthroughFs {
    /// Converts the NUL-terminated bytes at the start of `buf` into a `String`.
    ///
    /// Bytes after the first NUL are ignored and invalid UTF-8 is replaced lossily.
    ///
    /// # Errors
    /// Logs and returns `EINVAL` when `buf` contains no NUL byte.
    fn string_from_u8_slice(&self, buf: &[u8]) -> io::Result<String> {
        CStr::from_bytes_until_nul(buf)
            .map(|s| s.to_string_lossy().into_owned())
            .map_err(|e| {
                error!("fail to convert u8 slice to string: {}", e);
                io::Error::from_raw_os_error(libc::EINVAL)
            })
    }

    /// Set permission according to path.
    ///
    /// Every registered entry whose `need_set_permission` check accepts `path` is applied in
    /// registration order, so the last matching entry determines the final uid, gid and mode.
    /// The file type bits of `st_mode` are kept.
    ///
    /// # Panics
    /// Panics if the `permission_paths` lock is poisoned.
    fn set_permission(&self, st: &mut libc::stat64, path: &str) {
        let perm_paths = self
            .permission_paths
            .read()
            .expect("acquire permission_paths read lock");

        for perm_data in perm_paths.iter().filter(|pd| pd.need_set_permission(path)) {
            st.st_uid = perm_data.guest_uid;
            st.st_gid = perm_data.guest_gid;
            st.st_mode = (st.st_mode & libc::S_IFMT) | (0o777 & !perm_data.umask);
        }
    }

    /// Set host uid/gid to configured value according to path.
    ///
    /// The path is built as `<parent path>/<name>`. A `name` that is not valid UTF-8 is replaced
    /// by a placeholder before matching.
    fn change_creds(&self, ctx: &Context, parent_data: &InodeData, name: &CStr) -> (u32, u32) {
        // `format!` only borrows its arguments, so the parent path does not need to be cloned.
        let path = format!(
            "{}/{}",
            parent_data.path,
            name.to_str().unwrap_or("<non UTF-8 str>")
        );

        self.change_creds_for_path(ctx, &path)
    }

    /// Set host uid/gid to configured value according to path.
    ///
    /// Returns the host uid/gid of the first registered entry that matches `path`, or the
    /// uid/gid carried by `ctx` when no entry matches.
    ///
    /// # Panics
    /// Panics if the `permission_paths` lock is poisoned.
    fn change_creds_for_path(&self, ctx: &Context, path: &str) -> (u32, u32) {
        let perm_paths = self
            .permission_paths
            .read()
            .expect("acquire permission_paths read lock");

        perm_paths
            .iter()
            .find(|pd| pd.need_set_permission(path))
            .map_or((ctx.uid, ctx.gid), |pd| (pd.host_uid, pd.host_gid))
    }

    /// Reads one `FsPermissionDataBuffer` from `r` and converts it into a `PermissionData`.
    ///
    /// # Errors
    /// Propagates read failures and returns `EINVAL` when the path is not NUL-terminated or
    /// does not start with '/'.
    fn read_permission_data<R: io::Read>(&self, mut r: R) -> io::Result<PermissionData> {
        let mut fs_permission_data = FsPermissionDataBuffer::new_zeroed();
        r.read_exact(fs_permission_data.as_mut_bytes())?;

        let perm_path = self.string_from_u8_slice(&fs_permission_data.perm_path)?;
        if !perm_path.starts_with('/') {
            error!("FS_IOC_SETPERMISSION: perm path must start with '/'");
            return Err(io::Error::from_raw_os_error(libc::EINVAL));
        }

        Ok(PermissionData {
            guest_uid: fs_permission_data.guest_uid,
            guest_gid: fs_permission_data.guest_gid,
            host_uid: fs_permission_data.host_uid,
            host_gid: fs_permission_data.host_gid,
            umask: fs_permission_data.umask,
            perm_path,
        })
    }

    /// Sets uid/gid/umask for all files and directories under a specific path.
    ///
    /// This ioctl does not correspond to any upstream FUSE feature. It is used for arcvm.
    /// It associates the specified path with the provided uid, gid, and umask values within the
    /// filesystem metadata.
    ///
    /// During subsequent lookup operations, the stored uid/gid/umask values are retrieved and
    /// applied to all files and directories found under the registered path. Before sending
    /// file stat information to the client, the uid and gid are substituted by `guest_uid` and
    /// `guest_gid` if the file falls under the registered path. The file mode is masked by the
    /// umask.
    ///
    /// When the guest creates a file within the specified path, the file gid/uid stat in host
    /// will be overwritten to `host_uid` and `host_gid` values.
    ///
    /// This functionality enables dynamic configuration of ownership and permissions for a
    /// specific directory hierarchy within the filesystem.
    ///
    /// At most `max_dynamic_perm` entries can be registered; further requests are answered with
    /// `EPERM`. Failures to read or validate the request are answered with the corresponding
    /// error. All errors are reported inside `IoctlReply::Done`.
    ///
    /// # Notes
    /// - This method affects all existing and future files under the registered path.
    /// - The original file ownership and permissions are overridden by the provided values.
    /// - The registered path should not be renamed
    /// - Refer go/remove-mount-passthrough-fuse for more design details
    ///
    /// # Panics
    /// Panics if the `permission_paths` lock is poisoned.
    fn set_permission_by_path<R: io::Read>(&self, r: R) -> IoctlReply {
        let limit_reached = || {
            error!(
                "FS_IOC_SETPERMISSION exceeds limits of max_dynamic_perm: {}",
                self.cfg.max_dynamic_perm
            );
            IoctlReply::Done(Err(io::Error::from_raw_os_error(libc::EPERM)))
        };

        // Cheap early rejection: a full table is reported without consuming the request.
        if self
            .permission_paths
            .read()
            .expect("acquire permission_paths read lock")
            .len()
            >= self.cfg.max_dynamic_perm
        {
            return limit_reached();
        }

        let perm_data = match self.read_permission_data(r) {
            Ok(data) => data,
            Err(e) => {
                error!("fail to read permission data: {}", e);
                return IoctlReply::Done(Err(e));
            }
        };

        let mut perm_paths = self
            .permission_paths
            .write()
            .expect("acquire permission_paths write lock");
        // The read lock was released while the request was parsed, so another thread may have
        // filled the table in the meantime. Check again now that we hold the write lock.
        if perm_paths.len() >= self.cfg.max_dynamic_perm {
            return limit_reached();
        }
        perm_paths.push(perm_data);

        IoctlReply::Done(Ok(Vec::new()))
    }

    /// Get xattr value according to path and name.
    ///
    /// Returns a copy of the value of the first registered entry whose `need_set_guest_xattr`
    /// check accepts `path` and `name`, or `None` when no entry matches.
    ///
    /// # Panics
    /// Panics if the `xattr_paths` lock is poisoned.
    fn get_xattr_by_path(&self, path: &str, name: &str) -> Option<String> {
        self.xattr_paths
            .read()
            .expect("acquire xattr_paths read lock")
            .iter()
            .find(|data| data.need_set_guest_xattr(path, name))
            .map(|data| data.xattr_value.clone())
    }

    /// Returns `true` when a predefined value is registered for `name` on `path`.
    fn skip_host_set_xattr(&self, path: &str, name: &str) -> bool {
        self.get_xattr_by_path(path, name).is_some()
    }

    /// Reads one `FsPathXattrDataBuffer` from `r` and converts it into an `XattrData`.
    ///
    /// # Errors
    /// Propagates read failures and returns `EINVAL` when any of the strings is not
    /// NUL-terminated or the path does not start with '/'.
    fn read_xattr_data<R: io::Read>(&self, mut r: R) -> io::Result<XattrData> {
        let mut fs_path_xattr_data = FsPathXattrDataBuffer::new_zeroed();
        r.read_exact(fs_path_xattr_data.as_mut_bytes())?;

        let xattr_path = self.string_from_u8_slice(&fs_path_xattr_data.path)?;
        if !xattr_path.starts_with('/') {
            error!("FS_IOC_SETPATHXATTR: xattr path must start with '/'");
            return Err(io::Error::from_raw_os_error(libc::EINVAL));
        }
        let xattr_name = self.string_from_u8_slice(&fs_path_xattr_data.xattr_name)?;
        let xattr_value = self.string_from_u8_slice(&fs_path_xattr_data.xattr_value)?;

        Ok(XattrData {
            xattr_path,
            xattr_name,
            xattr_value,
        })
    }

    /// Sets xattr value for all files and directories under a specific path.
    ///
    /// This ioctl does not correspond to any upstream FUSE feature. It is used for arcvm.
    /// It associates the specified path and xattr name with a value.
    ///
    /// When the getxattr is called for the specified path and name, the predefined
    /// value is returned.
    ///
    /// At most `max_dynamic_xattr` entries can be registered; further requests are answered
    /// with `EPERM`. Failures to read or validate the request are answered with the
    /// corresponding error. All errors are reported inside `IoctlReply::Done`.
    ///
    /// # Notes
    /// - This method affects all existing and future files under the registered path.
    /// - The SECURITY_CONTEXT feature will be disabled if this ioctl is enabled.
    /// - The registered path should not be renamed
    /// - Refer go/remove-mount-passthrough-fuse for more design details
    ///
    /// # Panics
    /// Panics if the `xattr_paths` lock is poisoned.
    fn set_xattr_by_path<R: io::Read>(&self, r: R) -> IoctlReply {
        let limit_reached = || {
            error!(
                "FS_IOC_SETPATHXATTR exceeds limits of max_dynamic_xattr: {}",
                self.cfg.max_dynamic_xattr
            );
            IoctlReply::Done(Err(io::Error::from_raw_os_error(libc::EPERM)))
        };

        // Cheap early rejection: a full table is reported without consuming the request.
        if self
            .xattr_paths
            .read()
            .expect("acquire xattr_paths read lock")
            .len()
            >= self.cfg.max_dynamic_xattr
        {
            return limit_reached();
        }

        let xattr_data = match self.read_xattr_data(r) {
            Ok(data) => data,
            Err(e) => {
                error!("fail to read xattr data: {}", e);
                return IoctlReply::Done(Err(e));
            }
        };

        let mut xattr_paths = self
            .xattr_paths
            .write()
            .expect("acquire xattr_paths write lock");
        // The read lock was released while the request was parsed, so another thread may have
        // filled the table in the meantime. Check again now that we hold the write lock.
        if xattr_paths.len() >= self.cfg.max_dynamic_xattr {
            return limit_reached();
        }
        xattr_paths.push(xattr_data);

        IoctlReply::Done(Ok(Vec::new()))
    }

    /// Reads the xattr `name` of `data` into `buf`, preferring a predefined value.
    ///
    /// If a value is registered for the inode's path and `name`, it is copied into `buf`;
    /// otherwise the request is passed on to `do_getxattr`. Returns the number of bytes
    /// written.
    ///
    /// # Errors
    /// Returns `ERANGE` when the predefined value does not fit into `buf`, and propagates any
    /// error from `do_getxattr`.
    fn do_getxattr_with_filter(
        &self,
        data: Arc<InodeData>,
        name: Cow<CStr>,
        buf: &mut [u8],
    ) -> io::Result<usize> {
        let written = match self.get_xattr_by_path(&data.path, &name.to_string_lossy()) {
            Some(predefined_xattr) => {
                let value = predefined_xattr.into_bytes();
                if value.len() > buf.len() {
                    return Err(io::Error::from_raw_os_error(libc::ERANGE));
                }
                buf[..value.len()].copy_from_slice(&value);
                value.len()
            }
            None => self.do_getxattr(&data, &name, &mut buf[..])?,
        };
        Ok(written)
    }

    /// Looks up the host uid according to the path of file that inode is referring to.
    ///
    /// Returns the `host_uid` of the first registered entry matching the inode's path. Falls
    /// back to `ctx.uid` when the inode is unknown or no entry matches.
    ///
    /// # Panics
    /// Panics if the `permission_paths` lock is poisoned.
    fn lookup_host_uid(&self, ctx: &Context, inode: Inode) -> u32 {
        let Ok(inode_data) = self.find_inode(inode) else {
            return ctx.uid;
        };

        let perm_paths = self
            .permission_paths
            .read()
            .expect("acquire permission_paths read lock");
        perm_paths
            .iter()
            .find(|pd| pd.need_set_permission(&inode_data.path))
            .map_or(ctx.uid, |pd| pd.host_uid)
    }
}
2207
2208/// Decrements the refcount of the inode.
2209/// Returns `true` if the refcount became 0.
2210fn forget_one(
2211    inodes: &mut MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>,
2212    inode: Inode,
2213    count: u64,
2214) -> bool {
2215    if let Some(data) = inodes.get(&inode) {
2216        // Acquiring the write lock on the inode map prevents new lookups from incrementing the
2217        // refcount but there is the possibility that a previous lookup already acquired a
2218        // reference to the inode data and is in the process of updating the refcount so we need
2219        // to loop here until we can decrement successfully.
2220        loop {
2221            let refcount = data.refcount.load(Ordering::Relaxed);
2222
2223            // Saturating sub because it doesn't make sense for a refcount to go below zero and
2224            // we don't want misbehaving clients to cause integer overflow.
2225            let new_count = refcount.saturating_sub(count);
2226
2227            // Synchronizes with the acquire load in `do_lookup`.
2228            if data
2229                .refcount
2230                .compare_exchange_weak(refcount, new_count, Ordering::Release, Ordering::Relaxed)
2231                .is_ok()
2232            {
2233                if new_count == 0 {
2234                    // We just removed the last refcount for this inode. There's no need for an
2235                    // acquire fence here because we hold a write lock on the inode map and any
2236                    // thread that is waiting to do a forget on the same inode will have to wait
2237                    // until we release the lock. So there's is no other release store for us to
2238                    // synchronize with before deleting the entry.
2239                    inodes.remove(&inode);
2240                    return true;
2241                }
2242                break;
2243            }
2244        }
2245    }
2246    false
2247}
2248
2249// Strips any `user.virtiofs.` prefix from `buf`. If buf contains one or more nul-bytes, each
2250// nul-byte-separated slice is treated as a C string and the prefix is stripped from each one.
2251fn strip_xattr_prefix(buf: &mut Vec<u8>) {
2252    fn next_cstr(b: &[u8], start: usize) -> Option<&[u8]> {
2253        if start >= b.len() {
2254            return None;
2255        }
2256
2257        let end = b[start..]
2258            .iter()
2259            .position(|&c| c == b'\0')
2260            .map(|p| start + p + 1)
2261            .unwrap_or(b.len());
2262
2263        Some(&b[start..end])
2264    }
2265
2266    let mut pos = 0;
2267    while let Some(name) = next_cstr(buf, pos) {
2268        if !name.starts_with(USER_VIRTIOFS_XATTR) {
2269            pos += name.len();
2270            continue;
2271        }
2272
2273        let newlen = name.len() - USER_VIRTIOFS_XATTR.len();
2274        buf.drain(pos..pos + USER_VIRTIOFS_XATTR.len());
2275        pos += newlen;
2276    }
2277}
2278
2279impl Drop for PassthroughFs {
2280    /// The `Drop` implementation for this struct intentionally leaks all open file descriptors.
2281    /// It sets the `unsafe_leak_fd` flag on all `InodeData` and `HandleData` instances, which
2282    /// causes their `drop` implementations to forget the underlying `File` objects.
2283    ///
2284    /// This is a deliberate performance optimization for abrupt shutdowns. It relies on the
2285    /// operating system to clean up the file descriptors when the process terminates. It is
2286    /// **critical** that an instance of `PassthroughFs` is only dropped immediately prior to
2287    /// process termination.
2288    fn drop(&mut self) {
2289        let inodes = self.inodes.lock();
2290        inodes.apply(|v| {
2291            v.set_unsafe_leak_fd();
2292        });
2293        let handles = self.handles.lock();
2294        handles.values().for_each(|v| v.set_unsafe_leak_fd());
2295    }
2296}
2297
2298impl FileSystem for PassthroughFs {
2299    type Inode = Inode;
2300    type Handle = Handle;
2301    type DirIter = ReadDir<Box<[u8]>>;
2302
2303    fn init(&self, capable: FsOptions) -> io::Result<FsOptions> {
2304        let root = CString::new(self.root_dir.clone())
2305            .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
2306
2307        let flags = libc::O_DIRECTORY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
2308        // SAFETY: this doesn't modify any memory and we check the return value.
2309        let raw_descriptor = unsafe { libc::openat64(libc::AT_FDCWD, root.as_ptr(), flags) };
2310        if raw_descriptor < 0 {
2311            return Err(io::Error::last_os_error());
2312        }
2313
2314        // SAFETY: safe because we just opened this descriptor above.
2315        let f = unsafe { File::from_raw_descriptor(raw_descriptor) };
2316
2317        let st = stat(&f)?;
2318
2319        // SAFETY: this doesn't modify any memory and there is no need to check the return
2320        // value because this system call always succeeds. We need to clear the umask here because
2321        // we want the client to be able to set all the bits in the mode.
2322        unsafe { libc::umask(0o000) };
2323
2324        let mut inodes = self.inodes.lock();
2325
2326        // Not sure why the root inode gets a refcount of 2 but that's what libfuse does.
2327        inodes.insert(
2328            ROOT_ID,
2329            InodeAltKey {
2330                ino: st.st_ino,
2331                dev: st.st_dev,
2332            },
2333            Arc::new(InodeData {
2334                inode: ROOT_ID,
2335                file: Mutex::new(OpenedFile::new(f, flags)),
2336                refcount: AtomicU64::new(2),
2337                filetype: st.st_mode.into(),
2338                path: "".to_string(),
2339                unsafe_leak_fd: AtomicBool::new(false),
2340            }),
2341        );
2342
2343        let mut opts = FsOptions::DO_READDIRPLUS
2344            | FsOptions::READDIRPLUS_AUTO
2345            | FsOptions::EXPORT_SUPPORT
2346            | FsOptions::DONT_MASK
2347            | FsOptions::CACHE_SYMLINKS;
2348
2349        // Device using dynamic xattr feature will have different security context in
2350        // host and guests. The SECURITY_CONTEXT feature should not be enabled in the
2351        // device.
2352        if self.cfg.max_dynamic_xattr == 0 && self.cfg.security_ctx {
2353            opts |= FsOptions::SECURITY_CONTEXT;
2354        }
2355
2356        if self.cfg.posix_acl {
2357            opts |= FsOptions::POSIX_ACL;
2358        }
2359        if self.cfg.writeback && capable.contains(FsOptions::WRITEBACK_CACHE) {
2360            opts |= FsOptions::WRITEBACK_CACHE;
2361            self.writeback.store(true, Ordering::Relaxed);
2362        }
2363        if self.cfg.cache_policy == CachePolicy::Always {
2364            if capable.contains(FsOptions::ZERO_MESSAGE_OPEN) {
2365                opts |= FsOptions::ZERO_MESSAGE_OPEN;
2366                self.zero_message_open.store(true, Ordering::Relaxed);
2367            }
2368            if capable.contains(FsOptions::ZERO_MESSAGE_OPENDIR) {
2369                opts |= FsOptions::ZERO_MESSAGE_OPENDIR;
2370                self.zero_message_opendir.store(true, Ordering::Relaxed);
2371            }
2372        }
2373        Ok(opts)
2374    }
2375
2376    fn destroy(&self) {
2377        cros_tracing::trace_simple_print!(VirtioFs, "{:?}: destroy", self);
2378        self.handles.lock().clear();
2379        self.inodes.lock().clear();
2380    }
2381
2382    fn statfs(&self, _ctx: Context, inode: Inode) -> io::Result<libc::statvfs64> {
2383        let _trace = fs_trace!(self.tag, "statfs", inode);
2384        let data = self.find_inode(inode)?;
2385
2386        let mut out = MaybeUninit::<libc::statvfs64>::zeroed();
2387
2388        // SAFETY: this will only modify `out` and we check the return value.
2389        syscall!(unsafe { libc::fstatvfs64(data.as_raw_descriptor(), out.as_mut_ptr()) })?;
2390
2391        // SAFETY: the kernel guarantees that `out` has been initialized.
2392        Ok(unsafe { out.assume_init() })
2393    }
2394
2395    fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<Entry> {
2396        validate_path_component(name)?;
2397        let data = self.find_inode(parent)?;
2398        #[allow(unused_variables)]
2399        let path = format!(
2400            "{}/{}",
2401            data.path,
2402            name.to_str().unwrap_or("<non UTF-8 path>")
2403        );
2404        let _trace = fs_trace!(self.tag, "lookup", parent, path);
2405
2406        let mut res = self.do_lookup_with_casefold_fallback(&data, name);
2407
2408        // FUSE takes a inode=0 as a request to do negative dentry cache.
2409        // So, if `negative_timeout` is set, return success with the timeout value and inode=0 as a
2410        // response.
2411        if let Err(e) = &res {
2412            if e.kind() == std::io::ErrorKind::NotFound && !self.cfg.negative_timeout.is_zero() {
2413                res = Ok(Entry::new_negative(self.cfg.negative_timeout));
2414            }
2415        }
2416
2417        res
2418    }
2419
2420    fn forget(&self, _ctx: Context, inode: Inode, count: u64) {
2421        let _trace = fs_trace!(self.tag, "forget", inode, count);
2422        let mut inodes = self.inodes.lock();
2423        let caches = self.lock_casefold_lookup_caches();
2424        if forget_one(&mut inodes, inode, count) {
2425            if let Some(mut c) = caches {
2426                c.forget(inode);
2427            }
2428        }
2429    }
2430
2431    fn batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>) {
2432        let mut inodes = self.inodes.lock();
2433        let mut caches = self.lock_casefold_lookup_caches();
2434        for (inode, count) in requests {
2435            if forget_one(&mut inodes, inode, count) {
2436                if let Some(c) = caches.as_mut() {
2437                    c.forget(inode);
2438                }
2439            }
2440        }
2441    }
2442
2443    fn opendir(
2444        &self,
2445        _ctx: Context,
2446        inode: Inode,
2447        flags: u32,
2448    ) -> io::Result<(Option<Handle>, OpenOptions)> {
2449        let _trace = fs_trace!(self.tag, "opendir", inode, flags);
2450        if self.zero_message_opendir.load(Ordering::Relaxed) {
2451            Err(io::Error::from_raw_os_error(libc::ENOSYS))
2452        } else {
2453            self.do_open(inode, flags | (libc::O_DIRECTORY as u32))
2454        }
2455    }
2456
2457    fn releasedir(
2458        &self,
2459        _ctx: Context,
2460        inode: Inode,
2461        _flags: u32,
2462        handle: Handle,
2463    ) -> io::Result<()> {
2464        let _trace = fs_trace!(self.tag, "releasedir", inode, handle);
2465        if self.zero_message_opendir.load(Ordering::Relaxed) {
2466            Ok(())
2467        } else {
2468            self.do_release(inode, handle)
2469        }
2470    }
2471
2472    fn mkdir(
2473        &self,
2474        ctx: Context,
2475        parent: Inode,
2476        name: &CStr,
2477        mode: u32,
2478        umask: u32,
2479        security_ctx: Option<&CStr>,
2480    ) -> io::Result<Entry> {
2481        validate_path_component(name)?;
2482        let _trace = fs_trace!(self.tag, "mkdir", parent, name, mode, umask, security_ctx);
2483        let data = self.find_inode(parent)?;
2484
2485        let _ctx = security_ctx
2486            .filter(|ctx| *ctx != UNLABELED_CSTR)
2487            .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2488            .transpose()?;
2489
2490        #[allow(unused_variables)]
2491        #[cfg(feature = "arc_quota")]
2492        let (uid, gid) = self.change_creds(&ctx, &data, name);
2493        #[cfg(feature = "fs_runtime_ugid_map")]
2494        let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
2495        #[cfg(not(feature = "fs_permission_translation"))]
2496        let (uid, gid) = (ctx.uid, ctx.gid);
2497
2498        let (_uid, _gid) = set_creds(uid, gid)?;
2499        {
2500            let casefold_cache = self.lock_casefold_lookup_caches();
2501            let _scoped_umask = ScopedUmask::new(umask);
2502
2503            if name.to_bytes().contains(&b'/') {
2504                return Err(io::Error::from_raw_os_error(libc::EINVAL));
2505            }
2506            // SAFETY: this doesn't modify any memory and we check the return value.
2507            syscall!(unsafe { libc::mkdirat(data.as_raw_descriptor(), name.as_ptr(), mode) })?;
2508            if let Some(mut c) = casefold_cache {
2509                c.insert(data.inode, name);
2510            }
2511        }
2512        self.do_lookup(&data, name)
2513    }
2514
2515    fn rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
2516        validate_path_component(name)?;
2517        let _trace = fs_trace!(self.tag, "rmdir", parent, name);
2518        let data = self.find_inode(parent)?;
2519        let casefold_cache = self.lock_casefold_lookup_caches();
2520        // TODO(b/278691962): If ascii_casefold is enabled, we need to call
2521        // `get_case_unfolded_name()` to get the actual name to be unlinked.
2522        self.do_unlink(&data, name, libc::AT_REMOVEDIR)?;
2523        if let Some(mut c) = casefold_cache {
2524            c.remove(data.inode, name);
2525        }
2526        Ok(())
2527    }
2528
2529    fn readdir(
2530        &self,
2531        _ctx: Context,
2532        inode: Inode,
2533        handle: Handle,
2534        size: u32,
2535        offset: u64,
2536    ) -> io::Result<Self::DirIter> {
2537        let _trace = fs_trace!(self.tag, "readdir", inode, handle, size, offset);
2538        let buf = vec![0; size as usize].into_boxed_slice();
2539
2540        if self.zero_message_opendir.load(Ordering::Relaxed) {
2541            let data = self.find_inode(inode)?;
2542            ReadDir::new(&*data, offset as libc::off64_t, buf)
2543        } else {
2544            let data = self.find_handle(handle, inode)?;
2545
2546            let dir = data.file.lock();
2547
2548            ReadDir::new(&*dir, offset as libc::off64_t, buf)
2549        }
2550    }
2551
2552    fn open(
2553        &self,
2554        _ctx: Context,
2555        inode: Inode,
2556        flags: u32,
2557    ) -> io::Result<(Option<Handle>, OpenOptions)> {
2558        if self.zero_message_open.load(Ordering::Relaxed) {
2559            let _trace = fs_trace!(self.tag, "open (zero-message)", inode, flags);
2560            Err(io::Error::from_raw_os_error(libc::ENOSYS))
2561        } else {
2562            let _trace = fs_trace!(self.tag, "open", inode, flags);
2563            self.do_open(inode, flags)
2564        }
2565    }
2566
2567    fn release(
2568        &self,
2569        _ctx: Context,
2570        inode: Inode,
2571        _flags: u32,
2572        handle: Handle,
2573        _flush: bool,
2574        _flock_release: bool,
2575        _lock_owner: Option<u64>,
2576    ) -> io::Result<()> {
2577        if self.zero_message_open.load(Ordering::Relaxed) {
2578            let _trace = fs_trace!(self.tag, "release (zero-message)", inode, handle);
2579            Ok(())
2580        } else {
2581            let _trace = fs_trace!(self.tag, "release", inode, handle);
2582            self.do_release(inode, handle)
2583        }
2584    }
2585
2586    fn chromeos_tmpfile(
2587        &self,
2588        ctx: Context,
2589        parent: Self::Inode,
2590        mode: u32,
2591        umask: u32,
2592        security_ctx: Option<&CStr>,
2593    ) -> io::Result<Entry> {
2594        let _trace = fs_trace!(
2595            self.tag,
2596            "chromeos_tempfile",
2597            parent,
2598            mode,
2599            umask,
2600            security_ctx
2601        );
2602        let data = self.find_inode(parent)?;
2603
2604        let _ctx = security_ctx
2605            .filter(|ctx| *ctx != UNLABELED_CSTR)
2606            .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2607            .transpose()?;
2608
2609        let tmpflags = libc::O_RDWR | libc::O_TMPFILE | libc::O_CLOEXEC | libc::O_NOFOLLOW;
2610
2611        let current_dir = c".";
2612
2613        #[allow(unused_variables)]
2614        #[cfg(feature = "arc_quota")]
2615        let (uid, gid) = self.change_creds(&ctx, &data, current_dir);
2616        #[cfg(feature = "fs_runtime_ugid_map")]
2617        let (uid, gid) = self.change_ugid_creds(&ctx, &data, current_dir);
2618        #[cfg(not(feature = "fs_permission_translation"))]
2619        let (uid, gid) = (ctx.uid, ctx.gid);
2620
2621        let (_uid, _gid) = set_creds(uid, gid)?;
2622
2623        let fd = {
2624            let _scoped_umask = ScopedUmask::new(umask);
2625
2626            // SAFETY: this doesn't modify any memory and we check the return value.
2627            syscall!(unsafe {
2628                libc::openat64(
2629                    data.as_raw_descriptor(),
2630                    current_dir.as_ptr(),
2631                    tmpflags,
2632                    mode,
2633                )
2634            })?
2635        };
2636        // No need to add casefold_cache becuase we created an anonymous file.
2637
2638        // SAFETY: safe because we just opened this fd.
2639        let tmpfile = unsafe { File::from_raw_descriptor(fd) };
2640        let st = stat(&tmpfile)?;
2641        let path = format!(
2642            "{}/{}",
2643            data.path.clone(),
2644            current_dir.to_str().unwrap_or("<non UTF-8 str>")
2645        );
2646        Ok(self.add_entry(tmpfile, st, tmpflags, path))
2647    }
2648
2649    fn create(
2650        &self,
2651        ctx: Context,
2652        parent: Inode,
2653        name: &CStr,
2654        mode: u32,
2655        flags: u32,
2656        umask: u32,
2657        security_ctx: Option<&CStr>,
2658    ) -> io::Result<(Entry, Option<Handle>, OpenOptions)> {
2659        validate_path_component(name)?;
2660        let _trace = fs_trace!(
2661            self.tag,
2662            "create",
2663            parent,
2664            name,
2665            mode,
2666            flags,
2667            umask,
2668            security_ctx
2669        );
2670        let data = self.find_inode(parent)?;
2671
2672        let _ctx = security_ctx
2673            .filter(|ctx| *ctx != UNLABELED_CSTR)
2674            .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2675            .transpose()?;
2676
2677        #[allow(unused_variables)]
2678        #[cfg(feature = "arc_quota")]
2679        let (uid, gid) = self.change_creds(&ctx, &data, name);
2680        #[cfg(feature = "fs_runtime_ugid_map")]
2681        let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
2682        #[cfg(not(feature = "fs_permission_translation"))]
2683        let (uid, gid) = (ctx.uid, ctx.gid);
2684
2685        let (_uid, _gid) = set_creds(uid, gid)?;
2686
2687        let flags = self.update_open_flags(flags as i32);
2688        // Mask out O_DIRECT. Also mask out O_PATH because we need to return a readable/writable
2689        // file descriptor for create.
2690        let create_flags = (flags | libc::O_CREAT | libc::O_CLOEXEC | libc::O_NOFOLLOW)
2691            & !(libc::O_DIRECT | libc::O_PATH);
2692        if name.to_bytes().contains(&b'/') {
2693            return Err(io::Error::from_raw_os_error(libc::EINVAL));
2694        }
2695
2696        let file = {
2697            let _scoped_umask = ScopedUmask::new(umask);
2698            let casefold_cache = self.lock_casefold_lookup_caches();
2699
2700            let file = safe_openat2(
2701                &data,
2702                name,
2703                create_flags,
2704                Some(mode),
2705                RESOLVE_IN_ROOT | RESOLVE_NO_MAGICLINKS,
2706            )?;
2707            if let Some(mut c) = casefold_cache {
2708                c.insert(parent, name);
2709            }
2710            file
2711        };
2712
2713        let st = stat(&file)?;
2714        let path = format!(
2715            "{}/{}",
2716            data.path.clone(),
2717            name.to_str().unwrap_or("<non UTF-8 str>")
2718        );
2719        let entry = self.add_entry(file, st, create_flags, path);
2720
2721        let (handle, opts) = if self.zero_message_open.load(Ordering::Relaxed) {
2722            (None, OpenOptions::KEEP_CACHE)
2723        } else {
2724            self.do_open_at(
2725                data,
2726                name,
2727                entry.inode,
2728                flags as u32 & !((libc::O_CREAT | libc::O_EXCL | libc::O_NOCTTY) as u32),
2729            )
2730            .inspect_err(|_e| {
2731                // Don't leak the entry.
2732                self.forget(ctx, entry.inode, 1);
2733            })?
2734        };
2735        Ok((entry, handle, opts))
2736    }
2737
2738    fn unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
2739        validate_path_component(name)?;
2740        let _trace = fs_trace!(self.tag, "unlink", parent, name);
2741        let data = self.find_inode(parent)?;
2742        let casefold_cache = self.lock_casefold_lookup_caches();
2743        // TODO(b/278691962): If ascii_casefold is enabled, we need to call
2744        // `get_case_unfolded_name()` to get the actual name to be unlinked.
2745        self.do_unlink(&data, name, 0)?;
2746        if let Some(mut c) = casefold_cache {
2747            c.remove(data.inode, name);
2748        }
2749        Ok(())
2750    }
2751
2752    fn read<W: io::Write + ZeroCopyWriter>(
2753        &self,
2754        _ctx: Context,
2755        inode: Inode,
2756        handle: Handle,
2757        mut w: W,
2758        size: u32,
2759        offset: u64,
2760        _lock_owner: Option<u64>,
2761        _flags: u32,
2762    ) -> io::Result<usize> {
2763        if self.zero_message_open.load(Ordering::Relaxed) {
2764            let _trace = fs_trace!(self.tag, "read (zero-message)", inode, handle, size, offset);
2765            let data = self.find_inode(inode)?;
2766
2767            let mut file = data.file.lock();
2768            let mut flags = file.open_flags;
2769            match flags & libc::O_ACCMODE {
2770                libc::O_WRONLY => {
2771                    flags &= !libc::O_WRONLY;
2772                    flags |= libc::O_RDWR;
2773
2774                    // We need to get a readable handle for this file.
2775                    let newfile = self.open_fd(file.as_raw_descriptor(), libc::O_RDWR)?;
2776                    *file = OpenedFile::new(newfile, flags);
2777                }
2778                libc::O_RDONLY | libc::O_RDWR => {}
2779                _ => panic!("Unexpected flags: {flags:#x}"),
2780            }
2781
2782            w.write_from(file.file_mut(), size as usize, offset)
2783        } else {
2784            let _trace = fs_trace!(self.tag, "read", inode, handle, size, offset);
2785            let data = self.find_handle(handle, inode)?;
2786
2787            let mut f = data.file.lock();
2788            w.write_from(f.file_mut(), size as usize, offset)
2789        }
2790    }
2791
2792    fn write<R: io::Read + ZeroCopyReader>(
2793        &self,
2794        _ctx: Context,
2795        inode: Inode,
2796        handle: Handle,
2797        mut r: R,
2798        size: u32,
2799        offset: u64,
2800        _lock_owner: Option<u64>,
2801        _delayed_write: bool,
2802        flags: u32,
2803    ) -> io::Result<usize> {
2804        // When the WRITE_KILL_PRIV flag is set, drop CAP_FSETID so that the kernel will
2805        // automatically clear the setuid and setgid bits for us.
2806        let _fsetid = if flags & WRITE_KILL_PRIV != 0 {
2807            Some(drop_cap_fsetid()?)
2808        } else {
2809            None
2810        };
2811
2812        if self.zero_message_open.load(Ordering::Relaxed) {
2813            let _trace = fs_trace!(
2814                self.tag,
2815                "write (zero-message)",
2816                inode,
2817                handle,
2818                size,
2819                offset
2820            );
2821
2822            let data = self.find_inode(inode)?;
2823
2824            let mut file = data.file.lock();
2825            let mut flags = file.open_flags;
2826            match flags & libc::O_ACCMODE {
2827                libc::O_RDONLY => {
2828                    flags &= !libc::O_RDONLY;
2829                    flags |= libc::O_RDWR;
2830
2831                    // We need to get a writable handle for this file.
2832                    let newfile = self.open_fd(file.as_raw_descriptor(), libc::O_RDWR)?;
2833                    *file = OpenedFile::new(newfile, flags);
2834                }
2835                libc::O_WRONLY | libc::O_RDWR => {}
2836                _ => panic!("Unexpected flags: {flags:#x}"),
2837            }
2838
2839            r.read_to(file.file_mut(), size as usize, offset)
2840        } else {
2841            let _trace = fs_trace!(self.tag, "write", inode, handle, size, offset);
2842
2843            let data = self.find_handle(handle, inode)?;
2844
2845            let mut f = data.file.lock();
2846            r.read_to(f.file_mut(), size as usize, offset)
2847        }
2848    }
2849
2850    fn getattr(
2851        &self,
2852        _ctx: Context,
2853        inode: Inode,
2854        _handle: Option<Handle>,
2855    ) -> io::Result<(libc::stat64, Duration)> {
2856        let _trace = fs_trace!(self.tag, "getattr", inode, _handle);
2857
2858        let data = self.find_inode(inode)?;
2859        self.do_getattr(&data)
2860    }
2861
2862    fn setattr(
2863        &self,
2864        _ctx: Context,
2865        inode: Inode,
2866        attr: libc::stat64,
2867        handle: Option<Handle>,
2868        valid: SetattrValid,
2869    ) -> io::Result<(libc::stat64, Duration)> {
2870        let _trace = fs_trace!(self.tag, "setattr", inode, handle);
2871        let inode_data = self.find_inode(inode)?;
2872
2873        enum Data<'a> {
2874            Handle(MutexGuard<'a, OpenedFile>),
2875            ProcPath(CString),
2876        }
2877
2878        // If we have a handle then use it otherwise get a new fd from the inode.
2879        let hd;
2880        let data = if let Some(handle) = handle.filter(|&h| h != 0) {
2881            hd = self.find_handle(handle, inode)?;
2882            Data::Handle(hd.file.lock())
2883        } else {
2884            let pathname = CString::new(format!("self/fd/{}", inode_data.as_raw_descriptor()))
2885                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2886            Data::ProcPath(pathname)
2887        };
2888
2889        if valid.contains(SetattrValid::MODE) {
2890            // SAFETY: this doesn't modify any memory and we check the return value.
2891            syscall!(unsafe {
2892                match data {
2893                    Data::Handle(ref fd) => libc::fchmod(fd.as_raw_descriptor(), attr.st_mode),
2894                    Data::ProcPath(ref p) => {
2895                        libc::fchmodat(self.proc.as_raw_descriptor(), p.as_ptr(), attr.st_mode, 0)
2896                    }
2897                }
2898            })?;
2899        }
2900
2901        if valid.intersects(SetattrValid::UID | SetattrValid::GID) {
2902            let uid = if valid.contains(SetattrValid::UID) {
2903                attr.st_uid
2904            } else {
2905                // Cannot use -1 here because these are unsigned values.
2906                u32::MAX
2907            };
2908            let gid = if valid.contains(SetattrValid::GID) {
2909                attr.st_gid
2910            } else {
2911                // Cannot use -1 here because these are unsigned values.
2912                u32::MAX
2913            };
2914
2915            // SAFETY: this doesn't modify any memory and we check the return value.
2916            syscall!(unsafe {
2917                libc::fchownat(
2918                    inode_data.as_raw_descriptor(),
2919                    EMPTY_CSTR.as_ptr(),
2920                    uid,
2921                    gid,
2922                    libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
2923                )
2924            })?;
2925        }
2926
2927        if valid.contains(SetattrValid::SIZE) {
2928            syscall!(match data {
2929                Data::Handle(ref fd) => {
2930                    // SAFETY: this doesn't modify any memory and we check the return value.
2931                    unsafe { libc::ftruncate64(fd.as_raw_descriptor(), attr.st_size) }
2932                }
2933                _ => {
2934                    // There is no `ftruncateat` so we need to get a new fd and truncate it.
2935                    let f = self.open_inode(&inode_data, libc::O_NONBLOCK | libc::O_RDWR)?;
2936                    // SAFETY: this doesn't modify any memory and we check the return value.
2937                    unsafe { libc::ftruncate64(f.as_raw_descriptor(), attr.st_size) }
2938                }
2939            })?;
2940        }
2941
2942        if valid.intersects(SetattrValid::ATIME | SetattrValid::MTIME) {
2943            let mut tvs = [
2944                libc::timespec {
2945                    tv_sec: 0,
2946                    tv_nsec: libc::UTIME_OMIT,
2947                },
2948                libc::timespec {
2949                    tv_sec: 0,
2950                    tv_nsec: libc::UTIME_OMIT,
2951                },
2952            ];
2953
2954            if valid.contains(SetattrValid::ATIME_NOW) {
2955                tvs[0].tv_nsec = libc::UTIME_NOW;
2956            } else if valid.contains(SetattrValid::ATIME) {
2957                tvs[0].tv_sec = attr.st_atime;
2958                tvs[0].tv_nsec = attr.st_atime_nsec;
2959            }
2960
2961            if valid.contains(SetattrValid::MTIME_NOW) {
2962                tvs[1].tv_nsec = libc::UTIME_NOW;
2963            } else if valid.contains(SetattrValid::MTIME) {
2964                tvs[1].tv_sec = attr.st_mtime;
2965                tvs[1].tv_nsec = attr.st_mtime_nsec;
2966            }
2967
2968            // SAFETY: this doesn't modify any memory and we check the return value.
2969            syscall!(unsafe {
2970                match data {
2971                    Data::Handle(ref fd) => libc::futimens(fd.as_raw_descriptor(), tvs.as_ptr()),
2972                    Data::ProcPath(ref p) => {
2973                        libc::utimensat(self.proc.as_raw_descriptor(), p.as_ptr(), tvs.as_ptr(), 0)
2974                    }
2975                }
2976            })?;
2977        }
2978
2979        self.do_getattr(&inode_data)
2980    }
2981
2982    fn rename(
2983        &self,
2984        _ctx: Context,
2985        olddir: Inode,
2986        oldname: &CStr,
2987        newdir: Inode,
2988        newname: &CStr,
2989        flags: u32,
2990    ) -> io::Result<()> {
2991        validate_path_component(oldname)?;
2992        validate_path_component(newname)?;
2993        let _trace = fs_trace!(self.tag, "rename", olddir, oldname, newdir, newname, flags);
2994
2995        let old_inode = self.find_inode(olddir)?;
2996        let new_inode = self.find_inode(newdir)?;
2997        {
2998            let casefold_cache = self.lock_casefold_lookup_caches();
2999
3000            // SAFETY: this doesn't modify any memory and we check the return value.
3001            // TODO: Switch to libc::renameat2 once https://github.com/rust-lang/libc/pull/1508 lands
3002            // and we have glibc 2.28.
3003            syscall!(unsafe {
3004                libc::syscall(
3005                    libc::SYS_renameat2,
3006                    old_inode.as_raw_descriptor(),
3007                    oldname.as_ptr(),
3008                    new_inode.as_raw_descriptor(),
3009                    newname.as_ptr(),
3010                    flags,
3011                )
3012            })?;
3013            if let Some(mut c) = casefold_cache {
3014                c.remove(olddir, oldname);
3015                c.insert(newdir, newname);
3016            }
3017        }
3018
3019        Ok(())
3020    }
3021
3022    fn mknod(
3023        &self,
3024        ctx: Context,
3025        parent: Inode,
3026        name: &CStr,
3027        mode: u32,
3028        rdev: u32,
3029        umask: u32,
3030        security_ctx: Option<&CStr>,
3031    ) -> io::Result<Entry> {
3032        validate_path_component(name)?;
3033        let _trace = fs_trace!(
3034            self.tag,
3035            "mknod",
3036            parent,
3037            name,
3038            mode,
3039            rdev,
3040            umask,
3041            security_ctx
3042        );
3043        let data = self.find_inode(parent)?;
3044
3045        let _ctx = security_ctx
3046            .filter(|ctx| *ctx != UNLABELED_CSTR)
3047            .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
3048            .transpose()?;
3049
3050        #[allow(unused_variables)]
3051        #[cfg(feature = "arc_quota")]
3052        let (uid, gid) = self.change_creds(&ctx, &data, name);
3053        #[cfg(feature = "fs_runtime_ugid_map")]
3054        let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
3055        #[cfg(not(feature = "fs_permission_translation"))]
3056        let (uid, gid) = (ctx.uid, ctx.gid);
3057
3058        let (_uid, _gid) = set_creds(uid, gid)?;
3059        {
3060            let _scoped_umask = ScopedUmask::new(umask);
3061            let casefold_cache = self.lock_casefold_lookup_caches();
3062
3063            // SAFETY: this doesn't modify any memory and we check the return value.
3064            syscall!(unsafe {
3065                libc::mknodat(
3066                    data.as_raw_descriptor(),
3067                    name.as_ptr(),
3068                    mode as libc::mode_t,
3069                    rdev as libc::dev_t,
3070                )
3071            })?;
3072            if let Some(mut c) = casefold_cache {
3073                c.insert(parent, name);
3074            }
3075        }
3076
3077        self.do_lookup(&data, name)
3078    }
3079
3080    fn link(
3081        &self,
3082        _ctx: Context,
3083        inode: Inode,
3084        newparent: Inode,
3085        newname: &CStr,
3086    ) -> io::Result<Entry> {
3087        validate_path_component(newname)?;
3088        let _trace = fs_trace!(self.tag, "link", inode, newparent, newname);
3089        let data = self.find_inode(inode)?;
3090        let new_inode = self.find_inode(newparent)?;
3091
3092        let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
3093            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
3094
3095        {
3096            let casefold_cache = self.lock_casefold_lookup_caches();
3097            // SAFETY: this doesn't modify any memory and we check the return value.
3098            syscall!(unsafe {
3099                libc::linkat(
3100                    self.proc.as_raw_descriptor(),
3101                    path.as_ptr(),
3102                    new_inode.as_raw_descriptor(),
3103                    newname.as_ptr(),
3104                    libc::AT_SYMLINK_FOLLOW,
3105                )
3106            })?;
3107            if let Some(mut c) = casefold_cache {
3108                c.insert(newparent, newname);
3109            }
3110        }
3111
3112        self.do_lookup(&new_inode, newname)
3113    }
3114
3115    fn symlink(
3116        &self,
3117        ctx: Context,
3118        linkname: &CStr,
3119        parent: Inode,
3120        name: &CStr,
3121        security_ctx: Option<&CStr>,
3122    ) -> io::Result<Entry> {
3123        validate_path_component(name)?;
3124        let _trace = fs_trace!(self.tag, "symlink", parent, linkname, name, security_ctx);
3125        let data = self.find_inode(parent)?;
3126
3127        let _ctx = security_ctx
3128            .filter(|ctx| *ctx != UNLABELED_CSTR)
3129            .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
3130            .transpose()?;
3131
3132        #[allow(unused_variables)]
3133        #[cfg(feature = "arc_quota")]
3134        let (uid, gid) = self.change_creds(&ctx, &data, name);
3135        #[cfg(feature = "fs_runtime_ugid_map")]
3136        let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
3137        #[cfg(not(feature = "fs_permission_translation"))]
3138        let (uid, gid) = (ctx.uid, ctx.gid);
3139
3140        let (_uid, _gid) = set_creds(uid, gid)?;
3141        {
3142            let casefold_cache = self.lock_casefold_lookup_caches();
3143            // SAFETY: this doesn't modify any memory and we check the return value.
3144            syscall!(unsafe {
3145                libc::symlinkat(linkname.as_ptr(), data.as_raw_descriptor(), name.as_ptr())
3146            })?;
3147            if let Some(mut c) = casefold_cache {
3148                c.insert(parent, name);
3149            }
3150        }
3151
3152        self.do_lookup(&data, name)
3153    }
3154
3155    fn readlink(&self, _ctx: Context, inode: Inode) -> io::Result<Vec<u8>> {
3156        let _trace = fs_trace!(self.tag, "readlink", inode);
3157        let data = self.find_inode(inode)?;
3158
3159        let mut buf = vec![0; libc::PATH_MAX as usize];
3160
3161        // SAFETY: this will only modify the contents of `buf` and we check the return value.
3162        let res = syscall!(unsafe {
3163            libc::readlinkat(
3164                data.as_raw_descriptor(),
3165                EMPTY_CSTR.as_ptr(),
3166                buf.as_mut_ptr() as *mut libc::c_char,
3167                buf.len(),
3168            )
3169        })?;
3170
3171        buf.resize(res as usize, 0);
3172
3173        #[cfg(feature = "fs_runtime_ugid_map")]
3174        {
3175            let link_target = Path::new(OsStr::from_bytes(&buf[..res as usize]));
3176            if !link_target.starts_with(&self.root_dir) {
3177                return Err(io::Error::new(
3178                    io::ErrorKind::InvalidInput,
3179                    "Symbolic link points outside of root_dir",
3180                ));
3181            }
3182        }
3183        Ok(buf)
3184    }
3185
3186    fn flush(
3187        &self,
3188        _ctx: Context,
3189        inode: Inode,
3190        handle: Handle,
3191        _lock_owner: u64,
3192    ) -> io::Result<()> {
3193        let _trace = fs_trace!(self.tag, "flush", inode, handle);
3194        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
3195            self.find_inode(inode)?
3196        } else {
3197            self.find_handle(handle, inode)?
3198        };
3199
3200        // SAFETY:
3201        // Since this method is called whenever an fd is closed in the client, we can emulate that
3202        // behavior by doing the same thing (dup-ing the fd and then immediately closing it). Safe
3203        // because this doesn't modify any memory and we check the return values.
3204        unsafe {
3205            let newfd = syscall!(libc::fcntl(
3206                data.as_raw_descriptor(),
3207                libc::F_DUPFD_CLOEXEC,
3208                0
3209            ))?;
3210
3211            syscall!(libc::close(newfd))?;
3212        }
3213        Ok(())
3214    }
3215
3216    fn fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()> {
3217        if self.zero_message_open.load(Ordering::Relaxed) {
3218            let _trace = fs_trace!(self.tag, "fsync (zero-message)", inode, datasync, handle);
3219            let data = self.find_inode(inode)?;
3220            self.do_fsync(&*data, datasync)
3221        } else {
3222            let _trace = fs_trace!(self.tag, "fsync", inode, datasync, handle);
3223            let data = self.find_handle(handle, inode)?;
3224
3225            let file = data.file.lock();
3226            self.do_fsync(&*file, datasync)
3227        }
3228    }
3229
3230    fn fsyncdir(
3231        &self,
3232        _ctx: Context,
3233        inode: Inode,
3234        datasync: bool,
3235        handle: Handle,
3236    ) -> io::Result<()> {
3237        if self.zero_message_opendir.load(Ordering::Relaxed) {
3238            let _trace = fs_trace!(self.tag, "fsyncdir (zero-message)", inode, datasync, handle);
3239            let data = self.find_inode(inode)?;
3240            self.do_fsync(&*data, datasync)
3241        } else {
3242            let _trace = fs_trace!(self.tag, "fsyncdir", inode, datasync, handle);
3243            let data = self.find_handle(handle, inode)?;
3244
3245            let file = data.file.lock();
3246            self.do_fsync(&*file, datasync)
3247        }
3248    }
3249
3250    fn access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()> {
3251        let _trace = fs_trace!(self.tag, "access", inode, mask);
3252        let data = self.find_inode(inode)?;
3253
3254        let st = stat(&*data)?;
3255        let mode = mask as i32 & (libc::R_OK | libc::W_OK | libc::X_OK);
3256
3257        if mode == libc::F_OK {
3258            // The file exists since we were able to call `stat(2)` on it.
3259            return Ok(());
3260        }
3261
3262        if (mode & libc::R_OK) != 0 {
3263            if ctx.uid != 0
3264                && (st.st_uid != ctx.uid || st.st_mode & 0o400 == 0)
3265                && (st.st_gid != ctx.gid || st.st_mode & 0o040 == 0)
3266                && st.st_mode & 0o004 == 0
3267            {
3268                return Err(io::Error::from_raw_os_error(libc::EACCES));
3269            }
3270        }
3271
3272        if (mode & libc::W_OK) != 0 {
3273            if ctx.uid != 0
3274                && (st.st_uid != ctx.uid || st.st_mode & 0o200 == 0)
3275                && (st.st_gid != ctx.gid || st.st_mode & 0o020 == 0)
3276                && st.st_mode & 0o002 == 0
3277            {
3278                return Err(io::Error::from_raw_os_error(libc::EACCES));
3279            }
3280        }
3281
3282        // root can only execute something if it is executable by one of the owner, the group, or
3283        // everyone.
3284        if (mode & libc::X_OK) != 0 {
3285            if (ctx.uid != 0 || st.st_mode & 0o111 == 0)
3286                && (st.st_uid != ctx.uid || st.st_mode & 0o100 == 0)
3287                && (st.st_gid != ctx.gid || st.st_mode & 0o010 == 0)
3288                && st.st_mode & 0o001 == 0
3289            {
3290                return Err(io::Error::from_raw_os_error(libc::EACCES));
3291            }
3292        }
3293
3294        Ok(())
3295    }
3296
3297    fn setxattr(
3298        &self,
3299        _ctx: Context,
3300        inode: Inode,
3301        name: &CStr,
3302        value: &[u8],
3303        flags: u32,
3304    ) -> io::Result<()> {
3305        let _trace = fs_trace!(self.tag, "setxattr", inode, name, flags);
3306        // We can't allow the VM to set this xattr because an unprivileged process may use it to set
3307        // a privileged xattr.
3308        if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
3309            return Err(io::Error::from_raw_os_error(libc::EPERM));
3310        }
3311
3312        let data = self.find_inode(inode)?;
3313        let name = self.rewrite_xattr_name(name);
3314
3315        #[cfg(feature = "arc_quota")]
3316        if self.skip_host_set_xattr(&data.path, &name.to_string_lossy()) {
3317            debug!(
3318                "ignore setxattr for path:{} xattr_name:{}",
3319                &data.path,
3320                &name.to_string_lossy()
3321            );
3322            return Ok(());
3323        }
3324
3325        let file = data.file.lock();
3326        let o_path_file = (file.open_flags & libc::O_PATH) != 0;
3327        if o_path_file {
3328            // For FDs opened with `O_PATH`, we cannot call `fsetxattr` normally. Instead we emulate
3329            // an _at syscall by changing the CWD to /proc, running the path based syscall, and then
3330            // setting the CWD back to the root directory.
3331            let path = CString::new(format!("self/fd/{}", file.as_raw_descriptor()))
3332                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
3333
3334            syscall!(self.with_proc_chdir(|| {
3335                // SAFETY: this doesn't modify any memory and we check the return value.
3336                unsafe {
3337                    libc::setxattr(
3338                        path.as_ptr(),
3339                        name.as_ptr(),
3340                        value.as_ptr() as *const libc::c_void,
3341                        value.len() as libc::size_t,
3342                        flags as c_int,
3343                    )
3344                }
3345            }))?;
3346        } else {
3347            syscall!(
3348                // For regular files and directories, we can just use fsetxattr.
3349                // SAFETY: this doesn't modify any memory and we check the return value.
3350                unsafe {
3351                    libc::fsetxattr(
3352                        file.as_raw_descriptor(),
3353                        name.as_ptr(),
3354                        value.as_ptr() as *const libc::c_void,
3355                        value.len() as libc::size_t,
3356                        flags as c_int,
3357                    )
3358                }
3359            )?;
3360        }
3361
3362        Ok(())
3363    }
3364
3365    fn getxattr(
3366        &self,
3367        _ctx: Context,
3368        inode: Inode,
3369        name: &CStr,
3370        size: u32,
3371    ) -> io::Result<GetxattrReply> {
3372        let _trace = fs_trace!(self.tag, "getxattr", inode, name, size);
3373        // We don't allow the VM to set this xattr so we also pretend there is no value associated
3374        // with it.
3375        if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
3376            return Err(io::Error::from_raw_os_error(libc::ENODATA));
3377        }
3378
3379        let data = self.find_inode(inode)?;
3380        let name = self.rewrite_xattr_name(name);
3381        let mut buf = vec![0u8; size as usize];
3382
3383        #[cfg(feature = "arc_quota")]
3384        let res = self.do_getxattr_with_filter(data, name, &mut buf)?;
3385
3386        #[cfg(not(feature = "arc_quota"))]
3387        let res = self.do_getxattr(&data, &name, &mut buf[..])?;
3388
3389        if size == 0 {
3390            Ok(GetxattrReply::Count(res as u32))
3391        } else {
3392            buf.truncate(res);
3393            Ok(GetxattrReply::Value(buf))
3394        }
3395    }
3396
3397    fn listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result<ListxattrReply> {
3398        let _trace = fs_trace!(self.tag, "listxattr", inode, size);
3399        let data = self.find_inode(inode)?;
3400
3401        let mut buf = vec![0u8; size as usize];
3402
3403        let file = data.file.lock();
3404        let o_path_file = (file.open_flags & libc::O_PATH) != 0;
3405        let res = if o_path_file {
3406            // For FDs opened with `O_PATH`, we cannot call `flistxattr` normally. Instead we
3407            // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
3408            // and then setting the CWD back to the root directory.
3409            let path = CString::new(format!("self/fd/{}", file.as_raw_descriptor()))
3410                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
3411
3412            // SAFETY: this will only modify `buf` and we check the return value.
3413            syscall!(self.with_proc_chdir(|| unsafe {
3414                libc::listxattr(
3415                    path.as_ptr(),
3416                    buf.as_mut_ptr() as *mut libc::c_char,
3417                    buf.len() as libc::size_t,
3418                )
3419            }))?
3420        } else {
3421            // For regular files and directories, we can just flistxattr.
3422            // SAFETY: this will only write to `buf` and we check the return value.
3423            syscall!(unsafe {
3424                libc::flistxattr(
3425                    file.as_raw_descriptor(),
3426                    buf.as_mut_ptr() as *mut libc::c_char,
3427                    buf.len() as libc::size_t,
3428                )
3429            })?
3430        };
3431
3432        if size == 0 {
3433            Ok(ListxattrReply::Count(res as u32))
3434        } else {
3435            buf.truncate(res as usize);
3436
3437            if self.cfg.rewrite_security_xattrs {
3438                strip_xattr_prefix(&mut buf);
3439            }
3440            Ok(ListxattrReply::Names(buf))
3441        }
3442    }
3443
3444    fn removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()> {
3445        let _trace = fs_trace!(self.tag, "removexattr", inode, name);
3446        // We don't allow the VM to set this xattr so we also pretend there is no value associated
3447        // with it.
3448        if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
3449            return Err(io::Error::from_raw_os_error(libc::ENODATA));
3450        }
3451
3452        let data = self.find_inode(inode)?;
3453        let name = self.rewrite_xattr_name(name);
3454
3455        let file = data.file.lock();
3456        let o_path_file = (file.open_flags & libc::O_PATH) != 0;
3457        if o_path_file {
3458            // For files opened with `O_PATH`, we cannot call `fremovexattr` normally. Instead we
3459            // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
3460            // and then setting the CWD back to the root directory.
3461            let path = CString::new(format!("self/fd/{}", file.as_raw_descriptor()))
3462                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
3463
3464            syscall!(self.with_proc_chdir(||
3465                    // SAFETY: this doesn't modify any memory and we check the return value.
3466                    unsafe { libc::removexattr(path.as_ptr(), name.as_ptr()) }))?;
3467        } else {
3468            // For regular files and directories, we can just use fremovexattr.
3469            syscall!(
3470                // SAFETY: this doesn't modify any memory and we check the return value.
3471                unsafe { libc::fremovexattr(file.as_raw_descriptor(), name.as_ptr()) }
3472            )?;
3473        }
3474
3475        Ok(())
3476    }
3477
3478    fn fallocate(
3479        &self,
3480        _ctx: Context,
3481        inode: Inode,
3482        handle: Handle,
3483        mode: u32,
3484        offset: u64,
3485        length: u64,
3486    ) -> io::Result<()> {
3487        let _trace = fs_trace!(self.tag, "fallocate", inode, handle, mode, offset, length);
3488
3489        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
3490            let data = self.find_inode(inode)?;
3491
3492            {
3493                // fallocate needs a writable fd
3494                let mut file = data.file.lock();
3495                let mut flags = file.open_flags;
3496                match flags & libc::O_ACCMODE {
3497                    libc::O_RDONLY => {
3498                        flags &= !libc::O_RDONLY;
3499                        flags |= libc::O_RDWR;
3500
3501                        // We need to get a writable handle for this file.
3502                        let newfile = self.open_fd(file.as_raw_descriptor(), libc::O_RDWR)?;
3503                        *file = OpenedFile::new(newfile, flags);
3504                    }
3505                    libc::O_WRONLY | libc::O_RDWR => {}
3506                    _ => panic!("Unexpected flags: {flags:#x}"),
3507                }
3508            }
3509
3510            data
3511        } else {
3512            self.find_handle(handle, inode)?
3513        };
3514
3515        let fd = data.as_raw_descriptor();
3516        // SAFETY: this doesn't modify any memory and we check the return value.
3517        syscall!(unsafe {
3518            libc::fallocate64(
3519                fd,
3520                mode as libc::c_int,
3521                offset as libc::off64_t,
3522                length as libc::off64_t,
3523            )
3524        })?;
3525
3526        Ok(())
3527    }
3528
3529    #[allow(clippy::unnecessary_cast)]
3530    fn ioctl<R: io::Read>(
3531        &self,
3532        ctx: Context,
3533        inode: Inode,
3534        handle: Handle,
3535        _flags: IoctlFlags,
3536        cmd: u32,
3537        _arg: u64,
3538        in_size: u32,
3539        out_size: u32,
3540        r: R,
3541    ) -> io::Result<IoctlReply> {
3542        let _trace = fs_trace!(self.tag, "ioctl", inode, handle, cmd, in_size, out_size);
3543
3544        match cmd as IoctlNr {
3545            FS_IOC_GET_ENCRYPTION_POLICY_EX => self.get_encryption_policy_ex(inode, handle, r),
3546            FS_IOC_FSGETXATTR => {
3547                if out_size < size_of::<fsxattr>() as u32 {
3548                    Err(io::Error::from_raw_os_error(libc::ENOMEM))
3549                } else {
3550                    self.get_fsxattr(inode, handle)
3551                }
3552            }
3553            FS_IOC_FSSETXATTR => {
3554                if in_size < size_of::<fsxattr>() as u32 {
3555                    Err(io::Error::from_raw_os_error(libc::EINVAL))
3556                } else {
3557                    self.set_fsxattr(ctx, inode, handle, r)
3558                }
3559            }
3560            FS_IOC32_GETFLAGS | FS_IOC64_GETFLAGS => {
3561                if out_size < size_of::<c_int>() as u32 {
3562                    Err(io::Error::from_raw_os_error(libc::ENOMEM))
3563                } else {
3564                    self.get_flags(inode, handle)
3565                }
3566            }
3567            FS_IOC32_SETFLAGS | FS_IOC64_SETFLAGS => {
3568                if in_size < size_of::<c_int>() as u32 {
3569                    Err(io::Error::from_raw_os_error(libc::ENOMEM))
3570                } else {
3571                    self.set_flags(ctx, inode, handle, r)
3572                }
3573            }
3574            FS_IOC_ENABLE_VERITY => {
3575                if in_size < size_of::<fsverity_enable_arg>() as u32 {
3576                    Err(io::Error::from_raw_os_error(libc::ENOMEM))
3577                } else {
3578                    self.enable_verity(inode, handle, r)
3579                }
3580            }
3581            FS_IOC_MEASURE_VERITY => {
3582                if in_size < size_of::<fsverity_digest>() as u32
3583                    || out_size < size_of::<fsverity_digest>() as u32
3584                {
3585                    Err(io::Error::from_raw_os_error(libc::ENOMEM))
3586                } else {
3587                    self.measure_verity(inode, handle, r, out_size)
3588                }
3589            }
3590            // The following is ARCVM-specific ioctl
3591            // Refer go/remove-mount-passthrough-fuse for more design details
3592            #[cfg(feature = "arc_quota")]
3593            FS_IOC_SETPERMISSION => {
3594                if in_size != size_of::<FsPermissionDataBuffer>() as u32 {
3595                    Err(io::Error::from_raw_os_error(libc::EINVAL))
3596                } else {
3597                    Ok(self.set_permission_by_path(r))
3598                }
3599            }
3600            #[cfg(feature = "arc_quota")]
3601            FS_IOC_SETPATHXATTR => {
3602                if in_size != size_of::<FsPathXattrDataBuffer>() as u32 {
3603                    Err(io::Error::from_raw_os_error(libc::EINVAL))
3604                } else {
3605                    Ok(self.set_xattr_by_path(r))
3606                }
3607            }
3608            _ => Err(io::Error::from_raw_os_error(libc::ENOTTY)),
3609        }
3610    }
3611
3612    fn copy_file_range(
3613        &self,
3614        ctx: Context,
3615        inode_src: Inode,
3616        handle_src: Handle,
3617        offset_src: u64,
3618        inode_dst: Inode,
3619        handle_dst: Handle,
3620        offset_dst: u64,
3621        length: u64,
3622        flags: u64,
3623    ) -> io::Result<usize> {
3624        let _trace = fs_trace!(
3625            self.tag,
3626            "copy_file_range",
3627            inode_src,
3628            handle_src,
3629            offset_src,
3630            inode_dst,
3631            handle_dst,
3632            offset_dst,
3633            length,
3634            flags
3635        );
3636        let dst_inode_data = self.find_inode(inode_dst)?;
3637
3638        #[allow(unused_variables)]
3639        #[cfg(feature = "arc_quota")]
3640        let (uid, gid) = self.change_creds_for_path(&ctx, &dst_inode_data.path);
3641        #[cfg(feature = "fs_runtime_ugid_map")]
3642        let (uid, gid) = self.change_ugid_creds_for_path(&ctx, &dst_inode_data.path);
3643        #[cfg(not(feature = "fs_permission_translation"))]
3644        let (uid, gid) = (ctx.uid, ctx.gid);
3645
3646        // We need to change credentials during a write so that the kernel will remove setuid or
3647        // setgid bits from the file if it was written to by someone other than the owner.
3648        let (_uid, _gid) = set_creds(uid, gid)?;
3649        let (src_data, dst_data): (Arc<dyn AsRawDescriptor>, Arc<dyn AsRawDescriptor>) =
3650            if self.zero_message_open.load(Ordering::Relaxed) {
3651                (self.find_inode(inode_src)?, dst_inode_data)
3652            } else {
3653                (
3654                    self.find_handle(handle_src, inode_src)?,
3655                    self.find_handle(handle_dst, inode_dst)?,
3656                )
3657            };
3658
3659        let src = src_data.as_raw_descriptor();
3660        let dst = dst_data.as_raw_descriptor();
3661
3662        Ok(syscall!(
3663            // SAFETY: this call is safe because it doesn't modify any memory and we
3664            // check the return value.
3665            unsafe {
3666                libc::syscall(
3667                    libc::SYS_copy_file_range,
3668                    src,
3669                    &offset_src,
3670                    dst,
3671                    &offset_dst,
3672                    length,
3673                    flags,
3674                )
3675            }
3676        )? as usize)
3677    }
3678
3679    fn set_up_mapping<M: Mapper>(
3680        &self,
3681        _ctx: Context,
3682        inode: Self::Inode,
3683        _handle: Self::Handle,
3684        file_offset: u64,
3685        mem_offset: u64,
3686        size: usize,
3687        prot: u32,
3688        mapper: M,
3689    ) -> io::Result<()> {
3690        let _trace = fs_trace!(
3691            self.tag,
3692            "set_up_mapping",
3693            inode,
3694            file_offset,
3695            mem_offset,
3696            size,
3697            prot
3698        );
3699        if !self.cfg.use_dax {
3700            return Err(io::Error::from_raw_os_error(libc::ENOSYS));
3701        }
3702
3703        let read = prot & libc::PROT_READ as u32 != 0;
3704        let write = prot & libc::PROT_WRITE as u32 != 0;
3705        let (mmap_flags, prot) = match (read, write) {
3706            (true, true) => (libc::O_RDWR, Protection::read_write()),
3707            (true, false) => (libc::O_RDONLY, Protection::read()),
3708            // Write-only is mapped to O_RDWR since mmap always requires an fd opened for reading.
3709            (false, true) => (libc::O_RDWR, Protection::write()),
3710            (false, false) => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
3711        };
3712
3713        let data = self.find_inode(inode)?;
3714
3715        if self.zero_message_open.load(Ordering::Relaxed) {
3716            let mut file = data.file.lock();
3717            let mut open_flags = file.open_flags;
3718            match (mmap_flags, open_flags & libc::O_ACCMODE) {
3719                (libc::O_RDONLY, libc::O_WRONLY)
3720                | (libc::O_RDWR, libc::O_RDONLY)
3721                | (libc::O_RDWR, libc::O_WRONLY) => {
3722                    // We have a read-only or write-only fd and we need to upgrade it.
3723                    open_flags &= !libc::O_ACCMODE;
3724                    open_flags |= libc::O_RDWR;
3725
3726                    let newfile = self.open_fd(file.as_raw_descriptor(), libc::O_RDWR)?;
3727                    *file = OpenedFile::new(newfile, open_flags);
3728                }
3729                (libc::O_RDONLY, libc::O_RDONLY)
3730                | (libc::O_RDONLY, libc::O_RDWR)
3731                | (libc::O_RDWR, libc::O_RDWR) => {}
3732                (m, o) => panic!("Unexpected combination of access flags: ({m:#x}, {o:#x})"),
3733            }
3734            mapper.map(mem_offset, size, file.file(), file_offset, prot)
3735        } else {
3736            let file = self.open_inode(&data, mmap_flags | libc::O_NONBLOCK)?;
3737            mapper.map(mem_offset, size, &file, file_offset, prot)
3738        }
3739    }
3740
3741    fn remove_mapping<M: Mapper>(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()> {
3742        let _trace = fs_trace!(self.tag, "remove_mapping", msgs);
3743        if !self.cfg.use_dax {
3744            return Err(io::Error::from_raw_os_error(libc::ENOSYS));
3745        }
3746
3747        for RemoveMappingOne { moffset, len } in msgs {
3748            mapper.unmap(*moffset, *len)?;
3749        }
3750        Ok(())
3751    }
3752
3753    fn atomic_open(
3754        &self,
3755        ctx: Context,
3756        parent: Self::Inode,
3757        name: &CStr,
3758        mode: u32,
3759        flags: u32,
3760        umask: u32,
3761        security_ctx: Option<&CStr>,
3762    ) -> io::Result<(Entry, Option<Self::Handle>, OpenOptions)> {
3763        validate_path_component(name)?;
3764        let _trace = fs_trace!(
3765            self.tag,
3766            "atomic_open",
3767            parent,
3768            name,
3769            mode,
3770            flags,
3771            umask,
3772            security_ctx
3773        );
3774        // Perform lookup but not create negative dentry
3775        let data = self.find_inode(parent)?;
3776
3777        #[allow(unused_variables)]
3778        #[cfg(feature = "arc_quota")]
3779        let (uid, gid) = self.change_creds(&ctx, &data, name);
3780        #[cfg(feature = "fs_runtime_ugid_map")]
3781        let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
3782        #[cfg(not(feature = "fs_permission_translation"))]
3783        let (uid, gid) = (ctx.uid, ctx.gid);
3784
3785        let (_uid, _gid) = set_creds(uid, gid)?;
3786
3787        // This lookup serves two purposes:
3788        // 1. If the O_CREATE flag is not set, it retrieves the d_entry for the file.
3789        // 2. If the O_CREATE flag is set, it checks whether the file exists.
3790        let res = self.do_lookup_with_casefold_fallback(&data, name);
3791
3792        if let Err(e) = res {
3793            if e.kind() == std::io::ErrorKind::NotFound && (flags as i32 & libc::O_CREAT) != 0 {
3794                // If the file did not exist & O_CREAT is set,
3795                // create file & set FILE_CREATED bits in open options
3796                let (entry, handler, mut opts) =
3797                    self.create(ctx, parent, name, mode, flags, umask, security_ctx)?;
3798                opts |= OpenOptions::FILE_CREATED;
3799                return Ok((entry, handler, opts));
3800            } else if e.kind() == std::io::ErrorKind::NotFound
3801                && !self.cfg.negative_timeout.is_zero()
3802            {
3803                return Ok((
3804                    Entry::new_negative(self.cfg.negative_timeout),
3805                    None,
3806                    OpenOptions::empty(),
3807                ));
3808            }
3809            return Err(e);
3810        }
3811
3812        // SAFETY: checked res is not error before
3813        let entry = res.unwrap();
3814
3815        if entry.attr.st_mode & libc::S_IFMT == libc::S_IFLNK {
3816            return Ok((entry, None, OpenOptions::empty()));
3817        }
3818
3819        if (flags as i32 & (libc::O_CREAT | libc::O_EXCL)) == (libc::O_CREAT | libc::O_EXCL) {
3820            return Err(eexist());
3821        }
3822
3823        let (handler, opts) = if self.zero_message_open.load(Ordering::Relaxed) {
3824            (None, OpenOptions::KEEP_CACHE)
3825        } else {
3826            let (handler, opts) = self.do_open(entry.inode, flags)?;
3827            (handler, opts)
3828        };
3829        Ok((entry, handler, opts))
3830    }
3831}
3832
3833#[cfg(test)]
3834mod tests {
3835    use std::path::Path;
3836
3837    use named_lock::NamedLock;
3838    use tempfile::TempDir;
3839
3840    use super::*;
3841    #[cfg(feature = "arc_quota")]
3842    use crate::virtio::fs::arc_ioctl::FS_IOCTL_PATH_MAX_LEN;
3843    #[cfg(feature = "arc_quota")]
3844    use crate::virtio::fs::arc_ioctl::FS_IOCTL_XATTR_NAME_MAX_LEN;
3845    #[cfg(feature = "arc_quota")]
3846    use crate::virtio::fs::arc_ioctl::FS_IOCTL_XATTR_VALUE_MAX_LEN;
3847
3848    const UNITTEST_LOCK_NAME: &str = "passthroughfs_unittest_lock";
3849
3850    // Create an instance of `Context` with valid uid, gid, and pid.
3851    // The correct ids are necessary for test cases where new files are created.
3852    fn get_context() -> Context {
3853        // SAFETY: both calls take no parameters and only return an integer value. The kernel also
3854        // guarantees that they can never fail.
3855        let uid = unsafe { libc::syscall(SYS_GETEUID) as libc::uid_t };
3856        // SAFETY: both calls take no parameters and only return an integer value. The kernel also
3857        // guarantees that they can never fail.
3858        let gid = unsafe { libc::syscall(SYS_GETEGID) as libc::gid_t };
3859        let pid = std::process::id() as libc::pid_t;
3860        Context { uid, gid, pid }
3861    }
3862
3863    /// Creates the given directories and files under `temp_dir`.
3864    fn create_test_data(temp_dir: &TempDir, dirs: &[&str], files: &[&str]) {
3865        let path = temp_dir.path();
3866
3867        for d in dirs {
3868            std::fs::create_dir_all(path.join(d)).unwrap();
3869        }
3870
3871        for f in files {
3872            File::create(path.join(f)).unwrap();
3873        }
3874    }
3875
3876    /// Looks up the given `path` in `fs`.
3877    fn lookup(fs: &PassthroughFs, path: &Path) -> io::Result<Inode> {
3878        let mut inode = 1;
3879        let ctx = get_context();
3880        for name in path.iter() {
3881            let name = CString::new(name.to_str().unwrap()).unwrap();
3882            let ent = match fs.lookup(ctx, inode, &name) {
3883                Ok(ent) => ent,
3884                Err(e) => {
3885                    return Err(e);
3886                }
3887            };
3888            inode = ent.inode;
3889        }
3890        Ok(inode)
3891    }
3892
3893    /// Looks up the given `path` in `fs`.
3894    #[cfg(feature = "arc_quota")]
3895    fn lookup_ent(fs: &PassthroughFs, path: &Path) -> io::Result<Entry> {
3896        let mut inode = 1;
3897        let ctx = get_context();
3898        let mut entry = Entry::new_negative(Duration::from_secs(10));
3899        for name in path.iter() {
3900            let name = CString::new(name.to_str().unwrap()).unwrap();
3901            entry = match fs.lookup(ctx, inode, &name) {
3902                Ok(ent) => ent,
3903                Err(e) => {
3904                    return Err(e);
3905                }
3906            };
3907            inode = entry.inode;
3908        }
3909        Ok(entry)
3910    }
3911
3912    /// Creates a file at the given `path`.
3913    fn create(fs: &PassthroughFs, path: &Path) -> io::Result<Entry> {
3914        let parent = path.parent().unwrap();
3915        let filename = CString::new(path.file_name().unwrap().to_str().unwrap()).unwrap();
3916        let parent_inode = lookup(fs, parent)?;
3917        let ctx = get_context();
3918        let security_ctx = None;
3919        fs.create(
3920            ctx,
3921            parent_inode,
3922            &filename,
3923            0o666,
3924            libc::O_RDWR as u32,
3925            0,
3926            security_ctx,
3927        )
3928        .map(|(entry, _, _)| entry)
3929    }
3930
3931    /// Removes a file at the given `path`.
3932    fn unlink(fs: &PassthroughFs, path: &Path) -> io::Result<()> {
3933        let parent = path.parent().unwrap();
3934        let filename = CString::new(path.file_name().unwrap().to_str().unwrap()).unwrap();
3935        let parent_inode = lookup(fs, parent)?;
3936        let ctx = get_context();
3937        fs.unlink(ctx, parent_inode, &filename)
3938    }
3939
3940    /// Forgets cache.
3941    fn forget(fs: &PassthroughFs, path: &Path) -> io::Result<()> {
3942        let ctx = get_context();
3943        let inode = lookup(fs, path)?;
3944        // Pass `u64::MAX` to ensure that the refcount goes to 0 and we forget inode.
3945        fs.forget(ctx, inode, u64::MAX);
3946        Ok(())
3947    }
3948
3949    /// Looks up and open the given `path` in `fs`.
3950    fn atomic_open(
3951        fs: &PassthroughFs,
3952        path: &Path,
3953        mode: u32,
3954        flags: u32,
3955        umask: u32,
3956        security_ctx: Option<&CStr>,
3957    ) -> io::Result<(Entry, Option<Handle>, OpenOptions)> {
3958        let mut inode = 1;
3959        let ctx = get_context();
3960
3961        let path_vec: Vec<_> = path.iter().collect();
3962        let vec_len = path_vec.len();
3963
3964        // Do lookup before util (vec_len-1)-th pathname, this operation is to simulate
3965        // the behavior of VFS, since when VFS call atomic_open only at last look up.
3966        for name in &path_vec[0..vec_len - 1] {
3967            let name = CString::new(name.to_str().unwrap()).unwrap();
3968            let ent = fs.lookup(ctx, inode, &name)?;
3969            inode = ent.inode;
3970        }
3971
3972        let name = CString::new(path_vec[vec_len - 1].to_str().unwrap()).unwrap();
3973
3974        fs.atomic_open(ctx, inode, &name, mode, flags, umask, security_ctx)
3975    }
3976
3977    fn symlink(
3978        fs: &PassthroughFs,
3979        linkname: &Path,
3980        path: &Path,
3981        security_ctx: Option<&CStr>,
3982    ) -> io::Result<Entry> {
3983        let parent = path.parent().unwrap();
3984        let filename = CString::new(path.file_name().unwrap().to_str().unwrap()).unwrap();
3985        let parent_inode = lookup(fs, parent)?;
3986        let ctx = get_context();
3987        let linkname = CString::new(linkname.to_str().unwrap()).unwrap();
3988        fs.symlink(ctx, &linkname, parent_inode, &filename, security_ctx)
3989    }
3990
3991    // In this ioctl inode,handle,flags,arg and out_size is irrelavant, set to empty value.
3992    #[cfg(feature = "arc_quota")]
3993    fn fs_ioc_setpermission<R: io::Read>(
3994        fs: &PassthroughFs,
3995        in_size: u32,
3996        r: R,
3997    ) -> io::Result<IoctlReply> {
3998        let ctx = get_context();
3999        fs.ioctl(
4000            ctx,
4001            0,
4002            0,
4003            IoctlFlags::empty(),
4004            FS_IOC_SETPERMISSION as u32,
4005            0,
4006            in_size,
4007            0,
4008            r,
4009        )
4010    }
4011
4012    // In this ioctl inode,handle,flags,arg and out_size is irrelavant, set to empty value.
4013    #[cfg(feature = "arc_quota")]
4014    fn fs_ioc_setpathxattr<R: io::Read>(
4015        fs: &PassthroughFs,
4016        in_size: u32,
4017        r: R,
4018    ) -> io::Result<IoctlReply> {
4019        let ctx = get_context();
4020        fs.ioctl(
4021            ctx,
4022            0,
4023            0,
4024            IoctlFlags::empty(),
4025            FS_IOC_SETPATHXATTR as u32,
4026            0,
4027            in_size,
4028            0,
4029            r,
4030        )
4031    }
4032
4033    #[test]
4034    fn rewrite_xattr_names() {
4035        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4036        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4037        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4038        let _guard = lock.lock().expect("acquire named lock");
4039
4040        let cfg = Config {
4041            rewrite_security_xattrs: true,
4042            ..Default::default()
4043        };
4044
4045        let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
4046
4047        // Selinux shouldn't get overwritten.
4048        let selinux = c"security.selinux";
4049        assert_eq!(p.rewrite_xattr_name(selinux).to_bytes(), selinux.to_bytes());
4050
4051        // user, trusted, and system should not be changed either.
4052        let user = c"user.foobar";
4053        assert_eq!(p.rewrite_xattr_name(user).to_bytes(), user.to_bytes());
4054        let trusted = c"trusted.foobar";
4055        assert_eq!(p.rewrite_xattr_name(trusted).to_bytes(), trusted.to_bytes());
4056        let system = c"system.foobar";
4057        assert_eq!(p.rewrite_xattr_name(system).to_bytes(), system.to_bytes());
4058
4059        // sehash should be re-written.
4060        let sehash = c"security.sehash";
4061        assert_eq!(
4062            p.rewrite_xattr_name(sehash).to_bytes(),
4063            b"user.virtiofs.security.sehash"
4064        );
4065    }
4066
4067    #[test]
4068    fn strip_xattr_names() {
4069        let only_nuls = b"\0\0\0\0\0";
4070        let mut actual = only_nuls.to_vec();
4071        strip_xattr_prefix(&mut actual);
4072        assert_eq!(&actual[..], &only_nuls[..]);
4073
4074        let no_nuls = b"security.sehashuser.virtiofs";
4075        let mut actual = no_nuls.to_vec();
4076        strip_xattr_prefix(&mut actual);
4077        assert_eq!(&actual[..], &no_nuls[..]);
4078
4079        let empty = b"";
4080        let mut actual = empty.to_vec();
4081        strip_xattr_prefix(&mut actual);
4082        assert_eq!(&actual[..], &empty[..]);
4083
4084        let no_strippable_names = b"security.selinux\0user.foobar\0system.test\0";
4085        let mut actual = no_strippable_names.to_vec();
4086        strip_xattr_prefix(&mut actual);
4087        assert_eq!(&actual[..], &no_strippable_names[..]);
4088
4089        let only_strippable_names = b"user.virtiofs.security.sehash\0user.virtiofs.security.wat\0";
4090        let mut actual = only_strippable_names.to_vec();
4091        strip_xattr_prefix(&mut actual);
4092        assert_eq!(&actual[..], b"security.sehash\0security.wat\0");
4093
4094        let mixed_names = b"user.virtiofs.security.sehash\0security.selinux\0user.virtiofs.security.wat\0user.foobar\0";
4095        let mut actual = mixed_names.to_vec();
4096        strip_xattr_prefix(&mut actual);
4097        let expected = b"security.sehash\0security.selinux\0security.wat\0user.foobar\0";
4098        assert_eq!(&actual[..], &expected[..]);
4099
4100        let no_nul_with_prefix = b"user.virtiofs.security.sehash";
4101        let mut actual = no_nul_with_prefix.to_vec();
4102        strip_xattr_prefix(&mut actual);
4103        assert_eq!(&actual[..], b"security.sehash");
4104    }
4105
4106    #[test]
4107    fn lookup_files() {
4108        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4109        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4110        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4111        let _guard = lock.lock().expect("acquire named lock");
4112
4113        let temp_dir = TempDir::new().unwrap();
4114        create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt"]);
4115
4116        let cfg = Default::default();
4117        let fs = PassthroughFs::new("tag", cfg).unwrap();
4118
4119        let capable = FsOptions::empty();
4120        fs.init(capable).unwrap();
4121
4122        assert!(lookup(&fs, &temp_dir.path().join("a.txt")).is_ok());
4123        assert!(lookup(&fs, &temp_dir.path().join("dir")).is_ok());
4124        assert!(lookup(&fs, &temp_dir.path().join("dir/b.txt")).is_ok());
4125
4126        assert_eq!(
4127            lookup(&fs, &temp_dir.path().join("nonexistent-file"))
4128                .expect_err("file must not exist")
4129                .kind(),
4130            io::ErrorKind::NotFound
4131        );
4132        // "A.txt" is different from "a.txt".
4133        assert_eq!(
4134            lookup(&fs, &temp_dir.path().join("A.txt"))
4135                .expect_err("file must not exist")
4136                .kind(),
4137            io::ErrorKind::NotFound
4138        );
4139    }
4140
4141    #[test]
4142    fn lookup_files_ascii_casefold() {
4143        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4144        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4145        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4146        let _guard = lock.lock().expect("acquire named lock");
4147
4148        let temp_dir = TempDir::new().unwrap();
4149        create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt"]);
4150
4151        let cfg = Config {
4152            ascii_casefold: true,
4153            ..Default::default()
4154        };
4155        let fs = PassthroughFs::new("tag", cfg).unwrap();
4156
4157        let capable = FsOptions::empty();
4158        fs.init(capable).unwrap();
4159
4160        // Ensure that "A.txt" is equated with "a.txt".
4161        let a_inode = lookup(&fs, &temp_dir.path().join("a.txt")).expect("a.txt must be found");
4162        assert_eq!(
4163            lookup(&fs, &temp_dir.path().join("A.txt")).expect("A.txt must exist"),
4164            a_inode
4165        );
4166
4167        let dir_inode = lookup(&fs, &temp_dir.path().join("dir")).expect("dir must be found");
4168        assert_eq!(
4169            lookup(&fs, &temp_dir.path().join("DiR")).expect("DiR must exist"),
4170            dir_inode
4171        );
4172
4173        let b_inode =
4174            lookup(&fs, &temp_dir.path().join("dir/b.txt")).expect("dir/b.txt must be found");
4175        assert_eq!(
4176            lookup(&fs, &temp_dir.path().join("dIr/B.TxT")).expect("dIr/B.TxT must exist"),
4177            b_inode
4178        );
4179
4180        assert_eq!(
4181            lookup(&fs, &temp_dir.path().join("nonexistent-file"))
4182                .expect_err("file must not exist")
4183                .kind(),
4184            io::ErrorKind::NotFound
4185        );
4186    }
4187
4188    fn test_create_and_remove(ascii_casefold: bool) {
4189        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4190        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4191        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4192        let _guard = lock.lock().expect("acquire named lock");
4193
4194        let temp_dir = TempDir::new().unwrap();
4195        let timeout = Duration::from_millis(10);
4196        let cfg = Config {
4197            timeout,
4198            cache_policy: CachePolicy::Auto,
4199            ascii_casefold,
4200            ..Default::default()
4201        };
4202        let fs = PassthroughFs::new("tag", cfg).unwrap();
4203
4204        let capable = FsOptions::empty();
4205        fs.init(capable).unwrap();
4206
4207        // Create a.txt and b.txt.
4208        let a_path = temp_dir.path().join("a.txt");
4209        let b_path = temp_dir.path().join("b.txt");
4210        let a_entry = create(&fs, &a_path).expect("create a.txt");
4211        let b_entry = create(&fs, &b_path).expect("create b.txt");
4212        assert_eq!(
4213            a_entry.inode,
4214            lookup(&fs, &a_path).expect("lookup a.txt"),
4215            "Created file 'a.txt' must be looked up"
4216        );
4217        assert_eq!(
4218            b_entry.inode,
4219            lookup(&fs, &b_path).expect("lookup b.txt"),
4220            "Created file 'b.txt' must be looked up"
4221        );
4222
4223        // Remove a.txt only
4224        unlink(&fs, &a_path).expect("Remove");
4225        assert_eq!(
4226            lookup(&fs, &a_path)
4227                .expect_err("file must not exist")
4228                .kind(),
4229            io::ErrorKind::NotFound,
4230            "a.txt must be removed"
4231        );
4232        // "A.TXT" must not be found regardless of whether casefold is enabled or not.
4233        let upper_a_path = temp_dir.path().join("A.TXT");
4234        assert_eq!(
4235            lookup(&fs, &upper_a_path)
4236                .expect_err("file must not exist")
4237                .kind(),
4238            io::ErrorKind::NotFound,
4239            "A.txt must be removed"
4240        );
4241
4242        // Check if the host file system doesn't have a.txt but does b.txt.
4243        assert!(!a_path.exists(), "a.txt must be removed");
4244        assert!(b_path.exists(), "b.txt must exist");
4245    }
4246
4247    #[test]
4248    fn create_and_remove() {
4249        test_create_and_remove(false /* casefold */);
4250    }
4251
4252    #[test]
4253    fn create_and_remove_casefold() {
4254        test_create_and_remove(true /* casefold */);
4255    }
4256
4257    fn test_create_and_forget(ascii_casefold: bool) {
4258        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4259        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4260        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4261        let _guard = lock.lock().expect("acquire named lock");
4262
4263        let temp_dir = TempDir::new().unwrap();
4264        let timeout = Duration::from_millis(10);
4265        let cfg = Config {
4266            timeout,
4267            cache_policy: CachePolicy::Auto,
4268            ascii_casefold,
4269            ..Default::default()
4270        };
4271        let fs = PassthroughFs::new("tag", cfg).unwrap();
4272
4273        let capable = FsOptions::empty();
4274        fs.init(capable).unwrap();
4275
4276        // Create a.txt.
4277        let a_path = temp_dir.path().join("a.txt");
4278        let a_entry = create(&fs, &a_path).expect("create a.txt");
4279        assert_eq!(
4280            a_entry.inode,
4281            lookup(&fs, &a_path).expect("lookup a.txt"),
4282            "Created file 'a.txt' must be looked up"
4283        );
4284
4285        // Forget a.txt's inode from PassthroughFs's internal cache.
4286        forget(&fs, &a_path).expect("forget a.txt");
4287
4288        if ascii_casefold {
4289            let upper_a_path = temp_dir.path().join("A.TXT");
4290            let new_a_inode = lookup(&fs, &upper_a_path).expect("lookup a.txt");
4291            assert_ne!(
4292                a_entry.inode, new_a_inode,
4293                "inode must be changed after forget()"
4294            );
4295            assert_eq!(
4296                new_a_inode,
4297                lookup(&fs, &a_path).expect("lookup a.txt"),
4298                "inode must be same for a.txt and A.TXT"
4299            );
4300        } else {
4301            assert_ne!(
4302                a_entry.inode,
4303                lookup(&fs, &a_path).expect("lookup a.txt"),
4304                "inode must be changed after forget()"
4305            );
4306        }
4307    }
4308
4309    #[test]
4310    fn create_and_forget() {
4311        test_create_and_forget(false /* ascii_casefold */);
4312    }
4313
4314    #[test]
4315    fn create_and_forget_casefold() {
4316        test_create_and_forget(true /* ascii_casefold */);
4317    }
4318
4319    #[test]
4320    fn casefold_lookup_cache() {
4321        let temp_dir = TempDir::new().unwrap();
4322        // Prepare `a.txt` before starting the test.
4323        create_test_data(&temp_dir, &[], &["a.txt"]);
4324
4325        let cfg = Config {
4326            ascii_casefold: true,
4327            ..Default::default()
4328        };
4329        let fs = PassthroughFs::new("tag", cfg).unwrap();
4330
4331        let capable = FsOptions::empty();
4332        fs.init(capable).unwrap();
4333
4334        let parent = lookup(&fs, temp_dir.path()).expect("lookup temp_dir");
4335
4336        // Since `a.txt` exists, "A.TXT" must exist.
4337        let large_a_path = temp_dir.path().join("A.TXT");
4338        // Looking up "A.TXT" must create a CasefoldCache entry.
4339        lookup(&fs, &large_a_path).expect("A.TXT must exist");
4340        assert!(fs.exists_in_casefold_cache(parent, &CString::new("A.TXT").unwrap()));
4341
4342        // Create b.txt.
4343        let b_path = temp_dir.path().join("b.txt");
4344        create(&fs, &b_path).expect("create b.txt");
4345        // Then, b.txt must exists in the cache.
4346        assert!(fs.exists_in_casefold_cache(parent, &CString::new("B.TXT").unwrap()));
4347        // When removing b.txt, it must be removed from the cache as well.
4348        unlink(&fs, &b_path).expect("remove b.txt");
4349        assert!(!fs.exists_in_casefold_cache(parent, &CString::new("B.TXT").unwrap()));
4350    }
4351
4352    #[test]
4353    fn lookup_negative_cache() {
4354        let temp_dir = TempDir::new().unwrap();
4355        // Prepare `a.txt` before starting the test.
4356        create_test_data(&temp_dir, &[], &[]);
4357
4358        let cfg = Config {
4359            negative_timeout: Duration::from_secs(5),
4360            ..Default::default()
4361        };
4362        let fs = PassthroughFs::new("tag", cfg).unwrap();
4363
4364        let capable = FsOptions::empty();
4365        fs.init(capable).unwrap();
4366
4367        let a_path = temp_dir.path().join("a.txt");
4368        // a.txt hasn't existed yet.
4369        // Since negative_timeout is enabled, success with inode=0 is expected.
4370        assert_eq!(
4371            0,
4372            lookup(&fs, &a_path).expect("lookup a.txt"),
4373            "Entry with inode=0 is expected for non-existing file 'a.txt'"
4374        );
4375        // Create a.txt
4376        let a_entry = create(&fs, &a_path).expect("create a.txt");
4377        assert_eq!(
4378            a_entry.inode,
4379            lookup(&fs, &a_path).expect("lookup a.txt"),
4380            "Created file 'a.txt' must be looked up"
4381        );
4382        // Remove a.txt
4383        unlink(&fs, &a_path).expect("Remove");
4384        assert_eq!(
4385            0,
4386            lookup(&fs, &a_path).expect("lookup a.txt"),
4387            "Entry with inode=0 is expected for the removed file 'a.txt'"
4388        );
4389    }
4390    #[test]
4391    fn test_atomic_open_existing_file() {
4392        atomic_open_existing_file(false);
4393    }
4394
4395    #[test]
4396    fn test_atomic_open_existing_file_zero_message() {
4397        atomic_open_existing_file(true);
4398    }
4399
4400    fn atomic_open_existing_file(zero_message_open: bool) {
4401        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4402        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4403        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4404        let _guard = lock.lock().expect("acquire named lock");
4405
4406        let temp_dir = TempDir::new().unwrap();
4407        create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt", "dir/c.txt"]);
4408
4409        let cache_policy = match zero_message_open {
4410            true => CachePolicy::Always,
4411            false => CachePolicy::Auto,
4412        };
4413
4414        let cfg = Config {
4415            cache_policy,
4416            ..Default::default()
4417        };
4418        let fs = PassthroughFs::new("tag", cfg).unwrap();
4419
4420        let capable = FsOptions::ZERO_MESSAGE_OPEN;
4421        fs.init(capable).unwrap();
4422
4423        // atomic_open with flag O_RDWR, should return positive dentry and file handler
4424        let res = atomic_open(
4425            &fs,
4426            &temp_dir.path().join("a.txt"),
4427            0o666,
4428            libc::O_RDWR as u32,
4429            0,
4430            None,
4431        );
4432        assert!(res.is_ok());
4433        let (entry, handler, open_options) = res.unwrap();
4434        assert_ne!(entry.inode, 0);
4435
4436        if zero_message_open {
4437            assert!(handler.is_none());
4438            assert_eq!(open_options, OpenOptions::KEEP_CACHE);
4439        } else {
4440            assert!(handler.is_some());
4441            assert_ne!(
4442                open_options & OpenOptions::FILE_CREATED,
4443                OpenOptions::FILE_CREATED
4444            );
4445        }
4446
4447        // atomic_open with flag O_RDWR |  O_CREATE, should return positive dentry and file handler
4448        let res = atomic_open(
4449            &fs,
4450            &temp_dir.path().join("dir/b.txt"),
4451            0o666,
4452            (libc::O_RDWR | libc::O_CREAT) as u32,
4453            0,
4454            None,
4455        );
4456        assert!(res.is_ok());
4457        let (entry, handler, open_options) = res.unwrap();
4458        assert_ne!(entry.inode, 0);
4459
4460        if zero_message_open {
4461            assert!(handler.is_none());
4462            assert_eq!(open_options, OpenOptions::KEEP_CACHE);
4463        } else {
4464            assert!(handler.is_some());
4465            assert_ne!(
4466                open_options & OpenOptions::FILE_CREATED,
4467                OpenOptions::FILE_CREATED
4468            );
4469        }
4470
4471        // atomic_open with flag O_RDWR | O_CREATE | O_EXCL, should return positive dentry and file
4472        // handler
4473        let res = atomic_open(
4474            &fs,
4475            &temp_dir.path().join("dir/c.txt"),
4476            0o666,
4477            (libc::O_RDWR | libc::O_CREAT | libc::O_EXCL) as u32,
4478            0,
4479            None,
4480        );
4481        assert!(res.is_err());
4482        let err_kind = res.unwrap_err().kind();
4483        assert_eq!(err_kind, io::ErrorKind::AlreadyExists);
4484    }
4485
4486    #[test]
4487    fn test_atomic_open_non_existing_file() {
4488        atomic_open_non_existing_file(false);
4489    }
4490
4491    #[test]
4492    fn test_atomic_open_non_existing_file_zero_message() {
4493        atomic_open_non_existing_file(true);
4494    }
4495
4496    fn atomic_open_non_existing_file(zero_message_open: bool) {
4497        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4498        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4499        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4500        let _guard = lock.lock().expect("acquire named lock");
4501
4502        let temp_dir = TempDir::new().unwrap();
4503
4504        let cache_policy = match zero_message_open {
4505            true => CachePolicy::Always,
4506            false => CachePolicy::Auto,
4507        };
4508
4509        let cfg = Config {
4510            cache_policy,
4511            ..Default::default()
4512        };
4513        let fs = PassthroughFs::new("tag", cfg).unwrap();
4514
4515        let capable = FsOptions::ZERO_MESSAGE_OPEN;
4516        fs.init(capable).unwrap();
4517
4518        // atomic_open with flag O_RDWR, should return NO_EXIST error
4519        let res = atomic_open(
4520            &fs,
4521            &temp_dir.path().join("a.txt"),
4522            0o666,
4523            libc::O_RDWR as u32,
4524            0,
4525            None,
4526        );
4527        assert!(res.is_err());
4528        let err_kind = res.unwrap_err().kind();
4529        assert_eq!(err_kind, io::ErrorKind::NotFound);
4530
4531        // atomic_open with flag O_RDWR | O_CREATE, should return positive dentry and file handler
4532        let res = atomic_open(
4533            &fs,
4534            &temp_dir.path().join("b.txt"),
4535            0o666,
4536            (libc::O_RDWR | libc::O_CREAT) as u32,
4537            0,
4538            None,
4539        );
4540        assert!(res.is_ok());
4541        let (entry, handler, open_options) = res.unwrap();
4542        assert_ne!(entry.inode, 0);
4543
4544        if zero_message_open {
4545            assert!(handler.is_none());
4546            assert_eq!(
4547                open_options & OpenOptions::KEEP_CACHE,
4548                OpenOptions::KEEP_CACHE
4549            );
4550        } else {
4551            assert!(handler.is_some());
4552        }
4553        assert_eq!(
4554            open_options & OpenOptions::FILE_CREATED,
4555            OpenOptions::FILE_CREATED
4556        );
4557    }
4558
4559    #[test]
4560    fn atomic_open_symbol_link() {
4561        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4562        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4563        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4564        let _guard = lock.lock().expect("acquire named lock");
4565
4566        let temp_dir = TempDir::new().unwrap();
4567        create_test_data(&temp_dir, &["dir"], &["a.txt"]);
4568
4569        let cfg = Default::default();
4570        let fs = PassthroughFs::new("tag", cfg).unwrap();
4571
4572        let capable = FsOptions::empty();
4573        fs.init(capable).unwrap();
4574
4575        // atomic open the link destination file
4576        let res_dst = atomic_open(
4577            &fs,
4578            &temp_dir.path().join("a.txt"),
4579            0o666,
4580            libc::O_RDWR as u32,
4581            0,
4582            None,
4583        );
4584        assert!(res_dst.is_ok());
4585        let (entry_dst, handler_dst, _) = res_dst.unwrap();
4586        assert_ne!(entry_dst.inode, 0);
4587        assert!(handler_dst.is_some());
4588
4589        // create depth 1 symbol link
4590        let sym1_res = symlink(
4591            &fs,
4592            &temp_dir.path().join("a.txt"),
4593            &temp_dir.path().join("blink"),
4594            None,
4595        );
4596        assert!(sym1_res.is_ok());
4597        let sym1_entry = sym1_res.unwrap();
4598        assert_ne!(sym1_entry.inode, 0);
4599
4600        // atomic_open symbol link, should return dentry with no handler
4601        let res = atomic_open(
4602            &fs,
4603            &temp_dir.path().join("blink"),
4604            0o666,
4605            libc::O_RDWR as u32,
4606            0,
4607            None,
4608        );
4609        assert!(res.is_ok());
4610        let (entry, handler, open_options) = res.unwrap();
4611        assert_eq!(entry.inode, sym1_entry.inode);
4612        assert!(handler.is_none());
4613        assert_eq!(open_options, OpenOptions::empty());
4614
4615        // delete link destination
4616        unlink(&fs, &temp_dir.path().join("a.txt")).expect("Remove");
4617        assert_eq!(
4618            lookup(&fs, &temp_dir.path().join("a.txt"))
4619                .expect_err("file must not exist")
4620                .kind(),
4621            io::ErrorKind::NotFound,
4622            "a.txt must be removed"
4623        );
4624
4625        // after link destination removed, should still return valid dentry
4626        let res = atomic_open(
4627            &fs,
4628            &temp_dir.path().join("blink"),
4629            0o666,
4630            libc::O_RDWR as u32,
4631            0,
4632            None,
4633        );
4634        assert!(res.is_ok());
4635        let (entry, handler, open_options) = res.unwrap();
4636        assert_eq!(entry.inode, sym1_entry.inode);
4637        assert!(handler.is_none());
4638        assert_eq!(open_options, OpenOptions::empty());
4639    }
4640
4641    #[test]
4642    #[cfg(feature = "arc_quota")]
4643    fn set_permission_ioctl_valid_data() {
4644        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4645        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4646        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4647        let _guard = lock.lock().expect("acquire named lock");
4648
4649        let cfg = Config {
4650            max_dynamic_perm: 1,
4651            ..Default::default()
4652        };
4653        let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
4654
4655        let perm_path_string = String::from("/test");
4656        let fs_permission_data_buffer = FsPermissionDataBuffer {
4657            guest_uid: 1,
4658            guest_gid: 2,
4659            host_uid: 3,
4660            host_gid: 4,
4661            umask: 5,
4662            pad: 0,
4663            perm_path: {
4664                let mut perm_path: [u8; FS_IOCTL_PATH_MAX_LEN] = [0; FS_IOCTL_PATH_MAX_LEN];
4665                perm_path[..perm_path_string.len()].copy_from_slice(perm_path_string.as_bytes());
4666                perm_path
4667            },
4668        };
4669        let r = std::io::Cursor::new(fs_permission_data_buffer.as_bytes());
4670
4671        let res = fs_ioc_setpermission(
4672            &p,
4673            mem::size_of_val(&fs_permission_data_buffer) as u32,
4674            r.clone(),
4675        )
4676        .expect("valid input should get IoctlReply");
4677        assert!(matches!(res, IoctlReply::Done(Ok(data)) if data.is_empty()));
4678
4679        let read_guard = p
4680            .permission_paths
4681            .read()
4682            .expect("read permission_paths failed");
4683        let permission_data = read_guard
4684            .first()
4685            .expect("permission path should not be empty");
4686
4687        // Check expected data item is added to permission_paths.
4688        let expected_data = PermissionData {
4689            guest_uid: 1,
4690            guest_gid: 2,
4691            host_uid: 3,
4692            host_gid: 4,
4693            umask: 5,
4694            perm_path: perm_path_string,
4695        };
4696        assert_eq!(*permission_data, expected_data);
4697
4698        // Second ioctl should not succeed since max_dynamic_perm is set to 1
4699        let res = fs_ioc_setpermission(
4700            &p,
4701            mem::size_of_val(&fs_permission_data_buffer) as u32,
4702            r.clone(),
4703        )
4704        .expect("valid input should get IoctlReply");
4705        assert!(
4706            matches!(res, IoctlReply::Done(Err(err)) if err.raw_os_error().is_some_and(|errno| {
4707                errno == libc::EPERM
4708            }))
4709        );
4710    }
4711
4712    #[test]
4713    #[cfg(feature = "arc_quota")]
4714    fn set_permission_ioctl_invalid_data() {
4715        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4716        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4717        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4718        let _guard = lock.lock().expect("acquire named lock");
4719
4720        let cfg = Config {
4721            max_dynamic_perm: 1,
4722            ..Default::default()
4723        };
4724        let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
4725
4726        // The perm_path is not valid since it does not start with /.
4727        let perm_path_string = String::from("test");
4728        let fs_permission_data_buffer = FsPermissionDataBuffer {
4729            guest_uid: 1,
4730            guest_gid: 2,
4731            host_uid: 3,
4732            host_gid: 4,
4733            umask: 5,
4734            pad: 0,
4735            perm_path: {
4736                let mut perm_path: [u8; FS_IOCTL_PATH_MAX_LEN] = [0; FS_IOCTL_PATH_MAX_LEN];
4737                perm_path[..perm_path_string.len()].copy_from_slice(perm_path_string.as_bytes());
4738                perm_path
4739            },
4740        };
4741
4742        let r = std::io::Cursor::new(fs_permission_data_buffer.as_bytes());
4743        // In this ioctl inode,handle,flags,arg and out_size is irrelavant, set to empty value.
4744        // This call is supposed to get EINVAL ioctlReply, since the perm_path is invalid.
4745        let res = fs_ioc_setpermission(&p, mem::size_of_val(&fs_permission_data_buffer) as u32, r)
4746            .expect("invalid perm_path should get IoctlReply");
4747        assert!(
4748            matches!(res, IoctlReply::Done(Err(err)) if err.raw_os_error().is_some_and(|errno| {
4749                errno == libc::EINVAL
4750            }))
4751        );
4752
4753        let fake_data_buffer: [u8; 128] = [0; 128];
4754        let r = std::io::Cursor::new(fake_data_buffer.as_bytes());
4755
4756        // This call is supposed to get EINVAL ioctlReply, since the in_size is not the size of
4757        // struct FsPermissionDataBuffer.
4758        let res = fs_ioc_setpermission(&p, mem::size_of_val(&fake_data_buffer) as u32, r)
4759            .expect_err("invalid in_size should get Error");
4760        assert!(res
4761            .raw_os_error()
4762            .is_some_and(|errno| { errno == libc::EINVAL }));
4763    }
4764
4765    #[test]
4766    #[cfg(feature = "arc_quota")]
4767    fn permission_data_path_matching() {
4768        let ctx = get_context();
4769        let temp_dir = TempDir::new().unwrap();
4770        // Prepare `a.txt` before starting the test.
4771        create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/a.txt"]);
4772
4773        let cfg = Config {
4774            max_dynamic_perm: 1,
4775            ..Default::default()
4776        };
4777        let fs = PassthroughFs::new("tag", cfg).unwrap();
4778
4779        let capable = FsOptions::empty();
4780        fs.init(capable).unwrap();
4781
4782        const BY_PATH_UID: u32 = 655360;
4783        const BY_PATH_GID: u32 = 655361;
4784        const BY_PATH_UMASK: u32 = 0o007;
4785
4786        let dir_path = temp_dir.path().join("dir");
4787        let permission_data = PermissionData {
4788            guest_uid: BY_PATH_UID,
4789            guest_gid: BY_PATH_GID,
4790            host_uid: ctx.uid,
4791            host_gid: ctx.gid,
4792            umask: BY_PATH_UMASK,
4793            perm_path: dir_path.to_string_lossy().into_owned(),
4794        };
4795        fs.permission_paths
4796            .write()
4797            .expect("permission_path lock must be acquired")
4798            .push(permission_data);
4799
4800        // a_path is the path with out set permission by path
4801        let a_path = temp_dir.path().join("a.txt");
4802        let in_dir_a_path = dir_path.join("a.txt");
4803
4804        // a.txt should not be set with guest_uid/guest_uid/umask by path
4805        let a_entry = lookup_ent(&fs, &a_path).expect("a.txt must exist");
4806        assert_ne!(a_entry.attr.st_uid, BY_PATH_UID);
4807        assert_ne!(a_entry.attr.st_gid, BY_PATH_GID);
4808
4809        // a.txt in dir should be set guest_uid/guest_uid/umask by path
4810        let in_dir_a_entry = lookup_ent(&fs, &in_dir_a_path).expect("dir/a.txt must exist");
4811        assert_eq!(in_dir_a_entry.attr.st_uid, BY_PATH_UID);
4812        assert_eq!(in_dir_a_entry.attr.st_gid, BY_PATH_GID);
4813        assert_eq!(in_dir_a_entry.attr.st_mode & 0o777, !BY_PATH_UMASK & 0o777);
4814
4815        // Create dir/b.txt.
4816        let in_dir_b_path = dir_path.join("b.txt");
4817        create(&fs, &in_dir_b_path).expect("create b.txt");
4818
4819        // newly created b.txt in dir should be set guest_uid/guest_uid/umask by path
4820        let in_dir_b_entry = lookup_ent(&fs, &in_dir_a_path).expect("dir/b.txt must exist");
4821        assert_eq!(in_dir_b_entry.attr.st_uid, BY_PATH_UID);
4822        assert_eq!(in_dir_b_entry.attr.st_gid, BY_PATH_GID);
4823        assert_eq!(in_dir_b_entry.attr.st_mode & 0o777, !BY_PATH_UMASK & 0o777);
4824    }
4825
4826    #[test]
4827    #[cfg(feature = "fs_permission_translation")]
4828    fn test_copy_file_range_path_mapping() {
4829        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4830        let _guard = lock.lock().expect("acquire named lock");
4831
4832        let real_ctx = get_context();
4833        let temp_dir = TempDir::new().unwrap();
4834        let dir_path = temp_dir.path().join("dir");
4835        create_test_data(&temp_dir, &["dir"], &["src.txt", "dir/dst.txt"]);
4836
4837        let cfg = Config {
4838            ..Default::default()
4839        };
4840        let fs = PassthroughFs::new("tag", cfg).unwrap();
4841        fs.init(FsOptions::empty()).unwrap();
4842
4843        // Use a fake UID in the context that would normally fail set_creds()
4844        let mut fake_ctx = real_ctx;
4845        fake_ctx.uid = 9999;
4846        fake_ctx.gid = 9999;
4847
4848        // Create mapping: mapping the fake guest UID to the REAL host UID.
4849        // If the mapping works, copy_file_range will use real_ctx.uid and succeed.
4850        // If the mapping is ignored, it will use fake_ctx.uid (9999) and set_creds will fail with
4851        // EPERM.
4852        let permission_data = PermissionData {
4853            guest_uid: fake_ctx.uid,
4854            guest_gid: fake_ctx.gid,
4855            host_uid: real_ctx.uid,
4856            host_gid: real_ctx.gid,
4857            umask: 0,
4858            perm_path: dir_path.to_string_lossy().into_owned(),
4859        };
4860        fs.permission_paths.write().unwrap().push(permission_data);
4861
4862        let src_path = temp_dir.path().join("src.txt");
4863        let dst_path = dir_path.join("dst.txt");
4864
4865        std::fs::write(&src_path, b"hello world").unwrap();
4866
4867        let src_inode = lookup(&fs, &src_path).unwrap();
4868        let dst_inode = lookup(&fs, &dst_path).unwrap();
4869
4870        // Open files to get handles.
4871        // Note: we use real_ctx here to ensure file handles are opened successfully.
4872        // The copy_file_range call itself will use fake_ctx.
4873        let (src_handle, _) = fs
4874            .open(real_ctx, src_inode, libc::O_RDONLY as u32)
4875            .expect("open src");
4876        let (dst_handle, _) = fs
4877            .open(real_ctx, dst_inode, libc::O_WRONLY as u32)
4878            .expect("open dst");
4879
4880        let src_handle = src_handle.unwrap();
4881        let dst_handle = dst_handle.unwrap();
4882
4883        // Execute copy_file_range with fake_ctx.
4884        // This will only succeed if change_creds_for_path correctly translates 9999 -> real_uid.
4885        let result = fs.copy_file_range(
4886            fake_ctx, src_inode, src_handle, 0, dst_inode, dst_handle, 0, 5, 0,
4887        );
4888
4889        assert!(
4890            result.is_ok(),
4891            "copy_file_range failed: {:?}. Mapping might not be applied.",
4892            result.err()
4893        );
4894        assert_eq!(result.unwrap(), 5);
4895
4896        let content = std::fs::read(&dst_path).unwrap();
4897        assert_eq!(&content[0..5], b"hello");
4898    }
4899
4900    #[test]
4901    #[cfg(feature = "arc_quota")]
4902    fn set_path_xattr_ioctl_valid_data() {
4903        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4904        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4905        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4906        let _guard = lock.lock().expect("acquire named lock");
4907
4908        let cfg: Config = Config {
4909            max_dynamic_xattr: 1,
4910            ..Default::default()
4911        };
4912        let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
4913
4914        let path_string = String::from("/test");
4915        let xattr_name_string = String::from("test_name");
4916        let xattr_value_string = String::from("test_value");
4917        let fs_path_xattr_data_buffer = FsPathXattrDataBuffer {
4918            path: {
4919                let mut path: [u8; FS_IOCTL_PATH_MAX_LEN] = [0; FS_IOCTL_PATH_MAX_LEN];
4920                path[..path_string.len()].copy_from_slice(path_string.as_bytes());
4921                path
4922            },
4923            xattr_name: {
4924                let mut xattr_name: [u8; FS_IOCTL_XATTR_NAME_MAX_LEN] =
4925                    [0; FS_IOCTL_XATTR_NAME_MAX_LEN];
4926                xattr_name[..xattr_name_string.len()].copy_from_slice(xattr_name_string.as_bytes());
4927                xattr_name
4928            },
4929            xattr_value: {
4930                let mut xattr_value: [u8; FS_IOCTL_XATTR_VALUE_MAX_LEN] =
4931                    [0; FS_IOCTL_XATTR_VALUE_MAX_LEN];
4932                xattr_value[..xattr_value_string.len()]
4933                    .copy_from_slice(xattr_value_string.as_bytes());
4934                xattr_value
4935            },
4936        };
4937        let r = std::io::Cursor::new(fs_path_xattr_data_buffer.as_bytes());
4938
4939        let res = fs_ioc_setpathxattr(
4940            &p,
4941            mem::size_of_val(&fs_path_xattr_data_buffer) as u32,
4942            r.clone(),
4943        )
4944        .expect("valid input should get IoctlReply");
4945        assert!(matches!(res, IoctlReply::Done(Ok(data)) if data.is_empty()));
4946
4947        let read_guard = p.xattr_paths.read().expect("read xattr_paths failed");
4948        let xattr_data = read_guard.first().expect("xattr_paths should not be empty");
4949
4950        // Check expected data item is added to permission_paths.
4951        let expected_data = XattrData {
4952            xattr_path: path_string,
4953            xattr_name: xattr_name_string,
4954            xattr_value: xattr_value_string,
4955        };
4956        assert_eq!(*xattr_data, expected_data);
4957
4958        // Second ioctl should not succeed since max_dynamic_perm is set to 1
4959        let res = fs_ioc_setpathxattr(
4960            &p,
4961            mem::size_of_val(&fs_path_xattr_data_buffer) as u32,
4962            r.clone(),
4963        )
4964        .expect("valid input should get IoctlReply");
4965        assert!(
4966            matches!(res, IoctlReply::Done(Err(err)) if err.raw_os_error().is_some_and(|errno| {
4967                errno == libc::EPERM
4968            }))
4969        );
4970    }
4971    #[test]
4972    #[cfg(feature = "arc_quota")]
4973    fn set_path_xattr_ioctl_invalid_data() {
4974        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4975        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4976        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4977        let _guard = lock.lock().expect("acquire named lock");
4978
4979        let cfg: Config = Config {
4980            max_dynamic_xattr: 1,
4981            ..Default::default()
4982        };
4983        let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
4984
4985        let path_string = String::from("test");
4986        let xattr_name_string = String::from("test_name");
4987        let xattr_value_string = String::from("test_value");
4988        let fs_path_xattr_data_buffer = FsPathXattrDataBuffer {
4989            path: {
4990                let mut path: [u8; FS_IOCTL_PATH_MAX_LEN] = [0; FS_IOCTL_PATH_MAX_LEN];
4991                path[..path_string.len()].copy_from_slice(path_string.as_bytes());
4992                path
4993            },
4994            xattr_name: {
4995                let mut xattr_name: [u8; FS_IOCTL_XATTR_NAME_MAX_LEN] =
4996                    [0; FS_IOCTL_XATTR_NAME_MAX_LEN];
4997                xattr_name[..xattr_name_string.len()].copy_from_slice(xattr_name_string.as_bytes());
4998                xattr_name
4999            },
5000            xattr_value: {
5001                let mut xattr_value: [u8; FS_IOCTL_XATTR_VALUE_MAX_LEN] =
5002                    [0; FS_IOCTL_XATTR_VALUE_MAX_LEN];
5003                xattr_value[..xattr_value_string.len()]
5004                    .copy_from_slice(xattr_value_string.as_bytes());
5005                xattr_value
5006            },
5007        };
5008        let r = std::io::Cursor::new(fs_path_xattr_data_buffer.as_bytes());
5009
5010        // This call is supposed to get EINVAL ioctlReply, since the perm_path is invalid.
5011        let res = fs_ioc_setpathxattr(
5012            &p,
5013            mem::size_of_val(&fs_path_xattr_data_buffer) as u32,
5014            r.clone(),
5015        )
5016        .expect("valid input should get IoctlReply");
5017        assert!(
5018            matches!(res, IoctlReply::Done(Err(err)) if err.raw_os_error().is_some_and(|errno| {
5019                errno == libc::EINVAL
5020            }))
5021        );
5022
5023        let fake_data_buffer: [u8; 128] = [0; 128];
5024        let r = std::io::Cursor::new(fake_data_buffer.as_bytes());
5025        // This call is supposed to get EINVAL ioctlReply, since the in_size is not the size of
5026        // struct FsPathXattrDataBuffer.
5027        let res = fs_ioc_setpathxattr(&p, mem::size_of_val(&fake_data_buffer) as u32, r.clone())
5028            .expect_err("valid input should get IoctlReply");
5029        assert!(res
5030            .raw_os_error()
5031            .is_some_and(|errno| { errno == libc::EINVAL }));
5032    }
5033
5034    #[test]
5035    #[cfg(feature = "arc_quota")]
5036    fn xattr_data_path_matching() {
5037        let ctx = get_context();
5038        let temp_dir = TempDir::new().unwrap();
5039        // Prepare `a.txt` before starting the test.
5040        create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/a.txt"]);
5041
5042        let cfg = Config {
5043            max_dynamic_xattr: 1,
5044            ..Default::default()
5045        };
5046        let fs = PassthroughFs::new("tag", cfg).unwrap();
5047
5048        let capable = FsOptions::empty();
5049        fs.init(capable).unwrap();
5050
5051        let dir_path = temp_dir.path().join("dir");
5052        let xattr_name_string = String::from("test_name");
5053        let xattr_name_cstring = CString::new(xattr_name_string.clone()).expect("create c string");
5054        let xattr_value_string = String::from("test_value");
5055        let xattr_value_bytes = xattr_value_string.clone().into_bytes();
5056
5057        let xattr_data = XattrData {
5058            xattr_name: xattr_name_string,
5059            xattr_value: xattr_value_string,
5060            xattr_path: dir_path.to_string_lossy().into_owned(),
5061        };
5062        fs.xattr_paths
5063            .write()
5064            .expect("xattr_paths lock must be acquired")
5065            .push(xattr_data);
5066
5067        // a_path is the path with out set xattr by path
5068        let a_path: std::path::PathBuf = temp_dir.path().join("a.txt");
5069        let in_dir_a_path = dir_path.join("a.txt");
5070
5071        let a_node = lookup(&fs, a_path.as_path()).expect("lookup a node");
5072        // a.txt should not be set with xattr by path
5073        assert!(fs
5074            .getxattr(
5075                ctx,
5076                a_node,
5077                &xattr_name_cstring,
5078                xattr_value_bytes.len() as u32
5079            )
5080            .is_err());
5081
5082        let in_dir_a_node = lookup(&fs, in_dir_a_path.as_path()).expect("lookup in dir a node");
5083        // a.txt in dir should be set xattr by path
5084        let in_dir_a_reply = fs
5085            .getxattr(
5086                ctx,
5087                in_dir_a_node,
5088                &xattr_name_cstring,
5089                xattr_value_bytes.len() as u32,
5090            )
5091            .expect("Getxattr should success");
5092        assert!(matches!(in_dir_a_reply, GetxattrReply::Value(v) if v == xattr_value_bytes));
5093        // Create dir/b.txt.
5094        let in_dir_b_path = dir_path.join("b.txt");
5095        create(&fs, &in_dir_b_path).expect("create b.txt");
5096
5097        // newly created b.txt in dir should be set xattr by path
5098        let in_dir_b_node = lookup(&fs, in_dir_a_path.as_path()).expect("lookup in dir b node");
5099        let in_dir_b_reply = fs
5100            .getxattr(
5101                ctx,
5102                in_dir_b_node,
5103                &xattr_name_cstring,
5104                xattr_value_bytes.len() as u32,
5105            )
5106            .expect("Getxattr should success");
5107        assert!(matches!(in_dir_b_reply, GetxattrReply::Value(v) if v == xattr_value_bytes));
5108    }
5109
5110    /// Creates and open a new file by atomic_open with O_APPEND flag.
5111    /// We check O_APPEND is properly handled, depending on writeback cache is enabled or not.
5112    fn atomic_open_create_o_append(writeback: bool) {
5113        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
5114        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
5115        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
5116        let _guard = lock.lock().expect("acquire named lock");
5117
5118        let temp_dir = TempDir::new().unwrap();
5119
5120        let cfg = Config {
5121            cache_policy: CachePolicy::Always,
5122            writeback,
5123            ..Default::default()
5124        };
5125        let fs = PassthroughFs::new("tag", cfg).unwrap();
5126
5127        let capable = FsOptions::ZERO_MESSAGE_OPEN | FsOptions::WRITEBACK_CACHE;
5128        fs.init(capable).unwrap();
5129
5130        let (entry, _, _) = atomic_open(
5131            &fs,
5132            &temp_dir.path().join("a.txt"),
5133            0o666,
5134            (libc::O_RDWR | libc::O_CREAT | libc::O_APPEND) as u32,
5135            0,
5136            None,
5137        )
5138        .expect("atomic_open");
5139        assert_ne!(entry.inode, 0);
5140
5141        let inodes = fs.inodes.lock();
5142        let data = inodes.get(&entry.inode).unwrap();
5143        let flags = data.file.lock().open_flags;
5144        if writeback {
5145            // When writeback is enabled, O_APPEND must be handled by the guest kernel.
5146            // So, it must be cleared.
5147            assert_eq!(flags & libc::O_APPEND, 0);
5148        } else {
5149            // Without writeback cache, O_APPEND must not be cleared.
5150            assert_eq!(flags & libc::O_APPEND, libc::O_APPEND);
5151        }
5152    }
5153
5154    #[test]
5155    fn test_atomic_open_create_o_append_no_writeback() {
5156        atomic_open_create_o_append(false);
5157    }
5158
5159    #[test]
5160    fn test_atomic_open_create_o_append_writeback() {
5161        atomic_open_create_o_append(true);
5162    }
5163
5164    #[test]
5165    fn test_lookup_dotdot_escape() {
5166        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
5167        let _guard = lock.lock().expect("acquire named lock");
5168        let temp_dir = TempDir::new().unwrap();
5169        let root_path = temp_dir.path().join("root");
5170        std::fs::create_dir(&root_path).unwrap();
5171
5172        // Create a secret file in the parent of root
5173        let secret_file = temp_dir.path().join("secret.txt");
5174        std::fs::write(&secret_file, "top secret").unwrap();
5175
5176        let cfg = Config {
5177            ..Default::default()
5178        };
5179        let mut fs = PassthroughFs::new("tag", cfg).unwrap();
5180        fs.set_root_dir(root_path.to_str().unwrap().to_string())
5181            .unwrap();
5182        fs.init(FsOptions::empty()).unwrap();
5183        let ctx = get_context();
5184
5185        // 1. Lookup ".." from root (inode 1)
5186        let dotdot = c"..";
5187        let res = fs.lookup(ctx, 1, dotdot);
5188        assert!(res.is_err(), "Lookup .. should be blocked!");
5189    }
5190}