devices/virtio/fs/
passthrough.rs

1// Copyright 2019 The ChromiumOS Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use std::borrow::Cow;
6use std::cell::RefCell;
7use std::cmp;
8use std::collections::btree_map;
9use std::collections::BTreeMap;
10use std::ffi::CStr;
11use std::ffi::CString;
12#[cfg(feature = "fs_runtime_ugid_map")]
13use std::ffi::OsStr;
14use std::fs::File;
15use std::io;
16use std::mem;
17use std::mem::size_of;
18use std::mem::MaybeUninit;
19use std::os::raw::c_int;
20use std::os::raw::c_long;
21#[cfg(feature = "fs_runtime_ugid_map")]
22use std::os::unix::ffi::OsStrExt;
23#[cfg(feature = "fs_runtime_ugid_map")]
24use std::path::Path;
25use std::ptr;
26use std::ptr::addr_of;
27use std::ptr::addr_of_mut;
28use std::sync::atomic::AtomicBool;
29use std::sync::atomic::AtomicU64;
30use std::sync::atomic::Ordering;
31use std::sync::Arc;
32use std::sync::MutexGuard;
33#[cfg(feature = "fs_permission_translation")]
34use std::sync::RwLock;
35use std::time::Duration;
36
37#[cfg(feature = "arc_quota")]
38use base::debug;
39use base::error;
40use base::ioctl_ior_nr;
41use base::ioctl_iow_nr;
42use base::ioctl_iowr_nr;
43use base::ioctl_with_mut_ptr;
44use base::ioctl_with_ptr;
45use base::syscall;
46use base::unix::FileFlags;
47use base::warn;
48use base::AsRawDescriptor;
49use base::FromRawDescriptor;
50use base::IntoRawDescriptor;
51use base::IoctlNr;
52use base::Protection;
53use base::RawDescriptor;
54use fuse::filesystem::Context;
55use fuse::filesystem::DirectoryIterator;
56use fuse::filesystem::Entry;
57use fuse::filesystem::FileSystem;
58use fuse::filesystem::FsOptions;
59use fuse::filesystem::GetxattrReply;
60use fuse::filesystem::IoctlFlags;
61use fuse::filesystem::IoctlReply;
62use fuse::filesystem::ListxattrReply;
63use fuse::filesystem::OpenOptions;
64use fuse::filesystem::RemoveMappingOne;
65use fuse::filesystem::SetattrValid;
66use fuse::filesystem::ZeroCopyReader;
67use fuse::filesystem::ZeroCopyWriter;
68use fuse::filesystem::ROOT_ID;
69use fuse::sys::WRITE_KILL_PRIV;
70use fuse::Mapper;
71#[cfg(feature = "arc_quota")]
72use protobuf::Message;
73use sync::Mutex;
74#[cfg(feature = "arc_quota")]
75use system_api::client::OrgChromiumSpaced;
76#[cfg(feature = "arc_quota")]
77use system_api::spaced::SetProjectIdReply;
78#[cfg(feature = "arc_quota")]
79use system_api::spaced::SetProjectInheritanceFlagReply;
80use zerocopy::FromBytes;
81use zerocopy::FromZeros;
82use zerocopy::Immutable;
83use zerocopy::IntoBytes;
84use zerocopy::KnownLayout;
85
86#[cfg(feature = "arc_quota")]
87use crate::virtio::fs::arc_ioctl::FsPathXattrDataBuffer;
88#[cfg(feature = "arc_quota")]
89use crate::virtio::fs::arc_ioctl::FsPermissionDataBuffer;
90#[cfg(feature = "arc_quota")]
91use crate::virtio::fs::arc_ioctl::XattrData;
92use crate::virtio::fs::caps::Capability;
93use crate::virtio::fs::caps::Caps;
94use crate::virtio::fs::caps::Set as CapSet;
95use crate::virtio::fs::caps::Value as CapValue;
96use crate::virtio::fs::config::CachePolicy;
97use crate::virtio::fs::config::Config;
98#[cfg(feature = "fs_permission_translation")]
99use crate::virtio::fs::config::PermissionData;
100use crate::virtio::fs::expiring_map::ExpiringMap;
101use crate::virtio::fs::multikey::MultikeyBTreeMap;
102use crate::virtio::fs::read_dir::ReadDir;
103
// Empty path; `stat()` passes it together with `AT_EMPTY_PATH` so that `fstatat64` operates on
// the descriptor itself.
const EMPTY_CSTR: &CStr = c"";
// Path opened by `PassthroughFs::new` to obtain the `proc` descriptor.
const PROC_CSTR: &CStr = c"/proc";
// NOTE(review): the users of this label are outside this part of the file -- presumably a
// fallback security context; confirm.
const UNLABELED_CSTR: &CStr = c"unlabeled";

// Extended attribute names / name prefixes as raw bytes.
// NOTE(review): the code that matches against these is outside this part of the file.
const USER_VIRTIOFS_XATTR: &[u8] = b"user.virtiofs.";
const SECURITY_XATTR: &[u8] = b"security.";
const SELINUX_XATTR: &[u8] = b"security.selinux";

// Sizes in bytes of the key descriptor (`fscrypt_policy_v1`) and key identifier
// (`fscrypt_policy_v2`) arrays.
const FSCRYPT_KEY_DESCRIPTOR_SIZE: usize = 8;
const FSCRYPT_KEY_IDENTIFIER_SIZE: usize = 16;

// NOTE(review): presumably the kernel's project-ID inheritance inode flag (`FS_PROJINHERIT_FL`
// in linux/fs.h) -- confirm against the header.
#[cfg(feature = "arc_quota")]
const FS_PROJINHERIT_FL: c_int = 0x20000000;

// 25 seconds is the default timeout for dbus-send.
#[cfg(feature = "arc_quota")]
const DEFAULT_DBUS_TIMEOUT: Duration = Duration::from_secs(25);
121
/// Internal utility wrapper for `cros_tracing::trace_event!()` macro with VirtioFS calls.
///
/// Takes a tag, an event name and at least one further argument, and forwards them to
/// `trace_event!` with `VirtioFs` as the first argument and the name placed before the tag.
macro_rules! fs_trace {
    ($tag:expr, $name:expr, $($arg:expr),+) => {
        cros_tracing::trace_event!(VirtioFs, $name, $tag, $($arg),*)
    };
}
128
/// C-layout mirror of the version 1 fscrypt policy (see `struct fscrypt_policy_v1` in the
/// kernel's `linux/fscrypt.h`).
///
/// The fields are never read individually in this part of the file (hence the leading
/// underscores); the struct only contributes its size and layout to `fscrypt_policy`.
#[repr(C)]
#[derive(Clone, Copy, FromBytes, Immutable, IntoBytes, KnownLayout)]
struct fscrypt_policy_v1 {
    _version: u8,
    _contents_encryption_mode: u8,
    _filenames_encryption_mode: u8,
    _flags: u8,
    _master_key_descriptor: [u8; FSCRYPT_KEY_DESCRIPTOR_SIZE],
}
138
/// C-layout mirror of the version 2 fscrypt policy (see `struct fscrypt_policy_v2` in the
/// kernel's `linux/fscrypt.h`).
///
/// This is the larger of the two policy versions (24 bytes), so it determines the size of the
/// `fscrypt_policy` union.
#[repr(C)]
#[derive(Clone, Copy, FromBytes, Immutable, IntoBytes, KnownLayout)]
struct fscrypt_policy_v2 {
    _version: u8,
    _contents_encryption_mode: u8,
    _filenames_encryption_mode: u8,
    _flags: u8,
    __reserved: [u8; 4],
    master_key_identifier: [u8; FSCRYPT_KEY_IDENTIFIER_SIZE],
}
149
/// Storage large enough for either fscrypt policy version.
///
/// All variants start with a one-byte version field, which is also exposed directly as
/// `_version`.
#[repr(C)]
#[derive(Copy, Clone, FromBytes, Immutable, KnownLayout)]
union fscrypt_policy {
    _version: u8,
    _v1: fscrypt_policy_v1,
    _v2: fscrypt_policy_v2,
}
157
/// Argument block for the `FS_IOC_GET_ENCRYPTION_POLICY_EX` ioctl: a size field followed by the
/// policy storage.
///
/// The conversion to `&[u8]` defined for this type relies on `policy_size` never exceeding the
/// size of `fscrypt_policy`.
#[repr(C)]
#[derive(Copy, Clone, FromBytes, Immutable, KnownLayout)]
struct fscrypt_get_policy_ex_arg {
    policy_size: u64,       /* input/output */
    policy: fscrypt_policy, /* output */
}
164
165impl From<&fscrypt_get_policy_ex_arg> for &[u8] {
166    fn from(value: &fscrypt_get_policy_ex_arg) -> Self {
167        assert!(value.policy_size <= size_of::<fscrypt_policy>() as u64);
168        let data_raw: *const fscrypt_get_policy_ex_arg = value;
169        // SAFETY: the length of the output slice is asserted to be within the struct it points to
170        unsafe {
171            std::slice::from_raw_parts(
172                data_raw.cast(),
173                value.policy_size as usize + size_of::<u64>(),
174            )
175        }
176    }
177}
178
// Read/write ioctl, type 'f', number 22. The payload type used to compute the request number is
// `[u8; 9]` rather than `fscrypt_get_policy_ex_arg`.
// NOTE(review): this matches the kernel's `_IOWR('f', 22, __u8[9])` in linux/fscrypt.h -- keep
// it in sync with the header.
ioctl_iowr_nr!(FS_IOC_GET_ENCRYPTION_POLICY_EX, 'f' as u32, 22, [u8; 9]);
180
/// C-layout argument for the `FS_IOC_FSGETXATTR` / `FS_IOC_FSSETXATTR` ioctls (see
/// `struct fsxattr` in the kernel's `linux/fs.h`).
#[repr(C)]
#[derive(Clone, Copy, FromBytes, Immutable, IntoBytes, KnownLayout)]
struct fsxattr {
    fsx_xflags: u32,     /* xflags field value (get/set) */
    fsx_extsize: u32,    /* extsize field value (get/set) */
    fsx_nextents: u32,   /* nextents field value (get) */
    fsx_projid: u32,     /* project identifier (get/set) */
    fsx_cowextsize: u32, /* CoW extsize field value (get/set) */
    fsx_pad: [u8; 8],
}
191
// Get/set the `fsxattr` block of a file (type 'X', numbers 31 and 32).
ioctl_ior_nr!(FS_IOC_FSGETXATTR, 'X' as u32, 31, fsxattr);
ioctl_iow_nr!(FS_IOC_FSSETXATTR, 'X' as u32, 32, fsxattr);

// Get/set inode flags (type 'f', numbers 1 and 2) with a `c_long` payload.
ioctl_ior_nr!(FS_IOC_GETFLAGS, 'f' as u32, 1, c_long);
ioctl_iow_nr!(FS_IOC_SETFLAGS, 'f' as u32, 2, c_long);

// The same requests declared with an explicit 32-bit payload.
ioctl_ior_nr!(FS_IOC32_GETFLAGS, 'f' as u32, 1, u32);
ioctl_iow_nr!(FS_IOC32_SETFLAGS, 'f' as u32, 2, u32);

// The same requests declared with an explicit 64-bit payload.
ioctl_ior_nr!(FS_IOC64_GETFLAGS, 'f' as u32, 1, u64);
ioctl_iow_nr!(FS_IOC64_SETFLAGS, 'f' as u32, 2, u64);

// Both requests below reuse type 'f' and number 1; they differ from each other and from the
// flag ioctls above only in the payload type and direction.
// NOTE(review): presumably crosvm-specific requests rather than kernel ioctls -- confirm.
#[cfg(feature = "arc_quota")]
ioctl_iow_nr!(FS_IOC_SETPERMISSION, 'f' as u32, 1, FsPermissionDataBuffer);
#[cfg(feature = "arc_quota")]
ioctl_iow_nr!(FS_IOC_SETPATHXATTR, 'f' as u32, 1, FsPathXattrDataBuffer);
208
/// C-layout argument for the `FS_IOC_ENABLE_VERITY` ioctl (see `struct fsverity_enable_arg` in
/// the kernel's `linux/fsverity.h`).
///
/// `salt_ptr` and `sig_ptr` carry addresses as plain `u64` values, with `salt_size` and
/// `sig_size` giving the length of the data they refer to.
#[repr(C)]
#[derive(Clone, Copy, FromBytes, Immutable, IntoBytes, KnownLayout)]
struct fsverity_enable_arg {
    _version: u32,
    _hash_algorithm: u32,
    _block_size: u32,
    salt_size: u32,
    salt_ptr: u64,
    sig_size: u32,
    __reserved1: u32,
    sig_ptr: u64,
    __reserved2: [u64; 11],
}
222
/// Fixed-size header used with the `FS_IOC_MEASURE_VERITY` ioctl.
///
/// Only the header is declared here; in the C definition it is followed by a variable-length
/// digest array (see the commented-out field), whose length is given by `digest_size`.
#[repr(C)]
#[derive(Clone, Copy, FromBytes, Immutable, IntoBytes, KnownLayout)]
struct fsverity_digest {
    _digest_algorithm: u16,
    digest_size: u16,
    // __u8 digest[];
}
230
// fs-verity requests (type 'f', numbers 133 and 134): enabling is write-only, measuring is
// read/write because the digest header is both an input and an output.
ioctl_iow_nr!(FS_IOC_ENABLE_VERITY, 'f' as u32, 133, fsverity_enable_arg);
ioctl_iowr_nr!(FS_IOC_MEASURE_VERITY, 'f' as u32, 134, fsverity_digest);
233
/// Key type of the `PassthroughFs::inodes` table.
pub type Inode = u64;
/// Key type of the `PassthroughFs::handles` table.
type Handle = u64;
236
/// Alternate key for the `inodes` table: the pair of inode number and device.
#[derive(Clone, Copy, Debug, PartialOrd, Ord, PartialEq, Eq)]
struct InodeAltKey {
    ino: libc::ino64_t,
    dev: libc::dev_t,
}
242
243#[derive(PartialEq, Eq, Debug)]
244enum FileType {
245    Regular,
246    Directory,
247    Other,
248}
249
250impl From<libc::mode_t> for FileType {
251    fn from(mode: libc::mode_t) -> Self {
252        match mode & libc::S_IFMT {
253            libc::S_IFREG => FileType::Regular,
254            libc::S_IFDIR => FileType::Directory,
255            _ => FileType::Other,
256        }
257    }
258}
259
260#[derive(Debug)]
261struct OpenedFile {
262    file: Option<File>,
263    open_flags: libc::c_int,
264}
265
266impl AsRawDescriptor for OpenedFile {
267    fn as_raw_descriptor(&self) -> RawDescriptor {
268        self.file().as_raw_descriptor()
269    }
270}
271
272impl OpenedFile {
273    fn new(file: File, open_flags: libc::c_int) -> Self {
274        OpenedFile {
275            file: Some(file),
276            open_flags,
277        }
278    }
279
280    fn file(&self) -> &File {
281        self.file.as_ref().expect("must have a file")
282    }
283
284    fn file_mut(&mut self) -> &mut File {
285        self.file.as_mut().expect("must have a file")
286    }
287
288    /// Leaks the file descriptor and makes the struct unusable.
289    ///
290    /// This is an optimization to speed up dropping `OpenedFile` instances, which is useful
291    /// during an abrupt shutdown. Instead of properly closing the file descriptor, which
292    /// involves a syscall, this function effectively forgets the file descriptor, relying on the
293    /// OS to clean it up when the process terminates.
294    fn leak_fd(&mut self) {
295        let f = self.file.take().expect("must have a file");
296        let _ = f.into_raw_descriptor();
297    }
298}
299
300#[derive(Debug)]
301struct InodeData {
302    inode: Inode,
303    // (File, open_flags)
304    file: Mutex<OpenedFile>,
305    refcount: AtomicU64,
306    filetype: FileType,
307    path: String,
308    // This needs to be atomic because we need to set it through a shared reference.
309    unsafe_leak_fd: AtomicBool,
310}
311
312impl AsRawDescriptor for InodeData {
313    fn as_raw_descriptor(&self) -> RawDescriptor {
314        self.file.lock().as_raw_descriptor()
315    }
316}
317
318impl Drop for InodeData {
319    /// If `unsafe_leak_fd` is set, this `drop` implementation will "leak" the file descriptor.
320    /// This is an optimization to speed up the cleanup process, based on the
321    /// assumption that the OS will handle the cleanup of file descriptors after the process
322    /// terminates. This is only okay if the process is guaranteed to terminate immediately
323    /// after the `PassthroughFs` instance is dropped.
324    fn drop(&mut self) {
325        if self.unsafe_leak_fd.load(Ordering::Relaxed) {
326            self.file.get_mut().leak_fd();
327        }
328    }
329}
330
331impl InodeData {
332    fn set_unsafe_leak_fd(&self) {
333        self.unsafe_leak_fd.store(true, Ordering::Relaxed);
334    }
335}
336
337#[derive(Debug)]
338struct HandleData {
339    inode: Inode,
340    file: Mutex<OpenedFile>,
341
342    unsafe_leak_fd: AtomicBool,
343}
344
345impl AsRawDescriptor for HandleData {
346    fn as_raw_descriptor(&self) -> RawDescriptor {
347        self.file.lock().as_raw_descriptor()
348    }
349}
350
351impl Drop for HandleData {
352    /// If `unsafe_leak_fd` is set, this `drop` implementation will "leak" the file descriptor by
353    /// forgetting it. This is an optimization to speed up the cleanup process, based on the
354    /// assumption that the OS will handle the cleanup of file descriptors after the process
355    // terminates. This is only safe if the process is guaranteed to terminate immediately
356    /// after the `PassthroughFs` instance is dropped.
357    fn drop(&mut self) {
358        if self.unsafe_leak_fd.load(Ordering::Relaxed) {
359            self.file.get_mut().leak_fd();
360        }
361    }
362}
363
364impl HandleData {
365    fn set_unsafe_leak_fd(&self) {
366        self.unsafe_leak_fd.store(true, Ordering::Relaxed);
367    }
368}
369
370macro_rules! scoped_cred {
371    ($name:ident, $ty:ty, $syscall_nr:expr) => {
372        #[derive(Debug)]
373        struct $name {
374            old: $ty,
375        }
376
377        impl $name {
378            // Changes the effective uid/gid of the current thread to `val`. Changes the thread's
379            // credentials back to `old` when the returned struct is dropped.
380            fn new(val: $ty, old: $ty) -> io::Result<Option<$name>> {
381                if val == old {
382                    // Nothing to do since we already have the correct value.
383                    return Ok(None);
384                }
385
386                // We want credential changes to be per-thread because otherwise
387                // we might interfere with operations being carried out on other
388                // threads with different uids/gids.  However, posix requires that
389                // all threads in a process share the same credentials.  To do this
390                // libc uses signals to ensure that when one thread changes its
391                // credentials the other threads do the same thing.
392                //
393                // So instead we invoke the syscall directly in order to get around
394                // this limitation.  Another option is to use the setfsuid and
395                // setfsgid systems calls.   However since those calls have no way to
396                // return an error, it's preferable to do this instead.
397
398                // SAFETY: this call is safe because it doesn't modify any memory and we
399                // check the return value.
400                let res = unsafe { libc::syscall($syscall_nr, -1, val, -1) };
401                if res == 0 {
402                    Ok(Some($name { old }))
403                } else {
404                    Err(io::Error::last_os_error())
405                }
406            }
407        }
408
409        impl Drop for $name {
410            fn drop(&mut self) {
411                // SAFETY: trivially safe
412                let res = unsafe { libc::syscall($syscall_nr, -1, self.old, -1) };
413                if res < 0 {
414                    error!(
415                        "failed to change credentials back to {}: {}",
416                        self.old,
417                        io::Error::last_os_error(),
418                    );
419                }
420            }
421        }
422    };
423}
424scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid);
425scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid);
426
// Syscall numbers used to query the effective uid / gid of the calling thread.
const SYS_GETEUID: libc::c_long = libc::SYS_geteuid;
const SYS_GETEGID: libc::c_long = libc::SYS_getegid;

// Effective uid / gid of each thread, queried on first access from that thread. `set_creds`
// uses these as the "old" values to switch back to.
thread_local! {
    // SAFETY: both calls take no parameters and only return an integer value. The kernel also
    // guarantees that they can never fail.
    static THREAD_EUID: libc::uid_t = unsafe { libc::syscall(SYS_GETEUID) as libc::uid_t };
    // SAFETY: both calls take no parameters and only return an integer value. The kernel also
    // guarantees that they can never fail.
    static THREAD_EGID: libc::gid_t = unsafe { libc::syscall(SYS_GETEGID) as libc::gid_t };
}
438
439fn set_creds(
440    uid: libc::uid_t,
441    gid: libc::gid_t,
442) -> io::Result<(Option<ScopedUid>, Option<ScopedGid>)> {
443    let olduid = THREAD_EUID.with(|uid| *uid);
444    let oldgid = THREAD_EGID.with(|gid| *gid);
445
446    // We have to change the gid before we change the uid because if we change the uid first then we
447    // lose the capability to change the gid.  However changing back can happen in any order.
448    ScopedGid::new(gid, oldgid).and_then(|gid| Ok((ScopedUid::new(uid, olduid)?, gid)))
449}
450
451thread_local!(static THREAD_FSCREATE: RefCell<Option<File>> = const { RefCell::new(None) });
452
453// Opens and returns a write-only handle to /proc/thread-self/attr/fscreate. Panics if it fails to
454// open the file.
455fn open_fscreate(proc: &File) -> File {
456    let fscreate = c"thread-self/attr/fscreate";
457
458    // SAFETY: this doesn't modify any memory and we check the return value.
459    let raw_descriptor = unsafe {
460        libc::openat(
461            proc.as_raw_descriptor(),
462            fscreate.as_ptr(),
463            libc::O_CLOEXEC | libc::O_WRONLY,
464        )
465    };
466
467    // We don't expect this to fail and we're not in a position to return an error here so just
468    // panic.
469    if raw_descriptor < 0 {
470        panic!(
471            "Failed to open /proc/thread-self/attr/fscreate: {}",
472            io::Error::last_os_error()
473        );
474    }
475
476    // SAFETY: safe because we just opened this descriptor.
477    unsafe { File::from_raw_descriptor(raw_descriptor) }
478}
479
480struct ScopedSecurityContext;
481
482impl ScopedSecurityContext {
483    fn new(proc: &File, ctx: &CStr) -> io::Result<ScopedSecurityContext> {
484        THREAD_FSCREATE.with(|thread_fscreate| {
485            let mut fscreate = thread_fscreate.borrow_mut();
486            let file = fscreate.get_or_insert_with(|| open_fscreate(proc));
487            // SAFETY: this doesn't modify any memory and we check the return value.
488            let ret = unsafe {
489                libc::write(
490                    file.as_raw_descriptor(),
491                    ctx.as_ptr() as *const libc::c_void,
492                    ctx.to_bytes_with_nul().len(),
493                )
494            };
495            if ret < 0 {
496                Err(io::Error::last_os_error())
497            } else {
498                Ok(ScopedSecurityContext)
499            }
500        })
501    }
502}
503
504impl Drop for ScopedSecurityContext {
505    fn drop(&mut self) {
506        THREAD_FSCREATE.with(|thread_fscreate| {
507            // expect is safe here because the thread local would have been initialized by the call
508            // to `new` above.
509            let fscreate = thread_fscreate.borrow();
510            let file = fscreate
511                .as_ref()
512                .expect("Uninitialized thread-local when dropping ScopedSecurityContext");
513
514            // SAFETY: this doesn't modify any memory and we check the return value.
515            let ret = unsafe { libc::write(file.as_raw_descriptor(), ptr::null(), 0) };
516
517            if ret < 0 {
518                warn!(
519                    "Failed to restore security context: {}",
520                    io::Error::last_os_error()
521                );
522            }
523        })
524    }
525}
526
527struct ScopedUmask {
528    old: libc::mode_t,
529    mask: libc::mode_t,
530}
531
532impl ScopedUmask {
533    fn new(mask: libc::mode_t) -> ScopedUmask {
534        ScopedUmask {
535            // SAFETY: this doesn't modify any memory and always succeeds.
536            old: unsafe { libc::umask(mask) },
537            mask,
538        }
539    }
540}
541
542impl Drop for ScopedUmask {
543    fn drop(&mut self) {
544        // SAFETY: this doesn't modify any memory and always succeeds.
545        let previous = unsafe { libc::umask(self.old) };
546        debug_assert_eq!(
547            previous, self.mask,
548            "umask changed while holding ScopedUmask"
549        );
550    }
551}
552
553struct ScopedFsetid(Caps);
554impl Drop for ScopedFsetid {
555    fn drop(&mut self) {
556        if let Err(e) = raise_cap_fsetid(&mut self.0) {
557            error!(
558                "Failed to restore CAP_FSETID: {}.  Some operations may be broken.",
559                e
560            )
561        }
562    }
563}
564
565fn raise_cap_fsetid(c: &mut Caps) -> io::Result<()> {
566    c.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Set)?;
567    c.apply()
568}
569
570// Drops CAP_FSETID from the effective set for the current thread and returns an RAII guard that
571// adds the capability back when it is dropped.
572fn drop_cap_fsetid() -> io::Result<ScopedFsetid> {
573    let mut caps = Caps::for_current_thread()?;
574    caps.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Clear)?;
575    caps.apply()?;
576    Ok(ScopedFsetid(caps))
577}
578
/// Returns an `io::Error` carrying the OS error code `EBADF`.
fn ebadf() -> io::Error {
    io::Error::from_raw_os_error(libc::EBADF)
}

/// Returns an `io::Error` carrying the OS error code `EEXIST`.
fn eexist() -> io::Error {
    io::Error::from_raw_os_error(libc::EEXIST)
}
586
587fn stat<F: AsRawDescriptor + ?Sized>(f: &F) -> io::Result<libc::stat64> {
588    let mut st: MaybeUninit<libc::stat64> = MaybeUninit::<libc::stat64>::zeroed();
589
590    // SAFETY: the kernel will only write data in `st` and we check the return value.
591    syscall!(unsafe {
592        libc::fstatat64(
593            f.as_raw_descriptor(),
594            EMPTY_CSTR.as_ptr(),
595            st.as_mut_ptr(),
596            libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
597        )
598    })?;
599
600    // SAFETY: the kernel guarantees that the struct is now fully initialized.
601    Ok(unsafe { st.assume_init() })
602}
603
604fn statat<D: AsRawDescriptor>(dir: &D, name: &CStr) -> io::Result<libc::stat64> {
605    let mut st = MaybeUninit::<libc::stat64>::zeroed();
606
607    // SAFETY: the kernel will only write data in `st` and we check the return value.
608    syscall!(unsafe {
609        libc::fstatat64(
610            dir.as_raw_descriptor(),
611            name.as_ptr(),
612            st.as_mut_ptr(),
613            libc::AT_SYMLINK_NOFOLLOW,
614        )
615    })?;
616
617    // SAFETY: the kernel guarantees that the struct is now fully initialized.
618    Ok(unsafe { st.assume_init() })
619}
620
621#[cfg(feature = "arc_quota")]
/// Returns whether `project_id` lies in one of the project ID ranges used by Android.
fn is_android_project_id(project_id: u32) -> bool {
    // The bounds below are taken from android_filesystem_config.h in the Android codebase.
    //
    // Android files on external storage: 100 IDs are reserved starting at
    // PROJECT_ID_EXT_DEFAULT (1000).
    const ANDROID_FILES_FIRST: u32 = 1000;
    const ANDROID_FILES_LAST: u32 = 1099;
    // Android apps: the range starts at PROJECT_ID_EXT_DATA_START. Its end differs before and
    // after T; the end used by T (PROJECT_ID_APP_CACHE_END) is chosen because it is larger.
    const ANDROID_APPS_FIRST: u32 = 20000;
    const ANDROID_APPS_LAST: u32 = 69999;

    matches!(
        project_id,
        ANDROID_FILES_FIRST..=ANDROID_FILES_LAST | ANDROID_APPS_FIRST..=ANDROID_APPS_LAST
    )
}
639
640/// Per-directory cache for `PassthroughFs::ascii_casefold_lookup()`.
641///
642/// The key of the underlying `BTreeMap` is a lower-cased file name in the direcoty.
643/// The value is the case-sensitive file name stored in the host file system.
644/// We assume that if PassthroughFs has exclusive access to the filesystem, this cache exhaustively
645///  covers all file names that exist within the directory.
646/// So every `PassthroughFs`'s handler that adds or removes files in the directory is expected to
647/// update this cache.
648struct CasefoldCache(BTreeMap<Vec<u8>, CString>);
649
650impl CasefoldCache {
651    fn new(dir: &InodeData) -> io::Result<Self> {
652        let mut mp = BTreeMap::new();
653
654        let mut buf = [0u8; 1024];
655        let mut offset = 0;
656        loop {
657            let mut read_dir = ReadDir::new(dir, offset, &mut buf[..])?;
658            if read_dir.remaining() == 0 {
659                break;
660            }
661
662            while let Some(entry) = read_dir.next() {
663                offset = entry.offset as libc::off64_t;
664                let entry_name = entry.name;
665                mp.insert(
666                    entry_name.to_bytes().to_ascii_lowercase(),
667                    entry_name.to_owned(),
668                );
669            }
670        }
671        Ok(Self(mp))
672    }
673
674    fn insert(&mut self, name: &CStr) {
675        let lower_case = name.to_bytes().to_ascii_lowercase();
676        self.0.insert(lower_case, name.into());
677    }
678
679    fn lookup(&self, name: &[u8]) -> Option<CString> {
680        let lower = name.to_ascii_lowercase();
681        self.0.get(&lower).cloned()
682    }
683
684    fn remove(&mut self, name: &CStr) {
685        let lower_case = name.to_bytes().to_ascii_lowercase();
686        self.0.remove(&lower_case);
687    }
688}
689
690/// Time expiring mapping from an inode of a directory to `CasefoldCache` for the directory.
691/// Each entry will be expired after `timeout`.
692/// When ascii_casefold is disabled, this struct does nothing.
693struct ExpiringCasefoldLookupCaches {
694    inner: ExpiringMap<Inode, CasefoldCache>,
695}
696
697impl ExpiringCasefoldLookupCaches {
698    fn new(timeout: Duration) -> Self {
699        Self {
700            inner: ExpiringMap::new(timeout),
701        }
702    }
703
704    fn insert(&mut self, parent: Inode, name: &CStr) {
705        if let Some(dir_cache) = self.inner.get_mut(&parent) {
706            dir_cache.insert(name);
707        }
708    }
709
710    fn remove(&mut self, parent: Inode, name: &CStr) {
711        if let Some(dir_cache) = self.inner.get_mut(&parent) {
712            dir_cache.remove(name);
713        }
714    }
715
716    fn forget(&mut self, parent: Inode) {
717        self.inner.remove(&parent);
718    }
719
720    /// Get `CasefoldCache` for the given directory.
721    /// If the cache doesn't exist, generate it by fetching directory information with
722    /// `getdents64()`.
723    fn get(&mut self, parent: &InodeData) -> io::Result<&CasefoldCache> {
724        self.inner
725            .get_or_insert_with(&parent.inode, || CasefoldCache::new(parent))
726    }
727
728    #[cfg(test)]
729    fn exists_in_cache(&mut self, parent: Inode, name: &CStr) -> bool {
730        if let Some(dir_cache) = self.inner.get(&parent) {
731            dir_cache.lookup(name.to_bytes()).is_some()
732        } else {
733            false
734        }
735    }
736}
737
#[cfg(feature = "fs_permission_translation")]
impl PermissionData {
    /// Returns whether `path` begins with this entry's `perm_path`.
    ///
    /// NOTE(review): this is a plain string-prefix test, not a path-component test -- a
    /// `perm_path` of `/foo` would also match `/foobar`. Confirm that this is intended.
    pub(crate) fn need_set_permission(&self, path: &str) -> bool {
        path.starts_with(&self.perm_path)
    }
}
744
/// A file system that simply "passes through" all requests it receives to the underlying file
/// system. To keep the implementation simple it serves the contents of its root directory. Users
/// that wish to serve only a specific directory should set up the environment so that that
/// directory ends up as the root of the file system process. One way to accomplish this is via a
/// combination of mount namespaces and the pivot_root system call.
///
/// # Safety
///
/// The `Drop` implementation for this struct intentionally leaks all open file
/// descriptors. It is **critical** that an instance of `PassthroughFs` is
/// only dropped immediately prior to process termination. Failure to uphold
/// this invariant **will** result in resource leaks. This is a deliberate
/// performance optimization for abrupt shutdowns, where we let the OS
/// handle resource cleanup.
pub struct PassthroughFs {
    // Mutex that must be acquired before executing a process-wide operation such as fchdir.
    process_lock: Mutex<()>,
    // virtio-fs tag that the guest uses when mounting. This is only used for debugging
    // when tracing is enabled.
    tag: String,

    // File descriptors for various points in the file system tree.
    inodes: Mutex<MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>>,
    next_inode: AtomicU64,

    // File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be
    // used for reading and writing data.
    handles: Mutex<BTreeMap<Handle, Arc<HandleData>>>,
    next_handle: AtomicU64,

    // File descriptor pointing to the `/proc` directory. This is used to convert an fd from
    // `inodes` into one that can go into `handles`. This is accomplished by reading the
    // `self/fd/{}` symlink. We keep an open fd here in case the file system tree that we are meant
    // to be serving doesn't have access to `/proc`.
    proc: File,

    // Whether writeback caching is enabled for this directory. This will only be true when
    // `cfg.writeback` is true and `init` was called with `FsOptions::WRITEBACK_CACHE`.
    writeback: AtomicBool,

    // Whether zero message opens are supported by the kernel driver.
    zero_message_open: AtomicBool,

    // Whether zero message opendir is supported by the kernel driver.
    zero_message_opendir: AtomicBool,

    // Used to communicate with other processes using D-Bus.
    #[cfg(feature = "arc_quota")]
    dbus_connection: Option<Mutex<dbus::blocking::Connection>>,
    // Descriptor of the D-Bus connection's watch, as obtained in `new`.
    #[cfg(feature = "arc_quota")]
    dbus_fd: Option<std::os::unix::io::RawFd>,

    // Time-expiring cache for `ascii_casefold_lookup()`.
    // The key is an inode of a directory, and the value is a cache for the directory.
    // Each value will be expired `cfg.timeout` after it's created.
    //
    // TODO(b/267748212): Instead of per-device Mutex, we might want to have per-directory Mutex
    // if we use PassthroughFs in multi-threaded environments.
    expiring_casefold_lookup_caches: Option<Mutex<ExpiringCasefoldLookupCaches>>,

    // Paths and corresponding permission settings set by the `crosvm_client_fs_permission_set`
    // API.
    #[cfg(feature = "fs_permission_translation")]
    permission_paths: RwLock<Vec<PermissionData>>,

    // Paths and corresponding xattr settings set by the `crosvm_client_fs_xattr_set` API.
    #[cfg(feature = "arc_quota")]
    xattr_paths: RwLock<Vec<XattrData>>,

    cfg: Config,

    // Set the root directory when pivot root isn't enabled for jailed process.
    //
    // virtio-fs typically uses mount namespaces and pivot_root for file system isolation,
    // making the jailed process's root directory "/".
    //
    // However, Android's security model prevents crosvm from having the necessary SYS_ADMIN
    // capability for mount namespaces and pivot_root. This lack of isolation means that
    // root_dir defaults to the path provided via "--shared-dir".
    root_dir: String,
}
825
826impl std::fmt::Debug for PassthroughFs {
827    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
828        f.debug_struct("PassthroughFs")
829            .field("tag", &self.tag)
830            .field("next_inode", &self.next_inode)
831            .field("next_handle", &self.next_handle)
832            .field("proc", &self.proc)
833            .field("writeback", &self.writeback)
834            .field("zero_message_open", &self.zero_message_open)
835            .field("zero_message_opendir", &self.zero_message_opendir)
836            .field("cfg", &self.cfg)
837            .finish()
838    }
839}
840
841impl PassthroughFs {
842    pub fn new(tag: &str, cfg: Config) -> io::Result<PassthroughFs> {
843        // SAFETY: this doesn't modify any memory and we check the return value.
844        let raw_descriptor = syscall!(unsafe {
845            libc::openat64(
846                libc::AT_FDCWD,
847                PROC_CSTR.as_ptr(),
848                libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC,
849            )
850        })?;
851
852        // Privileged UIDs can use D-Bus to perform some operations.
853        #[cfg(feature = "arc_quota")]
854        let (dbus_connection, dbus_fd) = if cfg.privileged_quota_uids.is_empty() {
855            (None, None)
856        } else {
857            let mut channel = dbus::channel::Channel::get_private(dbus::channel::BusType::System)
858                .map_err(io::Error::other)?;
859            channel.set_watch_enabled(true);
860            let dbus_fd = channel.watch().fd;
861            channel.set_watch_enabled(false);
862            (
863                Some(Mutex::new(dbus::blocking::Connection::from(channel))),
864                Some(dbus_fd),
865            )
866        };
867
868        // SAFETY: safe because we just opened this descriptor.
869        let proc = unsafe { File::from_raw_descriptor(raw_descriptor) };
870
871        let expiring_casefold_lookup_caches = if cfg.ascii_casefold {
872            Some(Mutex::new(ExpiringCasefoldLookupCaches::new(cfg.timeout)))
873        } else {
874            None
875        };
876
877        #[allow(unused_mut)]
878        let mut passthroughfs = PassthroughFs {
879            process_lock: Mutex::new(()),
880            tag: tag.to_string(),
881            inodes: Mutex::new(MultikeyBTreeMap::new()),
882            next_inode: AtomicU64::new(ROOT_ID + 1),
883
884            handles: Mutex::new(BTreeMap::new()),
885            next_handle: AtomicU64::new(1),
886
887            proc,
888
889            writeback: AtomicBool::new(false),
890            zero_message_open: AtomicBool::new(false),
891            zero_message_opendir: AtomicBool::new(false),
892
893            #[cfg(feature = "arc_quota")]
894            dbus_connection,
895            #[cfg(feature = "arc_quota")]
896            dbus_fd,
897            expiring_casefold_lookup_caches,
898            #[cfg(feature = "fs_permission_translation")]
899            permission_paths: RwLock::new(Vec::new()),
900            #[cfg(feature = "arc_quota")]
901            xattr_paths: RwLock::new(Vec::new()),
902            cfg,
903            root_dir: "/".to_string(),
904        };
905
906        #[cfg(feature = "fs_runtime_ugid_map")]
907        passthroughfs.set_permission_path();
908
909        cros_tracing::trace_simple_print!(
910            VirtioFs,
911            "New PassthroughFS initialized: {:?}",
912            passthroughfs
913        );
914        Ok(passthroughfs)
915    }
916
917    #[cfg(feature = "fs_runtime_ugid_map")]
918    fn set_permission_path(&mut self) {
919        if !self.cfg.ugid_map.is_empty() {
920            let mut write_lock = self
921                .permission_paths
922                .write()
923                .expect("Failed to acquire write lock on permission_paths");
924            *write_lock = self.cfg.ugid_map.clone();
925        }
926    }
927
928    #[cfg(feature = "fs_runtime_ugid_map")]
929    pub fn set_root_dir(&mut self, shared_dir: String) -> io::Result<()> {
930        let canonicalized_root = match std::fs::canonicalize(shared_dir) {
931            Ok(path) => path,
932            Err(e) => {
933                return Err(io::Error::new(
934                    io::ErrorKind::InvalidInput,
935                    format!("Failed to canonicalize root_dir: {e}"),
936                ));
937            }
938        };
939        self.root_dir = canonicalized_root.to_string_lossy().to_string();
940        Ok(())
941    }
942
    /// Returns a reference to this file system's `Config`.
    pub fn cfg(&self) -> &Config {
        &self.cfg
    }
946
947    pub fn keep_rds(&self) -> Vec<RawDescriptor> {
948        #[cfg_attr(not(feature = "arc_quota"), allow(unused_mut))]
949        let mut keep_rds = vec![self.proc.as_raw_descriptor()];
950        #[cfg(feature = "arc_quota")]
951        if let Some(fd) = self.dbus_fd {
952            keep_rds.push(fd);
953        }
954        keep_rds
955    }
956
957    fn rewrite_xattr_name<'xattr>(&self, name: &'xattr CStr) -> Cow<'xattr, CStr> {
958        if !self.cfg.rewrite_security_xattrs {
959            return Cow::Borrowed(name);
960        }
961
962        // Does not include nul-terminator.
963        let buf = name.to_bytes();
964        if !buf.starts_with(SECURITY_XATTR) || buf == SELINUX_XATTR {
965            return Cow::Borrowed(name);
966        }
967
968        let mut newname = USER_VIRTIOFS_XATTR.to_vec();
969        newname.extend_from_slice(buf);
970
971        // The unwrap is safe here because the prefix doesn't contain any interior nul-bytes and the
972        // to_bytes() call above will not return a byte slice with any interior nul-bytes either.
973        Cow::Owned(CString::new(newname).expect("Failed to re-write xattr name"))
974    }
975
976    fn find_inode(&self, inode: Inode) -> io::Result<Arc<InodeData>> {
977        self.inodes.lock().get(&inode).cloned().ok_or_else(ebadf)
978    }
979
980    fn find_handle(&self, handle: Handle, inode: Inode) -> io::Result<Arc<HandleData>> {
981        self.handles
982            .lock()
983            .get(&handle)
984            .filter(|hd| hd.inode == inode)
985            .cloned()
986            .ok_or_else(ebadf)
987    }
988
989    fn open_fd(&self, fd: RawDescriptor, flags: i32) -> io::Result<File> {
990        let pathname = CString::new(format!("self/fd/{fd}"))
991            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
992
993        // SAFETY: this doesn't modify any memory and we check the return value. We don't really
994        // check `flags` because if the kernel can't handle poorly specified flags then we have
995        // much bigger problems. Also, clear the `O_NOFOLLOW` flag if it is set since we need
996        // to follow the `/proc/self/fd` symlink to get the file.
997        let raw_descriptor = syscall!(unsafe {
998            libc::openat64(
999                self.proc.as_raw_descriptor(),
1000                pathname.as_ptr(),
1001                (flags | libc::O_CLOEXEC) & !(libc::O_NOFOLLOW | libc::O_DIRECT),
1002            )
1003        })?;
1004
1005        // SAFETY: safe because we just opened this descriptor.
1006        Ok(unsafe { File::from_raw_descriptor(raw_descriptor) })
1007    }
1008
1009    /// Modifies the provided open flags based on the writeback caching configuration.
1010    /// Return the updated open flags.
1011    fn update_open_flags(&self, mut flags: i32) -> i32 {
1012        // When writeback caching is enabled, the kernel may send read requests even if the
1013        // userspace program opened the file write-only. So we need to ensure that we have opened
1014        // the file for reading as well as writing.
1015        let writeback = self.writeback.load(Ordering::Relaxed);
1016        if writeback && flags & libc::O_ACCMODE == libc::O_WRONLY {
1017            flags &= !libc::O_ACCMODE;
1018            flags |= libc::O_RDWR;
1019        }
1020
1021        // When writeback caching is enabled the kernel is responsible for handling `O_APPEND`.
1022        // However, this breaks atomicity as the file may have changed on disk, invalidating the
1023        // cached copy of the data in the kernel and the offset that the kernel thinks is the end of
1024        // the file. Just allow this for now as it is the user's responsibility to enable writeback
1025        // caching only for directories that are not shared. It also means that we need to clear the
1026        // `O_APPEND` flag.
1027        if writeback && flags & libc::O_APPEND != 0 {
1028            flags &= !libc::O_APPEND;
1029        }
1030
1031        flags
1032    }
1033
1034    fn open_inode(&self, inode: &InodeData, mut flags: i32) -> io::Result<File> {
1035        // handle writeback caching cases
1036        flags = self.update_open_flags(flags);
1037
1038        self.open_fd(inode.as_raw_descriptor(), flags)
1039    }
1040
1041    // Increases the inode refcount and returns the inode.
1042    fn increase_inode_refcount(&self, inode_data: &InodeData) -> Inode {
1043        // Matches with the release store in `forget`.
1044        inode_data.refcount.fetch_add(1, Ordering::Acquire);
1045        inode_data.inode
1046    }
1047
1048    // Creates a new entry for `f` or increases the refcount of the existing entry for `f`.
1049    // The inodes mutex lock must not be already taken by the same thread otherwise this
1050    // will deadlock.
1051    fn add_entry(
1052        &self,
1053        f: File,
1054        #[cfg_attr(not(feature = "fs_permission_translation"), allow(unused_mut))]
1055        mut st: libc::stat64,
1056        open_flags: libc::c_int,
1057        path: String,
1058    ) -> Entry {
1059        #[cfg(feature = "arc_quota")]
1060        self.set_permission(&mut st, &path);
1061        #[cfg(feature = "fs_runtime_ugid_map")]
1062        self.set_ugid_permission(&mut st, &path);
1063        let mut inodes = self.inodes.lock();
1064
1065        let altkey = InodeAltKey {
1066            ino: st.st_ino,
1067            dev: st.st_dev,
1068        };
1069
1070        let inode = if let Some(data) = inodes.get_alt(&altkey) {
1071            self.increase_inode_refcount(data)
1072        } else {
1073            let inode = self.next_inode.fetch_add(1, Ordering::Relaxed);
1074            inodes.insert(
1075                inode,
1076                altkey,
1077                Arc::new(InodeData {
1078                    inode,
1079                    file: Mutex::new(OpenedFile::new(f, open_flags)),
1080                    refcount: AtomicU64::new(1),
1081                    filetype: st.st_mode.into(),
1082                    path,
1083                    unsafe_leak_fd: AtomicBool::new(false),
1084                }),
1085            );
1086
1087            inode
1088        };
1089
1090        Entry {
1091            inode,
1092            generation: 0,
1093            attr: st,
1094            // We use the same timeout for the attribute and the entry.
1095            attr_timeout: self.cfg.timeout,
1096            entry_timeout: self.cfg.timeout,
1097        }
1098    }
1099
1100    /// Acquires lock of `expiring_casefold_lookup_caches` if `ascii_casefold` is enabled.
1101    fn lock_casefold_lookup_caches(&self) -> Option<MutexGuard<'_, ExpiringCasefoldLookupCaches>> {
1102        self.expiring_casefold_lookup_caches
1103            .as_ref()
1104            .map(|c| c.lock())
1105    }
1106
1107    // Returns an actual case-sensitive file name that matches with the given `name`.
1108    // Returns `Ok(None)` if no file matches with the give `name`.
1109    // This function will panic if casefold is not enabled.
1110    fn get_case_unfolded_name(
1111        &self,
1112        parent: &InodeData,
1113        name: &[u8],
1114    ) -> io::Result<Option<CString>> {
1115        let mut caches = self
1116            .lock_casefold_lookup_caches()
1117            .expect("casefold must be enabled");
1118        let dir_cache = caches.get(parent)?;
1119        Ok(dir_cache.lookup(name))
1120    }
1121
1122    // Performs an ascii case insensitive lookup.
1123    fn ascii_casefold_lookup(&self, parent: &InodeData, name: &[u8]) -> io::Result<Entry> {
1124        match self.get_case_unfolded_name(parent, name)? {
1125            None => Err(io::Error::from_raw_os_error(libc::ENOENT)),
1126            Some(actual_name) => self.do_lookup(parent, &actual_name),
1127        }
1128    }
1129
1130    #[cfg(test)]
1131    fn exists_in_casefold_cache(&self, parent: Inode, name: &CStr) -> bool {
1132        let mut cache = self
1133            .lock_casefold_lookup_caches()
1134            .expect("casefold must be enabled");
1135        cache.exists_in_cache(parent, name)
1136    }
1137
    /// Looks up `name` inside `parent` and returns the `Entry` for it.
    ///
    /// The file is first stat-ed. If an inode with the same (`st_ino`, `st_dev`) pair is already
    /// known, its refcount is increased and it is returned without opening anything. Otherwise the
    /// file is opened relative to `parent` and registered through `add_entry`.
    ///
    /// The path recorded for the entry is `parent.path` joined with `name`; a `name` that is not
    /// valid UTF-8 is recorded as "<non UTF-8 str>".
    ///
    /// # Errors
    ///
    /// Returns an error if the stat call fails or if the file cannot be opened (after falling
    /// back to `O_PATH` on `EACCES`).
    fn do_lookup(&self, parent: &InodeData, name: &CStr) -> io::Result<Entry> {
        #[cfg_attr(not(feature = "fs_permission_translation"), allow(unused_mut))]
        let mut st = statat(parent, name)?;

        let altkey = InodeAltKey {
            ino: st.st_ino,
            dev: st.st_dev,
        };

        let path = format!(
            "{}/{}",
            parent.path.clone(),
            name.to_str().unwrap_or("<non UTF-8 str>")
        );

        // Check if we already have an entry before opening a new file.
        // The `inodes` lock taken here is held for the whole body of this `if let`.
        if let Some(data) = self.inodes.lock().get_alt(&altkey) {
            // Return the same inode with the reference counter increased.
            #[cfg(feature = "arc_quota")]
            self.set_permission(&mut st, &path);
            #[cfg(feature = "fs_runtime_ugid_map")]
            self.set_ugid_permission(&mut st, &path);
            return Ok(Entry {
                inode: self.increase_inode_refcount(data),
                generation: 0,
                attr: st,
                // We use the same timeout for the attribute and the entry.
                attr_timeout: self.cfg.timeout,
                entry_timeout: self.cfg.timeout,
            });
        }

        // Open a regular file with O_RDONLY to store in `InodeData` so explicit open requests can
        // be skipped later if the ZERO_MESSAGE_{OPEN,OPENDIR} features are enabled.
        // If the crosvm process doesn't have a read permission, fall back to O_PATH below.
        let mut flags = libc::O_RDONLY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
        match FileType::from(st.st_mode) {
            FileType::Regular => {}
            FileType::Directory => flags |= libc::O_DIRECTORY,
            FileType::Other => flags |= libc::O_PATH,
        };

        // SAFETY: this doesn't modify any memory and we check the return value.
        let fd = match unsafe {
            syscall!(libc::openat64(
                parent.as_raw_descriptor(),
                name.as_ptr(),
                flags
            ))
        } {
            Ok(fd) => fd,
            Err(e) if e.errno() == libc::EACCES => {
                // If O_RDONLY is unavailable, fall back to O_PATH to get an FD to store in
                // `InodeData`.
                // Note that some operations which should be allowed without read permissions
                // require syscalls that don't support O_PATH fds. For those syscalls, we will
                // need to fall back to their path-based equivalents with /proc/self/fd/${FD}.
                // e.g. `fgetxattr()` for an O_PATH FD fails while `getxattr()` for
                // /proc/self/fd/${FD} works.
                flags |= libc::O_PATH;
                // SAFETY: this doesn't modify any memory and we check the return value.
                unsafe {
                    syscall!(libc::openat64(
                        parent.as_raw_descriptor(),
                        name.as_ptr(),
                        flags
                    ))
                }?
            }
            Err(e) => {
                return Err(e.into());
            }
        };

        // SAFETY: safe because we own the fd.
        let f = unsafe { File::from_raw_descriptor(fd) };
        // We made sure the lock acquired for `self.inodes` is released automatically when
        // the if block above is exited, so a call to `self.add_entry()` should not cause a deadlock
        // here. This would not be the case if this were executed in an else block instead.
        // `flags` is passed along so the stored file remembers whether it was opened with O_PATH.
        Ok(self.add_entry(f, st, flags, path))
    }
1219
1220    fn get_cache_open_options(&self, flags: u32) -> OpenOptions {
1221        let mut opts = OpenOptions::empty();
1222        match self.cfg.cache_policy {
1223            // We only set the direct I/O option on files.
1224            CachePolicy::Never => opts.set(
1225                OpenOptions::DIRECT_IO,
1226                flags & (libc::O_DIRECTORY as u32) == 0,
1227            ),
1228            CachePolicy::Always => {
1229                opts |= if flags & (libc::O_DIRECTORY as u32) == 0 {
1230                    OpenOptions::KEEP_CACHE
1231                } else {
1232                    OpenOptions::CACHE_DIR
1233                }
1234            }
1235            _ => {}
1236        };
1237        opts
1238    }
1239
1240    // Performs lookup using original name first, if it fails and ascii_casefold is enabled,
1241    // it tries to unfold the name and do lookup again.
1242    fn do_lookup_with_casefold_fallback(
1243        &self,
1244        parent: &InodeData,
1245        name: &CStr,
1246    ) -> io::Result<Entry> {
1247        let mut res = self.do_lookup(parent, name);
1248        // If `ascii_casefold` is enabled, fallback to `ascii_casefold_lookup()`.
1249        if res.is_err() && self.cfg.ascii_casefold {
1250            res = self.ascii_casefold_lookup(parent, name.to_bytes());
1251        }
1252        res
1253    }
1254
1255    fn do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)> {
1256        let inode_data = self.find_inode(inode)?;
1257
1258        let file = self.open_inode(&inode_data, flags as i32)?;
1259
1260        let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
1261        let data = HandleData {
1262            inode,
1263            file: Mutex::new(OpenedFile::new(file, flags as i32)),
1264            unsafe_leak_fd: AtomicBool::new(false),
1265        };
1266
1267        self.handles.lock().insert(handle, Arc::new(data));
1268
1269        let opts = self.get_cache_open_options(flags);
1270
1271        Ok((Some(handle), opts))
1272    }
1273
1274    fn do_open_at(
1275        &self,
1276        parent_data: Arc<InodeData>,
1277        name: &CStr,
1278        inode: Inode,
1279        flags: u32,
1280    ) -> io::Result<(Option<Handle>, OpenOptions)> {
1281        let open_flags = self.update_open_flags(flags as i32);
1282
1283        let fd_open = syscall!(
1284            // SAFETY: return value is checked.
1285            unsafe {
1286                libc::openat64(
1287                    parent_data.as_raw_descriptor(),
1288                    name.as_ptr(),
1289                    (open_flags | libc::O_CLOEXEC) & !(libc::O_NOFOLLOW | libc::O_DIRECT),
1290                )
1291            }
1292        )?;
1293
1294        // SAFETY: fd_open is valid
1295        let file_open = unsafe { File::from_raw_descriptor(fd_open) };
1296        let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
1297        let data = HandleData {
1298            inode,
1299            file: Mutex::new(OpenedFile::new(file_open, open_flags)),
1300            unsafe_leak_fd: AtomicBool::new(false),
1301        };
1302
1303        self.handles.lock().insert(handle, Arc::new(data));
1304
1305        let opts = self.get_cache_open_options(open_flags as u32);
1306        Ok((Some(handle), opts))
1307    }
1308
1309    fn do_release(&self, inode: Inode, handle: Handle) -> io::Result<()> {
1310        let mut handles = self.handles.lock();
1311
1312        if let btree_map::Entry::Occupied(e) = handles.entry(handle) {
1313            if e.get().inode == inode {
1314                // We don't need to close the file here because that will happen automatically when
1315                // the last `Arc` is dropped.
1316                e.remove();
1317                return Ok(());
1318            }
1319        }
1320
1321        Err(ebadf())
1322    }
1323
1324    fn do_getattr(&self, inode: &InodeData) -> io::Result<(libc::stat64, Duration)> {
1325        #[allow(unused_mut)]
1326        let mut st = stat(inode)?;
1327
1328        #[cfg(feature = "arc_quota")]
1329        self.set_permission(&mut st, &inode.path);
1330        #[cfg(feature = "fs_runtime_ugid_map")]
1331        self.set_ugid_permission(&mut st, &inode.path);
1332        Ok((st, self.cfg.timeout))
1333    }
1334
1335    fn do_unlink(&self, parent: &InodeData, name: &CStr, flags: libc::c_int) -> io::Result<()> {
1336        // SAFETY: this doesn't modify any memory and we check the return value.
1337        syscall!(unsafe { libc::unlinkat(parent.as_raw_descriptor(), name.as_ptr(), flags) })?;
1338        Ok(())
1339    }
1340
1341    fn do_fsync<F: AsRawDescriptor>(&self, file: &F, datasync: bool) -> io::Result<()> {
1342        // SAFETY: this doesn't modify any memory and we check the return value.
1343        syscall!(unsafe {
1344            if datasync {
1345                libc::fdatasync(file.as_raw_descriptor())
1346            } else {
1347                libc::fsync(file.as_raw_descriptor())
1348            }
1349        })?;
1350
1351        Ok(())
1352    }
1353
    // Changes the CWD to `self.proc`, runs `f`, and then changes the CWD back to the root
    // directory. This effectively emulates an *at syscall starting at /proc, which is useful when
    // there is no *at syscall available. Returns whatever `f` returns.
    //
    // Panics if there is no root inode. The results of the two fchdir calls are only checked with
    // `debug_assert_eq!`, so a failing fchdir panics only in builds with debug assertions enabled;
    // in other builds the failure is ignored.
    //
    // NOTE: this method acquires an `self`-wide lock. If any locks are acquired in `f`, care must
    // be taken to avoid the risk of deadlocks.
    fn with_proc_chdir<F, T>(&self, f: F) -> T
    where
        F: FnOnce() -> T,
    {
        // Look up the root inode before taking the lock; its descriptor is needed to restore the
        // CWD afterwards.
        let root = self.find_inode(ROOT_ID).expect("failed to find root inode");

        // Acquire a lock for `fchdir`. It is held until this function returns.
        let _proc_lock = self.process_lock.lock();
        // SAFETY: this doesn't modify any memory and we check the return value. Since the
        // fchdir should never fail we just use debug_asserts.
        let proc_cwd = unsafe { libc::fchdir(self.proc.as_raw_descriptor()) };
        debug_assert_eq!(
            proc_cwd,
            0,
            "failed to fchdir to /proc: {}",
            io::Error::last_os_error()
        );

        let res = f();

        // SAFETY: this doesn't modify any memory and we check the return value. Since the
        // fchdir should never fail we just use debug_asserts.
        let root_cwd = unsafe { libc::fchdir(root.as_raw_descriptor()) };
        debug_assert_eq!(
            root_cwd,
            0,
            "failed to fchdir back to root directory: {}",
            io::Error::last_os_error()
        );

        res
    }
1393
    /// Reads the extended attribute `name` of `inode` into `value` and returns the value reported
    /// by the underlying `getxattr`/`fgetxattr` call.
    ///
    /// The inode's file lock is held for the duration of the call.
    ///
    /// # Errors
    ///
    /// Returns the last OS error if the underlying call reports a negative result, or an
    /// `InvalidData` error if the `self/fd/N` path cannot be converted to a C string.
    fn do_getxattr(&self, inode: &InodeData, name: &CStr, value: &mut [u8]) -> io::Result<usize> {
        let file = inode.file.lock();
        // The flags recorded when the file was opened tell us whether this is an `O_PATH` FD.
        let o_path_file = (file.open_flags & libc::O_PATH) != 0;
        let res = if o_path_file {
            // For FDs opened with `O_PATH`, we cannot call `fgetxattr` normally. Instead we
            // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
            // and then setting the CWD back to the root directory.
            let path = CString::new(format!("self/fd/{}", file.as_raw_descriptor()))
                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;

            // SAFETY: this will only modify `value` and we check the return value.
            self.with_proc_chdir(|| unsafe {
                libc::getxattr(
                    path.as_ptr(),
                    name.as_ptr(),
                    value.as_mut_ptr() as *mut libc::c_void,
                    value.len() as libc::size_t,
                )
            })
        } else {
            // For regular files and directories, we can just use fgetxattr.
            // SAFETY: this will only write to `value` and we check the return value.
            unsafe {
                libc::fgetxattr(
                    file.as_raw_descriptor(),
                    name.as_ptr(),
                    value.as_mut_ptr() as *mut libc::c_void,
                    value.len() as libc::size_t,
                )
            }
        };

        // A negative result signals failure; the error code is taken from the last OS error.
        if res < 0 {
            Err(io::Error::last_os_error())
        } else {
            Ok(res as usize)
        }
    }
1432
    /// Handles `FS_IOC_GET_ENCRYPTION_POLICY_EX` for `inode`/`handle`.
    ///
    /// The requested policy size is read from `r` and clamped to `size_of::<fscrypt_policy>()`
    /// before the ioctl is issued. On success the reply holds the leading `u64`-sized part of the
    /// argument struct followed by `policy_size` bytes, where `policy_size` is the value found in
    /// the struct after the ioctl. An ioctl failure is reported inside the reply rather than as an
    /// `Err` of this function.
    ///
    /// # Errors
    ///
    /// Returns an error if the inode or handle cannot be found or if reading from `r` fails.
    fn get_encryption_policy_ex<R: io::Read>(
        &self,
        inode: Inode,
        handle: Handle,
        mut r: R,
    ) -> io::Result<IoctlReply> {
        // When `zero_message_open` is set the inode's own descriptor is used, otherwise the
        // descriptor of the open handle.
        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
            self.find_inode(inode)?
        } else {
            self.find_handle(handle, inode)?
        };

        // SAFETY: this struct only has integer fields and any value is valid.
        let mut arg = unsafe { MaybeUninit::<fscrypt_get_policy_ex_arg>::zeroed().assume_init() };
        r.read_exact(arg.policy_size.as_mut_bytes())?;

        // Never ask for more than the policy buffer can hold.
        let policy_size = cmp::min(arg.policy_size, size_of::<fscrypt_policy>() as u64);
        arg.policy_size = policy_size;

        let res =
            // SAFETY: the kernel will only write to `arg` and we check the return value.
            unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GET_ENCRYPTION_POLICY_EX, &mut arg) };
        if res < 0 {
            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
        } else {
            // Only return the size field plus the part of the policy that is in use.
            let len = size_of::<u64>() + arg.policy_size as usize;
            Ok(IoctlReply::Done(Ok(<&[u8]>::from(&arg)[..len].to_vec())))
        }
    }
1462
1463    fn get_fsxattr(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
1464        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1465            self.find_inode(inode)?
1466        } else {
1467            self.find_handle(handle, inode)?
1468        };
1469
1470        let mut buf = MaybeUninit::<fsxattr>::zeroed();
1471
1472        // SAFETY: the kernel will only write to `buf` and we check the return value.
1473        let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR, buf.as_mut_ptr()) };
1474        if res < 0 {
1475            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1476        } else {
1477            // SAFETY: the kernel guarantees that the policy is now initialized.
1478            let xattr = unsafe { buf.assume_init() };
1479            Ok(IoctlReply::Done(Ok(xattr.as_bytes().to_vec())))
1480        }
1481    }
1482
1483    fn set_fsxattr<R: io::Read>(
1484        &self,
1485        #[cfg_attr(not(feature = "arc_quota"), allow(unused_variables))] ctx: Context,
1486        inode: Inode,
1487        handle: Handle,
1488        mut r: R,
1489    ) -> io::Result<IoctlReply> {
1490        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1491            self.find_inode(inode)?
1492        } else {
1493            self.find_handle(handle, inode)?
1494        };
1495
1496        let mut in_attr = fsxattr::new_zeroed();
1497        r.read_exact(in_attr.as_mut_bytes())?;
1498
1499        #[cfg(feature = "arc_quota")]
1500        let st = stat(&*data)?;
1501
1502        #[cfg(feature = "arc_quota")]
1503        let ctx_uid = self.lookup_host_uid(&ctx, inode);
1504
1505        // Changing quota project ID requires CAP_FOWNER or being file owner.
1506        // Here we use privileged_quota_uids because we cannot perform a CAP_FOWNER check.
1507        #[cfg(feature = "arc_quota")]
1508        if ctx_uid == st.st_uid || self.cfg.privileged_quota_uids.contains(&ctx_uid) {
1509            // Get the current fsxattr.
1510            let mut buf = MaybeUninit::<fsxattr>::zeroed();
1511            // SAFETY: the kernel will only write to `buf` and we check the return value.
1512            let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR, buf.as_mut_ptr()) };
1513            if res < 0 {
1514                return Ok(IoctlReply::Done(Err(io::Error::last_os_error())));
1515            }
1516            // SAFETY: the kernel guarantees that the policy is now initialized.
1517            let current_attr = unsafe { buf.assume_init() };
1518
1519            // Project ID cannot be changed inside a user namespace.
1520            // Use Spaced to avoid this restriction.
1521            if current_attr.fsx_projid != in_attr.fsx_projid {
1522                let connection = self.dbus_connection.as_ref().unwrap().lock();
1523                let proxy = connection.with_proxy(
1524                    "org.chromium.Spaced",
1525                    "/org/chromium/Spaced",
1526                    DEFAULT_DBUS_TIMEOUT,
1527                );
1528                let project_id = in_attr.fsx_projid;
1529                if !is_android_project_id(project_id) {
1530                    return Err(io::Error::from_raw_os_error(libc::EINVAL));
1531                }
1532                let file_clone = base::SafeDescriptor::try_from(&*data)?;
1533                match proxy.set_project_id(file_clone.into(), project_id) {
1534                    Ok(r) => {
1535                        let r = SetProjectIdReply::parse_from_bytes(&r)
1536                            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1537                        if !r.success {
1538                            return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1539                                r.error,
1540                            ))));
1541                        }
1542                    }
1543                    Err(e) => {
1544                        return Err(io::Error::other(e));
1545                    }
1546                };
1547            }
1548        }
1549
1550        //  SAFETY: this doesn't modify any memory and we check the return value.
1551        let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_FSSETXATTR, &in_attr) };
1552        if res < 0 {
1553            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1554        } else {
1555            Ok(IoctlReply::Done(Ok(Vec::new())))
1556        }
1557    }
1558
1559    fn get_flags(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
1560        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1561            self.find_inode(inode)?
1562        } else {
1563            self.find_handle(handle, inode)?
1564        };
1565
1566        // The ioctl encoding is a long but the parameter is actually an int.
1567        let mut flags: c_int = 0;
1568
1569        // SAFETY: the kernel will only write to `flags` and we check the return value.
1570        let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GETFLAGS, &mut flags) };
1571        if res < 0 {
1572            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1573        } else {
1574            Ok(IoctlReply::Done(Ok(flags.to_ne_bytes().to_vec())))
1575        }
1576    }
1577
1578    fn set_flags<R: io::Read>(
1579        &self,
1580        #[cfg_attr(not(feature = "arc_quota"), allow(unused_variables))] ctx: Context,
1581        inode: Inode,
1582        handle: Handle,
1583        mut r: R,
1584    ) -> io::Result<IoctlReply> {
1585        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1586            self.find_inode(inode)?
1587        } else {
1588            self.find_handle(handle, inode)?
1589        };
1590
1591        // The ioctl encoding is a long but the parameter is actually an int.
1592        let mut in_flags: c_int = 0;
1593        r.read_exact(in_flags.as_mut_bytes())?;
1594
1595        #[cfg(feature = "arc_quota")]
1596        let st = stat(&*data)?;
1597
1598        #[cfg(feature = "arc_quota")]
1599        let ctx_uid = self.lookup_host_uid(&ctx, inode);
1600
1601        // Only privleged uid can perform FS_IOC_SETFLAGS through cryptohome.
1602        #[cfg(feature = "arc_quota")]
1603        if ctx_uid == st.st_uid || self.cfg.privileged_quota_uids.contains(&ctx_uid) {
1604            // Get the current flag.
1605            let mut buf = MaybeUninit::<c_int>::zeroed();
1606            // SAFETY: the kernel will only write to `buf` and we check the return value.
1607            let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GETFLAGS, buf.as_mut_ptr()) };
1608            if res < 0 {
1609                return Ok(IoctlReply::Done(Err(io::Error::last_os_error())));
1610            }
1611            // SAFETY: the kernel guarantees that the policy is now initialized.
1612            let current_flags = unsafe { buf.assume_init() };
1613
1614            // Project inheritance flag cannot be changed inside a user namespace.
1615            // Use Spaced to avoid this restriction.
1616            if (in_flags & FS_PROJINHERIT_FL) != (current_flags & FS_PROJINHERIT_FL) {
1617                let connection = self.dbus_connection.as_ref().unwrap().lock();
1618                let proxy = connection.with_proxy(
1619                    "org.chromium.Spaced",
1620                    "/org/chromium/Spaced",
1621                    DEFAULT_DBUS_TIMEOUT,
1622                );
1623                // If the input flags contain FS_PROJINHERIT_FL, then it is a set. Otherwise it is a
1624                // reset.
1625                let enable = (in_flags & FS_PROJINHERIT_FL) == FS_PROJINHERIT_FL;
1626                let file_clone = base::SafeDescriptor::try_from(&*data)?;
1627                match proxy.set_project_inheritance_flag(file_clone.into(), enable) {
1628                    Ok(r) => {
1629                        let r = SetProjectInheritanceFlagReply::parse_from_bytes(&r)
1630                            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1631                        if !r.success {
1632                            return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1633                                r.error,
1634                            ))));
1635                        }
1636                    }
1637                    Err(e) => {
1638                        return Err(io::Error::other(e));
1639                    }
1640                };
1641            }
1642        }
1643
1644        // SAFETY: this doesn't modify any memory and we check the return value.
1645        let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_SETFLAGS, &in_flags) };
1646        if res < 0 {
1647            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1648        } else {
1649            Ok(IoctlReply::Done(Ok(Vec::new())))
1650        }
1651    }
1652
1653    fn enable_verity<R: io::Read>(
1654        &self,
1655        inode: Inode,
1656        handle: Handle,
1657        mut r: R,
1658    ) -> io::Result<IoctlReply> {
1659        let inode_data = self.find_inode(inode)?;
1660
1661        // These match the return codes from `fsverity_ioctl_enable` in the kernel.
1662        match inode_data.filetype {
1663            FileType::Regular => {}
1664            FileType::Directory => return Err(io::Error::from_raw_os_error(libc::EISDIR)),
1665            FileType::Other => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
1666        }
1667
1668        {
1669            // We cannot enable verity while holding a writable fd so get a new one, if necessary.
1670            let mut file = inode_data.file.lock();
1671            let mut flags = file.open_flags;
1672            match flags & libc::O_ACCMODE {
1673                libc::O_WRONLY | libc::O_RDWR => {
1674                    flags &= !libc::O_ACCMODE;
1675                    flags |= libc::O_RDONLY;
1676
1677                    // We need to get a read-only handle for this file.
1678                    let newfile = self.open_fd(file.as_raw_descriptor(), libc::O_RDONLY)?;
1679                    *file = OpenedFile::new(newfile, flags);
1680                }
1681                libc::O_RDONLY => {}
1682                _ => panic!("Unexpected flags: {flags:#x}"),
1683            }
1684        }
1685
1686        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1687            inode_data
1688        } else {
1689            let data = self.find_handle(handle, inode)?;
1690
1691            {
1692                // We can't enable verity while holding a writable fd. We don't know whether the
1693                // file was opened for writing so check it here. We don't expect
1694                // this to be a frequent operation so the extra latency should be
1695                // fine.
1696                let mut file = data.file.lock();
1697                let flags = FileFlags::from_file(&*file).map_err(io::Error::from)?;
1698                match flags {
1699                    FileFlags::ReadWrite | FileFlags::Write => {
1700                        // We need to get a read-only handle for this file.
1701                        *file = OpenedFile::new(
1702                            self.open_fd(file.as_raw_descriptor(), libc::O_RDONLY)?,
1703                            libc::O_RDONLY,
1704                        );
1705                    }
1706                    FileFlags::Read => {}
1707                }
1708            }
1709
1710            data
1711        };
1712
1713        let mut arg = fsverity_enable_arg::new_zeroed();
1714        r.read_exact(arg.as_mut_bytes())?;
1715
1716        let mut salt;
1717        if arg.salt_size > 0 {
1718            if arg.salt_size > self.max_buffer_size() {
1719                return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1720                    libc::ENOMEM,
1721                ))));
1722            }
1723            salt = vec![0; arg.salt_size as usize];
1724            r.read_exact(&mut salt)?;
1725            arg.salt_ptr = salt.as_ptr() as usize as u64;
1726        } else {
1727            arg.salt_ptr = 0;
1728        }
1729
1730        let mut sig;
1731        if arg.sig_size > 0 {
1732            if arg.sig_size > self.max_buffer_size() {
1733                return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1734                    libc::ENOMEM,
1735                ))));
1736            }
1737            sig = vec![0; arg.sig_size as usize];
1738            r.read_exact(&mut sig)?;
1739            arg.sig_ptr = sig.as_ptr() as usize as u64;
1740        } else {
1741            arg.sig_ptr = 0;
1742        }
1743
1744        // SAFETY: this doesn't modify any memory and we check the return value.
1745        let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_ENABLE_VERITY, &arg) };
1746        if res < 0 {
1747            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1748        } else {
1749            Ok(IoctlReply::Done(Ok(Vec::new())))
1750        }
1751    }
1752
1753    fn measure_verity<R: io::Read>(
1754        &self,
1755        inode: Inode,
1756        handle: Handle,
1757        mut r: R,
1758        out_size: u32,
1759    ) -> io::Result<IoctlReply> {
1760        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1761            self.find_inode(inode)?
1762        } else {
1763            self.find_handle(handle, inode)?
1764        };
1765
1766        let mut digest = fsverity_digest::new_zeroed();
1767        r.read_exact(digest.as_mut_bytes())?;
1768
1769        // Taken from fs/verity/fsverity_private.h.
1770        const FS_VERITY_MAX_DIGEST_SIZE: u16 = 64;
1771
1772        // This digest size is what the fsverity command line utility uses.
1773        const DIGEST_SIZE: u16 = FS_VERITY_MAX_DIGEST_SIZE * 2 + 1;
1774        const BUFLEN: usize = size_of::<fsverity_digest>() + DIGEST_SIZE as usize;
1775        const ROUNDED_LEN: usize = BUFLEN.div_ceil(size_of::<fsverity_digest>());
1776
1777        // Make sure we get a properly aligned allocation.
1778        let mut buf = [MaybeUninit::<fsverity_digest>::uninit(); ROUNDED_LEN];
1779
1780        // SAFETY: we are only writing data and not reading uninitialized memory.
1781        unsafe {
1782            // TODO: Replace with `MaybeUninit::slice_as_mut_ptr` once it is stabilized.
1783            addr_of_mut!((*(buf.as_mut_ptr() as *mut fsverity_digest)).digest_size)
1784                .write(DIGEST_SIZE)
1785        };
1786
1787        // SAFETY: this will only modify `buf` and we check the return value.
1788        let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_MEASURE_VERITY, buf.as_mut_ptr()) };
1789        if res < 0 {
1790            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1791        } else {
1792            let digest_size =
1793                // SAFETY: this value was initialized by us already and then overwritten by the kernel.
1794                // TODO: Replace with `MaybeUninit::slice_as_ptr` once it is stabilized.
1795                unsafe { addr_of!((*(buf.as_ptr() as *const fsverity_digest)).digest_size).read() };
1796            let outlen = size_of::<fsverity_digest>() as u32 + u32::from(digest_size);
1797
1798            // The kernel guarantees this but it doesn't hurt to be paranoid.
1799            debug_assert!(outlen <= (ROUNDED_LEN * size_of::<fsverity_digest>()) as u32);
1800            if digest.digest_size < digest_size || out_size < outlen {
1801                return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1802                    libc::EOVERFLOW,
1803                ))));
1804            }
1805
1806            let buf: [MaybeUninit<u8>; ROUNDED_LEN * size_of::<fsverity_digest>()] =
1807                // SAFETY: any bit pattern is valid for `MaybeUninit<u8>` and `fsverity_digest`
1808                // doesn't contain any references.
1809                unsafe { mem::transmute(buf) };
1810
1811            let buf =
1812                // SAFETY: Casting to `*const [u8]` is safe because the kernel guarantees that the
1813                // first `outlen` bytes of `buf` are initialized and `MaybeUninit<u8>` is guaranteed
1814                // to have the same layout as `u8`.
1815                // TODO: Replace with `MaybeUninit::slice_assume_init_ref` once it is stabilized.
1816                unsafe { &*(&buf[..outlen as usize] as *const [MaybeUninit<u8>] as *const [u8]) };
1817            Ok(IoctlReply::Done(Ok(buf.to_vec())))
1818        }
1819    }
1820}
1821
#[cfg(feature = "fs_runtime_ugid_map")]
impl PassthroughFs {
    /// Returns whether `perm_data` is the entry to use for `path`.
    ///
    /// The root path only matches the entry registered for "/". Any other path matches an
    /// entry that is not registered for "/" and for which `need_set_permission` holds.
    fn ugid_entry_applies(perm_data: &PermissionData, path: &str, is_root_path: bool) -> bool {
        if is_root_path {
            perm_data.perm_path == "/"
        } else {
            perm_data.perm_path != "/" && perm_data.need_set_permission(path)
        }
    }

    /// Applies the first registered entry that matches `path` to `st`.
    ///
    /// Returns `true` if an entry matched and `st` was updated, `false` otherwise.
    ///
    /// # Panics
    ///
    /// Panics if the `permission_paths` lock is poisoned.
    fn find_and_set_ugid_permission(
        &self,
        st: &mut libc::stat64,
        path: &str,
        is_root_path: bool,
    ) -> bool {
        let paths = self
            .permission_paths
            .read()
            .expect("acquire permission_paths read lock");
        let Some(perm_data) = paths
            .iter()
            .find(|pd| Self::ugid_entry_applies(pd, path, is_root_path))
        else {
            return false;
        };
        self.set_permission_from_data(st, perm_data);
        true
    }

    /// Overwrites the owner and permission bits of `st` with the values of `perm_data`.
    ///
    /// The file type bits of `st_mode` are kept; the permission bits become `0o777` with the
    /// entry's umask removed.
    fn set_permission_from_data(&self, st: &mut libc::stat64, perm_data: &PermissionData) {
        st.st_uid = perm_data.guest_uid;
        st.st_gid = perm_data.guest_gid;
        st.st_mode = (st.st_mode & libc::S_IFMT) | (0o777 & !perm_data.umask);
    }

    /// Set permission according to path
    ///
    /// An empty `path` denotes the root. If no entry matches `path`, the entry registered for
    /// "/" (if any) is used as the fallback; without one `st` is left untouched.
    fn set_ugid_permission(&self, st: &mut libc::stat64, path: &str) {
        let is_root_path = path.is_empty();

        if self.find_and_set_ugid_permission(st, path, is_root_path) {
            return;
        }

        let paths = self
            .permission_paths
            .read()
            .expect("acquire permission_paths read lock");
        if let Some(root_data) = paths.iter().find(|pd| pd.perm_path == "/") {
            self.set_permission_from_data(st, root_data);
        }
    }

    /// Set host uid/gid to configured value according to path
    ///
    /// The path is built from the parent's path and `name`. The first matching entry wins,
    /// then the entry registered for "/", and finally the credentials from `ctx`.
    fn change_ugid_creds(&self, ctx: &Context, parent_data: &InodeData, name: &CStr) -> (u32, u32) {
        let path = format!(
            "{}/{}",
            parent_data.path,
            name.to_str().unwrap_or("<non UTF-8 str>")
        );

        // NOTE(review): `path` always contains the '/' inserted above, so this is always
        // `false`. Kept as is to preserve the existing behavior.
        let is_root_path = path.is_empty();

        // Look the path up once and reuse the result instead of searching a second time.
        if let Some(creds) = self.find_ugid_creds_for_path(&path, is_root_path) {
            return creds;
        }

        let paths = self
            .permission_paths
            .read()
            .expect("acquire permission_paths read lock");
        let root_creds = paths
            .iter()
            .find(|pd| pd.perm_path == "/")
            .map(|pd| (pd.host_uid, pd.host_gid));
        root_creds.unwrap_or((ctx.uid, ctx.gid))
    }

    /// Returns the host uid/gid of the first registered entry that matches `path`, if any.
    ///
    /// # Panics
    ///
    /// Panics if the `permission_paths` lock is poisoned.
    fn find_ugid_creds_for_path(&self, path: &str, is_root_path: bool) -> Option<(u32, u32)> {
        self.permission_paths
            .read()
            .expect("acquire permission_paths read lock")
            .iter()
            .find(|pd| Self::ugid_entry_applies(pd, path, is_root_path))
            .map(|pd| (pd.host_uid, pd.host_gid))
    }
}
1918
#[cfg(feature = "arc_quota")]
impl PassthroughFs {
    /// Convert u8 slice to string
    ///
    /// Only the bytes up to the first NUL are used and invalid UTF-8 is replaced lossily.
    ///
    /// # Errors
    ///
    /// Logs and returns `EINVAL` if `buf` contains no NUL byte.
    fn string_from_u8_slice(&self, buf: &[u8]) -> io::Result<String> {
        CStr::from_bytes_until_nul(buf)
            .map(|s| s.to_string_lossy().into_owned())
            .map_err(|e| {
                error!("fail to convert u8 slice to string: {}", e);
                io::Error::from_raw_os_error(libc::EINVAL)
            })
    }

    /// Set permission according to path
    ///
    /// Every registered entry that matches `path` is applied in registration order, so the
    /// last matching entry determines the final values in `st`.
    fn set_permission(&self, st: &mut libc::stat64, path: &str) {
        for perm_data in self
            .permission_paths
            .read()
            .expect("acquire permission_paths read lock")
            .iter()
            .filter(|pd| pd.need_set_permission(path))
        {
            st.st_uid = perm_data.guest_uid;
            st.st_gid = perm_data.guest_gid;
            st.st_mode = (st.st_mode & libc::S_IFMT) | (0o777 & !perm_data.umask);
        }
    }

    /// Set host uid/gid to configured value according to path
    ///
    /// The path is built from the parent's path and `name`. The first matching entry wins;
    /// without a match the credentials from `ctx` are returned.
    fn change_creds(&self, ctx: &Context, parent_data: &InodeData, name: &CStr) -> (u32, u32) {
        let path = format!(
            "{}/{}",
            parent_data.path,
            name.to_str().unwrap_or("<non UTF-8 str>")
        );

        self.permission_paths
            .read()
            .expect("acquire permission_paths read lock")
            .iter()
            .find(|pd| pd.need_set_permission(&path))
            .map_or((ctx.uid, ctx.gid), |pd| (pd.host_uid, pd.host_gid))
    }

    /// Reads an `FsPermissionDataBuffer` from `r` and converts it into a `PermissionData`.
    ///
    /// # Errors
    ///
    /// Returns the read error, or `EINVAL` if the path is not NUL-terminated or does not start
    /// with '/'.
    fn read_permission_data<R: io::Read>(&self, mut r: R) -> io::Result<PermissionData> {
        let mut fs_permission_data = FsPermissionDataBuffer::new_zeroed();
        r.read_exact(fs_permission_data.as_mut_bytes())?;

        let perm_path = self.string_from_u8_slice(&fs_permission_data.perm_path)?;
        if !perm_path.starts_with('/') {
            error!("FS_IOC_SETPERMISSION: perm path must start with '/'");
            return Err(io::Error::from_raw_os_error(libc::EINVAL));
        }
        Ok(PermissionData {
            guest_uid: fs_permission_data.guest_uid,
            guest_gid: fs_permission_data.guest_gid,
            host_uid: fs_permission_data.host_uid,
            host_gid: fs_permission_data.host_gid,
            umask: fs_permission_data.umask,
            perm_path,
        })
    }

    /// Sets uid/gid/umask for all files and directories under a specific path.
    ///
    /// This ioctl does not correspond to any upstream FUSE feature. It is used for arcvm.
    /// It associates the specified path with the provided uid, gid, and umask values within the
    /// filesystem metadata.
    ///
    /// During subsequent lookup operations, the stored uid/gid/umask values are retrieved and
    /// applied to all files and directories found under the registered path. Before sending
    /// file stat information to the client, the uid and gid are substituted by `guest_uid` and
    /// `guest_gid` if the file falls under the registered path. The file mode is masked by the
    /// umask.
    ///
    /// When the guest creates a file within the specified path, the file gid/uid stat in host
    /// will be overwritten to `host_uid` and `host_gid` values.
    ///
    /// This functionality enables dynamic configuration of ownership and permissions for a
    /// specific directory hierarchy within the filesystem.
    ///
    /// At most `max_dynamic_perm` entries can be registered; further requests are rejected
    /// with `EPERM`.
    ///
    /// # Notes
    /// - This method affects all existing and future files under the registered path.
    /// - The original file ownership and permissions are overridden by the provided values.
    /// - The registered path should not be renamed
    /// - Refer go/remove-mount-passthrough-fuse for more design details
    fn set_permission_by_path<R: io::Read>(&self, r: R) -> IoctlReply {
        let limit = self.cfg.max_dynamic_perm;
        let limit_reached = || {
            error!(
                "FS_IOC_SETPERMISSION exceeds limits of max_dynamic_perm: {}",
                limit
            );
            IoctlReply::Done(Err(io::Error::from_raw_os_error(libc::EPERM)))
        };

        // Reject early, before consuming the request, when the table is already full.
        if self
            .permission_paths
            .read()
            .expect("acquire permission_paths read lock")
            .len()
            >= limit
        {
            return limit_reached();
        }

        let perm_data = match self.read_permission_data(r) {
            Ok(data) => data,
            Err(e) => {
                error!("fail to read permission data: {}", e);
                return IoctlReply::Done(Err(e));
            }
        };

        // The read lock was released above, so another request may have filled the table in
        // the meantime. Check again under the write lock so the limit cannot be exceeded.
        let mut paths = self
            .permission_paths
            .write()
            .expect("acquire permission_paths write lock");
        if paths.len() >= limit {
            return limit_reached();
        }
        paths.push(perm_data);

        IoctlReply::Done(Ok(Vec::new()))
    }

    /// Get xattr value according to path and name
    ///
    /// Returns the value of the first registered entry for which `need_set_guest_xattr`
    /// holds, or `None` if there is no such entry.
    fn get_xattr_by_path(&self, path: &str, name: &str) -> Option<String> {
        self.xattr_paths
            .read()
            .expect("acquire xattr_paths read lock")
            .iter()
            .find(|data| data.need_set_guest_xattr(path, name))
            .map(|data| data.xattr_value.clone())
    }

    /// Returns `true` when a value is registered for `path` and `name`.
    fn skip_host_set_xattr(&self, path: &str, name: &str) -> bool {
        self.get_xattr_by_path(path, name).is_some()
    }

    /// Reads an `FsPathXattrDataBuffer` from `r` and converts it into an `XattrData`.
    ///
    /// # Errors
    ///
    /// Returns the read error, or `EINVAL` if one of the strings is not NUL-terminated or the
    /// path does not start with '/'.
    fn read_xattr_data<R: io::Read>(&self, mut r: R) -> io::Result<XattrData> {
        let mut fs_path_xattr_data = FsPathXattrDataBuffer::new_zeroed();
        r.read_exact(fs_path_xattr_data.as_mut_bytes())?;

        let xattr_path = self.string_from_u8_slice(&fs_path_xattr_data.path)?;
        if !xattr_path.starts_with('/') {
            error!("FS_IOC_SETPATHXATTR: xattr path must start with '/'");
            return Err(io::Error::from_raw_os_error(libc::EINVAL));
        }
        let xattr_name = self.string_from_u8_slice(&fs_path_xattr_data.xattr_name)?;
        let xattr_value = self.string_from_u8_slice(&fs_path_xattr_data.xattr_value)?;

        Ok(XattrData {
            xattr_path,
            xattr_name,
            xattr_value,
        })
    }

    /// Sets xattr value for all files and directories under a specific path.
    ///
    /// This ioctl does not correspond to any upstream FUSE feature. It is used for arcvm.
    /// It associates the specified path and xattr name with a value.
    ///
    /// When the getxattr is called for the specified path and name, the predefined
    /// value is returned.
    ///
    /// At most `max_dynamic_xattr` entries can be registered; further requests are rejected
    /// with `EPERM`.
    ///
    /// # Notes
    /// - This method affects all existing and future files under the registered path.
    /// - The SECURITY_CONTEXT feature will be disabled if this ioctl is enabled.
    /// - The registered path should not be renamed
    /// - Refer go/remove-mount-passthrough-fuse for more design details
    fn set_xattr_by_path<R: io::Read>(&self, r: R) -> IoctlReply {
        let limit = self.cfg.max_dynamic_xattr;
        let limit_reached = || {
            error!(
                "FS_IOC_SETPATHXATTR exceeds limits of max_dynamic_xattr: {}",
                limit
            );
            IoctlReply::Done(Err(io::Error::from_raw_os_error(libc::EPERM)))
        };

        // Reject early, before consuming the request, when the table is already full.
        if self
            .xattr_paths
            .read()
            .expect("acquire xattr_paths read lock")
            .len()
            >= limit
        {
            return limit_reached();
        }

        let xattr_data = match self.read_xattr_data(r) {
            Ok(data) => data,
            Err(e) => {
                error!("fail to read xattr data: {}", e);
                return IoctlReply::Done(Err(e));
            }
        };

        // The read lock was released above, so another request may have filled the table in
        // the meantime. Check again under the write lock so the limit cannot be exceeded.
        let mut paths = self
            .xattr_paths
            .write()
            .expect("acquire xattr_paths write lock");
        if paths.len() >= limit {
            return limit_reached();
        }
        paths.push(xattr_data);

        IoctlReply::Done(Ok(Vec::new()))
    }

    /// Reads the xattr `name` of `data` into `buf`, preferring a value registered by path.
    ///
    /// If a value is registered for the inode's path and `name`, it is copied into `buf`;
    /// otherwise the request is forwarded to `do_getxattr`. Returns the number of bytes
    /// written.
    ///
    /// # Errors
    ///
    /// Returns `ERANGE` if the registered value does not fit into `buf`, or the error of
    /// `do_getxattr`.
    fn do_getxattr_with_filter(
        &self,
        data: Arc<InodeData>,
        name: Cow<CStr>,
        buf: &mut [u8],
    ) -> io::Result<usize> {
        match self.get_xattr_by_path(&data.path, &name.to_string_lossy()) {
            Some(predefined_xattr) => {
                let value = predefined_xattr.into_bytes();
                if value.len() > buf.len() {
                    return Err(io::Error::from_raw_os_error(libc::ERANGE));
                }
                buf[..value.len()].copy_from_slice(&value);
                Ok(value.len())
            }
            None => self.do_getxattr(&data, &name, &mut buf[..]),
        }
    }

    /// Looks up the host uid according to the path of file that inode is referring to.
    ///
    /// Returns the `host_uid` of the first registered entry that matches the inode's path.
    /// Falls back to `ctx.uid` when the inode is unknown or no entry matches.
    fn lookup_host_uid(&self, ctx: &Context, inode: Inode) -> u32 {
        let Ok(inode_data) = self.find_inode(inode) else {
            return ctx.uid;
        };

        self.permission_paths
            .read()
            .expect("acquire permission_paths read lock")
            .iter()
            .find(|pd| pd.need_set_permission(&inode_data.path))
            .map_or(ctx.uid, |pd| pd.host_uid)
    }
}
2158
2159/// Decrements the refcount of the inode.
2160/// Returns `true` if the refcount became 0.
2161fn forget_one(
2162    inodes: &mut MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>,
2163    inode: Inode,
2164    count: u64,
2165) -> bool {
2166    if let Some(data) = inodes.get(&inode) {
2167        // Acquiring the write lock on the inode map prevents new lookups from incrementing the
2168        // refcount but there is the possibility that a previous lookup already acquired a
2169        // reference to the inode data and is in the process of updating the refcount so we need
2170        // to loop here until we can decrement successfully.
2171        loop {
2172            let refcount = data.refcount.load(Ordering::Relaxed);
2173
2174            // Saturating sub because it doesn't make sense for a refcount to go below zero and
2175            // we don't want misbehaving clients to cause integer overflow.
2176            let new_count = refcount.saturating_sub(count);
2177
2178            // Synchronizes with the acquire load in `do_lookup`.
2179            if data
2180                .refcount
2181                .compare_exchange_weak(refcount, new_count, Ordering::Release, Ordering::Relaxed)
2182                .is_ok()
2183            {
2184                if new_count == 0 {
2185                    // We just removed the last refcount for this inode. There's no need for an
2186                    // acquire fence here because we hold a write lock on the inode map and any
2187                    // thread that is waiting to do a forget on the same inode will have to wait
2188                    // until we release the lock. So there's is no other release store for us to
2189                    // synchronize with before deleting the entry.
2190                    inodes.remove(&inode);
2191                    return true;
2192                }
2193                break;
2194            }
2195        }
2196    }
2197    false
2198}
2199
2200// Strips any `user.virtiofs.` prefix from `buf`. If buf contains one or more nul-bytes, each
2201// nul-byte-separated slice is treated as a C string and the prefix is stripped from each one.
2202fn strip_xattr_prefix(buf: &mut Vec<u8>) {
2203    fn next_cstr(b: &[u8], start: usize) -> Option<&[u8]> {
2204        if start >= b.len() {
2205            return None;
2206        }
2207
2208        let end = b[start..]
2209            .iter()
2210            .position(|&c| c == b'\0')
2211            .map(|p| start + p + 1)
2212            .unwrap_or(b.len());
2213
2214        Some(&b[start..end])
2215    }
2216
2217    let mut pos = 0;
2218    while let Some(name) = next_cstr(buf, pos) {
2219        if !name.starts_with(USER_VIRTIOFS_XATTR) {
2220            pos += name.len();
2221            continue;
2222        }
2223
2224        let newlen = name.len() - USER_VIRTIOFS_XATTR.len();
2225        buf.drain(pos..pos + USER_VIRTIOFS_XATTR.len());
2226        pos += newlen;
2227    }
2228}
2229
2230impl Drop for PassthroughFs {
2231    /// The `Drop` implementation for this struct intentionally leaks all open file descriptors.
2232    /// It sets the `unsafe_leak_fd` flag on all `InodeData` and `HandleData` instances, which
2233    /// causes their `drop` implementations to forget the underlying `File` objects.
2234    ///
2235    /// This is a deliberate performance optimization for abrupt shutdowns. It relies on the
2236    /// operating system to clean up the file descriptors when the process terminates. It is
2237    /// **critical** that an instance of `PassthroughFs` is only dropped immediately prior to
2238    /// process termination.
2239    fn drop(&mut self) {
2240        let inodes = self.inodes.lock();
2241        inodes.apply(|v| {
2242            v.set_unsafe_leak_fd();
2243        });
2244        let handles = self.handles.lock();
2245        handles.values().for_each(|v| v.set_unsafe_leak_fd());
2246    }
2247}
2248
2249impl FileSystem for PassthroughFs {
2250    type Inode = Inode;
2251    type Handle = Handle;
2252    type DirIter = ReadDir<Box<[u8]>>;
2253
2254    fn init(&self, capable: FsOptions) -> io::Result<FsOptions> {
2255        let root = CString::new(self.root_dir.clone())
2256            .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
2257
2258        let flags = libc::O_DIRECTORY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
2259        // SAFETY: this doesn't modify any memory and we check the return value.
2260        let raw_descriptor = unsafe { libc::openat64(libc::AT_FDCWD, root.as_ptr(), flags) };
2261        if raw_descriptor < 0 {
2262            return Err(io::Error::last_os_error());
2263        }
2264
2265        // SAFETY: safe because we just opened this descriptor above.
2266        let f = unsafe { File::from_raw_descriptor(raw_descriptor) };
2267
2268        let st = stat(&f)?;
2269
2270        // SAFETY: this doesn't modify any memory and there is no need to check the return
2271        // value because this system call always succeeds. We need to clear the umask here because
2272        // we want the client to be able to set all the bits in the mode.
2273        unsafe { libc::umask(0o000) };
2274
2275        let mut inodes = self.inodes.lock();
2276
2277        // Not sure why the root inode gets a refcount of 2 but that's what libfuse does.
2278        inodes.insert(
2279            ROOT_ID,
2280            InodeAltKey {
2281                ino: st.st_ino,
2282                dev: st.st_dev,
2283            },
2284            Arc::new(InodeData {
2285                inode: ROOT_ID,
2286                file: Mutex::new(OpenedFile::new(f, flags)),
2287                refcount: AtomicU64::new(2),
2288                filetype: st.st_mode.into(),
2289                path: "".to_string(),
2290                unsafe_leak_fd: AtomicBool::new(false),
2291            }),
2292        );
2293
2294        let mut opts = FsOptions::DO_READDIRPLUS
2295            | FsOptions::READDIRPLUS_AUTO
2296            | FsOptions::EXPORT_SUPPORT
2297            | FsOptions::DONT_MASK
2298            | FsOptions::CACHE_SYMLINKS;
2299
2300        // Device using dynamic xattr feature will have different security context in
2301        // host and guests. The SECURITY_CONTEXT feature should not be enabled in the
2302        // device.
2303        if self.cfg.max_dynamic_xattr == 0 && self.cfg.security_ctx {
2304            opts |= FsOptions::SECURITY_CONTEXT;
2305        }
2306
2307        if self.cfg.posix_acl {
2308            opts |= FsOptions::POSIX_ACL;
2309        }
2310        if self.cfg.writeback && capable.contains(FsOptions::WRITEBACK_CACHE) {
2311            opts |= FsOptions::WRITEBACK_CACHE;
2312            self.writeback.store(true, Ordering::Relaxed);
2313        }
2314        if self.cfg.cache_policy == CachePolicy::Always {
2315            if capable.contains(FsOptions::ZERO_MESSAGE_OPEN) {
2316                opts |= FsOptions::ZERO_MESSAGE_OPEN;
2317                self.zero_message_open.store(true, Ordering::Relaxed);
2318            }
2319            if capable.contains(FsOptions::ZERO_MESSAGE_OPENDIR) {
2320                opts |= FsOptions::ZERO_MESSAGE_OPENDIR;
2321                self.zero_message_opendir.store(true, Ordering::Relaxed);
2322            }
2323        }
2324        Ok(opts)
2325    }
2326
2327    fn destroy(&self) {
2328        cros_tracing::trace_simple_print!(VirtioFs, "{:?}: destroy", self);
2329        self.handles.lock().clear();
2330        self.inodes.lock().clear();
2331    }
2332
2333    fn statfs(&self, _ctx: Context, inode: Inode) -> io::Result<libc::statvfs64> {
2334        let _trace = fs_trace!(self.tag, "statfs", inode);
2335        let data = self.find_inode(inode)?;
2336
2337        let mut out = MaybeUninit::<libc::statvfs64>::zeroed();
2338
2339        // SAFETY: this will only modify `out` and we check the return value.
2340        syscall!(unsafe { libc::fstatvfs64(data.as_raw_descriptor(), out.as_mut_ptr()) })?;
2341
2342        // SAFETY: the kernel guarantees that `out` has been initialized.
2343        Ok(unsafe { out.assume_init() })
2344    }
2345
2346    fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<Entry> {
2347        let data = self.find_inode(parent)?;
2348        #[allow(unused_variables)]
2349        let path = format!(
2350            "{}/{}",
2351            data.path,
2352            name.to_str().unwrap_or("<non UTF-8 path>")
2353        );
2354        let _trace = fs_trace!(self.tag, "lookup", parent, path);
2355
2356        let mut res = self.do_lookup_with_casefold_fallback(&data, name);
2357
2358        // FUSE takes a inode=0 as a request to do negative dentry cache.
2359        // So, if `negative_timeout` is set, return success with the timeout value and inode=0 as a
2360        // response.
2361        if let Err(e) = &res {
2362            if e.kind() == std::io::ErrorKind::NotFound && !self.cfg.negative_timeout.is_zero() {
2363                res = Ok(Entry::new_negative(self.cfg.negative_timeout));
2364            }
2365        }
2366
2367        res
2368    }
2369
2370    fn forget(&self, _ctx: Context, inode: Inode, count: u64) {
2371        let _trace = fs_trace!(self.tag, "forget", inode, count);
2372        let mut inodes = self.inodes.lock();
2373        let caches = self.lock_casefold_lookup_caches();
2374        if forget_one(&mut inodes, inode, count) {
2375            if let Some(mut c) = caches {
2376                c.forget(inode);
2377            }
2378        }
2379    }
2380
2381    fn batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>) {
2382        let mut inodes = self.inodes.lock();
2383        let mut caches = self.lock_casefold_lookup_caches();
2384        for (inode, count) in requests {
2385            if forget_one(&mut inodes, inode, count) {
2386                if let Some(c) = caches.as_mut() {
2387                    c.forget(inode);
2388                }
2389            }
2390        }
2391    }
2392
2393    fn opendir(
2394        &self,
2395        _ctx: Context,
2396        inode: Inode,
2397        flags: u32,
2398    ) -> io::Result<(Option<Handle>, OpenOptions)> {
2399        let _trace = fs_trace!(self.tag, "opendir", inode, flags);
2400        if self.zero_message_opendir.load(Ordering::Relaxed) {
2401            Err(io::Error::from_raw_os_error(libc::ENOSYS))
2402        } else {
2403            self.do_open(inode, flags | (libc::O_DIRECTORY as u32))
2404        }
2405    }
2406
2407    fn releasedir(
2408        &self,
2409        _ctx: Context,
2410        inode: Inode,
2411        _flags: u32,
2412        handle: Handle,
2413    ) -> io::Result<()> {
2414        let _trace = fs_trace!(self.tag, "releasedir", inode, handle);
2415        if self.zero_message_opendir.load(Ordering::Relaxed) {
2416            Ok(())
2417        } else {
2418            self.do_release(inode, handle)
2419        }
2420    }
2421
2422    fn mkdir(
2423        &self,
2424        ctx: Context,
2425        parent: Inode,
2426        name: &CStr,
2427        mode: u32,
2428        umask: u32,
2429        security_ctx: Option<&CStr>,
2430    ) -> io::Result<Entry> {
2431        let _trace = fs_trace!(self.tag, "mkdir", parent, name, mode, umask, security_ctx);
2432        let data = self.find_inode(parent)?;
2433
2434        let _ctx = security_ctx
2435            .filter(|ctx| *ctx != UNLABELED_CSTR)
2436            .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2437            .transpose()?;
2438
2439        #[allow(unused_variables)]
2440        #[cfg(feature = "arc_quota")]
2441        let (uid, gid) = self.change_creds(&ctx, &data, name);
2442        #[cfg(feature = "fs_runtime_ugid_map")]
2443        let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
2444        #[cfg(not(feature = "fs_permission_translation"))]
2445        let (uid, gid) = (ctx.uid, ctx.gid);
2446
2447        let (_uid, _gid) = set_creds(uid, gid)?;
2448        {
2449            let casefold_cache = self.lock_casefold_lookup_caches();
2450            let _scoped_umask = ScopedUmask::new(umask);
2451
2452            // SAFETY: this doesn't modify any memory and we check the return value.
2453            syscall!(unsafe { libc::mkdirat(data.as_raw_descriptor(), name.as_ptr(), mode) })?;
2454            if let Some(mut c) = casefold_cache {
2455                c.insert(data.inode, name);
2456            }
2457        }
2458        self.do_lookup(&data, name)
2459    }
2460
2461    fn rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
2462        let _trace = fs_trace!(self.tag, "rmdir", parent, name);
2463        let data = self.find_inode(parent)?;
2464        let casefold_cache = self.lock_casefold_lookup_caches();
2465        // TODO(b/278691962): If ascii_casefold is enabled, we need to call
2466        // `get_case_unfolded_name()` to get the actual name to be unlinked.
2467        self.do_unlink(&data, name, libc::AT_REMOVEDIR)?;
2468        if let Some(mut c) = casefold_cache {
2469            c.remove(data.inode, name);
2470        }
2471        Ok(())
2472    }
2473
2474    fn readdir(
2475        &self,
2476        _ctx: Context,
2477        inode: Inode,
2478        handle: Handle,
2479        size: u32,
2480        offset: u64,
2481    ) -> io::Result<Self::DirIter> {
2482        let _trace = fs_trace!(self.tag, "readdir", inode, handle, size, offset);
2483        let buf = vec![0; size as usize].into_boxed_slice();
2484
2485        if self.zero_message_opendir.load(Ordering::Relaxed) {
2486            let data = self.find_inode(inode)?;
2487            ReadDir::new(&*data, offset as libc::off64_t, buf)
2488        } else {
2489            let data = self.find_handle(handle, inode)?;
2490
2491            let dir = data.file.lock();
2492
2493            ReadDir::new(&*dir, offset as libc::off64_t, buf)
2494        }
2495    }
2496
2497    fn open(
2498        &self,
2499        _ctx: Context,
2500        inode: Inode,
2501        flags: u32,
2502    ) -> io::Result<(Option<Handle>, OpenOptions)> {
2503        if self.zero_message_open.load(Ordering::Relaxed) {
2504            let _trace = fs_trace!(self.tag, "open (zero-message)", inode, flags);
2505            Err(io::Error::from_raw_os_error(libc::ENOSYS))
2506        } else {
2507            let _trace = fs_trace!(self.tag, "open", inode, flags);
2508            self.do_open(inode, flags)
2509        }
2510    }
2511
2512    fn release(
2513        &self,
2514        _ctx: Context,
2515        inode: Inode,
2516        _flags: u32,
2517        handle: Handle,
2518        _flush: bool,
2519        _flock_release: bool,
2520        _lock_owner: Option<u64>,
2521    ) -> io::Result<()> {
2522        if self.zero_message_open.load(Ordering::Relaxed) {
2523            let _trace = fs_trace!(self.tag, "release (zero-message)", inode, handle);
2524            Ok(())
2525        } else {
2526            let _trace = fs_trace!(self.tag, "release", inode, handle);
2527            self.do_release(inode, handle)
2528        }
2529    }
2530
2531    fn chromeos_tmpfile(
2532        &self,
2533        ctx: Context,
2534        parent: Self::Inode,
2535        mode: u32,
2536        umask: u32,
2537        security_ctx: Option<&CStr>,
2538    ) -> io::Result<Entry> {
2539        let _trace = fs_trace!(
2540            self.tag,
2541            "chromeos_tempfile",
2542            parent,
2543            mode,
2544            umask,
2545            security_ctx
2546        );
2547        let data = self.find_inode(parent)?;
2548
2549        let _ctx = security_ctx
2550            .filter(|ctx| *ctx != UNLABELED_CSTR)
2551            .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2552            .transpose()?;
2553
2554        let tmpflags = libc::O_RDWR | libc::O_TMPFILE | libc::O_CLOEXEC | libc::O_NOFOLLOW;
2555
2556        let current_dir = c".";
2557
2558        #[allow(unused_variables)]
2559        #[cfg(feature = "arc_quota")]
2560        let (uid, gid) = self.change_creds(&ctx, &data, current_dir);
2561        #[cfg(feature = "fs_runtime_ugid_map")]
2562        let (uid, gid) = self.change_ugid_creds(&ctx, &data, current_dir);
2563        #[cfg(not(feature = "fs_permission_translation"))]
2564        let (uid, gid) = (ctx.uid, ctx.gid);
2565
2566        let (_uid, _gid) = set_creds(uid, gid)?;
2567
2568        let fd = {
2569            let _scoped_umask = ScopedUmask::new(umask);
2570
2571            // SAFETY: this doesn't modify any memory and we check the return value.
2572            syscall!(unsafe {
2573                libc::openat64(
2574                    data.as_raw_descriptor(),
2575                    current_dir.as_ptr(),
2576                    tmpflags,
2577                    mode,
2578                )
2579            })?
2580        };
2581        // No need to add casefold_cache becuase we created an anonymous file.
2582
2583        // SAFETY: safe because we just opened this fd.
2584        let tmpfile = unsafe { File::from_raw_descriptor(fd) };
2585        let st = stat(&tmpfile)?;
2586        let path = format!(
2587            "{}/{}",
2588            data.path.clone(),
2589            current_dir.to_str().unwrap_or("<non UTF-8 str>")
2590        );
2591        Ok(self.add_entry(tmpfile, st, tmpflags, path))
2592    }
2593
2594    fn create(
2595        &self,
2596        ctx: Context,
2597        parent: Inode,
2598        name: &CStr,
2599        mode: u32,
2600        flags: u32,
2601        umask: u32,
2602        security_ctx: Option<&CStr>,
2603    ) -> io::Result<(Entry, Option<Handle>, OpenOptions)> {
2604        let _trace = fs_trace!(
2605            self.tag,
2606            "create",
2607            parent,
2608            name,
2609            mode,
2610            flags,
2611            umask,
2612            security_ctx
2613        );
2614        let data = self.find_inode(parent)?;
2615
2616        let _ctx = security_ctx
2617            .filter(|ctx| *ctx != UNLABELED_CSTR)
2618            .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2619            .transpose()?;
2620
2621        #[allow(unused_variables)]
2622        #[cfg(feature = "arc_quota")]
2623        let (uid, gid) = self.change_creds(&ctx, &data, name);
2624        #[cfg(feature = "fs_runtime_ugid_map")]
2625        let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
2626        #[cfg(not(feature = "fs_permission_translation"))]
2627        let (uid, gid) = (ctx.uid, ctx.gid);
2628
2629        let (_uid, _gid) = set_creds(uid, gid)?;
2630
2631        let flags = self.update_open_flags(flags as i32);
2632        let create_flags =
2633            (flags | libc::O_CREAT | libc::O_CLOEXEC | libc::O_NOFOLLOW) & !libc::O_DIRECT;
2634
2635        let fd = {
2636            let _scoped_umask = ScopedUmask::new(umask);
2637            let casefold_cache = self.lock_casefold_lookup_caches();
2638
2639            // SAFETY: this doesn't modify any memory and we check the return value. We don't really
2640            // check `flags` because if the kernel can't handle poorly specified flags then we have
2641            // much bigger problems.
2642            // TODO(b/278691962): If ascii_casefold is enabled, we need to call
2643            // `get_case_unfolded_name()` to get the actual name to be created.
2644            let fd = syscall!(unsafe {
2645                libc::openat64(data.as_raw_descriptor(), name.as_ptr(), create_flags, mode)
2646            })?;
2647            if let Some(mut c) = casefold_cache {
2648                c.insert(parent, name);
2649            }
2650            fd
2651        };
2652
2653        // SAFETY: safe because we just opened this fd.
2654        let file = unsafe { File::from_raw_descriptor(fd) };
2655
2656        let st = stat(&file)?;
2657        let path = format!(
2658            "{}/{}",
2659            data.path.clone(),
2660            name.to_str().unwrap_or("<non UTF-8 str>")
2661        );
2662        let entry = self.add_entry(file, st, create_flags, path);
2663
2664        let (handle, opts) = if self.zero_message_open.load(Ordering::Relaxed) {
2665            (None, OpenOptions::KEEP_CACHE)
2666        } else {
2667            self.do_open_at(
2668                data,
2669                name,
2670                entry.inode,
2671                flags as u32 & !((libc::O_CREAT | libc::O_EXCL | libc::O_NOCTTY) as u32),
2672            )
2673            .inspect_err(|_e| {
2674                // Don't leak the entry.
2675                self.forget(ctx, entry.inode, 1);
2676            })?
2677        };
2678        Ok((entry, handle, opts))
2679    }
2680
2681    fn unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
2682        let _trace = fs_trace!(self.tag, "unlink", parent, name);
2683        let data = self.find_inode(parent)?;
2684        let casefold_cache = self.lock_casefold_lookup_caches();
2685        // TODO(b/278691962): If ascii_casefold is enabled, we need to call
2686        // `get_case_unfolded_name()` to get the actual name to be unlinked.
2687        self.do_unlink(&data, name, 0)?;
2688        if let Some(mut c) = casefold_cache {
2689            c.remove(data.inode, name);
2690        }
2691        Ok(())
2692    }
2693
2694    fn read<W: io::Write + ZeroCopyWriter>(
2695        &self,
2696        _ctx: Context,
2697        inode: Inode,
2698        handle: Handle,
2699        mut w: W,
2700        size: u32,
2701        offset: u64,
2702        _lock_owner: Option<u64>,
2703        _flags: u32,
2704    ) -> io::Result<usize> {
2705        if self.zero_message_open.load(Ordering::Relaxed) {
2706            let _trace = fs_trace!(self.tag, "read (zero-message)", inode, handle, size, offset);
2707            let data = self.find_inode(inode)?;
2708
2709            let mut file = data.file.lock();
2710            let mut flags = file.open_flags;
2711            match flags & libc::O_ACCMODE {
2712                libc::O_WRONLY => {
2713                    flags &= !libc::O_WRONLY;
2714                    flags |= libc::O_RDWR;
2715
2716                    // We need to get a readable handle for this file.
2717                    let newfile = self.open_fd(file.as_raw_descriptor(), libc::O_RDWR)?;
2718                    *file = OpenedFile::new(newfile, flags);
2719                }
2720                libc::O_RDONLY | libc::O_RDWR => {}
2721                _ => panic!("Unexpected flags: {flags:#x}"),
2722            }
2723
2724            w.write_from(file.file_mut(), size as usize, offset)
2725        } else {
2726            let _trace = fs_trace!(self.tag, "read", inode, handle, size, offset);
2727            let data = self.find_handle(handle, inode)?;
2728
2729            let mut f = data.file.lock();
2730            w.write_from(f.file_mut(), size as usize, offset)
2731        }
2732    }
2733
2734    fn write<R: io::Read + ZeroCopyReader>(
2735        &self,
2736        _ctx: Context,
2737        inode: Inode,
2738        handle: Handle,
2739        mut r: R,
2740        size: u32,
2741        offset: u64,
2742        _lock_owner: Option<u64>,
2743        _delayed_write: bool,
2744        flags: u32,
2745    ) -> io::Result<usize> {
2746        // When the WRITE_KILL_PRIV flag is set, drop CAP_FSETID so that the kernel will
2747        // automatically clear the setuid and setgid bits for us.
2748        let _fsetid = if flags & WRITE_KILL_PRIV != 0 {
2749            Some(drop_cap_fsetid()?)
2750        } else {
2751            None
2752        };
2753
2754        if self.zero_message_open.load(Ordering::Relaxed) {
2755            let _trace = fs_trace!(
2756                self.tag,
2757                "write (zero-message)",
2758                inode,
2759                handle,
2760                size,
2761                offset
2762            );
2763
2764            let data = self.find_inode(inode)?;
2765
2766            let mut file = data.file.lock();
2767            let mut flags = file.open_flags;
2768            match flags & libc::O_ACCMODE {
2769                libc::O_RDONLY => {
2770                    flags &= !libc::O_RDONLY;
2771                    flags |= libc::O_RDWR;
2772
2773                    // We need to get a writable handle for this file.
2774                    let newfile = self.open_fd(file.as_raw_descriptor(), libc::O_RDWR)?;
2775                    *file = OpenedFile::new(newfile, flags);
2776                }
2777                libc::O_WRONLY | libc::O_RDWR => {}
2778                _ => panic!("Unexpected flags: {flags:#x}"),
2779            }
2780
2781            r.read_to(file.file_mut(), size as usize, offset)
2782        } else {
2783            let _trace = fs_trace!(self.tag, "write", inode, handle, size, offset);
2784
2785            let data = self.find_handle(handle, inode)?;
2786
2787            let mut f = data.file.lock();
2788            r.read_to(f.file_mut(), size as usize, offset)
2789        }
2790    }
2791
2792    fn getattr(
2793        &self,
2794        _ctx: Context,
2795        inode: Inode,
2796        _handle: Option<Handle>,
2797    ) -> io::Result<(libc::stat64, Duration)> {
2798        let _trace = fs_trace!(self.tag, "getattr", inode, _handle);
2799
2800        let data = self.find_inode(inode)?;
2801        self.do_getattr(&data)
2802    }
2803
2804    fn setattr(
2805        &self,
2806        _ctx: Context,
2807        inode: Inode,
2808        attr: libc::stat64,
2809        handle: Option<Handle>,
2810        valid: SetattrValid,
2811    ) -> io::Result<(libc::stat64, Duration)> {
2812        let _trace = fs_trace!(self.tag, "setattr", inode, handle);
2813        let inode_data = self.find_inode(inode)?;
2814
2815        enum Data<'a> {
2816            Handle(MutexGuard<'a, OpenedFile>),
2817            ProcPath(CString),
2818        }
2819
2820        // If we have a handle then use it otherwise get a new fd from the inode.
2821        let hd;
2822        let data = if let Some(handle) = handle.filter(|&h| h != 0) {
2823            hd = self.find_handle(handle, inode)?;
2824            Data::Handle(hd.file.lock())
2825        } else {
2826            let pathname = CString::new(format!("self/fd/{}", inode_data.as_raw_descriptor()))
2827                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2828            Data::ProcPath(pathname)
2829        };
2830
2831        if valid.contains(SetattrValid::MODE) {
2832            // SAFETY: this doesn't modify any memory and we check the return value.
2833            syscall!(unsafe {
2834                match data {
2835                    Data::Handle(ref fd) => libc::fchmod(fd.as_raw_descriptor(), attr.st_mode),
2836                    Data::ProcPath(ref p) => {
2837                        libc::fchmodat(self.proc.as_raw_descriptor(), p.as_ptr(), attr.st_mode, 0)
2838                    }
2839                }
2840            })?;
2841        }
2842
2843        if valid.intersects(SetattrValid::UID | SetattrValid::GID) {
2844            let uid = if valid.contains(SetattrValid::UID) {
2845                attr.st_uid
2846            } else {
2847                // Cannot use -1 here because these are unsigned values.
2848                u32::MAX
2849            };
2850            let gid = if valid.contains(SetattrValid::GID) {
2851                attr.st_gid
2852            } else {
2853                // Cannot use -1 here because these are unsigned values.
2854                u32::MAX
2855            };
2856
2857            // SAFETY: this doesn't modify any memory and we check the return value.
2858            syscall!(unsafe {
2859                libc::fchownat(
2860                    inode_data.as_raw_descriptor(),
2861                    EMPTY_CSTR.as_ptr(),
2862                    uid,
2863                    gid,
2864                    libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
2865                )
2866            })?;
2867        }
2868
2869        if valid.contains(SetattrValid::SIZE) {
2870            syscall!(match data {
2871                Data::Handle(ref fd) => {
2872                    // SAFETY: this doesn't modify any memory and we check the return value.
2873                    unsafe { libc::ftruncate64(fd.as_raw_descriptor(), attr.st_size) }
2874                }
2875                _ => {
2876                    // There is no `ftruncateat` so we need to get a new fd and truncate it.
2877                    let f = self.open_inode(&inode_data, libc::O_NONBLOCK | libc::O_RDWR)?;
2878                    // SAFETY: this doesn't modify any memory and we check the return value.
2879                    unsafe { libc::ftruncate64(f.as_raw_descriptor(), attr.st_size) }
2880                }
2881            })?;
2882        }
2883
2884        if valid.intersects(SetattrValid::ATIME | SetattrValid::MTIME) {
2885            let mut tvs = [
2886                libc::timespec {
2887                    tv_sec: 0,
2888                    tv_nsec: libc::UTIME_OMIT,
2889                },
2890                libc::timespec {
2891                    tv_sec: 0,
2892                    tv_nsec: libc::UTIME_OMIT,
2893                },
2894            ];
2895
2896            if valid.contains(SetattrValid::ATIME_NOW) {
2897                tvs[0].tv_nsec = libc::UTIME_NOW;
2898            } else if valid.contains(SetattrValid::ATIME) {
2899                tvs[0].tv_sec = attr.st_atime;
2900                tvs[0].tv_nsec = attr.st_atime_nsec;
2901            }
2902
2903            if valid.contains(SetattrValid::MTIME_NOW) {
2904                tvs[1].tv_nsec = libc::UTIME_NOW;
2905            } else if valid.contains(SetattrValid::MTIME) {
2906                tvs[1].tv_sec = attr.st_mtime;
2907                tvs[1].tv_nsec = attr.st_mtime_nsec;
2908            }
2909
2910            // SAFETY: this doesn't modify any memory and we check the return value.
2911            syscall!(unsafe {
2912                match data {
2913                    Data::Handle(ref fd) => libc::futimens(fd.as_raw_descriptor(), tvs.as_ptr()),
2914                    Data::ProcPath(ref p) => {
2915                        libc::utimensat(self.proc.as_raw_descriptor(), p.as_ptr(), tvs.as_ptr(), 0)
2916                    }
2917                }
2918            })?;
2919        }
2920
2921        self.do_getattr(&inode_data)
2922    }
2923
2924    fn rename(
2925        &self,
2926        _ctx: Context,
2927        olddir: Inode,
2928        oldname: &CStr,
2929        newdir: Inode,
2930        newname: &CStr,
2931        flags: u32,
2932    ) -> io::Result<()> {
2933        let _trace = fs_trace!(self.tag, "rename", olddir, oldname, newdir, newname, flags);
2934
2935        let old_inode = self.find_inode(olddir)?;
2936        let new_inode = self.find_inode(newdir)?;
2937        {
2938            let casefold_cache = self.lock_casefold_lookup_caches();
2939
2940            // SAFETY: this doesn't modify any memory and we check the return value.
2941            // TODO: Switch to libc::renameat2 once https://github.com/rust-lang/libc/pull/1508 lands
2942            // and we have glibc 2.28.
2943            syscall!(unsafe {
2944                libc::syscall(
2945                    libc::SYS_renameat2,
2946                    old_inode.as_raw_descriptor(),
2947                    oldname.as_ptr(),
2948                    new_inode.as_raw_descriptor(),
2949                    newname.as_ptr(),
2950                    flags,
2951                )
2952            })?;
2953            if let Some(mut c) = casefold_cache {
2954                c.remove(olddir, oldname);
2955                c.insert(newdir, newname);
2956            }
2957        }
2958
2959        Ok(())
2960    }
2961
2962    fn mknod(
2963        &self,
2964        ctx: Context,
2965        parent: Inode,
2966        name: &CStr,
2967        mode: u32,
2968        rdev: u32,
2969        umask: u32,
2970        security_ctx: Option<&CStr>,
2971    ) -> io::Result<Entry> {
2972        let _trace = fs_trace!(
2973            self.tag,
2974            "mknod",
2975            parent,
2976            name,
2977            mode,
2978            rdev,
2979            umask,
2980            security_ctx
2981        );
2982        let data = self.find_inode(parent)?;
2983
2984        let _ctx = security_ctx
2985            .filter(|ctx| *ctx != UNLABELED_CSTR)
2986            .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2987            .transpose()?;
2988
2989        #[allow(unused_variables)]
2990        #[cfg(feature = "arc_quota")]
2991        let (uid, gid) = self.change_creds(&ctx, &data, name);
2992        #[cfg(feature = "fs_runtime_ugid_map")]
2993        let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
2994        #[cfg(not(feature = "fs_permission_translation"))]
2995        let (uid, gid) = (ctx.uid, ctx.gid);
2996
2997        let (_uid, _gid) = set_creds(uid, gid)?;
2998        {
2999            let _scoped_umask = ScopedUmask::new(umask);
3000            let casefold_cache = self.lock_casefold_lookup_caches();
3001
3002            // SAFETY: this doesn't modify any memory and we check the return value.
3003            syscall!(unsafe {
3004                libc::mknodat(
3005                    data.as_raw_descriptor(),
3006                    name.as_ptr(),
3007                    mode as libc::mode_t,
3008                    rdev as libc::dev_t,
3009                )
3010            })?;
3011            if let Some(mut c) = casefold_cache {
3012                c.insert(parent, name);
3013            }
3014        }
3015
3016        self.do_lookup(&data, name)
3017    }
3018
3019    fn link(
3020        &self,
3021        _ctx: Context,
3022        inode: Inode,
3023        newparent: Inode,
3024        newname: &CStr,
3025    ) -> io::Result<Entry> {
3026        let _trace = fs_trace!(self.tag, "link", inode, newparent, newname);
3027        let data = self.find_inode(inode)?;
3028        let new_inode = self.find_inode(newparent)?;
3029
3030        let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
3031            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
3032
3033        {
3034            let casefold_cache = self.lock_casefold_lookup_caches();
3035            // SAFETY: this doesn't modify any memory and we check the return value.
3036            syscall!(unsafe {
3037                libc::linkat(
3038                    self.proc.as_raw_descriptor(),
3039                    path.as_ptr(),
3040                    new_inode.as_raw_descriptor(),
3041                    newname.as_ptr(),
3042                    libc::AT_SYMLINK_FOLLOW,
3043                )
3044            })?;
3045            if let Some(mut c) = casefold_cache {
3046                c.insert(newparent, newname);
3047            }
3048        }
3049
3050        self.do_lookup(&new_inode, newname)
3051    }
3052
3053    fn symlink(
3054        &self,
3055        ctx: Context,
3056        linkname: &CStr,
3057        parent: Inode,
3058        name: &CStr,
3059        security_ctx: Option<&CStr>,
3060    ) -> io::Result<Entry> {
3061        let _trace = fs_trace!(self.tag, "symlink", parent, linkname, name, security_ctx);
3062        let data = self.find_inode(parent)?;
3063
3064        let _ctx = security_ctx
3065            .filter(|ctx| *ctx != UNLABELED_CSTR)
3066            .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
3067            .transpose()?;
3068
3069        #[allow(unused_variables)]
3070        #[cfg(feature = "arc_quota")]
3071        let (uid, gid) = self.change_creds(&ctx, &data, name);
3072        #[cfg(feature = "fs_runtime_ugid_map")]
3073        let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
3074        #[cfg(not(feature = "fs_permission_translation"))]
3075        let (uid, gid) = (ctx.uid, ctx.gid);
3076
3077        let (_uid, _gid) = set_creds(uid, gid)?;
3078        {
3079            let casefold_cache = self.lock_casefold_lookup_caches();
3080            // SAFETY: this doesn't modify any memory and we check the return value.
3081            syscall!(unsafe {
3082                libc::symlinkat(linkname.as_ptr(), data.as_raw_descriptor(), name.as_ptr())
3083            })?;
3084            if let Some(mut c) = casefold_cache {
3085                c.insert(parent, name);
3086            }
3087        }
3088
3089        self.do_lookup(&data, name)
3090    }
3091
3092    fn readlink(&self, _ctx: Context, inode: Inode) -> io::Result<Vec<u8>> {
3093        let _trace = fs_trace!(self.tag, "readlink", inode);
3094        let data = self.find_inode(inode)?;
3095
3096        let mut buf = vec![0; libc::PATH_MAX as usize];
3097
3098        // SAFETY: this will only modify the contents of `buf` and we check the return value.
3099        let res = syscall!(unsafe {
3100            libc::readlinkat(
3101                data.as_raw_descriptor(),
3102                EMPTY_CSTR.as_ptr(),
3103                buf.as_mut_ptr() as *mut libc::c_char,
3104                buf.len(),
3105            )
3106        })?;
3107
3108        buf.resize(res as usize, 0);
3109
3110        #[cfg(feature = "fs_runtime_ugid_map")]
3111        {
3112            let link_target = Path::new(OsStr::from_bytes(&buf[..res as usize]));
3113            if !link_target.starts_with(&self.root_dir) {
3114                return Err(io::Error::new(
3115                    io::ErrorKind::InvalidInput,
3116                    "Symbolic link points outside of root_dir",
3117                ));
3118            }
3119        }
3120        Ok(buf)
3121    }
3122
3123    fn flush(
3124        &self,
3125        _ctx: Context,
3126        inode: Inode,
3127        handle: Handle,
3128        _lock_owner: u64,
3129    ) -> io::Result<()> {
3130        let _trace = fs_trace!(self.tag, "flush", inode, handle);
3131        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
3132            self.find_inode(inode)?
3133        } else {
3134            self.find_handle(handle, inode)?
3135        };
3136
3137        // SAFETY:
3138        // Since this method is called whenever an fd is closed in the client, we can emulate that
3139        // behavior by doing the same thing (dup-ing the fd and then immediately closing it). Safe
3140        // because this doesn't modify any memory and we check the return values.
3141        unsafe {
3142            let newfd = syscall!(libc::fcntl(
3143                data.as_raw_descriptor(),
3144                libc::F_DUPFD_CLOEXEC,
3145                0
3146            ))?;
3147
3148            syscall!(libc::close(newfd))?;
3149        }
3150        Ok(())
3151    }
3152
3153    fn fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()> {
3154        if self.zero_message_open.load(Ordering::Relaxed) {
3155            let _trace = fs_trace!(self.tag, "fsync (zero-message)", inode, datasync, handle);
3156            let data = self.find_inode(inode)?;
3157            self.do_fsync(&*data, datasync)
3158        } else {
3159            let _trace = fs_trace!(self.tag, "fsync", inode, datasync, handle);
3160            let data = self.find_handle(handle, inode)?;
3161
3162            let file = data.file.lock();
3163            self.do_fsync(&*file, datasync)
3164        }
3165    }
3166
3167    fn fsyncdir(
3168        &self,
3169        _ctx: Context,
3170        inode: Inode,
3171        datasync: bool,
3172        handle: Handle,
3173    ) -> io::Result<()> {
3174        if self.zero_message_opendir.load(Ordering::Relaxed) {
3175            let _trace = fs_trace!(self.tag, "fsyncdir (zero-message)", inode, datasync, handle);
3176            let data = self.find_inode(inode)?;
3177            self.do_fsync(&*data, datasync)
3178        } else {
3179            let _trace = fs_trace!(self.tag, "fsyncdir", inode, datasync, handle);
3180            let data = self.find_handle(handle, inode)?;
3181
3182            let file = data.file.lock();
3183            self.do_fsync(&*file, datasync)
3184        }
3185    }
3186
3187    fn access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()> {
3188        let _trace = fs_trace!(self.tag, "access", inode, mask);
3189        let data = self.find_inode(inode)?;
3190
3191        let st = stat(&*data)?;
3192        let mode = mask as i32 & (libc::R_OK | libc::W_OK | libc::X_OK);
3193
3194        if mode == libc::F_OK {
3195            // The file exists since we were able to call `stat(2)` on it.
3196            return Ok(());
3197        }
3198
3199        if (mode & libc::R_OK) != 0 {
3200            if ctx.uid != 0
3201                && (st.st_uid != ctx.uid || st.st_mode & 0o400 == 0)
3202                && (st.st_gid != ctx.gid || st.st_mode & 0o040 == 0)
3203                && st.st_mode & 0o004 == 0
3204            {
3205                return Err(io::Error::from_raw_os_error(libc::EACCES));
3206            }
3207        }
3208
3209        if (mode & libc::W_OK) != 0 {
3210            if ctx.uid != 0
3211                && (st.st_uid != ctx.uid || st.st_mode & 0o200 == 0)
3212                && (st.st_gid != ctx.gid || st.st_mode & 0o020 == 0)
3213                && st.st_mode & 0o002 == 0
3214            {
3215                return Err(io::Error::from_raw_os_error(libc::EACCES));
3216            }
3217        }
3218
3219        // root can only execute something if it is executable by one of the owner, the group, or
3220        // everyone.
3221        if (mode & libc::X_OK) != 0 {
3222            if (ctx.uid != 0 || st.st_mode & 0o111 == 0)
3223                && (st.st_uid != ctx.uid || st.st_mode & 0o100 == 0)
3224                && (st.st_gid != ctx.gid || st.st_mode & 0o010 == 0)
3225                && st.st_mode & 0o001 == 0
3226            {
3227                return Err(io::Error::from_raw_os_error(libc::EACCES));
3228            }
3229        }
3230
3231        Ok(())
3232    }
3233
3234    fn setxattr(
3235        &self,
3236        _ctx: Context,
3237        inode: Inode,
3238        name: &CStr,
3239        value: &[u8],
3240        flags: u32,
3241    ) -> io::Result<()> {
3242        let _trace = fs_trace!(self.tag, "setxattr", inode, name, flags);
3243        // We can't allow the VM to set this xattr because an unprivileged process may use it to set
3244        // a privileged xattr.
3245        if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
3246            return Err(io::Error::from_raw_os_error(libc::EPERM));
3247        }
3248
3249        let data = self.find_inode(inode)?;
3250        let name = self.rewrite_xattr_name(name);
3251
3252        #[cfg(feature = "arc_quota")]
3253        if self.skip_host_set_xattr(&data.path, &name.to_string_lossy()) {
3254            debug!(
3255                "ignore setxattr for path:{} xattr_name:{}",
3256                &data.path,
3257                &name.to_string_lossy()
3258            );
3259            return Ok(());
3260        }
3261
3262        let file = data.file.lock();
3263        let o_path_file = (file.open_flags & libc::O_PATH) != 0;
3264        if o_path_file {
3265            // For FDs opened with `O_PATH`, we cannot call `fsetxattr` normally. Instead we emulate
3266            // an _at syscall by changing the CWD to /proc, running the path based syscall, and then
3267            // setting the CWD back to the root directory.
3268            let path = CString::new(format!("self/fd/{}", file.as_raw_descriptor()))
3269                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
3270
3271            syscall!(self.with_proc_chdir(|| {
3272                // SAFETY: this doesn't modify any memory and we check the return value.
3273                unsafe {
3274                    libc::setxattr(
3275                        path.as_ptr(),
3276                        name.as_ptr(),
3277                        value.as_ptr() as *const libc::c_void,
3278                        value.len() as libc::size_t,
3279                        flags as c_int,
3280                    )
3281                }
3282            }))?;
3283        } else {
3284            syscall!(
3285                // For regular files and directories, we can just use fsetxattr.
3286                // SAFETY: this doesn't modify any memory and we check the return value.
3287                unsafe {
3288                    libc::fsetxattr(
3289                        file.as_raw_descriptor(),
3290                        name.as_ptr(),
3291                        value.as_ptr() as *const libc::c_void,
3292                        value.len() as libc::size_t,
3293                        flags as c_int,
3294                    )
3295                }
3296            )?;
3297        }
3298
3299        Ok(())
3300    }
3301
3302    fn getxattr(
3303        &self,
3304        _ctx: Context,
3305        inode: Inode,
3306        name: &CStr,
3307        size: u32,
3308    ) -> io::Result<GetxattrReply> {
3309        let _trace = fs_trace!(self.tag, "getxattr", inode, name, size);
3310        // We don't allow the VM to set this xattr so we also pretend there is no value associated
3311        // with it.
3312        if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
3313            return Err(io::Error::from_raw_os_error(libc::ENODATA));
3314        }
3315
3316        let data = self.find_inode(inode)?;
3317        let name = self.rewrite_xattr_name(name);
3318        let mut buf = vec![0u8; size as usize];
3319
3320        #[cfg(feature = "arc_quota")]
3321        let res = self.do_getxattr_with_filter(data, name, &mut buf)?;
3322
3323        #[cfg(not(feature = "arc_quota"))]
3324        let res = self.do_getxattr(&data, &name, &mut buf[..])?;
3325
3326        if size == 0 {
3327            Ok(GetxattrReply::Count(res as u32))
3328        } else {
3329            buf.truncate(res);
3330            Ok(GetxattrReply::Value(buf))
3331        }
3332    }
3333
3334    fn listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result<ListxattrReply> {
3335        let _trace = fs_trace!(self.tag, "listxattr", inode, size);
3336        let data = self.find_inode(inode)?;
3337
3338        let mut buf = vec![0u8; size as usize];
3339
3340        let file = data.file.lock();
3341        let o_path_file = (file.open_flags & libc::O_PATH) != 0;
3342        let res = if o_path_file {
3343            // For FDs opened with `O_PATH`, we cannot call `flistxattr` normally. Instead we
3344            // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
3345            // and then setting the CWD back to the root directory.
3346            let path = CString::new(format!("self/fd/{}", file.as_raw_descriptor()))
3347                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
3348
3349            // SAFETY: this will only modify `buf` and we check the return value.
3350            syscall!(self.with_proc_chdir(|| unsafe {
3351                libc::listxattr(
3352                    path.as_ptr(),
3353                    buf.as_mut_ptr() as *mut libc::c_char,
3354                    buf.len() as libc::size_t,
3355                )
3356            }))?
3357        } else {
3358            // For regular files and directories, we can just flistxattr.
3359            // SAFETY: this will only write to `buf` and we check the return value.
3360            syscall!(unsafe {
3361                libc::flistxattr(
3362                    file.as_raw_descriptor(),
3363                    buf.as_mut_ptr() as *mut libc::c_char,
3364                    buf.len() as libc::size_t,
3365                )
3366            })?
3367        };
3368
3369        if size == 0 {
3370            Ok(ListxattrReply::Count(res as u32))
3371        } else {
3372            buf.truncate(res as usize);
3373
3374            if self.cfg.rewrite_security_xattrs {
3375                strip_xattr_prefix(&mut buf);
3376            }
3377            Ok(ListxattrReply::Names(buf))
3378        }
3379    }
3380
3381    fn removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()> {
3382        let _trace = fs_trace!(self.tag, "removexattr", inode, name);
3383        // We don't allow the VM to set this xattr so we also pretend there is no value associated
3384        // with it.
3385        if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
3386            return Err(io::Error::from_raw_os_error(libc::ENODATA));
3387        }
3388
3389        let data = self.find_inode(inode)?;
3390        let name = self.rewrite_xattr_name(name);
3391
3392        let file = data.file.lock();
3393        let o_path_file = (file.open_flags & libc::O_PATH) != 0;
3394        if o_path_file {
3395            // For files opened with `O_PATH`, we cannot call `fremovexattr` normally. Instead we
3396            // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
3397            // and then setting the CWD back to the root directory.
3398            let path = CString::new(format!("self/fd/{}", file.as_raw_descriptor()))
3399                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
3400
3401            syscall!(self.with_proc_chdir(||
3402                    // SAFETY: this doesn't modify any memory and we check the return value.
3403                    unsafe { libc::removexattr(path.as_ptr(), name.as_ptr()) }))?;
3404        } else {
3405            // For regular files and directories, we can just use fremovexattr.
3406            syscall!(
3407                // SAFETY: this doesn't modify any memory and we check the return value.
3408                unsafe { libc::fremovexattr(file.as_raw_descriptor(), name.as_ptr()) }
3409            )?;
3410        }
3411
3412        Ok(())
3413    }
3414
3415    fn fallocate(
3416        &self,
3417        _ctx: Context,
3418        inode: Inode,
3419        handle: Handle,
3420        mode: u32,
3421        offset: u64,
3422        length: u64,
3423    ) -> io::Result<()> {
3424        let _trace = fs_trace!(self.tag, "fallocate", inode, handle, mode, offset, length);
3425
3426        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
3427            let data = self.find_inode(inode)?;
3428
3429            {
3430                // fallocate needs a writable fd
3431                let mut file = data.file.lock();
3432                let mut flags = file.open_flags;
3433                match flags & libc::O_ACCMODE {
3434                    libc::O_RDONLY => {
3435                        flags &= !libc::O_RDONLY;
3436                        flags |= libc::O_RDWR;
3437
3438                        // We need to get a writable handle for this file.
3439                        let newfile = self.open_fd(file.as_raw_descriptor(), libc::O_RDWR)?;
3440                        *file = OpenedFile::new(newfile, flags);
3441                    }
3442                    libc::O_WRONLY | libc::O_RDWR => {}
3443                    _ => panic!("Unexpected flags: {flags:#x}"),
3444                }
3445            }
3446
3447            data
3448        } else {
3449            self.find_handle(handle, inode)?
3450        };
3451
3452        let fd = data.as_raw_descriptor();
3453        // SAFETY: this doesn't modify any memory and we check the return value.
3454        syscall!(unsafe {
3455            libc::fallocate64(
3456                fd,
3457                mode as libc::c_int,
3458                offset as libc::off64_t,
3459                length as libc::off64_t,
3460            )
3461        })?;
3462
3463        Ok(())
3464    }
3465
3466    #[allow(clippy::unnecessary_cast)]
3467    fn ioctl<R: io::Read>(
3468        &self,
3469        ctx: Context,
3470        inode: Inode,
3471        handle: Handle,
3472        _flags: IoctlFlags,
3473        cmd: u32,
3474        _arg: u64,
3475        in_size: u32,
3476        out_size: u32,
3477        r: R,
3478    ) -> io::Result<IoctlReply> {
3479        let _trace = fs_trace!(self.tag, "ioctl", inode, handle, cmd, in_size, out_size);
3480
3481        match cmd as IoctlNr {
3482            FS_IOC_GET_ENCRYPTION_POLICY_EX => self.get_encryption_policy_ex(inode, handle, r),
3483            FS_IOC_FSGETXATTR => {
3484                if out_size < size_of::<fsxattr>() as u32 {
3485                    Err(io::Error::from_raw_os_error(libc::ENOMEM))
3486                } else {
3487                    self.get_fsxattr(inode, handle)
3488                }
3489            }
3490            FS_IOC_FSSETXATTR => {
3491                if in_size < size_of::<fsxattr>() as u32 {
3492                    Err(io::Error::from_raw_os_error(libc::EINVAL))
3493                } else {
3494                    self.set_fsxattr(ctx, inode, handle, r)
3495                }
3496            }
3497            FS_IOC32_GETFLAGS | FS_IOC64_GETFLAGS => {
3498                if out_size < size_of::<c_int>() as u32 {
3499                    Err(io::Error::from_raw_os_error(libc::ENOMEM))
3500                } else {
3501                    self.get_flags(inode, handle)
3502                }
3503            }
3504            FS_IOC32_SETFLAGS | FS_IOC64_SETFLAGS => {
3505                if in_size < size_of::<c_int>() as u32 {
3506                    Err(io::Error::from_raw_os_error(libc::ENOMEM))
3507                } else {
3508                    self.set_flags(ctx, inode, handle, r)
3509                }
3510            }
3511            FS_IOC_ENABLE_VERITY => {
3512                if in_size < size_of::<fsverity_enable_arg>() as u32 {
3513                    Err(io::Error::from_raw_os_error(libc::ENOMEM))
3514                } else {
3515                    self.enable_verity(inode, handle, r)
3516                }
3517            }
3518            FS_IOC_MEASURE_VERITY => {
3519                if in_size < size_of::<fsverity_digest>() as u32
3520                    || out_size < size_of::<fsverity_digest>() as u32
3521                {
3522                    Err(io::Error::from_raw_os_error(libc::ENOMEM))
3523                } else {
3524                    self.measure_verity(inode, handle, r, out_size)
3525                }
3526            }
3527            // The following is ARCVM-specific ioctl
3528            // Refer go/remove-mount-passthrough-fuse for more design details
3529            #[cfg(feature = "arc_quota")]
3530            FS_IOC_SETPERMISSION => {
3531                if in_size != size_of::<FsPermissionDataBuffer>() as u32 {
3532                    Err(io::Error::from_raw_os_error(libc::EINVAL))
3533                } else {
3534                    Ok(self.set_permission_by_path(r))
3535                }
3536            }
3537            #[cfg(feature = "arc_quota")]
3538            FS_IOC_SETPATHXATTR => {
3539                if in_size != size_of::<FsPathXattrDataBuffer>() as u32 {
3540                    Err(io::Error::from_raw_os_error(libc::EINVAL))
3541                } else {
3542                    Ok(self.set_xattr_by_path(r))
3543                }
3544            }
3545            _ => Err(io::Error::from_raw_os_error(libc::ENOTTY)),
3546        }
3547    }
3548
3549    fn copy_file_range(
3550        &self,
3551        ctx: Context,
3552        inode_src: Inode,
3553        handle_src: Handle,
3554        offset_src: u64,
3555        inode_dst: Inode,
3556        handle_dst: Handle,
3557        offset_dst: u64,
3558        length: u64,
3559        flags: u64,
3560    ) -> io::Result<usize> {
3561        let _trace = fs_trace!(
3562            self.tag,
3563            "copy_file_range",
3564            inode_src,
3565            handle_src,
3566            offset_src,
3567            inode_dst,
3568            handle_dst,
3569            offset_dst,
3570            length,
3571            flags
3572        );
3573        // We need to change credentials during a write so that the kernel will remove setuid or
3574        // setgid bits from the file if it was written to by someone other than the owner.
3575        let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
3576        let (src_data, dst_data): (Arc<dyn AsRawDescriptor>, Arc<dyn AsRawDescriptor>) =
3577            if self.zero_message_open.load(Ordering::Relaxed) {
3578                (self.find_inode(inode_src)?, self.find_inode(inode_dst)?)
3579            } else {
3580                (
3581                    self.find_handle(handle_src, inode_src)?,
3582                    self.find_handle(handle_dst, inode_dst)?,
3583                )
3584            };
3585
3586        let src = src_data.as_raw_descriptor();
3587        let dst = dst_data.as_raw_descriptor();
3588
3589        Ok(syscall!(
3590            // SAFETY: this call is safe because it doesn't modify any memory and we
3591            // check the return value.
3592            unsafe {
3593                libc::syscall(
3594                    libc::SYS_copy_file_range,
3595                    src,
3596                    &offset_src,
3597                    dst,
3598                    &offset_dst,
3599                    length,
3600                    flags,
3601                )
3602            }
3603        )? as usize)
3604    }
3605
3606    fn set_up_mapping<M: Mapper>(
3607        &self,
3608        _ctx: Context,
3609        inode: Self::Inode,
3610        _handle: Self::Handle,
3611        file_offset: u64,
3612        mem_offset: u64,
3613        size: usize,
3614        prot: u32,
3615        mapper: M,
3616    ) -> io::Result<()> {
3617        let _trace = fs_trace!(
3618            self.tag,
3619            "set_up_mapping",
3620            inode,
3621            file_offset,
3622            mem_offset,
3623            size,
3624            prot
3625        );
3626        if !self.cfg.use_dax {
3627            return Err(io::Error::from_raw_os_error(libc::ENOSYS));
3628        }
3629
3630        let read = prot & libc::PROT_READ as u32 != 0;
3631        let write = prot & libc::PROT_WRITE as u32 != 0;
3632        let (mmap_flags, prot) = match (read, write) {
3633            (true, true) => (libc::O_RDWR, Protection::read_write()),
3634            (true, false) => (libc::O_RDONLY, Protection::read()),
3635            // Write-only is mapped to O_RDWR since mmap always requires an fd opened for reading.
3636            (false, true) => (libc::O_RDWR, Protection::write()),
3637            (false, false) => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
3638        };
3639
3640        let data = self.find_inode(inode)?;
3641
3642        if self.zero_message_open.load(Ordering::Relaxed) {
3643            let mut file = data.file.lock();
3644            let mut open_flags = file.open_flags;
3645            match (mmap_flags, open_flags & libc::O_ACCMODE) {
3646                (libc::O_RDONLY, libc::O_WRONLY)
3647                | (libc::O_RDWR, libc::O_RDONLY)
3648                | (libc::O_RDWR, libc::O_WRONLY) => {
3649                    // We have a read-only or write-only fd and we need to upgrade it.
3650                    open_flags &= !libc::O_ACCMODE;
3651                    open_flags |= libc::O_RDWR;
3652
3653                    let newfile = self.open_fd(file.as_raw_descriptor(), libc::O_RDWR)?;
3654                    *file = OpenedFile::new(newfile, open_flags);
3655                }
3656                (libc::O_RDONLY, libc::O_RDONLY)
3657                | (libc::O_RDONLY, libc::O_RDWR)
3658                | (libc::O_RDWR, libc::O_RDWR) => {}
3659                (m, o) => panic!("Unexpected combination of access flags: ({m:#x}, {o:#x})"),
3660            }
3661            mapper.map(mem_offset, size, file.file(), file_offset, prot)
3662        } else {
3663            let file = self.open_inode(&data, mmap_flags | libc::O_NONBLOCK)?;
3664            mapper.map(mem_offset, size, &file, file_offset, prot)
3665        }
3666    }
3667
3668    fn remove_mapping<M: Mapper>(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()> {
3669        let _trace = fs_trace!(self.tag, "remove_mapping", msgs);
3670        if !self.cfg.use_dax {
3671            return Err(io::Error::from_raw_os_error(libc::ENOSYS));
3672        }
3673
3674        for RemoveMappingOne { moffset, len } in msgs {
3675            mapper.unmap(*moffset, *len)?;
3676        }
3677        Ok(())
3678    }
3679
3680    fn atomic_open(
3681        &self,
3682        ctx: Context,
3683        parent: Self::Inode,
3684        name: &CStr,
3685        mode: u32,
3686        flags: u32,
3687        umask: u32,
3688        security_ctx: Option<&CStr>,
3689    ) -> io::Result<(Entry, Option<Self::Handle>, OpenOptions)> {
3690        let _trace = fs_trace!(
3691            self.tag,
3692            "atomic_open",
3693            parent,
3694            name,
3695            mode,
3696            flags,
3697            umask,
3698            security_ctx
3699        );
3700        // Perform lookup but not create negative dentry
3701        let data = self.find_inode(parent)?;
3702
3703        #[allow(unused_variables)]
3704        #[cfg(feature = "arc_quota")]
3705        let (uid, gid) = self.change_creds(&ctx, &data, name);
3706        #[cfg(feature = "fs_runtime_ugid_map")]
3707        let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
3708        #[cfg(not(feature = "fs_permission_translation"))]
3709        let (uid, gid) = (ctx.uid, ctx.gid);
3710
3711        let (_uid, _gid) = set_creds(uid, gid)?;
3712
3713        // This lookup serves two purposes:
3714        // 1. If the O_CREATE flag is not set, it retrieves the d_entry for the file.
3715        // 2. If the O_CREATE flag is set, it checks whether the file exists.
3716        let res = self.do_lookup_with_casefold_fallback(&data, name);
3717
3718        if let Err(e) = res {
3719            if e.kind() == std::io::ErrorKind::NotFound && (flags as i32 & libc::O_CREAT) != 0 {
3720                // If the file did not exist & O_CREAT is set,
3721                // create file & set FILE_CREATED bits in open options
3722                let (entry, handler, mut opts) =
3723                    self.create(ctx, parent, name, mode, flags, umask, security_ctx)?;
3724                opts |= OpenOptions::FILE_CREATED;
3725                return Ok((entry, handler, opts));
3726            } else if e.kind() == std::io::ErrorKind::NotFound
3727                && !self.cfg.negative_timeout.is_zero()
3728            {
3729                return Ok((
3730                    Entry::new_negative(self.cfg.negative_timeout),
3731                    None,
3732                    OpenOptions::empty(),
3733                ));
3734            }
3735            return Err(e);
3736        }
3737
3738        // SAFETY: checked res is not error before
3739        let entry = res.unwrap();
3740
3741        if entry.attr.st_mode & libc::S_IFMT == libc::S_IFLNK {
3742            return Ok((entry, None, OpenOptions::empty()));
3743        }
3744
3745        if (flags as i32 & (libc::O_CREAT | libc::O_EXCL)) == (libc::O_CREAT | libc::O_EXCL) {
3746            return Err(eexist());
3747        }
3748
3749        let (handler, opts) = if self.zero_message_open.load(Ordering::Relaxed) {
3750            (None, OpenOptions::KEEP_CACHE)
3751        } else {
3752            let (handler, opts) = self.do_open(entry.inode, flags)?;
3753            (handler, opts)
3754        };
3755        Ok((entry, handler, opts))
3756    }
3757}
3758
3759#[cfg(test)]
3760mod tests {
3761    use std::path::Path;
3762
3763    use named_lock::NamedLock;
3764    use tempfile::TempDir;
3765
3766    use super::*;
3767    #[cfg(feature = "arc_quota")]
3768    use crate::virtio::fs::arc_ioctl::FS_IOCTL_PATH_MAX_LEN;
3769    #[cfg(feature = "arc_quota")]
3770    use crate::virtio::fs::arc_ioctl::FS_IOCTL_XATTR_NAME_MAX_LEN;
3771    #[cfg(feature = "arc_quota")]
3772    use crate::virtio::fs::arc_ioctl::FS_IOCTL_XATTR_VALUE_MAX_LEN;
3773
3774    const UNITTEST_LOCK_NAME: &str = "passthroughfs_unittest_lock";
3775
3776    // Create an instance of `Context` with valid uid, gid, and pid.
3777    // The correct ids are necessary for test cases where new files are created.
3778    fn get_context() -> Context {
3779        // SAFETY: both calls take no parameters and only return an integer value. The kernel also
3780        // guarantees that they can never fail.
3781        let uid = unsafe { libc::syscall(SYS_GETEUID) as libc::uid_t };
3782        // SAFETY: both calls take no parameters and only return an integer value. The kernel also
3783        // guarantees that they can never fail.
3784        let gid = unsafe { libc::syscall(SYS_GETEGID) as libc::gid_t };
3785        let pid = std::process::id() as libc::pid_t;
3786        Context { uid, gid, pid }
3787    }
3788
3789    /// Creates the given directories and files under `temp_dir`.
3790    fn create_test_data(temp_dir: &TempDir, dirs: &[&str], files: &[&str]) {
3791        let path = temp_dir.path();
3792
3793        for d in dirs {
3794            std::fs::create_dir_all(path.join(d)).unwrap();
3795        }
3796
3797        for f in files {
3798            File::create(path.join(f)).unwrap();
3799        }
3800    }
3801
3802    /// Looks up the given `path` in `fs`.
3803    fn lookup(fs: &PassthroughFs, path: &Path) -> io::Result<Inode> {
3804        let mut inode = 1;
3805        let ctx = get_context();
3806        for name in path.iter() {
3807            let name = CString::new(name.to_str().unwrap()).unwrap();
3808            let ent = match fs.lookup(ctx, inode, &name) {
3809                Ok(ent) => ent,
3810                Err(e) => {
3811                    return Err(e);
3812                }
3813            };
3814            inode = ent.inode;
3815        }
3816        Ok(inode)
3817    }
3818
3819    /// Looks up the given `path` in `fs`.
3820    #[cfg(feature = "arc_quota")]
3821    fn lookup_ent(fs: &PassthroughFs, path: &Path) -> io::Result<Entry> {
3822        let mut inode = 1;
3823        let ctx = get_context();
3824        let mut entry = Entry::new_negative(Duration::from_secs(10));
3825        for name in path.iter() {
3826            let name = CString::new(name.to_str().unwrap()).unwrap();
3827            entry = match fs.lookup(ctx, inode, &name) {
3828                Ok(ent) => ent,
3829                Err(e) => {
3830                    return Err(e);
3831                }
3832            };
3833            inode = entry.inode;
3834        }
3835        Ok(entry)
3836    }
3837
3838    /// Creates a file at the given `path`.
3839    fn create(fs: &PassthroughFs, path: &Path) -> io::Result<Entry> {
3840        let parent = path.parent().unwrap();
3841        let filename = CString::new(path.file_name().unwrap().to_str().unwrap()).unwrap();
3842        let parent_inode = lookup(fs, parent)?;
3843        let ctx = get_context();
3844        let security_ctx = None;
3845        fs.create(
3846            ctx,
3847            parent_inode,
3848            &filename,
3849            0o666,
3850            libc::O_RDWR as u32,
3851            0,
3852            security_ctx,
3853        )
3854        .map(|(entry, _, _)| entry)
3855    }
3856
3857    /// Removes a file at the given `path`.
3858    fn unlink(fs: &PassthroughFs, path: &Path) -> io::Result<()> {
3859        let parent = path.parent().unwrap();
3860        let filename = CString::new(path.file_name().unwrap().to_str().unwrap()).unwrap();
3861        let parent_inode = lookup(fs, parent)?;
3862        let ctx = get_context();
3863        fs.unlink(ctx, parent_inode, &filename)
3864    }
3865
3866    /// Forgets cache.
3867    fn forget(fs: &PassthroughFs, path: &Path) -> io::Result<()> {
3868        let ctx = get_context();
3869        let inode = lookup(fs, path)?;
3870        // Pass `u64::MAX` to ensure that the refcount goes to 0 and we forget inode.
3871        fs.forget(ctx, inode, u64::MAX);
3872        Ok(())
3873    }
3874
3875    /// Looks up and open the given `path` in `fs`.
3876    fn atomic_open(
3877        fs: &PassthroughFs,
3878        path: &Path,
3879        mode: u32,
3880        flags: u32,
3881        umask: u32,
3882        security_ctx: Option<&CStr>,
3883    ) -> io::Result<(Entry, Option<Handle>, OpenOptions)> {
3884        let mut inode = 1;
3885        let ctx = get_context();
3886
3887        let path_vec: Vec<_> = path.iter().collect();
3888        let vec_len = path_vec.len();
3889
3890        // Do lookup before util (vec_len-1)-th pathname, this operation is to simulate
3891        // the behavior of VFS, since when VFS call atomic_open only at last look up.
3892        for name in &path_vec[0..vec_len - 1] {
3893            let name = CString::new(name.to_str().unwrap()).unwrap();
3894            let ent = fs.lookup(ctx, inode, &name)?;
3895            inode = ent.inode;
3896        }
3897
3898        let name = CString::new(path_vec[vec_len - 1].to_str().unwrap()).unwrap();
3899
3900        fs.atomic_open(ctx, inode, &name, mode, flags, umask, security_ctx)
3901    }
3902
3903    fn symlink(
3904        fs: &PassthroughFs,
3905        linkname: &Path,
3906        name: &Path,
3907        security_ctx: Option<&CStr>,
3908    ) -> io::Result<Entry> {
3909        let inode = 1;
3910        let ctx = get_context();
3911        let name = CString::new(name.to_str().unwrap()).unwrap();
3912        let linkname = CString::new(linkname.to_str().unwrap()).unwrap();
3913        fs.symlink(ctx, &linkname, inode, &name, security_ctx)
3914    }
3915
3916    // In this ioctl inode,handle,flags,arg and out_size is irrelavant, set to empty value.
3917    #[cfg(feature = "arc_quota")]
3918    fn fs_ioc_setpermission<R: io::Read>(
3919        fs: &PassthroughFs,
3920        in_size: u32,
3921        r: R,
3922    ) -> io::Result<IoctlReply> {
3923        let ctx = get_context();
3924        fs.ioctl(
3925            ctx,
3926            0,
3927            0,
3928            IoctlFlags::empty(),
3929            FS_IOC_SETPERMISSION as u32,
3930            0,
3931            in_size,
3932            0,
3933            r,
3934        )
3935    }
3936
3937    // In this ioctl inode,handle,flags,arg and out_size is irrelavant, set to empty value.
3938    #[cfg(feature = "arc_quota")]
3939    fn fs_ioc_setpathxattr<R: io::Read>(
3940        fs: &PassthroughFs,
3941        in_size: u32,
3942        r: R,
3943    ) -> io::Result<IoctlReply> {
3944        let ctx = get_context();
3945        fs.ioctl(
3946            ctx,
3947            0,
3948            0,
3949            IoctlFlags::empty(),
3950            FS_IOC_SETPATHXATTR as u32,
3951            0,
3952            in_size,
3953            0,
3954            r,
3955        )
3956    }
3957
3958    #[test]
3959    fn rewrite_xattr_names() {
3960        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
3961        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
3962        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
3963        let _guard = lock.lock().expect("acquire named lock");
3964
3965        let cfg = Config {
3966            rewrite_security_xattrs: true,
3967            ..Default::default()
3968        };
3969
3970        let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
3971
3972        // Selinux shouldn't get overwritten.
3973        let selinux = c"security.selinux";
3974        assert_eq!(p.rewrite_xattr_name(selinux).to_bytes(), selinux.to_bytes());
3975
3976        // user, trusted, and system should not be changed either.
3977        let user = c"user.foobar";
3978        assert_eq!(p.rewrite_xattr_name(user).to_bytes(), user.to_bytes());
3979        let trusted = c"trusted.foobar";
3980        assert_eq!(p.rewrite_xattr_name(trusted).to_bytes(), trusted.to_bytes());
3981        let system = c"system.foobar";
3982        assert_eq!(p.rewrite_xattr_name(system).to_bytes(), system.to_bytes());
3983
3984        // sehash should be re-written.
3985        let sehash = c"security.sehash";
3986        assert_eq!(
3987            p.rewrite_xattr_name(sehash).to_bytes(),
3988            b"user.virtiofs.security.sehash"
3989        );
3990    }
3991
3992    #[test]
3993    fn strip_xattr_names() {
3994        let only_nuls = b"\0\0\0\0\0";
3995        let mut actual = only_nuls.to_vec();
3996        strip_xattr_prefix(&mut actual);
3997        assert_eq!(&actual[..], &only_nuls[..]);
3998
3999        let no_nuls = b"security.sehashuser.virtiofs";
4000        let mut actual = no_nuls.to_vec();
4001        strip_xattr_prefix(&mut actual);
4002        assert_eq!(&actual[..], &no_nuls[..]);
4003
4004        let empty = b"";
4005        let mut actual = empty.to_vec();
4006        strip_xattr_prefix(&mut actual);
4007        assert_eq!(&actual[..], &empty[..]);
4008
4009        let no_strippable_names = b"security.selinux\0user.foobar\0system.test\0";
4010        let mut actual = no_strippable_names.to_vec();
4011        strip_xattr_prefix(&mut actual);
4012        assert_eq!(&actual[..], &no_strippable_names[..]);
4013
4014        let only_strippable_names = b"user.virtiofs.security.sehash\0user.virtiofs.security.wat\0";
4015        let mut actual = only_strippable_names.to_vec();
4016        strip_xattr_prefix(&mut actual);
4017        assert_eq!(&actual[..], b"security.sehash\0security.wat\0");
4018
4019        let mixed_names = b"user.virtiofs.security.sehash\0security.selinux\0user.virtiofs.security.wat\0user.foobar\0";
4020        let mut actual = mixed_names.to_vec();
4021        strip_xattr_prefix(&mut actual);
4022        let expected = b"security.sehash\0security.selinux\0security.wat\0user.foobar\0";
4023        assert_eq!(&actual[..], &expected[..]);
4024
4025        let no_nul_with_prefix = b"user.virtiofs.security.sehash";
4026        let mut actual = no_nul_with_prefix.to_vec();
4027        strip_xattr_prefix(&mut actual);
4028        assert_eq!(&actual[..], b"security.sehash");
4029    }
4030
4031    #[test]
4032    fn lookup_files() {
4033        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4034        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4035        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4036        let _guard = lock.lock().expect("acquire named lock");
4037
4038        let temp_dir = TempDir::new().unwrap();
4039        create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt"]);
4040
4041        let cfg = Default::default();
4042        let fs = PassthroughFs::new("tag", cfg).unwrap();
4043
4044        let capable = FsOptions::empty();
4045        fs.init(capable).unwrap();
4046
4047        assert!(lookup(&fs, &temp_dir.path().join("a.txt")).is_ok());
4048        assert!(lookup(&fs, &temp_dir.path().join("dir")).is_ok());
4049        assert!(lookup(&fs, &temp_dir.path().join("dir/b.txt")).is_ok());
4050
4051        assert_eq!(
4052            lookup(&fs, &temp_dir.path().join("nonexistent-file"))
4053                .expect_err("file must not exist")
4054                .kind(),
4055            io::ErrorKind::NotFound
4056        );
4057        // "A.txt" is different from "a.txt".
4058        assert_eq!(
4059            lookup(&fs, &temp_dir.path().join("A.txt"))
4060                .expect_err("file must not exist")
4061                .kind(),
4062            io::ErrorKind::NotFound
4063        );
4064    }
4065
4066    #[test]
4067    fn lookup_files_ascii_casefold() {
4068        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4069        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4070        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4071        let _guard = lock.lock().expect("acquire named lock");
4072
4073        let temp_dir = TempDir::new().unwrap();
4074        create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt"]);
4075
4076        let cfg = Config {
4077            ascii_casefold: true,
4078            ..Default::default()
4079        };
4080        let fs = PassthroughFs::new("tag", cfg).unwrap();
4081
4082        let capable = FsOptions::empty();
4083        fs.init(capable).unwrap();
4084
4085        // Ensure that "A.txt" is equated with "a.txt".
4086        let a_inode = lookup(&fs, &temp_dir.path().join("a.txt")).expect("a.txt must be found");
4087        assert_eq!(
4088            lookup(&fs, &temp_dir.path().join("A.txt")).expect("A.txt must exist"),
4089            a_inode
4090        );
4091
4092        let dir_inode = lookup(&fs, &temp_dir.path().join("dir")).expect("dir must be found");
4093        assert_eq!(
4094            lookup(&fs, &temp_dir.path().join("DiR")).expect("DiR must exist"),
4095            dir_inode
4096        );
4097
4098        let b_inode =
4099            lookup(&fs, &temp_dir.path().join("dir/b.txt")).expect("dir/b.txt must be found");
4100        assert_eq!(
4101            lookup(&fs, &temp_dir.path().join("dIr/B.TxT")).expect("dIr/B.TxT must exist"),
4102            b_inode
4103        );
4104
4105        assert_eq!(
4106            lookup(&fs, &temp_dir.path().join("nonexistent-file"))
4107                .expect_err("file must not exist")
4108                .kind(),
4109            io::ErrorKind::NotFound
4110        );
4111    }
4112
4113    fn test_create_and_remove(ascii_casefold: bool) {
4114        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4115        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4116        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4117        let _guard = lock.lock().expect("acquire named lock");
4118
4119        let temp_dir = TempDir::new().unwrap();
4120        let timeout = Duration::from_millis(10);
4121        let cfg = Config {
4122            timeout,
4123            cache_policy: CachePolicy::Auto,
4124            ascii_casefold,
4125            ..Default::default()
4126        };
4127        let fs = PassthroughFs::new("tag", cfg).unwrap();
4128
4129        let capable = FsOptions::empty();
4130        fs.init(capable).unwrap();
4131
4132        // Create a.txt and b.txt.
4133        let a_path = temp_dir.path().join("a.txt");
4134        let b_path = temp_dir.path().join("b.txt");
4135        let a_entry = create(&fs, &a_path).expect("create a.txt");
4136        let b_entry = create(&fs, &b_path).expect("create b.txt");
4137        assert_eq!(
4138            a_entry.inode,
4139            lookup(&fs, &a_path).expect("lookup a.txt"),
4140            "Created file 'a.txt' must be looked up"
4141        );
4142        assert_eq!(
4143            b_entry.inode,
4144            lookup(&fs, &b_path).expect("lookup b.txt"),
4145            "Created file 'b.txt' must be looked up"
4146        );
4147
4148        // Remove a.txt only
4149        unlink(&fs, &a_path).expect("Remove");
4150        assert_eq!(
4151            lookup(&fs, &a_path)
4152                .expect_err("file must not exist")
4153                .kind(),
4154            io::ErrorKind::NotFound,
4155            "a.txt must be removed"
4156        );
4157        // "A.TXT" must not be found regardless of whether casefold is enabled or not.
4158        let upper_a_path = temp_dir.path().join("A.TXT");
4159        assert_eq!(
4160            lookup(&fs, &upper_a_path)
4161                .expect_err("file must not exist")
4162                .kind(),
4163            io::ErrorKind::NotFound,
4164            "A.txt must be removed"
4165        );
4166
4167        // Check if the host file system doesn't have a.txt but does b.txt.
4168        assert!(!a_path.exists(), "a.txt must be removed");
4169        assert!(b_path.exists(), "b.txt must exist");
4170    }
4171
4172    #[test]
4173    fn create_and_remove() {
4174        test_create_and_remove(false /* casefold */);
4175    }
4176
4177    #[test]
4178    fn create_and_remove_casefold() {
4179        test_create_and_remove(true /* casefold */);
4180    }
4181
4182    fn test_create_and_forget(ascii_casefold: bool) {
4183        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4184        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4185        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4186        let _guard = lock.lock().expect("acquire named lock");
4187
4188        let temp_dir = TempDir::new().unwrap();
4189        let timeout = Duration::from_millis(10);
4190        let cfg = Config {
4191            timeout,
4192            cache_policy: CachePolicy::Auto,
4193            ascii_casefold,
4194            ..Default::default()
4195        };
4196        let fs = PassthroughFs::new("tag", cfg).unwrap();
4197
4198        let capable = FsOptions::empty();
4199        fs.init(capable).unwrap();
4200
4201        // Create a.txt.
4202        let a_path = temp_dir.path().join("a.txt");
4203        let a_entry = create(&fs, &a_path).expect("create a.txt");
4204        assert_eq!(
4205            a_entry.inode,
4206            lookup(&fs, &a_path).expect("lookup a.txt"),
4207            "Created file 'a.txt' must be looked up"
4208        );
4209
4210        // Forget a.txt's inode from PassthroughFs's internal cache.
4211        forget(&fs, &a_path).expect("forget a.txt");
4212
4213        if ascii_casefold {
4214            let upper_a_path = temp_dir.path().join("A.TXT");
4215            let new_a_inode = lookup(&fs, &upper_a_path).expect("lookup a.txt");
4216            assert_ne!(
4217                a_entry.inode, new_a_inode,
4218                "inode must be changed after forget()"
4219            );
4220            assert_eq!(
4221                new_a_inode,
4222                lookup(&fs, &a_path).expect("lookup a.txt"),
4223                "inode must be same for a.txt and A.TXT"
4224            );
4225        } else {
4226            assert_ne!(
4227                a_entry.inode,
4228                lookup(&fs, &a_path).expect("lookup a.txt"),
4229                "inode must be changed after forget()"
4230            );
4231        }
4232    }
4233
4234    #[test]
4235    fn create_and_forget() {
4236        test_create_and_forget(false /* ascii_casefold */);
4237    }
4238
4239    #[test]
4240    fn create_and_forget_casefold() {
4241        test_create_and_forget(true /* ascii_casefold */);
4242    }
4243
4244    #[test]
4245    fn casefold_lookup_cache() {
4246        let temp_dir = TempDir::new().unwrap();
4247        // Prepare `a.txt` before starting the test.
4248        create_test_data(&temp_dir, &[], &["a.txt"]);
4249
4250        let cfg = Config {
4251            ascii_casefold: true,
4252            ..Default::default()
4253        };
4254        let fs = PassthroughFs::new("tag", cfg).unwrap();
4255
4256        let capable = FsOptions::empty();
4257        fs.init(capable).unwrap();
4258
4259        let parent = lookup(&fs, temp_dir.path()).expect("lookup temp_dir");
4260
4261        // Since `a.txt` exists, "A.TXT" must exist.
4262        let large_a_path = temp_dir.path().join("A.TXT");
4263        // Looking up "A.TXT" must create a CasefoldCache entry.
4264        lookup(&fs, &large_a_path).expect("A.TXT must exist");
4265        assert!(fs.exists_in_casefold_cache(parent, &CString::new("A.TXT").unwrap()));
4266
4267        // Create b.txt.
4268        let b_path = temp_dir.path().join("b.txt");
4269        create(&fs, &b_path).expect("create b.txt");
4270        // Then, b.txt must exists in the cache.
4271        assert!(fs.exists_in_casefold_cache(parent, &CString::new("B.TXT").unwrap()));
4272        // When removing b.txt, it must be removed from the cache as well.
4273        unlink(&fs, &b_path).expect("remove b.txt");
4274        assert!(!fs.exists_in_casefold_cache(parent, &CString::new("B.TXT").unwrap()));
4275    }
4276
4277    #[test]
4278    fn lookup_negative_cache() {
4279        let temp_dir = TempDir::new().unwrap();
4280        // Prepare `a.txt` before starting the test.
4281        create_test_data(&temp_dir, &[], &[]);
4282
4283        let cfg = Config {
4284            negative_timeout: Duration::from_secs(5),
4285            ..Default::default()
4286        };
4287        let fs = PassthroughFs::new("tag", cfg).unwrap();
4288
4289        let capable = FsOptions::empty();
4290        fs.init(capable).unwrap();
4291
4292        let a_path = temp_dir.path().join("a.txt");
4293        // a.txt hasn't existed yet.
4294        // Since negative_timeout is enabled, success with inode=0 is expected.
4295        assert_eq!(
4296            0,
4297            lookup(&fs, &a_path).expect("lookup a.txt"),
4298            "Entry with inode=0 is expected for non-existing file 'a.txt'"
4299        );
4300        // Create a.txt
4301        let a_entry = create(&fs, &a_path).expect("create a.txt");
4302        assert_eq!(
4303            a_entry.inode,
4304            lookup(&fs, &a_path).expect("lookup a.txt"),
4305            "Created file 'a.txt' must be looked up"
4306        );
4307        // Remove a.txt
4308        unlink(&fs, &a_path).expect("Remove");
4309        assert_eq!(
4310            0,
4311            lookup(&fs, &a_path).expect("lookup a.txt"),
4312            "Entry with inode=0 is expected for the removed file 'a.txt'"
4313        );
4314    }
4315    #[test]
4316    fn test_atomic_open_existing_file() {
4317        atomic_open_existing_file(false);
4318    }
4319
4320    #[test]
4321    fn test_atomic_open_existing_file_zero_message() {
4322        atomic_open_existing_file(true);
4323    }
4324
4325    fn atomic_open_existing_file(zero_message_open: bool) {
4326        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4327        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4328        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4329        let _guard = lock.lock().expect("acquire named lock");
4330
4331        let temp_dir = TempDir::new().unwrap();
4332        create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt", "dir/c.txt"]);
4333
4334        let cache_policy = match zero_message_open {
4335            true => CachePolicy::Always,
4336            false => CachePolicy::Auto,
4337        };
4338
4339        let cfg = Config {
4340            cache_policy,
4341            ..Default::default()
4342        };
4343        let fs = PassthroughFs::new("tag", cfg).unwrap();
4344
4345        let capable = FsOptions::ZERO_MESSAGE_OPEN;
4346        fs.init(capable).unwrap();
4347
4348        // atomic_open with flag O_RDWR, should return positive dentry and file handler
4349        let res = atomic_open(
4350            &fs,
4351            &temp_dir.path().join("a.txt"),
4352            0o666,
4353            libc::O_RDWR as u32,
4354            0,
4355            None,
4356        );
4357        assert!(res.is_ok());
4358        let (entry, handler, open_options) = res.unwrap();
4359        assert_ne!(entry.inode, 0);
4360
4361        if zero_message_open {
4362            assert!(handler.is_none());
4363            assert_eq!(open_options, OpenOptions::KEEP_CACHE);
4364        } else {
4365            assert!(handler.is_some());
4366            assert_ne!(
4367                open_options & OpenOptions::FILE_CREATED,
4368                OpenOptions::FILE_CREATED
4369            );
4370        }
4371
4372        // atomic_open with flag O_RDWR |  O_CREATE, should return positive dentry and file handler
4373        let res = atomic_open(
4374            &fs,
4375            &temp_dir.path().join("dir/b.txt"),
4376            0o666,
4377            (libc::O_RDWR | libc::O_CREAT) as u32,
4378            0,
4379            None,
4380        );
4381        assert!(res.is_ok());
4382        let (entry, handler, open_options) = res.unwrap();
4383        assert_ne!(entry.inode, 0);
4384
4385        if zero_message_open {
4386            assert!(handler.is_none());
4387            assert_eq!(open_options, OpenOptions::KEEP_CACHE);
4388        } else {
4389            assert!(handler.is_some());
4390            assert_ne!(
4391                open_options & OpenOptions::FILE_CREATED,
4392                OpenOptions::FILE_CREATED
4393            );
4394        }
4395
4396        // atomic_open with flag O_RDWR | O_CREATE | O_EXCL, should return positive dentry and file
4397        // handler
4398        let res = atomic_open(
4399            &fs,
4400            &temp_dir.path().join("dir/c.txt"),
4401            0o666,
4402            (libc::O_RDWR | libc::O_CREAT | libc::O_EXCL) as u32,
4403            0,
4404            None,
4405        );
4406        assert!(res.is_err());
4407        let err_kind = res.unwrap_err().kind();
4408        assert_eq!(err_kind, io::ErrorKind::AlreadyExists);
4409    }
4410
4411    #[test]
4412    fn test_atomic_open_non_existing_file() {
4413        atomic_open_non_existing_file(false);
4414    }
4415
4416    #[test]
4417    fn test_atomic_open_non_existing_file_zero_message() {
4418        atomic_open_non_existing_file(true);
4419    }
4420
4421    fn atomic_open_non_existing_file(zero_message_open: bool) {
4422        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4423        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4424        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4425        let _guard = lock.lock().expect("acquire named lock");
4426
4427        let temp_dir = TempDir::new().unwrap();
4428
4429        let cache_policy = match zero_message_open {
4430            true => CachePolicy::Always,
4431            false => CachePolicy::Auto,
4432        };
4433
4434        let cfg = Config {
4435            cache_policy,
4436            ..Default::default()
4437        };
4438        let fs = PassthroughFs::new("tag", cfg).unwrap();
4439
4440        let capable = FsOptions::ZERO_MESSAGE_OPEN;
4441        fs.init(capable).unwrap();
4442
4443        // atomic_open with flag O_RDWR, should return NO_EXIST error
4444        let res = atomic_open(
4445            &fs,
4446            &temp_dir.path().join("a.txt"),
4447            0o666,
4448            libc::O_RDWR as u32,
4449            0,
4450            None,
4451        );
4452        assert!(res.is_err());
4453        let err_kind = res.unwrap_err().kind();
4454        assert_eq!(err_kind, io::ErrorKind::NotFound);
4455
4456        // atomic_open with flag O_RDWR | O_CREATE, should return positive dentry and file handler
4457        let res = atomic_open(
4458            &fs,
4459            &temp_dir.path().join("b.txt"),
4460            0o666,
4461            (libc::O_RDWR | libc::O_CREAT) as u32,
4462            0,
4463            None,
4464        );
4465        assert!(res.is_ok());
4466        let (entry, handler, open_options) = res.unwrap();
4467        assert_ne!(entry.inode, 0);
4468
4469        if zero_message_open {
4470            assert!(handler.is_none());
4471            assert_eq!(
4472                open_options & OpenOptions::KEEP_CACHE,
4473                OpenOptions::KEEP_CACHE
4474            );
4475        } else {
4476            assert!(handler.is_some());
4477        }
4478        assert_eq!(
4479            open_options & OpenOptions::FILE_CREATED,
4480            OpenOptions::FILE_CREATED
4481        );
4482    }
4483
4484    #[test]
4485    fn atomic_open_symbol_link() {
4486        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4487        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4488        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4489        let _guard = lock.lock().expect("acquire named lock");
4490
4491        let temp_dir = TempDir::new().unwrap();
4492        create_test_data(&temp_dir, &["dir"], &["a.txt"]);
4493
4494        let cfg = Default::default();
4495        let fs = PassthroughFs::new("tag", cfg).unwrap();
4496
4497        let capable = FsOptions::empty();
4498        fs.init(capable).unwrap();
4499
4500        // atomic open the link destination file
4501        let res_dst = atomic_open(
4502            &fs,
4503            &temp_dir.path().join("a.txt"),
4504            0o666,
4505            libc::O_RDWR as u32,
4506            0,
4507            None,
4508        );
4509        assert!(res_dst.is_ok());
4510        let (entry_dst, handler_dst, _) = res_dst.unwrap();
4511        assert_ne!(entry_dst.inode, 0);
4512        assert!(handler_dst.is_some());
4513
4514        // create depth 1 symbol link
4515        let sym1_res = symlink(
4516            &fs,
4517            &temp_dir.path().join("a.txt"),
4518            &temp_dir.path().join("blink"),
4519            None,
4520        );
4521        assert!(sym1_res.is_ok());
4522        let sym1_entry = sym1_res.unwrap();
4523        assert_ne!(sym1_entry.inode, 0);
4524
4525        // atomic_open symbol link, should return dentry with no handler
4526        let res = atomic_open(
4527            &fs,
4528            &temp_dir.path().join("blink"),
4529            0o666,
4530            libc::O_RDWR as u32,
4531            0,
4532            None,
4533        );
4534        assert!(res.is_ok());
4535        let (entry, handler, open_options) = res.unwrap();
4536        assert_eq!(entry.inode, sym1_entry.inode);
4537        assert!(handler.is_none());
4538        assert_eq!(open_options, OpenOptions::empty());
4539
4540        // delete link destination
4541        unlink(&fs, &temp_dir.path().join("a.txt")).expect("Remove");
4542        assert_eq!(
4543            lookup(&fs, &temp_dir.path().join("a.txt"))
4544                .expect_err("file must not exist")
4545                .kind(),
4546            io::ErrorKind::NotFound,
4547            "a.txt must be removed"
4548        );
4549
4550        // after link destination removed, should still return valid dentry
4551        let res = atomic_open(
4552            &fs,
4553            &temp_dir.path().join("blink"),
4554            0o666,
4555            libc::O_RDWR as u32,
4556            0,
4557            None,
4558        );
4559        assert!(res.is_ok());
4560        let (entry, handler, open_options) = res.unwrap();
4561        assert_eq!(entry.inode, sym1_entry.inode);
4562        assert!(handler.is_none());
4563        assert_eq!(open_options, OpenOptions::empty());
4564    }
4565
4566    #[test]
4567    #[cfg(feature = "arc_quota")]
4568    fn set_permission_ioctl_valid_data() {
4569        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4570        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4571        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4572        let _guard = lock.lock().expect("acquire named lock");
4573
4574        let cfg = Config {
4575            max_dynamic_perm: 1,
4576            ..Default::default()
4577        };
4578        let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
4579
4580        let perm_path_string = String::from("/test");
4581        let fs_permission_data_buffer = FsPermissionDataBuffer {
4582            guest_uid: 1,
4583            guest_gid: 2,
4584            host_uid: 3,
4585            host_gid: 4,
4586            umask: 5,
4587            pad: 0,
4588            perm_path: {
4589                let mut perm_path: [u8; FS_IOCTL_PATH_MAX_LEN] = [0; FS_IOCTL_PATH_MAX_LEN];
4590                perm_path[..perm_path_string.len()].copy_from_slice(perm_path_string.as_bytes());
4591                perm_path
4592            },
4593        };
4594        let r = std::io::Cursor::new(fs_permission_data_buffer.as_bytes());
4595
4596        let res = fs_ioc_setpermission(
4597            &p,
4598            mem::size_of_val(&fs_permission_data_buffer) as u32,
4599            r.clone(),
4600        )
4601        .expect("valid input should get IoctlReply");
4602        assert!(matches!(res, IoctlReply::Done(Ok(data)) if data.is_empty()));
4603
4604        let read_guard = p
4605            .permission_paths
4606            .read()
4607            .expect("read permission_paths failed");
4608        let permission_data = read_guard
4609            .first()
4610            .expect("permission path should not be empty");
4611
4612        // Check expected data item is added to permission_paths.
4613        let expected_data = PermissionData {
4614            guest_uid: 1,
4615            guest_gid: 2,
4616            host_uid: 3,
4617            host_gid: 4,
4618            umask: 5,
4619            perm_path: perm_path_string,
4620        };
4621        assert_eq!(*permission_data, expected_data);
4622
4623        // Second ioctl should not succeed since max_dynamic_perm is set to 1
4624        let res = fs_ioc_setpermission(
4625            &p,
4626            mem::size_of_val(&fs_permission_data_buffer) as u32,
4627            r.clone(),
4628        )
4629        .expect("valid input should get IoctlReply");
4630        assert!(
4631            matches!(res, IoctlReply::Done(Err(err)) if err.raw_os_error().is_some_and(|errno| {
4632                errno == libc::EPERM
4633            }))
4634        );
4635    }
4636
4637    #[test]
4638    #[cfg(feature = "arc_quota")]
4639    fn set_permission_ioctl_invalid_data() {
4640        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4641        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4642        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4643        let _guard = lock.lock().expect("acquire named lock");
4644
4645        let cfg = Config {
4646            max_dynamic_perm: 1,
4647            ..Default::default()
4648        };
4649        let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
4650
4651        // The perm_path is not valid since it does not start with /.
4652        let perm_path_string = String::from("test");
4653        let fs_permission_data_buffer = FsPermissionDataBuffer {
4654            guest_uid: 1,
4655            guest_gid: 2,
4656            host_uid: 3,
4657            host_gid: 4,
4658            umask: 5,
4659            pad: 0,
4660            perm_path: {
4661                let mut perm_path: [u8; FS_IOCTL_PATH_MAX_LEN] = [0; FS_IOCTL_PATH_MAX_LEN];
4662                perm_path[..perm_path_string.len()].copy_from_slice(perm_path_string.as_bytes());
4663                perm_path
4664            },
4665        };
4666
4667        let r = std::io::Cursor::new(fs_permission_data_buffer.as_bytes());
4668        // In this ioctl inode,handle,flags,arg and out_size is irrelavant, set to empty value.
4669        // This call is supposed to get EINVAL ioctlReply, since the perm_path is invalid.
4670        let res = fs_ioc_setpermission(&p, mem::size_of_val(&fs_permission_data_buffer) as u32, r)
4671            .expect("invalid perm_path should get IoctlReply");
4672        assert!(
4673            matches!(res, IoctlReply::Done(Err(err)) if err.raw_os_error().is_some_and(|errno| {
4674                errno == libc::EINVAL
4675            }))
4676        );
4677
4678        let fake_data_buffer: [u8; 128] = [0; 128];
4679        let r = std::io::Cursor::new(fake_data_buffer.as_bytes());
4680
4681        // This call is supposed to get EINVAL ioctlReply, since the in_size is not the size of
4682        // struct FsPermissionDataBuffer.
4683        let res = fs_ioc_setpermission(&p, mem::size_of_val(&fake_data_buffer) as u32, r)
4684            .expect_err("invalid in_size should get Error");
4685        assert!(res
4686            .raw_os_error()
4687            .is_some_and(|errno| { errno == libc::EINVAL }));
4688    }
4689
4690    #[test]
4691    #[cfg(feature = "arc_quota")]
4692    fn permission_data_path_matching() {
4693        let ctx = get_context();
4694        let temp_dir = TempDir::new().unwrap();
4695        // Prepare `a.txt` before starting the test.
4696        create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/a.txt"]);
4697
4698        let cfg = Config {
4699            max_dynamic_perm: 1,
4700            ..Default::default()
4701        };
4702        let fs = PassthroughFs::new("tag", cfg).unwrap();
4703
4704        let capable = FsOptions::empty();
4705        fs.init(capable).unwrap();
4706
4707        const BY_PATH_UID: u32 = 655360;
4708        const BY_PATH_GID: u32 = 655361;
4709        const BY_PATH_UMASK: u32 = 0o007;
4710
4711        let dir_path = temp_dir.path().join("dir");
4712        let permission_data = PermissionData {
4713            guest_uid: BY_PATH_UID,
4714            guest_gid: BY_PATH_GID,
4715            host_uid: ctx.uid,
4716            host_gid: ctx.gid,
4717            umask: BY_PATH_UMASK,
4718            perm_path: dir_path.to_string_lossy().into_owned(),
4719        };
4720        fs.permission_paths
4721            .write()
4722            .expect("permission_path lock must be acquired")
4723            .push(permission_data);
4724
4725        // a_path is the path with out set permission by path
4726        let a_path = temp_dir.path().join("a.txt");
4727        let in_dir_a_path = dir_path.join("a.txt");
4728
4729        // a.txt should not be set with guest_uid/guest_uid/umask by path
4730        let a_entry = lookup_ent(&fs, &a_path).expect("a.txt must exist");
4731        assert_ne!(a_entry.attr.st_uid, BY_PATH_UID);
4732        assert_ne!(a_entry.attr.st_gid, BY_PATH_GID);
4733
4734        // a.txt in dir should be set guest_uid/guest_uid/umask by path
4735        let in_dir_a_entry = lookup_ent(&fs, &in_dir_a_path).expect("dir/a.txt must exist");
4736        assert_eq!(in_dir_a_entry.attr.st_uid, BY_PATH_UID);
4737        assert_eq!(in_dir_a_entry.attr.st_gid, BY_PATH_GID);
4738        assert_eq!(in_dir_a_entry.attr.st_mode & 0o777, !BY_PATH_UMASK & 0o777);
4739
4740        // Create dir/b.txt.
4741        let in_dir_b_path = dir_path.join("b.txt");
4742        create(&fs, &in_dir_b_path).expect("create b.txt");
4743
4744        // newly created b.txt in dir should be set guest_uid/guest_uid/umask by path
4745        let in_dir_b_entry = lookup_ent(&fs, &in_dir_a_path).expect("dir/b.txt must exist");
4746        assert_eq!(in_dir_b_entry.attr.st_uid, BY_PATH_UID);
4747        assert_eq!(in_dir_b_entry.attr.st_gid, BY_PATH_GID);
4748        assert_eq!(in_dir_b_entry.attr.st_mode & 0o777, !BY_PATH_UMASK & 0o777);
4749    }
4750
4751    #[test]
4752    #[cfg(feature = "arc_quota")]
4753    fn set_path_xattr_ioctl_valid_data() {
4754        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4755        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4756        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4757        let _guard = lock.lock().expect("acquire named lock");
4758
4759        let cfg: Config = Config {
4760            max_dynamic_xattr: 1,
4761            ..Default::default()
4762        };
4763        let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
4764
4765        let path_string = String::from("/test");
4766        let xattr_name_string = String::from("test_name");
4767        let xattr_value_string = String::from("test_value");
4768        let fs_path_xattr_data_buffer = FsPathXattrDataBuffer {
4769            path: {
4770                let mut path: [u8; FS_IOCTL_PATH_MAX_LEN] = [0; FS_IOCTL_PATH_MAX_LEN];
4771                path[..path_string.len()].copy_from_slice(path_string.as_bytes());
4772                path
4773            },
4774            xattr_name: {
4775                let mut xattr_name: [u8; FS_IOCTL_XATTR_NAME_MAX_LEN] =
4776                    [0; FS_IOCTL_XATTR_NAME_MAX_LEN];
4777                xattr_name[..xattr_name_string.len()].copy_from_slice(xattr_name_string.as_bytes());
4778                xattr_name
4779            },
4780            xattr_value: {
4781                let mut xattr_value: [u8; FS_IOCTL_XATTR_VALUE_MAX_LEN] =
4782                    [0; FS_IOCTL_XATTR_VALUE_MAX_LEN];
4783                xattr_value[..xattr_value_string.len()]
4784                    .copy_from_slice(xattr_value_string.as_bytes());
4785                xattr_value
4786            },
4787        };
4788        let r = std::io::Cursor::new(fs_path_xattr_data_buffer.as_bytes());
4789
4790        let res = fs_ioc_setpathxattr(
4791            &p,
4792            mem::size_of_val(&fs_path_xattr_data_buffer) as u32,
4793            r.clone(),
4794        )
4795        .expect("valid input should get IoctlReply");
4796        assert!(matches!(res, IoctlReply::Done(Ok(data)) if data.is_empty()));
4797
4798        let read_guard = p.xattr_paths.read().expect("read xattr_paths failed");
4799        let xattr_data = read_guard.first().expect("xattr_paths should not be empty");
4800
4801        // Check expected data item is added to permission_paths.
4802        let expected_data = XattrData {
4803            xattr_path: path_string,
4804            xattr_name: xattr_name_string,
4805            xattr_value: xattr_value_string,
4806        };
4807        assert_eq!(*xattr_data, expected_data);
4808
4809        // Second ioctl should not succeed since max_dynamic_perm is set to 1
4810        let res = fs_ioc_setpathxattr(
4811            &p,
4812            mem::size_of_val(&fs_path_xattr_data_buffer) as u32,
4813            r.clone(),
4814        )
4815        .expect("valid input should get IoctlReply");
4816        assert!(
4817            matches!(res, IoctlReply::Done(Err(err)) if err.raw_os_error().is_some_and(|errno| {
4818                errno == libc::EPERM
4819            }))
4820        );
4821    }
4822    #[test]
4823    #[cfg(feature = "arc_quota")]
4824    fn set_path_xattr_ioctl_invalid_data() {
4825        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4826        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4827        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4828        let _guard = lock.lock().expect("acquire named lock");
4829
4830        let cfg: Config = Config {
4831            max_dynamic_xattr: 1,
4832            ..Default::default()
4833        };
4834        let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
4835
4836        let path_string = String::from("test");
4837        let xattr_name_string = String::from("test_name");
4838        let xattr_value_string = String::from("test_value");
4839        let fs_path_xattr_data_buffer = FsPathXattrDataBuffer {
4840            path: {
4841                let mut path: [u8; FS_IOCTL_PATH_MAX_LEN] = [0; FS_IOCTL_PATH_MAX_LEN];
4842                path[..path_string.len()].copy_from_slice(path_string.as_bytes());
4843                path
4844            },
4845            xattr_name: {
4846                let mut xattr_name: [u8; FS_IOCTL_XATTR_NAME_MAX_LEN] =
4847                    [0; FS_IOCTL_XATTR_NAME_MAX_LEN];
4848                xattr_name[..xattr_name_string.len()].copy_from_slice(xattr_name_string.as_bytes());
4849                xattr_name
4850            },
4851            xattr_value: {
4852                let mut xattr_value: [u8; FS_IOCTL_XATTR_VALUE_MAX_LEN] =
4853                    [0; FS_IOCTL_XATTR_VALUE_MAX_LEN];
4854                xattr_value[..xattr_value_string.len()]
4855                    .copy_from_slice(xattr_value_string.as_bytes());
4856                xattr_value
4857            },
4858        };
4859        let r = std::io::Cursor::new(fs_path_xattr_data_buffer.as_bytes());
4860
4861        // This call is supposed to get EINVAL ioctlReply, since the perm_path is invalid.
4862        let res = fs_ioc_setpathxattr(
4863            &p,
4864            mem::size_of_val(&fs_path_xattr_data_buffer) as u32,
4865            r.clone(),
4866        )
4867        .expect("valid input should get IoctlReply");
4868        assert!(
4869            matches!(res, IoctlReply::Done(Err(err)) if err.raw_os_error().is_some_and(|errno| {
4870                errno == libc::EINVAL
4871            }))
4872        );
4873
4874        let fake_data_buffer: [u8; 128] = [0; 128];
4875        let r = std::io::Cursor::new(fake_data_buffer.as_bytes());
4876        // This call is supposed to get EINVAL ioctlReply, since the in_size is not the size of
4877        // struct FsPathXattrDataBuffer.
4878        let res = fs_ioc_setpathxattr(&p, mem::size_of_val(&fake_data_buffer) as u32, r.clone())
4879            .expect_err("valid input should get IoctlReply");
4880        assert!(res
4881            .raw_os_error()
4882            .is_some_and(|errno| { errno == libc::EINVAL }));
4883    }
4884
4885    #[test]
4886    #[cfg(feature = "arc_quota")]
4887    fn xattr_data_path_matching() {
4888        let ctx = get_context();
4889        let temp_dir = TempDir::new().unwrap();
4890        // Prepare `a.txt` before starting the test.
4891        create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/a.txt"]);
4892
4893        let cfg = Config {
4894            max_dynamic_xattr: 1,
4895            ..Default::default()
4896        };
4897        let fs = PassthroughFs::new("tag", cfg).unwrap();
4898
4899        let capable = FsOptions::empty();
4900        fs.init(capable).unwrap();
4901
4902        let dir_path = temp_dir.path().join("dir");
4903        let xattr_name_string = String::from("test_name");
4904        let xattr_name_cstring = CString::new(xattr_name_string.clone()).expect("create c string");
4905        let xattr_value_string = String::from("test_value");
4906        let xattr_value_bytes = xattr_value_string.clone().into_bytes();
4907
4908        let xattr_data = XattrData {
4909            xattr_name: xattr_name_string,
4910            xattr_value: xattr_value_string,
4911            xattr_path: dir_path.to_string_lossy().into_owned(),
4912        };
4913        fs.xattr_paths
4914            .write()
4915            .expect("xattr_paths lock must be acquired")
4916            .push(xattr_data);
4917
4918        // a_path is the path with out set xattr by path
4919        let a_path: std::path::PathBuf = temp_dir.path().join("a.txt");
4920        let in_dir_a_path = dir_path.join("a.txt");
4921
4922        let a_node = lookup(&fs, a_path.as_path()).expect("lookup a node");
4923        // a.txt should not be set with xattr by path
4924        assert!(fs
4925            .getxattr(
4926                ctx,
4927                a_node,
4928                &xattr_name_cstring,
4929                xattr_value_bytes.len() as u32
4930            )
4931            .is_err());
4932
4933        let in_dir_a_node = lookup(&fs, in_dir_a_path.as_path()).expect("lookup in dir a node");
4934        // a.txt in dir should be set xattr by path
4935        let in_dir_a_reply = fs
4936            .getxattr(
4937                ctx,
4938                in_dir_a_node,
4939                &xattr_name_cstring,
4940                xattr_value_bytes.len() as u32,
4941            )
4942            .expect("Getxattr should success");
4943        assert!(matches!(in_dir_a_reply, GetxattrReply::Value(v) if v == xattr_value_bytes));
4944        // Create dir/b.txt.
4945        let in_dir_b_path = dir_path.join("b.txt");
4946        create(&fs, &in_dir_b_path).expect("create b.txt");
4947
4948        // newly created b.txt in dir should be set xattr by path
4949        let in_dir_b_node = lookup(&fs, in_dir_a_path.as_path()).expect("lookup in dir b node");
4950        let in_dir_b_reply = fs
4951            .getxattr(
4952                ctx,
4953                in_dir_b_node,
4954                &xattr_name_cstring,
4955                xattr_value_bytes.len() as u32,
4956            )
4957            .expect("Getxattr should success");
4958        assert!(matches!(in_dir_b_reply, GetxattrReply::Value(v) if v == xattr_value_bytes));
4959    }
4960
4961    /// Creates and open a new file by atomic_open with O_APPEND flag.
4962    /// We check O_APPEND is properly handled, depending on writeback cache is enabled or not.
4963    fn atomic_open_create_o_append(writeback: bool) {
4964        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4965        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4966        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4967        let _guard = lock.lock().expect("acquire named lock");
4968
4969        let temp_dir = TempDir::new().unwrap();
4970
4971        let cfg = Config {
4972            cache_policy: CachePolicy::Always,
4973            writeback,
4974            ..Default::default()
4975        };
4976        let fs = PassthroughFs::new("tag", cfg).unwrap();
4977
4978        let capable = FsOptions::ZERO_MESSAGE_OPEN | FsOptions::WRITEBACK_CACHE;
4979        fs.init(capable).unwrap();
4980
4981        let (entry, _, _) = atomic_open(
4982            &fs,
4983            &temp_dir.path().join("a.txt"),
4984            0o666,
4985            (libc::O_RDWR | libc::O_CREAT | libc::O_APPEND) as u32,
4986            0,
4987            None,
4988        )
4989        .expect("atomic_open");
4990        assert_ne!(entry.inode, 0);
4991
4992        let inodes = fs.inodes.lock();
4993        let data = inodes.get(&entry.inode).unwrap();
4994        let flags = data.file.lock().open_flags;
4995        if writeback {
4996            // When writeback is enabled, O_APPEND must be handled by the guest kernel.
4997            // So, it must be cleared.
4998            assert_eq!(flags & libc::O_APPEND, 0);
4999        } else {
5000            // Without writeback cache, O_APPEND must not be cleared.
5001            assert_eq!(flags & libc::O_APPEND, libc::O_APPEND);
5002        }
5003    }
5004
5005    #[test]
5006    fn test_atomic_open_create_o_append_no_writeback() {
5007        atomic_open_create_o_append(false);
5008    }
5009
5010    #[test]
5011    fn test_atomic_open_create_o_append_writeback() {
5012        atomic_open_create_o_append(true);
5013    }
5014}