devices/virtio/fs/
passthrough.rs

1// Copyright 2019 The ChromiumOS Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use std::borrow::Cow;
6use std::cell::RefCell;
7use std::cmp;
8use std::collections::btree_map;
9use std::collections::BTreeMap;
10use std::ffi::CStr;
11use std::ffi::CString;
12#[cfg(feature = "fs_runtime_ugid_map")]
13use std::ffi::OsStr;
14use std::fs::File;
15use std::io;
16use std::mem;
17use std::mem::size_of;
18use std::mem::MaybeUninit;
19use std::os::raw::c_int;
20use std::os::raw::c_long;
21#[cfg(feature = "fs_runtime_ugid_map")]
22use std::os::unix::ffi::OsStrExt;
23#[cfg(feature = "fs_runtime_ugid_map")]
24use std::path::Path;
25use std::ptr;
26use std::ptr::addr_of;
27use std::ptr::addr_of_mut;
28use std::sync::atomic::AtomicBool;
29use std::sync::atomic::AtomicU64;
30use std::sync::atomic::Ordering;
31use std::sync::Arc;
32use std::sync::MutexGuard;
33#[cfg(feature = "fs_permission_translation")]
34use std::sync::RwLock;
35use std::time::Duration;
36
37#[cfg(feature = "arc_quota")]
38use base::debug;
39use base::error;
40use base::ioctl_ior_nr;
41use base::ioctl_iow_nr;
42use base::ioctl_iowr_nr;
43use base::ioctl_with_mut_ptr;
44use base::ioctl_with_ptr;
45use base::syscall;
46use base::unix::FileFlags;
47use base::warn;
48use base::AsRawDescriptor;
49use base::FromRawDescriptor;
50use base::IntoRawDescriptor;
51use base::IoctlNr;
52use base::Protection;
53use base::RawDescriptor;
54use fuse::filesystem::Context;
55use fuse::filesystem::DirectoryIterator;
56use fuse::filesystem::Entry;
57use fuse::filesystem::FileSystem;
58use fuse::filesystem::FsOptions;
59use fuse::filesystem::GetxattrReply;
60use fuse::filesystem::IoctlFlags;
61use fuse::filesystem::IoctlReply;
62use fuse::filesystem::ListxattrReply;
63use fuse::filesystem::OpenOptions;
64use fuse::filesystem::RemoveMappingOne;
65use fuse::filesystem::SetattrValid;
66use fuse::filesystem::ZeroCopyReader;
67use fuse::filesystem::ZeroCopyWriter;
68use fuse::filesystem::ROOT_ID;
69use fuse::sys::WRITE_KILL_PRIV;
70use fuse::Mapper;
71#[cfg(feature = "arc_quota")]
72use protobuf::Message;
73use sync::Mutex;
74#[cfg(feature = "arc_quota")]
75use system_api::client::OrgChromiumSpaced;
76#[cfg(feature = "arc_quota")]
77use system_api::spaced::SetProjectIdReply;
78#[cfg(feature = "arc_quota")]
79use system_api::spaced::SetProjectInheritanceFlagReply;
80use zerocopy::FromBytes;
81use zerocopy::FromZeros;
82use zerocopy::Immutable;
83use zerocopy::IntoBytes;
84use zerocopy::KnownLayout;
85
86#[cfg(feature = "arc_quota")]
87use crate::virtio::fs::arc_ioctl::FsPathXattrDataBuffer;
88#[cfg(feature = "arc_quota")]
89use crate::virtio::fs::arc_ioctl::FsPermissionDataBuffer;
90#[cfg(feature = "arc_quota")]
91use crate::virtio::fs::arc_ioctl::XattrData;
92use crate::virtio::fs::caps::Capability;
93use crate::virtio::fs::caps::Caps;
94use crate::virtio::fs::caps::Set as CapSet;
95use crate::virtio::fs::caps::Value as CapValue;
96use crate::virtio::fs::config::CachePolicy;
97use crate::virtio::fs::config::Config;
98#[cfg(feature = "fs_permission_translation")]
99use crate::virtio::fs::config::PermissionData;
100use crate::virtio::fs::expiring_map::ExpiringMap;
101use crate::virtio::fs::multikey::MultikeyBTreeMap;
102use crate::virtio::fs::read_dir::ReadDir;
103
104const EMPTY_CSTR: &CStr = c"";
105const PROC_CSTR: &CStr = c"/proc";
106const UNLABELED_CSTR: &CStr = c"unlabeled";
107
108const USER_VIRTIOFS_XATTR: &[u8] = b"user.virtiofs.";
109const SECURITY_XATTR: &[u8] = b"security.";
110const SELINUX_XATTR: &[u8] = b"security.selinux";
111
112const FSCRYPT_KEY_DESCRIPTOR_SIZE: usize = 8;
113const FSCRYPT_KEY_IDENTIFIER_SIZE: usize = 16;
114
115#[cfg(feature = "arc_quota")]
116const FS_PROJINHERIT_FL: c_int = 0x20000000;
117
118// 25 seconds is the default timeout for dbus-send.
119#[cfg(feature = "arc_quota")]
120const DEFAULT_DBUS_TIMEOUT: Duration = Duration::from_secs(25);
121
/// Internal utility wrapper for the `cros_tracing::trace_event!()` macro, used for VirtioFS calls.
///
/// Takes the virtio-fs `$tag`, the event `$name`, and one or more further argument expressions.
/// The expansion always passes `VirtioFs` as the first argument and forwards `$name` ahead of
/// `$tag`, followed by the remaining arguments in their original order.
macro_rules! fs_trace {
    ($tag:expr, $name:expr, $($arg:expr),+) => {
        cros_tracing::trace_event!(VirtioFs, $name, $tag, $($arg),*)
    };
}
128
/// Fixed-layout (`repr(C)`) record of four single-byte fields followed by a
/// `FSCRYPT_KEY_DESCRIPTOR_SIZE`-byte master key descriptor.
///
/// NOTE(review): presumably mirrors the kernel's `struct fscrypt_policy_v1` — confirm against
/// `linux/fscrypt.h`.
#[repr(C)]
#[derive(Clone, Copy, FromBytes, Immutable, IntoBytes, KnownLayout)]
struct fscrypt_policy_v1 {
    _version: u8,
    _contents_encryption_mode: u8,
    _filenames_encryption_mode: u8,
    _flags: u8,
    _master_key_descriptor: [u8; FSCRYPT_KEY_DESCRIPTOR_SIZE],
}
138
/// Fixed-layout (`repr(C)`) record of four single-byte fields, four reserved bytes, and a
/// `FSCRYPT_KEY_IDENTIFIER_SIZE`-byte master key identifier.
///
/// NOTE(review): presumably mirrors the kernel's `struct fscrypt_policy_v2` — confirm against
/// `linux/fscrypt.h`.
#[repr(C)]
#[derive(Clone, Copy, FromBytes, Immutable, IntoBytes, KnownLayout)]
struct fscrypt_policy_v2 {
    _version: u8,
    _contents_encryption_mode: u8,
    _filenames_encryption_mode: u8,
    _flags: u8,
    __reserved: [u8; 4],
    master_key_identifier: [u8; FSCRYPT_KEY_IDENTIFIER_SIZE],
}
149
/// Overlay of the two policy layouts. As a `repr(C)` union every member starts at offset zero,
/// so `_version` aliases the first byte of both `_v1` and `_v2`.
#[repr(C)]
#[derive(Copy, Clone, FromBytes, Immutable, KnownLayout)]
union fscrypt_policy {
    _version: u8,
    _v1: fscrypt_policy_v1,
    _v2: fscrypt_policy_v2,
}
157
/// Argument record made of a 64-bit `policy_size` followed by the policy storage itself.
///
/// Per the field comments, `policy_size` is both an input and an output while `policy` is only
/// an output.
#[repr(C)]
#[derive(Copy, Clone, FromBytes, Immutable, KnownLayout)]
struct fscrypt_get_policy_ex_arg {
    policy_size: u64,       /* input/output */
    policy: fscrypt_policy, /* output */
}
164
165impl From<&fscrypt_get_policy_ex_arg> for &[u8] {
166    fn from(value: &fscrypt_get_policy_ex_arg) -> Self {
167        assert!(value.policy_size <= size_of::<fscrypt_policy>() as u64);
168        let data_raw: *const fscrypt_get_policy_ex_arg = value;
169        // SAFETY: the length of the output slice is asserted to be within the struct it points to
170        unsafe {
171            std::slice::from_raw_parts(
172                data_raw.cast(),
173                value.policy_size as usize + size_of::<u64>(),
174            )
175        }
176    }
177}
178
// The argument type here is `[u8; 9]`, not `fscrypt_get_policy_ex_arg`.
// NOTE(review): presumably this matches the kernel's own definition (8-byte size plus 1-byte
// version) — confirm against `linux/fscrypt.h`.
ioctl_iowr_nr!(FS_IOC_GET_ENCRYPTION_POLICY_EX, 'f' as u32, 22, [u8; 9]);
180
/// Fixed-layout (`repr(C)`) argument record for the `FS_IOC_FSGETXATTR` / `FS_IOC_FSSETXATTR`
/// ioctls declared below: five 32-bit fields followed by eight bytes of padding.
#[repr(C)]
#[derive(Clone, Copy, FromBytes, Immutable, IntoBytes, KnownLayout)]
struct fsxattr {
    fsx_xflags: u32,     /* xflags field value (get/set) */
    fsx_extsize: u32,    /* extsize field value (get/set) */
    fsx_nextents: u32,   /* nextents field value (get) */
    fsx_projid: u32,     /* project identifier (get/set) */
    fsx_cowextsize: u32, /* CoW extsize field value (get/set) */
    fsx_pad: [u8; 8],
}
191
// Read / write ioctls that take an `fsxattr` argument.
ioctl_ior_nr!(FS_IOC_FSGETXATTR, 'X' as u32, 31, fsxattr);
ioctl_iow_nr!(FS_IOC_FSSETXATTR, 'X' as u32, 32, fsxattr);

// Get/set flags ioctls. The three pairs below share the same type ('f') and numbers (1 and 2)
// and differ only in the declared argument type: `c_long`, `u32` and `u64`.
ioctl_ior_nr!(FS_IOC_GETFLAGS, 'f' as u32, 1, c_long);
ioctl_iow_nr!(FS_IOC_SETFLAGS, 'f' as u32, 2, c_long);

ioctl_ior_nr!(FS_IOC32_GETFLAGS, 'f' as u32, 1, u32);
ioctl_iow_nr!(FS_IOC32_SETFLAGS, 'f' as u32, 2, u32);

ioctl_ior_nr!(FS_IOC64_GETFLAGS, 'f' as u32, 1, u64);
ioctl_iow_nr!(FS_IOC64_SETFLAGS, 'f' as u32, 2, u64);

// Write ioctls only available with the `arc_quota` feature. Both use type 'f' and number 1 and
// are told apart by their argument buffer types.
#[cfg(feature = "arc_quota")]
ioctl_iow_nr!(FS_IOC_SETPERMISSION, 'f' as u32, 1, FsPermissionDataBuffer);
#[cfg(feature = "arc_quota")]
ioctl_iow_nr!(FS_IOC_SETPATHXATTR, 'f' as u32, 1, FsPathXattrDataBuffer);
208
/// Fixed-layout (`repr(C)`) argument record for the `FS_IOC_ENABLE_VERITY` ioctl declared below.
///
/// The salt and the signature are each described by a 64-bit pointer-sized field
/// (`salt_ptr`, `sig_ptr`) paired with a 32-bit size (`salt_size`, `sig_size`).
#[repr(C)]
#[derive(Clone, Copy, FromBytes, Immutable, IntoBytes, KnownLayout)]
struct fsverity_enable_arg {
    _version: u32,
    _hash_algorithm: u32,
    _block_size: u32,
    salt_size: u32,
    salt_ptr: u64,
    sig_size: u32,
    __reserved1: u32,
    sig_ptr: u64,
    __reserved2: [u64; 11],
}
222
/// Fixed-size header of the argument for the `FS_IOC_MEASURE_VERITY` ioctl declared below.
///
/// Only the two 16-bit header fields are modelled here; the trailing comment records that the
/// C definition continues with a flexible `digest` byte array.
#[repr(C)]
#[derive(Clone, Copy, FromBytes, Immutable, IntoBytes, KnownLayout)]
struct fsverity_digest {
    _digest_algorithm: u16,
    digest_size: u16,
    // __u8 digest[];
}
230
// fs-verity ioctls: enabling takes an `fsverity_enable_arg` (write), measuring takes an
// `fsverity_digest` header (read/write).
ioctl_iow_nr!(FS_IOC_ENABLE_VERITY, 'f' as u32, 133, fsverity_enable_arg);
ioctl_iowr_nr!(FS_IOC_MEASURE_VERITY, 'f' as u32, 134, fsverity_digest);
233
/// Identifier used as the primary key for entries in the inode table.
pub type Inode = u64;
/// Identifier used as the key for entries in the open-handle table.
type Handle = u64;
236
/// Alternate key for the inode table, made of an inode number and a device number.
#[derive(Clone, Copy, Debug, PartialOrd, Ord, PartialEq, Eq)]
struct InodeAltKey {
    ino: libc::ino64_t,
    dev: libc::dev_t,
}
242
/// Coarse classification of a file: regular file, directory, or anything else.
#[derive(PartialEq, Eq, Debug)]
enum FileType {
    Regular,
    Directory,
    Other,
}
249
250impl From<libc::mode_t> for FileType {
251    fn from(mode: libc::mode_t) -> Self {
252        match mode & libc::S_IFMT {
253            libc::S_IFREG => FileType::Regular,
254            libc::S_IFDIR => FileType::Directory,
255            _ => FileType::Other,
256        }
257    }
258}
259
/// An open file together with the flags it was opened with.
#[derive(Debug)]
struct OpenedFile {
    // `None` only after `leak_fd()` has taken the file out; the accessors panic in that state.
    file: Option<File>,
    open_flags: libc::c_int,
}
265
266impl AsRawDescriptor for OpenedFile {
267    fn as_raw_descriptor(&self) -> RawDescriptor {
268        self.file().as_raw_descriptor()
269    }
270}
271
272impl OpenedFile {
273    fn new(file: File, open_flags: libc::c_int) -> Self {
274        OpenedFile {
275            file: Some(file),
276            open_flags,
277        }
278    }
279
280    fn file(&self) -> &File {
281        self.file.as_ref().expect("must have a file")
282    }
283
284    fn file_mut(&mut self) -> &mut File {
285        self.file.as_mut().expect("must have a file")
286    }
287
288    /// Leaks the file descriptor and makes the struct unusable.
289    ///
290    /// This is an optimization to speed up dropping `OpenedFile` instances, which is useful
291    /// during an abrupt shutdown. Instead of properly closing the file descriptor, which
292    /// involves a syscall, this function effectively forgets the file descriptor, relying on the
293    /// OS to clean it up when the process terminates.
294    fn leak_fd(&mut self) {
295        let f = self.file.take().expect("must have a file");
296        let _ = f.into_raw_descriptor();
297    }
298}
299
/// Per-inode bookkeeping: the identifier, the open file backing it, a reference count, the
/// file type and the path.
#[derive(Debug)]
struct InodeData {
    inode: Inode,
    // The open file and the flags it was opened with.
    file: Mutex<OpenedFile>,
    refcount: AtomicU64,
    filetype: FileType,
    path: String,
    // This needs to be atomic because we need to set it through a shared reference.
    unsafe_leak_fd: AtomicBool,
}
311
312impl AsRawDescriptor for InodeData {
313    fn as_raw_descriptor(&self) -> RawDescriptor {
314        self.file.lock().as_raw_descriptor()
315    }
316}
317
318impl Drop for InodeData {
319    /// If `unsafe_leak_fd` is set, this `drop` implementation will "leak" the file descriptor.
320    /// This is an optimization to speed up the cleanup process, based on the
321    /// assumption that the OS will handle the cleanup of file descriptors after the process
322    /// terminates. This is only okay if the process is guaranteed to terminate immediately
323    /// after the `PassthroughFs` instance is dropped.
324    fn drop(&mut self) {
325        if self.unsafe_leak_fd.load(Ordering::Relaxed) {
326            self.file.get_mut().leak_fd();
327        }
328    }
329}
330
331impl InodeData {
332    fn set_unsafe_leak_fd(&self) {
333        self.unsafe_leak_fd.store(true, Ordering::Relaxed);
334    }
335}
336
/// Per-handle bookkeeping: the inode the handle belongs to and the open file backing it.
#[derive(Debug)]
struct HandleData {
    inode: Inode,
    file: Mutex<OpenedFile>,

    // Atomic so that it can be set through a shared reference; read when the handle is dropped.
    unsafe_leak_fd: AtomicBool,
}
344
345impl AsRawDescriptor for HandleData {
346    fn as_raw_descriptor(&self) -> RawDescriptor {
347        self.file.lock().as_raw_descriptor()
348    }
349}
350
351impl Drop for HandleData {
352    /// If `unsafe_leak_fd` is set, this `drop` implementation will "leak" the file descriptor by
353    /// forgetting it. This is an optimization to speed up the cleanup process, based on the
354    /// assumption that the OS will handle the cleanup of file descriptors after the process
355    // terminates. This is only safe if the process is guaranteed to terminate immediately
356    /// after the `PassthroughFs` instance is dropped.
357    fn drop(&mut self) {
358        if self.unsafe_leak_fd.load(Ordering::Relaxed) {
359            self.file.get_mut().leak_fd();
360        }
361    }
362}
363
364impl HandleData {
365    fn set_unsafe_leak_fd(&self) {
366        self.unsafe_leak_fd.store(true, Ordering::Relaxed);
367    }
368}
369
// Generates a guard type `$name` holding a previous credential value of type `$ty`.
//
// `$name::new(val, old)` switches the calling thread to `val` by invoking the raw syscall
// `$syscall_nr` and returns a guard that switches back to `old` when dropped. No guard is
// created (and no syscall is made) when `val` already equals `old`.
macro_rules! scoped_cred {
    ($name:ident, $ty:ty, $syscall_nr:expr) => {
        #[derive(Debug)]
        struct $name {
            old: $ty,
        }

        impl $name {
            // Changes the effective uid/gid of the current thread to `val`. Changes the thread's
            // credentials back to `old` when the returned struct is dropped.
            //
            // Returns `Ok(None)` when `val == old`, and the OS error when the syscall fails.
            fn new(val: $ty, old: $ty) -> io::Result<Option<$name>> {
                if val == old {
                    // Nothing to do since we already have the correct value.
                    return Ok(None);
                }

                // We want credential changes to be per-thread because otherwise
                // we might interfere with operations being carried out on other
                // threads with different uids/gids.  However, posix requires that
                // all threads in a process share the same credentials.  To do this
                // libc uses signals to ensure that when one thread changes its
                // credentials the other threads do the same thing.
                //
                // So instead we invoke the syscall directly in order to get around
                // this limitation.  Another option is to use the setfsuid and
                // setfsgid system calls.  However since those calls have no way to
                // return an error, it's preferable to do this instead.

                // SAFETY: this call is safe because it doesn't modify any memory and we
                // check the return value.
                let res = unsafe { libc::syscall($syscall_nr, -1, val, -1) };
                if res == 0 {
                    Ok(Some($name { old }))
                } else {
                    Err(io::Error::last_os_error())
                }
            }
        }

        impl Drop for $name {
            // Switches back to `old`. A failure can only be logged here because `drop` has no
            // way to report an error.
            fn drop(&mut self) {
                // SAFETY: trivially safe
                let res = unsafe { libc::syscall($syscall_nr, -1, self.old, -1) };
                if res < 0 {
                    error!(
                        "failed to change credentials back to {}: {}",
                        self.old,
                        io::Error::last_os_error(),
                    );
                }
            }
        }
    };
}
// Guard types for the thread's uid (via `SYS_setresuid`) and gid (via `SYS_setresgid`).
scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid);
scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid);
426
// Syscall numbers passed to `libc::syscall` by the thread-locals below.
const SYS_GETEUID: libc::c_long = libc::SYS_geteuid;
const SYS_GETEGID: libc::c_long = libc::SYS_getegid;

// Effective uid and gid of the thread, each queried through a raw syscall when the thread-local
// is initialized. `set_creds` uses these as the "old" values to restore.
thread_local! {
    // SAFETY: both calls take no parameters and only return an integer value. The kernel also
    // guarantees that they can never fail.
    static THREAD_EUID: libc::uid_t = unsafe { libc::syscall(SYS_GETEUID) as libc::uid_t };
    // SAFETY: both calls take no parameters and only return an integer value. The kernel also
    // guarantees that they can never fail.
    static THREAD_EGID: libc::gid_t = unsafe { libc::syscall(SYS_GETEGID) as libc::gid_t };
}
438
439fn set_creds(
440    uid: libc::uid_t,
441    gid: libc::gid_t,
442) -> io::Result<(Option<ScopedUid>, Option<ScopedGid>)> {
443    let olduid = THREAD_EUID.with(|uid| *uid);
444    let oldgid = THREAD_EGID.with(|gid| *gid);
445
446    // We have to change the gid before we change the uid because if we change the uid first then we
447    // lose the capability to change the gid.  However changing back can happen in any order.
448    ScopedGid::new(gid, oldgid).and_then(|gid| Ok((ScopedUid::new(uid, olduid)?, gid)))
449}
450
// Per-thread handle to the fscreate attribute file. Starts out as `None` and is filled in by
// `ScopedSecurityContext::new` the first time a thread needs it.
thread_local!(static THREAD_FSCREATE: RefCell<Option<File>> = const { RefCell::new(None) });
452
453// Opens and returns a write-only handle to /proc/thread-self/attr/fscreate. Panics if it fails to
454// open the file.
455fn open_fscreate(proc: &File) -> File {
456    let fscreate = c"thread-self/attr/fscreate";
457
458    // SAFETY: this doesn't modify any memory and we check the return value.
459    let raw_descriptor = unsafe {
460        libc::openat(
461            proc.as_raw_descriptor(),
462            fscreate.as_ptr(),
463            libc::O_CLOEXEC | libc::O_WRONLY,
464        )
465    };
466
467    // We don't expect this to fail and we're not in a position to return an error here so just
468    // panic.
469    if raw_descriptor < 0 {
470        panic!(
471            "Failed to open /proc/thread-self/attr/fscreate: {}",
472            io::Error::last_os_error()
473        );
474    }
475
476    // SAFETY: safe because we just opened this descriptor.
477    unsafe { File::from_raw_descriptor(raw_descriptor) }
478}
479
// Guard created after writing a security context to the thread's fscreate file; dropping it
// issues a zero-length write to the same file.
struct ScopedSecurityContext;
481
482impl ScopedSecurityContext {
483    fn new(proc: &File, ctx: &CStr) -> io::Result<ScopedSecurityContext> {
484        THREAD_FSCREATE.with(|thread_fscreate| {
485            let mut fscreate = thread_fscreate.borrow_mut();
486            let file = fscreate.get_or_insert_with(|| open_fscreate(proc));
487            // SAFETY: this doesn't modify any memory and we check the return value.
488            let ret = unsafe {
489                libc::write(
490                    file.as_raw_descriptor(),
491                    ctx.as_ptr() as *const libc::c_void,
492                    ctx.to_bytes_with_nul().len(),
493                )
494            };
495            if ret < 0 {
496                Err(io::Error::last_os_error())
497            } else {
498                Ok(ScopedSecurityContext)
499            }
500        })
501    }
502}
503
504impl Drop for ScopedSecurityContext {
505    fn drop(&mut self) {
506        THREAD_FSCREATE.with(|thread_fscreate| {
507            // expect is safe here because the thread local would have been initialized by the call
508            // to `new` above.
509            let fscreate = thread_fscreate.borrow();
510            let file = fscreate
511                .as_ref()
512                .expect("Uninitialized thread-local when dropping ScopedSecurityContext");
513
514            // SAFETY: this doesn't modify any memory and we check the return value.
515            let ret = unsafe { libc::write(file.as_raw_descriptor(), ptr::null(), 0) };
516
517            if ret < 0 {
518                warn!(
519                    "Failed to restore security context: {}",
520                    io::Error::last_os_error()
521                );
522            }
523        })
524    }
525}
526
// Guard that installs a umask on creation and puts the previous one back when dropped.
struct ScopedUmask {
    // The umask that was in effect before this guard was created.
    old: libc::mode_t,
    // The umask installed by this guard; checked again on drop in debug builds.
    mask: libc::mode_t,
}
531
532impl ScopedUmask {
533    fn new(mask: libc::mode_t) -> ScopedUmask {
534        ScopedUmask {
535            // SAFETY: this doesn't modify any memory and always succeeds.
536            old: unsafe { libc::umask(mask) },
537            mask,
538        }
539    }
540}
541
542impl Drop for ScopedUmask {
543    fn drop(&mut self) {
544        // SAFETY: this doesn't modify any memory and always succeeds.
545        let previous = unsafe { libc::umask(self.old) };
546        debug_assert_eq!(
547            previous, self.mask,
548            "umask changed while holding ScopedUmask"
549        );
550    }
551}
552
// Guard holding the capability set that is used to raise CAP_FSETID again when it is dropped.
struct ScopedFsetid(Caps);
554impl Drop for ScopedFsetid {
555    fn drop(&mut self) {
556        if let Err(e) = raise_cap_fsetid(&mut self.0) {
557            error!(
558                "Failed to restore CAP_FSETID: {}.  Some operations may be broken.",
559                e
560            )
561        }
562    }
563}
564
565fn raise_cap_fsetid(c: &mut Caps) -> io::Result<()> {
566    c.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Set)?;
567    c.apply()
568}
569
570// Drops CAP_FSETID from the effective set for the current thread and returns an RAII guard that
571// adds the capability back when it is dropped.
572fn drop_cap_fsetid() -> io::Result<ScopedFsetid> {
573    let mut caps = Caps::for_current_thread()?;
574    caps.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Clear)?;
575    caps.apply()?;
576    Ok(ScopedFsetid(caps))
577}
578
579fn ebadf() -> io::Error {
580    io::Error::from_raw_os_error(libc::EBADF)
581}
582
583fn eexist() -> io::Error {
584    io::Error::from_raw_os_error(libc::EEXIST)
585}
586
587fn stat<F: AsRawDescriptor + ?Sized>(f: &F) -> io::Result<libc::stat64> {
588    let mut st: MaybeUninit<libc::stat64> = MaybeUninit::<libc::stat64>::zeroed();
589
590    // SAFETY: the kernel will only write data in `st` and we check the return value.
591    syscall!(unsafe {
592        libc::fstatat64(
593            f.as_raw_descriptor(),
594            EMPTY_CSTR.as_ptr(),
595            st.as_mut_ptr(),
596            libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
597        )
598    })?;
599
600    // SAFETY: the kernel guarantees that the struct is now fully initialized.
601    Ok(unsafe { st.assume_init() })
602}
603
604fn statat<D: AsRawDescriptor>(dir: &D, name: &CStr) -> io::Result<libc::stat64> {
605    let mut st = MaybeUninit::<libc::stat64>::zeroed();
606
607    // SAFETY: the kernel will only write data in `st` and we check the return value.
608    syscall!(unsafe {
609        libc::fstatat64(
610            dir.as_raw_descriptor(),
611            name.as_ptr(),
612            st.as_mut_ptr(),
613            libc::AT_SYMLINK_NOFOLLOW,
614        )
615    })?;
616
617    // SAFETY: the kernel guarantees that the struct is now fully initialized.
618    Ok(unsafe { st.assume_init() })
619}
620
621#[cfg(feature = "arc_quota")]
/// Returns whether `project_id` falls into one of the project ID ranges used by Android.
fn is_android_project_id(project_id: u32) -> bool {
    // The bounds below define the valid ranges of project IDs used by Android and are taken
    // from android_filesystem_config.h in the Android codebase.
    //
    // Android files on external storage: 100 IDs are reserved, starting at
    // PROJECT_ID_EXT_DEFAULT (1000).
    const ANDROID_FILES_FIRST: u32 = 1000;
    const ANDROID_FILES_LAST: u32 = 1099;
    // Android apps: the lower limit is PROJECT_ID_EXT_DATA_START. The upper limit differs
    // before and after T; the one from T (PROJECT_ID_APP_CACHE_END) is used because it is
    // larger.
    const ANDROID_APPS_FIRST: u32 = 20000;
    const ANDROID_APPS_LAST: u32 = 69999;

    matches!(
        project_id,
        ANDROID_FILES_FIRST..=ANDROID_FILES_LAST | ANDROID_APPS_FIRST..=ANDROID_APPS_LAST
    )
}
639
/// Per-directory cache for `PassthroughFs::ascii_casefold_lookup()`.
///
/// The key of the underlying `BTreeMap` is the ASCII-lower-cased name of a file in the directory.
/// The value is the case-sensitive file name stored in the host file system.
/// We assume that if `PassthroughFs` has exclusive access to the filesystem, this cache
/// exhaustively covers all file names that exist within the directory.
/// So every `PassthroughFs` handler that adds or removes files in the directory is expected to
/// update this cache.
struct CasefoldCache(BTreeMap<Vec<u8>, CString>);
649
650impl CasefoldCache {
651    fn new(dir: &InodeData) -> io::Result<Self> {
652        let mut mp = BTreeMap::new();
653
654        let mut buf = [0u8; 1024];
655        let mut offset = 0;
656        loop {
657            let mut read_dir = ReadDir::new(dir, offset, &mut buf[..])?;
658            if read_dir.remaining() == 0 {
659                break;
660            }
661
662            while let Some(entry) = read_dir.next() {
663                offset = entry.offset as libc::off64_t;
664                let entry_name = entry.name;
665                mp.insert(
666                    entry_name.to_bytes().to_ascii_lowercase(),
667                    entry_name.to_owned(),
668                );
669            }
670        }
671        Ok(Self(mp))
672    }
673
674    fn insert(&mut self, name: &CStr) {
675        let lower_case = name.to_bytes().to_ascii_lowercase();
676        self.0.insert(lower_case, name.into());
677    }
678
679    fn lookup(&self, name: &[u8]) -> Option<CString> {
680        let lower = name.to_ascii_lowercase();
681        self.0.get(&lower).cloned()
682    }
683
684    fn remove(&mut self, name: &CStr) {
685        let lower_case = name.to_bytes().to_ascii_lowercase();
686        self.0.remove(&lower_case);
687    }
688}
689
/// Time-expiring mapping from the inode of a directory to the `CasefoldCache` for that directory.
/// Each entry expires after `timeout`.
/// When ascii_casefold is disabled, this struct does nothing.
struct ExpiringCasefoldLookupCaches {
    inner: ExpiringMap<Inode, CasefoldCache>,
}
696
697impl ExpiringCasefoldLookupCaches {
698    fn new(timeout: Duration) -> Self {
699        Self {
700            inner: ExpiringMap::new(timeout),
701        }
702    }
703
704    fn insert(&mut self, parent: Inode, name: &CStr) {
705        if let Some(dir_cache) = self.inner.get_mut(&parent) {
706            dir_cache.insert(name);
707        }
708    }
709
710    fn remove(&mut self, parent: Inode, name: &CStr) {
711        if let Some(dir_cache) = self.inner.get_mut(&parent) {
712            dir_cache.remove(name);
713        }
714    }
715
716    fn forget(&mut self, parent: Inode) {
717        self.inner.remove(&parent);
718    }
719
720    /// Get `CasefoldCache` for the given directory.
721    /// If the cache doesn't exist, generate it by fetching directory information with
722    /// `getdents64()`.
723    fn get(&mut self, parent: &InodeData) -> io::Result<&CasefoldCache> {
724        self.inner
725            .get_or_insert_with(&parent.inode, || CasefoldCache::new(parent))
726    }
727
728    #[cfg(test)]
729    fn exists_in_cache(&mut self, parent: Inode, name: &CStr) -> bool {
730        if let Some(dir_cache) = self.inner.get(&parent) {
731            dir_cache.lookup(name.to_bytes()).is_some()
732        } else {
733            false
734        }
735    }
736}
737
#[cfg(feature = "fs_permission_translation")]
impl PermissionData {
    /// Returns whether `path` starts with this entry's `perm_path`.
    ///
    /// NOTE(review): this looks like a plain string-prefix test rather than a path-component
    /// test, so a `perm_path` of `/foo` would presumably also match `/foobar` — confirm that
    /// this is intended.
    pub(crate) fn need_set_permission(&self, path: &str) -> bool {
        path.starts_with(&self.perm_path)
    }
}
744
/// A file system that simply "passes through" all requests it receives to the underlying file
/// system. To keep the implementation simple it serves the contents of its root directory. Users
/// that wish to serve only a specific directory should set up the environment so that that
/// directory ends up as the root of the file system process. One way to accomplish this is via a
/// combination of mount namespaces and the pivot_root system call.
///
/// # Safety
///
/// The `Drop` implementation for this struct intentionally leaks all open file
/// descriptors. It is **critical** that an instance of `PassthroughFs` is
/// only dropped immediately prior to process termination. Failure to uphold
/// this invariant **will** result in resource leaks. This is a deliberate
/// performance optimization for abrupt shutdowns, where we let the OS
/// handle resource cleanup.
pub struct PassthroughFs {
    // Mutex that must be acquired before executing a process-wide operation such as fchdir.
    process_lock: Mutex<()>,
    // virtio-fs tag that the guest uses when mounting. This is only used for debugging
    // when tracing is enabled.
    tag: String,

    // File descriptors for various points in the file system tree.
    inodes: Mutex<MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>>,
    next_inode: AtomicU64,

    // File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be
    // used for reading and writing data.
    handles: Mutex<BTreeMap<Handle, Arc<HandleData>>>,
    next_handle: AtomicU64,

    // File descriptor pointing to the `/proc` directory. This is used to convert an fd from
    // `inodes` into one that can go into `handles`. This is accomplished by reading the
    // `self/fd/{}` symlink. We keep an open fd here in case the file system tree that we are meant
    // to be serving doesn't have access to `/proc`.
    proc: File,

    // Whether writeback caching is enabled for this directory. This will only be true when
    // `cfg.writeback` is true and `init` was called with `FsOptions::WRITEBACK_CACHE`.
    writeback: AtomicBool,

    // Whether zero message opens are supported by the kernel driver.
    zero_message_open: AtomicBool,

    // Whether zero message opendir is supported by the kernel driver.
    zero_message_opendir: AtomicBool,

    // Used to communicate with other processes using D-Bus.
    #[cfg(feature = "arc_quota")]
    dbus_connection: Option<Mutex<dbus::blocking::Connection>>,
    #[cfg(feature = "arc_quota")]
    dbus_fd: Option<std::os::unix::io::RawFd>,

    // Time-expiring cache for `ascii_casefold_lookup()`.
    // The key is an inode of a directory, and the value is a cache for the directory.
    // Each value will be expired `cfg.timeout` after it's created.
    //
    // TODO(b/267748212): Instead of per-device Mutex, we might want to have per-directory Mutex
    // if we use PassthroughFs in multi-threaded environments.
    expiring_casefold_lookup_caches: Option<Mutex<ExpiringCasefoldLookupCaches>>,

    // Paths and corresponding permission settings set by the `crosvm_client_fs_permission_set`
    // API.
    #[cfg(feature = "fs_permission_translation")]
    permission_paths: RwLock<Vec<PermissionData>>,

    // Paths and corresponding xattr settings set by the `crosvm_client_fs_xattr_set` API.
    #[cfg(feature = "arc_quota")]
    xattr_paths: RwLock<Vec<XattrData>>,

    cfg: Config,

    // Set the root directory when pivot root isn't enabled for jailed process.
    //
    // virtio-fs typically uses mount namespaces and pivot_root for file system isolation,
    // making the jailed process's root directory "/".
    //
    // However, Android's security model prevents crosvm from having the necessary SYS_ADMIN
    // capability for mount namespaces and pivot_root. This lack of isolation means that
    // root_dir defaults to the path provided via "--shared-dir".
    root_dir: String,
}
825
826impl std::fmt::Debug for PassthroughFs {
827    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
828        f.debug_struct("PassthroughFs")
829            .field("tag", &self.tag)
830            .field("next_inode", &self.next_inode)
831            .field("next_handle", &self.next_handle)
832            .field("proc", &self.proc)
833            .field("writeback", &self.writeback)
834            .field("zero_message_open", &self.zero_message_open)
835            .field("zero_message_opendir", &self.zero_message_opendir)
836            .field("cfg", &self.cfg)
837            .finish()
838    }
839}
840
841impl PassthroughFs {
842    pub fn new(tag: &str, cfg: Config) -> io::Result<PassthroughFs> {
843        // SAFETY: this doesn't modify any memory and we check the return value.
844        let raw_descriptor = syscall!(unsafe {
845            libc::openat64(
846                libc::AT_FDCWD,
847                PROC_CSTR.as_ptr(),
848                libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC,
849            )
850        })?;
851
852        // Privileged UIDs can use D-Bus to perform some operations.
853        #[cfg(feature = "arc_quota")]
854        let (dbus_connection, dbus_fd) = if cfg.privileged_quota_uids.is_empty() {
855            (None, None)
856        } else {
857            let mut channel = dbus::channel::Channel::get_private(dbus::channel::BusType::System)
858                .map_err(io::Error::other)?;
859            channel.set_watch_enabled(true);
860            let dbus_fd = channel.watch().fd;
861            channel.set_watch_enabled(false);
862            (
863                Some(Mutex::new(dbus::blocking::Connection::from(channel))),
864                Some(dbus_fd),
865            )
866        };
867
868        // SAFETY: safe because we just opened this descriptor.
869        let proc = unsafe { File::from_raw_descriptor(raw_descriptor) };
870
871        let expiring_casefold_lookup_caches = if cfg.ascii_casefold {
872            Some(Mutex::new(ExpiringCasefoldLookupCaches::new(cfg.timeout)))
873        } else {
874            None
875        };
876
877        #[allow(unused_mut)]
878        let mut passthroughfs = PassthroughFs {
879            process_lock: Mutex::new(()),
880            tag: tag.to_string(),
881            inodes: Mutex::new(MultikeyBTreeMap::new()),
882            next_inode: AtomicU64::new(ROOT_ID + 1),
883
884            handles: Mutex::new(BTreeMap::new()),
885            next_handle: AtomicU64::new(1),
886
887            proc,
888
889            writeback: AtomicBool::new(false),
890            zero_message_open: AtomicBool::new(false),
891            zero_message_opendir: AtomicBool::new(false),
892
893            #[cfg(feature = "arc_quota")]
894            dbus_connection,
895            #[cfg(feature = "arc_quota")]
896            dbus_fd,
897            expiring_casefold_lookup_caches,
898            #[cfg(feature = "fs_permission_translation")]
899            permission_paths: RwLock::new(Vec::new()),
900            #[cfg(feature = "arc_quota")]
901            xattr_paths: RwLock::new(Vec::new()),
902            cfg,
903            root_dir: "/".to_string(),
904        };
905
906        #[cfg(feature = "fs_runtime_ugid_map")]
907        passthroughfs.set_permission_path();
908
909        cros_tracing::trace_simple_print!(
910            VirtioFs,
911            "New PassthroughFS initialized: {:?}",
912            passthroughfs
913        );
914        Ok(passthroughfs)
915    }
916
917    #[cfg(feature = "fs_runtime_ugid_map")]
918    fn set_permission_path(&mut self) {
919        if !self.cfg.ugid_map.is_empty() {
920            let mut write_lock = self
921                .permission_paths
922                .write()
923                .expect("Failed to acquire write lock on permission_paths");
924            *write_lock = self.cfg.ugid_map.clone();
925        }
926    }
927
928    #[cfg(feature = "fs_runtime_ugid_map")]
929    pub fn set_root_dir(&mut self, shared_dir: String) -> io::Result<()> {
930        let canonicalized_root = match std::fs::canonicalize(shared_dir) {
931            Ok(path) => path,
932            Err(e) => {
933                return Err(io::Error::new(
934                    io::ErrorKind::InvalidInput,
935                    format!("Failed to canonicalize root_dir: {e}"),
936                ));
937            }
938        };
939        self.root_dir = canonicalized_root.to_string_lossy().to_string();
940        Ok(())
941    }
942
943    pub fn cfg(&self) -> &Config {
944        &self.cfg
945    }
946
947    pub fn keep_rds(&self) -> Vec<RawDescriptor> {
948        #[cfg_attr(not(feature = "arc_quota"), allow(unused_mut))]
949        let mut keep_rds = vec![self.proc.as_raw_descriptor()];
950        #[cfg(feature = "arc_quota")]
951        if let Some(fd) = self.dbus_fd {
952            keep_rds.push(fd);
953        }
954        keep_rds
955    }
956
957    fn rewrite_xattr_name<'xattr>(&self, name: &'xattr CStr) -> Cow<'xattr, CStr> {
958        if !self.cfg.rewrite_security_xattrs {
959            return Cow::Borrowed(name);
960        }
961
962        // Does not include nul-terminator.
963        let buf = name.to_bytes();
964        if !buf.starts_with(SECURITY_XATTR) || buf == SELINUX_XATTR {
965            return Cow::Borrowed(name);
966        }
967
968        let mut newname = USER_VIRTIOFS_XATTR.to_vec();
969        newname.extend_from_slice(buf);
970
971        // The unwrap is safe here because the prefix doesn't contain any interior nul-bytes and the
972        // to_bytes() call above will not return a byte slice with any interior nul-bytes either.
973        Cow::Owned(CString::new(newname).expect("Failed to re-write xattr name"))
974    }
975
976    fn find_inode(&self, inode: Inode) -> io::Result<Arc<InodeData>> {
977        self.inodes.lock().get(&inode).cloned().ok_or_else(ebadf)
978    }
979
980    fn find_handle(&self, handle: Handle, inode: Inode) -> io::Result<Arc<HandleData>> {
981        self.handles
982            .lock()
983            .get(&handle)
984            .filter(|hd| hd.inode == inode)
985            .cloned()
986            .ok_or_else(ebadf)
987    }
988
989    fn open_fd(&self, fd: RawDescriptor, flags: i32) -> io::Result<File> {
990        let pathname = CString::new(format!("self/fd/{fd}"))
991            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
992
993        // SAFETY: this doesn't modify any memory and we check the return value. We don't really
994        // check `flags` because if the kernel can't handle poorly specified flags then we have
995        // much bigger problems. Also, clear the `O_NOFOLLOW` flag if it is set since we need
996        // to follow the `/proc/self/fd` symlink to get the file.
997        let raw_descriptor = syscall!(unsafe {
998            libc::openat64(
999                self.proc.as_raw_descriptor(),
1000                pathname.as_ptr(),
1001                (flags | libc::O_CLOEXEC) & !(libc::O_NOFOLLOW | libc::O_DIRECT),
1002            )
1003        })?;
1004
1005        // SAFETY: safe because we just opened this descriptor.
1006        Ok(unsafe { File::from_raw_descriptor(raw_descriptor) })
1007    }
1008
1009    /// Modifies the provided open flags based on the writeback caching configuration.
1010    /// Return the updated open flags.
1011    fn update_open_flags(&self, mut flags: i32) -> i32 {
1012        // When writeback caching is enabled, the kernel may send read requests even if the
1013        // userspace program opened the file write-only. So we need to ensure that we have opened
1014        // the file for reading as well as writing.
1015        let writeback = self.writeback.load(Ordering::Relaxed);
1016        if writeback && flags & libc::O_ACCMODE == libc::O_WRONLY {
1017            flags &= !libc::O_ACCMODE;
1018            flags |= libc::O_RDWR;
1019        }
1020
1021        // When writeback caching is enabled the kernel is responsible for handling `O_APPEND`.
1022        // However, this breaks atomicity as the file may have changed on disk, invalidating the
1023        // cached copy of the data in the kernel and the offset that the kernel thinks is the end of
1024        // the file. Just allow this for now as it is the user's responsibility to enable writeback
1025        // caching only for directories that are not shared. It also means that we need to clear the
1026        // `O_APPEND` flag.
1027        if writeback && flags & libc::O_APPEND != 0 {
1028            flags &= !libc::O_APPEND;
1029        }
1030
1031        flags
1032    }
1033
1034    fn open_inode(&self, inode: &InodeData, mut flags: i32) -> io::Result<File> {
1035        // handle writeback caching cases
1036        flags = self.update_open_flags(flags);
1037
1038        self.open_fd(inode.as_raw_descriptor(), flags)
1039    }
1040
1041    // Increases the inode refcount and returns the inode.
1042    fn increase_inode_refcount(&self, inode_data: &InodeData) -> Inode {
1043        // Matches with the release store in `forget`.
1044        inode_data.refcount.fetch_add(1, Ordering::Acquire);
1045        inode_data.inode
1046    }
1047
1048    // Creates a new entry for `f` or increases the refcount of the existing entry for `f`.
1049    // The inodes mutex lock must not be already taken by the same thread otherwise this
1050    // will deadlock.
1051    fn add_entry(
1052        &self,
1053        f: File,
1054        #[cfg_attr(not(feature = "fs_permission_translation"), allow(unused_mut))]
1055        mut st: libc::stat64,
1056        open_flags: libc::c_int,
1057        path: String,
1058    ) -> Entry {
1059        #[cfg(feature = "arc_quota")]
1060        self.set_permission(&mut st, &path);
1061        #[cfg(feature = "fs_runtime_ugid_map")]
1062        self.set_ugid_permission(&mut st, &path);
1063        let mut inodes = self.inodes.lock();
1064
1065        let altkey = InodeAltKey {
1066            ino: st.st_ino,
1067            dev: st.st_dev,
1068        };
1069
1070        let inode = if let Some(data) = inodes.get_alt(&altkey) {
1071            self.increase_inode_refcount(data)
1072        } else {
1073            let inode = self.next_inode.fetch_add(1, Ordering::Relaxed);
1074            inodes.insert(
1075                inode,
1076                altkey,
1077                Arc::new(InodeData {
1078                    inode,
1079                    file: Mutex::new(OpenedFile::new(f, open_flags)),
1080                    refcount: AtomicU64::new(1),
1081                    filetype: st.st_mode.into(),
1082                    path,
1083                    unsafe_leak_fd: AtomicBool::new(false),
1084                }),
1085            );
1086
1087            inode
1088        };
1089
1090        Entry {
1091            inode,
1092            generation: 0,
1093            attr: st,
1094            // We use the same timeout for the attribute and the entry.
1095            attr_timeout: self.cfg.timeout,
1096            entry_timeout: self.cfg.timeout,
1097        }
1098    }
1099
1100    /// Acquires lock of `expiring_casefold_lookup_caches` if `ascii_casefold` is enabled.
1101    fn lock_casefold_lookup_caches(&self) -> Option<MutexGuard<'_, ExpiringCasefoldLookupCaches>> {
1102        self.expiring_casefold_lookup_caches
1103            .as_ref()
1104            .map(|c| c.lock())
1105    }
1106
1107    // Returns an actual case-sensitive file name that matches with the given `name`.
1108    // Returns `Ok(None)` if no file matches with the give `name`.
1109    // This function will panic if casefold is not enabled.
1110    fn get_case_unfolded_name(
1111        &self,
1112        parent: &InodeData,
1113        name: &[u8],
1114    ) -> io::Result<Option<CString>> {
1115        let mut caches = self
1116            .lock_casefold_lookup_caches()
1117            .expect("casefold must be enabled");
1118        let dir_cache = caches.get(parent)?;
1119        Ok(dir_cache.lookup(name))
1120    }
1121
1122    // Performs an ascii case insensitive lookup.
1123    fn ascii_casefold_lookup(&self, parent: &InodeData, name: &[u8]) -> io::Result<Entry> {
1124        match self.get_case_unfolded_name(parent, name)? {
1125            None => Err(io::Error::from_raw_os_error(libc::ENOENT)),
1126            Some(actual_name) => self.do_lookup(parent, &actual_name),
1127        }
1128    }
1129
1130    #[cfg(test)]
1131    fn exists_in_casefold_cache(&self, parent: Inode, name: &CStr) -> bool {
1132        let mut cache = self
1133            .lock_casefold_lookup_caches()
1134            .expect("casefold must be enabled");
1135        cache.exists_in_cache(parent, name)
1136    }
1137
1138    fn do_lookup(&self, parent: &InodeData, name: &CStr) -> io::Result<Entry> {
1139        #[cfg_attr(not(feature = "fs_permission_translation"), allow(unused_mut))]
1140        let mut st = statat(parent, name)?;
1141
1142        let altkey = InodeAltKey {
1143            ino: st.st_ino,
1144            dev: st.st_dev,
1145        };
1146
1147        let path = format!(
1148            "{}/{}",
1149            parent.path.clone(),
1150            name.to_str().unwrap_or("<non UTF-8 str>")
1151        );
1152
1153        // Check if we already have an entry before opening a new file.
1154        if let Some(data) = self.inodes.lock().get_alt(&altkey) {
1155            // Return the same inode with the reference counter increased.
1156            #[cfg(feature = "arc_quota")]
1157            self.set_permission(&mut st, &path);
1158            #[cfg(feature = "fs_runtime_ugid_map")]
1159            self.set_ugid_permission(&mut st, &path);
1160            return Ok(Entry {
1161                inode: self.increase_inode_refcount(data),
1162                generation: 0,
1163                attr: st,
1164                // We use the same timeout for the attribute and the entry.
1165                attr_timeout: self.cfg.timeout,
1166                entry_timeout: self.cfg.timeout,
1167            });
1168        }
1169
1170        // Open a regular file with O_RDONLY to store in `InodeData` so explicit open requests can
1171        // be skipped later if the ZERO_MESSAGE_{OPEN,OPENDIR} features are enabled.
1172        // If the crosvm process doesn't have a read permission, fall back to O_PATH below.
1173        let mut flags = libc::O_RDONLY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
1174        match FileType::from(st.st_mode) {
1175            FileType::Regular => {}
1176            FileType::Directory => flags |= libc::O_DIRECTORY,
1177            FileType::Other => flags |= libc::O_PATH,
1178        };
1179
1180        // SAFETY: this doesn't modify any memory and we check the return value.
1181        let fd = match unsafe {
1182            syscall!(libc::openat64(
1183                parent.as_raw_descriptor(),
1184                name.as_ptr(),
1185                flags
1186            ))
1187        } {
1188            Ok(fd) => fd,
1189            Err(e) if e.errno() == libc::EACCES => {
1190                // If O_RDONLY is unavailable, fall back to O_PATH to get an FD to store in
1191                // `InodeData`.
1192                // Note that some operations which should be allowed without read permissions
1193                // require syscalls that don't support O_PATH fds. For those syscalls, we will
1194                // need to fall back to their path-based equivalents with /self/fd/${FD}.
1195                // e.g. `fgetxattr()` for an O_PATH FD fails while `getxaattr()` for /self/fd/${FD}
1196                // works.
1197                flags |= libc::O_PATH;
1198                // SAFETY: this doesn't modify any memory and we check the return value.
1199                unsafe {
1200                    syscall!(libc::openat64(
1201                        parent.as_raw_descriptor(),
1202                        name.as_ptr(),
1203                        flags
1204                    ))
1205                }?
1206            }
1207            Err(e) => {
1208                return Err(e.into());
1209            }
1210        };
1211
1212        // SAFETY: safe because we own the fd.
1213        let f = unsafe { File::from_raw_descriptor(fd) };
1214        // We made sure the lock acquired for `self.inodes` is released automatically when
1215        // the if block above is exited, so a call to `self.add_entry()` should not cause a deadlock
1216        // here. This would not be the case if this were executed in an else block instead.
1217        Ok(self.add_entry(f, st, flags, path))
1218    }
1219
1220    fn get_cache_open_options(&self, flags: u32) -> OpenOptions {
1221        let mut opts = OpenOptions::empty();
1222        match self.cfg.cache_policy {
1223            // We only set the direct I/O option on files.
1224            CachePolicy::Never => opts.set(
1225                OpenOptions::DIRECT_IO,
1226                flags & (libc::O_DIRECTORY as u32) == 0,
1227            ),
1228            CachePolicy::Always => {
1229                opts |= if flags & (libc::O_DIRECTORY as u32) == 0 {
1230                    OpenOptions::KEEP_CACHE
1231                } else {
1232                    OpenOptions::CACHE_DIR
1233                }
1234            }
1235            _ => {}
1236        };
1237        opts
1238    }
1239
1240    // Performs lookup using original name first, if it fails and ascii_casefold is enabled,
1241    // it tries to unfold the name and do lookup again.
1242    fn do_lookup_with_casefold_fallback(
1243        &self,
1244        parent: &InodeData,
1245        name: &CStr,
1246    ) -> io::Result<Entry> {
1247        let mut res = self.do_lookup(parent, name);
1248        // If `ascii_casefold` is enabled, fallback to `ascii_casefold_lookup()`.
1249        if res.is_err() && self.cfg.ascii_casefold {
1250            res = self.ascii_casefold_lookup(parent, name.to_bytes());
1251        }
1252        res
1253    }
1254
1255    fn do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)> {
1256        let inode_data = self.find_inode(inode)?;
1257
1258        let file = self.open_inode(&inode_data, flags as i32)?;
1259
1260        let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
1261        let data = HandleData {
1262            inode,
1263            file: Mutex::new(OpenedFile::new(file, flags as i32)),
1264            unsafe_leak_fd: AtomicBool::new(false),
1265        };
1266
1267        self.handles.lock().insert(handle, Arc::new(data));
1268
1269        let opts = self.get_cache_open_options(flags);
1270
1271        Ok((Some(handle), opts))
1272    }
1273
1274    fn do_open_at(
1275        &self,
1276        parent_data: Arc<InodeData>,
1277        name: &CStr,
1278        inode: Inode,
1279        flags: u32,
1280    ) -> io::Result<(Option<Handle>, OpenOptions)> {
1281        let open_flags = self.update_open_flags(flags as i32);
1282
1283        let fd_open = syscall!(
1284            // SAFETY: return value is checked.
1285            unsafe {
1286                libc::openat64(
1287                    parent_data.as_raw_descriptor(),
1288                    name.as_ptr(),
1289                    (open_flags | libc::O_CLOEXEC) & !(libc::O_NOFOLLOW | libc::O_DIRECT),
1290                )
1291            }
1292        )?;
1293
1294        // SAFETY: fd_open is valid
1295        let file_open = unsafe { File::from_raw_descriptor(fd_open) };
1296        let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
1297        let data = HandleData {
1298            inode,
1299            file: Mutex::new(OpenedFile::new(file_open, open_flags)),
1300            unsafe_leak_fd: AtomicBool::new(false),
1301        };
1302
1303        self.handles.lock().insert(handle, Arc::new(data));
1304
1305        let opts = self.get_cache_open_options(open_flags as u32);
1306        Ok((Some(handle), opts))
1307    }
1308
1309    fn do_release(&self, inode: Inode, handle: Handle) -> io::Result<()> {
1310        let mut handles = self.handles.lock();
1311
1312        if let btree_map::Entry::Occupied(e) = handles.entry(handle) {
1313            if e.get().inode == inode {
1314                // We don't need to close the file here because that will happen automatically when
1315                // the last `Arc` is dropped.
1316                e.remove();
1317                return Ok(());
1318            }
1319        }
1320
1321        Err(ebadf())
1322    }
1323
1324    fn do_getattr(&self, inode: &InodeData) -> io::Result<(libc::stat64, Duration)> {
1325        #[allow(unused_mut)]
1326        let mut st = stat(inode)?;
1327
1328        #[cfg(feature = "arc_quota")]
1329        self.set_permission(&mut st, &inode.path);
1330        #[cfg(feature = "fs_runtime_ugid_map")]
1331        self.set_ugid_permission(&mut st, &inode.path);
1332        Ok((st, self.cfg.timeout))
1333    }
1334
1335    fn do_unlink(&self, parent: &InodeData, name: &CStr, flags: libc::c_int) -> io::Result<()> {
1336        // SAFETY: this doesn't modify any memory and we check the return value.
1337        syscall!(unsafe { libc::unlinkat(parent.as_raw_descriptor(), name.as_ptr(), flags) })?;
1338        Ok(())
1339    }
1340
1341    fn do_fsync<F: AsRawDescriptor>(&self, file: &F, datasync: bool) -> io::Result<()> {
1342        // SAFETY: this doesn't modify any memory and we check the return value.
1343        syscall!(unsafe {
1344            if datasync {
1345                libc::fdatasync(file.as_raw_descriptor())
1346            } else {
1347                libc::fsync(file.as_raw_descriptor())
1348            }
1349        })?;
1350
1351        Ok(())
1352    }
1353
1354    // Changes the CWD to `self.proc`, runs `f`, and then changes the CWD back to the root
1355    // directory. This effectively emulates an *at syscall starting at /proc, which is useful when
1356    // there is no *at syscall available. Panics if any of the fchdir calls fail or if there is no
1357    // root inode.
1358    //
1359    // NOTE: this method acquires an `self`-wide lock. If any locks are acquired in `f`, care must
1360    // be taken to avoid the risk of deadlocks.
1361    fn with_proc_chdir<F, T>(&self, f: F) -> T
1362    where
1363        F: FnOnce() -> T,
1364    {
1365        let root = self.find_inode(ROOT_ID).expect("failed to find root inode");
1366
1367        // Acquire a lock for `fchdir`.
1368        let _proc_lock = self.process_lock.lock();
1369        // SAFETY: this doesn't modify any memory and we check the return value. Since the
1370        // fchdir should never fail we just use debug_asserts.
1371        let proc_cwd = unsafe { libc::fchdir(self.proc.as_raw_descriptor()) };
1372        debug_assert_eq!(
1373            proc_cwd,
1374            0,
1375            "failed to fchdir to /proc: {}",
1376            io::Error::last_os_error()
1377        );
1378
1379        let res = f();
1380
1381        // SAFETY: this doesn't modify any memory and we check the return value. Since the
1382        // fchdir should never fail we just use debug_asserts.
1383        let root_cwd = unsafe { libc::fchdir(root.as_raw_descriptor()) };
1384        debug_assert_eq!(
1385            root_cwd,
1386            0,
1387            "failed to fchdir back to root directory: {}",
1388            io::Error::last_os_error()
1389        );
1390
1391        res
1392    }
1393
1394    fn do_getxattr(&self, inode: &InodeData, name: &CStr, value: &mut [u8]) -> io::Result<usize> {
1395        let file = inode.file.lock();
1396        let o_path_file = (file.open_flags & libc::O_PATH) != 0;
1397        let res = if o_path_file {
1398            // For FDs opened with `O_PATH`, we cannot call `fgetxattr` normally. Instead we
1399            // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
1400            //  and then setting the CWD back to the root directory.
1401            let path = CString::new(format!("self/fd/{}", file.as_raw_descriptor()))
1402                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1403
1404            // SAFETY: this will only modify `value` and we check the return value.
1405            self.with_proc_chdir(|| unsafe {
1406                libc::getxattr(
1407                    path.as_ptr(),
1408                    name.as_ptr(),
1409                    value.as_mut_ptr() as *mut libc::c_void,
1410                    value.len() as libc::size_t,
1411                )
1412            })
1413        } else {
1414            // For regular files and directories, we can just use fgetxattr.
1415            // SAFETY: this will only write to `value` and we check the return value.
1416            unsafe {
1417                libc::fgetxattr(
1418                    file.as_raw_descriptor(),
1419                    name.as_ptr(),
1420                    value.as_mut_ptr() as *mut libc::c_void,
1421                    value.len() as libc::size_t,
1422                )
1423            }
1424        };
1425
1426        if res < 0 {
1427            Err(io::Error::last_os_error())
1428        } else {
1429            Ok(res as usize)
1430        }
1431    }
1432
1433    fn get_encryption_policy_ex<R: io::Read>(
1434        &self,
1435        inode: Inode,
1436        handle: Handle,
1437        mut r: R,
1438    ) -> io::Result<IoctlReply> {
1439        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1440            self.find_inode(inode)?
1441        } else {
1442            self.find_handle(handle, inode)?
1443        };
1444
1445        // SAFETY: this struct only has integer fields and any value is valid.
1446        let mut arg = unsafe { MaybeUninit::<fscrypt_get_policy_ex_arg>::zeroed().assume_init() };
1447        r.read_exact(arg.policy_size.as_mut_bytes())?;
1448
1449        let policy_size = cmp::min(arg.policy_size, size_of::<fscrypt_policy>() as u64);
1450        arg.policy_size = policy_size;
1451
1452        let res =
1453            // SAFETY: the kernel will only write to `arg` and we check the return value.
1454            unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GET_ENCRYPTION_POLICY_EX, &mut arg) };
1455        if res < 0 {
1456            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1457        } else {
1458            let len = size_of::<u64>() + arg.policy_size as usize;
1459            Ok(IoctlReply::Done(Ok(<&[u8]>::from(&arg)[..len].to_vec())))
1460        }
1461    }
1462
1463    fn get_fsxattr(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
1464        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1465            self.find_inode(inode)?
1466        } else {
1467            self.find_handle(handle, inode)?
1468        };
1469
1470        let mut buf = MaybeUninit::<fsxattr>::zeroed();
1471
1472        // SAFETY: the kernel will only write to `buf` and we check the return value.
1473        let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR, buf.as_mut_ptr()) };
1474        if res < 0 {
1475            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1476        } else {
1477            // SAFETY: the kernel guarantees that the policy is now initialized.
1478            let xattr = unsafe { buf.assume_init() };
1479            Ok(IoctlReply::Done(Ok(xattr.as_bytes().to_vec())))
1480        }
1481    }
1482
1483    fn set_fsxattr<R: io::Read>(
1484        &self,
1485        #[cfg_attr(not(feature = "arc_quota"), allow(unused_variables))] ctx: Context,
1486        inode: Inode,
1487        handle: Handle,
1488        mut r: R,
1489    ) -> io::Result<IoctlReply> {
1490        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1491            self.find_inode(inode)?
1492        } else {
1493            self.find_handle(handle, inode)?
1494        };
1495
1496        let mut in_attr = fsxattr::new_zeroed();
1497        r.read_exact(in_attr.as_mut_bytes())?;
1498
1499        #[cfg(feature = "arc_quota")]
1500        let st = stat(&*data)?;
1501
1502        #[cfg(feature = "arc_quota")]
1503        let ctx_uid = self.lookup_host_uid(&ctx, inode);
1504
1505        // Changing quota project ID requires CAP_FOWNER or being file owner.
1506        // Here we use privileged_quota_uids because we cannot perform a CAP_FOWNER check.
1507        #[cfg(feature = "arc_quota")]
1508        if ctx_uid == st.st_uid || self.cfg.privileged_quota_uids.contains(&ctx_uid) {
1509            // Get the current fsxattr.
1510            let mut buf = MaybeUninit::<fsxattr>::zeroed();
1511            // SAFETY: the kernel will only write to `buf` and we check the return value.
1512            let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR, buf.as_mut_ptr()) };
1513            if res < 0 {
1514                return Ok(IoctlReply::Done(Err(io::Error::last_os_error())));
1515            }
1516            // SAFETY: the kernel guarantees that the policy is now initialized.
1517            let current_attr = unsafe { buf.assume_init() };
1518
1519            // Project ID cannot be changed inside a user namespace.
1520            // Use Spaced to avoid this restriction.
1521            if current_attr.fsx_projid != in_attr.fsx_projid {
1522                let connection = self.dbus_connection.as_ref().unwrap().lock();
1523                let proxy = connection.with_proxy(
1524                    "org.chromium.Spaced",
1525                    "/org/chromium/Spaced",
1526                    DEFAULT_DBUS_TIMEOUT,
1527                );
1528                let project_id = in_attr.fsx_projid;
1529                if !is_android_project_id(project_id) {
1530                    return Err(io::Error::from_raw_os_error(libc::EINVAL));
1531                }
1532                let file_clone = base::SafeDescriptor::try_from(&*data)?;
1533                match proxy.set_project_id(file_clone.into(), project_id) {
1534                    Ok(r) => {
1535                        let r = SetProjectIdReply::parse_from_bytes(&r)
1536                            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1537                        if !r.success {
1538                            return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1539                                r.error,
1540                            ))));
1541                        }
1542                    }
1543                    Err(e) => {
1544                        return Err(io::Error::other(e));
1545                    }
1546                };
1547            }
1548        }
1549
1550        //  SAFETY: this doesn't modify any memory and we check the return value.
1551        let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_FSSETXATTR, &in_attr) };
1552        if res < 0 {
1553            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1554        } else {
1555            Ok(IoctlReply::Done(Ok(Vec::new())))
1556        }
1557    }
1558
1559    fn get_flags(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
1560        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1561            self.find_inode(inode)?
1562        } else {
1563            self.find_handle(handle, inode)?
1564        };
1565
1566        // The ioctl encoding is a long but the parameter is actually an int.
1567        let mut flags: c_int = 0;
1568
1569        // SAFETY: the kernel will only write to `flags` and we check the return value.
1570        let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GETFLAGS, &mut flags) };
1571        if res < 0 {
1572            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1573        } else {
1574            Ok(IoctlReply::Done(Ok(flags.to_ne_bytes().to_vec())))
1575        }
1576    }
1577
1578    fn set_flags<R: io::Read>(
1579        &self,
1580        #[cfg_attr(not(feature = "arc_quota"), allow(unused_variables))] ctx: Context,
1581        inode: Inode,
1582        handle: Handle,
1583        mut r: R,
1584    ) -> io::Result<IoctlReply> {
1585        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1586            self.find_inode(inode)?
1587        } else {
1588            self.find_handle(handle, inode)?
1589        };
1590
1591        // The ioctl encoding is a long but the parameter is actually an int.
1592        let mut in_flags: c_int = 0;
1593        r.read_exact(in_flags.as_mut_bytes())?;
1594
1595        #[cfg(feature = "arc_quota")]
1596        let st = stat(&*data)?;
1597
1598        #[cfg(feature = "arc_quota")]
1599        let ctx_uid = self.lookup_host_uid(&ctx, inode);
1600
1601        // Only privleged uid can perform FS_IOC_SETFLAGS through cryptohome.
1602        #[cfg(feature = "arc_quota")]
1603        if ctx_uid == st.st_uid || self.cfg.privileged_quota_uids.contains(&ctx_uid) {
1604            // Get the current flag.
1605            let mut buf = MaybeUninit::<c_int>::zeroed();
1606            // SAFETY: the kernel will only write to `buf` and we check the return value.
1607            let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GETFLAGS, buf.as_mut_ptr()) };
1608            if res < 0 {
1609                return Ok(IoctlReply::Done(Err(io::Error::last_os_error())));
1610            }
1611            // SAFETY: the kernel guarantees that the policy is now initialized.
1612            let current_flags = unsafe { buf.assume_init() };
1613
1614            // Project inheritance flag cannot be changed inside a user namespace.
1615            // Use Spaced to avoid this restriction.
1616            if (in_flags & FS_PROJINHERIT_FL) != (current_flags & FS_PROJINHERIT_FL) {
1617                let connection = self.dbus_connection.as_ref().unwrap().lock();
1618                let proxy = connection.with_proxy(
1619                    "org.chromium.Spaced",
1620                    "/org/chromium/Spaced",
1621                    DEFAULT_DBUS_TIMEOUT,
1622                );
1623                // If the input flags contain FS_PROJINHERIT_FL, then it is a set. Otherwise it is a
1624                // reset.
1625                let enable = (in_flags & FS_PROJINHERIT_FL) == FS_PROJINHERIT_FL;
1626                let file_clone = base::SafeDescriptor::try_from(&*data)?;
1627                match proxy.set_project_inheritance_flag(file_clone.into(), enable) {
1628                    Ok(r) => {
1629                        let r = SetProjectInheritanceFlagReply::parse_from_bytes(&r)
1630                            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1631                        if !r.success {
1632                            return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1633                                r.error,
1634                            ))));
1635                        }
1636                    }
1637                    Err(e) => {
1638                        return Err(io::Error::other(e));
1639                    }
1640                };
1641            }
1642        }
1643
1644        // SAFETY: this doesn't modify any memory and we check the return value.
1645        let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_SETFLAGS, &in_flags) };
1646        if res < 0 {
1647            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1648        } else {
1649            Ok(IoctlReply::Done(Ok(Vec::new())))
1650        }
1651    }
1652
1653    fn enable_verity<R: io::Read>(
1654        &self,
1655        inode: Inode,
1656        handle: Handle,
1657        mut r: R,
1658    ) -> io::Result<IoctlReply> {
1659        let inode_data = self.find_inode(inode)?;
1660
1661        // These match the return codes from `fsverity_ioctl_enable` in the kernel.
1662        match inode_data.filetype {
1663            FileType::Regular => {}
1664            FileType::Directory => return Err(io::Error::from_raw_os_error(libc::EISDIR)),
1665            FileType::Other => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
1666        }
1667
1668        {
1669            // We cannot enable verity while holding a writable fd so get a new one, if necessary.
1670            let mut file = inode_data.file.lock();
1671            let mut flags = file.open_flags;
1672            match flags & libc::O_ACCMODE {
1673                libc::O_WRONLY | libc::O_RDWR => {
1674                    flags &= !libc::O_ACCMODE;
1675                    flags |= libc::O_RDONLY;
1676
1677                    // We need to get a read-only handle for this file.
1678                    let newfile = self.open_fd(file.as_raw_descriptor(), libc::O_RDONLY)?;
1679                    *file = OpenedFile::new(newfile, flags);
1680                }
1681                libc::O_RDONLY => {}
1682                _ => panic!("Unexpected flags: {flags:#x}"),
1683            }
1684        }
1685
1686        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1687            inode_data
1688        } else {
1689            let data = self.find_handle(handle, inode)?;
1690
1691            {
1692                // We can't enable verity while holding a writable fd. We don't know whether the
1693                // file was opened for writing so check it here. We don't expect
1694                // this to be a frequent operation so the extra latency should be
1695                // fine.
1696                let mut file = data.file.lock();
1697                let flags = FileFlags::from_file(&*file).map_err(io::Error::from)?;
1698                match flags {
1699                    FileFlags::ReadWrite | FileFlags::Write => {
1700                        // We need to get a read-only handle for this file.
1701                        *file = OpenedFile::new(
1702                            self.open_fd(file.as_raw_descriptor(), libc::O_RDONLY)?,
1703                            libc::O_RDONLY,
1704                        );
1705                    }
1706                    FileFlags::Read => {}
1707                }
1708            }
1709
1710            data
1711        };
1712
1713        let mut arg = fsverity_enable_arg::new_zeroed();
1714        r.read_exact(arg.as_mut_bytes())?;
1715
1716        let mut salt;
1717        if arg.salt_size > 0 {
1718            if arg.salt_size > self.max_buffer_size() {
1719                return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1720                    libc::ENOMEM,
1721                ))));
1722            }
1723            salt = vec![0; arg.salt_size as usize];
1724            r.read_exact(&mut salt)?;
1725            arg.salt_ptr = salt.as_ptr() as usize as u64;
1726        } else {
1727            arg.salt_ptr = 0;
1728        }
1729
1730        let mut sig;
1731        if arg.sig_size > 0 {
1732            if arg.sig_size > self.max_buffer_size() {
1733                return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1734                    libc::ENOMEM,
1735                ))));
1736            }
1737            sig = vec![0; arg.sig_size as usize];
1738            r.read_exact(&mut sig)?;
1739            arg.sig_ptr = sig.as_ptr() as usize as u64;
1740        } else {
1741            arg.sig_ptr = 0;
1742        }
1743
1744        // SAFETY: this doesn't modify any memory and we check the return value.
1745        let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_ENABLE_VERITY, &arg) };
1746        if res < 0 {
1747            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1748        } else {
1749            Ok(IoctlReply::Done(Ok(Vec::new())))
1750        }
1751    }
1752
    /// Handles the `FS_IOC_MEASURE_VERITY` ioctl for `inode`.
    ///
    /// Reads the caller's `fsverity_digest` header from `r`, asks the kernel to measure the inode's
    /// file (zero-message open) or the file behind `handle`, and replies with the kernel's
    /// `fsverity_digest` header followed by the digest bytes.
    ///
    /// `out_size` is the size of the caller's output buffer. If it, or the `digest_size` announced
    /// in the caller's header, is too small for the kernel's answer, the reply carries
    /// `EOVERFLOW`. An ioctl failure is reported as the OS error inside the reply.
    fn measure_verity<R: io::Read>(
        &self,
        inode: Inode,
        handle: Handle,
        mut r: R,
        out_size: u32,
    ) -> io::Result<IoctlReply> {
        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
            self.find_inode(inode)?
        } else {
            self.find_handle(handle, inode)?
        };

        // The caller's header is only consulted for the `digest_size` it is prepared to accept.
        let mut digest = fsverity_digest::new_zeroed();
        r.read_exact(digest.as_mut_bytes())?;

        // Taken from fs/verity/fsverity_private.h.
        const FS_VERITY_MAX_DIGEST_SIZE: u16 = 64;

        // This digest size is what the fsverity command line utility uses.
        const DIGEST_SIZE: u16 = FS_VERITY_MAX_DIGEST_SIZE * 2 + 1;
        // Header plus digest bytes, and the number of `fsverity_digest` elements needed to hold
        // that many bytes.
        const BUFLEN: usize = size_of::<fsverity_digest>() + DIGEST_SIZE as usize;
        const ROUNDED_LEN: usize = BUFLEN.div_ceil(size_of::<fsverity_digest>());

        // Make sure we get a properly aligned allocation.
        let mut buf = [MaybeUninit::<fsverity_digest>::uninit(); ROUNDED_LEN];

        // Tell the kernel how many digest bytes fit behind the header.
        // SAFETY: we are only writing data and not reading uninitialized memory.
        unsafe {
            // TODO: Replace with `MaybeUninit::slice_as_mut_ptr` once it is stabilized.
            addr_of_mut!((*(buf.as_mut_ptr() as *mut fsverity_digest)).digest_size)
                .write(DIGEST_SIZE)
        };

        // SAFETY: this will only modify `buf` and we check the return value.
        let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_MEASURE_VERITY, buf.as_mut_ptr()) };
        if res < 0 {
            Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
        } else {
            let digest_size =
                // SAFETY: this value was initialized by us already and then overwritten by the kernel.
                // TODO: Replace with `MaybeUninit::slice_as_ptr` once it is stabilized.
                unsafe { addr_of!((*(buf.as_ptr() as *const fsverity_digest)).digest_size).read() };
            // Number of bytes to send back: the header plus the digest the kernel produced.
            let outlen = size_of::<fsverity_digest>() as u32 + u32::from(digest_size);

            // The kernel guarantees this but it doesn't hurt to be paranoid.
            debug_assert!(outlen <= (ROUNDED_LEN * size_of::<fsverity_digest>()) as u32);
            // Reject the request if the caller cannot take the whole answer.
            if digest.digest_size < digest_size || out_size < outlen {
                return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
                    libc::EOVERFLOW,
                ))));
            }

            // Reinterpret the element array as raw bytes so a prefix of it can be copied out.
            let buf: [MaybeUninit<u8>; ROUNDED_LEN * size_of::<fsverity_digest>()] =
                // SAFETY: any bit pattern is valid for `MaybeUninit<u8>` and `fsverity_digest`
                // doesn't contain any references.
                unsafe { mem::transmute(buf) };

            let buf =
                // SAFETY: Casting to `*const [u8]` is safe because the kernel guarantees that the
                // first `outlen` bytes of `buf` are initialized and `MaybeUninit<u8>` is guaranteed
                // to have the same layout as `u8`.
                // TODO: Replace with `MaybeUninit::slice_assume_init_ref` once it is stabilized.
                unsafe { &*(&buf[..outlen as usize] as *const [MaybeUninit<u8>] as *const [u8]) };
            Ok(IoctlReply::Done(Ok(buf.to_vec())))
        }
    }
1820}
1821
1822#[cfg(feature = "fs_runtime_ugid_map")]
1823impl PassthroughFs {
1824    fn find_and_set_ugid_permission(
1825        &self,
1826        st: &mut libc::stat64,
1827        path: &str,
1828        is_root_path: bool,
1829    ) -> bool {
1830        for perm_data in self
1831            .permission_paths
1832            .read()
1833            .expect("acquire permission_paths read lock")
1834            .iter()
1835        {
1836            if (is_root_path && perm_data.perm_path == "/")
1837                || (!is_root_path
1838                    && perm_data.perm_path != "/"
1839                    && perm_data.need_set_permission(path))
1840            {
1841                self.set_permission_from_data(st, perm_data);
1842                return true;
1843            }
1844        }
1845        false
1846    }
1847
1848    fn set_permission_from_data(&self, st: &mut libc::stat64, perm_data: &PermissionData) {
1849        st.st_uid = perm_data.guest_uid;
1850        st.st_gid = perm_data.guest_gid;
1851        st.st_mode = (st.st_mode & libc::S_IFMT) | (0o777 & !perm_data.umask);
1852    }
1853
1854    /// Set permission according to path
1855    fn set_ugid_permission(&self, st: &mut libc::stat64, path: &str) {
1856        let is_root_path = path.is_empty();
1857
1858        if self.find_and_set_ugid_permission(st, path, is_root_path) {
1859            return;
1860        }
1861
1862        if let Some(perm_data) = self
1863            .permission_paths
1864            .read()
1865            .expect("acquire permission_paths read lock")
1866            .iter()
1867            .find(|pd| pd.perm_path == "/")
1868        {
1869            self.set_permission_from_data(st, perm_data);
1870        }
1871    }
1872
1873    /// Set host uid/gid to configured value according to path
1874    fn change_ugid_creds(&self, ctx: &Context, parent_data: &InodeData, name: &CStr) -> (u32, u32) {
1875        let path = format!(
1876            "{}/{}",
1877            parent_data.path.clone(),
1878            name.to_str().unwrap_or("<non UTF-8 str>")
1879        );
1880
1881        self.change_ugid_creds_for_path(ctx, &path)
1882    }
1883
1884    /// Set host uid/gid to configured value according to path
1885    fn change_ugid_creds_for_path(&self, ctx: &Context, path: &str) -> (u32, u32) {
1886        let is_root_path = path.is_empty();
1887
1888        if let Some(creds) = self.find_ugid_creds_for_path(path, is_root_path) {
1889            return creds;
1890        }
1891
1892        if let Some(perm_data) = self
1893            .permission_paths
1894            .read()
1895            .expect("acquire permission_paths read lock")
1896            .iter()
1897            .find(|pd| pd.perm_path == "/")
1898        {
1899            return (perm_data.host_uid, perm_data.host_gid);
1900        }
1901
1902        (ctx.uid, ctx.gid)
1903    }
1904
1905    fn find_ugid_creds_for_path(&self, path: &str, is_root_path: bool) -> Option<(u32, u32)> {
1906        for perm_data in self
1907            .permission_paths
1908            .read()
1909            .expect("acquire permission_paths read lock")
1910            .iter()
1911        {
1912            if (is_root_path && perm_data.perm_path == "/")
1913                || (!is_root_path
1914                    && perm_data.perm_path != "/"
1915                    && perm_data.need_set_permission(path))
1916            {
1917                return Some((perm_data.host_uid, perm_data.host_gid));
1918            }
1919        }
1920        None
1921    }
1922}
1923
1924#[cfg(feature = "arc_quota")]
1925impl PassthroughFs {
1926    /// Convert u8 slice to string
1927    fn string_from_u8_slice(&self, buf: &[u8]) -> io::Result<String> {
1928        match CStr::from_bytes_until_nul(buf).map(|s| s.to_string_lossy().to_string()) {
1929            Ok(s) => Ok(s),
1930            Err(e) => {
1931                error!("fail to convert u8 slice to string: {}", e);
1932                Err(io::Error::from_raw_os_error(libc::EINVAL))
1933            }
1934        }
1935    }
1936
1937    /// Set permission according to path
1938    fn set_permission(&self, st: &mut libc::stat64, path: &str) {
1939        for perm_data in self
1940            .permission_paths
1941            .read()
1942            .expect("acquire permission_paths read lock")
1943            .iter()
1944        {
1945            if perm_data.need_set_permission(path) {
1946                st.st_uid = perm_data.guest_uid;
1947                st.st_gid = perm_data.guest_gid;
1948                st.st_mode = (st.st_mode & libc::S_IFMT) | (0o777 & !perm_data.umask);
1949            }
1950        }
1951    }
1952
1953    /// Set host uid/gid to configured value according to path
1954    fn change_creds(&self, ctx: &Context, parent_data: &InodeData, name: &CStr) -> (u32, u32) {
1955        let path = format!(
1956            "{}/{}",
1957            parent_data.path.clone(),
1958            name.to_str().unwrap_or("<non UTF-8 str>")
1959        );
1960
1961        self.change_creds_for_path(ctx, &path)
1962    }
1963
1964    /// Set host uid/gid to configured value according to path
1965    fn change_creds_for_path(&self, ctx: &Context, path: &str) -> (u32, u32) {
1966        for perm_data in self
1967            .permission_paths
1968            .read()
1969            .expect("acquire permission_paths read lock")
1970            .iter()
1971        {
1972            if perm_data.need_set_permission(path) {
1973                return (perm_data.host_uid, perm_data.host_gid);
1974            }
1975        }
1976
1977        (ctx.uid, ctx.gid)
1978    }
1979
1980    fn read_permission_data<R: io::Read>(&self, mut r: R) -> io::Result<PermissionData> {
1981        let mut fs_permission_data = FsPermissionDataBuffer::new_zeroed();
1982        r.read_exact(fs_permission_data.as_mut_bytes())?;
1983
1984        let perm_path = self.string_from_u8_slice(&fs_permission_data.perm_path)?;
1985        if !perm_path.starts_with('/') {
1986            error!("FS_IOC_SETPERMISSION: perm path must start with '/'");
1987            return Err(io::Error::from_raw_os_error(libc::EINVAL));
1988        }
1989        Ok(PermissionData {
1990            guest_uid: fs_permission_data.guest_uid,
1991            guest_gid: fs_permission_data.guest_gid,
1992            host_uid: fs_permission_data.host_uid,
1993            host_gid: fs_permission_data.host_gid,
1994            umask: fs_permission_data.umask,
1995            perm_path,
1996        })
1997    }
1998
1999    /// Sets uid/gid/umask for all files and directories under a specific path.
2000    ///
2001    /// This ioctl does not correspond to any upstream FUSE feature. It is used for arcvm
2002    /// It associates the specified path with the provide uid, gid, and umask values within the
2003    /// filesystem metadata.
2004    ///
2005    /// During subsequent lookup operations, the stored uid/gid/umask values are retrieved and
2006    /// applied to all files and directories found under the registered path. Before sending
2007    /// file stat information to the client, the uid and gid are substituted by `guest_uid` and
2008    /// `guest_gid` if the file falls under the registered path. The file mode is masked by the
2009    ///  umask.
2010    ///
2011    /// When the guest creates a file within the specified path, the file gid/uid stat in host
2012    /// will be overwritten to `host_uid` and `host_gid` values.
2013    ///
2014    /// This functionality enables dynamic configuration of ownership and permissions for a
2015    /// specific directory hierarchy within the filesystem.
2016    ///
2017    /// # Notes
2018    /// - This method affects all existing and future files under the registered path.
2019    /// - The original file ownership and permissions are overridden by the provided values.
2020    /// - The registered path should not be renamed
2021    /// - Refer go/remove-mount-passthrough-fuse for more design details
2022    fn set_permission_by_path<R: io::Read>(&self, r: R) -> IoctlReply {
2023        if self
2024            .permission_paths
2025            .read()
2026            .expect("acquire permission_paths read lock")
2027            .len()
2028            >= self.cfg.max_dynamic_perm
2029        {
2030            error!(
2031                "FS_IOC_SETPERMISSION exceeds limits of max_dynamic_perm: {}",
2032                self.cfg.max_dynamic_perm
2033            );
2034            return IoctlReply::Done(Err(io::Error::from_raw_os_error(libc::EPERM)));
2035        }
2036
2037        let perm_data = match self.read_permission_data(r) {
2038            Ok(data) => data,
2039            Err(e) => {
2040                error!("fail to read permission data: {}", e);
2041                return IoctlReply::Done(Err(e));
2042            }
2043        };
2044
2045        self.permission_paths
2046            .write()
2047            .expect("acquire permission_paths write lock")
2048            .push(perm_data);
2049
2050        IoctlReply::Done(Ok(Vec::new()))
2051    }
2052
2053    // Get xattr value according to path and name
2054    fn get_xattr_by_path(&self, path: &str, name: &str) -> Option<String> {
2055        self.xattr_paths
2056            .read()
2057            .expect("acquire permission_paths read lock")
2058            .iter()
2059            .find(|data| data.need_set_guest_xattr(path, name))
2060            .map(|data| data.xattr_value.clone())
2061    }
2062
2063    fn skip_host_set_xattr(&self, path: &str, name: &str) -> bool {
2064        self.get_xattr_by_path(path, name).is_some()
2065    }
2066
2067    fn read_xattr_data<R: io::Read>(&self, mut r: R) -> io::Result<XattrData> {
2068        let mut fs_path_xattr_data = FsPathXattrDataBuffer::new_zeroed();
2069        r.read_exact(fs_path_xattr_data.as_mut_bytes())?;
2070
2071        let xattr_path = self.string_from_u8_slice(&fs_path_xattr_data.path)?;
2072        if !xattr_path.starts_with('/') {
2073            error!("FS_IOC_SETPATHXATTR: perm path must start with '/'");
2074            return Err(io::Error::from_raw_os_error(libc::EINVAL));
2075        }
2076        let xattr_name = self.string_from_u8_slice(&fs_path_xattr_data.xattr_name)?;
2077        let xattr_value = self.string_from_u8_slice(&fs_path_xattr_data.xattr_value)?;
2078
2079        Ok(XattrData {
2080            xattr_path,
2081            xattr_name,
2082            xattr_value,
2083        })
2084    }
2085
2086    /// Sets xattr value for all files and directories under a specific path.
2087    ///
2088    /// This ioctl does not correspond to any upstream FUSE feature. It is used for arcvm.
2089    /// It associates the specified path and xattr name with a value.
2090    ///
2091    /// When the getxattr is called for the specified path and name, the predefined
2092    /// value is returned.
2093    ///
2094    /// # Notes
2095    /// - This method affects all existing and future files under the registered path.
2096    /// - The SECURITY_CONTEXT feature will be disabled if this ioctl is enabled.
2097    /// - The registered path should not be renamed
2098    /// - Refer go/remove-mount-passthrough-fuse for more design details
2099    fn set_xattr_by_path<R: io::Read>(&self, r: R) -> IoctlReply {
2100        if self
2101            .xattr_paths
2102            .read()
2103            .expect("acquire xattr_paths read lock")
2104            .len()
2105            >= self.cfg.max_dynamic_xattr
2106        {
2107            error!(
2108                "FS_IOC_SETPATHXATTR exceeds limits of max_dynamic_xattr: {}",
2109                self.cfg.max_dynamic_xattr
2110            );
2111            return IoctlReply::Done(Err(io::Error::from_raw_os_error(libc::EPERM)));
2112        }
2113
2114        let xattr_data = match self.read_xattr_data(r) {
2115            Ok(data) => data,
2116            Err(e) => {
2117                error!("fail to read xattr data: {}", e);
2118                return IoctlReply::Done(Err(e));
2119            }
2120        };
2121
2122        self.xattr_paths
2123            .write()
2124            .expect("acquire xattr_paths write lock")
2125            .push(xattr_data);
2126
2127        IoctlReply::Done(Ok(Vec::new()))
2128    }
2129
2130    fn do_getxattr_with_filter(
2131        &self,
2132        data: Arc<InodeData>,
2133        name: Cow<CStr>,
2134        buf: &mut [u8],
2135    ) -> io::Result<usize> {
2136        let res: usize = match self.get_xattr_by_path(&data.path, &name.to_string_lossy()) {
2137            Some(predifined_xattr) => {
2138                let x = predifined_xattr.into_bytes();
2139                if x.len() > buf.len() {
2140                    return Err(io::Error::from_raw_os_error(libc::ERANGE));
2141                }
2142                buf[..x.len()].copy_from_slice(&x);
2143                x.len()
2144            }
2145            None => self.do_getxattr(&data, &name, &mut buf[..])?,
2146        };
2147        Ok(res)
2148    }
2149
2150    /// Looks up the host uid according to the path of file that inode is referring to.
2151    fn lookup_host_uid(&self, ctx: &Context, inode: Inode) -> u32 {
2152        if let Ok(inode_data) = self.find_inode(inode) {
2153            let path = &inode_data.path;
2154            for perm_data in self
2155                .permission_paths
2156                .read()
2157                .expect("acquire permission_paths read lock")
2158                .iter()
2159            {
2160                if perm_data.need_set_permission(path) {
2161                    return perm_data.host_uid;
2162                }
2163            }
2164        }
2165        ctx.uid
2166    }
2167}
2168
2169/// Decrements the refcount of the inode.
2170/// Returns `true` if the refcount became 0.
2171fn forget_one(
2172    inodes: &mut MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>,
2173    inode: Inode,
2174    count: u64,
2175) -> bool {
2176    if let Some(data) = inodes.get(&inode) {
2177        // Acquiring the write lock on the inode map prevents new lookups from incrementing the
2178        // refcount but there is the possibility that a previous lookup already acquired a
2179        // reference to the inode data and is in the process of updating the refcount so we need
2180        // to loop here until we can decrement successfully.
2181        loop {
2182            let refcount = data.refcount.load(Ordering::Relaxed);
2183
2184            // Saturating sub because it doesn't make sense for a refcount to go below zero and
2185            // we don't want misbehaving clients to cause integer overflow.
2186            let new_count = refcount.saturating_sub(count);
2187
2188            // Synchronizes with the acquire load in `do_lookup`.
2189            if data
2190                .refcount
2191                .compare_exchange_weak(refcount, new_count, Ordering::Release, Ordering::Relaxed)
2192                .is_ok()
2193            {
2194                if new_count == 0 {
2195                    // We just removed the last refcount for this inode. There's no need for an
2196                    // acquire fence here because we hold a write lock on the inode map and any
2197                    // thread that is waiting to do a forget on the same inode will have to wait
2198                    // until we release the lock. So there's is no other release store for us to
2199                    // synchronize with before deleting the entry.
2200                    inodes.remove(&inode);
2201                    return true;
2202                }
2203                break;
2204            }
2205        }
2206    }
2207    false
2208}
2209
2210// Strips any `user.virtiofs.` prefix from `buf`. If buf contains one or more nul-bytes, each
2211// nul-byte-separated slice is treated as a C string and the prefix is stripped from each one.
2212fn strip_xattr_prefix(buf: &mut Vec<u8>) {
2213    fn next_cstr(b: &[u8], start: usize) -> Option<&[u8]> {
2214        if start >= b.len() {
2215            return None;
2216        }
2217
2218        let end = b[start..]
2219            .iter()
2220            .position(|&c| c == b'\0')
2221            .map(|p| start + p + 1)
2222            .unwrap_or(b.len());
2223
2224        Some(&b[start..end])
2225    }
2226
2227    let mut pos = 0;
2228    while let Some(name) = next_cstr(buf, pos) {
2229        if !name.starts_with(USER_VIRTIOFS_XATTR) {
2230            pos += name.len();
2231            continue;
2232        }
2233
2234        let newlen = name.len() - USER_VIRTIOFS_XATTR.len();
2235        buf.drain(pos..pos + USER_VIRTIOFS_XATTR.len());
2236        pos += newlen;
2237    }
2238}
2239
2240impl Drop for PassthroughFs {
2241    /// The `Drop` implementation for this struct intentionally leaks all open file descriptors.
2242    /// It sets the `unsafe_leak_fd` flag on all `InodeData` and `HandleData` instances, which
2243    /// causes their `drop` implementations to forget the underlying `File` objects.
2244    ///
2245    /// This is a deliberate performance optimization for abrupt shutdowns. It relies on the
2246    /// operating system to clean up the file descriptors when the process terminates. It is
2247    /// **critical** that an instance of `PassthroughFs` is only dropped immediately prior to
2248    /// process termination.
2249    fn drop(&mut self) {
2250        let inodes = self.inodes.lock();
2251        inodes.apply(|v| {
2252            v.set_unsafe_leak_fd();
2253        });
2254        let handles = self.handles.lock();
2255        handles.values().for_each(|v| v.set_unsafe_leak_fd());
2256    }
2257}
2258
2259impl FileSystem for PassthroughFs {
2260    type Inode = Inode;
2261    type Handle = Handle;
2262    type DirIter = ReadDir<Box<[u8]>>;
2263
2264    fn init(&self, capable: FsOptions) -> io::Result<FsOptions> {
2265        let root = CString::new(self.root_dir.clone())
2266            .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
2267
2268        let flags = libc::O_DIRECTORY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
2269        // SAFETY: this doesn't modify any memory and we check the return value.
2270        let raw_descriptor = unsafe { libc::openat64(libc::AT_FDCWD, root.as_ptr(), flags) };
2271        if raw_descriptor < 0 {
2272            return Err(io::Error::last_os_error());
2273        }
2274
2275        // SAFETY: safe because we just opened this descriptor above.
2276        let f = unsafe { File::from_raw_descriptor(raw_descriptor) };
2277
2278        let st = stat(&f)?;
2279
2280        // SAFETY: this doesn't modify any memory and there is no need to check the return
2281        // value because this system call always succeeds. We need to clear the umask here because
2282        // we want the client to be able to set all the bits in the mode.
2283        unsafe { libc::umask(0o000) };
2284
2285        let mut inodes = self.inodes.lock();
2286
2287        // Not sure why the root inode gets a refcount of 2 but that's what libfuse does.
2288        inodes.insert(
2289            ROOT_ID,
2290            InodeAltKey {
2291                ino: st.st_ino,
2292                dev: st.st_dev,
2293            },
2294            Arc::new(InodeData {
2295                inode: ROOT_ID,
2296                file: Mutex::new(OpenedFile::new(f, flags)),
2297                refcount: AtomicU64::new(2),
2298                filetype: st.st_mode.into(),
2299                path: "".to_string(),
2300                unsafe_leak_fd: AtomicBool::new(false),
2301            }),
2302        );
2303
2304        let mut opts = FsOptions::DO_READDIRPLUS
2305            | FsOptions::READDIRPLUS_AUTO
2306            | FsOptions::EXPORT_SUPPORT
2307            | FsOptions::DONT_MASK
2308            | FsOptions::CACHE_SYMLINKS;
2309
2310        // Device using dynamic xattr feature will have different security context in
2311        // host and guests. The SECURITY_CONTEXT feature should not be enabled in the
2312        // device.
2313        if self.cfg.max_dynamic_xattr == 0 && self.cfg.security_ctx {
2314            opts |= FsOptions::SECURITY_CONTEXT;
2315        }
2316
2317        if self.cfg.posix_acl {
2318            opts |= FsOptions::POSIX_ACL;
2319        }
2320        if self.cfg.writeback && capable.contains(FsOptions::WRITEBACK_CACHE) {
2321            opts |= FsOptions::WRITEBACK_CACHE;
2322            self.writeback.store(true, Ordering::Relaxed);
2323        }
2324        if self.cfg.cache_policy == CachePolicy::Always {
2325            if capable.contains(FsOptions::ZERO_MESSAGE_OPEN) {
2326                opts |= FsOptions::ZERO_MESSAGE_OPEN;
2327                self.zero_message_open.store(true, Ordering::Relaxed);
2328            }
2329            if capable.contains(FsOptions::ZERO_MESSAGE_OPENDIR) {
2330                opts |= FsOptions::ZERO_MESSAGE_OPENDIR;
2331                self.zero_message_opendir.store(true, Ordering::Relaxed);
2332            }
2333        }
2334        Ok(opts)
2335    }
2336
2337    fn destroy(&self) {
2338        cros_tracing::trace_simple_print!(VirtioFs, "{:?}: destroy", self);
2339        self.handles.lock().clear();
2340        self.inodes.lock().clear();
2341    }
2342
2343    fn statfs(&self, _ctx: Context, inode: Inode) -> io::Result<libc::statvfs64> {
2344        let _trace = fs_trace!(self.tag, "statfs", inode);
2345        let data = self.find_inode(inode)?;
2346
2347        let mut out = MaybeUninit::<libc::statvfs64>::zeroed();
2348
2349        // SAFETY: this will only modify `out` and we check the return value.
2350        syscall!(unsafe { libc::fstatvfs64(data.as_raw_descriptor(), out.as_mut_ptr()) })?;
2351
2352        // SAFETY: the kernel guarantees that `out` has been initialized.
2353        Ok(unsafe { out.assume_init() })
2354    }
2355
2356    fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<Entry> {
2357        let data = self.find_inode(parent)?;
2358        #[allow(unused_variables)]
2359        let path = format!(
2360            "{}/{}",
2361            data.path,
2362            name.to_str().unwrap_or("<non UTF-8 path>")
2363        );
2364        let _trace = fs_trace!(self.tag, "lookup", parent, path);
2365
2366        let mut res = self.do_lookup_with_casefold_fallback(&data, name);
2367
2368        // FUSE takes a inode=0 as a request to do negative dentry cache.
2369        // So, if `negative_timeout` is set, return success with the timeout value and inode=0 as a
2370        // response.
2371        if let Err(e) = &res {
2372            if e.kind() == std::io::ErrorKind::NotFound && !self.cfg.negative_timeout.is_zero() {
2373                res = Ok(Entry::new_negative(self.cfg.negative_timeout));
2374            }
2375        }
2376
2377        res
2378    }
2379
2380    fn forget(&self, _ctx: Context, inode: Inode, count: u64) {
2381        let _trace = fs_trace!(self.tag, "forget", inode, count);
2382        let mut inodes = self.inodes.lock();
2383        let caches = self.lock_casefold_lookup_caches();
2384        if forget_one(&mut inodes, inode, count) {
2385            if let Some(mut c) = caches {
2386                c.forget(inode);
2387            }
2388        }
2389    }
2390
2391    fn batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>) {
2392        let mut inodes = self.inodes.lock();
2393        let mut caches = self.lock_casefold_lookup_caches();
2394        for (inode, count) in requests {
2395            if forget_one(&mut inodes, inode, count) {
2396                if let Some(c) = caches.as_mut() {
2397                    c.forget(inode);
2398                }
2399            }
2400        }
2401    }
2402
2403    fn opendir(
2404        &self,
2405        _ctx: Context,
2406        inode: Inode,
2407        flags: u32,
2408    ) -> io::Result<(Option<Handle>, OpenOptions)> {
2409        let _trace = fs_trace!(self.tag, "opendir", inode, flags);
2410        if self.zero_message_opendir.load(Ordering::Relaxed) {
2411            Err(io::Error::from_raw_os_error(libc::ENOSYS))
2412        } else {
2413            self.do_open(inode, flags | (libc::O_DIRECTORY as u32))
2414        }
2415    }
2416
2417    fn releasedir(
2418        &self,
2419        _ctx: Context,
2420        inode: Inode,
2421        _flags: u32,
2422        handle: Handle,
2423    ) -> io::Result<()> {
2424        let _trace = fs_trace!(self.tag, "releasedir", inode, handle);
2425        if self.zero_message_opendir.load(Ordering::Relaxed) {
2426            Ok(())
2427        } else {
2428            self.do_release(inode, handle)
2429        }
2430    }
2431
2432    fn mkdir(
2433        &self,
2434        ctx: Context,
2435        parent: Inode,
2436        name: &CStr,
2437        mode: u32,
2438        umask: u32,
2439        security_ctx: Option<&CStr>,
2440    ) -> io::Result<Entry> {
2441        let _trace = fs_trace!(self.tag, "mkdir", parent, name, mode, umask, security_ctx);
2442        let data = self.find_inode(parent)?;
2443
2444        let _ctx = security_ctx
2445            .filter(|ctx| *ctx != UNLABELED_CSTR)
2446            .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2447            .transpose()?;
2448
2449        #[allow(unused_variables)]
2450        #[cfg(feature = "arc_quota")]
2451        let (uid, gid) = self.change_creds(&ctx, &data, name);
2452        #[cfg(feature = "fs_runtime_ugid_map")]
2453        let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
2454        #[cfg(not(feature = "fs_permission_translation"))]
2455        let (uid, gid) = (ctx.uid, ctx.gid);
2456
2457        let (_uid, _gid) = set_creds(uid, gid)?;
2458        {
2459            let casefold_cache = self.lock_casefold_lookup_caches();
2460            let _scoped_umask = ScopedUmask::new(umask);
2461
2462            // SAFETY: this doesn't modify any memory and we check the return value.
2463            syscall!(unsafe { libc::mkdirat(data.as_raw_descriptor(), name.as_ptr(), mode) })?;
2464            if let Some(mut c) = casefold_cache {
2465                c.insert(data.inode, name);
2466            }
2467        }
2468        self.do_lookup(&data, name)
2469    }
2470
2471    fn rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
2472        let _trace = fs_trace!(self.tag, "rmdir", parent, name);
2473        let data = self.find_inode(parent)?;
2474        let casefold_cache = self.lock_casefold_lookup_caches();
2475        // TODO(b/278691962): If ascii_casefold is enabled, we need to call
2476        // `get_case_unfolded_name()` to get the actual name to be unlinked.
2477        self.do_unlink(&data, name, libc::AT_REMOVEDIR)?;
2478        if let Some(mut c) = casefold_cache {
2479            c.remove(data.inode, name);
2480        }
2481        Ok(())
2482    }
2483
2484    fn readdir(
2485        &self,
2486        _ctx: Context,
2487        inode: Inode,
2488        handle: Handle,
2489        size: u32,
2490        offset: u64,
2491    ) -> io::Result<Self::DirIter> {
2492        let _trace = fs_trace!(self.tag, "readdir", inode, handle, size, offset);
2493        let buf = vec![0; size as usize].into_boxed_slice();
2494
2495        if self.zero_message_opendir.load(Ordering::Relaxed) {
2496            let data = self.find_inode(inode)?;
2497            ReadDir::new(&*data, offset as libc::off64_t, buf)
2498        } else {
2499            let data = self.find_handle(handle, inode)?;
2500
2501            let dir = data.file.lock();
2502
2503            ReadDir::new(&*dir, offset as libc::off64_t, buf)
2504        }
2505    }
2506
2507    fn open(
2508        &self,
2509        _ctx: Context,
2510        inode: Inode,
2511        flags: u32,
2512    ) -> io::Result<(Option<Handle>, OpenOptions)> {
2513        if self.zero_message_open.load(Ordering::Relaxed) {
2514            let _trace = fs_trace!(self.tag, "open (zero-message)", inode, flags);
2515            Err(io::Error::from_raw_os_error(libc::ENOSYS))
2516        } else {
2517            let _trace = fs_trace!(self.tag, "open", inode, flags);
2518            self.do_open(inode, flags)
2519        }
2520    }
2521
2522    fn release(
2523        &self,
2524        _ctx: Context,
2525        inode: Inode,
2526        _flags: u32,
2527        handle: Handle,
2528        _flush: bool,
2529        _flock_release: bool,
2530        _lock_owner: Option<u64>,
2531    ) -> io::Result<()> {
2532        if self.zero_message_open.load(Ordering::Relaxed) {
2533            let _trace = fs_trace!(self.tag, "release (zero-message)", inode, handle);
2534            Ok(())
2535        } else {
2536            let _trace = fs_trace!(self.tag, "release", inode, handle);
2537            self.do_release(inode, handle)
2538        }
2539    }
2540
2541    fn chromeos_tmpfile(
2542        &self,
2543        ctx: Context,
2544        parent: Self::Inode,
2545        mode: u32,
2546        umask: u32,
2547        security_ctx: Option<&CStr>,
2548    ) -> io::Result<Entry> {
2549        let _trace = fs_trace!(
2550            self.tag,
2551            "chromeos_tempfile",
2552            parent,
2553            mode,
2554            umask,
2555            security_ctx
2556        );
2557        let data = self.find_inode(parent)?;
2558
2559        let _ctx = security_ctx
2560            .filter(|ctx| *ctx != UNLABELED_CSTR)
2561            .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2562            .transpose()?;
2563
2564        let tmpflags = libc::O_RDWR | libc::O_TMPFILE | libc::O_CLOEXEC | libc::O_NOFOLLOW;
2565
2566        let current_dir = c".";
2567
2568        #[allow(unused_variables)]
2569        #[cfg(feature = "arc_quota")]
2570        let (uid, gid) = self.change_creds(&ctx, &data, current_dir);
2571        #[cfg(feature = "fs_runtime_ugid_map")]
2572        let (uid, gid) = self.change_ugid_creds(&ctx, &data, current_dir);
2573        #[cfg(not(feature = "fs_permission_translation"))]
2574        let (uid, gid) = (ctx.uid, ctx.gid);
2575
2576        let (_uid, _gid) = set_creds(uid, gid)?;
2577
2578        let fd = {
2579            let _scoped_umask = ScopedUmask::new(umask);
2580
2581            // SAFETY: this doesn't modify any memory and we check the return value.
2582            syscall!(unsafe {
2583                libc::openat64(
2584                    data.as_raw_descriptor(),
2585                    current_dir.as_ptr(),
2586                    tmpflags,
2587                    mode,
2588                )
2589            })?
2590        };
2591        // No need to add casefold_cache becuase we created an anonymous file.
2592
2593        // SAFETY: safe because we just opened this fd.
2594        let tmpfile = unsafe { File::from_raw_descriptor(fd) };
2595        let st = stat(&tmpfile)?;
2596        let path = format!(
2597            "{}/{}",
2598            data.path.clone(),
2599            current_dir.to_str().unwrap_or("<non UTF-8 str>")
2600        );
2601        Ok(self.add_entry(tmpfile, st, tmpflags, path))
2602    }
2603
2604    fn create(
2605        &self,
2606        ctx: Context,
2607        parent: Inode,
2608        name: &CStr,
2609        mode: u32,
2610        flags: u32,
2611        umask: u32,
2612        security_ctx: Option<&CStr>,
2613    ) -> io::Result<(Entry, Option<Handle>, OpenOptions)> {
2614        let _trace = fs_trace!(
2615            self.tag,
2616            "create",
2617            parent,
2618            name,
2619            mode,
2620            flags,
2621            umask,
2622            security_ctx
2623        );
2624        let data = self.find_inode(parent)?;
2625
2626        let _ctx = security_ctx
2627            .filter(|ctx| *ctx != UNLABELED_CSTR)
2628            .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2629            .transpose()?;
2630
2631        #[allow(unused_variables)]
2632        #[cfg(feature = "arc_quota")]
2633        let (uid, gid) = self.change_creds(&ctx, &data, name);
2634        #[cfg(feature = "fs_runtime_ugid_map")]
2635        let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
2636        #[cfg(not(feature = "fs_permission_translation"))]
2637        let (uid, gid) = (ctx.uid, ctx.gid);
2638
2639        let (_uid, _gid) = set_creds(uid, gid)?;
2640
2641        let flags = self.update_open_flags(flags as i32);
2642        let create_flags =
2643            (flags | libc::O_CREAT | libc::O_CLOEXEC | libc::O_NOFOLLOW) & !libc::O_DIRECT;
2644
2645        let fd = {
2646            let _scoped_umask = ScopedUmask::new(umask);
2647            let casefold_cache = self.lock_casefold_lookup_caches();
2648
2649            // SAFETY: this doesn't modify any memory and we check the return value. We don't really
2650            // check `flags` because if the kernel can't handle poorly specified flags then we have
2651            // much bigger problems.
2652            // TODO(b/278691962): If ascii_casefold is enabled, we need to call
2653            // `get_case_unfolded_name()` to get the actual name to be created.
2654            let fd = syscall!(unsafe {
2655                libc::openat64(data.as_raw_descriptor(), name.as_ptr(), create_flags, mode)
2656            })?;
2657            if let Some(mut c) = casefold_cache {
2658                c.insert(parent, name);
2659            }
2660            fd
2661        };
2662
2663        // SAFETY: safe because we just opened this fd.
2664        let file = unsafe { File::from_raw_descriptor(fd) };
2665
2666        let st = stat(&file)?;
2667        let path = format!(
2668            "{}/{}",
2669            data.path.clone(),
2670            name.to_str().unwrap_or("<non UTF-8 str>")
2671        );
2672        let entry = self.add_entry(file, st, create_flags, path);
2673
2674        let (handle, opts) = if self.zero_message_open.load(Ordering::Relaxed) {
2675            (None, OpenOptions::KEEP_CACHE)
2676        } else {
2677            self.do_open_at(
2678                data,
2679                name,
2680                entry.inode,
2681                flags as u32 & !((libc::O_CREAT | libc::O_EXCL | libc::O_NOCTTY) as u32),
2682            )
2683            .inspect_err(|_e| {
2684                // Don't leak the entry.
2685                self.forget(ctx, entry.inode, 1);
2686            })?
2687        };
2688        Ok((entry, handle, opts))
2689    }
2690
2691    fn unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
2692        let _trace = fs_trace!(self.tag, "unlink", parent, name);
2693        let data = self.find_inode(parent)?;
2694        let casefold_cache = self.lock_casefold_lookup_caches();
2695        // TODO(b/278691962): If ascii_casefold is enabled, we need to call
2696        // `get_case_unfolded_name()` to get the actual name to be unlinked.
2697        self.do_unlink(&data, name, 0)?;
2698        if let Some(mut c) = casefold_cache {
2699            c.remove(data.inode, name);
2700        }
2701        Ok(())
2702    }
2703
2704    fn read<W: io::Write + ZeroCopyWriter>(
2705        &self,
2706        _ctx: Context,
2707        inode: Inode,
2708        handle: Handle,
2709        mut w: W,
2710        size: u32,
2711        offset: u64,
2712        _lock_owner: Option<u64>,
2713        _flags: u32,
2714    ) -> io::Result<usize> {
2715        if self.zero_message_open.load(Ordering::Relaxed) {
2716            let _trace = fs_trace!(self.tag, "read (zero-message)", inode, handle, size, offset);
2717            let data = self.find_inode(inode)?;
2718
2719            let mut file = data.file.lock();
2720            let mut flags = file.open_flags;
2721            match flags & libc::O_ACCMODE {
2722                libc::O_WRONLY => {
2723                    flags &= !libc::O_WRONLY;
2724                    flags |= libc::O_RDWR;
2725
2726                    // We need to get a readable handle for this file.
2727                    let newfile = self.open_fd(file.as_raw_descriptor(), libc::O_RDWR)?;
2728                    *file = OpenedFile::new(newfile, flags);
2729                }
2730                libc::O_RDONLY | libc::O_RDWR => {}
2731                _ => panic!("Unexpected flags: {flags:#x}"),
2732            }
2733
2734            w.write_from(file.file_mut(), size as usize, offset)
2735        } else {
2736            let _trace = fs_trace!(self.tag, "read", inode, handle, size, offset);
2737            let data = self.find_handle(handle, inode)?;
2738
2739            let mut f = data.file.lock();
2740            w.write_from(f.file_mut(), size as usize, offset)
2741        }
2742    }
2743
2744    fn write<R: io::Read + ZeroCopyReader>(
2745        &self,
2746        _ctx: Context,
2747        inode: Inode,
2748        handle: Handle,
2749        mut r: R,
2750        size: u32,
2751        offset: u64,
2752        _lock_owner: Option<u64>,
2753        _delayed_write: bool,
2754        flags: u32,
2755    ) -> io::Result<usize> {
2756        // When the WRITE_KILL_PRIV flag is set, drop CAP_FSETID so that the kernel will
2757        // automatically clear the setuid and setgid bits for us.
2758        let _fsetid = if flags & WRITE_KILL_PRIV != 0 {
2759            Some(drop_cap_fsetid()?)
2760        } else {
2761            None
2762        };
2763
2764        if self.zero_message_open.load(Ordering::Relaxed) {
2765            let _trace = fs_trace!(
2766                self.tag,
2767                "write (zero-message)",
2768                inode,
2769                handle,
2770                size,
2771                offset
2772            );
2773
2774            let data = self.find_inode(inode)?;
2775
2776            let mut file = data.file.lock();
2777            let mut flags = file.open_flags;
2778            match flags & libc::O_ACCMODE {
2779                libc::O_RDONLY => {
2780                    flags &= !libc::O_RDONLY;
2781                    flags |= libc::O_RDWR;
2782
2783                    // We need to get a writable handle for this file.
2784                    let newfile = self.open_fd(file.as_raw_descriptor(), libc::O_RDWR)?;
2785                    *file = OpenedFile::new(newfile, flags);
2786                }
2787                libc::O_WRONLY | libc::O_RDWR => {}
2788                _ => panic!("Unexpected flags: {flags:#x}"),
2789            }
2790
2791            r.read_to(file.file_mut(), size as usize, offset)
2792        } else {
2793            let _trace = fs_trace!(self.tag, "write", inode, handle, size, offset);
2794
2795            let data = self.find_handle(handle, inode)?;
2796
2797            let mut f = data.file.lock();
2798            r.read_to(f.file_mut(), size as usize, offset)
2799        }
2800    }
2801
2802    fn getattr(
2803        &self,
2804        _ctx: Context,
2805        inode: Inode,
2806        _handle: Option<Handle>,
2807    ) -> io::Result<(libc::stat64, Duration)> {
2808        let _trace = fs_trace!(self.tag, "getattr", inode, _handle);
2809
2810        let data = self.find_inode(inode)?;
2811        self.do_getattr(&data)
2812    }
2813
2814    fn setattr(
2815        &self,
2816        _ctx: Context,
2817        inode: Inode,
2818        attr: libc::stat64,
2819        handle: Option<Handle>,
2820        valid: SetattrValid,
2821    ) -> io::Result<(libc::stat64, Duration)> {
2822        let _trace = fs_trace!(self.tag, "setattr", inode, handle);
2823        let inode_data = self.find_inode(inode)?;
2824
2825        enum Data<'a> {
2826            Handle(MutexGuard<'a, OpenedFile>),
2827            ProcPath(CString),
2828        }
2829
2830        // If we have a handle then use it otherwise get a new fd from the inode.
2831        let hd;
2832        let data = if let Some(handle) = handle.filter(|&h| h != 0) {
2833            hd = self.find_handle(handle, inode)?;
2834            Data::Handle(hd.file.lock())
2835        } else {
2836            let pathname = CString::new(format!("self/fd/{}", inode_data.as_raw_descriptor()))
2837                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2838            Data::ProcPath(pathname)
2839        };
2840
2841        if valid.contains(SetattrValid::MODE) {
2842            // SAFETY: this doesn't modify any memory and we check the return value.
2843            syscall!(unsafe {
2844                match data {
2845                    Data::Handle(ref fd) => libc::fchmod(fd.as_raw_descriptor(), attr.st_mode),
2846                    Data::ProcPath(ref p) => {
2847                        libc::fchmodat(self.proc.as_raw_descriptor(), p.as_ptr(), attr.st_mode, 0)
2848                    }
2849                }
2850            })?;
2851        }
2852
2853        if valid.intersects(SetattrValid::UID | SetattrValid::GID) {
2854            let uid = if valid.contains(SetattrValid::UID) {
2855                attr.st_uid
2856            } else {
2857                // Cannot use -1 here because these are unsigned values.
2858                u32::MAX
2859            };
2860            let gid = if valid.contains(SetattrValid::GID) {
2861                attr.st_gid
2862            } else {
2863                // Cannot use -1 here because these are unsigned values.
2864                u32::MAX
2865            };
2866
2867            // SAFETY: this doesn't modify any memory and we check the return value.
2868            syscall!(unsafe {
2869                libc::fchownat(
2870                    inode_data.as_raw_descriptor(),
2871                    EMPTY_CSTR.as_ptr(),
2872                    uid,
2873                    gid,
2874                    libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
2875                )
2876            })?;
2877        }
2878
2879        if valid.contains(SetattrValid::SIZE) {
2880            syscall!(match data {
2881                Data::Handle(ref fd) => {
2882                    // SAFETY: this doesn't modify any memory and we check the return value.
2883                    unsafe { libc::ftruncate64(fd.as_raw_descriptor(), attr.st_size) }
2884                }
2885                _ => {
2886                    // There is no `ftruncateat` so we need to get a new fd and truncate it.
2887                    let f = self.open_inode(&inode_data, libc::O_NONBLOCK | libc::O_RDWR)?;
2888                    // SAFETY: this doesn't modify any memory and we check the return value.
2889                    unsafe { libc::ftruncate64(f.as_raw_descriptor(), attr.st_size) }
2890                }
2891            })?;
2892        }
2893
2894        if valid.intersects(SetattrValid::ATIME | SetattrValid::MTIME) {
2895            let mut tvs = [
2896                libc::timespec {
2897                    tv_sec: 0,
2898                    tv_nsec: libc::UTIME_OMIT,
2899                },
2900                libc::timespec {
2901                    tv_sec: 0,
2902                    tv_nsec: libc::UTIME_OMIT,
2903                },
2904            ];
2905
2906            if valid.contains(SetattrValid::ATIME_NOW) {
2907                tvs[0].tv_nsec = libc::UTIME_NOW;
2908            } else if valid.contains(SetattrValid::ATIME) {
2909                tvs[0].tv_sec = attr.st_atime;
2910                tvs[0].tv_nsec = attr.st_atime_nsec;
2911            }
2912
2913            if valid.contains(SetattrValid::MTIME_NOW) {
2914                tvs[1].tv_nsec = libc::UTIME_NOW;
2915            } else if valid.contains(SetattrValid::MTIME) {
2916                tvs[1].tv_sec = attr.st_mtime;
2917                tvs[1].tv_nsec = attr.st_mtime_nsec;
2918            }
2919
2920            // SAFETY: this doesn't modify any memory and we check the return value.
2921            syscall!(unsafe {
2922                match data {
2923                    Data::Handle(ref fd) => libc::futimens(fd.as_raw_descriptor(), tvs.as_ptr()),
2924                    Data::ProcPath(ref p) => {
2925                        libc::utimensat(self.proc.as_raw_descriptor(), p.as_ptr(), tvs.as_ptr(), 0)
2926                    }
2927                }
2928            })?;
2929        }
2930
2931        self.do_getattr(&inode_data)
2932    }
2933
2934    fn rename(
2935        &self,
2936        _ctx: Context,
2937        olddir: Inode,
2938        oldname: &CStr,
2939        newdir: Inode,
2940        newname: &CStr,
2941        flags: u32,
2942    ) -> io::Result<()> {
2943        let _trace = fs_trace!(self.tag, "rename", olddir, oldname, newdir, newname, flags);
2944
2945        let old_inode = self.find_inode(olddir)?;
2946        let new_inode = self.find_inode(newdir)?;
2947        {
2948            let casefold_cache = self.lock_casefold_lookup_caches();
2949
2950            // SAFETY: this doesn't modify any memory and we check the return value.
2951            // TODO: Switch to libc::renameat2 once https://github.com/rust-lang/libc/pull/1508 lands
2952            // and we have glibc 2.28.
2953            syscall!(unsafe {
2954                libc::syscall(
2955                    libc::SYS_renameat2,
2956                    old_inode.as_raw_descriptor(),
2957                    oldname.as_ptr(),
2958                    new_inode.as_raw_descriptor(),
2959                    newname.as_ptr(),
2960                    flags,
2961                )
2962            })?;
2963            if let Some(mut c) = casefold_cache {
2964                c.remove(olddir, oldname);
2965                c.insert(newdir, newname);
2966            }
2967        }
2968
2969        Ok(())
2970    }
2971
2972    fn mknod(
2973        &self,
2974        ctx: Context,
2975        parent: Inode,
2976        name: &CStr,
2977        mode: u32,
2978        rdev: u32,
2979        umask: u32,
2980        security_ctx: Option<&CStr>,
2981    ) -> io::Result<Entry> {
2982        let _trace = fs_trace!(
2983            self.tag,
2984            "mknod",
2985            parent,
2986            name,
2987            mode,
2988            rdev,
2989            umask,
2990            security_ctx
2991        );
2992        let data = self.find_inode(parent)?;
2993
2994        let _ctx = security_ctx
2995            .filter(|ctx| *ctx != UNLABELED_CSTR)
2996            .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2997            .transpose()?;
2998
2999        #[allow(unused_variables)]
3000        #[cfg(feature = "arc_quota")]
3001        let (uid, gid) = self.change_creds(&ctx, &data, name);
3002        #[cfg(feature = "fs_runtime_ugid_map")]
3003        let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
3004        #[cfg(not(feature = "fs_permission_translation"))]
3005        let (uid, gid) = (ctx.uid, ctx.gid);
3006
3007        let (_uid, _gid) = set_creds(uid, gid)?;
3008        {
3009            let _scoped_umask = ScopedUmask::new(umask);
3010            let casefold_cache = self.lock_casefold_lookup_caches();
3011
3012            // SAFETY: this doesn't modify any memory and we check the return value.
3013            syscall!(unsafe {
3014                libc::mknodat(
3015                    data.as_raw_descriptor(),
3016                    name.as_ptr(),
3017                    mode as libc::mode_t,
3018                    rdev as libc::dev_t,
3019                )
3020            })?;
3021            if let Some(mut c) = casefold_cache {
3022                c.insert(parent, name);
3023            }
3024        }
3025
3026        self.do_lookup(&data, name)
3027    }
3028
3029    fn link(
3030        &self,
3031        _ctx: Context,
3032        inode: Inode,
3033        newparent: Inode,
3034        newname: &CStr,
3035    ) -> io::Result<Entry> {
3036        let _trace = fs_trace!(self.tag, "link", inode, newparent, newname);
3037        let data = self.find_inode(inode)?;
3038        let new_inode = self.find_inode(newparent)?;
3039
3040        let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
3041            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
3042
3043        {
3044            let casefold_cache = self.lock_casefold_lookup_caches();
3045            // SAFETY: this doesn't modify any memory and we check the return value.
3046            syscall!(unsafe {
3047                libc::linkat(
3048                    self.proc.as_raw_descriptor(),
3049                    path.as_ptr(),
3050                    new_inode.as_raw_descriptor(),
3051                    newname.as_ptr(),
3052                    libc::AT_SYMLINK_FOLLOW,
3053                )
3054            })?;
3055            if let Some(mut c) = casefold_cache {
3056                c.insert(newparent, newname);
3057            }
3058        }
3059
3060        self.do_lookup(&new_inode, newname)
3061    }
3062
3063    fn symlink(
3064        &self,
3065        ctx: Context,
3066        linkname: &CStr,
3067        parent: Inode,
3068        name: &CStr,
3069        security_ctx: Option<&CStr>,
3070    ) -> io::Result<Entry> {
3071        let _trace = fs_trace!(self.tag, "symlink", parent, linkname, name, security_ctx);
3072        let data = self.find_inode(parent)?;
3073
3074        let _ctx = security_ctx
3075            .filter(|ctx| *ctx != UNLABELED_CSTR)
3076            .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
3077            .transpose()?;
3078
3079        #[allow(unused_variables)]
3080        #[cfg(feature = "arc_quota")]
3081        let (uid, gid) = self.change_creds(&ctx, &data, name);
3082        #[cfg(feature = "fs_runtime_ugid_map")]
3083        let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
3084        #[cfg(not(feature = "fs_permission_translation"))]
3085        let (uid, gid) = (ctx.uid, ctx.gid);
3086
3087        let (_uid, _gid) = set_creds(uid, gid)?;
3088        {
3089            let casefold_cache = self.lock_casefold_lookup_caches();
3090            // SAFETY: this doesn't modify any memory and we check the return value.
3091            syscall!(unsafe {
3092                libc::symlinkat(linkname.as_ptr(), data.as_raw_descriptor(), name.as_ptr())
3093            })?;
3094            if let Some(mut c) = casefold_cache {
3095                c.insert(parent, name);
3096            }
3097        }
3098
3099        self.do_lookup(&data, name)
3100    }
3101
3102    fn readlink(&self, _ctx: Context, inode: Inode) -> io::Result<Vec<u8>> {
3103        let _trace = fs_trace!(self.tag, "readlink", inode);
3104        let data = self.find_inode(inode)?;
3105
3106        let mut buf = vec![0; libc::PATH_MAX as usize];
3107
3108        // SAFETY: this will only modify the contents of `buf` and we check the return value.
3109        let res = syscall!(unsafe {
3110            libc::readlinkat(
3111                data.as_raw_descriptor(),
3112                EMPTY_CSTR.as_ptr(),
3113                buf.as_mut_ptr() as *mut libc::c_char,
3114                buf.len(),
3115            )
3116        })?;
3117
3118        buf.resize(res as usize, 0);
3119
3120        #[cfg(feature = "fs_runtime_ugid_map")]
3121        {
3122            let link_target = Path::new(OsStr::from_bytes(&buf[..res as usize]));
3123            if !link_target.starts_with(&self.root_dir) {
3124                return Err(io::Error::new(
3125                    io::ErrorKind::InvalidInput,
3126                    "Symbolic link points outside of root_dir",
3127                ));
3128            }
3129        }
3130        Ok(buf)
3131    }
3132
3133    fn flush(
3134        &self,
3135        _ctx: Context,
3136        inode: Inode,
3137        handle: Handle,
3138        _lock_owner: u64,
3139    ) -> io::Result<()> {
3140        let _trace = fs_trace!(self.tag, "flush", inode, handle);
3141        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
3142            self.find_inode(inode)?
3143        } else {
3144            self.find_handle(handle, inode)?
3145        };
3146
3147        // SAFETY:
3148        // Since this method is called whenever an fd is closed in the client, we can emulate that
3149        // behavior by doing the same thing (dup-ing the fd and then immediately closing it). Safe
3150        // because this doesn't modify any memory and we check the return values.
3151        unsafe {
3152            let newfd = syscall!(libc::fcntl(
3153                data.as_raw_descriptor(),
3154                libc::F_DUPFD_CLOEXEC,
3155                0
3156            ))?;
3157
3158            syscall!(libc::close(newfd))?;
3159        }
3160        Ok(())
3161    }
3162
3163    fn fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()> {
3164        if self.zero_message_open.load(Ordering::Relaxed) {
3165            let _trace = fs_trace!(self.tag, "fsync (zero-message)", inode, datasync, handle);
3166            let data = self.find_inode(inode)?;
3167            self.do_fsync(&*data, datasync)
3168        } else {
3169            let _trace = fs_trace!(self.tag, "fsync", inode, datasync, handle);
3170            let data = self.find_handle(handle, inode)?;
3171
3172            let file = data.file.lock();
3173            self.do_fsync(&*file, datasync)
3174        }
3175    }
3176
3177    fn fsyncdir(
3178        &self,
3179        _ctx: Context,
3180        inode: Inode,
3181        datasync: bool,
3182        handle: Handle,
3183    ) -> io::Result<()> {
3184        if self.zero_message_opendir.load(Ordering::Relaxed) {
3185            let _trace = fs_trace!(self.tag, "fsyncdir (zero-message)", inode, datasync, handle);
3186            let data = self.find_inode(inode)?;
3187            self.do_fsync(&*data, datasync)
3188        } else {
3189            let _trace = fs_trace!(self.tag, "fsyncdir", inode, datasync, handle);
3190            let data = self.find_handle(handle, inode)?;
3191
3192            let file = data.file.lock();
3193            self.do_fsync(&*file, datasync)
3194        }
3195    }
3196
3197    fn access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()> {
3198        let _trace = fs_trace!(self.tag, "access", inode, mask);
3199        let data = self.find_inode(inode)?;
3200
3201        let st = stat(&*data)?;
3202        let mode = mask as i32 & (libc::R_OK | libc::W_OK | libc::X_OK);
3203
3204        if mode == libc::F_OK {
3205            // The file exists since we were able to call `stat(2)` on it.
3206            return Ok(());
3207        }
3208
3209        if (mode & libc::R_OK) != 0 {
3210            if ctx.uid != 0
3211                && (st.st_uid != ctx.uid || st.st_mode & 0o400 == 0)
3212                && (st.st_gid != ctx.gid || st.st_mode & 0o040 == 0)
3213                && st.st_mode & 0o004 == 0
3214            {
3215                return Err(io::Error::from_raw_os_error(libc::EACCES));
3216            }
3217        }
3218
3219        if (mode & libc::W_OK) != 0 {
3220            if ctx.uid != 0
3221                && (st.st_uid != ctx.uid || st.st_mode & 0o200 == 0)
3222                && (st.st_gid != ctx.gid || st.st_mode & 0o020 == 0)
3223                && st.st_mode & 0o002 == 0
3224            {
3225                return Err(io::Error::from_raw_os_error(libc::EACCES));
3226            }
3227        }
3228
3229        // root can only execute something if it is executable by one of the owner, the group, or
3230        // everyone.
3231        if (mode & libc::X_OK) != 0 {
3232            if (ctx.uid != 0 || st.st_mode & 0o111 == 0)
3233                && (st.st_uid != ctx.uid || st.st_mode & 0o100 == 0)
3234                && (st.st_gid != ctx.gid || st.st_mode & 0o010 == 0)
3235                && st.st_mode & 0o001 == 0
3236            {
3237                return Err(io::Error::from_raw_os_error(libc::EACCES));
3238            }
3239        }
3240
3241        Ok(())
3242    }
3243
3244    fn setxattr(
3245        &self,
3246        _ctx: Context,
3247        inode: Inode,
3248        name: &CStr,
3249        value: &[u8],
3250        flags: u32,
3251    ) -> io::Result<()> {
3252        let _trace = fs_trace!(self.tag, "setxattr", inode, name, flags);
3253        // We can't allow the VM to set this xattr because an unprivileged process may use it to set
3254        // a privileged xattr.
3255        if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
3256            return Err(io::Error::from_raw_os_error(libc::EPERM));
3257        }
3258
3259        let data = self.find_inode(inode)?;
3260        let name = self.rewrite_xattr_name(name);
3261
3262        #[cfg(feature = "arc_quota")]
3263        if self.skip_host_set_xattr(&data.path, &name.to_string_lossy()) {
3264            debug!(
3265                "ignore setxattr for path:{} xattr_name:{}",
3266                &data.path,
3267                &name.to_string_lossy()
3268            );
3269            return Ok(());
3270        }
3271
3272        let file = data.file.lock();
3273        let o_path_file = (file.open_flags & libc::O_PATH) != 0;
3274        if o_path_file {
3275            // For FDs opened with `O_PATH`, we cannot call `fsetxattr` normally. Instead we emulate
3276            // an _at syscall by changing the CWD to /proc, running the path based syscall, and then
3277            // setting the CWD back to the root directory.
3278            let path = CString::new(format!("self/fd/{}", file.as_raw_descriptor()))
3279                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
3280
3281            syscall!(self.with_proc_chdir(|| {
3282                // SAFETY: this doesn't modify any memory and we check the return value.
3283                unsafe {
3284                    libc::setxattr(
3285                        path.as_ptr(),
3286                        name.as_ptr(),
3287                        value.as_ptr() as *const libc::c_void,
3288                        value.len() as libc::size_t,
3289                        flags as c_int,
3290                    )
3291                }
3292            }))?;
3293        } else {
3294            syscall!(
3295                // For regular files and directories, we can just use fsetxattr.
3296                // SAFETY: this doesn't modify any memory and we check the return value.
3297                unsafe {
3298                    libc::fsetxattr(
3299                        file.as_raw_descriptor(),
3300                        name.as_ptr(),
3301                        value.as_ptr() as *const libc::c_void,
3302                        value.len() as libc::size_t,
3303                        flags as c_int,
3304                    )
3305                }
3306            )?;
3307        }
3308
3309        Ok(())
3310    }
3311
3312    fn getxattr(
3313        &self,
3314        _ctx: Context,
3315        inode: Inode,
3316        name: &CStr,
3317        size: u32,
3318    ) -> io::Result<GetxattrReply> {
3319        let _trace = fs_trace!(self.tag, "getxattr", inode, name, size);
3320        // We don't allow the VM to set this xattr so we also pretend there is no value associated
3321        // with it.
3322        if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
3323            return Err(io::Error::from_raw_os_error(libc::ENODATA));
3324        }
3325
3326        let data = self.find_inode(inode)?;
3327        let name = self.rewrite_xattr_name(name);
3328        let mut buf = vec![0u8; size as usize];
3329
3330        #[cfg(feature = "arc_quota")]
3331        let res = self.do_getxattr_with_filter(data, name, &mut buf)?;
3332
3333        #[cfg(not(feature = "arc_quota"))]
3334        let res = self.do_getxattr(&data, &name, &mut buf[..])?;
3335
3336        if size == 0 {
3337            Ok(GetxattrReply::Count(res as u32))
3338        } else {
3339            buf.truncate(res);
3340            Ok(GetxattrReply::Value(buf))
3341        }
3342    }
3343
3344    fn listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result<ListxattrReply> {
3345        let _trace = fs_trace!(self.tag, "listxattr", inode, size);
3346        let data = self.find_inode(inode)?;
3347
3348        let mut buf = vec![0u8; size as usize];
3349
3350        let file = data.file.lock();
3351        let o_path_file = (file.open_flags & libc::O_PATH) != 0;
3352        let res = if o_path_file {
3353            // For FDs opened with `O_PATH`, we cannot call `flistxattr` normally. Instead we
3354            // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
3355            // and then setting the CWD back to the root directory.
3356            let path = CString::new(format!("self/fd/{}", file.as_raw_descriptor()))
3357                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
3358
3359            // SAFETY: this will only modify `buf` and we check the return value.
3360            syscall!(self.with_proc_chdir(|| unsafe {
3361                libc::listxattr(
3362                    path.as_ptr(),
3363                    buf.as_mut_ptr() as *mut libc::c_char,
3364                    buf.len() as libc::size_t,
3365                )
3366            }))?
3367        } else {
3368            // For regular files and directories, we can just flistxattr.
3369            // SAFETY: this will only write to `buf` and we check the return value.
3370            syscall!(unsafe {
3371                libc::flistxattr(
3372                    file.as_raw_descriptor(),
3373                    buf.as_mut_ptr() as *mut libc::c_char,
3374                    buf.len() as libc::size_t,
3375                )
3376            })?
3377        };
3378
3379        if size == 0 {
3380            Ok(ListxattrReply::Count(res as u32))
3381        } else {
3382            buf.truncate(res as usize);
3383
3384            if self.cfg.rewrite_security_xattrs {
3385                strip_xattr_prefix(&mut buf);
3386            }
3387            Ok(ListxattrReply::Names(buf))
3388        }
3389    }
3390
3391    fn removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()> {
3392        let _trace = fs_trace!(self.tag, "removexattr", inode, name);
3393        // We don't allow the VM to set this xattr so we also pretend there is no value associated
3394        // with it.
3395        if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
3396            return Err(io::Error::from_raw_os_error(libc::ENODATA));
3397        }
3398
3399        let data = self.find_inode(inode)?;
3400        let name = self.rewrite_xattr_name(name);
3401
3402        let file = data.file.lock();
3403        let o_path_file = (file.open_flags & libc::O_PATH) != 0;
3404        if o_path_file {
3405            // For files opened with `O_PATH`, we cannot call `fremovexattr` normally. Instead we
3406            // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
3407            // and then setting the CWD back to the root directory.
3408            let path = CString::new(format!("self/fd/{}", file.as_raw_descriptor()))
3409                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
3410
3411            syscall!(self.with_proc_chdir(||
3412                    // SAFETY: this doesn't modify any memory and we check the return value.
3413                    unsafe { libc::removexattr(path.as_ptr(), name.as_ptr()) }))?;
3414        } else {
3415            // For regular files and directories, we can just use fremovexattr.
3416            syscall!(
3417                // SAFETY: this doesn't modify any memory and we check the return value.
3418                unsafe { libc::fremovexattr(file.as_raw_descriptor(), name.as_ptr()) }
3419            )?;
3420        }
3421
3422        Ok(())
3423    }
3424
3425    fn fallocate(
3426        &self,
3427        _ctx: Context,
3428        inode: Inode,
3429        handle: Handle,
3430        mode: u32,
3431        offset: u64,
3432        length: u64,
3433    ) -> io::Result<()> {
3434        let _trace = fs_trace!(self.tag, "fallocate", inode, handle, mode, offset, length);
3435
3436        let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
3437            let data = self.find_inode(inode)?;
3438
3439            {
3440                // fallocate needs a writable fd
3441                let mut file = data.file.lock();
3442                let mut flags = file.open_flags;
3443                match flags & libc::O_ACCMODE {
3444                    libc::O_RDONLY => {
3445                        flags &= !libc::O_RDONLY;
3446                        flags |= libc::O_RDWR;
3447
3448                        // We need to get a writable handle for this file.
3449                        let newfile = self.open_fd(file.as_raw_descriptor(), libc::O_RDWR)?;
3450                        *file = OpenedFile::new(newfile, flags);
3451                    }
3452                    libc::O_WRONLY | libc::O_RDWR => {}
3453                    _ => panic!("Unexpected flags: {flags:#x}"),
3454                }
3455            }
3456
3457            data
3458        } else {
3459            self.find_handle(handle, inode)?
3460        };
3461
3462        let fd = data.as_raw_descriptor();
3463        // SAFETY: this doesn't modify any memory and we check the return value.
3464        syscall!(unsafe {
3465            libc::fallocate64(
3466                fd,
3467                mode as libc::c_int,
3468                offset as libc::off64_t,
3469                length as libc::off64_t,
3470            )
3471        })?;
3472
3473        Ok(())
3474    }
3475
3476    #[allow(clippy::unnecessary_cast)]
3477    fn ioctl<R: io::Read>(
3478        &self,
3479        ctx: Context,
3480        inode: Inode,
3481        handle: Handle,
3482        _flags: IoctlFlags,
3483        cmd: u32,
3484        _arg: u64,
3485        in_size: u32,
3486        out_size: u32,
3487        r: R,
3488    ) -> io::Result<IoctlReply> {
3489        let _trace = fs_trace!(self.tag, "ioctl", inode, handle, cmd, in_size, out_size);
3490
3491        match cmd as IoctlNr {
3492            FS_IOC_GET_ENCRYPTION_POLICY_EX => self.get_encryption_policy_ex(inode, handle, r),
3493            FS_IOC_FSGETXATTR => {
3494                if out_size < size_of::<fsxattr>() as u32 {
3495                    Err(io::Error::from_raw_os_error(libc::ENOMEM))
3496                } else {
3497                    self.get_fsxattr(inode, handle)
3498                }
3499            }
3500            FS_IOC_FSSETXATTR => {
3501                if in_size < size_of::<fsxattr>() as u32 {
3502                    Err(io::Error::from_raw_os_error(libc::EINVAL))
3503                } else {
3504                    self.set_fsxattr(ctx, inode, handle, r)
3505                }
3506            }
3507            FS_IOC32_GETFLAGS | FS_IOC64_GETFLAGS => {
3508                if out_size < size_of::<c_int>() as u32 {
3509                    Err(io::Error::from_raw_os_error(libc::ENOMEM))
3510                } else {
3511                    self.get_flags(inode, handle)
3512                }
3513            }
3514            FS_IOC32_SETFLAGS | FS_IOC64_SETFLAGS => {
3515                if in_size < size_of::<c_int>() as u32 {
3516                    Err(io::Error::from_raw_os_error(libc::ENOMEM))
3517                } else {
3518                    self.set_flags(ctx, inode, handle, r)
3519                }
3520            }
3521            FS_IOC_ENABLE_VERITY => {
3522                if in_size < size_of::<fsverity_enable_arg>() as u32 {
3523                    Err(io::Error::from_raw_os_error(libc::ENOMEM))
3524                } else {
3525                    self.enable_verity(inode, handle, r)
3526                }
3527            }
3528            FS_IOC_MEASURE_VERITY => {
3529                if in_size < size_of::<fsverity_digest>() as u32
3530                    || out_size < size_of::<fsverity_digest>() as u32
3531                {
3532                    Err(io::Error::from_raw_os_error(libc::ENOMEM))
3533                } else {
3534                    self.measure_verity(inode, handle, r, out_size)
3535                }
3536            }
3537            // The following is ARCVM-specific ioctl
3538            // Refer go/remove-mount-passthrough-fuse for more design details
3539            #[cfg(feature = "arc_quota")]
3540            FS_IOC_SETPERMISSION => {
3541                if in_size != size_of::<FsPermissionDataBuffer>() as u32 {
3542                    Err(io::Error::from_raw_os_error(libc::EINVAL))
3543                } else {
3544                    Ok(self.set_permission_by_path(r))
3545                }
3546            }
3547            #[cfg(feature = "arc_quota")]
3548            FS_IOC_SETPATHXATTR => {
3549                if in_size != size_of::<FsPathXattrDataBuffer>() as u32 {
3550                    Err(io::Error::from_raw_os_error(libc::EINVAL))
3551                } else {
3552                    Ok(self.set_xattr_by_path(r))
3553                }
3554            }
3555            _ => Err(io::Error::from_raw_os_error(libc::ENOTTY)),
3556        }
3557    }
3558
3559    fn copy_file_range(
3560        &self,
3561        ctx: Context,
3562        inode_src: Inode,
3563        handle_src: Handle,
3564        offset_src: u64,
3565        inode_dst: Inode,
3566        handle_dst: Handle,
3567        offset_dst: u64,
3568        length: u64,
3569        flags: u64,
3570    ) -> io::Result<usize> {
3571        let _trace = fs_trace!(
3572            self.tag,
3573            "copy_file_range",
3574            inode_src,
3575            handle_src,
3576            offset_src,
3577            inode_dst,
3578            handle_dst,
3579            offset_dst,
3580            length,
3581            flags
3582        );
3583        let dst_inode_data = self.find_inode(inode_dst)?;
3584
3585        #[allow(unused_variables)]
3586        #[cfg(feature = "arc_quota")]
3587        let (uid, gid) = self.change_creds_for_path(&ctx, &dst_inode_data.path);
3588        #[cfg(feature = "fs_runtime_ugid_map")]
3589        let (uid, gid) = self.change_ugid_creds_for_path(&ctx, &dst_inode_data.path);
3590        #[cfg(not(feature = "fs_permission_translation"))]
3591        let (uid, gid) = (ctx.uid, ctx.gid);
3592
3593        // We need to change credentials during a write so that the kernel will remove setuid or
3594        // setgid bits from the file if it was written to by someone other than the owner.
3595        let (_uid, _gid) = set_creds(uid, gid)?;
3596        let (src_data, dst_data): (Arc<dyn AsRawDescriptor>, Arc<dyn AsRawDescriptor>) =
3597            if self.zero_message_open.load(Ordering::Relaxed) {
3598                (self.find_inode(inode_src)?, dst_inode_data)
3599            } else {
3600                (
3601                    self.find_handle(handle_src, inode_src)?,
3602                    self.find_handle(handle_dst, inode_dst)?,
3603                )
3604            };
3605
3606        let src = src_data.as_raw_descriptor();
3607        let dst = dst_data.as_raw_descriptor();
3608
3609        Ok(syscall!(
3610            // SAFETY: this call is safe because it doesn't modify any memory and we
3611            // check the return value.
3612            unsafe {
3613                libc::syscall(
3614                    libc::SYS_copy_file_range,
3615                    src,
3616                    &offset_src,
3617                    dst,
3618                    &offset_dst,
3619                    length,
3620                    flags,
3621                )
3622            }
3623        )? as usize)
3624    }
3625
3626    fn set_up_mapping<M: Mapper>(
3627        &self,
3628        _ctx: Context,
3629        inode: Self::Inode,
3630        _handle: Self::Handle,
3631        file_offset: u64,
3632        mem_offset: u64,
3633        size: usize,
3634        prot: u32,
3635        mapper: M,
3636    ) -> io::Result<()> {
3637        let _trace = fs_trace!(
3638            self.tag,
3639            "set_up_mapping",
3640            inode,
3641            file_offset,
3642            mem_offset,
3643            size,
3644            prot
3645        );
3646        if !self.cfg.use_dax {
3647            return Err(io::Error::from_raw_os_error(libc::ENOSYS));
3648        }
3649
3650        let read = prot & libc::PROT_READ as u32 != 0;
3651        let write = prot & libc::PROT_WRITE as u32 != 0;
3652        let (mmap_flags, prot) = match (read, write) {
3653            (true, true) => (libc::O_RDWR, Protection::read_write()),
3654            (true, false) => (libc::O_RDONLY, Protection::read()),
3655            // Write-only is mapped to O_RDWR since mmap always requires an fd opened for reading.
3656            (false, true) => (libc::O_RDWR, Protection::write()),
3657            (false, false) => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
3658        };
3659
3660        let data = self.find_inode(inode)?;
3661
3662        if self.zero_message_open.load(Ordering::Relaxed) {
3663            let mut file = data.file.lock();
3664            let mut open_flags = file.open_flags;
3665            match (mmap_flags, open_flags & libc::O_ACCMODE) {
3666                (libc::O_RDONLY, libc::O_WRONLY)
3667                | (libc::O_RDWR, libc::O_RDONLY)
3668                | (libc::O_RDWR, libc::O_WRONLY) => {
3669                    // We have a read-only or write-only fd and we need to upgrade it.
3670                    open_flags &= !libc::O_ACCMODE;
3671                    open_flags |= libc::O_RDWR;
3672
3673                    let newfile = self.open_fd(file.as_raw_descriptor(), libc::O_RDWR)?;
3674                    *file = OpenedFile::new(newfile, open_flags);
3675                }
3676                (libc::O_RDONLY, libc::O_RDONLY)
3677                | (libc::O_RDONLY, libc::O_RDWR)
3678                | (libc::O_RDWR, libc::O_RDWR) => {}
3679                (m, o) => panic!("Unexpected combination of access flags: ({m:#x}, {o:#x})"),
3680            }
3681            mapper.map(mem_offset, size, file.file(), file_offset, prot)
3682        } else {
3683            let file = self.open_inode(&data, mmap_flags | libc::O_NONBLOCK)?;
3684            mapper.map(mem_offset, size, &file, file_offset, prot)
3685        }
3686    }
3687
3688    fn remove_mapping<M: Mapper>(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()> {
3689        let _trace = fs_trace!(self.tag, "remove_mapping", msgs);
3690        if !self.cfg.use_dax {
3691            return Err(io::Error::from_raw_os_error(libc::ENOSYS));
3692        }
3693
3694        for RemoveMappingOne { moffset, len } in msgs {
3695            mapper.unmap(*moffset, *len)?;
3696        }
3697        Ok(())
3698    }
3699
3700    fn atomic_open(
3701        &self,
3702        ctx: Context,
3703        parent: Self::Inode,
3704        name: &CStr,
3705        mode: u32,
3706        flags: u32,
3707        umask: u32,
3708        security_ctx: Option<&CStr>,
3709    ) -> io::Result<(Entry, Option<Self::Handle>, OpenOptions)> {
3710        let _trace = fs_trace!(
3711            self.tag,
3712            "atomic_open",
3713            parent,
3714            name,
3715            mode,
3716            flags,
3717            umask,
3718            security_ctx
3719        );
3720        // Perform lookup but not create negative dentry
3721        let data = self.find_inode(parent)?;
3722
3723        #[allow(unused_variables)]
3724        #[cfg(feature = "arc_quota")]
3725        let (uid, gid) = self.change_creds(&ctx, &data, name);
3726        #[cfg(feature = "fs_runtime_ugid_map")]
3727        let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
3728        #[cfg(not(feature = "fs_permission_translation"))]
3729        let (uid, gid) = (ctx.uid, ctx.gid);
3730
3731        let (_uid, _gid) = set_creds(uid, gid)?;
3732
3733        // This lookup serves two purposes:
3734        // 1. If the O_CREATE flag is not set, it retrieves the d_entry for the file.
3735        // 2. If the O_CREATE flag is set, it checks whether the file exists.
3736        let res = self.do_lookup_with_casefold_fallback(&data, name);
3737
3738        if let Err(e) = res {
3739            if e.kind() == std::io::ErrorKind::NotFound && (flags as i32 & libc::O_CREAT) != 0 {
3740                // If the file did not exist & O_CREAT is set,
3741                // create file & set FILE_CREATED bits in open options
3742                let (entry, handler, mut opts) =
3743                    self.create(ctx, parent, name, mode, flags, umask, security_ctx)?;
3744                opts |= OpenOptions::FILE_CREATED;
3745                return Ok((entry, handler, opts));
3746            } else if e.kind() == std::io::ErrorKind::NotFound
3747                && !self.cfg.negative_timeout.is_zero()
3748            {
3749                return Ok((
3750                    Entry::new_negative(self.cfg.negative_timeout),
3751                    None,
3752                    OpenOptions::empty(),
3753                ));
3754            }
3755            return Err(e);
3756        }
3757
3758        // SAFETY: checked res is not error before
3759        let entry = res.unwrap();
3760
3761        if entry.attr.st_mode & libc::S_IFMT == libc::S_IFLNK {
3762            return Ok((entry, None, OpenOptions::empty()));
3763        }
3764
3765        if (flags as i32 & (libc::O_CREAT | libc::O_EXCL)) == (libc::O_CREAT | libc::O_EXCL) {
3766            return Err(eexist());
3767        }
3768
3769        let (handler, opts) = if self.zero_message_open.load(Ordering::Relaxed) {
3770            (None, OpenOptions::KEEP_CACHE)
3771        } else {
3772            let (handler, opts) = self.do_open(entry.inode, flags)?;
3773            (handler, opts)
3774        };
3775        Ok((entry, handler, opts))
3776    }
3777}
3778
3779#[cfg(test)]
3780mod tests {
3781    use std::path::Path;
3782
3783    use named_lock::NamedLock;
3784    use tempfile::TempDir;
3785
3786    use super::*;
3787    #[cfg(feature = "arc_quota")]
3788    use crate::virtio::fs::arc_ioctl::FS_IOCTL_PATH_MAX_LEN;
3789    #[cfg(feature = "arc_quota")]
3790    use crate::virtio::fs::arc_ioctl::FS_IOCTL_XATTR_NAME_MAX_LEN;
3791    #[cfg(feature = "arc_quota")]
3792    use crate::virtio::fs::arc_ioctl::FS_IOCTL_XATTR_VALUE_MAX_LEN;
3793
3794    const UNITTEST_LOCK_NAME: &str = "passthroughfs_unittest_lock";
3795
3796    // Create an instance of `Context` with valid uid, gid, and pid.
3797    // The correct ids are necessary for test cases where new files are created.
3798    fn get_context() -> Context {
3799        // SAFETY: both calls take no parameters and only return an integer value. The kernel also
3800        // guarantees that they can never fail.
3801        let uid = unsafe { libc::syscall(SYS_GETEUID) as libc::uid_t };
3802        // SAFETY: both calls take no parameters and only return an integer value. The kernel also
3803        // guarantees that they can never fail.
3804        let gid = unsafe { libc::syscall(SYS_GETEGID) as libc::gid_t };
3805        let pid = std::process::id() as libc::pid_t;
3806        Context { uid, gid, pid }
3807    }
3808
3809    /// Creates the given directories and files under `temp_dir`.
3810    fn create_test_data(temp_dir: &TempDir, dirs: &[&str], files: &[&str]) {
3811        let path = temp_dir.path();
3812
3813        for d in dirs {
3814            std::fs::create_dir_all(path.join(d)).unwrap();
3815        }
3816
3817        for f in files {
3818            File::create(path.join(f)).unwrap();
3819        }
3820    }
3821
3822    /// Looks up the given `path` in `fs`.
3823    fn lookup(fs: &PassthroughFs, path: &Path) -> io::Result<Inode> {
3824        let mut inode = 1;
3825        let ctx = get_context();
3826        for name in path.iter() {
3827            let name = CString::new(name.to_str().unwrap()).unwrap();
3828            let ent = match fs.lookup(ctx, inode, &name) {
3829                Ok(ent) => ent,
3830                Err(e) => {
3831                    return Err(e);
3832                }
3833            };
3834            inode = ent.inode;
3835        }
3836        Ok(inode)
3837    }
3838
3839    /// Looks up the given `path` in `fs`.
3840    #[cfg(feature = "arc_quota")]
3841    fn lookup_ent(fs: &PassthroughFs, path: &Path) -> io::Result<Entry> {
3842        let mut inode = 1;
3843        let ctx = get_context();
3844        let mut entry = Entry::new_negative(Duration::from_secs(10));
3845        for name in path.iter() {
3846            let name = CString::new(name.to_str().unwrap()).unwrap();
3847            entry = match fs.lookup(ctx, inode, &name) {
3848                Ok(ent) => ent,
3849                Err(e) => {
3850                    return Err(e);
3851                }
3852            };
3853            inode = entry.inode;
3854        }
3855        Ok(entry)
3856    }
3857
3858    /// Creates a file at the given `path`.
3859    fn create(fs: &PassthroughFs, path: &Path) -> io::Result<Entry> {
3860        let parent = path.parent().unwrap();
3861        let filename = CString::new(path.file_name().unwrap().to_str().unwrap()).unwrap();
3862        let parent_inode = lookup(fs, parent)?;
3863        let ctx = get_context();
3864        let security_ctx = None;
3865        fs.create(
3866            ctx,
3867            parent_inode,
3868            &filename,
3869            0o666,
3870            libc::O_RDWR as u32,
3871            0,
3872            security_ctx,
3873        )
3874        .map(|(entry, _, _)| entry)
3875    }
3876
3877    /// Removes a file at the given `path`.
3878    fn unlink(fs: &PassthroughFs, path: &Path) -> io::Result<()> {
3879        let parent = path.parent().unwrap();
3880        let filename = CString::new(path.file_name().unwrap().to_str().unwrap()).unwrap();
3881        let parent_inode = lookup(fs, parent)?;
3882        let ctx = get_context();
3883        fs.unlink(ctx, parent_inode, &filename)
3884    }
3885
3886    /// Forgets cache.
3887    fn forget(fs: &PassthroughFs, path: &Path) -> io::Result<()> {
3888        let ctx = get_context();
3889        let inode = lookup(fs, path)?;
3890        // Pass `u64::MAX` to ensure that the refcount goes to 0 and we forget inode.
3891        fs.forget(ctx, inode, u64::MAX);
3892        Ok(())
3893    }
3894
3895    /// Looks up and open the given `path` in `fs`.
3896    fn atomic_open(
3897        fs: &PassthroughFs,
3898        path: &Path,
3899        mode: u32,
3900        flags: u32,
3901        umask: u32,
3902        security_ctx: Option<&CStr>,
3903    ) -> io::Result<(Entry, Option<Handle>, OpenOptions)> {
3904        let mut inode = 1;
3905        let ctx = get_context();
3906
3907        let path_vec: Vec<_> = path.iter().collect();
3908        let vec_len = path_vec.len();
3909
3910        // Do lookup before util (vec_len-1)-th pathname, this operation is to simulate
3911        // the behavior of VFS, since when VFS call atomic_open only at last look up.
3912        for name in &path_vec[0..vec_len - 1] {
3913            let name = CString::new(name.to_str().unwrap()).unwrap();
3914            let ent = fs.lookup(ctx, inode, &name)?;
3915            inode = ent.inode;
3916        }
3917
3918        let name = CString::new(path_vec[vec_len - 1].to_str().unwrap()).unwrap();
3919
3920        fs.atomic_open(ctx, inode, &name, mode, flags, umask, security_ctx)
3921    }
3922
3923    fn symlink(
3924        fs: &PassthroughFs,
3925        linkname: &Path,
3926        name: &Path,
3927        security_ctx: Option<&CStr>,
3928    ) -> io::Result<Entry> {
3929        let inode = 1;
3930        let ctx = get_context();
3931        let name = CString::new(name.to_str().unwrap()).unwrap();
3932        let linkname = CString::new(linkname.to_str().unwrap()).unwrap();
3933        fs.symlink(ctx, &linkname, inode, &name, security_ctx)
3934    }
3935
3936    // In this ioctl inode,handle,flags,arg and out_size is irrelavant, set to empty value.
3937    #[cfg(feature = "arc_quota")]
3938    fn fs_ioc_setpermission<R: io::Read>(
3939        fs: &PassthroughFs,
3940        in_size: u32,
3941        r: R,
3942    ) -> io::Result<IoctlReply> {
3943        let ctx = get_context();
3944        fs.ioctl(
3945            ctx,
3946            0,
3947            0,
3948            IoctlFlags::empty(),
3949            FS_IOC_SETPERMISSION as u32,
3950            0,
3951            in_size,
3952            0,
3953            r,
3954        )
3955    }
3956
3957    // In this ioctl inode,handle,flags,arg and out_size is irrelavant, set to empty value.
3958    #[cfg(feature = "arc_quota")]
3959    fn fs_ioc_setpathxattr<R: io::Read>(
3960        fs: &PassthroughFs,
3961        in_size: u32,
3962        r: R,
3963    ) -> io::Result<IoctlReply> {
3964        let ctx = get_context();
3965        fs.ioctl(
3966            ctx,
3967            0,
3968            0,
3969            IoctlFlags::empty(),
3970            FS_IOC_SETPATHXATTR as u32,
3971            0,
3972            in_size,
3973            0,
3974            r,
3975        )
3976    }
3977
/// Checks which xattr names `rewrite_xattr_name` leaves alone and which it rewrites when
/// `rewrite_security_xattrs` is enabled.
#[test]
fn rewrite_xattr_names() {
    // `PassthroughFs` may execute process-wide operations such as `fchdir`, so hold the
    // shared `NamedLock` for the whole test, like every test that creates an instance.
    let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
    let _guard = lock.lock().expect("acquire named lock");

    let fs = PassthroughFs::new(
        "tag",
        Config {
            rewrite_security_xattrs: true,
            ..Default::default()
        },
    )
    .expect("Failed to create PassthroughFs");

    // Selinux must not be overwritten, and user, trusted and system names must not be
    // changed either.
    for untouched in [
        c"security.selinux",
        c"user.foobar",
        c"trusted.foobar",
        c"system.foobar",
    ] {
        assert_eq!(
            fs.rewrite_xattr_name(untouched).to_bytes(),
            untouched.to_bytes()
        );
    }

    // sehash is expected to be re-written with the `user.virtiofs.` prefix.
    let rewritten = fs.rewrite_xattr_name(c"security.sehash");
    assert_eq!(rewritten.to_bytes(), b"user.virtiofs.security.sehash");
}
4011
/// Feeds a series of xattr name lists through `strip_xattr_prefix` and checks the resulting
/// buffers.
#[test]
fn strip_xattr_names() {
    // Runs `strip_xattr_prefix` on a copy of `input` and compares the buffer with `expected`.
    let check = |input: &[u8], expected: &[u8]| {
        let mut actual = input.to_vec();
        strip_xattr_prefix(&mut actual);
        assert_eq!(&actual[..], expected);
    };

    // Only NUL bytes: nothing to strip.
    check(b"\0\0\0\0\0", b"\0\0\0\0\0");

    // No NUL bytes, and the prefix text only appears at the end: unchanged.
    check(
        b"security.sehashuser.virtiofs",
        b"security.sehashuser.virtiofs",
    );

    // Empty input stays empty.
    check(b"", b"");

    // Names without the strippable prefix are left alone.
    check(
        b"security.selinux\0user.foobar\0system.test\0",
        b"security.selinux\0user.foobar\0system.test\0",
    );

    // Every name carries the prefix.
    check(
        b"user.virtiofs.security.sehash\0user.virtiofs.security.wat\0",
        b"security.sehash\0security.wat\0",
    );

    // Prefixed and unprefixed names mixed together.
    check(
        b"user.virtiofs.security.sehash\0security.selinux\0user.virtiofs.security.wat\0user.foobar\0",
        b"security.sehash\0security.selinux\0security.wat\0user.foobar\0",
    );

    // A single prefixed name without a trailing NUL.
    check(b"user.virtiofs.security.sehash", b"security.sehash");
}
4050
/// Looks up existing and missing paths with the default configuration.
#[test]
fn lookup_files() {
    // `PassthroughFs` may execute process-wide operations such as `fchdir`, so hold the
    // shared `NamedLock` for the whole test, like every test that creates an instance.
    let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
    let _guard = lock.lock().expect("acquire named lock");

    let temp_dir = TempDir::new().unwrap();
    create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt"]);

    let fs = PassthroughFs::new("tag", Default::default()).unwrap();
    fs.init(FsOptions::empty()).unwrap();

    let root = temp_dir.path();

    // Everything created above must be found.
    for present in ["a.txt", "dir", "dir/b.txt"] {
        assert!(lookup(&fs, &root.join(present)).is_ok());
    }

    // A name that was never created must not be found, and neither must "A.txt", which is
    // different from "a.txt".
    for absent in ["nonexistent-file", "A.txt"] {
        let err = lookup(&fs, &root.join(absent)).expect_err("file must not exist");
        assert_eq!(err.kind(), io::ErrorKind::NotFound);
    }
}
4085
/// Looks up paths with `ascii_casefold` enabled and checks that names differing only in
/// ASCII case resolve to the same inode.
#[test]
fn lookup_files_ascii_casefold() {
    // `PassthroughFs` may execute process-wide operations such as `fchdir`, so hold the
    // shared `NamedLock` for the whole test, like every test that creates an instance.
    let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
    let _guard = lock.lock().expect("acquire named lock");

    let temp_dir = TempDir::new().unwrap();
    create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt"]);

    let fs = PassthroughFs::new(
        "tag",
        Config {
            ascii_casefold: true,
            ..Default::default()
        },
    )
    .unwrap();
    fs.init(FsOptions::empty()).unwrap();

    let root = temp_dir.path();

    // Each row: the name as created, its failure message, a differently-cased spelling of
    // the same name, and that spelling's failure message.
    let cases = [
        ("a.txt", "a.txt must be found", "A.txt", "A.txt must exist"),
        ("dir", "dir must be found", "DiR", "DiR must exist"),
        (
            "dir/b.txt",
            "dir/b.txt must be found",
            "dIr/B.TxT",
            "dIr/B.TxT must exist",
        ),
    ];
    for (name, found_msg, variant, exist_msg) in cases {
        let inode = lookup(&fs, &root.join(name)).expect(found_msg);
        assert_eq!(lookup(&fs, &root.join(variant)).expect(exist_msg), inode);
    }

    // Case folding must not make a missing file appear.
    let err = lookup(&fs, &root.join("nonexistent-file")).expect_err("file must not exist");
    assert_eq!(err.kind(), io::ErrorKind::NotFound);
}
4132
4133    fn test_create_and_remove(ascii_casefold: bool) {
4134        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4135        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4136        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4137        let _guard = lock.lock().expect("acquire named lock");
4138
4139        let temp_dir = TempDir::new().unwrap();
4140        let timeout = Duration::from_millis(10);
4141        let cfg = Config {
4142            timeout,
4143            cache_policy: CachePolicy::Auto,
4144            ascii_casefold,
4145            ..Default::default()
4146        };
4147        let fs = PassthroughFs::new("tag", cfg).unwrap();
4148
4149        let capable = FsOptions::empty();
4150        fs.init(capable).unwrap();
4151
4152        // Create a.txt and b.txt.
4153        let a_path = temp_dir.path().join("a.txt");
4154        let b_path = temp_dir.path().join("b.txt");
4155        let a_entry = create(&fs, &a_path).expect("create a.txt");
4156        let b_entry = create(&fs, &b_path).expect("create b.txt");
4157        assert_eq!(
4158            a_entry.inode,
4159            lookup(&fs, &a_path).expect("lookup a.txt"),
4160            "Created file 'a.txt' must be looked up"
4161        );
4162        assert_eq!(
4163            b_entry.inode,
4164            lookup(&fs, &b_path).expect("lookup b.txt"),
4165            "Created file 'b.txt' must be looked up"
4166        );
4167
4168        // Remove a.txt only
4169        unlink(&fs, &a_path).expect("Remove");
4170        assert_eq!(
4171            lookup(&fs, &a_path)
4172                .expect_err("file must not exist")
4173                .kind(),
4174            io::ErrorKind::NotFound,
4175            "a.txt must be removed"
4176        );
4177        // "A.TXT" must not be found regardless of whether casefold is enabled or not.
4178        let upper_a_path = temp_dir.path().join("A.TXT");
4179        assert_eq!(
4180            lookup(&fs, &upper_a_path)
4181                .expect_err("file must not exist")
4182                .kind(),
4183            io::ErrorKind::NotFound,
4184            "A.txt must be removed"
4185        );
4186
4187        // Check if the host file system doesn't have a.txt but does b.txt.
4188        assert!(!a_path.exists(), "a.txt must be removed");
4189        assert!(b_path.exists(), "b.txt must exist");
4190    }
4191
/// Runs the create/remove scenario with ASCII case folding disabled.
#[test]
fn create_and_remove() {
    let ascii_casefold = false;
    test_create_and_remove(ascii_casefold);
}
4196
/// Runs the create/remove scenario with ASCII case folding enabled.
#[test]
fn create_and_remove_casefold() {
    let ascii_casefold = true;
    test_create_and_remove(ascii_casefold);
}
4201
4202    fn test_create_and_forget(ascii_casefold: bool) {
4203        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4204        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4205        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4206        let _guard = lock.lock().expect("acquire named lock");
4207
4208        let temp_dir = TempDir::new().unwrap();
4209        let timeout = Duration::from_millis(10);
4210        let cfg = Config {
4211            timeout,
4212            cache_policy: CachePolicy::Auto,
4213            ascii_casefold,
4214            ..Default::default()
4215        };
4216        let fs = PassthroughFs::new("tag", cfg).unwrap();
4217
4218        let capable = FsOptions::empty();
4219        fs.init(capable).unwrap();
4220
4221        // Create a.txt.
4222        let a_path = temp_dir.path().join("a.txt");
4223        let a_entry = create(&fs, &a_path).expect("create a.txt");
4224        assert_eq!(
4225            a_entry.inode,
4226            lookup(&fs, &a_path).expect("lookup a.txt"),
4227            "Created file 'a.txt' must be looked up"
4228        );
4229
4230        // Forget a.txt's inode from PassthroughFs's internal cache.
4231        forget(&fs, &a_path).expect("forget a.txt");
4232
4233        if ascii_casefold {
4234            let upper_a_path = temp_dir.path().join("A.TXT");
4235            let new_a_inode = lookup(&fs, &upper_a_path).expect("lookup a.txt");
4236            assert_ne!(
4237                a_entry.inode, new_a_inode,
4238                "inode must be changed after forget()"
4239            );
4240            assert_eq!(
4241                new_a_inode,
4242                lookup(&fs, &a_path).expect("lookup a.txt"),
4243                "inode must be same for a.txt and A.TXT"
4244            );
4245        } else {
4246            assert_ne!(
4247                a_entry.inode,
4248                lookup(&fs, &a_path).expect("lookup a.txt"),
4249                "inode must be changed after forget()"
4250            );
4251        }
4252    }
4253
/// Runs the create/forget scenario with ASCII case folding disabled.
#[test]
fn create_and_forget() {
    let ascii_casefold = false;
    test_create_and_forget(ascii_casefold);
}
4258
/// Runs the create/forget scenario with ASCII case folding enabled.
#[test]
fn create_and_forget_casefold() {
    let ascii_casefold = true;
    test_create_and_forget(ascii_casefold);
}
4263
/// Checks that lookups, creations and removals keep the casefold cache of the parent
/// directory up to date when `ascii_casefold` is enabled.
#[test]
fn casefold_lookup_cache() {
    // Since PassthroughFs may execute process-wide operations such as `fchdir`, acquire
    // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
    let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
    let _guard = lock.lock().expect("acquire named lock");

    let temp_dir = TempDir::new().unwrap();
    // Prepare `a.txt` before starting the test.
    create_test_data(&temp_dir, &[], &["a.txt"]);

    let cfg = Config {
        ascii_casefold: true,
        ..Default::default()
    };
    let fs = PassthroughFs::new("tag", cfg).unwrap();

    let capable = FsOptions::empty();
    fs.init(capable).unwrap();

    let parent = lookup(&fs, temp_dir.path()).expect("lookup temp_dir");

    // Since `a.txt` exists, "A.TXT" must exist.
    let upper_a_path = temp_dir.path().join("A.TXT");
    // Looking up "A.TXT" must create a CasefoldCache entry.
    lookup(&fs, &upper_a_path).expect("A.TXT must exist");
    assert!(fs.exists_in_casefold_cache(parent, &CString::new("A.TXT").unwrap()));

    // Create b.txt.
    let b_path = temp_dir.path().join("b.txt");
    create(&fs, &b_path).expect("create b.txt");
    // Then, b.txt must exist in the cache.
    assert!(fs.exists_in_casefold_cache(parent, &CString::new("B.TXT").unwrap()));
    // When removing b.txt, it must be removed from the cache as well.
    unlink(&fs, &b_path).expect("remove b.txt");
    assert!(!fs.exists_in_casefold_cache(parent, &CString::new("B.TXT").unwrap()));
}
4296
/// Checks that, with a non-zero `negative_timeout`, looking up a missing file succeeds with
/// inode 0 instead of failing, both before the file is created and after it is removed.
#[test]
fn lookup_negative_cache() {
    // Since PassthroughFs may execute process-wide operations such as `fchdir`, acquire
    // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
    let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
    let _guard = lock.lock().expect("acquire named lock");

    let temp_dir = TempDir::new().unwrap();
    // Start with no directories and no files: `a.txt` must not exist yet.
    create_test_data(&temp_dir, &[], &[]);

    let cfg = Config {
        negative_timeout: Duration::from_secs(5),
        ..Default::default()
    };
    let fs = PassthroughFs::new("tag", cfg).unwrap();

    let capable = FsOptions::empty();
    fs.init(capable).unwrap();

    let a_path = temp_dir.path().join("a.txt");
    // a.txt doesn't exist yet.
    // Since negative_timeout is enabled, success with inode=0 is expected.
    assert_eq!(
        0,
        lookup(&fs, &a_path).expect("lookup a.txt"),
        "Entry with inode=0 is expected for non-existing file 'a.txt'"
    );
    // Create a.txt
    let a_entry = create(&fs, &a_path).expect("create a.txt");
    assert_eq!(
        a_entry.inode,
        lookup(&fs, &a_path).expect("lookup a.txt"),
        "Created file 'a.txt' must be looked up"
    );
    // Remove a.txt
    unlink(&fs, &a_path).expect("Remove");
    assert_eq!(
        0,
        lookup(&fs, &a_path).expect("lookup a.txt"),
        "Entry with inode=0 is expected for the removed file 'a.txt'"
    );
}
/// Runs the existing-file `atomic_open` scenario without zero-message open.
#[test]
fn test_atomic_open_existing_file() {
    let zero_message_open = false;
    atomic_open_existing_file(zero_message_open);
}
4339
/// Runs the existing-file `atomic_open` scenario with zero-message open.
#[test]
fn test_atomic_open_existing_file_zero_message() {
    let zero_message_open = true;
    atomic_open_existing_file(zero_message_open);
}
4344
4345    fn atomic_open_existing_file(zero_message_open: bool) {
4346        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4347        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4348        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4349        let _guard = lock.lock().expect("acquire named lock");
4350
4351        let temp_dir = TempDir::new().unwrap();
4352        create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt", "dir/c.txt"]);
4353
4354        let cache_policy = match zero_message_open {
4355            true => CachePolicy::Always,
4356            false => CachePolicy::Auto,
4357        };
4358
4359        let cfg = Config {
4360            cache_policy,
4361            ..Default::default()
4362        };
4363        let fs = PassthroughFs::new("tag", cfg).unwrap();
4364
4365        let capable = FsOptions::ZERO_MESSAGE_OPEN;
4366        fs.init(capable).unwrap();
4367
4368        // atomic_open with flag O_RDWR, should return positive dentry and file handler
4369        let res = atomic_open(
4370            &fs,
4371            &temp_dir.path().join("a.txt"),
4372            0o666,
4373            libc::O_RDWR as u32,
4374            0,
4375            None,
4376        );
4377        assert!(res.is_ok());
4378        let (entry, handler, open_options) = res.unwrap();
4379        assert_ne!(entry.inode, 0);
4380
4381        if zero_message_open {
4382            assert!(handler.is_none());
4383            assert_eq!(open_options, OpenOptions::KEEP_CACHE);
4384        } else {
4385            assert!(handler.is_some());
4386            assert_ne!(
4387                open_options & OpenOptions::FILE_CREATED,
4388                OpenOptions::FILE_CREATED
4389            );
4390        }
4391
4392        // atomic_open with flag O_RDWR |  O_CREATE, should return positive dentry and file handler
4393        let res = atomic_open(
4394            &fs,
4395            &temp_dir.path().join("dir/b.txt"),
4396            0o666,
4397            (libc::O_RDWR | libc::O_CREAT) as u32,
4398            0,
4399            None,
4400        );
4401        assert!(res.is_ok());
4402        let (entry, handler, open_options) = res.unwrap();
4403        assert_ne!(entry.inode, 0);
4404
4405        if zero_message_open {
4406            assert!(handler.is_none());
4407            assert_eq!(open_options, OpenOptions::KEEP_CACHE);
4408        } else {
4409            assert!(handler.is_some());
4410            assert_ne!(
4411                open_options & OpenOptions::FILE_CREATED,
4412                OpenOptions::FILE_CREATED
4413            );
4414        }
4415
4416        // atomic_open with flag O_RDWR | O_CREATE | O_EXCL, should return positive dentry and file
4417        // handler
4418        let res = atomic_open(
4419            &fs,
4420            &temp_dir.path().join("dir/c.txt"),
4421            0o666,
4422            (libc::O_RDWR | libc::O_CREAT | libc::O_EXCL) as u32,
4423            0,
4424            None,
4425        );
4426        assert!(res.is_err());
4427        let err_kind = res.unwrap_err().kind();
4428        assert_eq!(err_kind, io::ErrorKind::AlreadyExists);
4429    }
4430
/// Runs the missing-file `atomic_open` scenario without zero-message open.
#[test]
fn test_atomic_open_non_existing_file() {
    let zero_message_open = false;
    atomic_open_non_existing_file(zero_message_open);
}
4435
/// Runs the missing-file `atomic_open` scenario with zero-message open.
#[test]
fn test_atomic_open_non_existing_file_zero_message() {
    let zero_message_open = true;
    atomic_open_non_existing_file(zero_message_open);
}
4440
4441    fn atomic_open_non_existing_file(zero_message_open: bool) {
4442        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4443        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4444        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4445        let _guard = lock.lock().expect("acquire named lock");
4446
4447        let temp_dir = TempDir::new().unwrap();
4448
4449        let cache_policy = match zero_message_open {
4450            true => CachePolicy::Always,
4451            false => CachePolicy::Auto,
4452        };
4453
4454        let cfg = Config {
4455            cache_policy,
4456            ..Default::default()
4457        };
4458        let fs = PassthroughFs::new("tag", cfg).unwrap();
4459
4460        let capable = FsOptions::ZERO_MESSAGE_OPEN;
4461        fs.init(capable).unwrap();
4462
4463        // atomic_open with flag O_RDWR, should return NO_EXIST error
4464        let res = atomic_open(
4465            &fs,
4466            &temp_dir.path().join("a.txt"),
4467            0o666,
4468            libc::O_RDWR as u32,
4469            0,
4470            None,
4471        );
4472        assert!(res.is_err());
4473        let err_kind = res.unwrap_err().kind();
4474        assert_eq!(err_kind, io::ErrorKind::NotFound);
4475
4476        // atomic_open with flag O_RDWR | O_CREATE, should return positive dentry and file handler
4477        let res = atomic_open(
4478            &fs,
4479            &temp_dir.path().join("b.txt"),
4480            0o666,
4481            (libc::O_RDWR | libc::O_CREAT) as u32,
4482            0,
4483            None,
4484        );
4485        assert!(res.is_ok());
4486        let (entry, handler, open_options) = res.unwrap();
4487        assert_ne!(entry.inode, 0);
4488
4489        if zero_message_open {
4490            assert!(handler.is_none());
4491            assert_eq!(
4492                open_options & OpenOptions::KEEP_CACHE,
4493                OpenOptions::KEEP_CACHE
4494            );
4495        } else {
4496            assert!(handler.is_some());
4497        }
4498        assert_eq!(
4499            open_options & OpenOptions::FILE_CREATED,
4500            OpenOptions::FILE_CREATED
4501        );
4502    }
4503
/// Checks that `atomic_open` on a symbolic link returns the link's own entry without a
/// handle, both while the link target exists and after it has been removed.
#[test]
fn atomic_open_symbol_link() {
    // `PassthroughFs` may execute process-wide operations such as `fchdir`, so hold the
    // shared `NamedLock` for the whole test, like every test that creates an instance.
    let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
    let _guard = lock.lock().expect("acquire named lock");

    let temp_dir = TempDir::new().unwrap();
    create_test_data(&temp_dir, &["dir"], &["a.txt"]);

    let fs = PassthroughFs::new("tag", Default::default()).unwrap();
    fs.init(FsOptions::empty()).unwrap();

    let target_path = temp_dir.path().join("a.txt");
    let link_path = temp_dir.path().join("blink");

    // Atomic open of the link destination file: a regular file yields an entry and a handle.
    let target_res = atomic_open(&fs, &target_path, 0o666, libc::O_RDWR as u32, 0, None);
    assert!(target_res.is_ok());
    let (target_entry, target_handler, _) = target_res.unwrap();
    assert_ne!(target_entry.inode, 0);
    assert!(target_handler.is_some());

    // Create a depth 1 symbolic link "blink" pointing at a.txt.
    let link_res = symlink(&fs, &target_path, &link_path, None);
    assert!(link_res.is_ok());
    let link_entry = link_res.unwrap();
    assert_ne!(link_entry.inode, 0);

    // atomic_open on the link must return the link's dentry, no handle and no open options.
    let assert_link_opens_without_handle = || {
        let res = atomic_open(&fs, &link_path, 0o666, libc::O_RDWR as u32, 0, None);
        assert!(res.is_ok());
        let (entry, handler, open_options) = res.unwrap();
        assert_eq!(entry.inode, link_entry.inode);
        assert!(handler.is_none());
        assert_eq!(open_options, OpenOptions::empty());
    };

    assert_link_opens_without_handle();

    // Delete the link destination.
    unlink(&fs, &target_path).expect("Remove");
    let err = lookup(&fs, &target_path).expect_err("file must not exist");
    assert_eq!(err.kind(), io::ErrorKind::NotFound, "a.txt must be removed");

    // With the destination removed, the link must still resolve to a valid dentry.
    assert_link_opens_without_handle();
}
4585
/// Sends a well-formed `FS_IOC_SETPERMISSION` request and checks that it is recorded in
/// `permission_paths`, then checks that a second request is rejected with `EPERM` because
/// `max_dynamic_perm` is 1.
#[test]
#[cfg(feature = "arc_quota")]
fn set_permission_ioctl_valid_data() {
    // `PassthroughFs` may execute process-wide operations such as `fchdir`, so hold the
    // shared `NamedLock` for the whole test, like every test that creates an instance.
    let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
    let _guard = lock.lock().expect("acquire named lock");

    let fs = PassthroughFs::new(
        "tag",
        Config {
            max_dynamic_perm: 1,
            ..Default::default()
        },
    )
    .expect("Failed to create PassthroughFs");

    // Build the request: the path is copied into a zero-filled, fixed-size buffer.
    let perm_path_string = String::from("/test");
    let mut perm_path = [0u8; FS_IOCTL_PATH_MAX_LEN];
    perm_path[..perm_path_string.len()].copy_from_slice(perm_path_string.as_bytes());
    let request = FsPermissionDataBuffer {
        guest_uid: 1,
        guest_gid: 2,
        host_uid: 3,
        host_gid: 4,
        umask: 5,
        pad: 0,
        perm_path,
    };
    let request_size = mem::size_of_val(&request) as u32;
    let reader = std::io::Cursor::new(request.as_bytes());

    // The first request must complete successfully with an empty reply.
    let first_reply = fs_ioc_setpermission(&fs, request_size, reader.clone())
        .expect("valid input should get IoctlReply");
    assert!(matches!(first_reply, IoctlReply::Done(Ok(data)) if data.is_empty()));

    // Check that the expected data item was added to permission_paths.
    let stored_paths = fs
        .permission_paths
        .read()
        .expect("read permission_paths failed");
    let stored = stored_paths
        .first()
        .expect("permission path should not be empty");
    let expected = PermissionData {
        guest_uid: 1,
        guest_gid: 2,
        host_uid: 3,
        host_gid: 4,
        umask: 5,
        perm_path: perm_path_string,
    };
    assert_eq!(*stored, expected);

    // The second ioctl should not succeed since max_dynamic_perm is set to 1.
    let second_reply = fs_ioc_setpermission(&fs, request_size, reader)
        .expect("valid input should get IoctlReply");
    assert!(matches!(
        second_reply,
        IoctlReply::Done(Err(err)) if err.raw_os_error() == Some(libc::EPERM)
    ));
}
4656
/// Sends two malformed `FS_IOC_SETPERMISSION` requests and checks that both are rejected
/// with `EINVAL`: one with an invalid path, one with the wrong payload size.
#[test]
#[cfg(feature = "arc_quota")]
fn set_permission_ioctl_invalid_data() {
    // `PassthroughFs` may execute process-wide operations such as `fchdir`, so hold the
    // shared `NamedLock` for the whole test, like every test that creates an instance.
    let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
    let _guard = lock.lock().expect("acquire named lock");

    let fs = PassthroughFs::new(
        "tag",
        Config {
            max_dynamic_perm: 1,
            ..Default::default()
        },
    )
    .expect("Failed to create PassthroughFs");

    // The perm_path is not valid since it does not start with /.
    let perm_path_string = String::from("test");
    let mut perm_path = [0u8; FS_IOCTL_PATH_MAX_LEN];
    perm_path[..perm_path_string.len()].copy_from_slice(perm_path_string.as_bytes());
    let request = FsPermissionDataBuffer {
        guest_uid: 1,
        guest_gid: 2,
        host_uid: 3,
        host_gid: 4,
        umask: 5,
        pad: 0,
        perm_path,
    };

    // The request has the right size, so the call itself returns an IoctlReply, but that
    // reply must carry EINVAL because the perm_path is invalid.
    let reader = std::io::Cursor::new(request.as_bytes());
    let reply = fs_ioc_setpermission(&fs, mem::size_of_val(&request) as u32, reader)
        .expect("invalid perm_path should get IoctlReply");
    assert!(matches!(
        reply,
        IoctlReply::Done(Err(err)) if err.raw_os_error() == Some(libc::EINVAL)
    ));

    // A payload whose in_size is not the size of struct FsPermissionDataBuffer must make
    // the call itself fail with EINVAL.
    let fake_data_buffer = [0u8; 128];
    let reader = std::io::Cursor::new(fake_data_buffer.as_bytes());
    let err = fs_ioc_setpermission(&fs, mem::size_of_val(&fake_data_buffer) as u32, reader)
        .expect_err("invalid in_size should get Error");
    assert!(err.raw_os_error() == Some(libc::EINVAL));
}
4709
/// Registers a `PermissionData` entry for `<temp_dir>/dir` and checks that only entries
/// under that directory report the configured guest uid, guest gid and umask-derived mode,
/// both for a pre-existing file and for one created through the file system.
#[test]
#[cfg(feature = "arc_quota")]
fn permission_data_path_matching() {
    // Since PassthroughFs may execute process-wide operations such as `fchdir`, acquire
    // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
    let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
    let _guard = lock.lock().expect("acquire named lock");

    let ctx = get_context();
    let temp_dir = TempDir::new().unwrap();
    // Prepare `a.txt` and `dir/a.txt` before starting the test.
    create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/a.txt"]);

    let cfg = Config {
        max_dynamic_perm: 1,
        ..Default::default()
    };
    let fs = PassthroughFs::new("tag", cfg).unwrap();

    let capable = FsOptions::empty();
    fs.init(capable).unwrap();

    const BY_PATH_UID: u32 = 655360;
    const BY_PATH_GID: u32 = 655361;
    const BY_PATH_UMASK: u32 = 0o007;

    let dir_path = temp_dir.path().join("dir");
    let permission_data = PermissionData {
        guest_uid: BY_PATH_UID,
        guest_gid: BY_PATH_GID,
        host_uid: ctx.uid,
        host_gid: ctx.gid,
        umask: BY_PATH_UMASK,
        perm_path: dir_path.to_string_lossy().into_owned(),
    };
    fs.permission_paths
        .write()
        .expect("permission_path lock must be acquired")
        .push(permission_data);

    // a_path is outside `dir`, so no permission is set for it by path.
    let a_path = temp_dir.path().join("a.txt");
    let in_dir_a_path = dir_path.join("a.txt");

    // a.txt should not be set with guest_uid/guest_gid/umask by path
    let a_entry = lookup_ent(&fs, &a_path).expect("a.txt must exist");
    assert_ne!(a_entry.attr.st_uid, BY_PATH_UID);
    assert_ne!(a_entry.attr.st_gid, BY_PATH_GID);

    // a.txt in dir should be set guest_uid/guest_gid/umask by path
    let in_dir_a_entry = lookup_ent(&fs, &in_dir_a_path).expect("dir/a.txt must exist");
    assert_eq!(in_dir_a_entry.attr.st_uid, BY_PATH_UID);
    assert_eq!(in_dir_a_entry.attr.st_gid, BY_PATH_GID);
    assert_eq!(in_dir_a_entry.attr.st_mode & 0o777, !BY_PATH_UMASK & 0o777);

    // Create dir/b.txt.
    let in_dir_b_path = dir_path.join("b.txt");
    create(&fs, &in_dir_b_path).expect("create b.txt");

    // newly created b.txt in dir should be set guest_uid/guest_gid/umask by path.
    // Look up b.txt itself, not a.txt again, so that the created file is what gets checked.
    let in_dir_b_entry = lookup_ent(&fs, &in_dir_b_path).expect("dir/b.txt must exist");
    assert_eq!(in_dir_b_entry.attr.st_uid, BY_PATH_UID);
    assert_eq!(in_dir_b_entry.attr.st_gid, BY_PATH_GID);
    assert_eq!(in_dir_b_entry.attr.st_mode & 0o777, !BY_PATH_UMASK & 0o777);
}
4770
4771    #[test]
4772    #[cfg(feature = "fs_permission_translation")]
4773    fn test_copy_file_range_path_mapping() {
4774        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4775        let _guard = lock.lock().expect("acquire named lock");
4776
4777        let real_ctx = get_context();
4778        let temp_dir = TempDir::new().unwrap();
4779        let dir_path = temp_dir.path().join("dir");
4780        create_test_data(&temp_dir, &["dir"], &["src.txt", "dir/dst.txt"]);
4781
4782        let cfg = Config {
4783            ..Default::default()
4784        };
4785        let fs = PassthroughFs::new("tag", cfg).unwrap();
4786        fs.init(FsOptions::empty()).unwrap();
4787
4788        // Use a fake UID in the context that would normally fail set_creds()
4789        let mut fake_ctx = real_ctx;
4790        fake_ctx.uid = 9999;
4791        fake_ctx.gid = 9999;
4792
4793        // Create mapping: mapping the fake guest UID to the REAL host UID.
4794        // If the mapping works, copy_file_range will use real_ctx.uid and succeed.
4795        // If the mapping is ignored, it will use fake_ctx.uid (9999) and set_creds will fail with
4796        // EPERM.
4797        let permission_data = PermissionData {
4798            guest_uid: fake_ctx.uid,
4799            guest_gid: fake_ctx.gid,
4800            host_uid: real_ctx.uid,
4801            host_gid: real_ctx.gid,
4802            umask: 0,
4803            perm_path: dir_path.to_string_lossy().into_owned(),
4804        };
4805        fs.permission_paths.write().unwrap().push(permission_data);
4806
4807        let src_path = temp_dir.path().join("src.txt");
4808        let dst_path = dir_path.join("dst.txt");
4809
4810        std::fs::write(&src_path, b"hello world").unwrap();
4811
4812        let src_inode = lookup(&fs, &src_path).unwrap();
4813        let dst_inode = lookup(&fs, &dst_path).unwrap();
4814
4815        // Open files to get handles.
4816        // Note: we use real_ctx here to ensure file handles are opened successfully.
4817        // The copy_file_range call itself will use fake_ctx.
4818        let (src_handle, _) = fs
4819            .open(real_ctx, src_inode, libc::O_RDONLY as u32)
4820            .expect("open src");
4821        let (dst_handle, _) = fs
4822            .open(real_ctx, dst_inode, libc::O_WRONLY as u32)
4823            .expect("open dst");
4824
4825        let src_handle = src_handle.unwrap();
4826        let dst_handle = dst_handle.unwrap();
4827
4828        // Execute copy_file_range with fake_ctx.
4829        // This will only succeed if change_creds_for_path correctly translates 9999 -> real_uid.
4830        let result = fs.copy_file_range(
4831            fake_ctx, src_inode, src_handle, 0, dst_inode, dst_handle, 0, 5, 0,
4832        );
4833
4834        assert!(
4835            result.is_ok(),
4836            "copy_file_range failed: {:?}. Mapping might not be applied.",
4837            result.err()
4838        );
4839        assert_eq!(result.unwrap(), 5);
4840
4841        let content = std::fs::read(&dst_path).unwrap();
4842        assert_eq!(&content[0..5], b"hello");
4843    }
4844
4845    #[test]
4846    #[cfg(feature = "arc_quota")]
4847    fn set_path_xattr_ioctl_valid_data() {
4848        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4849        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4850        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4851        let _guard = lock.lock().expect("acquire named lock");
4852
4853        let cfg: Config = Config {
4854            max_dynamic_xattr: 1,
4855            ..Default::default()
4856        };
4857        let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
4858
4859        let path_string = String::from("/test");
4860        let xattr_name_string = String::from("test_name");
4861        let xattr_value_string = String::from("test_value");
4862        let fs_path_xattr_data_buffer = FsPathXattrDataBuffer {
4863            path: {
4864                let mut path: [u8; FS_IOCTL_PATH_MAX_LEN] = [0; FS_IOCTL_PATH_MAX_LEN];
4865                path[..path_string.len()].copy_from_slice(path_string.as_bytes());
4866                path
4867            },
4868            xattr_name: {
4869                let mut xattr_name: [u8; FS_IOCTL_XATTR_NAME_MAX_LEN] =
4870                    [0; FS_IOCTL_XATTR_NAME_MAX_LEN];
4871                xattr_name[..xattr_name_string.len()].copy_from_slice(xattr_name_string.as_bytes());
4872                xattr_name
4873            },
4874            xattr_value: {
4875                let mut xattr_value: [u8; FS_IOCTL_XATTR_VALUE_MAX_LEN] =
4876                    [0; FS_IOCTL_XATTR_VALUE_MAX_LEN];
4877                xattr_value[..xattr_value_string.len()]
4878                    .copy_from_slice(xattr_value_string.as_bytes());
4879                xattr_value
4880            },
4881        };
4882        let r = std::io::Cursor::new(fs_path_xattr_data_buffer.as_bytes());
4883
4884        let res = fs_ioc_setpathxattr(
4885            &p,
4886            mem::size_of_val(&fs_path_xattr_data_buffer) as u32,
4887            r.clone(),
4888        )
4889        .expect("valid input should get IoctlReply");
4890        assert!(matches!(res, IoctlReply::Done(Ok(data)) if data.is_empty()));
4891
4892        let read_guard = p.xattr_paths.read().expect("read xattr_paths failed");
4893        let xattr_data = read_guard.first().expect("xattr_paths should not be empty");
4894
4895        // Check expected data item is added to permission_paths.
4896        let expected_data = XattrData {
4897            xattr_path: path_string,
4898            xattr_name: xattr_name_string,
4899            xattr_value: xattr_value_string,
4900        };
4901        assert_eq!(*xattr_data, expected_data);
4902
4903        // Second ioctl should not succeed since max_dynamic_perm is set to 1
4904        let res = fs_ioc_setpathxattr(
4905            &p,
4906            mem::size_of_val(&fs_path_xattr_data_buffer) as u32,
4907            r.clone(),
4908        )
4909        .expect("valid input should get IoctlReply");
4910        assert!(
4911            matches!(res, IoctlReply::Done(Err(err)) if err.raw_os_error().is_some_and(|errno| {
4912                errno == libc::EPERM
4913            }))
4914        );
4915    }
4916    #[test]
4917    #[cfg(feature = "arc_quota")]
4918    fn set_path_xattr_ioctl_invalid_data() {
4919        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4920        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4921        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4922        let _guard = lock.lock().expect("acquire named lock");
4923
4924        let cfg: Config = Config {
4925            max_dynamic_xattr: 1,
4926            ..Default::default()
4927        };
4928        let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
4929
4930        let path_string = String::from("test");
4931        let xattr_name_string = String::from("test_name");
4932        let xattr_value_string = String::from("test_value");
4933        let fs_path_xattr_data_buffer = FsPathXattrDataBuffer {
4934            path: {
4935                let mut path: [u8; FS_IOCTL_PATH_MAX_LEN] = [0; FS_IOCTL_PATH_MAX_LEN];
4936                path[..path_string.len()].copy_from_slice(path_string.as_bytes());
4937                path
4938            },
4939            xattr_name: {
4940                let mut xattr_name: [u8; FS_IOCTL_XATTR_NAME_MAX_LEN] =
4941                    [0; FS_IOCTL_XATTR_NAME_MAX_LEN];
4942                xattr_name[..xattr_name_string.len()].copy_from_slice(xattr_name_string.as_bytes());
4943                xattr_name
4944            },
4945            xattr_value: {
4946                let mut xattr_value: [u8; FS_IOCTL_XATTR_VALUE_MAX_LEN] =
4947                    [0; FS_IOCTL_XATTR_VALUE_MAX_LEN];
4948                xattr_value[..xattr_value_string.len()]
4949                    .copy_from_slice(xattr_value_string.as_bytes());
4950                xattr_value
4951            },
4952        };
4953        let r = std::io::Cursor::new(fs_path_xattr_data_buffer.as_bytes());
4954
4955        // This call is supposed to get EINVAL ioctlReply, since the perm_path is invalid.
4956        let res = fs_ioc_setpathxattr(
4957            &p,
4958            mem::size_of_val(&fs_path_xattr_data_buffer) as u32,
4959            r.clone(),
4960        )
4961        .expect("valid input should get IoctlReply");
4962        assert!(
4963            matches!(res, IoctlReply::Done(Err(err)) if err.raw_os_error().is_some_and(|errno| {
4964                errno == libc::EINVAL
4965            }))
4966        );
4967
4968        let fake_data_buffer: [u8; 128] = [0; 128];
4969        let r = std::io::Cursor::new(fake_data_buffer.as_bytes());
4970        // This call is supposed to get EINVAL ioctlReply, since the in_size is not the size of
4971        // struct FsPathXattrDataBuffer.
4972        let res = fs_ioc_setpathxattr(&p, mem::size_of_val(&fake_data_buffer) as u32, r.clone())
4973            .expect_err("valid input should get IoctlReply");
4974        assert!(res
4975            .raw_os_error()
4976            .is_some_and(|errno| { errno == libc::EINVAL }));
4977    }
4978
4979    #[test]
4980    #[cfg(feature = "arc_quota")]
4981    fn xattr_data_path_matching() {
4982        let ctx = get_context();
4983        let temp_dir = TempDir::new().unwrap();
4984        // Prepare `a.txt` before starting the test.
4985        create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/a.txt"]);
4986
4987        let cfg = Config {
4988            max_dynamic_xattr: 1,
4989            ..Default::default()
4990        };
4991        let fs = PassthroughFs::new("tag", cfg).unwrap();
4992
4993        let capable = FsOptions::empty();
4994        fs.init(capable).unwrap();
4995
4996        let dir_path = temp_dir.path().join("dir");
4997        let xattr_name_string = String::from("test_name");
4998        let xattr_name_cstring = CString::new(xattr_name_string.clone()).expect("create c string");
4999        let xattr_value_string = String::from("test_value");
5000        let xattr_value_bytes = xattr_value_string.clone().into_bytes();
5001
5002        let xattr_data = XattrData {
5003            xattr_name: xattr_name_string,
5004            xattr_value: xattr_value_string,
5005            xattr_path: dir_path.to_string_lossy().into_owned(),
5006        };
5007        fs.xattr_paths
5008            .write()
5009            .expect("xattr_paths lock must be acquired")
5010            .push(xattr_data);
5011
5012        // a_path is the path with out set xattr by path
5013        let a_path: std::path::PathBuf = temp_dir.path().join("a.txt");
5014        let in_dir_a_path = dir_path.join("a.txt");
5015
5016        let a_node = lookup(&fs, a_path.as_path()).expect("lookup a node");
5017        // a.txt should not be set with xattr by path
5018        assert!(fs
5019            .getxattr(
5020                ctx,
5021                a_node,
5022                &xattr_name_cstring,
5023                xattr_value_bytes.len() as u32
5024            )
5025            .is_err());
5026
5027        let in_dir_a_node = lookup(&fs, in_dir_a_path.as_path()).expect("lookup in dir a node");
5028        // a.txt in dir should be set xattr by path
5029        let in_dir_a_reply = fs
5030            .getxattr(
5031                ctx,
5032                in_dir_a_node,
5033                &xattr_name_cstring,
5034                xattr_value_bytes.len() as u32,
5035            )
5036            .expect("Getxattr should success");
5037        assert!(matches!(in_dir_a_reply, GetxattrReply::Value(v) if v == xattr_value_bytes));
5038        // Create dir/b.txt.
5039        let in_dir_b_path = dir_path.join("b.txt");
5040        create(&fs, &in_dir_b_path).expect("create b.txt");
5041
5042        // newly created b.txt in dir should be set xattr by path
5043        let in_dir_b_node = lookup(&fs, in_dir_a_path.as_path()).expect("lookup in dir b node");
5044        let in_dir_b_reply = fs
5045            .getxattr(
5046                ctx,
5047                in_dir_b_node,
5048                &xattr_name_cstring,
5049                xattr_value_bytes.len() as u32,
5050            )
5051            .expect("Getxattr should success");
5052        assert!(matches!(in_dir_b_reply, GetxattrReply::Value(v) if v == xattr_value_bytes));
5053    }
5054
5055    /// Creates and open a new file by atomic_open with O_APPEND flag.
5056    /// We check O_APPEND is properly handled, depending on writeback cache is enabled or not.
5057    fn atomic_open_create_o_append(writeback: bool) {
5058        // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
5059        // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
5060        let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
5061        let _guard = lock.lock().expect("acquire named lock");
5062
5063        let temp_dir = TempDir::new().unwrap();
5064
5065        let cfg = Config {
5066            cache_policy: CachePolicy::Always,
5067            writeback,
5068            ..Default::default()
5069        };
5070        let fs = PassthroughFs::new("tag", cfg).unwrap();
5071
5072        let capable = FsOptions::ZERO_MESSAGE_OPEN | FsOptions::WRITEBACK_CACHE;
5073        fs.init(capable).unwrap();
5074
5075        let (entry, _, _) = atomic_open(
5076            &fs,
5077            &temp_dir.path().join("a.txt"),
5078            0o666,
5079            (libc::O_RDWR | libc::O_CREAT | libc::O_APPEND) as u32,
5080            0,
5081            None,
5082        )
5083        .expect("atomic_open");
5084        assert_ne!(entry.inode, 0);
5085
5086        let inodes = fs.inodes.lock();
5087        let data = inodes.get(&entry.inode).unwrap();
5088        let flags = data.file.lock().open_flags;
5089        if writeback {
5090            // When writeback is enabled, O_APPEND must be handled by the guest kernel.
5091            // So, it must be cleared.
5092            assert_eq!(flags & libc::O_APPEND, 0);
5093        } else {
5094            // Without writeback cache, O_APPEND must not be cleared.
5095            assert_eq!(flags & libc::O_APPEND, libc::O_APPEND);
5096        }
5097    }
5098
5099    #[test]
5100    fn test_atomic_open_create_o_append_no_writeback() {
5101        atomic_open_create_o_append(false);
5102    }
5103
5104    #[test]
5105    fn test_atomic_open_create_o_append_writeback() {
5106        atomic_open_create_o_append(true);
5107    }
5108}