base/sys/linux/
mod.rs

1// Copyright 2017 The ChromiumOS Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5//! Small system utility modules for usage by other modules.
6
7#[cfg(target_os = "android")]
8mod android;
9#[cfg(target_os = "android")]
10use android as target_os;
11#[cfg(target_os = "linux")]
12#[allow(clippy::module_inception)]
13mod linux;
14#[cfg(target_os = "linux")]
15use linux as target_os;
16use log::warn;
17#[macro_use]
18pub mod ioctl;
19#[macro_use]
20pub mod syslog;
21mod capabilities;
22mod descriptor;
23mod event;
24mod file;
25mod file_traits;
26mod mmap;
27mod net;
28mod notifiers;
29pub mod platform_timer_resolution;
30mod poll;
31mod priority;
32mod sched;
33mod shm;
34pub mod signal;
35mod signalfd;
36mod terminal;
37mod timer;
38pub mod vsock;
39mod write_zeroes;
40
41use std::ffi::CString;
42use std::fs::remove_file;
43use std::fs::File;
44use std::fs::OpenOptions;
45use std::mem;
46use std::mem::MaybeUninit;
47use std::ops::Deref;
48use std::os::unix::io::FromRawFd;
49use std::os::unix::io::RawFd;
50use std::os::unix::net::UnixDatagram;
51use std::os::unix::net::UnixListener;
52use std::os::unix::process::ExitStatusExt;
53use std::path::Path;
54use std::path::PathBuf;
55use std::process::ExitStatus;
56use std::ptr;
57use std::sync::OnceLock;
58use std::time::Duration;
59
60pub use capabilities::drop_capabilities;
61pub use event::EventExt;
62pub(crate) use event::PlatformEvent;
63pub use file::find_next_data;
64pub use file::FileDataIterator;
65pub(crate) use file_traits::lib::*;
66pub use ioctl::*;
67use libc::c_int;
68use libc::c_long;
69use libc::fcntl;
70use libc::pipe2;
71use libc::prctl;
72use libc::syscall;
73use libc::waitpid;
74use libc::SYS_getpid;
75use libc::SYS_getppid;
76use libc::SYS_gettid;
77use libc::EINVAL;
78use libc::O_CLOEXEC;
79use libc::PR_SET_NAME;
80use libc::SIGKILL;
81use libc::WNOHANG;
82pub use mmap::*;
83pub(in crate::sys) use net::sendmsg_nosignal as sendmsg;
84pub(in crate::sys) use net::sockaddr_un;
85pub(in crate::sys) use net::sockaddrv4_to_lib_c;
86pub(in crate::sys) use net::sockaddrv6_to_lib_c;
87pub use poll::EventContext;
88pub use priority::*;
89pub use sched::*;
90pub use shm::MemfdSeals;
91pub use shm::SharedMemoryLinux;
92pub use signal::*;
93pub use signalfd::Error as SignalFdError;
94pub use signalfd::*;
95pub use terminal::*;
96pub(crate) use write_zeroes::file_punch_hole;
97pub(crate) use write_zeroes::file_write_zeroes_at;
98
99use crate::descriptor::FromRawDescriptor;
100use crate::descriptor::SafeDescriptor;
101pub use crate::errno::Error;
102pub use crate::errno::Result;
103pub use crate::errno::*;
104use crate::number_of_logical_cores;
105use crate::round_up_to_page_size;
106pub use crate::sys::unix::descriptor::*;
107use crate::syscall;
108use crate::AsRawDescriptor;
109use crate::Pid;
110
/// User ID type; a libc type re-exported because it is part of this module's API.
pub type Uid = libc::uid_t;
/// Group ID type; a libc type re-exported because it is part of this module's API.
pub type Gid = libc::gid_t;
/// File mode type; a libc type re-exported because it is part of this module's API.
pub type Mode = libc::mode_t;

// Directory that holds cpu sysinfo files. Per-CPU properties are read from `cpu<N>/...` below it.
const CPU_DIR: &str = "/sys/devices/system/cpu";
118
119/// Safe wrapper for PR_SET_NAME(2const)
120#[inline(always)]
121pub fn set_thread_name(name: &str) -> Result<()> {
122    let name = CString::new(name).or(Err(Error::new(EINVAL)))?;
123    // SAFETY: prctl copies name and doesn't expect it to outlive this function.
124    let ret = unsafe { prctl(PR_SET_NAME, name.as_c_str()) };
125    if ret == 0 {
126        Ok(())
127    } else {
128        errno_result()
129    }
130}
131
132/// This bypasses `libc`'s caching `getpid(2)` wrapper which can be invalid if a raw clone was used
133/// elsewhere.
134#[inline(always)]
135pub fn getpid() -> Pid {
136    // SAFETY:
137    // Safe because this syscall can never fail and we give it a valid syscall number.
138    unsafe { syscall(SYS_getpid as c_long) as Pid }
139}
140
141/// Safe wrapper for the geppid Linux systemcall.
142#[inline(always)]
143pub fn getppid() -> Pid {
144    // SAFETY:
145    // Safe because this syscall can never fail and we give it a valid syscall number.
146    unsafe { syscall(SYS_getppid as c_long) as Pid }
147}
148
149/// Safe wrapper for the gettid Linux systemcall.
150pub fn gettid() -> Pid {
151    // SAFETY:
152    // Calling the gettid() sycall is always safe.
153    unsafe { syscall(SYS_gettid as c_long) as Pid }
154}
155
/// Safe wrapper for `geteuid(2)`.
///
/// Returns whatever `libc::geteuid()` reports for the calling process.
#[inline(always)]
pub fn geteuid() -> Uid {
    // SAFETY:
    // trivially safe: the call takes no arguments and no pointers are passed.
    unsafe { libc::geteuid() }
}
163
/// Safe wrapper for `getegid(2)`.
///
/// Returns whatever `libc::getegid()` reports for the calling process.
#[inline(always)]
pub fn getegid() -> Gid {
    // SAFETY:
    // trivially safe: the call takes no arguments and no pointers are passed.
    unsafe { libc::getegid() }
}
171
172/// The operation to perform with `flock`.
173pub enum FlockOperation {
174    LockShared,
175    LockExclusive,
176    Unlock,
177}
178
179/// Safe wrapper for flock(2) with the operation `op` and optionally `nonblocking`. The lock will be
180/// dropped automatically when `file` is dropped.
181#[inline(always)]
182pub fn flock<F: AsRawDescriptor>(file: &F, op: FlockOperation, nonblocking: bool) -> Result<()> {
183    let mut operation = match op {
184        FlockOperation::LockShared => libc::LOCK_SH,
185        FlockOperation::LockExclusive => libc::LOCK_EX,
186        FlockOperation::Unlock => libc::LOCK_UN,
187    };
188
189    if nonblocking {
190        operation |= libc::LOCK_NB;
191    }
192
193    // SAFETY:
194    // Safe since we pass in a valid fd and flock operation, and check the return value.
195    syscall!(unsafe { libc::flock(file.as_raw_descriptor(), operation) }).map(|_| ())
196}
197
198/// The operation to perform with `fallocate`.
199pub enum FallocateMode {
200    PunchHole,
201    ZeroRange,
202    Allocate,
203}
204
205impl From<FallocateMode> for i32 {
206    fn from(value: FallocateMode) -> Self {
207        match value {
208            FallocateMode::Allocate => libc::FALLOC_FL_KEEP_SIZE,
209            FallocateMode::PunchHole => libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE,
210            FallocateMode::ZeroRange => libc::FALLOC_FL_ZERO_RANGE | libc::FALLOC_FL_KEEP_SIZE,
211        }
212    }
213}
214
215impl From<FallocateMode> for u32 {
216    fn from(value: FallocateMode) -> Self {
217        Into::<i32>::into(value) as u32
218    }
219}
220
221/// Safe wrapper for `fallocate()`.
222pub fn fallocate<F: AsRawDescriptor>(
223    file: &F,
224    mode: FallocateMode,
225    offset: u64,
226    len: u64,
227) -> Result<()> {
228    let offset = if offset > libc::off64_t::MAX as u64 {
229        return Err(Error::new(libc::EINVAL));
230    } else {
231        offset as libc::off64_t
232    };
233
234    let len = if len > libc::off64_t::MAX as u64 {
235        return Err(Error::new(libc::EINVAL));
236    } else {
237        len as libc::off64_t
238    };
239
240    // SAFETY:
241    // Safe since we pass in a valid fd and fallocate mode, validate offset and len,
242    // and check the return value.
243    syscall!(unsafe { libc::fallocate64(file.as_raw_descriptor(), mode.into(), offset, len) })
244        .map(|_| ())
245}
246
247/// Arguments for how `openat2(2)` should open the target path.
248#[repr(C)]
249#[derive(Default, Debug, Copy, Clone)]
250pub struct open_how {
251    /// Flags for the open operation (e.g. `O_RDONLY`).
252    pub flags: u64,
253    /// Mode for the created file (if `O_CREAT` is set).
254    pub mode: u64,
255    /// Resolution flags (e.g. `RESOLVE_IN_ROOT`).
256    pub resolve: u64,
257}
258
259/// Safe wrapper for `openat2(2)`.
260pub fn openat2<D: AsRawDescriptor>(dir: &D, name: &std::ffi::CStr, how: &open_how) -> Result<File> {
261    // SAFETY:
262    // Safe because the syscall is provided with valid arguments and its return value is checked.
263    syscall!(unsafe {
264        libc::syscall(
265            libc::SYS_openat2,
266            dir.as_raw_descriptor(),
267            name.as_ptr(),
268            how as *const open_how,
269            std::mem::size_of::<open_how>() as libc::size_t,
270        )
271    })
272    .map(|fd| unsafe { File::from_raw_descriptor(fd as RawFd) })
273}
274
275/// Safe wrapper for `fstat()`.
276pub fn fstat<F: AsRawDescriptor>(f: &F) -> Result<libc::stat64> {
277    let mut st = MaybeUninit::<libc::stat64>::zeroed();
278
279    // SAFETY:
280    // Safe because the kernel will only write data in `st` and we check the return
281    // value.
282    syscall!(unsafe { libc::fstat64(f.as_raw_descriptor(), st.as_mut_ptr()) })?;
283
284    // SAFETY:
285    // Safe because the kernel guarantees that the struct is now fully initialized.
286    Ok(unsafe { st.assume_init() })
287}
288
289/// Checks whether a file is a block device fie or not.
290pub fn is_block_file<F: AsRawDescriptor>(file: &F) -> Result<bool> {
291    let stat = fstat(file)?;
292    Ok((stat.st_mode & libc::S_IFMT) == libc::S_IFBLK)
293}
294
295const BLOCK_IO_TYPE: u32 = 0x12;
296ioctl_io_nr!(BLKDISCARD, BLOCK_IO_TYPE, 119);
297
298/// Discards the given range of a block file.
299pub fn discard_block<F: AsRawDescriptor>(file: &F, offset: u64, len: u64) -> Result<()> {
300    let range: [u64; 2] = [offset, len];
301    // SAFETY:
302    // Safe because
303    // - we check the return value.
304    // - ioctl(BLKDISCARD) does not hold the descriptor after the call.
305    // - ioctl(BLKDISCARD) does not break the file descriptor.
306    // - ioctl(BLKDISCARD) does not modify the given range.
307    syscall!(unsafe { libc::ioctl(file.as_raw_descriptor(), BLKDISCARD, &range) }).map(|_| ())
308}
309
/// A trait used to abstract types that provide a process id that can be operated on.
pub trait AsRawPid {
    /// Returns the process id represented by `self`.
    fn as_raw_pid(&self) -> Pid;
}

/// A `Pid` is its own raw pid.
impl AsRawPid for Pid {
    fn as_raw_pid(&self) -> Pid {
        *self
    }
}

/// A spawned child yields the id reported by `Child::id()`, cast to `Pid`.
impl AsRawPid for std::process::Child {
    fn as_raw_pid(&self) -> Pid {
        self.id() as Pid
    }
}
326
327/// A safe wrapper around waitpid.
328///
329/// On success if a process was reaped, it will be returned as the first value.
330/// The second returned value is the ExitStatus from the libc::waitpid() call.
331///
332/// Note: this can block if libc::WNOHANG is not set and EINTR is not handled internally.
333pub fn wait_for_pid<A: AsRawPid>(pid: A, options: c_int) -> Result<(Option<Pid>, ExitStatus)> {
334    let pid = pid.as_raw_pid();
335    let mut status: c_int = 1;
336    // SAFETY:
337    // Safe because status is owned and the error is checked.
338    let ret = unsafe { libc::waitpid(pid, &mut status, options) };
339    if ret < 0 {
340        return errno_result();
341    }
342    Ok((
343        if ret == 0 { None } else { Some(ret) },
344        ExitStatus::from_raw(status),
345    ))
346}
347
348/// Reaps a child process that has terminated.
349///
350/// Returns `Ok(pid)` where `pid` is the process that was reaped or `Ok(0)` if none of the children
351/// have terminated. An `Error` is with `errno == ECHILD` if there are no children left to reap.
352///
353/// # Examples
354///
355/// Reaps all child processes until there are no terminated children to reap.
356///
357/// ```
358/// fn reap_children() {
359///     loop {
360///         match base::linux::reap_child() {
361///             Ok(0) => println!("no children ready to reap"),
362///             Ok(pid) => {
363///                 println!("reaped {}", pid);
364///                 continue
365///             },
366///             Err(e) if e.errno() == libc::ECHILD => println!("no children left"),
367///             Err(e) => println!("error reaping children: {}", e),
368///         }
369///         break
370///     }
371/// }
372/// ```
373pub fn reap_child() -> Result<Pid> {
374    // SAFETY:
375    // Safe because we pass in no memory, prevent blocking with WNOHANG, and check for error.
376    let ret = unsafe { waitpid(-1, ptr::null_mut(), WNOHANG) };
377    if ret == -1 {
378        errno_result()
379    } else {
380        Ok(ret)
381    }
382}
383
384/// Kill all processes in the current process group.
385///
386/// On success, this kills all processes in the current process group, including the current
387/// process, meaning this will not return. This is equivalent to a call to `kill(0, SIGKILL)`.
388pub fn kill_process_group() -> Result<()> {
389    // SAFETY: Safe because pid is 'self group' and return value doesn't matter.
390    unsafe { kill(0, SIGKILL) }?;
391    // Kill succeeded, so this process never reaches here.
392    unreachable!();
393}
394
395/// Spawns a pipe pair where the first pipe is the read end and the second pipe is the write end.
396///
397/// The `O_CLOEXEC` flag will be set during pipe creation.
398pub fn pipe() -> Result<(File, File)> {
399    let mut pipe_fds = [-1; 2];
400    // SAFETY:
401    // Safe because pipe2 will only write 2 element array of i32 to the given pointer, and we check
402    // for error.
403    let ret = unsafe { pipe2(&mut pipe_fds[0], O_CLOEXEC) };
404    if ret == -1 {
405        errno_result()
406    } else {
407        // SAFETY:
408        // Safe because both fds must be valid for pipe2 to have returned sucessfully and we have
409        // exclusive ownership of them.
410        Ok(unsafe {
411            (
412                File::from_raw_fd(pipe_fds[0]),
413                File::from_raw_fd(pipe_fds[1]),
414            )
415        })
416    }
417}
418
419/// Sets the pipe signified with fd to `size`.
420///
421/// Returns the new size of the pipe or an error if the OS fails to set the pipe size.
422pub fn set_pipe_size(fd: RawFd, size: usize) -> Result<usize> {
423    // SAFETY:
424    // Safe because fcntl with the `F_SETPIPE_SZ` arg doesn't touch memory.
425    syscall!(unsafe { fcntl(fd, libc::F_SETPIPE_SZ, size as c_int) }).map(|ret| ret as usize)
426}
427
428/// Test-only function used to create a pipe that is full. The pipe is created, has its size set to
429/// the minimum and then has that much data written to it. Use `new_pipe_full` to test handling of
430/// blocking `write` calls in unit tests.
431pub fn new_pipe_full() -> Result<(File, File)> {
432    use std::io::Write;
433
434    let (rx, mut tx) = pipe()?;
435    // The smallest allowed size of a pipe is the system page size on linux.
436    let page_size = set_pipe_size(tx.as_raw_descriptor(), round_up_to_page_size(1))?;
437
438    // Fill the pipe with page_size zeros so the next write call will block.
439    let buf = vec![0u8; page_size];
440    tx.write_all(&buf)?;
441
442    Ok((rx, tx))
443}
444
445/// Used to attempt to clean up a named pipe after it is no longer used.
446pub struct UnlinkUnixDatagram(pub UnixDatagram);
447impl AsRef<UnixDatagram> for UnlinkUnixDatagram {
448    fn as_ref(&self) -> &UnixDatagram {
449        &self.0
450    }
451}
452impl Drop for UnlinkUnixDatagram {
453    fn drop(&mut self) {
454        if let Ok(addr) = self.0.local_addr() {
455            if let Some(path) = addr.as_pathname() {
456                if let Err(e) = remove_file(path) {
457                    warn!("failed to remove control socket file: {}", e);
458                }
459            }
460        }
461    }
462}
463
464/// Used to attempt to clean up a named pipe after it is no longer used.
465pub struct UnlinkUnixListener(pub UnixListener);
466
467impl AsRef<UnixListener> for UnlinkUnixListener {
468    fn as_ref(&self) -> &UnixListener {
469        &self.0
470    }
471}
472
473impl Deref for UnlinkUnixListener {
474    type Target = UnixListener;
475
476    fn deref(&self) -> &UnixListener {
477        &self.0
478    }
479}
480
481impl Drop for UnlinkUnixListener {
482    fn drop(&mut self) {
483        if let Ok(addr) = self.0.local_addr() {
484            if let Some(path) = addr.as_pathname() {
485                if let Err(e) = remove_file(path) {
486                    warn!("failed to remove control socket file: {}", e);
487                }
488            }
489        }
490    }
491}
492
493/// Verifies that |raw_descriptor| is actually owned by this process and duplicates it
494/// to ensure that we have a unique handle to it.
495pub fn validate_raw_descriptor(raw_descriptor: RawDescriptor) -> Result<RawDescriptor> {
496    validate_raw_fd(&raw_descriptor)
497}
498
499/// Verifies that |raw_fd| is actually owned by this process and duplicates it to ensure that
500/// we have a unique handle to it.
501pub fn validate_raw_fd(raw_fd: &RawFd) -> Result<RawFd> {
502    // Checking that close-on-exec isn't set helps filter out FDs that were opened by
503    // crosvm as all crosvm FDs are close on exec.
504    // SAFETY:
505    // Safe because this doesn't modify any memory and we check the return value.
506    let flags = unsafe { libc::fcntl(*raw_fd, libc::F_GETFD) };
507    if flags < 0 || (flags & libc::FD_CLOEXEC) != 0 {
508        return Err(Error::new(libc::EBADF));
509    }
510
511    // SAFETY:
512    // Duplicate the fd to ensure that we don't accidentally close an fd previously
513    // opened by another subsystem.  Safe because this doesn't modify any memory and
514    // we check the return value.
515    let dup_fd = unsafe { libc::fcntl(*raw_fd, libc::F_DUPFD_CLOEXEC, 0) };
516    if dup_fd < 0 {
517        return Err(Error::last());
518    }
519    Ok(dup_fd as RawFd)
520}
521
522/// Utility function that returns true if the given FD is readable without blocking.
523///
524/// On an error, such as an invalid or incompatible FD, this will return false, which can not be
525/// distinguished from a non-ready to read FD.
526pub fn poll_in<F: AsRawDescriptor>(fd: &F) -> bool {
527    let mut fds = libc::pollfd {
528        fd: fd.as_raw_descriptor(),
529        events: libc::POLLIN,
530        revents: 0,
531    };
532    // SAFETY:
533    // Safe because we give a valid pointer to a list (of 1) FD and check the return value.
534    let ret = unsafe { libc::poll(&mut fds, 1, 0) };
535    // An error probably indicates an invalid FD, or an FD that can't be polled. Returning false in
536    // that case is probably correct as such an FD is unlikely to be readable, although there are
537    // probably corner cases in which that is wrong.
538    if ret == -1 {
539        return false;
540    }
541    fds.revents & libc::POLLIN != 0
542}
543
544/// Return the maximum Duration that can be used with libc::timespec.
545pub fn max_timeout() -> Duration {
546    Duration::new(libc::time_t::MAX as u64, 999999999)
547}
548
549/// If the given path is of the form /proc/self/fd/N for some N, returns `Ok(Some(N))`. Otherwise
550/// returns `Ok(None)`.
551pub fn safe_descriptor_from_path<P: AsRef<Path>>(path: P) -> Result<Option<SafeDescriptor>> {
552    let path = path.as_ref();
553    if path.parent() == Some(Path::new("/proc/self/fd")) {
554        let raw_descriptor = path
555            .file_name()
556            .and_then(|fd_osstr| fd_osstr.to_str())
557            .and_then(|fd_str| fd_str.parse::<RawFd>().ok())
558            .ok_or_else(|| Error::new(EINVAL))?;
559        let validated_fd = validate_raw_fd(&raw_descriptor)?;
560        Ok(Some(
561            // SAFETY:
562            // Safe because nothing else has access to validated_fd after this call.
563            unsafe { SafeDescriptor::from_raw_descriptor(validated_fd) },
564        ))
565    } else {
566        Ok(None)
567    }
568}
569
570/// Check FD is not opened by crosvm and returns a FD that is freshly DUPFD_CLOEXEC's.
571/// A SafeDescriptor is created from the duplicated fd. It does not take ownership of
572/// fd passed by argument.
573pub fn safe_descriptor_from_cmdline_fd(fd: &RawFd) -> Result<SafeDescriptor> {
574    let validated_fd = validate_raw_fd(fd)?;
575    Ok(
576        // SAFETY:
577        // Safe because nothing else has access to validated_fd after this call.
578        unsafe { SafeDescriptor::from_raw_descriptor(validated_fd) },
579    )
580}
581
582/// Open the file with the given path, or if it is of the form `/proc/self/fd/N` then just use the
583/// file descriptor.
584///
585/// Note that this will not work properly if the same `/proc/self/fd/N` path is used twice in
586/// different places, as the metadata (including the offset) will be shared between both file
587/// descriptors.
588pub fn open_file_or_duplicate<P: AsRef<Path>>(path: P, options: &OpenOptions) -> Result<File> {
589    let path = path.as_ref();
590    // Special case '/proc/self/fd/*' paths. The FD is already open, just use it.
591    Ok(if let Some(fd) = safe_descriptor_from_path(path)? {
592        fd.into()
593    } else {
594        options.open(path)?
595    })
596}
597
598/// Get the soft and hard limits of max number of open files allowed by the environment.
599pub fn max_open_files() -> Result<libc::rlimit64> {
600    let mut buf = mem::MaybeUninit::<libc::rlimit64>::zeroed();
601
602    // SAFETY:
603    // Safe because this will only modify `buf` and we check the return value.
604    let res = unsafe { libc::prlimit64(0, libc::RLIMIT_NOFILE, ptr::null(), buf.as_mut_ptr()) };
605    if res == 0 {
606        // SAFETY:
607        // Safe because the kernel guarantees that the struct is fully initialized.
608        let limit = unsafe { buf.assume_init() };
609        Ok(limit)
610    } else {
611        errno_result()
612    }
613}
614
615/// Executes the given callback with extended soft limit of max number of open files. After the
616/// callback executed, restore the limit.
617pub fn call_with_extended_max_files<T, E>(
618    callback: impl FnOnce() -> std::result::Result<T, E>,
619) -> Result<std::result::Result<T, E>> {
620    let cur_limit = max_open_files()?;
621    let new_limit = libc::rlimit64 {
622        rlim_cur: cur_limit.rlim_max,
623        ..cur_limit
624    };
625    let needs_extension = cur_limit.rlim_cur < new_limit.rlim_cur;
626    if needs_extension {
627        set_max_open_files(new_limit)?;
628    }
629
630    let r = callback();
631
632    // Restore the soft limit.
633    if needs_extension {
634        set_max_open_files(cur_limit)?;
635    }
636
637    Ok(r)
638}
639
640/// Set the soft and hard limits of max number of open files to the given value.
641fn set_max_open_files(limit: libc::rlimit64) -> Result<()> {
642    // SAFETY: RLIMIT_NOFILE is known only to read a buffer of size rlimit64, and we have always
643    // rlimit64 allocated.
644    let res = unsafe { libc::setrlimit64(libc::RLIMIT_NOFILE, &limit) };
645    if res == 0 {
646        Ok(())
647    } else {
648        errno_result()
649    }
650}
651
652/// Moves the requested PID/TID to a particular cgroup
653pub fn move_to_cgroup(cgroup_path: PathBuf, id_to_write: Pid, cgroup_file: &str) -> Result<()> {
654    use std::io::Write;
655
656    let gpu_cgroup_file = cgroup_path.join(cgroup_file);
657    let mut f = File::create(gpu_cgroup_file)?;
658    f.write_all(id_to_write.to_string().as_bytes())?;
659    Ok(())
660}
661
662pub fn move_task_to_cgroup(cgroup_path: PathBuf, thread_id: Pid) -> Result<()> {
663    move_to_cgroup(cgroup_path, thread_id, "tasks")
664}
665
666pub fn move_proc_to_cgroup(cgroup_path: PathBuf, process_id: Pid) -> Result<()> {
667    move_to_cgroup(cgroup_path, process_id, "cgroup.procs")
668}
669
670fn read_sysfs_cpu_info_in_dir(cpu_dir: &str, cpu_id: usize, property: &str) -> Result<String> {
671    let path = Path::new(cpu_dir)
672        .join(format!("cpu{cpu_id}"))
673        .join(property);
674
675    std::fs::read_to_string(path).map_err(|e| e.into())
676}
677
678/// Queries the property of a specified CPU sysfs node.
679fn parse_sysfs_cpu_info_vec(cpu_id: usize, property: &str) -> Result<Vec<u32>> {
680    parse_sysfs_cpu_info_vec_in_dir(CPU_DIR, cpu_id, property)
681}
682
683fn parse_sysfs_cpu_info_vec_in_dir(
684    cpu_dir: &str,
685    cpu_id: usize,
686    property: &str,
687) -> Result<Vec<u32>> {
688    read_sysfs_cpu_info_in_dir(cpu_dir, cpu_id, property)?
689        .split_whitespace()
690        .map(|x| x.parse().map_err(|_| Error::new(libc::EINVAL)))
691        .collect()
692}
693
/// Returns a list of supported frequencies in kHz for a given logical core.
///
/// The values are parsed from the core's `cpufreq/scaling_available_frequencies` sysfs node; an
/// error is returned if that node cannot be read or parsed.
pub fn logical_core_frequencies_khz(cpu_id: usize) -> Result<Vec<u32>> {
    parse_sysfs_cpu_info_vec(cpu_id, "cpufreq/scaling_available_frequencies")
}
698
699/// Queries the property of a specified CPU sysfs node.
700fn parse_sysfs_cpu_info(cpu_id: usize, property: &str) -> Result<u32> {
701    parse_sysfs_cpu_info_in_dir(CPU_DIR, cpu_id, property)
702}
703
704fn parse_sysfs_cpu_info_in_dir(cpu_dir: &str, cpu_id: usize, property: &str) -> Result<u32> {
705    read_sysfs_cpu_info_in_dir(cpu_dir, cpu_id, property)?
706        .trim()
707        .parse()
708        .map_err(|_| Error::new(libc::EINVAL))
709}
710
711/// Returns the capacity (measure of performance) of a given logical core.
712pub fn logical_core_capacity(cpu_id: usize) -> Result<u32> {
713    static CPU_MAX_FREQS: OnceLock<Option<Vec<u32>>> = OnceLock::new();
714
715    let cpu_capacity = parse_sysfs_cpu_info(cpu_id, "cpu_capacity")?;
716
717    // Collect and cache the maximum frequencies of all cores. We need to know
718    // the largest maximum frequency between all cores to reverse normalization,
719    // so collect all the values once on the first call to this function.
720    let cpu_max_freqs = CPU_MAX_FREQS.get_or_init(|| {
721        (0..number_of_logical_cores().ok()?)
722            .map(|cpu_id| logical_core_max_freq_khz(cpu_id).ok())
723            .collect()
724    });
725
726    if let Some(cpu_max_freqs) = cpu_max_freqs {
727        let largest_max_freq = *cpu_max_freqs.iter().max().ok_or(Error::new(EINVAL))?;
728        let cpu_max_freq = *cpu_max_freqs.get(cpu_id).ok_or(Error::new(EINVAL))?;
729        let normalized_cpu_capacity = (u64::from(cpu_capacity) * u64::from(largest_max_freq))
730            .checked_div(u64::from(cpu_max_freq))
731            .ok_or(Error::new(EINVAL))?;
732        normalized_cpu_capacity
733            .try_into()
734            .map_err(|_| Error::new(EINVAL))
735    } else {
736        // cpu-freq is not enabled. Fall back to using the normalized capacity.
737        Ok(cpu_capacity)
738    }
739}
740
/// Returns the cluster ID of a given logical core.
///
/// The value is parsed from the core's `topology/physical_package_id` sysfs node; an error is
/// returned if that node cannot be read or parsed.
pub fn logical_core_cluster_id(cpu_id: usize) -> Result<u32> {
    parse_sysfs_cpu_info(cpu_id, "topology/physical_package_id")
}
745
/// Returns the maximum frequency (in kHz) of a given logical core.
///
/// The value is parsed from the core's `cpufreq/cpuinfo_max_freq` sysfs node; an error is
/// returned if that node cannot be read or parsed.
pub fn logical_core_max_freq_khz(cpu_id: usize) -> Result<u32> {
    parse_sysfs_cpu_info(cpu_id, "cpufreq/cpuinfo_max_freq")
}
750
/// Parses a string of comma separated CPU ranges, e.g. "0-2,4,6-8" into a BTreeSet of CPU IDs.
///
/// Whitespace around the whole string and around each entry is ignored. Empty entries and
/// entries that do not parse as a number or as `start-end` are skipped silently; a range whose
/// start is greater than its end contributes nothing.
fn parse_online_cpu_range(content: &str) -> std::collections::BTreeSet<usize> {
    content
        .trim()
        .split(',')
        .map(str::trim)
        .filter(|entry| !entry.is_empty())
        .filter_map(|entry| {
            // Turn each entry into an inclusive range; a single CPU is the range `n..=n`.
            let (first, last) = match entry.split_once('-') {
                Some((lo, hi)) => (lo.parse::<usize>().ok()?, hi.parse::<usize>().ok()?),
                None => {
                    let cpu = entry.parse::<usize>().ok()?;
                    (cpu, cpu)
                }
            };
            Some(first..=last)
        })
        .flatten()
        .collect()
}
771
772/// Returns a bool if the CPU is online. The online status is cached on the first call.
773pub fn is_cpu_online(cpu_id: usize) -> bool {
774    static ONLINE_CPUS: OnceLock<std::collections::BTreeSet<usize>> = OnceLock::new();
775
776    let online_cpus = ONLINE_CPUS.get_or_init(|| {
777        let mut cpus = std::collections::BTreeSet::new();
778        let path = Path::new(CPU_DIR).join("online");
779        match std::fs::read_to_string(&path) {
780            Ok(content) => {
781                cpus = parse_online_cpu_range(&content);
782            }
783            Err(_) => {
784                // If we hit an error trying to access cpuX/online files, assume the CPU is online.
785                // This prevents permission/EACCES errors on individual files from crashing crosvm.
786                if let Ok(total_cores) = crate::number_of_logical_cores() {
787                    for id in 0..total_cores {
788                        match parse_sysfs_cpu_info(id, "online") {
789                            Ok(1) => {
790                                cpus.insert(id);
791                            }
792                            Ok(_) => {}
793                            Err(e) => {
794                                // Assume online on error to avoid crashes.
795                                warn!(
796                                    "Assuming CPU {} is online because we couldn't read the sys file: {}",
797                                    id, e
798                                );
799                                cpus.insert(id);
800                            }
801                        }
802                    }
803                }
804            }
805        }
806        cpus
807    });
808
809    online_cpus.contains(&cpu_id)
810}
811
/// Attribute block handed by pointer to the `sched_setattr` syscall (see `sched_setattr` in this
/// file).
///
/// NOTE(review): the field layout presumably mirrors the kernel's `struct sched_attr` — confirm
/// against the kernel headers before adding or reordering fields.
#[repr(C)]
pub struct sched_attr {
    /// Size of this structure in bytes; `Default` fills it with `size_of::<sched_attr>()`.
    pub size: u32,

    pub sched_policy: u32,
    pub sched_flags: u64,
    pub sched_nice: i32,

    pub sched_priority: u32,

    pub sched_runtime: u64,
    pub sched_deadline: u64,
    pub sched_period: u64,

    pub sched_util_min: u32,
    pub sched_util_max: u32,
}

impl Default for sched_attr {
    /// Returns an attribute block with `size` set to the size of the struct and every other
    /// field zeroed.
    fn default() -> Self {
        Self {
            size: std::mem::size_of::<sched_attr>() as u32,
            sched_policy: 0,
            sched_flags: 0,
            sched_nice: 0,
            sched_priority: 0,
            sched_runtime: 0,
            sched_deadline: 0,
            sched_period: 0,
            sched_util_min: 0,
            sched_util_max: 0,
        }
    }
}
846
847pub fn sched_setattr(pid: Pid, attr: &mut sched_attr, flags: u32) -> Result<()> {
848    // SAFETY: Safe becuase all the args are valid and the return valud is checked.
849    let ret = unsafe {
850        libc::syscall(
851            libc::SYS_sched_setattr,
852            pid as usize,
853            attr as *mut sched_attr as usize,
854            flags as usize,
855        )
856    };
857
858    if ret < 0 {
859        return Err(Error::last());
860    }
861    Ok(())
862}
863
#[cfg(test)]
mod tests {
    use std::fs::create_dir_all;
    use std::fs::File;
    use std::io::Write;
    use std::os::fd::AsRawFd;

    use tempfile::TempDir;

    use super::*;
    use crate::unix::add_fd_flags;

    const MAX_FREQ_PROPERTY: &str = "cpufreq/cpuinfo_max_freq";
    const AVAILABLE_FREQS_PROPERTY: &str = "cpufreq/scaling_available_frequencies";

    /// Writes `content` to `path`, creating any missing parent directories first.
    fn create_temp_file(path: &Path, content: &str) {
        if let Some(parent) = path.parent() {
            create_dir_all(parent).unwrap();
        }
        File::create(path)
            .unwrap()
            .write_all(content.as_bytes())
            .unwrap();
    }

    /// Creates a temporary root and returns it together with the fake cpu sysfs directory path
    /// inside it. The directory itself is not created. The `TempDir` must be kept alive for as
    /// long as the path is used.
    fn fake_cpu_dir() -> (TempDir, PathBuf) {
        let temp_dir = TempDir::new().unwrap();
        let cpu_dir = temp_dir.path().join("sys/devices/system/cpu");
        (temp_dir, cpu_dir)
    }

    /// Parses `spec` and returns the resulting CPU ids in ascending order.
    fn online_ids(spec: &str) -> Vec<usize> {
        parse_online_cpu_range(spec).into_iter().collect()
    }

    #[test]
    fn test_parse_online_cpu_range() {
        assert_eq!(online_ids("0-3,5-7"), [0, 1, 2, 3, 5, 6, 7]);
        assert_eq!(online_ids("0"), [0]);
        assert_eq!(online_ids("0,2,4"), [0, 2, 4]);
        assert_eq!(online_ids("  0-1,  3  "), [0, 1, 3]);
        assert!(parse_online_cpu_range("").is_empty());
    }

    #[test]
    fn pipe_size_and_fill() {
        let (_rx, mut tx) = new_pipe_full().expect("Failed to pipe");

        // To check that setting the size worked, set the descriptor to non blocking and check that
        // write returns an error.
        add_fd_flags(tx.as_raw_fd(), libc::O_NONBLOCK).expect("Failed to set tx non blocking");
        tx.write(&[0u8; 8])
            .expect_err("Write after fill didn't fail");
    }

    #[test]
    fn test_parse_sysfs_cpu_info() {
        let (_temp_dir, cpu_dir) = fake_cpu_dir();
        create_temp_file(&cpu_dir.join("cpu0").join(MAX_FREQ_PROPERTY), "1000");

        let parsed = parse_sysfs_cpu_info_in_dir(cpu_dir.to_str().unwrap(), 0, MAX_FREQ_PROPERTY);
        assert_eq!(parsed.unwrap(), 1000);
    }

    #[test]
    fn test_parse_sysfs_cpu_info_error() {
        let (_temp_dir, cpu_dir) = fake_cpu_dir();
        // Not creating the sysinfo file should result in an error trying to read from it.

        let err = parse_sysfs_cpu_info_in_dir(cpu_dir.to_str().unwrap(), 0, MAX_FREQ_PROPERTY)
            .unwrap_err();
        assert_eq!(err, Error::new(libc::ENOENT));
    }

    #[test]
    fn test_parse_sysfs_cpu_info_vec() {
        let (_temp_dir, cpu_dir) = fake_cpu_dir();
        create_temp_file(
            &cpu_dir.join("cpu0").join(AVAILABLE_FREQS_PROPERTY),
            "1000 2000",
        );

        let parsed = parse_sysfs_cpu_info_vec_in_dir(
            cpu_dir.to_str().unwrap(),
            0,
            AVAILABLE_FREQS_PROPERTY,
        );
        assert_eq!(parsed.unwrap(), vec![1000, 2000]);
    }

    #[test]
    fn test_parse_sysfs_cpu_info_vec_error() {
        let (_temp_dir, cpu_dir) = fake_cpu_dir();
        // Not creating the sysinfo file should result in an error trying to read from it.

        let err = parse_sysfs_cpu_info_vec_in_dir(
            cpu_dir.to_str().unwrap(),
            0,
            AVAILABLE_FREQS_PROPERTY,
        )
        .unwrap_err();
        assert_eq!(err, Error::new(libc::ENOENT));
    }
}