base/sys/linux/
mod.rs

1// Copyright 2017 The ChromiumOS Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5//! Small system utility modules for usage by other modules.
6
7#[cfg(target_os = "android")]
8mod android;
9#[cfg(target_os = "android")]
10use android as target_os;
11#[cfg(target_os = "linux")]
12#[allow(clippy::module_inception)]
13mod linux;
14#[cfg(target_os = "linux")]
15use linux as target_os;
16use log::warn;
17#[macro_use]
18pub mod ioctl;
19#[macro_use]
20pub mod syslog;
21mod acpi_event;
22mod capabilities;
23mod descriptor;
24mod event;
25mod file;
26mod file_traits;
27mod mmap;
28mod net;
29mod netlink;
30mod notifiers;
31pub mod platform_timer_resolution;
32mod poll;
33mod priority;
34mod sched;
35mod shm;
36pub mod signal;
37mod signalfd;
38mod terminal;
39mod timer;
40pub mod vsock;
41mod write_zeroes;
42
43use std::ffi::CString;
44use std::fs::remove_file;
45use std::fs::File;
46use std::fs::OpenOptions;
47use std::mem;
48use std::mem::MaybeUninit;
49use std::ops::Deref;
50use std::os::unix::io::FromRawFd;
51use std::os::unix::io::RawFd;
52use std::os::unix::net::UnixDatagram;
53use std::os::unix::net::UnixListener;
54use std::os::unix::process::ExitStatusExt;
55use std::path::Path;
56use std::path::PathBuf;
57use std::process::ExitStatus;
58use std::ptr;
59use std::sync::OnceLock;
60use std::time::Duration;
61
62pub use acpi_event::*;
63pub use capabilities::drop_capabilities;
64pub use event::EventExt;
65pub(crate) use event::PlatformEvent;
66pub use file::find_next_data;
67pub use file::FileDataIterator;
68pub(crate) use file_traits::lib::*;
69pub use ioctl::*;
70use libc::c_int;
71use libc::c_long;
72use libc::fcntl;
73use libc::pipe2;
74use libc::prctl;
75use libc::syscall;
76use libc::waitpid;
77use libc::SYS_getpid;
78use libc::SYS_getppid;
79use libc::SYS_gettid;
80use libc::EINVAL;
81use libc::O_CLOEXEC;
82use libc::PR_SET_NAME;
83use libc::SIGKILL;
84use libc::WNOHANG;
85pub use mmap::*;
86pub(in crate::sys) use net::sendmsg_nosignal as sendmsg;
87pub(in crate::sys) use net::sockaddr_un;
88pub(in crate::sys) use net::sockaddrv4_to_lib_c;
89pub(in crate::sys) use net::sockaddrv6_to_lib_c;
90pub use netlink::*;
91pub use poll::EventContext;
92pub use priority::*;
93pub use sched::*;
94pub use shm::MemfdSeals;
95pub use shm::SharedMemoryLinux;
96pub use signal::*;
97pub use signalfd::Error as SignalFdError;
98pub use signalfd::*;
99pub use terminal::*;
100pub(crate) use write_zeroes::file_punch_hole;
101pub(crate) use write_zeroes::file_write_zeroes_at;
102
103use crate::descriptor::FromRawDescriptor;
104use crate::descriptor::SafeDescriptor;
105pub use crate::errno::Error;
106pub use crate::errno::Result;
107pub use crate::errno::*;
108use crate::number_of_logical_cores;
109use crate::round_up_to_page_size;
110pub use crate::sys::unix::descriptor::*;
111use crate::syscall;
112use crate::AsRawDescriptor;
113use crate::Pid;
114
// Re-exports of the libc types that are part of the API.

/// User identifier; an alias of `libc::uid_t`.
pub type Uid = libc::uid_t;
/// Group identifier; an alias of `libc::gid_t`.
pub type Gid = libc::gid_t;
/// File mode; an alias of `libc::mode_t`.
pub type Mode = libc::mode_t;
119
// Directory that holds the cpu sysinfo files. The per-CPU properties are read from
// `<CPU_DIR>/cpu<N>/<property>`.
const CPU_DIR: &str = "/sys/devices/system/cpu";
122
123/// Safe wrapper for PR_SET_NAME(2const)
124#[inline(always)]
125pub fn set_thread_name(name: &str) -> Result<()> {
126    let name = CString::new(name).or(Err(Error::new(EINVAL)))?;
127    // SAFETY: prctl copies name and doesn't expect it to outlive this function.
128    let ret = unsafe { prctl(PR_SET_NAME, name.as_c_str()) };
129    if ret == 0 {
130        Ok(())
131    } else {
132        errno_result()
133    }
134}
135
136/// This bypasses `libc`'s caching `getpid(2)` wrapper which can be invalid if a raw clone was used
137/// elsewhere.
138#[inline(always)]
139pub fn getpid() -> Pid {
140    // SAFETY:
141    // Safe because this syscall can never fail and we give it a valid syscall number.
142    unsafe { syscall(SYS_getpid as c_long) as Pid }
143}
144
145/// Safe wrapper for the geppid Linux systemcall.
146#[inline(always)]
147pub fn getppid() -> Pid {
148    // SAFETY:
149    // Safe because this syscall can never fail and we give it a valid syscall number.
150    unsafe { syscall(SYS_getppid as c_long) as Pid }
151}
152
153/// Safe wrapper for the gettid Linux systemcall.
154pub fn gettid() -> Pid {
155    // SAFETY:
156    // Calling the gettid() sycall is always safe.
157    unsafe { syscall(SYS_gettid as c_long) as Pid }
158}
159
/// Safe wrapper for `geteuid(2)`.
///
/// Returns whatever `libc::geteuid()` reports, as a [`Uid`].
#[inline(always)]
pub fn geteuid() -> Uid {
    // SAFETY:
    // Trivially safe: the call takes no arguments and passes no memory.
    unsafe { libc::geteuid() }
}
167
/// Safe wrapper for `getegid(2)`.
///
/// Returns whatever `libc::getegid()` reports, as a [`Gid`].
#[inline(always)]
pub fn getegid() -> Gid {
    // SAFETY:
    // Trivially safe: the call takes no arguments and passes no memory.
    unsafe { libc::getegid() }
}
175
/// The operation to perform with `flock`.
pub enum FlockOperation {
    /// Take a shared lock (mapped to `libc::LOCK_SH` by `flock`).
    LockShared,
    /// Take an exclusive lock (mapped to `libc::LOCK_EX` by `flock`).
    LockExclusive,
    /// Release the lock (mapped to `libc::LOCK_UN` by `flock`).
    Unlock,
}
182
183/// Safe wrapper for flock(2) with the operation `op` and optionally `nonblocking`. The lock will be
184/// dropped automatically when `file` is dropped.
185#[inline(always)]
186pub fn flock<F: AsRawDescriptor>(file: &F, op: FlockOperation, nonblocking: bool) -> Result<()> {
187    let mut operation = match op {
188        FlockOperation::LockShared => libc::LOCK_SH,
189        FlockOperation::LockExclusive => libc::LOCK_EX,
190        FlockOperation::Unlock => libc::LOCK_UN,
191    };
192
193    if nonblocking {
194        operation |= libc::LOCK_NB;
195    }
196
197    // SAFETY:
198    // Safe since we pass in a valid fd and flock operation, and check the return value.
199    syscall!(unsafe { libc::flock(file.as_raw_descriptor(), operation) }).map(|_| ())
200}
201
/// The operation to perform with `fallocate`.
///
/// Every variant is converted to a set of `FALLOC_FL_*` flags that includes
/// `FALLOC_FL_KEEP_SIZE` (see the `From<FallocateMode> for i32` impl).
pub enum FallocateMode {
    /// Converted to `FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE`.
    PunchHole,
    /// Converted to `FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE`.
    ZeroRange,
    /// Converted to `FALLOC_FL_KEEP_SIZE` alone.
    Allocate,
}
208
209impl From<FallocateMode> for i32 {
210    fn from(value: FallocateMode) -> Self {
211        match value {
212            FallocateMode::Allocate => libc::FALLOC_FL_KEEP_SIZE,
213            FallocateMode::PunchHole => libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE,
214            FallocateMode::ZeroRange => libc::FALLOC_FL_ZERO_RANGE | libc::FALLOC_FL_KEEP_SIZE,
215        }
216    }
217}
218
219impl From<FallocateMode> for u32 {
220    fn from(value: FallocateMode) -> Self {
221        Into::<i32>::into(value) as u32
222    }
223}
224
225/// Safe wrapper for `fallocate()`.
226pub fn fallocate<F: AsRawDescriptor>(
227    file: &F,
228    mode: FallocateMode,
229    offset: u64,
230    len: u64,
231) -> Result<()> {
232    let offset = if offset > libc::off64_t::MAX as u64 {
233        return Err(Error::new(libc::EINVAL));
234    } else {
235        offset as libc::off64_t
236    };
237
238    let len = if len > libc::off64_t::MAX as u64 {
239        return Err(Error::new(libc::EINVAL));
240    } else {
241        len as libc::off64_t
242    };
243
244    // SAFETY:
245    // Safe since we pass in a valid fd and fallocate mode, validate offset and len,
246    // and check the return value.
247    syscall!(unsafe { libc::fallocate64(file.as_raw_descriptor(), mode.into(), offset, len) })
248        .map(|_| ())
249}
250
251/// Safe wrapper for `fstat()`.
252pub fn fstat<F: AsRawDescriptor>(f: &F) -> Result<libc::stat64> {
253    let mut st = MaybeUninit::<libc::stat64>::zeroed();
254
255    // SAFETY:
256    // Safe because the kernel will only write data in `st` and we check the return
257    // value.
258    syscall!(unsafe { libc::fstat64(f.as_raw_descriptor(), st.as_mut_ptr()) })?;
259
260    // SAFETY:
261    // Safe because the kernel guarantees that the struct is now fully initialized.
262    Ok(unsafe { st.assume_init() })
263}
264
265/// Checks whether a file is a block device fie or not.
266pub fn is_block_file<F: AsRawDescriptor>(file: &F) -> Result<bool> {
267    let stat = fstat(file)?;
268    Ok((stat.st_mode & libc::S_IFMT) == libc::S_IFBLK)
269}
270
// ioctl type value used for the block device ioctl declared below.
const BLOCK_IO_TYPE: u32 = 0x12;
// Declares the `BLKDISCARD` ioctl request (type `BLOCK_IO_TYPE`, number 119) used by
// `discard_block`.
ioctl_io_nr!(BLKDISCARD, BLOCK_IO_TYPE, 119);
273
274/// Discards the given range of a block file.
275pub fn discard_block<F: AsRawDescriptor>(file: &F, offset: u64, len: u64) -> Result<()> {
276    let range: [u64; 2] = [offset, len];
277    // SAFETY:
278    // Safe because
279    // - we check the return value.
280    // - ioctl(BLKDISCARD) does not hold the descriptor after the call.
281    // - ioctl(BLKDISCARD) does not break the file descriptor.
282    // - ioctl(BLKDISCARD) does not modify the given range.
283    syscall!(unsafe { libc::ioctl(file.as_raw_descriptor(), BLKDISCARD, &range) }).map(|_| ())
284}
285
/// A trait used to abstract types that provide a process id that can be operated on.
pub trait AsRawPid {
    /// Returns the process id represented by `self`.
    fn as_raw_pid(&self) -> Pid;
}

/// A raw `Pid` is its own process id.
impl AsRawPid for Pid {
    fn as_raw_pid(&self) -> Pid {
        *self
    }
}

/// A spawned child is identified by the value of `Child::id()`, cast to `Pid`.
impl AsRawPid for std::process::Child {
    fn as_raw_pid(&self) -> Pid {
        self.id() as Pid
    }
}
302
303/// A safe wrapper around waitpid.
304///
305/// On success if a process was reaped, it will be returned as the first value.
306/// The second returned value is the ExitStatus from the libc::waitpid() call.
307///
308/// Note: this can block if libc::WNOHANG is not set and EINTR is not handled internally.
309pub fn wait_for_pid<A: AsRawPid>(pid: A, options: c_int) -> Result<(Option<Pid>, ExitStatus)> {
310    let pid = pid.as_raw_pid();
311    let mut status: c_int = 1;
312    // SAFETY:
313    // Safe because status is owned and the error is checked.
314    let ret = unsafe { libc::waitpid(pid, &mut status, options) };
315    if ret < 0 {
316        return errno_result();
317    }
318    Ok((
319        if ret == 0 { None } else { Some(ret) },
320        ExitStatus::from_raw(status),
321    ))
322}
323
324/// Reaps a child process that has terminated.
325///
326/// Returns `Ok(pid)` where `pid` is the process that was reaped or `Ok(0)` if none of the children
327/// have terminated. An `Error` is with `errno == ECHILD` if there are no children left to reap.
328///
329/// # Examples
330///
331/// Reaps all child processes until there are no terminated children to reap.
332///
333/// ```
334/// fn reap_children() {
335///     loop {
336///         match base::linux::reap_child() {
337///             Ok(0) => println!("no children ready to reap"),
338///             Ok(pid) => {
339///                 println!("reaped {}", pid);
340///                 continue
341///             },
342///             Err(e) if e.errno() == libc::ECHILD => println!("no children left"),
343///             Err(e) => println!("error reaping children: {}", e),
344///         }
345///         break
346///     }
347/// }
348/// ```
349pub fn reap_child() -> Result<Pid> {
350    // SAFETY:
351    // Safe because we pass in no memory, prevent blocking with WNOHANG, and check for error.
352    let ret = unsafe { waitpid(-1, ptr::null_mut(), WNOHANG) };
353    if ret == -1 {
354        errno_result()
355    } else {
356        Ok(ret)
357    }
358}
359
360/// Kill all processes in the current process group.
361///
362/// On success, this kills all processes in the current process group, including the current
363/// process, meaning this will not return. This is equivalent to a call to `kill(0, SIGKILL)`.
364pub fn kill_process_group() -> Result<()> {
365    // SAFETY: Safe because pid is 'self group' and return value doesn't matter.
366    unsafe { kill(0, SIGKILL) }?;
367    // Kill succeeded, so this process never reaches here.
368    unreachable!();
369}
370
371/// Spawns a pipe pair where the first pipe is the read end and the second pipe is the write end.
372///
373/// The `O_CLOEXEC` flag will be set during pipe creation.
374pub fn pipe() -> Result<(File, File)> {
375    let mut pipe_fds = [-1; 2];
376    // SAFETY:
377    // Safe because pipe2 will only write 2 element array of i32 to the given pointer, and we check
378    // for error.
379    let ret = unsafe { pipe2(&mut pipe_fds[0], O_CLOEXEC) };
380    if ret == -1 {
381        errno_result()
382    } else {
383        // SAFETY:
384        // Safe because both fds must be valid for pipe2 to have returned sucessfully and we have
385        // exclusive ownership of them.
386        Ok(unsafe {
387            (
388                File::from_raw_fd(pipe_fds[0]),
389                File::from_raw_fd(pipe_fds[1]),
390            )
391        })
392    }
393}
394
395/// Sets the pipe signified with fd to `size`.
396///
397/// Returns the new size of the pipe or an error if the OS fails to set the pipe size.
398pub fn set_pipe_size(fd: RawFd, size: usize) -> Result<usize> {
399    // SAFETY:
400    // Safe because fcntl with the `F_SETPIPE_SZ` arg doesn't touch memory.
401    syscall!(unsafe { fcntl(fd, libc::F_SETPIPE_SZ, size as c_int) }).map(|ret| ret as usize)
402}
403
404/// Test-only function used to create a pipe that is full. The pipe is created, has its size set to
405/// the minimum and then has that much data written to it. Use `new_pipe_full` to test handling of
406/// blocking `write` calls in unit tests.
407pub fn new_pipe_full() -> Result<(File, File)> {
408    use std::io::Write;
409
410    let (rx, mut tx) = pipe()?;
411    // The smallest allowed size of a pipe is the system page size on linux.
412    let page_size = set_pipe_size(tx.as_raw_descriptor(), round_up_to_page_size(1))?;
413
414    // Fill the pipe with page_size zeros so the next write call will block.
415    let buf = vec![0u8; page_size];
416    tx.write_all(&buf)?;
417
418    Ok((rx, tx))
419}
420
421/// Used to attempt to clean up a named pipe after it is no longer used.
422pub struct UnlinkUnixDatagram(pub UnixDatagram);
423impl AsRef<UnixDatagram> for UnlinkUnixDatagram {
424    fn as_ref(&self) -> &UnixDatagram {
425        &self.0
426    }
427}
428impl Drop for UnlinkUnixDatagram {
429    fn drop(&mut self) {
430        if let Ok(addr) = self.0.local_addr() {
431            if let Some(path) = addr.as_pathname() {
432                if let Err(e) = remove_file(path) {
433                    warn!("failed to remove control socket file: {}", e);
434                }
435            }
436        }
437    }
438}
439
440/// Used to attempt to clean up a named pipe after it is no longer used.
441pub struct UnlinkUnixListener(pub UnixListener);
442
443impl AsRef<UnixListener> for UnlinkUnixListener {
444    fn as_ref(&self) -> &UnixListener {
445        &self.0
446    }
447}
448
449impl Deref for UnlinkUnixListener {
450    type Target = UnixListener;
451
452    fn deref(&self) -> &UnixListener {
453        &self.0
454    }
455}
456
457impl Drop for UnlinkUnixListener {
458    fn drop(&mut self) {
459        if let Ok(addr) = self.0.local_addr() {
460            if let Some(path) = addr.as_pathname() {
461                if let Err(e) = remove_file(path) {
462                    warn!("failed to remove control socket file: {}", e);
463                }
464            }
465        }
466    }
467}
468
/// Verifies that |raw_descriptor| is actually owned by this process and duplicates it
/// to ensure that we have a unique handle to it.
///
/// This simply forwards to [`validate_raw_fd`]; see that function for the checks performed and
/// the errors returned.
pub fn validate_raw_descriptor(raw_descriptor: RawDescriptor) -> Result<RawDescriptor> {
    validate_raw_fd(&raw_descriptor)
}
474
475/// Verifies that |raw_fd| is actually owned by this process and duplicates it to ensure that
476/// we have a unique handle to it.
477pub fn validate_raw_fd(raw_fd: &RawFd) -> Result<RawFd> {
478    // Checking that close-on-exec isn't set helps filter out FDs that were opened by
479    // crosvm as all crosvm FDs are close on exec.
480    // SAFETY:
481    // Safe because this doesn't modify any memory and we check the return value.
482    let flags = unsafe { libc::fcntl(*raw_fd, libc::F_GETFD) };
483    if flags < 0 || (flags & libc::FD_CLOEXEC) != 0 {
484        return Err(Error::new(libc::EBADF));
485    }
486
487    // SAFETY:
488    // Duplicate the fd to ensure that we don't accidentally close an fd previously
489    // opened by another subsystem.  Safe because this doesn't modify any memory and
490    // we check the return value.
491    let dup_fd = unsafe { libc::fcntl(*raw_fd, libc::F_DUPFD_CLOEXEC, 0) };
492    if dup_fd < 0 {
493        return Err(Error::last());
494    }
495    Ok(dup_fd as RawFd)
496}
497
498/// Utility function that returns true if the given FD is readable without blocking.
499///
500/// On an error, such as an invalid or incompatible FD, this will return false, which can not be
501/// distinguished from a non-ready to read FD.
502pub fn poll_in<F: AsRawDescriptor>(fd: &F) -> bool {
503    let mut fds = libc::pollfd {
504        fd: fd.as_raw_descriptor(),
505        events: libc::POLLIN,
506        revents: 0,
507    };
508    // SAFETY:
509    // Safe because we give a valid pointer to a list (of 1) FD and check the return value.
510    let ret = unsafe { libc::poll(&mut fds, 1, 0) };
511    // An error probably indicates an invalid FD, or an FD that can't be polled. Returning false in
512    // that case is probably correct as such an FD is unlikely to be readable, although there are
513    // probably corner cases in which that is wrong.
514    if ret == -1 {
515        return false;
516    }
517    fds.revents & libc::POLLIN != 0
518}
519
520/// Return the maximum Duration that can be used with libc::timespec.
521pub fn max_timeout() -> Duration {
522    Duration::new(libc::time_t::MAX as u64, 999999999)
523}
524
525/// If the given path is of the form /proc/self/fd/N for some N, returns `Ok(Some(N))`. Otherwise
526/// returns `Ok(None)`.
527pub fn safe_descriptor_from_path<P: AsRef<Path>>(path: P) -> Result<Option<SafeDescriptor>> {
528    let path = path.as_ref();
529    if path.parent() == Some(Path::new("/proc/self/fd")) {
530        let raw_descriptor = path
531            .file_name()
532            .and_then(|fd_osstr| fd_osstr.to_str())
533            .and_then(|fd_str| fd_str.parse::<RawFd>().ok())
534            .ok_or_else(|| Error::new(EINVAL))?;
535        let validated_fd = validate_raw_fd(&raw_descriptor)?;
536        Ok(Some(
537            // SAFETY:
538            // Safe because nothing else has access to validated_fd after this call.
539            unsafe { SafeDescriptor::from_raw_descriptor(validated_fd) },
540        ))
541    } else {
542        Ok(None)
543    }
544}
545
546/// Check FD is not opened by crosvm and returns a FD that is freshly DUPFD_CLOEXEC's.
547/// A SafeDescriptor is created from the duplicated fd. It does not take ownership of
548/// fd passed by argument.
549pub fn safe_descriptor_from_cmdline_fd(fd: &RawFd) -> Result<SafeDescriptor> {
550    let validated_fd = validate_raw_fd(fd)?;
551    Ok(
552        // SAFETY:
553        // Safe because nothing else has access to validated_fd after this call.
554        unsafe { SafeDescriptor::from_raw_descriptor(validated_fd) },
555    )
556}
557
558/// Open the file with the given path, or if it is of the form `/proc/self/fd/N` then just use the
559/// file descriptor.
560///
561/// Note that this will not work properly if the same `/proc/self/fd/N` path is used twice in
562/// different places, as the metadata (including the offset) will be shared between both file
563/// descriptors.
564pub fn open_file_or_duplicate<P: AsRef<Path>>(path: P, options: &OpenOptions) -> Result<File> {
565    let path = path.as_ref();
566    // Special case '/proc/self/fd/*' paths. The FD is already open, just use it.
567    Ok(if let Some(fd) = safe_descriptor_from_path(path)? {
568        fd.into()
569    } else {
570        options.open(path)?
571    })
572}
573
574/// Get the soft and hard limits of max number of open files allowed by the environment.
575pub fn max_open_files() -> Result<libc::rlimit64> {
576    let mut buf = mem::MaybeUninit::<libc::rlimit64>::zeroed();
577
578    // SAFETY:
579    // Safe because this will only modify `buf` and we check the return value.
580    let res = unsafe { libc::prlimit64(0, libc::RLIMIT_NOFILE, ptr::null(), buf.as_mut_ptr()) };
581    if res == 0 {
582        // SAFETY:
583        // Safe because the kernel guarantees that the struct is fully initialized.
584        let limit = unsafe { buf.assume_init() };
585        Ok(limit)
586    } else {
587        errno_result()
588    }
589}
590
591/// Executes the given callback with extended soft limit of max number of open files. After the
592/// callback executed, restore the limit.
593pub fn call_with_extended_max_files<T, E>(
594    callback: impl FnOnce() -> std::result::Result<T, E>,
595) -> Result<std::result::Result<T, E>> {
596    let cur_limit = max_open_files()?;
597    let new_limit = libc::rlimit64 {
598        rlim_cur: cur_limit.rlim_max,
599        ..cur_limit
600    };
601    let needs_extension = cur_limit.rlim_cur < new_limit.rlim_cur;
602    if needs_extension {
603        set_max_open_files(new_limit)?;
604    }
605
606    let r = callback();
607
608    // Restore the soft limit.
609    if needs_extension {
610        set_max_open_files(cur_limit)?;
611    }
612
613    Ok(r)
614}
615
616/// Set the soft and hard limits of max number of open files to the given value.
617fn set_max_open_files(limit: libc::rlimit64) -> Result<()> {
618    // SAFETY: RLIMIT_NOFILE is known only to read a buffer of size rlimit64, and we have always
619    // rlimit64 allocated.
620    let res = unsafe { libc::setrlimit64(libc::RLIMIT_NOFILE, &limit) };
621    if res == 0 {
622        Ok(())
623    } else {
624        errno_result()
625    }
626}
627
628/// Moves the requested PID/TID to a particular cgroup
629pub fn move_to_cgroup(cgroup_path: PathBuf, id_to_write: Pid, cgroup_file: &str) -> Result<()> {
630    use std::io::Write;
631
632    let gpu_cgroup_file = cgroup_path.join(cgroup_file);
633    let mut f = File::create(gpu_cgroup_file)?;
634    f.write_all(id_to_write.to_string().as_bytes())?;
635    Ok(())
636}
637
638pub fn move_task_to_cgroup(cgroup_path: PathBuf, thread_id: Pid) -> Result<()> {
639    move_to_cgroup(cgroup_path, thread_id, "tasks")
640}
641
642pub fn move_proc_to_cgroup(cgroup_path: PathBuf, process_id: Pid) -> Result<()> {
643    move_to_cgroup(cgroup_path, process_id, "cgroup.procs")
644}
645
646fn read_sysfs_cpu_info_in_dir(cpu_dir: &str, cpu_id: usize, property: &str) -> Result<String> {
647    let path = Path::new(cpu_dir)
648        .join(format!("cpu{cpu_id}"))
649        .join(property);
650
651    std::fs::read_to_string(path).map_err(|e| e.into())
652}
653
654/// Queries the property of a specified CPU sysfs node.
655fn parse_sysfs_cpu_info_vec(cpu_id: usize, property: &str) -> Result<Vec<u32>> {
656    parse_sysfs_cpu_info_vec_in_dir(CPU_DIR, cpu_id, property)
657}
658
659fn parse_sysfs_cpu_info_vec_in_dir(
660    cpu_dir: &str,
661    cpu_id: usize,
662    property: &str,
663) -> Result<Vec<u32>> {
664    read_sysfs_cpu_info_in_dir(cpu_dir, cpu_id, property)?
665        .split_whitespace()
666        .map(|x| x.parse().map_err(|_| Error::new(libc::EINVAL)))
667        .collect()
668}
669
670/// Returns a list of supported frequencies in kHz for a given logical core.
671pub fn logical_core_frequencies_khz(cpu_id: usize) -> Result<Vec<u32>> {
672    parse_sysfs_cpu_info_vec(cpu_id, "cpufreq/scaling_available_frequencies")
673}
674
675/// Queries the property of a specified CPU sysfs node.
676fn parse_sysfs_cpu_info(cpu_id: usize, property: &str) -> Result<u32> {
677    parse_sysfs_cpu_info_in_dir(CPU_DIR, cpu_id, property)
678}
679
680fn parse_sysfs_cpu_info_in_dir(cpu_dir: &str, cpu_id: usize, property: &str) -> Result<u32> {
681    read_sysfs_cpu_info_in_dir(cpu_dir, cpu_id, property)?
682        .trim()
683        .parse()
684        .map_err(|_| Error::new(libc::EINVAL))
685}
686
687/// Returns the capacity (measure of performance) of a given logical core.
688pub fn logical_core_capacity(cpu_id: usize) -> Result<u32> {
689    static CPU_MAX_FREQS: OnceLock<Option<Vec<u32>>> = OnceLock::new();
690
691    let cpu_capacity = parse_sysfs_cpu_info(cpu_id, "cpu_capacity")?;
692
693    // Collect and cache the maximum frequencies of all cores. We need to know
694    // the largest maximum frequency between all cores to reverse normalization,
695    // so collect all the values once on the first call to this function.
696    let cpu_max_freqs = CPU_MAX_FREQS.get_or_init(|| {
697        (0..number_of_logical_cores().ok()?)
698            .map(|cpu_id| logical_core_max_freq_khz(cpu_id).ok())
699            .collect()
700    });
701
702    if let Some(cpu_max_freqs) = cpu_max_freqs {
703        let largest_max_freq = *cpu_max_freqs.iter().max().ok_or(Error::new(EINVAL))?;
704        let cpu_max_freq = *cpu_max_freqs.get(cpu_id).ok_or(Error::new(EINVAL))?;
705        let normalized_cpu_capacity = (u64::from(cpu_capacity) * u64::from(largest_max_freq))
706            .checked_div(u64::from(cpu_max_freq))
707            .ok_or(Error::new(EINVAL))?;
708        normalized_cpu_capacity
709            .try_into()
710            .map_err(|_| Error::new(EINVAL))
711    } else {
712        // cpu-freq is not enabled. Fall back to using the normalized capacity.
713        Ok(cpu_capacity)
714    }
715}
716
717/// Returns the cluster ID of a given logical core.
718pub fn logical_core_cluster_id(cpu_id: usize) -> Result<u32> {
719    parse_sysfs_cpu_info(cpu_id, "topology/physical_package_id")
720}
721
722/// Returns the maximum frequency (in kHz) of a given logical core.
723pub fn logical_core_max_freq_khz(cpu_id: usize) -> Result<u32> {
724    parse_sysfs_cpu_info(cpu_id, "cpufreq/cpuinfo_max_freq")
725}
726
727/// Returns a bool if the CPU is online, or an error if there was an issue reading the system
728/// properties.
729pub fn is_cpu_online(cpu_id: usize) -> Result<bool> {
730    let result = parse_sysfs_cpu_info(cpu_id, "online");
731    match result {
732        Err(e) => {
733            if e.errno() == libc::ENOENT {
734                // Some systems don't have a file for CPU 0 if the system considers CPU 0 to be
735                // always-online. Or if CONFIG_HOTPLUG_CPU=n, then the "online" property/file will
736                // never be created in drivers/base/cpu.c.
737                Ok(true)
738            } else {
739                Err(e)
740            }
741        }
742        Ok(online) => Ok(online == 1),
743    }
744}
745
/// Argument structure handed to the `sched_setattr` system call by [`sched_setattr`].
///
/// The layout is `#[repr(C)]` because a pointer to it is passed directly to the kernel.
#[repr(C)]
pub struct sched_attr {
    /// Size of this structure in bytes; `Default` fills it in.
    pub size: u32,

    pub sched_policy: u32,
    pub sched_flags: u64,
    pub sched_nice: i32,

    pub sched_priority: u32,

    pub sched_runtime: u64,
    pub sched_deadline: u64,
    pub sched_period: u64,

    pub sched_util_min: u32,
    pub sched_util_max: u32,
}

impl Default for sched_attr {
    /// Returns an attribute set with `size` set to the size of the structure and every other
    /// field zeroed.
    fn default() -> Self {
        let size = mem::size_of::<Self>() as u32;
        Self {
            size,
            sched_policy: 0,
            sched_flags: 0,
            sched_nice: 0,
            sched_priority: 0,
            sched_runtime: 0,
            sched_deadline: 0,
            sched_period: 0,
            sched_util_min: 0,
            sched_util_max: 0,
        }
    }
}
780
781pub fn sched_setattr(pid: Pid, attr: &mut sched_attr, flags: u32) -> Result<()> {
782    // SAFETY: Safe becuase all the args are valid and the return valud is checked.
783    let ret = unsafe {
784        libc::syscall(
785            libc::SYS_sched_setattr,
786            pid as usize,
787            attr as *mut sched_attr as usize,
788            flags as usize,
789        )
790    };
791
792    if ret < 0 {
793        return Err(Error::last());
794    }
795    Ok(())
796}
797
#[cfg(test)]
mod tests {
    use std::fs::create_dir_all;
    use std::fs::File;
    use std::io::Write;
    use std::os::fd::AsRawFd;

    use tempfile::TempDir;

    use super::*;
    use crate::unix::add_fd_flags;

    // sysfs properties exercised by the parsing tests.
    const MAX_FREQ_PROPERTY: &str = "cpufreq/cpuinfo_max_freq";
    const AVAILABLE_FREQS_PROPERTY: &str = "cpufreq/scaling_available_frequencies";

    /// Writes `content` to `path`, creating any missing parent directories first.
    fn create_temp_file(path: &Path, content: &str) {
        if let Some(parent) = path.parent() {
            create_dir_all(parent).unwrap();
        }
        File::create(path)
            .unwrap()
            .write_all(content.as_bytes())
            .unwrap();
    }

    /// Returns the location of the fake cpu sysfs directory below `root`.
    fn fake_cpu_dir(root: &Path) -> PathBuf {
        root.join("sys/devices/system/cpu")
    }

    #[test]
    fn pipe_size_and_fill() {
        let (_rx, mut tx) = new_pipe_full().expect("Failed to pipe");

        // To check that setting the size worked, set the descriptor to non blocking and check
        // that write returns an error.
        add_fd_flags(tx.as_raw_fd(), libc::O_NONBLOCK).expect("Failed to set tx non blocking");
        let write_result = tx.write(&[0u8; 8]);
        write_result.expect_err("Write after fill didn't fail");
    }

    #[test]
    fn test_parse_sysfs_cpu_info() {
        let temp_dir = TempDir::new().unwrap();
        let cpu_dir = fake_cpu_dir(temp_dir.path());
        create_temp_file(&cpu_dir.join("cpu0").join(MAX_FREQ_PROPERTY), "1000");

        let parsed = parse_sysfs_cpu_info_in_dir(cpu_dir.to_str().unwrap(), 0, MAX_FREQ_PROPERTY);
        assert_eq!(parsed.unwrap(), 1000);
    }

    #[test]
    fn test_parse_sysfs_cpu_info_error() {
        let temp_dir = TempDir::new().unwrap();
        let cpu_dir = fake_cpu_dir(temp_dir.path());
        // Not creating the sysinfo file should result in an error trying to read from it.

        let parsed = parse_sysfs_cpu_info_in_dir(cpu_dir.to_str().unwrap(), 0, MAX_FREQ_PROPERTY);
        assert_eq!(parsed.unwrap_err(), Error::new(libc::ENOENT));
    }

    #[test]
    fn test_parse_sysfs_cpu_info_vec() {
        let temp_dir = TempDir::new().unwrap();
        let cpu_dir = fake_cpu_dir(temp_dir.path());
        create_temp_file(
            &cpu_dir.join("cpu0").join(AVAILABLE_FREQS_PROPERTY),
            "1000 2000",
        );

        let parsed = parse_sysfs_cpu_info_vec_in_dir(
            cpu_dir.to_str().unwrap(),
            0,
            AVAILABLE_FREQS_PROPERTY,
        );
        assert_eq!(parsed.unwrap(), vec![1000, 2000]);
    }

    #[test]
    fn test_parse_sysfs_cpu_info_vec_error() {
        let temp_dir = TempDir::new().unwrap();
        let cpu_dir = fake_cpu_dir(temp_dir.path());
        // Not creating the sysinfo file should result in an error trying to read from it.

        let parsed = parse_sysfs_cpu_info_vec_in_dir(
            cpu_dir.to_str().unwrap(),
            0,
            AVAILABLE_FREQS_PROPERTY,
        );
        assert_eq!(parsed.unwrap_err(), Error::new(libc::ENOENT));
    }
}