swap/
userfaultfd.rs

1// Copyright 2022 The ChromiumOS Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
//! Provides a wrapper of the userfaultfd crate for the vmm-swap feature.
6
7#![deny(missing_docs)]
8
9use std::convert::From;
10use std::fs::File;
11use std::fs::OpenOptions;
12use std::ops::Range;
13use std::os::unix::io::AsRawFd;
14use std::os::unix::prelude::FromRawFd;
15use std::os::unix::prelude::OpenOptionsExt;
16
17use anyhow::Context;
18use base::errno_result;
19use base::info;
20use base::ioctl_io_nr;
21use base::ioctl_iowr_nr;
22use base::ioctl_with_mut_ref;
23use base::ioctl_with_val;
24use base::linux::MemoryMappingUnix;
25use base::AsRawDescriptor;
26use base::AsRawDescriptors;
27use base::FromRawDescriptor;
28use base::IntoRawDescriptor;
29use base::MappedRegion;
30use base::MemoryMapping;
31use base::MemoryMappingBuilder;
32use base::RawDescriptor;
33use thiserror::Error as ThisError;
34use userfaultfd::Error as UffdError;
35pub use userfaultfd::Event as UffdEvent;
36use userfaultfd::FeatureFlags;
37use userfaultfd::IoctlFlags;
38use userfaultfd::Uffd;
39use userfaultfd::UffdBuilder;
40
41use crate::pagesize::pages_to_bytes;
42
// Path of the userfaultfd device file which `Factory::new` tries to open.
const DEV_USERFAULTFD_PATH: &str = "/dev/userfaultfd";
// ioctl type of the requests issued on the `/dev/userfaultfd` device file.
const USERFAULTFD_IOC: u32 = 0xAA;
// `USERFAULTFD_IOC_NEW` is issued on the device file; `Factory::create` treats a non-negative
// result as the descriptor of a new userfaultfd.
ioctl_io_nr!(USERFAULTFD_IOC_NEW, USERFAULTFD_IOC, 0x00);
// `UFFDIO_API` is issued on a userfaultfd created from the device file, with a `uffdio_api`
// struct describing the requested API version and features.
ioctl_iowr_nr!(
    UFFDIO_API,
    userfaultfd_sys::UFFDIO,
    userfaultfd_sys::_UFFDIO_API,
    userfaultfd_sys::uffdio_api
);
52
/// Result type of the userfaultfd operations in this module; the error type is always [Error].
pub type Result<T> = std::result::Result<T, Error>;
55
/// Errors returned by the userfaultfd wrappers in this module.
#[derive(ThisError, Debug)]
pub enum Error {
    #[error("userfaultfd error: {0:?}")]
    /// Unrecoverable userfaultfd error which has no dedicated variant in this enum.
    Userfaultfd(UffdError),
    #[error("copy partially succeeded: {0:?} bytes copied")]
    /// `UFFDIO_COPY` partially succeeded. The payload is the number of bytes copied.
    PartiallyCopied(usize),
    #[error("the page is already filled")]
    /// The page is already filled.
    PageExist,
    #[error("the uffd in the corresponding process is already closed")]
    /// The corresponding process is already dead or has run exec(2).
    UffdClosed,
    #[error("clone error: {0:?}")]
    /// Failed to clone the userfaultfd descriptor.
    Clone(base::Error),
}
75
76impl From<UffdError> for Error {
77    fn from(e: UffdError) -> Self {
78        match e {
79            UffdError::PartiallyCopied(copied) => Self::PartiallyCopied(copied),
80            UffdError::CopyFailed(errno) if errno as i32 == libc::ESRCH => Self::UffdClosed,
81            UffdError::ZeropageFailed(errno) if errno as i32 == libc::EEXIST => Self::PageExist,
82            UffdError::ZeropageFailed(errno) if errno as i32 == libc::ESRCH => Self::UffdClosed,
83            other => Self::Userfaultfd(other),
84        }
85    }
86}
87
88/// Register all the regions to all the userfaultfd
89///
90/// # Arguments
91///
92/// * `regions` - the list of address range of regions.
93/// * `uffds` - the reference to the list of [Userfaultfd] for all the processes which may touch the
94///   `address_range` to be registered.
95///
96/// # Safety
97///
98/// Each address range in `regions` must be from guest memory.
99///
100/// The `uffds` must cover all the processes which may touch the `address_range`. otherwise some
101/// pages are zeroed by kernel on the unregistered process instead of swapping in from the swap
102/// file.
103#[deny(unsafe_op_in_unsafe_fn)]
104pub unsafe fn register_regions(regions: &[Range<usize>], uffds: &[Userfaultfd]) -> Result<()> {
105    for address_range in regions {
106        for uffd in uffds {
107            // SAFETY:
108            // Safe because the range is from the guest memory region.
109            let result = unsafe {
110                uffd.register(address_range.start, address_range.end - address_range.start)
111            };
112            match result {
113                Ok(_) => {}
114                // Skip the userfaultfd for dead processes.
115                Err(Error::UffdClosed) => {}
116                Err(e) => {
117                    return Err(e);
118                }
119            };
120        }
121    }
122    Ok(())
123}
124
125/// Unregister all the regions from all the userfaultfd.
126///
127/// `UFFDIO_UNREGISTER` unblocks any threads currently waiting on the region and remove page fault
128/// events on the region from the userfaultfd event queue.
129///
130/// # Arguments
131///
132/// * `regions` - the list of address range of regions.
133/// * `uffds` - the reference to the list of registered [Userfaultfd].
134pub fn unregister_regions(regions: &[Range<usize>], uffds: &[Userfaultfd]) -> Result<()> {
135    for address_range in regions {
136        for uffd in uffds {
137            let result =
138                uffd.unregister(address_range.start, address_range.end - address_range.start);
139            match result {
140                Ok(_) => {}
141                // Skip the userfaultfd for dead processes.
142                Err(Error::UffdClosed) => {}
143                Err(e) => {
144                    return Err(e);
145                }
146            };
147        }
148    }
149    Ok(())
150}
151
/// Factory for [Userfaultfd].
///
/// If `/dev/userfaultfd` (introduced in Linux 6.1) exists, the userfaultfd is created from the
/// dev file. Otherwise `userfaultfd(2)` is used to create a userfaultfd.
pub struct Factory {
    /// Handle to `/dev/userfaultfd`. `None` when opening the device file failed.
    dev_file: Option<File>,
}
159
160impl Default for Factory {
161    fn default() -> Self {
162        Self::new()
163    }
164}
165
166impl Factory {
167    /// Create [Factory] and try open `/dev/userfaultfd`.
168    ///
169    /// If it fails to open `/dev/userfaultfd`, userfaultfd creation fallback to `userfaultfd(2)`
170    /// syscall.
171    pub fn new() -> Self {
172        let dev_file = OpenOptions::new()
173            .read(true)
174            .custom_flags(libc::O_CLOEXEC | libc::O_NONBLOCK)
175            .open(DEV_USERFAULTFD_PATH);
176        match dev_file {
177            Ok(dev_file) => Self {
178                dev_file: Some(dev_file),
179            },
180            Err(e) => {
181                info!(
182                    "Failed to open /dev/userfaultfd ({:?}), will fall back to userfaultfd(2)",
183                    e
184                );
185                Self { dev_file: None }
186            }
187        }
188    }
189
190    /// Creates a new [Userfaultfd] for this process.
191    pub fn create(&self) -> anyhow::Result<Userfaultfd> {
192        if let Some(dev_file) = &self.dev_file {
193            // SAFETY:
194            // Safe because ioctl(2) USERFAULTFD_IOC_NEW with does not change Rust memory safety.
195            let res = unsafe {
196                ioctl_with_val(
197                    dev_file,
198                    USERFAULTFD_IOC_NEW,
199                    (libc::O_CLOEXEC | libc::O_NONBLOCK) as libc::c_ulong,
200                )
201            };
202            let uffd = if res < 0 {
203                return errno_result().context("USERFAULTFD_IOC_NEW");
204            } else {
205                // Safe because the uffd is not owned by anyone in this process.
206                // SAFETY:
207                unsafe { Userfaultfd::from_raw_descriptor(res) }
208            };
209            let mut api = userfaultfd_sys::uffdio_api {
210                api: userfaultfd_sys::UFFD_API,
211                features: (FeatureFlags::MISSING_SHMEM | FeatureFlags::EVENT_REMOVE).bits(),
212                ioctls: 0,
213            };
214            // SAFETY:
215            // Safe because ioctl(2) UFFDIO_API with does not change Rust memory safety.
216            let res = unsafe { ioctl_with_mut_ref(&uffd, UFFDIO_API, &mut api) };
217            if res < 0 {
218                errno_result().context("UFFDIO_API")
219            } else {
220                Ok(uffd)
221            }
222        } else {
223            Userfaultfd::new().context("create userfaultfd")
224        }
225    }
226
227    /// Create a new [Factory] object.
228    pub fn try_clone(&self) -> anyhow::Result<Self> {
229        let dev_file = self.dev_file.as_ref().map(File::try_clone).transpose()?;
230        Ok(Self { dev_file })
231    }
232}
233
234impl AsRawDescriptors for Factory {
235    fn as_raw_descriptors(&self) -> Vec<RawDescriptor> {
236        if let Some(dev_file) = &self.dev_file {
237            vec![dev_file.as_raw_descriptor()]
238        } else {
239            Vec::new()
240        }
241    }
242}
243
/// Wrapper for [`userfaultfd::Uffd`] to be used in the vmm-swap feature.
///
/// # Safety
///
/// The userfaultfd operations (`UFFDIO_COPY` and `UFFDIO_ZEROPAGE`) look unsafe since they fill
/// memory content directly. However, they are actually not unsafe operations. Instead,
/// `UFFDIO_REGISTER` should be the unsafe operation for Rust memory safety.
///
/// According to [the Rust document](https://doc.rust-lang.org/nomicon/uninitialized.html),
///
/// > All runtime-allocated memory in a Rust program begins its life as uninitialized.
///
/// The userfaultfd operations actually do not change/overwrite the existing memory contents but
/// they just set up the "uninitialized" pages. If the page was already initialized, the
/// userfaultfd operations fail and return an EEXIST error (which is not documented
/// unfortunately). So by themselves they do not affect the Rust memory safety.
///
/// The "uninitialized" page in this context has 2 patterns:
///
/// 1. pages which are never touched or,
/// 2. pages which are never touched after MADV_REMOVE
///
/// Filling the (1) pages with any contents should not affect the Rust memory safety.
///
/// Filling the (2) pages potentially may break the memory used by Rust. But the safety should be
/// examined at `MADV_REMOVE` and `UFFDIO_REGISTER` timing.
#[derive(Debug)]
pub struct Userfaultfd {
    /// The wrapped userfaultfd object.
    uffd: Uffd,
}
274
275impl Userfaultfd {
276    /// Creates a new userfaultfd using userfaultfd(2) syscall.
277    ///
278    /// This is public for tests.
279    pub fn new() -> Result<Self> {
280        let uffd = UffdBuilder::new()
281            .close_on_exec(true)
282            .non_blocking(true)
283            .user_mode_only(false)
284            .require_features(FeatureFlags::MISSING_SHMEM | FeatureFlags::EVENT_REMOVE)
285            .create()?;
286        Ok(Self { uffd })
287    }
288
289    /// Register a range of memory to the userfaultfd.
290    ///
291    /// After this registration, any page faults on the range will be caught by the userfaultfd.
292    ///
293    /// # Arguments
294    ///
295    /// * `addr` - the starting address of the range of memory.
296    /// * `len` - the length in bytes of the range of memory.
297    ///
298    /// # Safety
299    ///
300    /// [addr, addr+len) must lie within a [MemoryMapping], and that mapping
301    /// must live for the lifespan of the userfaultfd kernel object (which may be distinct from the
302    /// `Userfaultfd` rust object in this process).
303    pub unsafe fn register(&self, addr: usize, len: usize) -> Result<IoctlFlags> {
304        match self.uffd.register(addr as *mut libc::c_void, len) {
305            Ok(flags) => Ok(flags),
306            Err(UffdError::SystemError(errno)) if errno as i32 == libc::ENOMEM => {
307                // Userfaultfd returns `ENOMEM` if the corresponding process dies or run as another
308                // program by `exec` system call.
309                // TODO(b/267124393): Verify UFFDIO_ZEROPAGE + ESRCH as well since ENOMEM may be for
310                // other reasons.
311                Err(Error::UffdClosed)
312            }
313            Err(e) => Err(e.into()),
314        }
315    }
316
317    /// Unregister a range of memory from the userfaultfd.
318    ///
319    /// # Arguments
320    ///
321    /// * `addr` - the starting address of the range of memory.
322    /// * `len` - the length in bytes of the range of memory.
323    pub fn unregister(&self, addr: usize, len: usize) -> Result<()> {
324        match self.uffd.unregister(addr as *mut libc::c_void, len) {
325            Ok(_) => Ok(()),
326            Err(UffdError::SystemError(errno)) if errno as i32 == libc::ENOMEM => {
327                // Userfaultfd returns `ENOMEM` if the corresponding process dies or run as another
328                // program by `exec` system call.
329                // TODO(b/267124393): Verify UFFDIO_ZEROPAGE + ESRCH as well since ENOMEM may be for
330                // other reasons.
331                Err(Error::UffdClosed)
332            }
333            Err(e) => Err(e.into()),
334        }
335    }
336
337    /// Initialize page(s) and fill it with zero.
338    ///
339    /// # Arguments
340    ///
341    /// * `addr` - the starting address of the page(s) to be initialzed with zero.
342    /// * `len` - the length in bytes of the page(s).
343    /// * `wake` - whether or not to unblock the faulting thread.
344    pub fn zero(&self, addr: usize, len: usize, wake: bool) -> Result<usize> {
345        // SAFETY:
346        // safe because zeroing untouched pages does not break the Rust memory safety since "All
347        // runtime-allocated memory in a Rust program begins its life as uninitialized."
348        // https://doc.rust-lang.org/nomicon/uninitialized.html
349        Ok(unsafe { self.uffd.zeropage(addr as *mut libc::c_void, len, wake) }?)
350    }
351
352    /// Copy the `data` to the page(s) starting from `addr`.
353    ///
354    /// # Arguments
355    ///
356    /// * `addr` - the starting address of the page(s) to be initialzed with data.
357    /// * `len` - the length in bytes of the page(s).
358    /// * `data` - the starting address of the content.
359    /// * `wake` - whether or not to unblock the faulting thread.
360    pub fn copy(&self, addr: usize, len: usize, data: *const u8, wake: bool) -> Result<usize> {
361        Ok(
362            // SAFETY:
363            // safe because filling untouched pages with data does not break the Rust memory safety
364            // since "All runtime-allocated memory in a Rust program begins its life as
365            // uninitialized." https://doc.rust-lang.org/nomicon/uninitialized.html
366            unsafe {
367                self.uffd.copy(
368                    data as *const libc::c_void,
369                    addr as *mut libc::c_void,
370                    len,
371                    wake,
372                )
373            }?,
374        )
375    }
376
377    /// Wake the faulting thread blocked by the page(s).
378    ///
379    /// If the page is not initialized, the thread causes a page fault again.
380    ///
381    /// # Arguments
382    ///
383    /// * `addr` - the starting address of the page(s).
384    /// * `len` - the length in bytes of the page(s).
385    pub fn wake(&self, addr: usize, len: usize) -> Result<()> {
386        Ok(self.uffd.wake(addr as *mut libc::c_void, len)?)
387    }
388
389    /// Read an event from the userfaultfd.
390    ///
391    /// Return `None` immediately if no events is ready to read.
392    pub fn read_event(&self) -> Result<Option<UffdEvent>> {
393        Ok(self.uffd.read_event()?)
394    }
395
396    /// Try to clone [Userfaultfd]
397    pub fn try_clone(&self) -> Result<Self> {
398        let dup_desc = base::clone_descriptor(self).map_err(Error::Clone)?;
399        // SAFETY: no one owns dup_desc.
400        let uffd = Self::from(unsafe { Uffd::from_raw_fd(dup_desc.into_raw_descriptor()) });
401        Ok(uffd)
402    }
403}
404
405impl From<Uffd> for Userfaultfd {
406    fn from(uffd: Uffd) -> Self {
407        Self { uffd }
408    }
409}
410
411impl FromRawDescriptor for Userfaultfd {
412    unsafe fn from_raw_descriptor(descriptor: RawDescriptor) -> Self {
413        Self::from(Uffd::from_raw_fd(descriptor))
414    }
415}
416
417impl AsRawDescriptor for Userfaultfd {
418    fn as_raw_descriptor(&self) -> RawDescriptor {
419        self.uffd.as_raw_fd()
420    }
421}
422
/// Checks whether the process for the [Userfaultfd] is dead or not.
pub trait DeadUffdChecker {
    /// Registers the [Userfaultfd] to the checker.
    fn register(&self, uffd: &Userfaultfd) -> anyhow::Result<()>;
    /// Returns `true` if the [Userfaultfd] is dead.
    fn is_dead(&self, uffd: &Userfaultfd) -> bool;
    /// Frees the internal state.
    fn reset(&self) -> anyhow::Result<()>;
}
432
/// Checks whether the process for the [Userfaultfd] is dead or not.
///
/// [DeadUffdCheckerImpl] uses `UFFDIO_ZEROPAGE` on a dummy mmap page to check the liveness.
///
/// This must be kept alive on the main process to make the dummy mmap present in all descendant
/// processes.
pub struct DeadUffdCheckerImpl {
    /// One-page mapping used only as the target of the liveness check.
    dummy_mmap: MemoryMapping,
}
442
443impl DeadUffdCheckerImpl {
444    /// Creates [DeadUffdCheckerImpl].
445    pub fn new() -> anyhow::Result<Self> {
446        Ok(Self {
447            dummy_mmap: MemoryMappingBuilder::new(pages_to_bytes(1))
448                .build()
449                .context("create dummy mmap")?,
450        })
451    }
452}
453
454impl DeadUffdChecker for DeadUffdCheckerImpl {
455    fn register(&self, uffd: &Userfaultfd) -> anyhow::Result<()> {
456        // SAFETY: no one except DeadUffdCheckerImpl access dummy_mmap.
457        unsafe { uffd.register(self.dummy_mmap.as_ptr() as usize, pages_to_bytes(1)) }
458            .map(|_| ())
459            .context("register to dummy mmap")
460    }
461
462    fn is_dead(&self, uffd: &Userfaultfd) -> bool {
463        // UFFDIO_ZEROPAGE returns ESRCH for dead uffd.
464        matches!(
465            uffd.zero(self.dummy_mmap.as_ptr() as usize, pages_to_bytes(1), false),
466            Err(Error::UffdClosed)
467        )
468    }
469
470    fn reset(&self) -> anyhow::Result<()> {
471        self.dummy_mmap
472            .remove_range(0, pages_to_bytes(1))
473            .context("free dummy mmap")
474    }
475}