swap/page_handler.rs

// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

//! PageHandler manages the page states of multiple regions.

#![deny(missing_docs)]

use std::fs::File;
use std::mem;
use std::ops::Range;
use std::sync::Arc;

use anyhow::Context;
use base::error;
use base::linux::FileDataIterator;
use base::AsRawDescriptor;
use base::SharedMemory;
use base::VolatileSlice;
use sync::Mutex;
use thiserror::Error as ThisError;

use crate::file::Error as FileError;
use crate::file::SwapFile;
use crate::pagesize::addr_to_page_idx;
use crate::pagesize::bytes_to_pages;
use crate::pagesize::is_hugepage_aligned;
use crate::pagesize::is_page_aligned;
use crate::pagesize::page_base_addr;
use crate::pagesize::page_idx_to_addr;
use crate::pagesize::pages_to_bytes;
use crate::pagesize::round_up_hugepage_size;
use crate::pagesize::THP_SIZE;
use crate::staging::CopyOp;
use crate::staging::Error as StagingError;
use crate::staging::StagingMemory;
use crate::userfaultfd::Error as UffdError;
use crate::userfaultfd::Userfaultfd;
use crate::worker::Channel;
use crate::worker::Task;
use crate::SwapMetrics;

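// MLOCK_BUDGET caps how much of the swap file may be mlock(2)ed at once while prefetching pages
// on swap-in; PREFETCH_THRESHOLD is the minimum remaining budget required before another prefetch
// is issued. See `SwapInContext::swap_in()`.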
pub(crate) const MLOCK_BUDGET: usize = 16 * 1024 * 1024; // = 16MB
const PREFETCH_THRESHOLD: usize = 4 * 1024 * 1024; // = 4MB

/// Result for PageHandler
pub type Result<T> = std::result::Result<T, Error>;

/// Errors for PageHandler
#[derive(ThisError, Debug)]
pub enum Error {
    #[error("the address is invalid {0:#018X}")]
    /// the address is invalid
    InvalidAddress(usize),
    #[error("the regions {0:?} and {1:?} overlap")]
    /// regions overlap on registering
    RegionOverlap(Range<usize>, Range<usize>),
    #[error("failed to create page handler {0:?}")]
    /// failed to create page handler
    CreateFailed(anyhow::Error),
    #[error("file operation failed : {0:?}")]
    /// file operation failed
    File(#[from] FileError),
    #[error("staging operation failed : {0:?}")]
    /// staging operation failed
    Staging(#[from] StagingError),
    #[error("userfaultfd failed : {0:?}")]
    /// userfaultfd operation failed
    Userfaultfd(#[from] UffdError),
    #[error("failed to iterate data ranges: {0:?}")]
    /// FileDataIterator failed
    FileDataIterator(#[from] base::Error),
}

/// Remove the memory range on the guest memory.
///
/// This is an alternative to [vm_memory::GuestMemory::remove_range()] when working with host
/// addresses instead of guest addresses.
///
/// # Safety
///
/// The memory range must be on the guest memory.
#[deny(unsafe_op_in_unsafe_fn)]
unsafe fn remove_memory(addr: usize, len: usize) -> std::result::Result<(), base::Error> {
    // SAFETY:
    // Safe because the caller guarantees addr is in guest memory, so this does not affect any rust
    // managed memory.
    let ret = unsafe { libc::madvise(addr as *mut libc::c_void, len, libc::MADV_REMOVE) };
    if ret < 0 {
        base::errno_result()
    } else {
        Ok(())
    }
}

fn uffd_copy_all(
    uffd: &Userfaultfd,
    mut page_addr: usize,
    mut data_slice: VolatileSlice,
    wake: bool,
) -> std::result::Result<(), UffdError> {
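    // A single UFFDIO_COPY may copy only part of the requested range (reported as
    // `UffdError::PartiallyCopied`), so retry with the remaining range until the whole slice has
    // been copied or another error occurs.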
    loop {
        let result = uffd.copy(page_addr, data_slice.size(), data_slice.as_ptr(), wake);
        match result {
            Err(UffdError::PartiallyCopied(copied)) => {
                page_addr += copied;
                data_slice.advance(copied);
            }
            other => {
                // Even EEXIST for the copy operation should be treated as an error for page fault
                // handling. If the page had been swapped in before, it should have been cleared
                // from the swap file and `Userfaultfd::zero()` used instead.
                return other.map(|_| ());
            }
        }
    }
}

/// [Region] represents a memory region and corresponding [SwapFile].
struct Region {
    /// the head page index of the region.
    head_page_idx: usize,
    base_page_idx_in_file: usize,
    num_pages: usize,
    staging_memory: StagingMemory,
    copied_from_file_pages: usize,
    copied_from_staging_pages: usize,
    zeroed_pages: usize,
    swap_in_pages: usize,
    /// the number of pages which were already initialized on page faults.
    redundant_pages: usize,
}

/// MoveToStaging copies chunks of consecutive pages from the guest memory to the staging memory
/// and removes the chunks from the guest memory.
pub struct MoveToStaging {
    remove_area: Range<usize>,
    copies: Vec<CopyOp>,
}

impl Task for MoveToStaging {
    fn execute(self) {
        for copy_op in self.copies {
            copy_op.execute();
        }
        // Remove chunks of pages at once to reduce madvise(2) syscalls.
        // SAFETY:
        // Safe because the region is already backed by the file and the content will be
        // swapped in on a page fault.
        let result = unsafe {
            remove_memory(
                self.remove_area.start,
                self.remove_area.end - self.remove_area.start,
            )
        };
        if let Err(e) = result {
            panic!("failed to remove memory: {e:?}");
        }
    }
}

struct PageHandleContext<'a> {
    file: SwapFile<'a>,
    regions: Vec<Region>,
    mlock_budget_pages: usize,
}

/// PageHandler manages the page states of multiple regions.
///
/// Handles multiple events derived from userfaultfd and swap out requests.
/// All the addresses and sizes in bytes are converted to page id internally.
pub struct PageHandler<'a> {
    ctx: Mutex<PageHandleContext<'a>>,
    channel: Arc<Channel<MoveToStaging>>,
}

impl<'a> PageHandler<'a> {
    /// Creates [PageHandler] for the given regions.
    ///
    /// If any of the regions overlap, this returns [Error::RegionOverlap].
    ///
    /// # Arguments
    ///
    /// * `swap_file` - The swap file.
    /// * `staging_shmem` - The staging memory. It must be large enough to hold the guest memory.
    ///   Otherwise the monitor process crashes on creating the mmap.
    /// * `address_ranges` - The list of address ranges of the regions. The start address must be
    ///   page-aligned and the size must be a multiple of the page size.
    pub fn create(
        swap_file: &'a File,
        staging_shmem: &'a SharedMemory,
        address_ranges: &[Range<usize>],
        staging_move_context: Arc<Channel<MoveToStaging>>,
    ) -> Result<Self> {
        // Truncate the file to the size needed to hold all regions, otherwise access beyond the
        // end of the file may cause SIGBUS.
        swap_file
            .set_len(
                address_ranges
                    .iter()
                    .map(|r| (r.end.saturating_sub(r.start)) as u64)
                    .sum(),
            )
            .context("truncate swap file")
            .map_err(Error::CreateFailed)?;

        let mut regions: Vec<Region> = Vec::new();
        let mut offset_pages = 0;
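        // Regions are laid out back to back in both the swap file and the staging shared memory;
        // `offset_pages` tracks the page offset of the next region within them.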
        for address_range in address_ranges {
            let head_page_idx = addr_to_page_idx(address_range.start);
            if address_range.end < address_range.start {
                return Err(Error::CreateFailed(anyhow::anyhow!(
                    "invalid region end < start"
                )));
            }
            let region_size = address_range.end - address_range.start;
            let num_pages = bytes_to_pages(region_size);

            // Find an overlapping region
            match regions.iter().position(|region| {
                if region.head_page_idx < head_page_idx {
                    region.head_page_idx + region.num_pages > head_page_idx
                } else {
                    region.head_page_idx < head_page_idx + num_pages
                }
            }) {
                Some(i) => {
                    let region = &regions[i];

                    return Err(Error::RegionOverlap(
                        address_range.clone(),
                        page_idx_to_addr(region.head_page_idx)
                            ..(page_idx_to_addr(region.head_page_idx + region.num_pages)),
                    ));
                }
                None => {
                    let base_addr = address_range.start;
                    assert!(is_page_aligned(base_addr));
                    assert!(is_page_aligned(region_size));

                    let staging_memory = StagingMemory::new(
                        staging_shmem,
                        pages_to_bytes(offset_pages) as u64,
                        num_pages,
                    )?;
                    regions.push(Region {
                        head_page_idx,
                        base_page_idx_in_file: offset_pages,
                        num_pages,
                        staging_memory,
                        copied_from_file_pages: 0,
                        copied_from_staging_pages: 0,
                        zeroed_pages: 0,
                        swap_in_pages: 0,
                        redundant_pages: 0,
                    });
                    offset_pages += num_pages;
                }
            }
        }

        let file = SwapFile::new(swap_file, offset_pages)?;

        Ok(Self {
            ctx: Mutex::new(PageHandleContext {
                file,
                regions,
                mlock_budget_pages: bytes_to_pages(MLOCK_BUDGET),
            }),
            channel: staging_move_context,
        })
    }

    fn find_region(regions: &mut [Region], page_idx: usize) -> Option<&mut Region> {
        // Sequentially search for the corresponding region in the list. This should be fast enough
        // because there are only a few regions (usually just 1).
        regions.iter_mut().find(|region| {
            region.head_page_idx <= page_idx && page_idx < region.head_page_idx + region.num_pages
        })
    }

    /// Fills the faulted page with zero if the page is not initialized, or with the swapped out
    /// content in the staging memory or the swap file.
    ///
    /// # Arguments
    ///
    /// * `uffd` - the reference to the [Userfaultfd] for the faulting process.
    /// * `address` - the address that triggered the page fault.
    pub fn handle_page_fault(&self, uffd: &Userfaultfd, address: usize) -> Result<()> {
        let page_idx = addr_to_page_idx(address);
        // the head address of the page.
        let page_addr = page_base_addr(address);
        let page_size = pages_to_bytes(1);
        let mut ctx = self.ctx.lock();
        let PageHandleContext { regions, file, .. } = &mut *ctx;
        let region = Self::find_region(regions, page_idx).ok_or(Error::InvalidAddress(address))?;

        let idx_in_region = page_idx - region.head_page_idx;
        let idx_in_file = idx_in_region + region.base_page_idx_in_file;
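        // Look for the page content in the staging memory first (pages moved out of the guest but
        // not yet written to the swap file), then in the swap file. If it is in neither, the page
        // has no swapped out content and is zero-filled.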
        if let Some(page_slice) = region.staging_memory.page_content(idx_in_region)? {
            uffd_copy_all(uffd, page_addr, page_slice, true)?;
            // TODO(b/265758094): optimize clear operation.
            region
                .staging_memory
                .clear_range(idx_in_region..idx_in_region + 1)?;
            region.copied_from_staging_pages += 1;
            Ok(())
        } else if let Some(page_slice) = file.page_content(idx_in_file, false)? {
            // TODO(kawasin): Unlock regions to proceed with the swap-in operation in the
            // background.
            uffd_copy_all(uffd, page_addr, page_slice, true)?;
            // TODO(b/265758094): optimize clear operation.
            // Do not erase the page from the disk for trimming optimization on next swap out.
            let munlocked_pages = file.clear_range(idx_in_file..idx_in_file + 1)?;
            region.copied_from_file_pages += 1;
            ctx.mlock_budget_pages += munlocked_pages;
            Ok(())
        } else {
            // Map a zero page since the page has no swapped out content yet but the fault
            // happened.
            // Safe because the faulted page is notified by uffd.
            let result = uffd.zero(page_addr, page_size, true);
            match result {
                Ok(_) => {
                    region.zeroed_pages += 1;
                    Ok(())
                }
                Err(UffdError::PageExist) => {
                    // This case can happen if page faults on the same page happen in different
                    // processes.
                    uffd.wake(page_addr, page_size)?;
                    region.redundant_pages += 1;
                    Ok(())
                }
                Err(e) => Err(e.into()),
            }
        }
    }

    /// Clear the internal state for the pages.
    ///
    /// When pages are removed by madvise with `MADV_DONTNEED` or `MADV_REMOVE`, userfaultfd
    /// notifies the event as `UFFD_EVENT_REMOVE`. This handles the remove event.
    ///
    /// In crosvm, the balloon frees the guest memory and causes `UFFD_EVENT_REMOVE`.
    ///
    /// # Arguments
    ///
    /// * `start_addr` - the head address of the memory area to be freed.
    /// * `end_addr` - the end address of the memory area to be freed. `UFFD_EVENT_REMOVE` reports
    ///   the head address of the memory area next to the freed area (i.e. the last byte of the
    ///   freed area is at `end_addr - 1`).
    pub fn handle_page_remove(&self, start_addr: usize, end_addr: usize) -> Result<()> {
        if !is_page_aligned(start_addr) {
            return Err(Error::InvalidAddress(start_addr));
        } else if !is_page_aligned(end_addr) {
            return Err(Error::InvalidAddress(end_addr));
        }
        let start_page_idx = addr_to_page_idx(start_addr);
        let last_page_idx = addr_to_page_idx(end_addr);
        let mut ctx = self.ctx.lock();
        // TODO(b/269983521): Clear multiple pages in the same region at once.
        for page_idx in start_page_idx..(last_page_idx) {
            let page_addr = page_idx_to_addr(page_idx);
            // TODO(kawasin): Cache the position if the range does not span multiple regions.
            let region = Self::find_region(&mut ctx.regions, page_idx)
                .ok_or(Error::InvalidAddress(page_addr))?;
            let idx_in_region = page_idx - region.head_page_idx;
            let idx_range = idx_in_region..idx_in_region + 1;
            if let Err(e) = region.staging_memory.clear_range(idx_range) {
                error!("failed to clear removed page from staging: {:?}", e);
            }
            let idx_in_file = idx_in_region + region.base_page_idx_in_file;
            let idx_range = idx_in_file..idx_in_file + 1;
            // Erase the pages from the disk because the pages are removed from the guest memory.
            let munlocked_pages = ctx.file.free_range(idx_range)?;
            ctx.mlock_budget_pages += munlocked_pages;
        }
        Ok(())
    }

    /// Move active pages in the memory region to the staging memory.
    ///
    /// It only moves the active contents of the guest memory to the staging memory and skips empty
    /// pages (e.g. pages never touched or freed by the balloon) using `lseek(2)` with
    /// `SEEK_HOLE`/`SEEK_DATA`.
    ///
    /// Returns the count of moved out pages.
    ///
    /// # Arguments
    ///
    /// * `base_addr` - the head address of the memory region.
    /// * `memfd` - the file descriptor of the memfd backing the guest memory region.
    /// * `base_offset` - the offset of the memory region in the memfd.
    ///
    /// # Safety
    ///
    /// The region must have been registered to all userfaultfds of the processes which may touch
    /// the region.
    ///
    /// The memory must be protected from updates while it is being moved.
    ///
    /// The page fault events for the region from the userfaultfd must be handled by
    /// [Self::handle_page_fault()].
    ///
    /// The caller must call [Channel::wait_complete()] to wait for all the copy operations to
    /// complete within the memory protection period.
    #[deny(unsafe_op_in_unsafe_fn)]
    pub unsafe fn move_to_staging<T>(
        &self,
        base_addr: usize,
        memfd: &T,
        base_offset: u64,
    ) -> Result<usize>
    where
        T: AsRawDescriptor,
    {
        let hugepage_size = *THP_SIZE;
        let mut ctx = self.ctx.lock();
        let region = Self::find_region(&mut ctx.regions, addr_to_page_idx(base_addr))
            .ok_or(Error::InvalidAddress(base_addr))?;

        if page_idx_to_addr(region.head_page_idx) != base_addr {
            return Err(Error::InvalidAddress(base_addr));
        }
        let region_size = pages_to_bytes(region.num_pages);
        let mut file_data = FileDataIterator::new(memfd, base_offset, region_size as u64);
        let mut moved_size = 0;
        let mut copies = Vec::new();
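        // Data chunks reported by the iterator are grouped into batches of roughly one huge page
        // so that pages are removed with a single madvise(2) per batch, and chunks are split so
        // that a transparent huge page is never MADV_REMOVEd only partially (see the batching
        // logic below).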
        let mut remaining_batch_size = hugepage_size;
        let mut batch_head_offset = 0;
        let mut cur_data = None;
        while let Some(data_range) = cur_data
            .take()
            .map(Ok)
            .or_else(|| file_data.next())
            .transpose()
            .map_err(Error::FileDataIterator)?
        {
            // Assert offset is page aligned
            let offset = (data_range.start - base_offset) as usize;
            assert!(is_page_aligned(offset));

            // The chunk size must be within usize since the chunk is within the guest memory.
            let chunk_size = (data_range.end - data_range.start) as usize;
            let data_range = if chunk_size > remaining_batch_size {
                // Split the chunk if it is bigger than remaining_batch_size.

                let split_size = if chunk_size >= hugepage_size {
                    // If the chunk size is bigger than or equal to the huge page size, the chunk
                    // may contain a huge page. If we MADV_REMOVE a huge page partially, it can
                    // cause an inconsistency between the actual page table and the vmm-swap
                    // internal state.
                    let chunk_addr = base_addr + offset;
                    if !is_hugepage_aligned(chunk_addr) {
                        // Split the chunk just before where a huge page could start.
                        std::cmp::min(
                            round_up_hugepage_size(chunk_addr) - chunk_addr,
                            remaining_batch_size,
                        )
                    } else {
                        if remaining_batch_size < hugepage_size {
                            // Remove the batch since it does not have enough room for a huge page.
                            self.channel.push(MoveToStaging {
                                remove_area: base_addr + batch_head_offset..base_addr + offset,
                                copies: mem::take(&mut copies),
                            });
                            remaining_batch_size = hugepage_size;
                            batch_head_offset = offset;
                        }
                        hugepage_size
                    }
                } else {
                    remaining_batch_size
                };
                // Cache the rest of the split chunk to avoid a useless lseek(2) syscall.
                cur_data = Some(data_range.start + split_size as u64..data_range.end);
                data_range.start..data_range.start + split_size as u64
            } else {
                data_range
            };

            let size = (data_range.end - data_range.start) as usize;
            assert!(is_page_aligned(size));

            // SAFETY:
            // Safe because:
            // * src_addr is aligned with page size
            // * the data_range starting from src_addr is on the guest memory.
            let copy_op = unsafe {
                region.staging_memory.copy(
                    (base_addr + offset) as *const u8,
                    bytes_to_pages(offset),
                    bytes_to_pages(size),
                )?
            };
            copies.push(copy_op);

            moved_size += size;
            // The size must be smaller than or equal to remaining_batch_size.
            remaining_batch_size -= size;

            if remaining_batch_size == 0 {
                // Remove the batch of pages at once to reduce madvise(2) syscalls.
                self.channel.push(MoveToStaging {
                    remove_area: base_addr + batch_head_offset..base_addr + offset + size,
                    copies: mem::take(&mut copies),
                });
                remaining_batch_size = hugepage_size;
                batch_head_offset = offset + size;
            }
        }
        // Remove the final batch of pages.
        self.channel.push(MoveToStaging {
            remove_area: base_addr + batch_head_offset..base_addr + region_size,
            copies,
        });

        region.copied_from_file_pages = 0;
        region.copied_from_staging_pages = 0;
        region.zeroed_pages = 0;
        region.swap_in_pages = 0;
        region.redundant_pages = 0;

        Ok(bytes_to_pages(moved_size))
    }

    /// Write a chunk of consecutive pages in the staging memory to the swap file.
    ///
    /// If there are no active pages in the staging memory, this returns `Ok(0)`.
    ///
    /// The pages in guest memory have been moved to staging memory by [Self::move_to_staging()].
    ///
    /// Returns the count of swapped out pages.
    ///
    /// Even if swap_out fails at any internal step, it does not break the page state management
    /// and `PageHandler` can continue working, with at most a few pages leaked into the staging
    /// memory or the swap file. The leaked pages are removed when vmm-swap is disabled and
    /// `PageHandler` is dropped.
    ///
    /// # Arguments
    ///
    /// * `max_size` - the upper limit of the chunk size to write into the swap file at once. The
    ///   chunk is split if it is bigger than `max_size`.
    pub fn swap_out(&self, max_size: usize) -> Result<usize> {
        let max_pages = bytes_to_pages(max_size);
        let mut ctx = self.ctx.lock();
        let PageHandleContext { regions, file, .. } = &mut *ctx;
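        // Handle at most one chunk per call; once no region has data left in its staging memory,
        // this returns Ok(0).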
        for region in regions.iter_mut() {
            if let Some(idx_range) = region.staging_memory.first_data_range(max_pages) {
                let idx_range_in_file = idx_range.start + region.base_page_idx_in_file
                    ..idx_range.end + region.base_page_idx_in_file;
                let pages = idx_range.end - idx_range.start;
                let slice = region.staging_memory.get_slice(idx_range.clone())?;
                // Convert VolatileSlice to &[u8]
                // SAFETY:
                // Safe because the range of volatile slice is already validated.
                let slice = unsafe { std::slice::from_raw_parts(slice.as_ptr(), slice.size()) };
                file.write_to_file(idx_range_in_file.start, slice)?;
                // TODO(kawasin): clear state_list on each write and MADV_REMOVE several chunks at
                // once.
                region.staging_memory.clear_range(idx_range)?;
                // TODO(kawasin): free the page cache of the swap file.
                // TODO(kawasin): use writev() to swap_out several small chunks at once.
                return Ok(pages);
            }
        }
        Ok(0)
    }

    /// Create a new [SwapInContext].
    pub fn start_swap_in(&'a self) -> SwapInContext<'a> {
        SwapInContext {
            ctx: &self.ctx,
            cur_staging: 0,
        }
    }

    /// Create a new [TrimContext].
    pub fn start_trim(&'a self) -> TrimContext<'a> {
        TrimContext {
            ctx: &self.ctx,
            cur_page: 0,
            cur_region: 0,
            next_data_in_file: 0..0,
            clean_pages: 0,
            zero_pages: 0,
        }
    }

    /// Returns count of pages copied from vmm-swap file to the guest memory.
    fn compute_copied_from_file_pages(&self) -> usize {
        self.ctx
            .lock()
            .regions
            .iter()
            .map(|r| r.copied_from_file_pages)
            .sum()
    }

    /// Returns count of pages copied from staging memory to the guest memory.
    fn compute_copied_from_staging_pages(&self) -> usize {
        self.ctx
            .lock()
            .regions
            .iter()
            .map(|r| r.copied_from_staging_pages)
            .sum()
    }

    /// Returns count of pages initialized with zero.
    fn compute_zeroed_pages(&self) -> usize {
        self.ctx.lock().regions.iter().map(|r| r.zeroed_pages).sum()
    }

    /// Returns count of pages which were already initialized on page faults.
    fn compute_redundant_pages(&self) -> usize {
        self.ctx
            .lock()
            .regions
            .iter()
            .map(|r| r.redundant_pages)
            .sum()
    }

    /// Returns count of pages present in the staging memory.
    fn compute_staging_pages(&self) -> usize {
        self.ctx
            .lock()
            .regions
            .iter()
            .map(|r| r.staging_memory.present_pages())
            .sum()
    }

    /// Returns count of pages present in the swap files.
    fn compute_swap_pages(&self) -> usize {
        self.ctx.lock().file.present_pages()
    }

    /// Fill [SwapMetrics] with page handler metrics.
    pub fn load_metrics(&self, metrics: &mut SwapMetrics) {
        metrics.copied_from_file_pages = self.compute_copied_from_file_pages() as u64;
        metrics.copied_from_staging_pages = self.compute_copied_from_staging_pages() as u64;
        metrics.zeroed_pages = self.compute_zeroed_pages() as u64;
        metrics.redundant_pages = self.compute_redundant_pages() as u64;
        metrics.staging_pages = self.compute_staging_pages() as u64;
        metrics.swap_pages = self.compute_swap_pages() as u64;
    }
}

/// Context for the swap-in operation.
///
/// This holds cursors of indices into the regions for each step as an optimization.
pub struct SwapInContext<'a> {
    ctx: &'a Mutex<PageHandleContext<'a>>,
    cur_staging: usize,
}

impl SwapInContext<'_> {
    /// Swap in a chunk of consecutive pages from the staging memory and the swap file.
    ///
    /// If there are no more pages present outside of the guest memory, this returns `Ok(0)`.
    ///
    /// Returns the count of swapped in pages.
    ///
    /// # Arguments
    ///
    /// * `uffd` - the main [Userfaultfd].
    /// * `max_size` - the upper limit of the chunk size to swap into the guest memory at once. The
    ///   chunk is split if it is bigger than `max_size`.
    pub fn swap_in(&mut self, uffd: &Userfaultfd, max_size: usize) -> Result<usize> {
        let mut ctx = self.ctx.lock();
        // Request the kernel to pre-populate the present pages in the swap file into the page
        // cache in the background. At most 16MB of pages will be populated.
        // The threshold exists to apply MADV_WILLNEED to a bigger chunk of pages. The kernel
        // populates consecutive pages at once on MADV_WILLNEED.
        if ctx.mlock_budget_pages > bytes_to_pages(PREFETCH_THRESHOLD) {
            let mlock_budget_pages = ctx.mlock_budget_pages;
            let locked_pages = ctx.file.lock_and_async_prefetch(mlock_budget_pages)?;
            ctx.mlock_budget_pages -= locked_pages;
        }

        let max_pages = bytes_to_pages(max_size);
        for region in ctx.regions[self.cur_staging..].iter_mut() {
            // TODO(kawasin): swap_in multiple chunks less than max_size at once.
            if let Some(idx_range) = region.staging_memory.first_data_range(max_pages) {
                let pages = idx_range.end - idx_range.start;
                let page_addr = page_idx_to_addr(region.head_page_idx + idx_range.start);
                let slice = region.staging_memory.get_slice(idx_range.clone())?;
                uffd_copy_all(uffd, page_addr, slice, false)?;
                // Clear the staging memory to avoid a memory spike.
                // TODO(kawasin): reduce the call count of MADV_REMOVE by removing several data
                // ranges at once.
                region.staging_memory.clear_range(idx_range)?;
                region.swap_in_pages += pages;
                return Ok(pages);
            }
            self.cur_staging += 1;
        }

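        // The staging memory has been fully drained at this point, so swap in the remaining pages
        // directly from the swap file.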
        if let Some(mut idx_range_in_file) = ctx.file.first_data_range(max_pages) {
            let PageHandleContext { regions, file, .. } = &mut *ctx;
            for region in regions.iter_mut() {
                let region_tail_idx_in_file = region.base_page_idx_in_file + region.num_pages;
                if idx_range_in_file.start >= region_tail_idx_in_file {
                    continue;
                } else if idx_range_in_file.start < region.base_page_idx_in_file {
                    return Err(Error::File(FileError::OutOfRange));
                } else if idx_range_in_file.end > region_tail_idx_in_file {
                    // The consecutive pages can span multiple regions. Swap in the pages of one
                    // region at a time.
                    idx_range_in_file.end = region_tail_idx_in_file;
                }
                let pages = idx_range_in_file.end - idx_range_in_file.start;
                let page_addr = page_idx_to_addr(
                    idx_range_in_file.start - region.base_page_idx_in_file + region.head_page_idx,
                );
                let slice = file.get_slice(idx_range_in_file.clone())?;
                // TODO(kawasin): Unlock regions to proceed with page fault handling on the main
                //                thread. We also need to handle the EEXIST error from UFFD_COPY.
                uffd_copy_all(uffd, page_addr, slice, false)?;
                // Do not erase each chunk of pages from disk on swap_in. The whole file will be
                // truncated when swap_in is completed. Even if swap_in is aborted, the remaining
                // disk contents help the trimming optimization on swap_out.
                let munlocked_pages = file.clear_range(idx_range_in_file)?;
                region.swap_in_pages += pages;
                ctx.mlock_budget_pages += munlocked_pages;
                return Ok(pages);
            }
            // The file has remaining pages, but all regions have been consumed.
            return Err(Error::File(FileError::OutOfRange));
        }

        Ok(0)
    }
}

impl Drop for SwapInContext<'_> {
    fn drop(&mut self) {
        let mut ctx = self.ctx.lock();
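        // Unlock any pages of the swap file that are still mlock(2)ed for prefetching and restore
        // the full mlock budget.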
        if let Err(e) = ctx.file.clear_mlock() {
            panic!("failed to clear mlock: {e:?}");
        }
        ctx.mlock_budget_pages = bytes_to_pages(MLOCK_BUDGET);
    }
}

/// Context for the trim operation.
///
/// This drops 2 types of pages in the staging memory to reduce disk writes.
///
/// * Clean pages
///   * The pages which have been swapped out to the disk and have not been changed.
///   * Drop the pages in the staging memory and mark them as present in the swap file.
/// * Zero pages
///   * Drop the pages in the staging memory. The pages will be UFFD_ZEROed on page fault.
pub struct TrimContext<'a> {
    ctx: &'a Mutex<PageHandleContext<'a>>,
    cur_region: usize,
    cur_page: usize,
    /// The page idx range of pages which have been stored in the swap file.
    next_data_in_file: Range<usize>,
    clean_pages: usize,
    zero_pages: usize,
}

impl TrimContext<'_> {
    /// Trim pages in the staging memory.
    ///
    /// This returns the number of pages trimmed, or `None` once it has traversed all pages in the
    /// staging memory.
    ///
    /// # Arguments
    ///
    /// `max_pages` - The maximum number of pages to be compared.
    pub fn trim_pages(&mut self, max_pages: usize) -> anyhow::Result<Option<usize>> {
        let mut ctx = self.ctx.lock();
        if self.cur_region >= ctx.regions.len() {
            return Ok(None);
        }
        let PageHandleContext { regions, file, .. } = &mut *ctx;
        let region = &mut regions[self.cur_region];
        let mut n_trimmed = 0;

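        // Examine at most `max_pages` pages per call, resuming from `self.cur_page` in
        // `self.cur_region`.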
        for _ in 0..max_pages {
            if let Some(slice_in_staging) = region
                .staging_memory
                .page_content(self.cur_page)
                .context("get page of staging memory")?
            {
                let idx_range = self.cur_page..self.cur_page + 1;
                let idx_in_file = idx_range.start + region.base_page_idx_in_file;

                // Check for a zero page in the staging memory first. If the page is non-zero and
                // has not been changed, the zero check is useless, but it costs less than the file
                // I/O for pages which were in the swap file and are now zero.
                // Check both types of page in the same loop to utilize the CPU cache for the
                // staging memory.
                if slice_in_staging.is_all_zero() {
                    region
                        .staging_memory
                        .clear_range(idx_range.clone())
                        .context("clear a page in staging memory")?;
                    // The page is on the swap file as well.
                    let munlocked_pages = file
                        .free_range(idx_in_file..idx_in_file + 1)
                        .context("clear a page in swap file")?;
                    if munlocked_pages != 0 {
                        // Only one of swap-in or trimming runs at a time, so this is not an
                        // expected path. Just log an error because leaking mlock_budget_pages is
                        // not fatal.
                        error!("pages are mlock(2)ed while trimming");
                    }
                    n_trimmed += 1;
                    self.zero_pages += 1;
                } else if let Some(slice_in_file) = file.page_content(idx_in_file, true)? {
                    // Compare the page with the previous content of the page on the disk.
                    if slice_in_staging == slice_in_file {
                        region
                            .staging_memory
                            .clear_range(idx_range.clone())
                            .context("clear a page in staging memory")?;
                        file.mark_as_present(idx_in_file)?;
                        n_trimmed += 1;
                        self.clean_pages += 1;
                    }
                }
            }

            self.cur_page += 1;
            if self.cur_page >= region.num_pages {
                self.cur_region += 1;
                self.cur_page = 0;
                self.next_data_in_file = 0..0;
                break;
            }
        }

        Ok(Some(n_trimmed))
    }

    /// Total trimmed clean pages.
    pub fn trimmed_clean_pages(&self) -> usize {
        self.clean_pages
    }

    /// Total trimmed zero pages.
    pub fn trimmed_zero_pages(&self) -> usize {
        self.zero_pages
    }
}