// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

//! Virtio version of a Linux pvclock clocksource.
//!
//! Driver source is here:
//! <https://android.googlesource.com/kernel/common/+/ebaa2c516811825b141de844cee7a38653058ef5/drivers/virtio/virtio_pvclock.c>
//!
//! # Background
//!
//! Userland applications often rely on CLOCK_MONOTONIC to be relatively continuous.
//! Large jumps can signal problems (e.g., triggering Android watchdogs).
//! This assumption breaks down in virtualized environments, where a VM's suspension isn't
//! inherently linked to the guest kernel's concept of "suspend".
//! Since fixing all userland code is impractical, virtio-pvclock allows the VMM and guest kernel
//! to collaborate on emulating the expected clock behavior around suspend/resume.
//!
//! # How it works
//!
//! ## Core functions of the virtio-pvclock device:
//!
//! 1. Adjusts hardware clocksource offsets to make the guest clocks appear suspended when the VM is
//!    suspended.
//!   - This is achieved through the pvclock mechanism implemented in x86 KVM used by kvm-clock.
//! 2. Provides the guest kernel with the duration of VM suspension, allowing the guest to adjust
//!    its clocks accordingly.
//!   - Since the offset between CLOCK_MONOTONIC and CLOCK_BOOTTIME is maintained by the guest
//!     kernel, applying the adjustment is the guest driver's responsibility.
//!
//! ## Expected guest clock behaviors when virtio-pvclock is enabled
//!
//! - Monotonicity of CLOCK_MONOTONIC and CLOCK_BOOTTIME is maintained.
//! - CLOCK_MONOTONIC will not include the time that passes while crosvm is suspended (from its
//!   run mode perspective).
//! - CLOCK_BOOTTIME will be adjusted to include the time that passes while crosvm is suspended.
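//!
//! As an illustrative example (the numbers are invented here, not taken from the driver): if
//! crosvm is suspended for 10 seconds while the guest considers itself running, then after resume
//! the guest's CLOCK_MONOTONIC reads as if those 10 seconds never happened, while the guest
//! driver advances CLOCK_BOOTTIME by the `suspend_time_ns` reported in the device config so that
//! it reflects the full 10 seconds.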
//!
//! # Why it is needed
//!
//! Because the existing solution (kvm-clock) does not cover some of the expectations described
//! above.
//!
//! kvm-clock lets the host manage the offsets of CLOCK_MONOTONIC.
//! However, it doesn't address the difference between CLOCK_BOOTTIME and CLOCK_MONOTONIC across
//! the host's suspend/resume, as it is mainly designed to keep CLOCK_REALTIME in sync.

#[cfg(target_arch = "aarch64")]
use std::arch::asm;
use std::collections::BTreeMap;
use std::mem::replace;
use std::mem::size_of;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::time::Duration;

use anyhow::anyhow;
use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
use base::error;
use base::info;
use base::warn;
use base::AsRawDescriptor;
#[cfg(windows)]
use base::CloseNotifier;
use base::Error;
use base::Event;
use base::EventToken;
use base::RawDescriptor;
use base::ReadNotifier;
use base::Tube;
use base::WaitContext;
use base::WorkerThread;
use chrono::DateTime;
use chrono::Utc;
use data_model::Le32;
use data_model::Le64;
use serde::Deserialize;
use serde::Serialize;
use snapshot::AnySnapshot;
use vm_control::PvClockCommand;
use vm_control::PvClockCommandResponse;
use vm_memory::GuestAddress;
use vm_memory::GuestMemory;
use vm_memory::GuestMemoryError;
use zerocopy::FromBytes;
use zerocopy::Immutable;
use zerocopy::IntoBytes;
use zerocopy::KnownLayout;

use super::copy_config;
use super::DeviceType;
use super::Interrupt;
use super::Queue;
use super::VirtioDevice;

// Pvclock has one virtio queue: set_pvclock_page
const QUEUE_SIZE: u16 = 1;
const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE];

// pvclock flag bits
const PVCLOCK_TSC_STABLE_BIT: u8 = 1;
const PVCLOCK_GUEST_STOPPED: u8 = 2;
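// NB: these values are assumed to mirror PVCLOCK_TSC_STABLE_BIT (1 << 0) and
// PVCLOCK_GUEST_STOPPED (1 << 1) from the Linux pvclock ABI
// (arch/x86/include/asm/pvclock-abi.h).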

// The feature bitmap for virtio pvclock
const VIRTIO_PVCLOCK_F_TSC_STABLE: u64 = 0; // TSC is stable
const VIRTIO_PVCLOCK_F_INJECT_SLEEP: u64 = 1; // Inject sleep for suspend
const VIRTIO_PVCLOCK_F_CLOCKSOURCE_RATING: u64 = 2; // Use device clocksource rating

// Status values for a virtio_pvclock request.
const VIRTIO_PVCLOCK_S_OK: u8 = 0;
const VIRTIO_PVCLOCK_S_IOERR: u8 = 1;

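// NB: 450 is above the ratings that kvm-clock (400) and the raw TSC clocksource (300) are
// believed to use in upstream Linux, so the guest should prefer virtio-pvclock when
// VIRTIO_PVCLOCK_F_CLOCKSOURCE_RATING is negotiated. (The upstream rating values are an
// assumption, not something stated in this file.)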
const VIRTIO_PVCLOCK_CLOCKSOURCE_RATING: u32 = 450;

#[cfg(target_arch = "x86_64")]
fn read_clock_counter() -> u64 {
    // SAFETY: rdtsc is unprivileged and has no side effects.
    unsafe { std::arch::x86_64::_rdtsc() }
}

#[cfg(target_arch = "aarch64")]
fn read_clock_counter() -> u64 {
    let mut x: u64;
    // SAFETY: This instruction has no side effect apart from storing the current timestamp counter
    //         into the specified register.
    unsafe {
        asm!("mrs {x}, cntvct_el0",
            x = out(reg) x,
        );
    }
    x
}

/// Calculate a (multiplier, shift) pair for scaled math of clocks.
/// The values are passed on to `pvclock_scale_delta` in the guest kernel and satisfy the following
/// (approximate) equality:
/// `n * scaled_hz / base_hz ~= ((n << shift) * multiplier) >> 32`
/// The logic here is roughly based on `kvm_get_time_scale` (but simplified as we can use u128).
/// # Arguments
/// * `scaled_hz` - Frequency to convert to. When dealing with clocksources, this is NSEC_PER_SEC.
/// * `base_hz` - Frequency to convert from. When dealing with clocksources, this is the counter
///   frequency.
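///
/// # Example
/// A worked instance (values chosen for illustration; a sketch, not a normative test vector):
/// converting a 2.5 GHz counter to nanoseconds via
/// `freq_scale_shift(1_000_000_000, 2_500_000_000)` yields `(3435973836, -1)`, i.e. a 0.32
/// fixed-point multiplier of ~0.8 applied after shifting the counter right by one, so
/// `((n >> 1) * 3435973836) >> 32 ~= 0.4 * n`, which matches `n * 10^9 / (2.5 * 10^9)`.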
fn freq_scale_shift(scaled_hz: u64, base_hz: u64) -> (u32, i8) {
    assert!(scaled_hz > 0 && base_hz > 0);
    // We treat `multiplier` as a 0.32 fixed-point number by folding the >> 32 into its definition.
    // With this definition, `multiplier` can be calculated as `(scaled_hz / base_hz) >> shift`
    // with a corresponding `shift`.
    //
    // The value of `shift` should satisfy a few constraints:
    // 1. `multiplier` needs to be < 1.0 due to the representable range of 0.32 fixed-point (maximum
    //    (2^32-1)/2^32).
    // 2. `shift` should be minimized because `pvclock_scale_delta` applies `shift` on the 64-bit
    //    TSC value before extending to 128-bit and large positive shifts reduce the TSC rollover
    //    time.
    //
    // Minimizing `shift` means maximizing `multiplier`. From the < 1.0 constraint, this is
    // equivalent to having a multiplier within [0.5, 1.0). The logic below picks a multiplier
    // satisfying that, while updating `shift` accordingly when we double or halve the multiplier.
    let mut shift = 0;
    // Convert to u128 so that overflow handling becomes much easier.
    let mut scaled_hz = scaled_hz as u128;
    let mut base_hz = base_hz as u128;
    if scaled_hz >= base_hz {
        while scaled_hz >= base_hz {
            // `multiplier` >= 1.0; iteratively scale it down
            // scaled_hz is at most 64 bits, so after this loop base_hz is at most 65 bits.
            base_hz <<= 1;
            shift += 1;
        }
    } else {
        while base_hz > 2 * scaled_hz {
            // `multiplier` < 0.5; iteratively scale it up
            // base_hz is at most 64 bits. If the loop condition passes then scaled_hz is at most 63
            // bits, otherwise at most 64 bits. Post-loop scaled_hz is at most 64 bits.
            scaled_hz <<= 1;
            shift -= 1;
        }
    }
    // From above, we know that the values are at most 65 bits. This provides sufficient headroom
    // for scaled_hz << 32 below.
    assert!(base_hz < (1u128 << 65) && scaled_hz < (1u128 << 65));
    let mult: u32 = ((scaled_hz << 32) / base_hz)
        .try_into()
        .expect("should not overflow");
    (mult, shift)
}

// The config structure exposed to the guest to tell it how much suspend time should be
// injected into the guest's CLOCK_BOOTTIME.
#[derive(Debug, Clone, Copy, Default, FromBytes, Immutable, IntoBytes, KnownLayout)]
#[allow(non_camel_case_types)]
#[repr(C)]
struct virtio_pvclock_config {
    // Total duration the VM has been paused while the guest kernel is not in the suspended state
    // (from the power management and timekeeping perspective).
    suspend_time_ns: Le64,
    // Device-suggested rating of the pvclock clocksource.
    clocksource_rating: Le32,
    padding: u32,
}

#[derive(Debug, Clone, Copy, Default, FromBytes, Immutable, IntoBytes, KnownLayout)]
#[allow(non_camel_case_types)]
#[repr(C)]
struct virtio_pvclock_set_pvclock_page_req {
    // Physical address of pvclock page.
    pvclock_page_pa: Le64,
    // Current system time.
    system_time: Le64,
    // Current tsc value.
    tsc_timestamp: Le64,
    // Status of this request, one of VIRTIO_PVCLOCK_S_*.
    status: u8,
    padding: [u8; 7],
}

// Data structure for interacting with pvclock shared memory.
struct PvclockSharedData {
    mem: GuestMemory,
    seqlock_addr: GuestAddress,
    tsc_suspended_delta_addr: GuestAddress,
    tsc_frequency_multiplier_addr: GuestAddress,
    tsc_frequency_shift_addr: GuestAddress,
    flags_addr: GuestAddress,
}

impl PvclockSharedData {
    pub fn new(mem: GuestMemory, addr: GuestAddress) -> Self {
        PvclockSharedData {
            mem,
            // The addresses of the various fields that we need to modify are relative to the
            // base of the pvclock page. For reference, see the pvclock_vcpu_time_info struct.
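            // For convenience, the upstream Linux layout of pvclock_vcpu_time_info is believed
            // to be the following (offsets in bytes; a reference assumption, not quoted from
            // this tree):
            //    0: u32 version (the seqlock)
            //    4: u32 pad0
            //    8: u64 tsc_timestamp
            //   16: u64 system_time
            //   24: u32 tsc_to_system_mul
            //   28: i8  tsc_shift
            //   29: u8  flags
            //   30: u8  pad[2]  (32 bytes total)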
            seqlock_addr: addr,
            tsc_suspended_delta_addr: addr.unchecked_add(8),
            tsc_frequency_multiplier_addr: addr.unchecked_add(24),
            tsc_frequency_shift_addr: addr.unchecked_add(28),
            flags_addr: addr.unchecked_add(29),
        }
    }

    /// Only the seqlock_addr is needed to re-create this struct at restore
    /// time, so that is all our snapshot contains.
    fn snapshot(&self) -> GuestAddress {
        self.seqlock_addr
    }

    /// Set all fields to zero.
    pub fn zero_fill(&mut self) -> Result<()> {
        // The pvclock data structure is 32 bytes long, so we write 32 bytes of 0s
        self.mem
            .write_all_at_addr(&[0u8; 32], self.seqlock_addr)
            .context("failed to zero fill the pvclock shared data")
    }

    pub fn increment_seqlock(&mut self) -> Result<()> {
        // TODO (b/264931437): reads and writes using read/write_obj_from/at_addr are not
        //  guaranteed to be atomic. Although this should not be a problem for the seqlock
        //  or the other fields in the pvclock shared data (which are protected via the seqlock)
        //  we might want to update these calls to be as atomic as possible if/when we have
        //  the ability to do so, just as a general cleanup and to be consistent.
        let value = self
            .mem
            .read_obj_from_addr::<u32>(self.seqlock_addr)
            .context("failed to read seqlock value")?;
        self.mem
            .write_obj_at_addr(value.wrapping_add(1), self.seqlock_addr)
            .context("failed to write seqlock value")
    }

    pub fn set_tsc_suspended_delta(&mut self, delta: u64) -> Result<()> {
        self.mem
            .write_obj_at_addr(delta, self.tsc_suspended_delta_addr)
            .context("failed to write tsc suspended delta")
    }

    pub fn set_tsc_frequency(&mut self, frequency: u64) -> Result<()> {
        let (multiplier, shift): (u32, i8) = freq_scale_shift(1_000_000_000, frequency);

        self.mem
            .write_obj_at_addr(multiplier, self.tsc_frequency_multiplier_addr)
            .context("failed to write tsc frequency multiplier")?;
        self.mem
            .write_obj_at_addr(shift, self.tsc_frequency_shift_addr)
            .context("failed to write tsc frequency shift")
    }

    pub fn enable_pvclock_flags(&mut self, flags: u8) -> Result<()> {
        let value = self
            .mem
            .read_obj_from_addr::<u8>(self.flags_addr)
            .context("failed to read flags")?;
        self.mem
            .write_obj_at_addr(value | flags, self.flags_addr)
            .context("failed to write flags")
    }
}

/// Serializable part of the [PvClock] struct which will be used by the virtio_snapshot / restore.
#[derive(Serialize, Deserialize)]
struct PvClockState {
    tsc_frequency: u64,
    /// If the device is sleeping, a [PvClockWorkerSnapshot] that can re-create the worker
    /// will be stored here. (We can't just store the worker itself as it contains an object
    /// tree with references to [GuestMemory].)
    paused_main_worker: Option<PvClockWorkerSnapshot>,
    /// The total time the vm has been suspended. This is in an `Arc<AtomicU64>` because it's set
    /// by the PvClockWorker thread but read by PvClock from the mmio bus in the main thread.
    total_suspend_ns: Arc<AtomicU64>,
    features: u64,
    acked_features: u64,
}

/// An enum to keep dynamic state of pvclock workers in a type safe manner.
enum PvClockWorkerState {
    /// Idle means no worker is running.
    /// This tube is for communicating with this device from the crosvm threads.
    Idle(Tube),
    /// A stub worker to respond to pvclock commands when the device is not activated yet.
    Stub(WorkerThread<StubWorkerReturn>),
    /// A main worker to respond to pvclock commands while the device is active.
    Main(WorkerThread<MainWorkerReturn>),
    /// None is used only for handling transitional state between the states above.
    None,
}
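// A summary of the expected transitions, as implemented by the start_*/stop_*/switch_to_*
// methods below (a reading of this file's code, not a spec from elsewhere):
//   Idle -> Stub   (start_stub_worker, e.g. via on_device_sandboxed)
//   Stub -> Idle   (stop_stub_worker)
//   Idle -> Main   (start_main_worker, e.g. via virtio_wake)
//   Main -> Idle   (stop_main_worker or virtio_sleep)
//   Stub -> Main   (switch_to_main_worker, e.g. via activate)
//   Main -> Stub   (switch_to_stub_worker, e.g. via reset)
// `None` only appears transiently inside these methods while ownership is being moved.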

/// A struct that represents the virtio-pvclock device.
pub struct PvClock {
    state: PvClockState,
    worker_state: PvClockWorkerState,
}

impl PvClock {
    pub fn new(base_features: u64, tsc_frequency: u64, suspend_tube: Tube) -> Self {
        let state = PvClockState {
            tsc_frequency,
            paused_main_worker: None,
            total_suspend_ns: Arc::new(AtomicU64::new(0)),
            features: base_features
                | 1 << VIRTIO_PVCLOCK_F_TSC_STABLE
                | 1 << VIRTIO_PVCLOCK_F_INJECT_SLEEP
                | 1 << VIRTIO_PVCLOCK_F_CLOCKSOURCE_RATING,
            acked_features: 0,
        };
        PvClock {
            state,
            worker_state: PvClockWorkerState::Idle(suspend_tube),
        }
    }

    fn get_config(&self) -> virtio_pvclock_config {
        virtio_pvclock_config {
            suspend_time_ns: self.state.total_suspend_ns.load(Ordering::SeqCst).into(),
            clocksource_rating: VIRTIO_PVCLOCK_CLOCKSOURCE_RATING.into(),
            padding: 0,
        }
    }

    /// Prefer switch_to_*_worker over calling this directly, to keep the state transitions
    /// consistent.
    fn start_main_worker(
        &mut self,
        interrupt: Interrupt,
        pvclock_worker: PvClockWorker,
        mut queues: BTreeMap<usize, Queue>,
    ) -> anyhow::Result<()> {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        if let PvClockWorkerState::Idle(suspend_tube) = last_state {
            if queues.len() != QUEUE_SIZES.len() {
                self.worker_state = PvClockWorkerState::Idle(suspend_tube);
                return Err(anyhow!(
                    "expected {} queues, got {}",
                    QUEUE_SIZES.len(),
                    queues.len()
                ));
            }
            let set_pvclock_page_queue = queues.remove(&0).unwrap();
            self.worker_state = PvClockWorkerState::Main(WorkerThread::start(
                "virtio_pvclock".to_string(),
                move |kill_evt| {
                    run_main_worker(
                        pvclock_worker,
                        set_pvclock_page_queue,
                        suspend_tube,
                        interrupt,
                        kill_evt,
                    )
                },
            ));
        } else {
            panic!("Invalid state transition");
        }
        Ok(())
    }

    /// Prefer switch_to_*_worker over calling this directly, to keep the state transitions
    /// consistent.
    fn start_stub_worker(&mut self) {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        self.worker_state = if let PvClockWorkerState::Idle(suspend_tube) = last_state {
            PvClockWorkerState::Stub(WorkerThread::start(
                "virtio_pvclock_stub".to_string(),
                move |kill_evt| run_stub_worker(suspend_tube, kill_evt),
            ))
        } else {
            panic!("Invalid state transition");
        };
    }

    /// Prefer switch_to_*_worker over calling this directly, to keep the state transitions
    /// consistent.
    fn stop_stub_worker(&mut self) {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        self.worker_state = if let PvClockWorkerState::Stub(stub_worker_thread) = last_state {
            let stub_worker_ret = stub_worker_thread.stop();
            PvClockWorkerState::Idle(stub_worker_ret.suspend_tube)
        } else {
            panic!("Invalid state transition");
        }
    }

    /// Prefer switch_to_*_worker over calling this directly, to keep the state transitions
    /// consistent.
    fn stop_main_worker(&mut self) {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        if let PvClockWorkerState::Main(main_worker_thread) = last_state {
            let main_worker_ret = main_worker_thread.stop();
            self.worker_state = PvClockWorkerState::Idle(main_worker_ret.suspend_tube);
            // The returned queue is dropped here; a snapshot of the worker is kept so that it
            // can be re-created on wake.
            self.state.paused_main_worker = Some(main_worker_ret.worker.into());
        } else {
            panic!("Invalid state transition");
        }
    }

    fn switch_to_stub_worker(&mut self) {
        self.stop_main_worker();
        self.start_stub_worker();
    }

    fn switch_to_main_worker(
        &mut self,
        interrupt: Interrupt,
        pvclock_worker: PvClockWorker,
        queues: BTreeMap<usize, Queue>,
    ) -> anyhow::Result<()> {
        self.stop_stub_worker();
        self.start_main_worker(interrupt, pvclock_worker, queues)
    }
}

/// Represents a moment in time including the TSC counter value at that time.
#[derive(Serialize, Deserialize, Clone)]
struct PvclockInstant {
    time: DateTime<Utc>,
    tsc_value: u64,
}

/// The unique data retained by [PvClockWorker] which can be used to re-create
/// an identical worker.
#[derive(Serialize, Deserialize, Clone)]
struct PvClockWorkerSnapshot {
    suspend_time: Option<PvclockInstant>,
    total_suspend_tsc_delta: u64,
    pvclock_shared_data_base_address: Option<GuestAddress>,
}

impl From<PvClockWorker> for PvClockWorkerSnapshot {
    fn from(worker: PvClockWorker) -> Self {
        PvClockWorkerSnapshot {
            suspend_time: worker.suspend_time,
            total_suspend_tsc_delta: worker.total_suspend_tsc_delta,
            pvclock_shared_data_base_address: worker
                .pvclock_shared_data
                .map(|pvclock| pvclock.snapshot()),
        }
    }
}

/// Worker struct for the virtio-pvclock device.
///
/// Handles virtio requests, storing information about suspend/resume, adjusting the
/// pvclock data in shared memory, and injecting suspend durations via config
/// changes.
struct PvClockWorker {
    tsc_frequency: u64,
    // The moment the last suspend occurred.
    suspend_time: Option<PvclockInstant>,
    // The total time the vm has been suspended. This is in an Arc<AtomicU64> because it's set
    // by the PvClockWorker thread but read by PvClock from the mmio bus in the main thread.
    total_injected_ns: Arc<AtomicU64>,
    // The total change in the TSC value over suspensions.
    total_suspend_tsc_delta: u64,
    // Pvclock shared data.
    pvclock_shared_data: Option<PvclockSharedData>,
    mem: GuestMemory,
}

impl PvClockWorker {
    pub fn new(tsc_frequency: u64, total_injected_ns: Arc<AtomicU64>, mem: GuestMemory) -> Self {
        PvClockWorker {
            tsc_frequency,
            suspend_time: None,
            total_injected_ns,
            total_suspend_tsc_delta: 0,
            pvclock_shared_data: None,
            mem,
        }
    }

    fn from_snapshot(
        tsc_frequency: u64,
        total_injected_ns: Arc<AtomicU64>,
        snap: PvClockWorkerSnapshot,
        mem: GuestMemory,
    ) -> Self {
        PvClockWorker {
            tsc_frequency,
            suspend_time: snap.suspend_time,
            total_injected_ns,
            total_suspend_tsc_delta: snap.total_suspend_tsc_delta,
            pvclock_shared_data: snap
                .pvclock_shared_data_base_address
                .map(|addr| PvclockSharedData::new(mem.clone(), addr)),
            mem,
        }
    }

    /// Initialize the pvclock for initial boot. We assume that the system time of 0 corresponds
    /// to the tsc time of 0, so we do not set these. We set the tsc frequency based on the vcpu
    /// tsc frequency and we set PVCLOCK_TSC_STABLE_BIT in flags to tell the guest that it's
    /// safe to use vcpu0's pvclock page for use by the vdso. The order of writing the different
    /// fields doesn't matter at this point, but does matter when updating.
    fn set_pvclock_page(&mut self, addr: u64) -> Result<()> {
        if self.pvclock_shared_data.is_some() {
            return Err(Error::new(libc::EALREADY)).context("pvclock page already set");
        }

        let mut shared_data = PvclockSharedData::new(self.mem.clone(), GuestAddress(addr));

        // set all fields to 0 first
        shared_data.zero_fill()?;

        shared_data.set_tsc_frequency(self.tsc_frequency)?;
        shared_data.enable_pvclock_flags(PVCLOCK_TSC_STABLE_BIT)?;

        self.pvclock_shared_data = Some(shared_data);
        Ok(())
    }

    pub fn suspend(&mut self) {
        if self.suspend_time.is_some() {
            warn!("Suspend time already set, ignoring new suspend time");
            return;
        }
        self.suspend_time = Some(PvclockInstant {
            time: Utc::now(),
            tsc_value: read_clock_counter(),
        });
    }

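    /// Publishes the suspended duration to the guest, bracketing the writes with seqlock
    /// increments so a concurrent reader never observes a torn update. For context, the
    /// guest-side reader is expected to follow the usual pvclock protocol, roughly (a sketch of
    /// the upstream Linux reader, not code from this file):
    ///
    /// ```text
    /// loop {
    ///     v1 = version;                       // the seqlock; odd = update in progress
    ///     read_barrier();
    ///     copy tsc_timestamp, mul, shift, flags, ...;
    ///     read_barrier();
    ///     v2 = version;
    ///     if v1 == v2 && v1 is even { break; }
    /// }
    /// ```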
    pub fn resume(&mut self) -> Result<u64> {
        // First, increment the sequence lock by 1 before writing to the pvclock page.
        self.increment_pvclock_seqlock()?;

        // The guest makes sure there are memory barriers in between reads of the seqlock and
        // other fields; we should make sure there are memory barriers in between writes of the
        // seqlock and writes to other fields.
        std::sync::atomic::fence(Ordering::SeqCst);

        // Set the guest_stopped_bit and tsc suspended delta in pvclock struct. We only need to set
        // the bit, the guest will unset it once the guest has handled the stoppage.
        // We get the result here because we want to call increment_pvclock_seqlock regardless of
        // the result of these calls.
        let result = self
            .set_guest_stopped_bit()
            .and_then(|_| self.set_suspended_time());

        // The guest makes sure there are memory barriers in between reads of the seqlock and
        // other fields; we should make sure there are memory barriers in between writes of the
        // seqlock and writes to other fields.
        std::sync::atomic::fence(Ordering::SeqCst);

        // Do a final increment once changes are done.
        self.increment_pvclock_seqlock()?;

        result
    }

    fn get_suspended_duration(suspend_time: &PvclockInstant) -> Duration {
        match Utc::now().signed_duration_since(suspend_time.time).to_std() {
            Ok(duration) => duration,
            Err(e) => {
                error!(
                    "pvclock found suspend time in the future (was the host \
                    clock adjusted?). Guest boot/realtime clock may now be \
                    incorrect. Details: {}",
                    e
                );
                Duration::ZERO
            }
        }
    }

    fn set_suspended_time(&mut self) -> Result<u64> {
        let (this_suspend_duration, this_suspend_tsc_delta) =
            if let Some(suspend_time) = self.suspend_time.take() {
                (
                    Self::get_suspended_duration(&suspend_time),
                    // NB: This calculation may wrap around, as TSC can be reset to zero when
                    // the device has resumed from the "deep" suspend state (it may not happen for
                    // s2idle cases). It also happens when the tsc value itself wraps.
                    read_clock_counter().wrapping_sub(suspend_time.tsc_value),
                )
            } else {
                return Err(Error::new(libc::ENOTSUP))
                    .context("Cannot set suspend time because suspend was never called");
            };

        // update the total tsc delta during all suspends
        // NB: This calculation may wrap around, as the accumulated suspend time can exceed the
        // u64 range.
        self.total_suspend_tsc_delta = self
            .total_suspend_tsc_delta
            .wrapping_add(this_suspend_tsc_delta);

        // save tsc_suspended_delta to shared memory
        self.pvclock_shared_data
            .as_mut()
            .ok_or(
                anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
            )?
            .set_tsc_suspended_delta(self.total_suspend_tsc_delta)?;

        info!(
            "set total suspend tsc delta to {}",
            self.total_suspend_tsc_delta
        );

        // update total suspend ns
        self.total_injected_ns
            .fetch_add(this_suspend_duration.as_nanos() as u64, Ordering::SeqCst);

        Ok(self.total_suspend_tsc_delta)
    }

    fn increment_pvclock_seqlock(&mut self) -> Result<()> {
        self.pvclock_shared_data
            .as_mut()
            .ok_or(
                anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
            )?
            .increment_seqlock()
    }

    fn set_guest_stopped_bit(&mut self) -> Result<()> {
        self.pvclock_shared_data
            .as_mut()
            .ok_or(
                anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
            )?
            .enable_pvclock_flags(PVCLOCK_GUEST_STOPPED)
    }
}

fn pvclock_response_error_from_anyhow(error: anyhow::Error) -> base::Error {
    for cause in error.chain() {
        if let Some(e) = cause.downcast_ref::<base::Error>() {
            return *e;
        }

        if let Some(e) = cause.downcast_ref::<GuestMemoryError>() {
            return match e {
                // Two kinds of GuestMemoryError contain base::Error
                GuestMemoryError::MemoryAddSealsFailed(e) => *e,
                GuestMemoryError::MemoryCreationFailed(e) => *e,
                // Otherwise return EINVAL
                _ => Error::new(libc::EINVAL),
            };
        }
    }
    // Unknown base error
    Error::new(libc::EFAULT)
}

struct StubWorkerReturn {
    suspend_tube: Tube,
}

/// A stub worker to respond to any requests when the device is inactive.
fn run_stub_worker(suspend_tube: Tube, kill_evt: Event) -> StubWorkerReturn {
    #[derive(EventToken, Debug)]
    enum Token {
        SomePvClockRequest,
        Kill,
    }
    let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
        (suspend_tube.get_read_notifier(), Token::SomePvClockRequest),
        // TODO(b/242743502): Can also close on Tube closure for Unix once CloseNotifier is
        // implemented for Tube.
        #[cfg(windows)]
        (suspend_tube.get_close_notifier(), Token::Kill),
        (&kill_evt, Token::Kill),
    ]) {
        Ok(wait_ctx) => wait_ctx,
        Err(e) => {
            error!("failed creating WaitContext: {}", e);
            return StubWorkerReturn { suspend_tube };
        }
    };
    'wait: loop {
        let events = match wait_ctx.wait() {
            Ok(v) => v,
            Err(e) => {
                error!("failed polling for events: {}", e);
                break;
            }
        };
        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                Token::SomePvClockRequest => {
                    match suspend_tube.recv::<PvClockCommand>() {
                        Ok(req) => req,
                        Err(e) => {
                            error!("failed to receive request: {}", e);
                            continue;
                        }
                    };
                    if let Err(e) = suspend_tube.send(&PvClockCommandResponse::DeviceInactive) {
                        error!("error sending PvClockCommandResponse: {}", e);
                    }
                }
                Token::Kill => {
                    break 'wait;
                }
            }
        }
    }
    StubWorkerReturn { suspend_tube }
}

struct MainWorkerReturn {
    worker: PvClockWorker,
    set_pvclock_page_queue: Queue,
    suspend_tube: Tube,
}

// TODO(b/237300012): asyncify this device.
/// A worker to process PvClockCommand requests
fn run_main_worker(
    mut worker: PvClockWorker,
    mut set_pvclock_page_queue: Queue,
    suspend_tube: Tube,
    interrupt: Interrupt,
    kill_evt: Event,
) -> MainWorkerReturn {
    #[derive(EventToken)]
    enum Token {
        SetPvClockPageQueue,
        SuspendResume,
        Kill,
    }

    let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
        (set_pvclock_page_queue.event(), Token::SetPvClockPageQueue),
        (suspend_tube.get_read_notifier(), Token::SuspendResume),
        // TODO(b/242743502): Can also close on Tube closure for Unix once CloseNotifier is
        // implemented for Tube.
        #[cfg(windows)]
        (suspend_tube.get_close_notifier(), Token::Kill),
        (&kill_evt, Token::Kill),
    ]) {
        Ok(pc) => pc,
        Err(e) => {
            error!("failed creating WaitContext: {}", e);
            return MainWorkerReturn {
                suspend_tube,
                set_pvclock_page_queue,
                worker,
            };
        }
    };

    'wait: loop {
        let events = match wait_ctx.wait() {
            Ok(v) => v,
            Err(e) => {
                error!("failed polling for events: {}", e);
                break;
            }
        };

        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                Token::SetPvClockPageQueue => {
                    let _ = set_pvclock_page_queue.event().wait();
                    let desc_chain = match set_pvclock_page_queue.pop() {
                        Some(desc_chain) => desc_chain,
                        None => {
                            // Spurious doorbells from the driver are permitted
                            // by the virtio spec (v1.3; section 2.9).
                            continue;
                        }
                    };

                    // This device does not follow the virtio spec requirements for device-readable
                    // vs. device-writable descriptors, so we can't use `Reader`/`Writer`. Pick the
                    // first descriptor from the chain and assume the whole req structure is
                    // contained within it.
                    let desc = desc_chain
                        .reader
                        .get_remaining_regions()
                        .chain(desc_chain.writer.get_remaining_regions())
                        .next()
                        .unwrap();

                    let len = if desc.len < size_of::<virtio_pvclock_set_pvclock_page_req>() {
                        error!("pvclock descriptor too short");
                        0
                    } else {
                        let addr = GuestAddress(desc.offset);
                        let mut req: virtio_pvclock_set_pvclock_page_req = match worker
                            .mem
                            .read_obj_from_addr(addr)
                        {
                            Ok(req) => req,
                            Err(e) => {
                                error!("failed to read request from set_pvclock_page queue: {}", e);
                                continue;
                            }
                        };

                        req.status = match worker.set_pvclock_page(req.pvclock_page_pa.into()) {
                            Err(e) => {
                                error!("failed to set pvclock page: {:#}", e);
                                VIRTIO_PVCLOCK_S_IOERR
                            }
                            Ok(_) => VIRTIO_PVCLOCK_S_OK,
                        };

                        if let Err(e) = worker.mem.write_obj_at_addr(req, addr) {
                            error!("failed to write set_pvclock_page status: {}", e);
                            continue;
                        }

                        desc.len as u32
                    };

                    set_pvclock_page_queue.add_used_with_bytes_written(desc_chain, len);
                    set_pvclock_page_queue.trigger_interrupt();
                }
                Token::SuspendResume => {
                    let req = match suspend_tube.recv::<PvClockCommand>() {
                        Ok(req) => req,
                        Err(e) => {
                            error!("failed to receive request: {}", e);
                            continue;
                        }
                    };

                    let resp = match req {
                        PvClockCommand::Suspend => {
                            worker.suspend();
                            PvClockCommandResponse::Ok
                        }
                        PvClockCommand::Resume => {
                            match worker.resume() {
                                Ok(total_suspended_ticks) => {
                                    // signal to the driver that the total_suspend_ns has changed
                                    interrupt.signal_config_changed();
                                    PvClockCommandResponse::Resumed {
                                        total_suspended_ticks,
                                    }
                                }
                                Err(e) => {
                                    error!("Failed to resume pvclock: {:#}", e);
                                    PvClockCommandResponse::Err(pvclock_response_error_from_anyhow(
                                        e,
                                    ))
                                }
                            }
                        }
                    };

                    if let Err(e) = suspend_tube.send(&resp) {
                        error!("error sending PvClockCommandResponse: {}", e);
                    }
                }
                Token::Kill => {
                    break 'wait;
                }
            }
        }
    }

    MainWorkerReturn {
        suspend_tube,
        set_pvclock_page_queue,
        worker,
    }
}

impl VirtioDevice for PvClock {
    fn keep_rds(&self) -> Vec<RawDescriptor> {
        if let PvClockWorkerState::Idle(suspend_tube) = &self.worker_state {
            vec![suspend_tube.as_raw_descriptor()]
        } else {
            Vec::new()
        }
    }

    fn device_type(&self) -> DeviceType {
        DeviceType::Pvclock
    }

    fn queue_max_sizes(&self) -> &[u16] {
        QUEUE_SIZES
    }

    fn features(&self) -> u64 {
        self.state.features
    }

    fn ack_features(&mut self, mut value: u64) {
        if value & !self.features() != 0 {
            warn!("virtio-pvclock got unknown feature ack {:x}", value);
            value &= self.features();
        }
        self.state.acked_features |= value;
    }

    fn read_config(&self, offset: u64, data: &mut [u8]) {
        copy_config(data, 0, self.get_config().as_bytes(), offset);
    }

    fn write_config(&mut self, offset: u64, data: &[u8]) {
        // Pvclock device doesn't expect a guest write to config
        warn!(
            "Unexpected write to virtio-pvclock config at offset {}: {:?}",
            offset, data
        );
    }

    fn activate(
        &mut self,
        mem: GuestMemory,
        interrupt: Interrupt,
        queues: BTreeMap<usize, Queue>,
    ) -> anyhow::Result<()> {
        let tsc_frequency = self.state.tsc_frequency;
        let total_suspend_ns = self.state.total_suspend_ns.clone();
        let worker = PvClockWorker::new(tsc_frequency, total_suspend_ns, mem);
        self.switch_to_main_worker(interrupt, worker, queues)
    }

    fn reset(&mut self) -> Result<()> {
        self.switch_to_stub_worker();
        Ok(())
    }

    fn virtio_sleep(&mut self) -> anyhow::Result<Option<BTreeMap<usize, Queue>>> {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        match last_state {
            PvClockWorkerState::Main(main_worker_thread) => {
                let main_worker_ret = main_worker_thread.stop();
                let mut queues = BTreeMap::new();
                queues.insert(0, main_worker_ret.set_pvclock_page_queue);
                self.worker_state = PvClockWorkerState::Idle(main_worker_ret.suspend_tube);
                self.state.paused_main_worker = Some(main_worker_ret.worker.into());
                Ok(Some(queues))
            }
            PvClockWorkerState::Stub(stub_worker_thread) => {
                let stub_ret = stub_worker_thread.stop();
                self.worker_state = PvClockWorkerState::Idle(stub_ret.suspend_tube);
                Ok(None)
            }
            PvClockWorkerState::Idle(suspend_tube) => {
                self.worker_state = PvClockWorkerState::Idle(suspend_tube);
                Ok(None)
            }
            PvClockWorkerState::None => panic!("invalid state transition"),
        }
    }

    fn virtio_wake(
        &mut self,
        queues_state: Option<(GuestMemory, Interrupt, BTreeMap<usize, Queue>)>,
    ) -> anyhow::Result<()> {
        if let Some((mem, interrupt, queues)) = queues_state {
            let worker_snap = self
                .state
                .paused_main_worker
                .take()
                .ok_or(anyhow!("a sleeping pvclock must have a paused worker"))?;
            let worker = PvClockWorker::from_snapshot(
                self.state.tsc_frequency,
                self.state.total_suspend_ns.clone(),
                worker_snap,
                mem,
            );
            // Call start_main_worker directly (rather than switch_to_main_worker) since no
            // worker is running at this point.
            self.start_main_worker(interrupt, worker, queues)?;
        } else {
            // If the device wasn't activated, we should bring up the stub worker since that's
            // what is supposed to be running for an un-activated device.
            self.start_stub_worker();
        }
        Ok(())
    }

    fn virtio_snapshot(&mut self) -> anyhow::Result<AnySnapshot> {
        AnySnapshot::to_any(&self.state).context("failed to serialize PvClockState")
    }

    fn virtio_restore(&mut self, data: AnySnapshot) -> anyhow::Result<()> {
        let state: PvClockState = AnySnapshot::from_any(data).context("error deserializing")?;
        if state.features != self.features() {
            bail!(
                "expected virtio_features to match, but they did not. Live: {:?}, snapshot {:?}",
                self.features(),
                state.features,
            );
        }
        // TODO(b/291346907): we assume that the TSC frequency has NOT changed
        // since the snapshot was made. Assuming we have not moved machines,
        // this is a reasonable assumption. We don't verify the frequency
        // because TSC calibration is noisy.
        self.state = state;
        Ok(())
    }

    fn on_device_sandboxed(&mut self) {
        self.start_stub_worker();
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::virtio::QueueConfig;

    const TEST_QUEUE_SIZE: u16 = 2048;

    fn make_interrupt() -> Interrupt {
        Interrupt::new_for_test()
    }

    fn create_pvclock_device() -> (Tube, PvClock) {
        let (host_tube, device_tube) = Tube::pair().unwrap();
        let mut pvclock_device = PvClock::new(0, 1e9 as u64, device_tube);

        // Simulate the device initialization to start the stub thread.
        // In the real case, on_device_sandboxed will be called after the device is sandboxed
        // (or at some point during the device initialization when the sandbox is disabled) to
        // allow devices to use multiple threads (as spawning new threads before sandboxing is
        // prohibited because of minijail's restrictions).
        pvclock_device.on_device_sandboxed();

        (host_tube, pvclock_device)
    }

    fn create_sleeping_device() -> (PvClock, GuestMemory, Tube) {
        let (_host_tube, mut pvclock_device) = create_pvclock_device();

        // The queue won't actually be used, so passing one that isn't
        // fully configured is fine.
        let mut fake_queue = QueueConfig::new(TEST_QUEUE_SIZE, 0);
        fake_queue.set_ready(true);
        let mem = GuestMemory::new(&[(GuestAddress(0), 0x10000)]).unwrap();
        let interrupt = make_interrupt();
        pvclock_device
            .activate(
                mem.clone(),
                interrupt.clone(),
                BTreeMap::from([(
                    0,
                    fake_queue
                        .activate(&mem, Event::new().unwrap(), interrupt)
                        .unwrap(),
                )]),
            )
            .expect("activate should succeed");
        let queues = pvclock_device
            .virtio_sleep()
            .expect("sleep should succeed")
            .expect("sleep should yield queues");
        assert_eq!(queues.len(), 1);
        assert_eq!(
            queues.get(&0).expect("queue must be present").size(),
            TEST_QUEUE_SIZE
        );
        assert!(pvclock_device.state.paused_main_worker.is_some());
        (pvclock_device, mem, _host_tube)
    }

    fn assert_wake_successful(pvclock_device: &mut PvClock, mem: &GuestMemory) {
        // We just create a new queue here, because it isn't actually accessed
        // by the device in these tests.
        let mut wake_queues = BTreeMap::new();
        let mut fake_queue = QueueConfig::new(TEST_QUEUE_SIZE, 0);
        let interrupt = make_interrupt();
        fake_queue.set_ready(true);
        wake_queues.insert(
            0,
            fake_queue
                .activate(mem, Event::new().unwrap(), interrupt.clone())
                .unwrap(),
        );
        let queues_state = (mem.clone(), interrupt, wake_queues);
        pvclock_device
            .virtio_wake(Some(queues_state))
            .expect("wake should succeed");
        assert!(pvclock_device.state.paused_main_worker.is_none());
    }

    #[test]
    fn test_command_response_when_inactive() {
        let (host_tube, _pvclock_device) = create_pvclock_device();
        assert!(host_tube.send(&PvClockCommand::Suspend).is_ok());
        let res = host_tube.recv::<PvClockCommandResponse>();
        assert!(matches!(res, Ok(PvClockCommandResponse::DeviceInactive)));
    }

    #[test]
    fn test_sleep_wake_smoke() {
        let (mut pvclock_device, mem, _tube) = create_sleeping_device();
        assert_wake_successful(&mut pvclock_device, &mem);
    }

    #[test]
    fn test_save_restore() {
        let (mut pvclock_device, mem, _tube) = create_sleeping_device();
        let test_suspend_ns = 9999;

        // Store a test value we can look for later in the test to verify
        // we're restoring properties.
        pvclock_device
            .state
            .total_suspend_ns
            .store(test_suspend_ns, Ordering::SeqCst);

        let snap = pvclock_device.virtio_snapshot().unwrap();
        pvclock_device
            .state
            .total_suspend_ns
            .store(0, Ordering::SeqCst);
        pvclock_device.virtio_restore(snap).unwrap();
        assert_eq!(
            pvclock_device.state.total_suspend_ns.load(Ordering::SeqCst),
            test_suspend_ns
        );

        assert_wake_successful(&mut pvclock_device, &mem);
    }

    /// A simplified clone of `pvclock_scale_delta` from the Linux kernel to emulate
    /// what the kernel does when converting TSC to ktime.
    fn pvclock_scale_tsc(mult: u32, shift: i8, tsc: u64) -> u64 {
        let shifted = if shift < 0 {
            tsc >> -shift
        } else {
            tsc << shift
        };
        let product = shifted as u128 * mult as u128;
        (product >> 32).try_into().expect("should not overflow")
    }

    /// Helper function for checking the behavior of `freq_scale_shift`.
    fn check_freq_scale(f: u64, input: u64) {
        // We only test `scaled_hz` = 1GHz because that is the only value used in the code base.
        let (mult, shift) = freq_scale_shift(1_000_000_000, f);

        let scaled = pvclock_scale_tsc(mult, shift, input);

        // Use relative error <= 1e-8 as the target. TSC can be huge so this isn't really a super
        // accurate target, and our goal is to simply sanity check the math without adding too many
        // requirements about rounding errors.
        let expected: u64 = (input as u128 * 1_000_000_000u128 / f as u128) as u64;
        let expected_lo: u64 = (input as u128 * 999_999_990u128 / f as u128) as u64;
        let expected_hi: u64 = (input as u128 * 1_000_000_010u128 / f as u128) as u64;
        assert!(
            (expected_lo..=expected_hi).contains(&scaled),
            "{scaled} should be close to {expected} (base_hz={f}, mult={mult}, shift={shift})"
        );
    }

    #[test]
    fn test_freq_scale_shift_accuracy() {
        // Basic check for formula correctness: scaling one second's worth of counter ticks
        // (`base_hz`) should yield one second in nanoseconds (`scaled_hz`).
        for f in (1..=50).map(|n| n * 100_000_000) {
            check_freq_scale(f, f);
        }
    }

    #[test]
    fn test_freq_scale_shift_overflow_high_freq() {
        // For scale factors < 1.0, test that we can correctly convert the maximum TSC value without
        // overflow. We must be able to handle values as large as it realistically can be, as the
        // kernel clock breaks if the calculated ktime goes backwards (b/342168920).
        for f in (11..=50).map(|n| n * 100_000_000) {
            check_freq_scale(f, u64::MAX);
        }
    }

    #[test]
    fn test_freq_scale_shift_overflow_low_freq() {
        fn prev_power_of_two(n: u64) -> u64 {
            assert_ne!(n, 0);
            let highest_bit_set = 63 - n.leading_zeros();
            1 << highest_bit_set
        }
        // Same test as above, but for scale factors >= 1.0. The difference is that for scale
        // factors >= 1.0 we first round up the factor, then apply a multiplier (< 1.0). We reflect
        // this limitation in our tested maximum value.
        for f in (1..=10).map(|n| n * 100_000_000) {
            // Truncate the remainder since prev_power_of_two rounds down anyway.
            let factor = 1_000_000_000 / f;
            // This is like (exp2(floor(log2(factor)) + 1)).
            let target = u64::MAX / (prev_power_of_two(factor) << 1);
            check_freq_scale(f, target);
        }
    }
}