devices/vmwdt.rs

// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

//! vmwdt is a virtual watchdog memory mapped device which detects stalls
//! on the vCPUs and resets the guest when no 'pet' events are received.
//! <https://docs.google.com/document/d/1DYmk2roxlwHZsOfcJi8xDMdWOHAmomvs2SDh7KPud3Y/edit?usp=sharing&resourcekey=0-oSNabc-t040a1q0K4cyI8Q>
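//!
//! Each vCPU gets its own register bank and watchdog state: the guest is
//! expected to periodically reload the counter ("pet" the watchdog), and
//! progress is measured against the vCPU thread's guest time as reported by
//! procfs.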

use std::collections::BTreeMap;
use std::convert::TryFrom;
use std::fs;
use std::sync::Arc;
use std::time::Duration;

use anyhow::Context;
use base::custom_serde::serialize_arc_mutex;
use base::debug;
use base::error;
use base::warn;
use base::AsRawDescriptor;
use base::Descriptor;
use base::Error as SysError;
use base::Event;
use base::EventToken;
use base::SendTube;
use base::Timer;
use base::TimerTrait;
use base::Tube;
use base::VmEventType;
use base::WaitContext;
use base::WorkerThread;
use serde::Deserialize;
use serde::Serialize;
use snapshot::AnySnapshot;
use sync::Mutex;
use vm_control::VmResponse;

use crate::pci::CrosvmDeviceId;
use crate::BusAccessInfo;
use crate::BusDevice;
use crate::DeviceId;
use crate::IrqEdgeEvent;
use crate::Suspendable;

// Register offsets
const VMWDT_REG_STATUS: u32 = 0x00;
const VMWDT_REG_LOAD_CNT: u32 = 0x04;
const VMWDT_REG_CURRENT_CNT: u32 = 0x08;
const VMWDT_REG_CLOCK_FREQ_HZ: u32 = 0x0C;

// Length of the registers
const VMWDT_REG_LEN: u64 = 0x10;
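// Each vCPU owns one VMWDT_REG_LEN-sized bank of the registers above; a bus
// offset decodes as `cpu_index = offset / VMWDT_REG_LEN` and
// `reg = offset % VMWDT_REG_LEN` (see `BusDevice::write` below).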

pub const VMWDT_DEFAULT_TIMEOUT_SEC: u32 = 10;
pub const VMWDT_DEFAULT_CLOCK_HZ: u32 = 2;

// Proc stat indexes
const PROCSTAT_GUEST_TIME_INDX: usize = 42;
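// `guest_time` is the 43rd field of /proc/<pid>/task/<tid>/stat (0-based index
// 42 when split on whitespace) and is reported in clock ticks; see proc(5) and
// `get_guest_time_ms` below.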

#[derive(Serialize)]
pub struct VmwdtPerCpu {
    // Flag which indicates if the watchdog is started
    is_enabled: bool,
    // Timer used to generate periodic events at `timer_freq_hz` frequency
    #[serde(skip_serializing)]
    timer: Timer,
    // The frequency of the `timer`
    timer_freq_hz: u64,
    // Timestamp measured in milliseconds of the last guest activity
    last_guest_time_ms: i64,
    // The thread_id of the thread this vcpu belongs to
    thread_id: u32,
    // The process id of the task this vcpu belongs to
    process_id: u32,
    // The pre-programmed one-shot expiration interval. If the guest runs for this
    // interval without delivering a 'pet' event, it is considered stalled.
    next_expiration_interval_ms: i64,
    // Keep track of whether the watchdog PPI was raised.
    stall_evt_ppi_triggered: bool,
    // Keep track of whether the timer was armed in one-shot mode or with a repeating interval
    repeating_interval: Option<Duration>,
}

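// Snapshot-restorable subset of `VmwdtPerCpu`. The `timer` itself is not
// serialized: `wake()` re-arms it from `repeating_interval` or
// `next_expiration_interval_ms`, and the vCPU pid/tid pair is re-fetched by
// the worker thread when it restarts.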
#[derive(Deserialize)]
struct VmwdtPerCpuRestore {
    is_enabled: bool,
    timer_freq_hz: u64,
    last_guest_time_ms: i64,
    next_expiration_interval_ms: i64,
    repeating_interval: Option<Duration>,
}

pub struct Vmwdt {
    vm_wdts: Arc<Mutex<Vec<VmwdtPerCpu>>>,
    // The worker thread that waits on the timer fd
    worker_thread: Option<WorkerThread<Tube>>,
    // TODO: @sebastianene add separate reset event for the watchdog
    // Reset source if the device is not responding
    reset_evt_wrtube: SendTube,
    activated: bool,
    // Event to be used to interrupt the guest on detected stalls
    stall_evt: IrqEdgeEvent,
    vm_ctrl_tube: Option<Tube>,
}

#[derive(Serialize)]
struct VmwdtSnapshot {
    #[serde(serialize_with = "serialize_arc_mutex")]
    vm_wdts: Arc<Mutex<Vec<VmwdtPerCpu>>>,
    activated: bool,
}

#[derive(Deserialize)]
struct VmwdtRestore {
    vm_wdts: Vec<VmwdtPerCpuRestore>,
    activated: bool,
}

impl Vmwdt {
    pub fn new(
        cpu_count: usize,
        reset_evt_wrtube: SendTube,
        evt: IrqEdgeEvent,
        vm_ctrl_tube: Tube,
    ) -> anyhow::Result<Vmwdt> {
        let mut vec = Vec::new();
        for _ in 0..cpu_count {
            vec.push(VmwdtPerCpu {
                last_guest_time_ms: 0,
                thread_id: 0,
                process_id: 0,
                is_enabled: false,
                stall_evt_ppi_triggered: false,
                timer: Timer::new().context("failed to create Timer")?,
                timer_freq_hz: 0,
                next_expiration_interval_ms: 0,
                repeating_interval: None,
            });
        }
        let vm_wdts = Arc::new(Mutex::new(vec));

        Ok(Vmwdt {
            vm_wdts,
            worker_thread: None,
            reset_evt_wrtube,
            activated: false,
            stall_evt: evt,
            vm_ctrl_tube: Some(vm_ctrl_tube),
        })
    }

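    // Worker loop of the watchdog. It first resolves the pid/tid of every vCPU
    // over `vm_ctrl_tube`, then waits on `kill_evt` and on each per-vCPU timer;
    // on every timer tick it compares the guest time accrued by that vCPU
    // against the interval programmed via VMWDT_REG_LOAD_CNT. The control tube
    // is returned on exit so that `sleep()` can reclaim it.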
    pub fn vmwdt_worker_thread(
        vm_wdts: Arc<Mutex<Vec<VmwdtPerCpu>>>,
        kill_evt: Event,
        reset_evt_wrtube: SendTube,
        stall_evt: IrqEdgeEvent,
        vm_ctrl_tube: Tube,
        worker_started_send: Option<SendTube>,
    ) -> anyhow::Result<Tube> {
        let msg = vm_control::VmRequest::VcpuPidTid;
        vm_ctrl_tube
            .send(&msg)
            .context("failed to send request to fetch Vcpus PID and TID")?;
        let vcpus_pid_tid: BTreeMap<usize, (u32, u32)> = match vm_ctrl_tube
            .recv()
            .context("failed to receive vmwdt pids and tids")?
        {
            VmResponse::VcpuPidTidResponse { pid_tid_map } => pid_tid_map,
            _ => {
                return Err(anyhow::anyhow!(
                    "Received incorrect message type when trying to get vcpu pid tid map"
                ));
            }
        };
        {
            let mut vm_wdts = vm_wdts.lock();
            for (i, vmwdt) in (*vm_wdts).iter_mut().enumerate() {
                let pid_tid = vcpus_pid_tid
                    .get(&i)
                    .context("vmwdts empty, which could indicate no vcpus are initialized")?;
                vmwdt.process_id = pid_tid.0;
                vmwdt.thread_id = pid_tid.1;
            }
        }
        if let Some(worker_started_send) = worker_started_send {
            worker_started_send
                .send(&())
                .context("failed to send vmwdt worker started")?;
        }
        #[derive(EventToken)]
        enum Token {
            Kill,
            Timer(usize),
        }

        let wait_ctx: WaitContext<Token> =
            WaitContext::new().context("Failed to create wait_ctx")?;
        wait_ctx
            .add(&kill_evt, Token::Kill)
            .context("Failed to add Tokens to wait_ctx")?;

        let len = vm_wdts.lock().len();
        for clock_id in 0..len {
            let timer_fd = vm_wdts.lock()[clock_id].timer.as_raw_descriptor();
            wait_ctx
                .add(&Descriptor(timer_fd), Token::Timer(clock_id))
                .context("Failed to link FDs to Tokens")?;
        }

        loop {
            let events = wait_ctx.wait().context("Failed to wait for events")?;
            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::Kill => {
                        return Ok(vm_ctrl_tube);
                    }
                    Token::Timer(cpu_id) => {
                        let mut wdts_locked = vm_wdts.lock();
                        let watchdog = &mut wdts_locked[cpu_id];
                        match watchdog.timer.mark_waited() {
                            Ok(true) => continue, // timer not actually ready
                            Ok(false) => {}
                            Err(e) => {
                                error!("error waiting for timer event on vcpu {cpu_id}: {e:#}");
                                continue;
                            }
                        }

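                        // Stall check: `remaining_time_ms` is the programmed
                        // interval minus the guest time this vCPU has accrued
                        // since the last reload of VMWDT_REG_LOAD_CNT. A
                        // positive value means the vCPU simply has not yet run
                        // for the whole interval, so the one-shot is re-armed
                        // for the remainder; zero or negative means the guest
                        // ran the full interval without petting the watchdog.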
                        let current_guest_time_ms =
                            Vmwdt::get_guest_time_ms(watchdog.process_id, watchdog.thread_id)
                                .context("get_guest_time_ms failed")?;
                        let remaining_time_ms = watchdog.next_expiration_interval_ms
                            - (current_guest_time_ms - watchdog.last_guest_time_ms);

                        if remaining_time_ms > 0 {
                            watchdog.next_expiration_interval_ms = remaining_time_ms;
                            if let Err(e) = watchdog
                                .timer
                                .reset_oneshot(Duration::from_millis(remaining_time_ms as u64))
                            {
                                error!(
                                    "failed to reset internal timer on vcpu {}: {:#}",
                                    cpu_id, e
                                );
                            }
                            watchdog.repeating_interval = None;
                        } else {
                            if watchdog.stall_evt_ppi_triggered {
                                if let Err(e) = reset_evt_wrtube
                                    .send::<VmEventType>(&VmEventType::WatchdogReset)
                                {
                                    error!("failed to send reset event from vcpu {}: {}", cpu_id, e)
                                }
                            }

                            stall_evt
                                .trigger()
                                .context("Failed to trigger stall event")?;
                            watchdog.stall_evt_ppi_triggered = true;
                            watchdog.last_guest_time_ms = current_guest_time_ms;
                        }
                    }
                }
            }
        }
    }

    fn start(&mut self, worker_started_send: Option<SendTube>) -> anyhow::Result<()> {
        let vm_wdts = self.vm_wdts.clone();
        let reset_evt_wrtube = self.reset_evt_wrtube.try_clone().unwrap();
        let stall_event = self.stall_evt.try_clone().unwrap();
        let vm_ctrl_tube = self
            .vm_ctrl_tube
            .take()
            .context("missing vm control tube")?;

        self.activated = true;
        self.worker_thread = Some(WorkerThread::start("vmwdt worker", |kill_evt| {
            Vmwdt::vmwdt_worker_thread(
                vm_wdts,
                kill_evt,
                reset_evt_wrtube,
                stall_event,
                vm_ctrl_tube,
                worker_started_send,
            )
            .expect("failed to start vmwdt worker thread")
        }));
        Ok(())
    }

    fn ensure_started(&mut self) {
        if self.worker_thread.is_some() {
            return;
        }

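        // Block until the worker has fetched the vCPU pid/tid map, so the MMIO
        // access that triggered this call cannot race with the worker setup.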
        let (worker_started_send, worker_started_recv) =
            Tube::directional_pair().expect("failed to create vmwdt worker started tubes");
        self.start(Some(worker_started_send))
            .expect("failed to start Vmwdt");
        worker_started_recv
            .recv::<()>()
            .expect("failed to receive vmwdt worker started");
    }

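    // Returns the cumulative guest time of the given vCPU thread in
    // milliseconds, derived from the `guest_time` field of
    // /proc/<process_id>/task/<thread_id>/stat; the non-Linux fallback below
    // always reports 0.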
    #[cfg(any(target_os = "linux", target_os = "android"))]
    pub fn get_guest_time_ms(process_id: u32, thread_id: u32) -> Result<i64, SysError> {
        // TODO: @sebastianene check if we can avoid open-read-close on each call
        let stat_path = format!("/proc/{process_id}/task/{thread_id}/stat");
        let contents = fs::read_to_string(stat_path)?;

        let gtime_ticks = contents
            .split_whitespace()
            .nth(PROCSTAT_GUEST_TIME_INDX)
            .and_then(|guest_time| guest_time.parse::<u64>().ok())
            .unwrap_or(0);

        // SAFETY:
        // Safe because this just returns an integer
        let ticks_per_sec = unsafe { libc::sysconf(libc::_SC_CLK_TCK) } as u64;
        Ok((gtime_ticks * 1000 / ticks_per_sec) as i64)
    }

    #[cfg(not(any(target_os = "linux", target_os = "android")))]
    pub fn get_guest_time_ms(_process_id: u32, _thread_id: u32) -> Result<i64, SysError> {
        Ok(0)
    }
}

impl BusDevice for Vmwdt {
    fn debug_label(&self) -> String {
        "Vmwdt".to_owned()
    }

    fn device_id(&self) -> DeviceId {
        CrosvmDeviceId::VmWatchdog.into()
    }

    fn read(&mut self, _offset: BusAccessInfo, _data: &mut [u8]) {}

    fn write(&mut self, info: BusAccessInfo, data: &[u8]) {
        let data_array = match <&[u8; 4]>::try_from(data) {
            Ok(array) => array,
            _ => {
                error!("Bad write size: {} for vmwdt", data.len());
                return;
            }
        };

        let reg_val = u32::from_ne_bytes(*data_array);
        let cpu_index: usize = (info.offset / VMWDT_REG_LEN) as usize;
        let reg_offset = (info.offset % VMWDT_REG_LEN) as u32;

        if cpu_index >= self.vm_wdts.lock().len() {
            error!("Bad write cpu_index {}", cpu_index);
            return;
        }

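        // Register semantics: STATUS starts/stops the per-vCPU watchdog,
        // LOAD_CNT reloads ("pets") the counter in units of 1/timer_freq_hz
        // and arms a one-shot expiry, CURRENT_CNT is read-only, and
        // CLOCK_FREQ_HZ sets the frequency the other registers count at.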
        match reg_offset {
            VMWDT_REG_STATUS => {
                self.ensure_started();
                let mut wdts_locked = self.vm_wdts.lock();
                let cpu_watchdog = &mut wdts_locked[cpu_index];

                cpu_watchdog.is_enabled = reg_val != 0;

                if reg_val != 0 {
                    let interval = Duration::from_millis(1000 / cpu_watchdog.timer_freq_hz);
                    cpu_watchdog.repeating_interval = Some(interval);
                    cpu_watchdog
                        .timer
                        .reset_repeating(interval)
                        .expect("Failed to reset timer repeating interval");
                } else {
                    cpu_watchdog.repeating_interval = None;
                    cpu_watchdog
                        .timer
                        .clear()
                        .expect("Failed to clear cpu watchdog timer");
                }
            }
            VMWDT_REG_LOAD_CNT => {
                self.ensure_started();
                let (process_id, thread_id) = {
                    let mut wdts_locked = self.vm_wdts.lock();
                    let cpu_watchdog = &mut wdts_locked[cpu_index];
                    (cpu_watchdog.process_id, cpu_watchdog.thread_id)
                };
                let guest_time_ms = Vmwdt::get_guest_time_ms(process_id, thread_id)
                    .expect("get_guest_time_ms failed");

                let mut wdts_locked = self.vm_wdts.lock();
                let cpu_watchdog = &mut wdts_locked[cpu_index];
                let next_expiration_interval_ms =
                    reg_val as u64 * 1000 / cpu_watchdog.timer_freq_hz;

                cpu_watchdog.last_guest_time_ms = guest_time_ms;
                cpu_watchdog.stall_evt_ppi_triggered = false;
                cpu_watchdog.next_expiration_interval_ms = next_expiration_interval_ms as i64;

                if cpu_watchdog.is_enabled {
                    if let Err(e) = cpu_watchdog
                        .timer
                        .reset_oneshot(Duration::from_millis(next_expiration_interval_ms))
                    {
                        error!("failed to reset one-shot timer for vcpu {}: {:#}", cpu_index, e);
                    }
                    cpu_watchdog.repeating_interval = None;
                }
            }
            VMWDT_REG_CURRENT_CNT => {
                warn!("invalid write to read-only VMWDT_REG_CURRENT_CNT register");
            }
            VMWDT_REG_CLOCK_FREQ_HZ => {
                let mut wdts_locked = self.vm_wdts.lock();
                let cpu_watchdog = &mut wdts_locked[cpu_index];

                debug!(
                    "CPU:{:x} wrote VMWDT_REG_CLOCK_FREQ_HZ {:x}",
                    cpu_index, reg_val
                );
                cpu_watchdog.timer_freq_hz = reg_val as u64;
            }
            _ => unreachable!(),
        }
    }
}

impl Suspendable for Vmwdt {
    fn sleep(&mut self) -> anyhow::Result<()> {
        if let Some(worker) = self.worker_thread.take() {
            self.vm_ctrl_tube = Some(worker.stop());
        }
        Ok(())
    }

    fn wake(&mut self) -> anyhow::Result<()> {
        if self.activated {
            // We do not pass a tube to notify that the worker thread has started on wake.
            // At this stage, vm_control is blocked on resuming devices and cannot provide the vcpu
            // PIDs/TIDs yet.
            // At the same time, the vCPUs are still frozen, which means no MMIO will be
            // processed, so `write` cannot be triggered.
            // The request to get PIDs/TIDs should get processed before any MMIO request occurs.
            self.start(None)?;
            let mut vm_wdts = self.vm_wdts.lock();
            for vmwdt in vm_wdts.iter_mut() {
                if let Some(interval) = &vmwdt.repeating_interval {
                    vmwdt
                        .timer
                        .reset_repeating(*interval)
                        .context("failed to write repeating interval")?;
                } else if vmwdt.is_enabled {
                    vmwdt
                        .timer
                        .reset_oneshot(Duration::from_millis(
                            vmwdt.next_expiration_interval_ms as u64,
                        ))
                        .context("failed to write oneshot interval")?;
                }
            }
        }
        Ok(())
    }

    fn snapshot(&mut self) -> anyhow::Result<AnySnapshot> {
        AnySnapshot::to_any(&VmwdtSnapshot {
            vm_wdts: self.vm_wdts.clone(),
            activated: self.activated,
        })
        .context("failed to snapshot Vmwdt")
    }

    fn restore(&mut self, data: AnySnapshot) -> anyhow::Result<()> {
        let deser: VmwdtRestore =
            AnySnapshot::from_any(data).context("failed to deserialize Vmwdt")?;
        let mut vm_wdts = self.vm_wdts.lock();
        for (vmwdt_restore, vmwdt) in deser.vm_wdts.iter().zip(vm_wdts.iter_mut()) {
            vmwdt.is_enabled = vmwdt_restore.is_enabled;
            vmwdt.timer_freq_hz = vmwdt_restore.timer_freq_hz;
            vmwdt.last_guest_time_ms = vmwdt_restore.last_guest_time_ms;
            vmwdt.next_expiration_interval_ms = vmwdt_restore.next_expiration_interval_ms;
            vmwdt.repeating_interval = vmwdt_restore.repeating_interval;
        }
        self.activated = deser.activated;
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use std::process;
    use std::thread::sleep;

    #[cfg(any(target_os = "linux", target_os = "android"))]
    use base::gettid;
    use base::poll_assert;
    use base::Tube;

    use super::*;

    const AARCH64_VMWDT_ADDR: u64 = 0x3000;
    const TEST_VMWDT_CPU_NO: usize = 0x1;

    fn vmwdt_bus_address(offset: u64) -> BusAccessInfo {
        BusAccessInfo {
            offset,
            address: AARCH64_VMWDT_ADDR,
            id: 0,
        }
    }

    #[test]
    fn test_watchdog_internal_timer() {
        let (vm_evt_wrtube, _vm_evt_rdtube) = Tube::directional_pair().unwrap();
        let (vm_ctrl_wrtube, vm_ctrl_rdtube) = Tube::pair().unwrap();
        let irq = IrqEdgeEvent::new().unwrap();
        #[cfg(any(target_os = "linux", target_os = "android"))]
        {
            vm_ctrl_wrtube
                .send(&VmResponse::VcpuPidTidResponse {
                    pid_tid_map: BTreeMap::from([(0, (process::id(), gettid() as u32))]),
                })
                .unwrap();
        }
        let mut device = Vmwdt::new(TEST_VMWDT_CPU_NO, vm_evt_wrtube, irq, vm_ctrl_rdtube).unwrap();

        // Configure the watchdog device with a 10 Hz internal clock
        device.write(
            vmwdt_bus_address(VMWDT_REG_CLOCK_FREQ_HZ as u64),
            &[10, 0, 0, 0],
        );
        device.write(vmwdt_bus_address(VMWDT_REG_LOAD_CNT as u64), &[1, 0, 0, 0]);
        device.write(vmwdt_bus_address(VMWDT_REG_STATUS as u64), &[1, 0, 0, 0]);
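        // With a 10 Hz clock and a load count of 1, the programmed interval is
        // 1 * 1000 / 10 = 100 ms of guest time.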
        let next_expiration_ms = {
            let mut vmwdt_locked = device.vm_wdts.lock();
            // In the test scenario there is no guest running, so the guest_time field in
            // /proc/<pid>/task/<tid>/stat stays 0 and get_guest_time_ms() returns 0
            vmwdt_locked[0].last_guest_time_ms = 10;
            vmwdt_locked[0].next_expiration_interval_ms
        };

        // Poll multiple times as we don't get a signal when the watchdog thread has run.
        poll_assert!(10, || {
            sleep(Duration::from_millis(50));
            let vmwdt_locked = device.vm_wdts.lock();
            // Verify that our timer expired and the next_expiration_interval_ms changed
            vmwdt_locked[0].next_expiration_interval_ms != next_expiration_ms
        });
    }

    #[test]
    fn test_watchdog_expiration() {
        let (vm_evt_wrtube, vm_evt_rdtube) = Tube::directional_pair().unwrap();
        let (vm_ctrl_wrtube, vm_ctrl_rdtube) = Tube::pair().unwrap();
        let irq = IrqEdgeEvent::new().unwrap();
        #[cfg(any(target_os = "linux", target_os = "android"))]
        {
            vm_ctrl_wrtube
                .send(&VmResponse::VcpuPidTidResponse {
                    pid_tid_map: BTreeMap::from([(0, (process::id(), gettid() as u32))]),
                })
                .unwrap();
        }
        let mut device = Vmwdt::new(TEST_VMWDT_CPU_NO, vm_evt_wrtube, irq, vm_ctrl_rdtube).unwrap();

        // Configure the watchdog device with a 10 Hz internal clock
        device.write(
            vmwdt_bus_address(VMWDT_REG_CLOCK_FREQ_HZ as u64),
            &[10, 0, 0, 0],
        );
        device.write(vmwdt_bus_address(VMWDT_REG_LOAD_CNT as u64), &[1, 0, 0, 0]);
        device.write(vmwdt_bus_address(VMWDT_REG_STATUS as u64), &[1, 0, 0, 0]);
        // In the test scenario there is no guest running, so the guest_time field in
        // /proc/<pid>/task/<tid>/stat stays 0 and get_guest_time_ms() returns 0
        device.vm_wdts.lock()[0].last_guest_time_ms = -100;

        // Check that the interrupt was raised
        poll_assert!(10, || {
            sleep(Duration::from_millis(50));
            let vmwdt_locked = device.vm_wdts.lock();
            vmwdt_locked[0].stall_evt_ppi_triggered
        });

        // Simulate that guest time has passed since the last expiration
        device.vm_wdts.lock()[0].last_guest_time_ms = -100;

        // Poll multiple times as we don't get a signal when the watchdog thread has run.
        poll_assert!(10, || {
            sleep(Duration::from_millis(50));
            match vm_evt_rdtube.recv::<VmEventType>() {
                Ok(vm_event) => vm_event == VmEventType::WatchdogReset,
                Err(_e) => false,
            }
        });
    }
}