// Copyright 2020 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use std::arch::x86_64::CpuidResult;
#[cfg(any(unix, feature = "haxm", feature = "whpx"))]
use std::arch::x86_64::__cpuid;
use std::arch::x86_64::_rdtsc;
use std::collections::BTreeMap;
use std::collections::HashSet;

use anyhow::Context;
use base::custom_serde::deserialize_seq_to_arr;
use base::custom_serde::serialize_arr;
use base::error;
use base::warn;
use base::Result;
use bit_field::*;
use downcast_rs::impl_downcast;
use libc::c_void;
use serde::Deserialize;
use serde::Serialize;
use snapshot::AnySnapshot;
use vm_memory::GuestAddress;

use crate::Hypervisor;
use crate::IrqRoute;
use crate::IrqSource;
use crate::IrqSourceChip;
use crate::Vcpu;
use crate::Vm;

const MSR_F15H_PERF_CTL0: u32 = 0xc0010200;
const MSR_F15H_PERF_CTL1: u32 = 0xc0010202;
const MSR_F15H_PERF_CTL2: u32 = 0xc0010204;
const MSR_F15H_PERF_CTL3: u32 = 0xc0010206;
const MSR_F15H_PERF_CTL4: u32 = 0xc0010208;
const MSR_F15H_PERF_CTL5: u32 = 0xc001020a;
const MSR_F15H_PERF_CTR0: u32 = 0xc0010201;
const MSR_F15H_PERF_CTR1: u32 = 0xc0010203;
const MSR_F15H_PERF_CTR2: u32 = 0xc0010205;
const MSR_F15H_PERF_CTR3: u32 = 0xc0010207;
const MSR_F15H_PERF_CTR4: u32 = 0xc0010209;
const MSR_F15H_PERF_CTR5: u32 = 0xc001020b;
const MSR_IA32_PERF_CAPABILITIES: u32 = 0x00000345;

/// A trait for managing cpuids for an x86_64 hypervisor and for checking its capabilities.
pub trait HypervisorX86_64: Hypervisor {
    /// Get the system supported CPUID values.
    fn get_supported_cpuid(&self) -> Result<CpuId>;

    /// Gets the list of supported MSRs.
    fn get_msr_index_list(&self) -> Result<Vec<u32>>;
}

/// A wrapper for using a VM on x86_64 and getting/setting its state.
pub trait VmX86_64: Vm {
    /// Gets the `HypervisorX86_64` that created this VM.
    fn get_hypervisor(&self) -> &dyn HypervisorX86_64;

    /// Create a Vcpu with the specified Vcpu ID.
    fn create_vcpu(&self, id: usize) -> Result<Box<dyn VcpuX86_64>>;

    /// Sets the address of the three-page region in the VM's address space.
    fn set_tss_addr(&self, addr: GuestAddress) -> Result<()>;

    /// Sets the address of a one-page region in the VM's address space.
    fn set_identity_map_addr(&self, addr: GuestAddress) -> Result<()>;

    /// Load pVM firmware for the VM, creating a memslot for it as needed.
    ///
    /// Only works on protected VMs (i.e. those with vm_type == KVM_X86_PKVM_PROTECTED_VM).
    fn load_protected_vm_firmware(&mut self, fw_addr: GuestAddress, fw_max_size: u64)
        -> Result<()>;
}

/// A wrapper around creating and using a VCPU on x86_64.
pub trait VcpuX86_64: Vcpu {
    /// Sets or clears the flag that requests the VCPU to exit when it becomes possible to inject
    /// interrupts into the guest.
    fn set_interrupt_window_requested(&self, requested: bool);

    /// Checks if we can inject an interrupt into the VCPU.
    fn ready_for_interrupt(&self) -> bool;

    /// Injects interrupt vector `irq` into the VCPU.
    ///
    /// This function should only be called when [`Self::ready_for_interrupt`] returns true.
    /// Otherwise the interrupt injection may fail or the next VCPU run may fail. However, if
    /// [`Self::interrupt`] returns [`Ok`], the implementation must guarantee that the interrupt
    /// isn't injected in an uninterruptible window (e.g. right after the mov ss instruction).
    ///
    /// The caller should avoid calling this function more than once per VMEXIT, because the
    /// hypervisor may behave differently: some hypervisors (e.g. WHPX, KVM) will only try to
    /// inject the last `irq` requested, while other hypervisors (e.g. HAXM) may try to inject
    /// all `irq`s requested.
    fn interrupt(&self, irq: u8) -> Result<()>;

    /// Injects a non-maskable interrupt into the VCPU.
    fn inject_nmi(&self) -> Result<()>;

    /// Gets the VCPU general purpose registers.
    fn get_regs(&self) -> Result<Regs>;

    /// Sets the VCPU general purpose registers.
    fn set_regs(&self, regs: &Regs) -> Result<()>;

    /// Gets the VCPU special registers.
    fn get_sregs(&self) -> Result<Sregs>;

    /// Sets the VCPU special registers.
    fn set_sregs(&self, sregs: &Sregs) -> Result<()>;

    /// Gets the VCPU FPU registers.
    fn get_fpu(&self) -> Result<Fpu>;

    /// Sets the VCPU FPU registers.
    fn set_fpu(&self, fpu: &Fpu) -> Result<()>;

    /// Gets the VCPU debug registers.
    fn get_debugregs(&self) -> Result<DebugRegs>;

    /// Sets the VCPU debug registers.
    fn set_debugregs(&self, debugregs: &DebugRegs) -> Result<()>;

    /// Gets the VCPU extended control registers.
    fn get_xcrs(&self) -> Result<BTreeMap<u32, u64>>;

    /// Sets a VCPU extended control register.
    fn set_xcr(&self, xcr: u32, value: u64) -> Result<()>;

    /// Gets the VCPU x87 FPU, MMX, XMM, YMM and MXCSR registers.
    fn get_xsave(&self) -> Result<Xsave>;

    /// Sets the VCPU x87 FPU, MMX, XMM, YMM and MXCSR registers.
    fn set_xsave(&self, xsave: &Xsave) -> Result<()>;

    /// Gets hypervisor specific state for this VCPU that must be
    /// saved/restored for snapshotting.
    /// This state is fetched after VCPUs are frozen and interrupts are flushed.
    fn get_hypervisor_specific_state(&self) -> Result<AnySnapshot>;

    /// Sets hypervisor specific state for this VCPU. Only used for
    /// snapshotting.
    fn set_hypervisor_specific_state(&self, data: AnySnapshot) -> Result<()>;

    /// Gets a single model-specific register's value.
    fn get_msr(&self, msr_index: u32) -> Result<u64>;

    /// Gets the model-specific registers. Returns all the MSRs for the VCPU.
    fn get_all_msrs(&self) -> Result<BTreeMap<u32, u64>>;

    /// Sets a single model-specific register's value.
    fn set_msr(&self, msr_index: u32, value: u64) -> Result<()>;

    /// Sets up the data returned by the CPUID instruction.
    fn set_cpuid(&self, cpuid: &CpuId) -> Result<()>;

    /// Sets up debug registers and configures the vcpu for handling guest debug events.
    fn set_guest_debug(&self, addrs: &[GuestAddress], enable_singlestep: bool) -> Result<()>;

    /// This function should be called after `Vcpu::run` returns `VcpuExit::Cpuid`, and `entry`
    /// should represent the result of emulating the CPUID instruction. The `handle_cpuid` function
    /// will then set the appropriate registers on the vcpu.
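    ///
    /// A rough sketch of the expected call pattern (illustrative only; the exact payload of
    /// `VcpuExit::Cpuid` and the `emulate_cpuid_leaf` helper are assumptions, not part of this
    /// crate):
    ///
    /// ```ignore
    /// match vcpu.run()? {
    ///     VcpuExit::Cpuid { entry } => {
    ///         // Emulate the leaf however the VMM sees fit, then hand the result back so the
    ///         // hypervisor can update the guest's registers.
    ///         let emulated = emulate_cpuid_leaf(&entry);
    ///         vcpu.handle_cpuid(&emulated)?;
    ///     }
    ///     _ => { /* other exit reasons */ }
    /// }
    /// ```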
    fn handle_cpuid(&mut self, entry: &CpuIdEntry) -> Result<()>;

    /// Gets the guest->host TSC offset.
    ///
    /// The default implementation uses [`VcpuX86_64::get_msr()`] to read the guest TSC.
    fn get_tsc_offset(&self) -> Result<u64> {
        // SAFETY:
        // Safe because _rdtsc takes no arguments
        let host_before_tsc = unsafe { _rdtsc() };

        // get guest TSC value from our hypervisor
        let guest_tsc = self.get_msr(crate::MSR_IA32_TSC)?;

        // SAFETY:
        // Safe because _rdtsc takes no arguments
        let host_after_tsc = unsafe { _rdtsc() };

        // Average the before and after host tsc to get the best value
        let host_tsc = ((host_before_tsc as u128 + host_after_tsc as u128) / 2) as u64;

        Ok(guest_tsc.wrapping_sub(host_tsc))
    }

    /// Sets the guest->host TSC offset.
    ///
    /// The default implementation uses [`VcpuX86_64::set_tsc_value()`] to set the TSC value.
    ///
    /// It sets TSC_OFFSET (VMCS / CB field) by setting the TSC MSR to the current
    /// host TSC value plus the desired offset. We rely on the fact that hypervisors
    /// determine the value of TSC_OFFSET by computing TSC_OFFSET = `new_tsc_value - _rdtsc()` =
    /// `_rdtsc() + offset - _rdtsc()` ~= `offset`. Note that the ~= is important: this is an
    /// approximate operation, because the two _rdtsc() calls
    /// are separated by at least a few ticks.
    ///
    /// Note: TSC_OFFSET, host TSC, guest TSC, and TSC MSR are all different
    /// concepts.
    /// * When a guest executes rdtsc, the value (guest TSC) returned is host_tsc * TSC_MULTIPLIER +
    ///   TSC_OFFSET + TSC_ADJUST.
    /// * The TSC MSR is a special MSR that when written to by the host, will cause TSC_OFFSET to be
    ///   set accordingly by the hypervisor.
    /// * When the guest *writes* to TSC MSR, it actually changes the TSC_ADJUST MSR *for the
    ///   guest*. Generally this only happens if the guest is trying to re-zero or synchronize
    ///   TSCs.
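    ///
    /// As an illustrative example (made-up numbers): if the host TSC currently reads 1_000_000
    /// and the desired guest->host offset is 500, this method writes 1_000_500 to the TSC MSR,
    /// and the hypervisor derives TSC_OFFSET = 1_000_500 - _rdtsc() ~= 500, give or take the
    /// ticks that elapse between the two reads.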
    fn set_tsc_offset(&self, offset: u64) -> Result<()> {
        // SAFETY: _rdtsc takes no arguments.
        let host_tsc = unsafe { _rdtsc() };
        self.set_tsc_value(host_tsc.wrapping_add(offset))
    }

    /// Sets the guest TSC exactly to the provided value.
    ///
    /// The default implementation sets the guest's TSC by writing the value to the MSR directly.
    ///
    /// See [`VcpuX86_64::set_tsc_offset()`] for an explanation of how this value is actually read
    /// by the guest after being set.
    fn set_tsc_value(&self, value: u64) -> Result<()> {
        self.set_msr(crate::MSR_IA32_TSC, value)
    }

    /// Some hypervisors require special handling to restore timekeeping when
    /// a snapshot is restored. They are provided with a host TSC reference
    /// moment, guaranteed to be the same across all Vcpus, and the Vcpu's TSC
    /// offset at the moment it was snapshotted.
    fn restore_timekeeping(&self, host_tsc_reference_moment: u64, tsc_offset: u64) -> Result<()>;

    /// Snapshot vCPU state
    fn snapshot(&self) -> anyhow::Result<VcpuSnapshot> {
        Ok(VcpuSnapshot {
            vcpu_id: self.id(),
            regs: self.get_regs()?,
            sregs: self.get_sregs()?,
            debug_regs: self.get_debugregs()?,
            xcrs: self.get_xcrs()?,
            msrs: self.get_all_msrs()?,
            xsave: self.get_xsave()?,
            hypervisor_data: self.get_hypervisor_specific_state()?,
            tsc_offset: self.get_tsc_offset()?,
        })
    }

    fn restore(
        &mut self,
        snapshot: &VcpuSnapshot,
        host_tsc_reference_moment: u64,
    ) -> anyhow::Result<()> {
        // List of MSRs that may fail to restore due to lack of support in the host kernel.
        // Some hosts may be running older kernels which do not support all MSRs, but
        // get_all_msrs will still fetch the MSRs supported by the CPU. Trying to set those MSRs
        // will result in failures, so failures for these MSRs are logged as warnings instead.
        let msr_allowlist = HashSet::from([
            MSR_F15H_PERF_CTL0,
            MSR_F15H_PERF_CTL1,
            MSR_F15H_PERF_CTL2,
            MSR_F15H_PERF_CTL3,
            MSR_F15H_PERF_CTL4,
            MSR_F15H_PERF_CTL5,
            MSR_F15H_PERF_CTR0,
            MSR_F15H_PERF_CTR1,
            MSR_F15H_PERF_CTR2,
            MSR_F15H_PERF_CTR3,
            MSR_F15H_PERF_CTR4,
            MSR_F15H_PERF_CTR5,
            MSR_IA32_PERF_CAPABILITIES,
        ]);
        assert_eq!(snapshot.vcpu_id, self.id());
        self.set_regs(&snapshot.regs)?;
        self.set_sregs(&snapshot.sregs)?;
        self.set_debugregs(&snapshot.debug_regs)?;
        for (xcr_index, value) in &snapshot.xcrs {
            self.set_xcr(*xcr_index, *value)?;
        }

        for (msr_index, value) in snapshot.msrs.iter() {
            if self.get_msr(*msr_index) == Ok(*value) {
                continue; // no need to set MSR since the values are the same.
            }
            if let Err(e) = self.set_msr(*msr_index, *value) {
                if msr_allowlist.contains(msr_index) {
                    warn!(
                        "Failed to set MSR. MSR might not be supported in this kernel. Err: {}",
                        e
                    );
                } else {
                    return Err(e).context(
                        "Failed to set MSR. MSR might not be supported by the CPU or by the kernel,
                         and was not allow-listed.",
                    );
                }
            };
        }
        self.set_xsave(&snapshot.xsave)?;
        self.set_hypervisor_specific_state(snapshot.hypervisor_data.clone())?;
        self.restore_timekeeping(host_tsc_reference_moment, snapshot.tsc_offset)?;
        Ok(())
    }
}
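
// A hedged sketch (not part of the crosvm API) of how a caller might drive `snapshot()` and
// `restore()` across a set of VCPUs. It assumes the VCPUs are already frozen and interrupts
// flushed, as `get_hypervisor_specific_state()` requires; the function itself is only
// illustrative.
#[allow(dead_code)]
fn example_snapshot_restore(vcpus: &mut [Box<dyn VcpuX86_64>]) -> anyhow::Result<()> {
    // Capture per-VCPU state while everything is quiesced.
    let snapshots = vcpus
        .iter()
        .map(|vcpu| vcpu.snapshot())
        .collect::<anyhow::Result<Vec<VcpuSnapshot>>>()?;
    // A single host TSC reading shared by every VCPU keeps their restored TSCs consistent.
    // SAFETY: _rdtsc takes no arguments.
    let host_tsc_reference_moment = unsafe { _rdtsc() };
    for (vcpu, snapshot) in vcpus.iter_mut().zip(snapshots.iter()) {
        vcpu.restore(snapshot, host_tsc_reference_moment)?;
    }
    Ok(())
}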

/// x86 specific vCPU snapshot.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct VcpuSnapshot {
    pub vcpu_id: usize,
    regs: Regs,
    sregs: Sregs,
    debug_regs: DebugRegs,
    xcrs: BTreeMap<u32, u64>,
    msrs: BTreeMap<u32, u64>,
    xsave: Xsave,
    hypervisor_data: AnySnapshot,
    tsc_offset: u64,
}

impl_downcast!(VcpuX86_64);

// TSC MSR
pub const MSR_IA32_TSC: u32 = 0x00000010;

/// Gets host cpu max physical address bits.
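/// For example (illustrative value), a host whose CPUID leaf 0x80000008 returns eax = 0x3027
/// reports 0x27 = 39 physical address bits in its low byte.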
#[cfg(any(unix, feature = "haxm", feature = "whpx"))]
pub(crate) fn host_phys_addr_bits() -> u8 {
    // SAFETY: trivially safe
    let highest_ext_function = unsafe { __cpuid(0x80000000) };
    if highest_ext_function.eax >= 0x80000008 {
        // SAFETY: trivially safe
        let addr_size = unsafe { __cpuid(0x80000008) };
        // Low 8 bits of 0x80000008 leaf: host physical address size in bits.
        addr_size.eax as u8
    } else {
        36
    }
}

/// Initial state for x86_64 VCPUs.
#[derive(Clone, Default)]
pub struct VcpuInitX86_64 {
    /// General-purpose registers.
    pub regs: Regs,

    /// Special registers.
    pub sregs: Sregs,

    /// Floating-point registers.
    pub fpu: Fpu,

    /// Model-specific registers.
    pub msrs: BTreeMap<u32, u64>,
}

/// Holds the CPU feature configurations that are needed to set up a vCPU.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CpuConfigX86_64 {
    /// Whether to force using a calibrated TSC leaf (0x15).
    pub force_calibrated_tsc_leaf: bool,

    /// Whether to enable host CPU topology.
    pub host_cpu_topology: bool,

    /// Whether to expose the HWP feature to the guest.
    pub enable_hwp: bool,

    /// Whether to disable SMT (Simultaneous Multithreading).
    pub no_smt: bool,

    /// Whether to enable the ITMT scheduler.
    pub itmt: bool,

    /// Which hybrid CPU type to set, if any.
    pub hybrid_type: Option<CpuHybridType>,
}

impl CpuConfigX86_64 {
    pub fn new(
        force_calibrated_tsc_leaf: bool,
        host_cpu_topology: bool,
        enable_hwp: bool,
        no_smt: bool,
        itmt: bool,
        hybrid_type: Option<CpuHybridType>,
    ) -> Self {
        CpuConfigX86_64 {
            force_calibrated_tsc_leaf,
            host_cpu_topology,
            enable_hwp,
            no_smt,
            itmt,
            hybrid_type,
        }
    }
}
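
// Illustrative only (arbitrary choices): a non-hybrid configuration that mirrors the host CPU
// topology while leaving the remaining tuning knobs off would be built as
// `CpuConfigX86_64::new(false, true, false, false, false, None)`, i.e. only `host_cpu_topology`
// set.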

/// A CpuId Entry contains supported feature information for the given processor.
/// This can be modified by the hypervisor to pass additional information to the guest kernel
/// about the hypervisor or vm. Information is returned in the eax, ebx, ecx and edx registers
/// by the cpu for a given function and index/subfunction (passed into the cpu via the eax and ecx
/// registers, respectively).
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct CpuIdEntry {
    pub function: u32,
    pub index: u32,
    // flags is needed for KVM.  We store it on CpuIdEntry to preserve the flags across
    // get_supported_cpuids() -> kvm_cpuid2 -> CpuId -> kvm_cpuid2 -> set_cpuid().
    pub flags: u32,
    pub cpuid: CpuidResult,
}

/// A container for the list of cpu id entries for the hypervisor and underlying cpu.
pub struct CpuId {
    pub cpu_id_entries: Vec<CpuIdEntry>,
}

impl CpuId {
    /// Constructs a new CpuId, with space allocated for `initial_capacity` CpuIdEntries.
    pub fn new(initial_capacity: usize) -> Self {
        CpuId {
            cpu_id_entries: Vec::with_capacity(initial_capacity),
        }
    }
}

#[bitfield]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum DestinationMode {
    Physical = 0,
    Logical = 1,
}

#[bitfield]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TriggerMode {
    Edge = 0,
    Level = 1,
}

#[bitfield]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DeliveryMode {
    Fixed = 0b000,
    Lowest = 0b001,
    SMI = 0b010,        // System management interrupt
    RemoteRead = 0b011, // This is no longer supported by Intel.
    NMI = 0b100,        // Non maskable interrupt
    Init = 0b101,
    Startup = 0b110,
    External = 0b111,
}

// These MSI structures are for Intel's implementation of MSI.  The PCI spec defines most of MSI,
// but the Intel spec defines the format of messages for raising interrupts.  The PCI spec defines
// three u32s -- the address, address_high, and data -- but Intel only makes use of the address and
// data.  The Intel portion of the specification is in Volume 3 section 10.11.
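//
// As an illustrative example (values chosen arbitrarily): a fixed-delivery, physically addressed
// MSI with vector 0x31 aimed at destination ID 1 is encoded as address 0xfee01000 (0xfee in the
// top 12 bits, destination ID 1 in bits 19:12) and data 0x0031 (vector in the low byte, delivery
// mode 0b000 in bits 10:8).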
#[bitfield]
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct MsiAddressMessage {
    pub reserved: BitField2,
    #[bits = 1]
    pub destination_mode: DestinationMode,
    pub redirection_hint: BitField1,
    pub reserved_2: BitField8,
    pub destination_id: BitField8,
    // According to Intel's implementation of MSI, these bits must always be 0xfee.
    pub always_0xfee: BitField12,
}

#[bitfield]
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct MsiDataMessage {
    pub vector: BitField8,
    #[bits = 3]
    pub delivery_mode: DeliveryMode,
    pub reserved: BitField3,
    #[bits = 1]
    pub level: Level,
    #[bits = 1]
    pub trigger: TriggerMode,
    pub reserved2: BitField16,
}

#[bitfield]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DeliveryStatus {
    Idle = 0,
    Pending = 1,
}

/// The level of a level-triggered interrupt: asserted or deasserted.
#[bitfield]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Level {
    Deassert = 0,
    Assert = 1,
}

/// Represents an IOAPIC redirection table entry.
#[bitfield]
#[derive(Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct IoapicRedirectionTableEntry {
    vector: BitField8,
    #[bits = 3]
    delivery_mode: DeliveryMode,
    #[bits = 1]
    dest_mode: DestinationMode,
    #[bits = 1]
    delivery_status: DeliveryStatus,
    polarity: BitField1,
    remote_irr: bool,
    #[bits = 1]
    trigger_mode: TriggerMode,
    interrupt_mask: bool, // true iff interrupts are masked.
    reserved: BitField39,
    dest_id: BitField8,
}

/// Number of pins on the standard KVM/IOAPIC.
pub const NUM_IOAPIC_PINS: usize = 24;

/// Represents the state of the IOAPIC.
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct IoapicState {
    /// base_address is the memory base address for this IOAPIC. It cannot be changed.
    pub base_address: u64,
    /// ioregsel register. Used for selecting which entry of the redirect table to read/write.
    pub ioregsel: u8,
    /// ioapicid register. Bits 24 - 27 contain the APIC ID for this device.
    pub ioapicid: u32,
    /// current_interrupt_level_bitmap represents a bitmap of the state of all of the irq lines
    pub current_interrupt_level_bitmap: u32,
    /// redirect_table contains the irq settings for each irq line
    #[serde(
        serialize_with = "serialize_arr",
        deserialize_with = "deserialize_seq_to_arr"
    )]
    pub redirect_table: [IoapicRedirectionTableEntry; NUM_IOAPIC_PINS],
}

impl Default for IoapicState {
    fn default() -> IoapicState {
        // SAFETY: trivially safe
        unsafe { std::mem::zeroed() }
    }
}

#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PicSelect {
    Primary = 0,
    Secondary = 1,
}

#[repr(C)]
#[derive(enumn::N, Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
pub enum PicInitState {
    #[default]
    Icw1 = 0,
    Icw2 = 1,
    Icw3 = 2,
    Icw4 = 3,
}

/// Convenience implementation for converting from a u8
impl From<u8> for PicInitState {
    fn from(item: u8) -> Self {
        PicInitState::n(item).unwrap_or_else(|| {
            error!("Invalid PicInitState {}, setting to 0", item);
            PicInitState::Icw1
        })
    }
}

/// Represents the state of the PIC.
#[repr(C)]
#[derive(Clone, Copy, Default, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct PicState {
    /// Edge detection.
    pub last_irr: u8,
    /// Interrupt Request Register.
    pub irr: u8,
    /// Interrupt Mask Register.
    pub imr: u8,
    /// Interrupt Service Register.
    pub isr: u8,
    /// Highest priority, for priority rotation.
    pub priority_add: u8,
    pub irq_base: u8,
    pub read_reg_select: bool,
    pub poll: bool,
    pub special_mask: bool,
    pub init_state: PicInitState,
    pub auto_eoi: bool,
    pub rotate_on_auto_eoi: bool,
    pub special_fully_nested_mode: bool,
    /// PIC takes either 3 or 4 bytes of initialization command word during
    /// initialization. use_4_byte_icw is true if 4 bytes of ICW are needed.
    pub use_4_byte_icw: bool,
    /// "Edge/Level Control Registers", for edge trigger selection.
    /// When a particular bit is set, the corresponding IRQ is in level-triggered mode. Otherwise
    /// it is in edge-triggered mode.
    pub elcr: u8,
    pub elcr_mask: u8,
}

/// The LapicState represents the state of an x86 CPU's Local APIC.
/// The Local APIC consists of 64 128-bit registers, but only the first 32 bits of each register
/// can be used, so this structure only stores the first 32 bits of each register.
#[repr(C)]
#[derive(Clone, Copy, Serialize, Deserialize)]
pub struct LapicState {
    #[serde(
        serialize_with = "serialize_arr",
        deserialize_with = "deserialize_seq_to_arr"
    )]
    pub regs: [LapicRegister; 64],
}

pub type LapicRegister = u32;

// rust arrays longer than 32 need custom implementations of Debug
impl std::fmt::Debug for LapicState {
    fn fmt(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
        self.regs[..].fmt(formatter)
    }
}

// rust arrays longer than 32 need custom implementations of PartialEq
impl PartialEq for LapicState {
    fn eq(&self, other: &LapicState) -> bool {
        self.regs[..] == other.regs[..]
    }
}

// Lapic equality is reflexive, so we impl Eq
impl Eq for LapicState {}

/// The PitState represents the state of the PIT (aka the Programmable Interval Timer).
/// The state is simply the state of its three channels.
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct PitState {
    pub channels: [PitChannelState; 3],
    /// Hypervisor-specific flags for setting the pit state.
    pub flags: u32,
}

/// The PitRWMode enum represents the access mode of a PIT channel.
/// Reads and writes to the Pit happen over Port-mapped I/O, which happens one byte at a time,
/// but the count values and latch values are two bytes. So the access mode controls which of the
/// two bytes will be read when.
#[repr(C)]
#[derive(enumn::N, Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum PitRWMode {
    /// None mode means that no access mode has been set.
    None = 0,
    /// Least mode means all reads/writes will read/write the least significant byte.
    Least = 1,
    /// Most mode means all reads/writes will read/write the most significant byte.
    Most = 2,
    /// Both mode means first the least significant byte will be read/written, then the
    /// next read/write will read/write the most significant byte.
    Both = 3,
}

/// Convenience implementation for converting from a u8
impl From<u8> for PitRWMode {
    fn from(item: u8) -> Self {
        PitRWMode::n(item).unwrap_or_else(|| {
            error!("Invalid PitRWMode value {}, setting to 0", item);
            PitRWMode::None
        })
    }
}

/// The PitRWState enum represents the state of reading from or writing to a channel.
/// It is related to PitRWMode; it mainly gives more detail about the state of the channel
/// with respect to PitRWMode::Both.
#[repr(C)]
#[derive(enumn::N, Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum PitRWState {
    /// None mode means that no access mode has been set.
    None = 0,
    /// LSB means that the channel is in PitRWMode::Least access mode.
    LSB = 1,
    /// MSB means that the channel is in PitRWMode::Most access mode.
    MSB = 2,
    /// Word0 means that the channel is in PitRWMode::Both mode, and the least significant byte
    /// has not been read/written yet.
    Word0 = 3,
    /// Word1 means that the channel is in PitRWMode::Both mode and the least significant byte
    /// has already been read/written, and the next byte to be read/written will be the most
    /// significant byte.
    Word1 = 4,
}

/// Convenience implementation for converting from a u8
impl From<u8> for PitRWState {
    fn from(item: u8) -> Self {
        PitRWState::n(item).unwrap_or_else(|| {
            error!("Invalid PitRWState value {}, setting to 0", item);
            PitRWState::None
        })
    }
}

/// The PitChannelState represents the state of one of the PIT's three counters.
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct PitChannelState {
    /// The starting value for the counter.
    pub count: u32,
    /// Stores the channel count from the last time the count was latched.
    pub latched_count: u16,
    /// Indicates the PitRWState state of reading the latch value.
    pub count_latched: PitRWState,
    /// Indicates whether ReadBack status has been latched.
    pub status_latched: bool,
    /// Stores the channel status from the last time the status was latched. The status contains
    /// information about the access mode of this channel, but changing those bits in the status
    /// will not change the behavior of the pit.
    pub status: u8,
    /// Indicates the PitRWState state of reading the counter.
    pub read_state: PitRWState,
    /// Indicates the PitRWState state of writing the counter.
    pub write_state: PitRWState,
    /// Stores the value with which the counter was initialized. Counters are 16-
    /// bit values with an effective range of 1-65536 (65536 represented by 0).
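    /// For reference: with the PIT's fixed 1.193182 MHz input clock, a reload value of 0
    /// (65536 counts) corresponds to the classic ~54.9 ms period (~18.2 Hz).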
    pub reload_value: u16,
    /// The command access mode of this channel.
    pub rw_mode: PitRWMode,
    /// The operation mode of this channel.
    pub mode: u8,
    /// Whether or not we are in bcd mode. Not supported by KVM or crosvm's PIT implementation.
    pub bcd: bool,
    /// Value of the gate input pin. This only applies to channel 2.
    pub gate: bool,
    /// Nanosecond timestamp of when the count value was loaded.
    pub count_load_time: u64,
}

// Convenience constructors for IrqRoutes
impl IrqRoute {
    pub fn ioapic_irq_route(irq_num: u32) -> IrqRoute {
        IrqRoute {
            gsi: irq_num,
            source: IrqSource::Irqchip {
                chip: IrqSourceChip::Ioapic,
                pin: irq_num,
            },
        }
    }

    pub fn pic_irq_route(id: IrqSourceChip, irq_num: u32) -> IrqRoute {
        IrqRoute {
            gsi: irq_num,
            source: IrqSource::Irqchip {
                chip: id,
                pin: irq_num % 8,
            },
        }
    }
}
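
// Illustrative usage (hypothetical IRQ number): for legacy IRQ 10 on the secondary PIC,
// `pic_irq_route` maps GSI 10 to pin 2 (10 % 8) of the chip passed in `id`, while
// `ioapic_irq_route(10)` maps GSI 10 directly to IOAPIC pin 10.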

/// State of a VCPU's general purpose registers.
#[repr(C)]
#[derive(Debug, Copy, Clone, Serialize, Deserialize)]
pub struct Regs {
    pub rax: u64,
    pub rbx: u64,
    pub rcx: u64,
    pub rdx: u64,
    pub rsi: u64,
    pub rdi: u64,
    pub rsp: u64,
    pub rbp: u64,
    pub r8: u64,
    pub r9: u64,
    pub r10: u64,
    pub r11: u64,
    pub r12: u64,
    pub r13: u64,
    pub r14: u64,
    pub r15: u64,
    pub rip: u64,
    pub rflags: u64,
}

impl Default for Regs {
    fn default() -> Self {
        Regs {
            rax: 0,
            rbx: 0,
            rcx: 0,
            rdx: 0,
            rsi: 0,
            rdi: 0,
            rsp: 0,
            rbp: 0,
            r8: 0,
            r9: 0,
            r10: 0,
            r11: 0,
            r12: 0,
            r13: 0,
            r14: 0,
            r15: 0,
            rip: 0xfff0, // Reset vector.
            rflags: 0x2, // Bit 1 (0x2) is always 1.
        }
    }
}

/// State of a memory segment.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct Segment {
    pub base: u64,
    /// Limit of the segment - always in bytes, regardless of granularity (`g`) field.
    pub limit_bytes: u32,
    pub selector: u16,
    pub type_: u8,
    pub present: u8,
    pub dpl: u8,
    pub db: u8,
    pub s: u8,
    pub l: u8,
    pub g: u8,
    pub avl: u8,
}

/// State of a global descriptor table or interrupt descriptor table.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Serialize, Deserialize)]
pub struct DescriptorTable {
    pub base: u64,
    pub limit: u16,
}

/// State of a VCPU's special registers.
#[repr(C)]
#[derive(Debug, Copy, Clone, Serialize, Deserialize)]
pub struct Sregs {
    pub cs: Segment,
    pub ds: Segment,
    pub es: Segment,
    pub fs: Segment,
    pub gs: Segment,
    pub ss: Segment,
    pub tr: Segment,
    pub ldt: Segment,
    pub gdt: DescriptorTable,
    pub idt: DescriptorTable,
    pub cr0: u64,
    pub cr2: u64,
    pub cr3: u64,
    pub cr4: u64,
    pub cr8: u64,
    pub efer: u64,
}

impl Default for Sregs {
    fn default() -> Self {
        // Intel SDM Vol. 3A, 3.4.5.1 ("Code- and Data-Segment Descriptor Types")
        const SEG_TYPE_DATA: u8 = 0b0000;
        const SEG_TYPE_DATA_WRITABLE: u8 = 0b0010;

        const SEG_TYPE_CODE: u8 = 0b1000;
        const SEG_TYPE_CODE_READABLE: u8 = 0b0010;

        const SEG_TYPE_ACCESSED: u8 = 0b0001;

        // Intel SDM Vol. 3A, 3.4.5 ("Segment Descriptors")
        const SEG_S_SYSTEM: u8 = 0; // System segment.
        const SEG_S_CODE_OR_DATA: u8 = 1; // Data/code segment.

        // 16-bit real-mode code segment (reset vector).
        let code_seg = Segment {
            base: 0xffff0000,
            limit_bytes: 0xffff,
            selector: 0xf000,
            type_: SEG_TYPE_CODE | SEG_TYPE_CODE_READABLE | SEG_TYPE_ACCESSED, // 11
            present: 1,
            s: SEG_S_CODE_OR_DATA,
            ..Default::default()
        };

        // 16-bit real-mode data segment.
        let data_seg = Segment {
            base: 0,
            limit_bytes: 0xffff,
            selector: 0,
            type_: SEG_TYPE_DATA | SEG_TYPE_DATA_WRITABLE | SEG_TYPE_ACCESSED, // 3
            present: 1,
            s: SEG_S_CODE_OR_DATA,
            ..Default::default()
        };

        // 16-bit TSS segment.
        let task_seg = Segment {
            base: 0,
            limit_bytes: 0xffff,
            selector: 0,
            type_: SEG_TYPE_CODE | SEG_TYPE_CODE_READABLE | SEG_TYPE_ACCESSED, // 11
            present: 1,
            s: SEG_S_SYSTEM,
            ..Default::default()
        };

        // Local descriptor table.
        let ldt = Segment {
            base: 0,
            limit_bytes: 0xffff,
            selector: 0,
            type_: SEG_TYPE_DATA | SEG_TYPE_DATA_WRITABLE, // 2
            present: 1,
            s: SEG_S_SYSTEM,
            ..Default::default()
        };

        // Global descriptor table.
        let gdt = DescriptorTable {
            base: 0,
            limit: 0xffff,
        };

        // Interrupt descriptor table.
        let idt = DescriptorTable {
            base: 0,
            limit: 0xffff,
        };

        let cr0 = (1 << 4) // CR0.ET (reserved, always 1)
                | (1 << 30); // CR0.CD (cache disable)

        Sregs {
            cs: code_seg,
            ds: data_seg,
            es: data_seg,
            fs: data_seg,
            gs: data_seg,
            ss: data_seg,
            tr: task_seg,
            ldt,
            gdt,
            idt,
            cr0,
            cr2: 0,
            cr3: 0,
            cr4: 0,
            cr8: 0,
            efer: 0,
        }
    }
}
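
// Note: together with `Regs::default()` (rip = 0xfff0), the default code segment above
// (base = 0xffff0000) makes the first instruction fetch come from physical address 0xffff_fff0,
// the architectural x86 reset vector.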

/// x87 80-bit floating point value.
#[repr(C)]
#[derive(Copy, Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
pub struct FpuReg {
    /// 64-bit mantissa.
    pub significand: u64,

    /// 15-bit biased exponent and sign bit.
    pub sign_exp: u16,
}

impl FpuReg {
    /// Convert an array of 8x16-byte arrays to an array of 8 `FpuReg`.
    ///
    /// Ignores any data in the upper 6 bytes of each element; the values represent 80-bit FPU
    /// registers, so the upper 48 bits are unused.
    pub fn from_16byte_arrays(byte_arrays: &[[u8; 16]; 8]) -> [FpuReg; 8] {
        let mut regs = [FpuReg::default(); 8];
        for (dst, src) in regs.iter_mut().zip(byte_arrays.iter()) {
            let tbyte: [u8; 10] = src[0..10].try_into().unwrap();
            *dst = FpuReg::from(tbyte);
        }
        regs
    }

    /// Convert an array of 8 `FpuReg` into 8x16-byte arrays.
    pub fn to_16byte_arrays(regs: &[FpuReg; 8]) -> [[u8; 16]; 8] {
        let mut byte_arrays = [[0u8; 16]; 8];
        for (dst, src) in byte_arrays.iter_mut().zip(regs.iter()) {
            *dst = (*src).into();
        }
        byte_arrays
    }
}

impl From<[u8; 10]> for FpuReg {
    /// Construct a `FpuReg` from an 80-bit representation.
    fn from(value: [u8; 10]) -> FpuReg {
        // These array sub-slices can't fail, but there's no (safe) way to express that in Rust
        // without an `unwrap()`.
        let significand_bytes = value[0..8].try_into().unwrap();
        let significand = u64::from_le_bytes(significand_bytes);
        let sign_exp_bytes = value[8..10].try_into().unwrap();
        let sign_exp = u16::from_le_bytes(sign_exp_bytes);
        FpuReg {
            significand,
            sign_exp,
        }
    }
}

impl From<FpuReg> for [u8; 10] {
    /// Convert an `FpuReg` into its 80-bit "TBYTE" representation.
    fn from(value: FpuReg) -> [u8; 10] {
        let mut bytes = [0u8; 10];
        bytes[0..8].copy_from_slice(&value.significand.to_le_bytes());
        bytes[8..10].copy_from_slice(&value.sign_exp.to_le_bytes());
        bytes
    }
}

impl From<FpuReg> for [u8; 16] {
    /// Convert an `FpuReg` into its 80-bit representation plus 6 unused upper bytes.
    /// This is a convenience function for converting to hypervisor types.
    fn from(value: FpuReg) -> [u8; 16] {
        let mut bytes = [0u8; 16];
        bytes[0..8].copy_from_slice(&value.significand.to_le_bytes());
        bytes[8..10].copy_from_slice(&value.sign_exp.to_le_bytes());
        bytes
    }
}

/// State of a VCPU's floating point unit.
#[repr(C)]
#[derive(Debug, Copy, Clone, Serialize, Deserialize)]
pub struct Fpu {
    pub fpr: [FpuReg; 8],
    pub fcw: u16,
    pub fsw: u16,
    pub ftwx: u8,
    pub last_opcode: u16,
    pub last_ip: u64,
    pub last_dp: u64,
    pub xmm: [[u8; 16usize]; 16usize],
    pub mxcsr: u32,
}

impl Default for Fpu {
    fn default() -> Self {
        Fpu {
            fpr: Default::default(),
            fcw: 0x37f, // Intel SDM Vol. 1, 13.6
            fsw: 0,
            ftwx: 0,
            last_opcode: 0,
            last_ip: 0,
            last_dp: 0,
            xmm: Default::default(),
            mxcsr: 0x1f80, // Intel SDM Vol. 1, 11.6.4
        }
    }
}

/// State of a VCPU's debug registers.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Serialize, Deserialize)]
pub struct DebugRegs {
    pub db: [u64; 4usize],
    pub dr6: u64,
    pub dr7: u64,
}

/// The hybrid type for Intel hybrid CPUs.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum CpuHybridType {
    /// Intel Atom.
    Atom,
    /// Intel Core.
    Core,
}

/// State of the VCPU's x87 FPU, MMX, XMM, YMM registers.
/// May contain more state depending on enabled extensions.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Xsave {
    data: Vec<u32>,

    // Actual length in bytes. May be smaller than data if a non-u32 multiple of bytes is
    // requested.
    len: usize,
}

impl Xsave {
    /// Create a new buffer to store Xsave data.
    ///
    /// # Arguments
    /// * `len` size in bytes.
    pub fn new(len: usize) -> Self {
        Xsave {
            data: vec![0; len.div_ceil(4)],
            len,
        }
    }

    pub fn as_ptr(&self) -> *const c_void {
        self.data.as_ptr() as *const c_void
    }

    pub fn as_mut_ptr(&mut self) -> *mut c_void {
        self.data.as_mut_ptr() as *mut c_void
    }

    /// Length in bytes of the XSAVE data.
    pub fn len(&self) -> usize {
        self.len
    }

    /// Returns true if the length of the XSAVE data is zero.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }
}
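
// The tests below are an illustrative sketch added for documentation purposes: they exercise
// only the pure helpers defined in this file (FpuReg conversions and Xsave sizing) and do not
// touch a hypervisor.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn fpu_reg_tbyte_round_trip() {
        // 1.0 in x87 extended precision: biased exponent 0x3fff, explicit integer bit set.
        let one = FpuReg {
            significand: 0x8000_0000_0000_0000,
            sign_exp: 0x3fff,
        };
        let tbyte: [u8; 10] = one.into();
        assert_eq!(FpuReg::from(tbyte), one);

        // The 16-byte form keeps the same low 10 bytes and zero-fills the upper 6 bytes.
        let wide: [u8; 16] = one.into();
        assert_eq!(&wide[0..10], &tbyte[..]);
        assert_eq!(&wide[10..], &[0u8; 6]);
    }

    #[test]
    fn xsave_len_rounds_up_to_u32() {
        // A 10-byte request is backed by three u32 words but still reports len() == 10.
        let xsave = Xsave::new(10);
        assert_eq!(xsave.len(), 10);
        assert!(!xsave.is_empty());
        assert_eq!(Xsave::new(0).len(), 0);
        assert!(Xsave::new(0).is_empty());
    }
}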