// hypervisor/x86_64.rs

1// Copyright 2020 The ChromiumOS Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#[cfg(any(unix, feature = "haxm", feature = "whpx"))]
6use std::arch::x86_64::__cpuid;
7use std::arch::x86_64::_rdtsc;
8use std::arch::x86_64::CpuidResult;
9use std::collections::BTreeMap;
10use std::collections::HashSet;
11use std::sync::Arc;
12
13use anyhow::Context;
14use base::custom_serde::deserialize_seq_to_arr;
15use base::custom_serde::serialize_arr;
16use base::error;
17use base::warn;
18use base::Result;
19use bit_field::*;
20use libc::c_void;
21use serde::Deserialize;
22use serde::Serialize;
23use snapshot::AnySnapshot;
24use vm_memory::GuestAddress;
25
26use crate::Hypervisor;
27use crate::IrqRoute;
28use crate::IrqSource;
29use crate::IrqSourceChip;
30use crate::Vcpu;
31use crate::Vm;
32
// AMD Family 15h performance event select (CTL) and counter (CTR) MSRs, plus
// IA32_PERF_CAPABILITIES. These indices are allow-listed during vCPU restore
// because older host kernels may reject writes to them (see `VcpuX86_64::restore`).
const MSR_F15H_PERF_CTL0: u32 = 0xc0010200;
const MSR_F15H_PERF_CTL1: u32 = 0xc0010202;
const MSR_F15H_PERF_CTL2: u32 = 0xc0010204;
const MSR_F15H_PERF_CTL3: u32 = 0xc0010206;
const MSR_F15H_PERF_CTL4: u32 = 0xc0010208;
const MSR_F15H_PERF_CTL5: u32 = 0xc001020a;
const MSR_F15H_PERF_CTR0: u32 = 0xc0010201;
const MSR_F15H_PERF_CTR1: u32 = 0xc0010203;
const MSR_F15H_PERF_CTR2: u32 = 0xc0010205;
const MSR_F15H_PERF_CTR3: u32 = 0xc0010207;
const MSR_F15H_PERF_CTR4: u32 = 0xc0010209;
const MSR_F15H_PERF_CTR5: u32 = 0xc001020b;
const MSR_IA32_PERF_CAPABILITIES: u32 = 0x00000345;
46
/// A trait for managing cpuids for an x86_64 hypervisor and for checking its capabilities.
pub trait HypervisorX86_64: Hypervisor {
    /// Get the system supported CPUID values.
    fn get_supported_cpuid(&self) -> Result<CpuId>;

    /// Gets the list of supported MSRs.
    ///
    /// Returns the MSR indices supported by this hypervisor backend.
    fn get_msr_index_list(&self) -> Result<Vec<u32>>;
}
55
/// A wrapper for using a VM on x86_64 and getting/setting its state.
pub trait VmX86_64: Vm {
    /// Gets the `HypervisorX86_64` that created this VM.
    fn get_hypervisor(&self) -> &dyn HypervisorX86_64;

    /// Create a Vcpu with the specified Vcpu ID.
    fn create_vcpu(&self, id: usize) -> Result<Arc<dyn VcpuX86_64>>;

    /// Sets the address of the three-page region in the VM's address space.
    ///
    /// NOTE(review): presumably the TSS region given the method name — confirm against
    /// the hypervisor backend documentation.
    fn set_tss_addr(&self, addr: GuestAddress) -> Result<()>;

    /// Sets the address of a one-page region in the VM's address space.
    fn set_identity_map_addr(&self, addr: GuestAddress) -> Result<()>;

    /// Load pVM firmware for the VM, creating a memslot for it as needed.
    ///
    /// Only works on protected VMs (i.e. those with vm_type == KVM_X86_PKVM_PROTECTED_VM).
    fn load_protected_vm_firmware(&self, fw_addr: GuestAddress, fw_max_size: u64) -> Result<()>;
}
75
76/// A wrapper around creating and using a VCPU on x86_64.
77pub trait VcpuX86_64: Vcpu {
78    /// Sets or clears the flag that requests the VCPU to exit when it becomes possible to inject
79    /// interrupts into the guest.
80    fn set_interrupt_window_requested(&self, requested: bool);
81
82    /// Checks if we can inject an interrupt into the VCPU.
83    fn ready_for_interrupt(&self) -> bool;
84
85    /// Injects interrupt vector `irq` into the VCPU.
86    ///
87    /// This function should only be called when [`Self::ready_for_interrupt`] returns true.
88    /// Otherwise the interrupt injection may fail or the next VCPU run may fail. However, if
89    /// [`Self::interrupt`] returns [`Ok`], the implementation must guarantee that the interrupt
90    /// isn't injected in an uninterruptible window (e.g. right after the mov ss instruction).
91    ///
92    /// The caller should avoid calling this function more than 1 time for one VMEXIT, because the
93    /// hypervisor may behave differently: some hypervisors(e.g. WHPX, KVM) will only try to inject
94    /// the last `irq` requested, while some other hypervisors(e.g. HAXM) may try to inject all
95    /// `irq`s requested.
96    fn interrupt(&self, irq: u8) -> Result<()>;
97
98    /// Injects a non-maskable interrupt into the VCPU.
99    fn inject_nmi(&self) -> Result<()>;
100
101    /// Gets the VCPU general purpose registers.
102    fn get_regs(&self) -> Result<Regs>;
103
104    /// Sets the VCPU general purpose registers.
105    fn set_regs(&self, regs: &Regs) -> Result<()>;
106
107    /// Gets the VCPU special registers.
108    fn get_sregs(&self) -> Result<Sregs>;
109
110    /// Sets the VCPU special registers.
111    fn set_sregs(&self, sregs: &Sregs) -> Result<()>;
112
113    /// Gets the VCPU FPU registers.
114    fn get_fpu(&self) -> Result<Fpu>;
115
116    /// Sets the VCPU FPU registers.
117    fn set_fpu(&self, fpu: &Fpu) -> Result<()>;
118
119    /// Gets the VCPU debug registers.
120    fn get_debugregs(&self) -> Result<DebugRegs>;
121
122    /// Sets the VCPU debug registers.
123    fn set_debugregs(&self, debugregs: &DebugRegs) -> Result<()>;
124
125    /// Gets the VCPU extended control registers.
126    fn get_xcrs(&self) -> Result<BTreeMap<u32, u64>>;
127
128    /// Sets a VCPU extended control register.
129    fn set_xcr(&self, xcr: u32, value: u64) -> Result<()>;
130
131    /// Gets the VCPU x87 FPU, MMX, XMM, YMM and MXCSR registers.
132    fn get_xsave(&self) -> Result<Xsave>;
133
134    /// Sets the VCPU x87 FPU, MMX, XMM, YMM and MXCSR registers.
135    fn set_xsave(&self, xsave: &Xsave) -> Result<()>;
136
137    /// Gets hypervisor specific state for this VCPU that must be
138    /// saved/restored for snapshotting.
139    /// This state is fetched after VCPUs are frozen and interrupts are flushed.
140    fn get_hypervisor_specific_state(&self) -> Result<AnySnapshot>;
141
142    /// Sets hypervisor specific state for this VCPU. Only used for
143    /// snapshotting.
144    fn set_hypervisor_specific_state(&self, data: AnySnapshot) -> Result<()>;
145
146    /// Gets a single model-specific register's value.
147    fn get_msr(&self, msr_index: u32) -> Result<u64>;
148
149    /// Gets the model-specific registers. Returns all the MSRs for the VCPU.
150    fn get_all_msrs(&self) -> Result<BTreeMap<u32, u64>>;
151
152    /// Sets a single model-specific register's value.
153    fn set_msr(&self, msr_index: u32, value: u64) -> Result<()>;
154
155    /// Sets up the data returned by the CPUID instruction.
156    fn set_cpuid(&self, cpuid: &CpuId) -> Result<()>;
157
158    /// Sets up debug registers and configure vcpu for handling guest debug events.
159    fn set_guest_debug(&self, addrs: &[GuestAddress], enable_singlestep: bool) -> Result<()>;
160
161    /// This function should be called after `Vcpu::run` returns `VcpuExit::Cpuid`, and `entry`
162    /// should represent the result of emulating the CPUID instruction. The `handle_cpuid` function
163    /// will then set the appropriate registers on the vcpu.
164    fn handle_cpuid(&self, entry: &CpuIdEntry) -> Result<()>;
165
166    /// Gets the guest->host TSC offset.
167    ///
168    /// The default implementation uses [`VcpuX86_64::get_msr()`] to read the guest TSC.
169    fn get_tsc_offset(&self) -> Result<u64> {
170        // SAFETY:
171        // Safe because _rdtsc takes no arguments
172        let host_before_tsc = unsafe { _rdtsc() };
173
174        // get guest TSC value from our hypervisor
175        let guest_tsc = self.get_msr(crate::MSR_IA32_TSC)?;
176
177        // SAFETY:
178        // Safe because _rdtsc takes no arguments
179        let host_after_tsc = unsafe { _rdtsc() };
180
181        // Average the before and after host tsc to get the best value
182        let host_tsc = ((host_before_tsc as u128 + host_after_tsc as u128) / 2) as u64;
183
184        Ok(guest_tsc.wrapping_sub(host_tsc))
185    }
186
187    /// Sets the guest->host TSC offset.
188    ///
189    /// The default implementation uses [`VcpuX86_64::set_tsc_value()`] to set the TSC value.
190    ///
191    /// It sets TSC_OFFSET (VMCS / CB field) by setting the TSC MSR to the current
192    /// host TSC value plus the desired offset. We rely on the fact that hypervisors
193    /// determine the value of TSC_OFFSET by computing TSC_OFFSET = `new_tsc_value - _rdtsc()` =
194    /// `_rdtsc() + offset - _rdtsc()` ~= `offset`. Note that the ~= is important: this is an
195    /// approximate operation, because the two _rdtsc() calls
196    /// are separated by at least a few ticks.
197    ///
198    /// Note: TSC_OFFSET, host TSC, guest TSC, and TSC MSR are all different
199    /// concepts.
200    /// * When a guest executes rdtsc, the value (guest TSC) returned is host_tsc * TSC_MULTIPLIER +
201    ///   TSC_OFFSET + TSC_ADJUST.
202    /// * The TSC MSR is a special MSR that when written to by the host, will cause TSC_OFFSET to be
203    ///   set accordingly by the hypervisor.
204    /// * When the guest *writes* to TSC MSR, it actually changes the TSC_ADJUST MSR *for the
205    ///   guest*. Generally this is only happens if the guest is trying to re-zero or synchronize
206    ///   TSCs.
207    fn set_tsc_offset(&self, offset: u64) -> Result<()> {
208        // SAFETY: _rdtsc takes no arguments.
209        let host_tsc = unsafe { _rdtsc() };
210        self.set_tsc_value(host_tsc.wrapping_add(offset))
211    }
212
213    /// Sets the guest TSC exactly to the provided value.
214    ///
215    /// The default implementation sets the guest's TSC by writing the value to the MSR directly.
216    ///
217    /// See [`VcpuX86_64::set_tsc_offset()`] for an explanation of how this value is actually read
218    /// by the guest after being set.
219    fn set_tsc_value(&self, value: u64) -> Result<()> {
220        self.set_msr(crate::MSR_IA32_TSC, value)
221    }
222
223    /// Some hypervisors require special handling to restore timekeeping when
224    /// a snapshot is restored. They are provided with a host TSC reference
225    /// moment, guaranteed to be the same across all Vcpus, and the Vcpu's TSC
226    /// offset at the moment it was snapshotted.
227    fn restore_timekeeping(&self, host_tsc_reference_moment: u64, tsc_offset: u64) -> Result<()>;
228
229    /// Snapshot vCPU state
230    fn snapshot(&self) -> anyhow::Result<VcpuSnapshot> {
231        Ok(VcpuSnapshot {
232            vcpu_id: self.id(),
233            regs: self.get_regs()?,
234            sregs: self.get_sregs()?,
235            debug_regs: self.get_debugregs()?,
236            xcrs: self.get_xcrs()?,
237            msrs: self.get_all_msrs()?,
238            xsave: self.get_xsave()?,
239            hypervisor_data: self.get_hypervisor_specific_state()?,
240            tsc_offset: self.get_tsc_offset()?,
241        })
242    }
243
244    fn restore(
245        &self,
246        snapshot: &VcpuSnapshot,
247        host_tsc_reference_moment: u64,
248    ) -> anyhow::Result<()> {
249        // List of MSRs that may fail to restore due to lack of support in the host kernel.
250        // Some hosts are may be running older kernels which do not support all MSRs, but
251        // get_all_msrs will still fetch the MSRs supported by the CPU. Trying to set those MSRs
252        // will result in failures, so they will throw a warning instead.
253        let msr_allowlist = HashSet::from([
254            MSR_F15H_PERF_CTL0,
255            MSR_F15H_PERF_CTL1,
256            MSR_F15H_PERF_CTL2,
257            MSR_F15H_PERF_CTL3,
258            MSR_F15H_PERF_CTL4,
259            MSR_F15H_PERF_CTL5,
260            MSR_F15H_PERF_CTR0,
261            MSR_F15H_PERF_CTR1,
262            MSR_F15H_PERF_CTR2,
263            MSR_F15H_PERF_CTR3,
264            MSR_F15H_PERF_CTR4,
265            MSR_F15H_PERF_CTR5,
266            MSR_IA32_PERF_CAPABILITIES,
267        ]);
268        assert_eq!(snapshot.vcpu_id, self.id());
269        self.set_regs(&snapshot.regs)?;
270        self.set_sregs(&snapshot.sregs)?;
271        self.set_debugregs(&snapshot.debug_regs)?;
272        for (xcr_index, value) in &snapshot.xcrs {
273            self.set_xcr(*xcr_index, *value)?;
274        }
275
276        for (msr_index, value) in snapshot.msrs.iter() {
277            if self.get_msr(*msr_index) == Ok(*value) {
278                continue; // no need to set MSR since the values are the same.
279            }
280            if let Err(e) = self.set_msr(*msr_index, *value) {
281                if msr_allowlist.contains(msr_index) {
282                    warn!(
283                        "Failed to set MSR. MSR might not be supported in this kernel. Err: {}",
284                        e
285                    );
286                } else {
287                    return Err(e).context(
288                        "Failed to set MSR. MSR might not be supported by the CPU or by the kernel,
289                         and was not allow-listed.",
290                    );
291                }
292            };
293        }
294        self.set_xsave(&snapshot.xsave)?;
295        self.set_hypervisor_specific_state(snapshot.hypervisor_data.clone())?;
296        self.restore_timekeeping(host_tsc_reference_moment, snapshot.tsc_offset)?;
297        Ok(())
298    }
299}
300
/// x86 specific vCPU snapshot.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct VcpuSnapshot {
    /// ID of the vCPU this snapshot was taken from.
    pub vcpu_id: usize,
    // General purpose registers.
    regs: Regs,
    // Special registers (segments, control registers, descriptor tables).
    sregs: Sregs,
    // Debug registers.
    debug_regs: DebugRegs,
    // Extended control registers, keyed by XCR index.
    xcrs: BTreeMap<u32, u64>,
    // Model-specific registers, keyed by MSR index.
    msrs: BTreeMap<u32, u64>,
    // x87 FPU / MMX / XMM / YMM / MXCSR state.
    xsave: Xsave,
    // Opaque hypervisor-specific state (see `get_hypervisor_specific_state`).
    hypervisor_data: AnySnapshot,
    // Guest->host TSC offset at the moment of snapshotting.
    tsc_offset: u64,
}
314
/// MSR index of IA32_TSC (the time stamp counter MSR).
pub const MSR_IA32_TSC: u32 = 0x00000010;
317
/// Gets host cpu max physical address bits.
#[cfg(any(unix, feature = "haxm", feature = "whpx"))]
pub(crate) fn host_phys_addr_bits() -> u8 {
    // SAFETY: executing CPUID only reads CPU identification state.
    let max_ext_leaf = unsafe { __cpuid(0x80000000) }.eax;
    if max_ext_leaf < 0x80000008 {
        // Leaf 0x80000008 is unavailable; fall back to 36 bits.
        36
    } else {
        // SAFETY: leaf 0x80000008 was just verified to be supported.
        let leaf = unsafe { __cpuid(0x80000008) };
        // Bits 7:0 of EAX report the physical address size in bits.
        (leaf.eax & 0xff) as u8
    }
}
332
/// Initial state for x86_64 VCPUs.
#[derive(Clone, Default)]
pub struct VcpuInitX86_64 {
    /// General-purpose registers.
    pub regs: Regs,

    /// Special registers.
    pub sregs: Sregs,

    /// Floating-point registers.
    pub fpu: Fpu,

    /// Model-specific registers, keyed by MSR index.
    pub msrs: BTreeMap<u32, u64>,
}
348
/// Hold the CPU feature configurations that are needed to setup a vCPU.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CpuConfigX86_64 {
    /// Whether to force using a calibrated TSC leaf (0x15).
    pub force_calibrated_tsc_leaf: bool,

    /// Whether to mirror the host CPU topology into the guest.
    pub host_cpu_topology: bool,

    /// Whether to expose the HWP feature to the guest.
    pub enable_hwp: bool,

    /// Whether to disable SMT (Simultaneous Multithreading).
    pub no_smt: bool,

    /// Whether to enable the ITMT scheduler.
    pub itmt: bool,

    /// Hybrid CPU type to expose, if any.
    pub hybrid_type: Option<CpuHybridType>,
}
370
371impl CpuConfigX86_64 {
372    pub fn new(
373        force_calibrated_tsc_leaf: bool,
374        host_cpu_topology: bool,
375        enable_hwp: bool,
376        no_smt: bool,
377        itmt: bool,
378        hybrid_type: Option<CpuHybridType>,
379    ) -> Self {
380        CpuConfigX86_64 {
381            force_calibrated_tsc_leaf,
382            host_cpu_topology,
383            enable_hwp,
384            no_smt,
385            itmt,
386            hybrid_type,
387        }
388    }
389}
390
/// A CpuId Entry contains supported feature information for the given processor.
/// This can be modified by the hypervisor to pass additional information to the guest kernel
/// about the hypervisor or vm. Information is returned in the eax, ebx, ecx and edx registers
/// by the cpu for a given function and index/subfunction (passed into the cpu via the eax and ecx
/// register respectively).
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct CpuIdEntry {
    /// CPUID leaf (the value placed in eax before executing CPUID).
    pub function: u32,
    /// CPUID subleaf (the value placed in ecx before executing CPUID).
    pub index: u32,
    // flags is needed for KVM.  We store it on CpuIdEntry to preserve the flags across
    // get_supported_cpuids() -> kvm_cpuid2 -> CpuId -> kvm_cpuid2 -> set_cpuid().
    pub flags: u32,
    /// The eax/ebx/ecx/edx values the CPU returns for this leaf/subleaf.
    pub cpuid: CpuidResult,
}
406
/// A container for the list of cpu id entries for the hypervisor and underlying cpu.
pub struct CpuId {
    /// The CPUID entries, one per (function, index) pair.
    pub cpu_id_entries: Vec<CpuIdEntry>,
}
411
412impl CpuId {
413    /// Constructs a new CpuId, with space allocated for `initial_capacity` CpuIdEntries.
414    pub fn new(initial_capacity: usize) -> Self {
415        CpuId {
416            cpu_id_entries: Vec::with_capacity(initial_capacity),
417        }
418    }
419}
420
/// Interrupt destination mode: physical APIC ID or logical destination.
#[bitfield]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum DestinationMode {
    Physical = 0,
    Logical = 1,
}
427
/// Interrupt trigger mode: edge- or level-triggered.
#[bitfield]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TriggerMode {
    Edge = 0,
    Level = 1,
}
434
/// Interrupt delivery modes, as encoded in MSI data and IOAPIC redirection entries.
#[bitfield]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DeliveryMode {
    Fixed = 0b000,
    Lowest = 0b001,
    SMI = 0b010,        // System management interrupt
    RemoteRead = 0b011, // This is no longer supported by intel.
    NMI = 0b100,        // Non maskable interrupt
    Init = 0b101,
    Startup = 0b110,
    External = 0b111,
}
447
// These MSI structures are for Intel's implementation of MSI.  The PCI spec defines most of MSI,
// but the Intel spec defines the format of messages for raising interrupts.  The PCI spec defines
// three u32s -- the address, address_high, and data -- but Intel only makes use of the address and
// data.  The Intel portion of the specification is in Volume 3 section 10.11.
/// The address portion of an MSI message, laid out per Intel's MSI address format.
#[bitfield]
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct MsiAddressMessage {
    pub reserved: BitField2,
    #[bits = 1]
    pub destination_mode: DestinationMode,
    pub redirection_hint: BitField1,
    pub reserved_2: BitField8,
    pub destination_id: BitField8,
    // According to Intel's implementation of MSI, these bits must always be 0xfee.
    pub always_0xfee: BitField12,
}
464
/// The data portion of an MSI message: interrupt vector plus delivery attributes.
#[bitfield]
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct MsiDataMessage {
    pub vector: BitField8,
    #[bits = 3]
    pub delivery_mode: DeliveryMode,
    pub reserved: BitField3,
    #[bits = 1]
    pub level: Level,
    #[bits = 1]
    pub trigger: TriggerMode,
    pub reserved2: BitField16,
}
478
/// Delivery status of an interrupt: idle, or send pending.
#[bitfield]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DeliveryStatus {
    Idle = 0,
    Pending = 1,
}
485
/// The level of a level-triggered interrupt: asserted or deasserted.
#[bitfield]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Level {
    Deassert = 0,
    Assert = 1,
}
493
/// Represents a IOAPIC redirection table entry.
#[bitfield]
#[derive(Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct IoapicRedirectionTableEntry {
    // Interrupt vector to deliver.
    vector: BitField8,
    #[bits = 3]
    delivery_mode: DeliveryMode,
    #[bits = 1]
    dest_mode: DestinationMode,
    #[bits = 1]
    delivery_status: DeliveryStatus,
    // Pin polarity bit.
    polarity: BitField1,
    // Remote IRR: set while a level-triggered interrupt is being serviced.
    remote_irr: bool,
    #[bits = 1]
    trigger_mode: TriggerMode,
    interrupt_mask: bool, // true iff interrupts are masked.
    reserved: BitField39,
    // Destination APIC ID (or logical destination, per dest_mode).
    dest_id: BitField8,
}
513
514/// Number of pins on the standard KVM/IOAPIC.
515pub const NUM_IOAPIC_PINS: usize = 24;
516
/// Represents the state of the IOAPIC.
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct IoapicState {
    /// base_address is the memory base address for this IOAPIC. It cannot be changed.
    pub base_address: u64,
    /// ioregsel register. Used for selecting which entry of the redirect table to read/write.
    pub ioregsel: u8,
    /// ioapicid register. Bits 24 - 27 contain the APIC ID for this device.
    pub ioapicid: u32,
    /// current_interrupt_level_bitmap represents a bitmap of the state of all of the irq lines
    pub current_interrupt_level_bitmap: u32,
    /// redirect_table contains the irq settings for each irq line
    // Custom (de)serializers are needed for the fixed-size array field.
    #[serde(
        serialize_with = "serialize_arr",
        deserialize_with = "deserialize_seq_to_arr"
    )]
    pub redirect_table: [IoapicRedirectionTableEntry; NUM_IOAPIC_PINS],
}
536
impl Default for IoapicState {
    fn default() -> IoapicState {
        // SAFETY: trivially safe
        // NOTE(review): zeroed() presumably relies on every field (integers and the
        // bitfield-backed redirect table) treating an all-zero bit pattern as valid —
        // confirm if new fields are added to IoapicState.
        unsafe { std::mem::zeroed() }
    }
}
543
/// Selects one of the two PICs: primary or secondary.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PicSelect {
    Primary = 0,
    Secondary = 1,
}
550
/// Stage of the PIC initialization command word (ICW) sequence.
#[repr(C)]
#[derive(enumn::N, Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
pub enum PicInitState {
    #[default]
    Icw1 = 0,
    Icw2 = 1,
    Icw3 = 2,
    Icw4 = 3,
}
560
561/// Convenience implementation for converting from a u8
562impl From<u8> for PicInitState {
563    fn from(item: u8) -> Self {
564        PicInitState::n(item).unwrap_or_else(|| {
565            error!("Invalid PicInitState {}, setting to 0", item);
566            PicInitState::Icw1
567        })
568    }
569}
570
/// Represents the state of the PIC.
#[repr(C)]
#[derive(Clone, Copy, Default, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct PicState {
    /// Edge detection.
    pub last_irr: u8,
    /// Interrupt Request Register.
    pub irr: u8,
    /// Interrupt Mask Register.
    pub imr: u8,
    /// Interrupt Service Register.
    pub isr: u8,
    /// Highest priority, for priority rotation.
    pub priority_add: u8,
    /// Vector base added to the IRQ number when delivering interrupts.
    pub irq_base: u8,
    // Selects which register a subsequent read returns — TODO confirm exact semantics.
    pub read_reg_select: bool,
    // Poll-mode flag — presumably set by the poll command; verify against the PIC model.
    pub poll: bool,
    // Special mask mode flag.
    pub special_mask: bool,
    /// Current stage of the ICW initialization sequence.
    pub init_state: PicInitState,
    /// Automatic EOI mode.
    pub auto_eoi: bool,
    /// Rotate priorities on automatic EOI.
    pub rotate_on_auto_eoi: bool,
    /// Special fully nested mode.
    pub special_fully_nested_mode: bool,
    /// PIC takes either 3 or 4 bytes of initialization command word during
    /// initialization. use_4_byte_icw is true if 4 bytes of ICW are needed.
    pub use_4_byte_icw: bool,
    /// "Edge/Level Control Registers", for edge trigger selection.
    /// When a particular bit is set, the corresponding IRQ is in level-triggered mode. Otherwise
    /// it is in edge-triggered mode.
    pub elcr: u8,
    /// Mask of which elcr bits are writable.
    pub elcr_mask: u8,
}
602
/// The LapicState represents the state of an x86 CPU's Local APIC.
/// The Local APIC consists of 64 128-bit registers, but only the first 32-bits of each register
/// can be used, so this structure only stores the first 32-bits of each register.
#[repr(C)]
#[derive(Clone, Copy, Serialize, Deserialize)]
pub struct LapicState {
    // Custom (de)serializers are needed for the fixed-size 64-element array.
    #[serde(
        serialize_with = "serialize_arr",
        deserialize_with = "deserialize_seq_to_arr"
    )]
    pub regs: [LapicRegister; 64],
}
615
/// The usable low 32 bits of a single Local APIC register.
pub type LapicRegister = u32;
617
618// rust arrays longer than 32 need custom implementations of Debug
619impl std::fmt::Debug for LapicState {
620    fn fmt(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
621        self.regs[..].fmt(formatter)
622    }
623}
624
625// rust arrays longer than 32 need custom implementations of PartialEq
626impl PartialEq for LapicState {
627    fn eq(&self, other: &LapicState) -> bool {
628        self.regs[..] == other.regs[..]
629    }
630}
631
632// Lapic equality is reflexive, so we impl Eq
633impl Eq for LapicState {}
634
/// The PitState represents the state of the PIT (aka the Programmable Interval Timer).
/// The state is simply the state of its three channels.
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct PitState {
    /// Per-channel state for the PIT's three counters.
    pub channels: [PitChannelState; 3],
    /// Hypervisor-specific flags for setting the pit state.
    pub flags: u32,
}
644
/// The PitRWMode enum represents the access mode of a PIT channel.
/// Reads and writes to the Pit happen over Port-mapped I/O, which happens one byte at a time,
/// but the count values and latch values are two bytes. So the access mode controls which of the
/// two bytes will be read when.
#[repr(C)]
#[derive(enumn::N, Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum PitRWMode {
    /// None mode means that no access mode has been set.
    None = 0,
    /// Least mode means all reads/writes will read/write the least significant byte.
    Least = 1,
    /// Most mode means all reads/writes will read/write the most significant byte.
    Most = 2,
    /// Both mode means first the least significant byte will be read/written, then the
    /// next read/write will read/write the most significant byte.
    Both = 3,
}
662
663/// Convenience implementation for converting from a u8
664impl From<u8> for PitRWMode {
665    fn from(item: u8) -> Self {
666        PitRWMode::n(item).unwrap_or_else(|| {
667            error!("Invalid PitRWMode value {}, setting to 0", item);
668            PitRWMode::None
669        })
670    }
671}
672
/// The PitRWState enum represents the state of reading to or writing from a channel.
/// This is related to the PitRWMode, it mainly gives more detail about the state of the channel
/// with respect to PitRWMode::Both.
#[repr(C)]
#[derive(enumn::N, Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum PitRWState {
    /// None mode means that no access mode has been set.
    None = 0,
    /// LSB means that the channel is in PitRWMode::Least access mode.
    LSB = 1,
    /// MSB means that the channel is in PitRWMode::Most access mode.
    MSB = 2,
    /// Word0 means that the channel is in PitRWMode::Both mode, and the least significant byte
    /// has not been read/written yet.
    Word0 = 3,
    /// Word1 means that the channel is in PitRWMode::Both mode and the least significant byte
    /// has already been read/written, and the next byte to be read/written will be the most
    /// significant byte.
    Word1 = 4,
}
693
694/// Convenience implementation for converting from a u8
695impl From<u8> for PitRWState {
696    fn from(item: u8) -> Self {
697        PitRWState::n(item).unwrap_or_else(|| {
698            error!("Invalid PitRWState value {}, setting to 0", item);
699            PitRWState::None
700        })
701    }
702}
703
/// The PitChannelState represents the state of one of the PIT's three counters.
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct PitChannelState {
    /// The starting value for the counter.
    pub count: u32,
    /// Stores the channel count from the last time the count was latched.
    pub latched_count: u16,
    /// Indicates the PitRWState state of reading the latch value.
    pub count_latched: PitRWState,
    /// Indicates whether ReadBack status has been latched.
    pub status_latched: bool,
    /// Stores the channel status from the last time the status was latched. The status contains
    /// information about the access mode of this channel, but changing those bits in the status
    /// will not change the behavior of the pit.
    pub status: u8,
    /// Indicates the PitRWState state of reading the counter.
    pub read_state: PitRWState,
    /// Indicates the PitRWState state of writing the counter.
    pub write_state: PitRWState,
    /// Stores the value with which the counter was initialized. Counters are 16-
    /// bit values with an effective range of 1-65536 (65536 represented by 0).
    pub reload_value: u16,
    /// The command access mode of this channel.
    pub rw_mode: PitRWMode,
    /// The operation mode of this channel.
    pub mode: u8,
    /// Whether or not we are in bcd mode. Not supported by KVM or crosvm's PIT implementation.
    pub bcd: bool,
    /// Value of the gate input pin. This only applies to channel 2.
    pub gate: bool,
    /// Nanosecond timestamp of when the count value was loaded.
    pub count_load_time: u64,
}
738
739// Convenience constructors for IrqRoutes
740impl IrqRoute {
741    pub fn ioapic_irq_route(irq_num: u32) -> IrqRoute {
742        IrqRoute {
743            gsi: irq_num,
744            source: IrqSource::Irqchip {
745                chip: IrqSourceChip::Ioapic,
746                pin: irq_num,
747            },
748        }
749    }
750
751    pub fn pic_irq_route(id: IrqSourceChip, irq_num: u32) -> IrqRoute {
752        IrqRoute {
753            gsi: irq_num,
754            source: IrqSource::Irqchip {
755                chip: id,
756                pin: irq_num % 8,
757            },
758        }
759    }
760}
761
/// State of a VCPU's general purpose registers.
#[repr(C)]
#[derive(Debug, Copy, Clone, Serialize, Deserialize)]
pub struct Regs {
    pub rax: u64,
    pub rbx: u64,
    pub rcx: u64,
    pub rdx: u64,
    pub rsi: u64,
    pub rdi: u64,
    pub rsp: u64,
    pub rbp: u64,
    pub r8: u64,
    pub r9: u64,
    pub r10: u64,
    pub r11: u64,
    pub r12: u64,
    pub r13: u64,
    pub r14: u64,
    pub r15: u64,
    /// Instruction pointer.
    pub rip: u64,
    /// Flags register.
    pub rflags: u64,
}
785
786impl Default for Regs {
787    fn default() -> Self {
788        Regs {
789            rax: 0,
790            rbx: 0,
791            rcx: 0,
792            rdx: 0,
793            rsi: 0,
794            rdi: 0,
795            rsp: 0,
796            rbp: 0,
797            r8: 0,
798            r9: 0,
799            r10: 0,
800            r11: 0,
801            r12: 0,
802            r13: 0,
803            r14: 0,
804            r15: 0,
805            rip: 0xfff0, // Reset vector.
806            rflags: 0x2, // Bit 1 (0x2) is always 1.
807        }
808    }
809}
810
/// State of a memory segment.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct Segment {
    /// Segment base address.
    pub base: u64,
    /// Limit of the segment - always in bytes, regardless of granularity (`g`) field.
    pub limit_bytes: u32,
    /// Segment selector.
    pub selector: u16,
    /// Descriptor type bits (code/data/accessed; see Intel SDM Vol. 3A, 3.4.5.1).
    pub type_: u8,
    /// Present bit.
    pub present: u8,
    /// Descriptor privilege level.
    pub dpl: u8,
    /// Default operation size bit (D/B).
    pub db: u8,
    /// Descriptor type flag: 0 = system segment, 1 = code/data segment.
    pub s: u8,
    /// 64-bit code segment bit (L).
    pub l: u8,
    /// Granularity bit; `limit_bytes` above is always in bytes regardless of this value.
    pub g: u8,
    /// Available-for-software bit.
    pub avl: u8,
}
828
/// State of a global descriptor table or interrupt descriptor table.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Serialize, Deserialize)]
pub struct DescriptorTable {
    /// Linear base address of the table.
    pub base: u64,
    /// Table limit (size in bytes minus one).
    pub limit: u16,
}
836
/// State of a VCPU's special registers.
#[repr(C)]
#[derive(Debug, Copy, Clone, Serialize, Deserialize)]
pub struct Sregs {
    // Segment registers.
    pub cs: Segment,
    pub ds: Segment,
    pub es: Segment,
    pub fs: Segment,
    pub gs: Segment,
    pub ss: Segment,
    /// Task register.
    pub tr: Segment,
    /// Local descriptor table register.
    pub ldt: Segment,
    /// Global descriptor table register.
    pub gdt: DescriptorTable,
    /// Interrupt descriptor table register.
    pub idt: DescriptorTable,
    // Control registers.
    pub cr0: u64,
    pub cr2: u64,
    pub cr3: u64,
    pub cr4: u64,
    pub cr8: u64,
    /// Extended feature enable register.
    pub efer: u64,
}
858
859impl Default for Sregs {
860    fn default() -> Self {
861        // Intel SDM Vol. 3A, 3.4.5.1 ("Code- and Data-Segment Descriptor Types")
862        const SEG_TYPE_DATA: u8 = 0b0000;
863        const SEG_TYPE_DATA_WRITABLE: u8 = 0b0010;
864
865        const SEG_TYPE_CODE: u8 = 0b1000;
866        const SEG_TYPE_CODE_READABLE: u8 = 0b0010;
867
868        const SEG_TYPE_ACCESSED: u8 = 0b0001;
869
870        // Intel SDM Vol. 3A, 3.4.5 ("Segment Descriptors")
871        const SEG_S_SYSTEM: u8 = 0; // System segment.
872        const SEG_S_CODE_OR_DATA: u8 = 1; // Data/code segment.
873
874        // 16-bit real-mode code segment (reset vector).
875        let code_seg = Segment {
876            base: 0xffff0000,
877            limit_bytes: 0xffff,
878            selector: 0xf000,
879            type_: SEG_TYPE_CODE | SEG_TYPE_CODE_READABLE | SEG_TYPE_ACCESSED, // 11
880            present: 1,
881            s: SEG_S_CODE_OR_DATA,
882            ..Default::default()
883        };
884
885        // 16-bit real-mode data segment.
886        let data_seg = Segment {
887            base: 0,
888            limit_bytes: 0xffff,
889            selector: 0,
890            type_: SEG_TYPE_DATA | SEG_TYPE_DATA_WRITABLE | SEG_TYPE_ACCESSED, // 3
891            present: 1,
892            s: SEG_S_CODE_OR_DATA,
893            ..Default::default()
894        };
895
896        // 16-bit TSS segment.
897        let task_seg = Segment {
898            base: 0,
899            limit_bytes: 0xffff,
900            selector: 0,
901            type_: SEG_TYPE_CODE | SEG_TYPE_CODE_READABLE | SEG_TYPE_ACCESSED, // 11
902            present: 1,
903            s: SEG_S_SYSTEM,
904            ..Default::default()
905        };
906
907        // Local descriptor table.
908        let ldt = Segment {
909            base: 0,
910            limit_bytes: 0xffff,
911            selector: 0,
912            type_: SEG_TYPE_DATA | SEG_TYPE_DATA_WRITABLE, // 2
913            present: 1,
914            s: SEG_S_SYSTEM,
915            ..Default::default()
916        };
917
918        // Global descriptor table.
919        let gdt = DescriptorTable {
920            base: 0,
921            limit: 0xffff,
922        };
923
924        // Interrupt descriptor table.
925        let idt = DescriptorTable {
926            base: 0,
927            limit: 0xffff,
928        };
929
930        let cr0 = (1 << 4) // CR0.ET (reserved, always 1)
931                | (1 << 30); // CR0.CD (cache disable)
932
933        Sregs {
934            cs: code_seg,
935            ds: data_seg,
936            es: data_seg,
937            fs: data_seg,
938            gs: data_seg,
939            ss: data_seg,
940            tr: task_seg,
941            ldt,
942            gdt,
943            idt,
944            cr0,
945            cr2: 0,
946            cr3: 0,
947            cr4: 0,
948            cr8: 0,
949            efer: 0,
950        }
951    }
952}
953
/// x87 80-bit floating point value.
///
/// Stored as the two little-endian components of the x87 double
/// extended-precision ("TBYTE") format.
#[repr(C)]
#[derive(Copy, Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
pub struct FpuReg {
    /// 64-bit mantissa.
    pub significand: u64,

    /// 15-bit biased exponent and sign bit.
    pub sign_exp: u16,
}
964
965impl FpuReg {
966    /// Convert an array of 8x16-byte arrays to an array of 8 `FpuReg`.
967    ///
968    /// Ignores any data in the upper 6 bytes of each element; the values represent 80-bit FPU
969    /// registers, so the upper 48 bits are unused.
970    pub fn from_16byte_arrays(byte_arrays: &[[u8; 16]; 8]) -> [FpuReg; 8] {
971        let mut regs = [FpuReg::default(); 8];
972        for (dst, src) in regs.iter_mut().zip(byte_arrays.iter()) {
973            let tbyte: [u8; 10] = src[0..10].try_into().unwrap();
974            *dst = FpuReg::from(tbyte);
975        }
976        regs
977    }
978
979    /// Convert an array of 8 `FpuReg` into 8x16-byte arrays.
980    pub fn to_16byte_arrays(regs: &[FpuReg; 8]) -> [[u8; 16]; 8] {
981        let mut byte_arrays = [[0u8; 16]; 8];
982        for (dst, src) in byte_arrays.iter_mut().zip(regs.iter()) {
983            *dst = (*src).into();
984        }
985        byte_arrays
986    }
987}
988
989impl From<[u8; 10]> for FpuReg {
990    /// Construct a `FpuReg` from an 80-bit representation.
991    fn from(value: [u8; 10]) -> FpuReg {
992        // These array sub-slices can't fail, but there's no (safe) way to express that in Rust
993        // without an `unwrap()`.
994        let significand_bytes = value[0..8].try_into().unwrap();
995        let significand = u64::from_le_bytes(significand_bytes);
996        let sign_exp_bytes = value[8..10].try_into().unwrap();
997        let sign_exp = u16::from_le_bytes(sign_exp_bytes);
998        FpuReg {
999            significand,
1000            sign_exp,
1001        }
1002    }
1003}
1004
1005impl From<FpuReg> for [u8; 10] {
1006    /// Convert an `FpuReg` into its 80-bit "TBYTE" representation.
1007    fn from(value: FpuReg) -> [u8; 10] {
1008        let mut bytes = [0u8; 10];
1009        bytes[0..8].copy_from_slice(&value.significand.to_le_bytes());
1010        bytes[8..10].copy_from_slice(&value.sign_exp.to_le_bytes());
1011        bytes
1012    }
1013}
1014
1015impl From<FpuReg> for [u8; 16] {
1016    /// Convert an `FpuReg` into its 80-bit representation plus 6 unused upper bytes.
1017    /// This is a convenience function for converting to hypervisor types.
1018    fn from(value: FpuReg) -> [u8; 16] {
1019        let mut bytes = [0u8; 16];
1020        bytes[0..8].copy_from_slice(&value.significand.to_le_bytes());
1021        bytes[8..10].copy_from_slice(&value.sign_exp.to_le_bytes());
1022        bytes
1023    }
1024}
1025
/// State of a VCPU's floating point unit.
#[repr(C)]
#[derive(Debug, Copy, Clone, Serialize, Deserialize)]
pub struct Fpu {
    /// x87 floating point registers ST0-ST7 / MM0-MM7.
    pub fpr: [FpuReg; 8],
    /// FPU control word.
    pub fcw: u16,
    /// FPU status word.
    pub fsw: u16,
    /// Abridged FPU tag word (one bit per x87 register).
    pub ftwx: u8,
    /// Opcode of the last executed x87 instruction.
    pub last_opcode: u16,
    /// Instruction pointer of the last executed x87 instruction.
    pub last_ip: u64,
    /// Data pointer of the last x87 memory operand.
    pub last_dp: u64,
    /// XMM registers.
    pub xmm: [[u8; 16usize]; 16usize],
    /// SSE control/status register.
    pub mxcsr: u32,
}
1040
1041impl Default for Fpu {
1042    fn default() -> Self {
1043        Fpu {
1044            fpr: Default::default(),
1045            fcw: 0x37f, // Intel SDM Vol. 1, 13.6
1046            fsw: 0,
1047            ftwx: 0,
1048            last_opcode: 0,
1049            last_ip: 0,
1050            last_dp: 0,
1051            xmm: Default::default(),
1052            mxcsr: 0x1f80, // Intel SDM Vol. 1, 11.6.4
1053        }
1054    }
1055}
1056
/// State of a VCPU's debug registers.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Serialize, Deserialize)]
pub struct DebugRegs {
    /// Breakpoint address registers DR0-DR3.
    pub db: [u64; 4usize],
    /// Debug status register (DR6).
    pub dr6: u64,
    /// Debug control register (DR7).
    pub dr7: u64,
}
1065
/// The hybrid type for intel hybrid CPU.
///
/// Identifies which core type a VCPU reports on Intel hybrid
/// (performance/efficiency) processors.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum CpuHybridType {
    /// Intel Atom (efficiency core).
    Atom,
    /// Intel Core (performance core).
    Core,
}
1074
/// State of the VCPU's x87 FPU, MMX, XMM, YMM registers.
/// May contain more state depending on enabled extensions.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Xsave {
    // Backing storage, kept as u32 words and rounded up to hold at least
    // `len` bytes.
    data: Vec<u32>,

    // Actual length in bytes. May be smaller than data if a non-u32 multiple of bytes is
    // requested.
    len: usize,
}
1085
1086impl Xsave {
1087    /// Create a new buffer to store Xsave data.
1088    ///
1089    /// # Argments
1090    /// * `len` size in bytes.
1091    pub fn new(len: usize) -> Self {
1092        Xsave {
1093            data: vec![0; len.div_ceil(4)],
1094            len,
1095        }
1096    }
1097
1098    pub fn as_ptr(&self) -> *const c_void {
1099        self.data.as_ptr() as *const c_void
1100    }
1101
1102    pub fn as_mut_ptr(&mut self) -> *mut c_void {
1103        self.data.as_mut_ptr() as *mut c_void
1104    }
1105
1106    /// Length in bytes of the XSAVE data.
1107    pub fn len(&self) -> usize {
1108        self.len
1109    }
1110
1111    /// Returns true is length of XSAVE data is zero
1112    pub fn is_empty(&self) -> bool {
1113        self.len() == 0
1114    }
1115}