arch/lib.rs

1// Copyright 2018 The ChromiumOS Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5//! Virtual machine architecture support code.
6
7pub mod android;
8pub mod fdt;
9pub mod pstore;
10pub mod serial;
11
12pub mod sys;
13
14use std::collections::BTreeMap;
15use std::error::Error as StdError;
16use std::fs::File;
17use std::io;
18use std::ops::Deref;
19use std::path::PathBuf;
20use std::str::FromStr;
21use std::sync::mpsc;
22use std::sync::mpsc::SendError;
23use std::sync::Arc;
24
25use acpi_tables::sdt::SDT;
26use base::syslog;
27use base::AsRawDescriptors;
28use base::FileGetLen;
29use base::FileReadWriteAtVolatile;
30use base::RecvTube;
31use base::SendTube;
32use base::Tube;
33use devices::virtio::VirtioDevice;
34use devices::BarRange;
35use devices::Bus;
36use devices::BusDevice;
37use devices::BusDeviceObj;
38use devices::BusError;
39use devices::BusResumeDevice;
40use devices::FwCfgParameters;
41use devices::GpeScope;
42use devices::HotPlugBus;
43use devices::IrqChip;
44use devices::IrqEventSource;
45use devices::PciAddress;
46use devices::PciBus;
47use devices::PciDevice;
48use devices::PciDeviceError;
49use devices::PciInterruptPin;
50use devices::PciRoot;
51use devices::PciRootCommand;
52use devices::PreferredIrq;
53#[cfg(any(target_os = "android", target_os = "linux"))]
54use devices::ProxyDevice;
55use devices::SerialHardware;
56use devices::SerialParameters;
57pub use fdt::apply_device_tree_overlays;
58pub use fdt::DtbOverlay;
59#[cfg(feature = "gdb")]
60use gdbstub::arch::Arch;
61use hypervisor::MemCacheType;
62use hypervisor::Vm;
63#[cfg(windows)]
64use jail::FakeMinijailStub as Minijail;
65#[cfg(any(target_os = "android", target_os = "linux"))]
66use minijail::Minijail;
67use remain::sorted;
68use resources::SystemAllocator;
69use resources::SystemAllocatorConfig;
70use serde::de::Visitor;
71use serde::Deserialize;
72use serde::Serialize;
73use serde_keyvalue::FromKeyValues;
74pub use serial::add_serial_devices;
75pub use serial::get_serial_cmdline;
76pub use serial::set_default_serial_parameters;
77pub use serial::GetSerialCmdlineError;
78pub use serial::SERIAL_ADDR;
79use sync::Condvar;
80use sync::Mutex;
81#[cfg(any(target_os = "android", target_os = "linux"))]
82pub use sys::linux::PlatformBusResources;
83use thiserror::Error;
84use uuid::Uuid;
85use vm_control::BatControl;
86use vm_control::BatteryType;
87use vm_control::PmResource;
88use vm_memory::GuestAddress;
89use vm_memory::GuestMemory;
90use vm_memory::GuestMemoryError;
91use vm_memory::MemoryRegionInformation;
92use vm_memory::MemoryRegionOptions;
93
94cfg_if::cfg_if! {
95    if #[cfg(target_arch = "aarch64")] {
96        pub use devices::IrqChipAArch64 as IrqChipArch;
97        #[cfg(feature = "gdb")]
98        pub use gdbstub_arch::aarch64::AArch64 as GdbArch;
99        pub use hypervisor::CpuConfigAArch64 as CpuConfigArch;
100        pub use hypervisor::Hypervisor as HypervisorArch;
101        pub use hypervisor::VcpuAArch64 as VcpuArch;
102        pub use hypervisor::VcpuInitAArch64 as VcpuInitArch;
103        pub use hypervisor::VmAArch64 as VmArch;
104    } else if #[cfg(target_arch = "riscv64")] {
105        pub use devices::IrqChipRiscv64 as IrqChipArch;
106        #[cfg(feature = "gdb")]
107        pub use gdbstub_arch::riscv::Riscv64 as GdbArch;
108        pub use hypervisor::CpuConfigRiscv64 as CpuConfigArch;
109        pub use hypervisor::Hypervisor as HypervisorArch;
110        pub use hypervisor::VcpuInitRiscv64 as VcpuInitArch;
111        pub use hypervisor::VcpuRiscv64 as VcpuArch;
112        pub use hypervisor::VmRiscv64 as VmArch;
113    } else if #[cfg(target_arch = "x86_64")] {
114        pub use devices::IrqChipX86_64 as IrqChipArch;
115        #[cfg(feature = "gdb")]
116        pub use gdbstub_arch::x86::X86_64_SSE as GdbArch;
117        pub use hypervisor::CpuConfigX86_64 as CpuConfigArch;
118        pub use hypervisor::HypervisorX86_64 as HypervisorArch;
119        pub use hypervisor::VcpuInitX86_64 as VcpuInitArch;
120        pub use hypervisor::VcpuX86_64 as VcpuArch;
121        pub use hypervisor::VmX86_64 as VmArch;
122    }
123}
124
125pub enum VmImage {
126    Kernel(File),
127    Bios(File),
128}
129
130#[derive(Clone, Debug, Deserialize, Serialize, FromKeyValues, PartialEq, Eq)]
131#[serde(deny_unknown_fields, rename_all = "kebab-case")]
132pub struct Pstore {
133    pub path: PathBuf,
134    pub size: u32,
135}
136
137#[derive(Clone, Copy, Debug, Serialize, Deserialize, FromKeyValues)]
138#[serde(deny_unknown_fields, rename_all = "kebab-case")]
139pub enum FdtPosition {
140    /// At the start of RAM.
141    Start,
142    /// Near the end of RAM.
143    End,
144    /// After the payload, with some padding for alignment.
145    AfterPayload,
146}
147
148/// Set of CPU cores.
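///
/// # Example
///
/// A minimal sketch of parsing and inspecting a `CpuSet` via the `FromStr` impl defined below
/// (shown as a non-compiled illustration):
///
/// ```text
/// let cpus: CpuSet = "0,4,7,9-11".parse().unwrap();
/// assert_eq!(*cpus, vec![0, 4, 7, 9, 10, 11]);
/// ```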
149#[derive(Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
150pub struct CpuSet(Vec<usize>);
151
152impl CpuSet {
153    pub fn new<I: IntoIterator<Item = usize>>(cpus: I) -> Self {
154        CpuSet(cpus.into_iter().collect())
155    }
156
157    pub fn iter(&self) -> std::slice::Iter<'_, usize> {
158        self.0.iter()
159    }
160}
161
162impl FromIterator<usize> for CpuSet {
163    fn from_iter<T>(iter: T) -> Self
164    where
165        T: IntoIterator<Item = usize>,
166    {
167        CpuSet::new(iter)
168    }
169}
170
171#[cfg(target_arch = "aarch64")]
172fn sve_auto_default() -> bool {
173    true
174}
175
176/// The SVE config for Vcpus.
177#[cfg(target_arch = "aarch64")]
178#[derive(Copy, Clone, Debug, PartialEq, Eq, Deserialize, Serialize)]
179#[serde(deny_unknown_fields, rename_all = "kebab-case")]
180pub struct SveConfig {
    /// Detect if SVE is available and enable it accordingly. `enable` is ignored if `auto` is true.
182    #[serde(default = "sve_auto_default")]
183    pub auto: bool,
184}
185
186#[cfg(target_arch = "aarch64")]
187impl Default for SveConfig {
188    fn default() -> Self {
189        SveConfig {
190            auto: sve_auto_default(),
191        }
192    }
193}
194
/// FFA config.
// For now this is limited to Android; it will be opened up to other aarch64-based pVMs after the
// corresponding kernel APIs are upstreamed.
198#[cfg(all(target_os = "android", target_arch = "aarch64"))]
199#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Deserialize, Serialize, FromKeyValues)]
200#[serde(deny_unknown_fields, rename_all = "kebab-case")]
201pub struct FfaConfig {
202    /// Just enable FFA, don't care about the negotiated version.
203    #[serde(default)]
204    pub auto: bool,
205}
206
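/// Parses a single CPU index (e.g. `"5"`) or an inclusive range (e.g. `"5-8"`) and appends the
/// covered indices to `cpuset`.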
207fn parse_cpu_range(s: &str, cpuset: &mut Vec<usize>) -> Result<(), String> {
208    fn parse_cpu(s: &str) -> Result<usize, String> {
209        s.parse()
210            .map_err(|_| format!("invalid CPU index {s} - index must be a non-negative integer"))
211    }
212
213    let (first_cpu, last_cpu) = match s.split_once('-') {
214        Some((first_cpu, last_cpu)) => {
215            let first_cpu = parse_cpu(first_cpu)?;
216            let last_cpu = parse_cpu(last_cpu)?;
217
218            if last_cpu < first_cpu {
219                return Err(format!(
220                    "invalid CPU range {s} - ranges must be from low to high"
221                ));
222            }
223            (first_cpu, last_cpu)
224        }
225        None => {
226            let cpu = parse_cpu(s)?;
227            (cpu, cpu)
228        }
229    };
230
231    cpuset.extend(first_cpu..=last_cpu);
232
233    Ok(())
234}
235
236impl FromStr for CpuSet {
237    type Err = String;
238
239    fn from_str(s: &str) -> Result<Self, Self::Err> {
240        let mut cpuset = Vec::new();
241        for part in s.split(',') {
242            parse_cpu_range(part, &mut cpuset)?;
243        }
244        Ok(CpuSet::new(cpuset))
245    }
246}
247
248impl Deref for CpuSet {
249    type Target = Vec<usize>;
250
251    fn deref(&self) -> &Self::Target {
252        &self.0
253    }
254}
255
256impl IntoIterator for CpuSet {
257    type Item = usize;
258    type IntoIter = std::vec::IntoIter<Self::Item>;
259
260    fn into_iter(self) -> Self::IntoIter {
261        self.0.into_iter()
262    }
263}
264
/// Deserializes a `CpuSet` from a sequence whose elements can be either integers or strings
/// representing CPU ranges (e.g. `5-8`).
267impl<'de> Deserialize<'de> for CpuSet {
268    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
269    where
270        D: serde::Deserializer<'de>,
271    {
272        struct CpuSetVisitor;
273        impl<'de> Visitor<'de> for CpuSetVisitor {
274            type Value = CpuSet;
275
276            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
277                formatter.write_str("CpuSet")
278            }
279
280            fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
281            where
282                A: serde::de::SeqAccess<'de>,
283            {
284                #[derive(Deserialize)]
285                #[serde(untagged)]
286                enum CpuSetValue<'a> {
287                    Single(usize),
288                    Range(&'a str),
289                }
290
291                let mut cpus = Vec::new();
292                while let Some(cpuset) = seq.next_element::<CpuSetValue>()? {
293                    match cpuset {
294                        CpuSetValue::Single(cpu) => cpus.push(cpu),
295                        CpuSetValue::Range(range) => {
296                            parse_cpu_range(range, &mut cpus).map_err(serde::de::Error::custom)?;
297                        }
298                    }
299                }
300
301                Ok(CpuSet::new(cpus))
302            }
303        }
304
305        deserializer.deserialize_seq(CpuSetVisitor)
306    }
307}
308
309/// Serializes a `CpuSet` into a sequence of integers and strings representing CPU ranges.
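///
/// Consecutive cores are factorized into `"start-end"` strings. A minimal sketch, assuming a
/// JSON serializer as in the tests below (illustration only):
///
/// ```text
/// let cpus = CpuSet::new(vec![0, 4, 9, 10, 11]);
/// assert_eq!(serde_json::to_string(&cpus).unwrap(), r#"[0,4,"9-11"]"#);
/// ```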
310impl Serialize for CpuSet {
311    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
312    where
313        S: serde::Serializer,
314    {
315        use serde::ser::SerializeSeq;
316
317        let mut seq = serializer.serialize_seq(None)?;
318
319        // Factorize ranges into "a-b" strings.
320        let mut serialize_range = |start: usize, end: usize| -> Result<(), S::Error> {
321            if start == end {
322                seq.serialize_element(&start)?;
323            } else {
324                seq.serialize_element(&format!("{start}-{end}"))?;
325            }
326
327            Ok(())
328        };
329
330        // Current range.
331        let mut range = None;
332        for core in &self.0 {
333            range = match range {
334                None => Some((core, core)),
335                Some((start, end)) if *end == *core - 1 => Some((start, core)),
336                Some((start, end)) => {
337                    serialize_range(*start, *end)?;
338                    Some((core, core))
339                }
340            };
341        }
342
343        if let Some((start, end)) = range {
344            serialize_range(*start, *end)?;
345        }
346
347        seq.end()
348    }
349}
350
351/// Mapping of guest VCPU threads to host CPU cores.
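///
/// # Example
///
/// A minimal sketch of a per-VCPU affinity that pins VCPU 0 to host cores 0-1 and VCPU 1 to host
/// cores 2-3 (illustration only):
///
/// ```text
/// let affinity = VcpuAffinity::PerVcpu(BTreeMap::from([
///     (0, CpuSet::new(vec![0, 1])),
///     (1, CpuSet::new(vec![2, 3])),
/// ]));
/// ```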
352#[derive(Clone, Debug, Deserialize, PartialEq, Eq, Serialize)]
353pub enum VcpuAffinity {
354    /// All VCPU threads will be pinned to the same set of host CPU cores.
355    Global(CpuSet),
356    /// Each VCPU may be pinned to a set of host CPU cores.
357    /// The map key is a guest VCPU index, and the corresponding value is the set of
358    /// host CPU indices that the VCPU thread will be allowed to run on.
359    /// If a VCPU index is not present in the map, its affinity will not be set.
360    PerVcpu(BTreeMap<usize, CpuSet>),
361}
362
363/// Memory region with optional size.
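///
/// A minimal sketch of parsing this from a key-value string (illustration only; assumes
/// `serde_keyvalue::from_key_values`, as used in the tests below):
///
/// ```text
/// let cfg: MemoryRegionConfig = from_key_values("start=4294967296,size=1073741824").unwrap();
/// assert_eq!(cfg, MemoryRegionConfig { start: 4294967296, size: Some(1073741824) });
/// ```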
364#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize, FromKeyValues)]
365pub struct MemoryRegionConfig {
366    pub start: u64,
367    pub size: Option<u64>,
368}
369
370/// General PCI config.
371#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize, Deserialize, FromKeyValues)]
372pub struct PciConfig {
373    /// region for PCI Configuration Access Mechanism
374    #[cfg(target_arch = "aarch64")]
375    pub cam: Option<MemoryRegionConfig>,
376    /// region for PCIe Enhanced Configuration Access Mechanism
377    #[cfg(target_arch = "x86_64")]
378    pub ecam: Option<MemoryRegionConfig>,
379    /// region for non-prefetchable PCI device memory below 4G
380    pub mem: Option<MemoryRegionConfig>,
381}
382
383/// Holds the pieces needed to build a VM. Passed to `build_vm` in the `LinuxArch` trait below to
384/// create a `RunnableLinuxVm`.
385#[sorted]
386pub struct VmComponents {
387    #[cfg(all(target_arch = "x86_64", unix))]
388    pub ac_adapter: bool,
389    pub acpi_sdts: Vec<SDT>,
390    pub android_fstab: Option<File>,
391    pub boot_cpu: usize,
392    pub bootorder_fw_cfg_blob: Vec<u8>,
393    #[cfg(target_arch = "x86_64")]
394    pub break_linux_pci_config_io: bool,
395    pub cpu_capacity: BTreeMap<usize, u32>,
396    pub cpu_clusters: Vec<CpuSet>,
397    #[cfg(all(
398        target_arch = "aarch64",
399        any(target_os = "android", target_os = "linux")
400    ))]
401    pub cpu_frequencies: BTreeMap<usize, Vec<u32>>,
402    pub delay_rt: bool,
403    pub dynamic_power_coefficient: BTreeMap<usize, u32>,
404    pub extra_kernel_params: Vec<String>,
405    #[cfg(target_arch = "x86_64")]
406    pub force_s2idle: bool,
407    pub fw_cfg_enable: bool,
408    pub fw_cfg_parameters: Vec<FwCfgParameters>,
409    pub host_cpu_topology: bool,
410    pub hugepages: bool,
411    pub hv_cfg: hypervisor::Config,
412    pub initrd_image: Option<File>,
413    pub itmt: bool,
414    pub memory_size: u64,
415    pub no_i8042: bool,
416    pub no_rtc: bool,
417    pub no_smt: bool,
418    #[cfg(all(
419        target_arch = "aarch64",
420        any(target_os = "android", target_os = "linux")
421    ))]
422    pub normalized_cpu_ipc_ratios: BTreeMap<usize, u32>,
423    pub pci_config: PciConfig,
424    pub pflash_block_size: u32,
425    pub pflash_image: Option<File>,
426    pub pstore: Option<Pstore>,
427    /// A file to load as pVM firmware. Must be `Some` iff
428    /// `hv_cfg.protection_type == ProtectionType::UnprotectedWithFirmware`.
429    pub pvm_fw: Option<File>,
430    pub rt_cpus: CpuSet,
431    #[cfg(target_arch = "x86_64")]
432    pub smbios: SmbiosOptions,
433    pub smccc_trng: bool,
434    #[cfg(target_arch = "aarch64")]
435    pub sve_config: SveConfig,
436    pub swiotlb: Option<u64>,
437    pub vcpu_affinity: Option<VcpuAffinity>,
438    pub vcpu_count: usize,
439    #[cfg(all(
440        target_arch = "aarch64",
441        any(target_os = "android", target_os = "linux")
442    ))]
443    pub vcpu_domain_paths: BTreeMap<usize, PathBuf>,
444    #[cfg(all(
445        target_arch = "aarch64",
446        any(target_os = "android", target_os = "linux")
447    ))]
448    pub vcpu_domains: BTreeMap<usize, u32>,
449    #[cfg(all(
450        target_arch = "aarch64",
451        any(target_os = "android", target_os = "linux")
452    ))]
453    pub virt_cpufreq_v2: bool,
454    pub vm_image: VmImage,
455}
456
457/// Holds the elements needed to run a Linux VM. Created by `build_vm`.
458#[sorted]
459pub struct RunnableLinuxVm<V: VmArch, Vcpu: VcpuArch> {
460    pub bat_control: Option<BatControl>,
461    pub delay_rt: bool,
462    pub devices_thread: Option<std::thread::JoinHandle<()>>,
463    pub hotplug_bus: BTreeMap<u8, Arc<Mutex<dyn HotPlugBus>>>,
464    pub hypercall_bus: Arc<Bus>,
465    pub io_bus: Arc<Bus>,
466    pub irq_chip: Box<dyn IrqChipArch>,
467    pub mmio_bus: Arc<Bus>,
468    pub no_smt: bool,
469    pub pid_debug_label_map: BTreeMap<u32, String>,
470    #[cfg(any(target_os = "android", target_os = "linux"))]
471    pub platform_devices: Vec<Arc<Mutex<dyn BusDevice>>>,
472    pub pm: Option<Arc<Mutex<dyn PmResource + Send>>>,
473    /// Devices to be notified before the system resumes from the S3 suspended state.
474    pub resume_notify_devices: Vec<Arc<Mutex<dyn BusResumeDevice>>>,
475    pub root_config: Arc<Mutex<PciRoot>>,
476    pub rt_cpus: CpuSet,
477    pub suspend_tube: (Arc<Mutex<SendTube>>, RecvTube),
478    pub vcpu_affinity: Option<VcpuAffinity>,
479    pub vcpu_count: usize,
480    pub vcpu_init: Vec<VcpuInitArch>,
481    /// If vcpus is None, then it's the responsibility of the vcpu thread to create vcpus.
482    /// If it's Some, then `build_vm` already created the vcpus.
483    pub vcpus: Option<Vec<Vcpu>>,
484    pub vm: V,
485    pub vm_request_tubes: Vec<Tube>,
486}
487
488/// The device and optional jail.
489pub struct VirtioDeviceStub {
490    pub dev: Box<dyn VirtioDevice>,
491    pub jail: Option<Minijail>,
492}
493
/// Trait implemented for each Linux architecture in order to
/// set up the memory, CPUs, and system devices and to boot the kernel.
496pub trait LinuxArch {
497    type Error: StdError;
498    type ArchMemoryLayout;
499
    /// Decide architecture-specific memory layout details to be used by later stages of the VM
    /// setup.
502    fn arch_memory_layout(
503        components: &VmComponents,
504    ) -> std::result::Result<Self::ArchMemoryLayout, Self::Error>;
505
    /// Returns a Vec of the valid memory regions as (address, length, options) tuples. These
    /// should be used to configure the `GuestMemory` structure for the platform.
508    ///
509    /// # Arguments
510    ///
511    /// * `components` - Parts used to determine the memory layout.
512    fn guest_memory_layout(
513        components: &VmComponents,
514        arch_memory_layout: &Self::ArchMemoryLayout,
515        hypervisor: &impl hypervisor::Hypervisor,
516    ) -> std::result::Result<Vec<(GuestAddress, u64, MemoryRegionOptions)>, Self::Error>;
517
518    /// Gets the configuration for a new `SystemAllocator` that fits the given `Vm`'s memory layout.
519    ///
520    /// This is the per-architecture template for constructing the `SystemAllocator`. Platform
521    /// agnostic modifications may be made to this configuration, but the final `SystemAllocator`
522    /// will be at least as strict as this configuration.
523    ///
524    /// # Arguments
525    ///
526    /// * `vm` - The virtual machine to be used as a template for the `SystemAllocator`.
527    fn get_system_allocator_config<V: Vm>(
528        vm: &V,
529        arch_memory_layout: &Self::ArchMemoryLayout,
530    ) -> SystemAllocatorConfig;
531
532    /// Takes `VmComponents` and generates a `RunnableLinuxVm`.
533    ///
534    /// # Arguments
535    ///
536    /// * `components` - Parts to use to build the VM.
    /// * `vm_evt_wrtube` - Tube used by sub-devices to request that crosvm exit because the guest
    ///   wants to stop/shut down or has requested a reset.
539    /// * `system_allocator` - Allocator created by this trait's implementation of
540    ///   `get_system_allocator_config`.
541    /// * `serial_parameters` - Definitions for how the serial devices should be configured.
542    /// * `serial_jail` - Jail used for serial devices created here.
543    /// * `battery` - Defines what battery device will be created.
544    /// * `vm` - A VM implementation to build upon.
545    /// * `ramoops_region` - Region allocated for ramoops.
546    /// * `devices` - The devices to be built into the VM.
    /// * `irq_chip` - The IRQ chip implementation for the VM.
548    /// * `debugcon_jail` - Jail used for debugcon devices created here.
549    /// * `pflash_jail` - Jail used for pflash device created here.
550    /// * `fw_cfg_jail` - Jail used for fw_cfg device created here.
551    /// * `device_tree_overlays` - Device tree overlay binaries
552    fn build_vm<V, Vcpu>(
553        components: VmComponents,
554        arch_memory_layout: &Self::ArchMemoryLayout,
555        vm_evt_wrtube: &SendTube,
556        system_allocator: &mut SystemAllocator,
557        serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
558        serial_jail: Option<Minijail>,
559        battery: (Option<BatteryType>, Option<Minijail>),
560        vm: V,
561        ramoops_region: Option<pstore::RamoopsRegion>,
562        devices: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>,
563        irq_chip: &mut dyn IrqChipArch,
564        vcpu_ids: &mut Vec<usize>,
565        dump_device_tree_blob: Option<PathBuf>,
566        debugcon_jail: Option<Minijail>,
567        #[cfg(target_arch = "x86_64")] pflash_jail: Option<Minijail>,
568        #[cfg(target_arch = "x86_64")] fw_cfg_jail: Option<Minijail>,
569        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
570        guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>,
571        device_tree_overlays: Vec<DtbOverlay>,
572        fdt_position: Option<FdtPosition>,
573        no_pmu: bool,
574    ) -> std::result::Result<RunnableLinuxVm<V, Vcpu>, Self::Error>
575    where
576        V: VmArch,
577        Vcpu: VcpuArch;
578
579    /// Configures the vcpu and should be called once per vcpu from the vcpu's thread.
580    ///
581    /// # Arguments
582    ///
583    /// * `vm` - The virtual machine object.
584    /// * `hypervisor` - The `Hypervisor` that created the vcpu.
585    /// * `irq_chip` - The `IrqChip` associated with this vm.
586    /// * `vcpu` - The VCPU object to configure.
587    /// * `vcpu_init` - The data required to initialize VCPU registers and other state.
588    /// * `vcpu_id` - The id of the given `vcpu`.
589    /// * `num_cpus` - Number of virtual CPUs the guest will have.
590    /// * `cpu_config` - CPU feature configurations.
591    fn configure_vcpu<V: Vm>(
592        vm: &V,
593        hypervisor: &dyn HypervisorArch,
594        irq_chip: &mut dyn IrqChipArch,
595        vcpu: &mut dyn VcpuArch,
596        vcpu_init: VcpuInitArch,
597        vcpu_id: usize,
598        num_cpus: usize,
599        cpu_config: Option<CpuConfigArch>,
600    ) -> Result<(), Self::Error>;
601
    /// Configures and adds a PCI device to the VM.
603    fn register_pci_device<V: VmArch, Vcpu: VcpuArch>(
604        linux: &mut RunnableLinuxVm<V, Vcpu>,
605        device: Box<dyn PciDevice>,
606        #[cfg(any(target_os = "android", target_os = "linux"))] minijail: Option<Minijail>,
607        resources: &mut SystemAllocator,
608        hp_control_tube: &mpsc::Sender<PciRootCommand>,
609        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
610    ) -> Result<PciAddress, Self::Error>;
611
612    /// Returns frequency map for each of the host's logical cores.
613    fn get_host_cpu_frequencies_khz() -> Result<BTreeMap<usize, Vec<u32>>, Self::Error>;
614
615    /// Returns max-freq map of the host's logical cores.
616    fn get_host_cpu_max_freq_khz() -> Result<BTreeMap<usize, u32>, Self::Error>;
617
618    /// Returns capacity map of the host's logical cores.
619    fn get_host_cpu_capacity() -> Result<BTreeMap<usize, u32>, Self::Error>;
620
621    /// Returns cluster masks for each of the host's logical cores.
622    fn get_host_cpu_clusters() -> Result<Vec<CpuSet>, Self::Error>;
623}
624
625#[cfg(feature = "gdb")]
626pub trait GdbOps<T: VcpuArch> {
627    type Error: StdError;
628
629    /// Reads vCPU's registers.
630    fn read_registers(vcpu: &T) -> Result<<GdbArch as Arch>::Registers, Self::Error>;
631
632    /// Writes vCPU's registers.
633    fn write_registers(vcpu: &T, regs: &<GdbArch as Arch>::Registers) -> Result<(), Self::Error>;
634
635    /// Reads bytes from the guest memory.
636    fn read_memory(
637        vcpu: &T,
638        guest_mem: &GuestMemory,
639        vaddr: GuestAddress,
640        len: usize,
641    ) -> Result<Vec<u8>, Self::Error>;
642
643    /// Writes bytes to the specified guest memory.
644    fn write_memory(
645        vcpu: &T,
646        guest_mem: &GuestMemory,
647        vaddr: GuestAddress,
648        buf: &[u8],
649    ) -> Result<(), Self::Error>;
650
651    /// Reads bytes from the guest register.
652    ///
653    /// Returns an empty vector if `reg_id` is valid but the register is not available.
654    fn read_register(vcpu: &T, reg_id: <GdbArch as Arch>::RegId) -> Result<Vec<u8>, Self::Error>;
655
656    /// Writes bytes to the specified guest register.
657    fn write_register(
658        vcpu: &T,
659        reg_id: <GdbArch as Arch>::RegId,
660        data: &[u8],
661    ) -> Result<(), Self::Error>;
662
    /// Makes the vCPU single-step on its next run.
664    fn enable_singlestep(vcpu: &T) -> Result<(), Self::Error>;
665
666    /// Get maximum number of hardware breakpoints.
667    fn get_max_hw_breakpoints(vcpu: &T) -> Result<usize, Self::Error>;
668
669    /// Set hardware breakpoints at the given addresses.
670    fn set_hw_breakpoints(vcpu: &T, breakpoints: &[GuestAddress]) -> Result<(), Self::Error>;
671}
672
673/// Errors for device manager.
674#[sorted]
675#[derive(Error, Debug)]
676pub enum DeviceRegistrationError {
677    /// No more MMIO space available.
678    #[error("no more addresses are available")]
679    AddrsExhausted,
680    /// Could not allocate device address space for the device.
681    #[error("Allocating device addresses: {0}")]
682    AllocateDeviceAddrs(PciDeviceError),
683    /// Could not allocate IO space for the device.
684    #[error("Allocating IO addresses: {0}")]
685    AllocateIoAddrs(PciDeviceError),
686    /// Could not allocate MMIO or IO resource for the device.
687    #[error("Allocating IO resource: {0}")]
688    AllocateIoResource(resources::Error),
689    /// Could not allocate an IRQ number.
690    #[error("Allocating IRQ number")]
691    AllocateIrq,
692    /// Could not allocate IRQ resource for the device.
693    #[cfg(any(target_os = "android", target_os = "linux"))]
694    #[error("Allocating IRQ resource: {0}")]
695    AllocateIrqResource(devices::vfio::VfioError),
696    /// Broken pci topology
697    #[error("pci topology is broken")]
698    BrokenPciTopology,
699    /// Unable to clone a jail for the device.
700    #[cfg(any(target_os = "android", target_os = "linux"))]
701    #[error("failed to clone jail: {0}")]
702    CloneJail(minijail::Error),
703    /// Appending to kernel command line failed.
704    #[error("unable to add device to kernel command line: {0}")]
705    Cmdline(kernel_cmdline::Error),
706    /// Configure window size failed.
707    #[error("failed to configure window size: {0}")]
708    ConfigureWindowSize(PciDeviceError),
709    // Unable to create a pipe.
710    #[error("failed to create pipe: {0}")]
711    CreatePipe(base::Error),
712    // Unable to create a root.
713    #[error("failed to create pci root: {0}")]
714    CreateRoot(anyhow::Error),
715    // Unable to create serial device from serial parameters
716    #[error("failed to create serial device: {0}")]
717    CreateSerialDevice(devices::SerialError),
718    // Unable to create tube
719    #[error("failed to create tube: {0}")]
720    CreateTube(base::TubeError),
721    /// Could not clone an event.
722    #[error("failed to clone event: {0}")]
723    EventClone(base::Error),
724    /// Could not create an event.
725    #[error("failed to create event: {0}")]
726    EventCreate(base::Error),
727    /// Failed to generate ACPI content.
728    #[error("failed to generate ACPI content")]
729    GenerateAcpi,
730    /// No more IRQs are available.
731    #[error("no more IRQs are available")]
732    IrqsExhausted,
733    /// VFIO device is missing a DT symbol.
734    #[error("cannot match VFIO device to DT node due to a missing symbol")]
735    MissingDeviceTreeSymbol,
736    /// Missing a required serial device.
737    #[error("missing required serial device {0}")]
738    MissingRequiredSerialDevice(u8),
739    /// Could not add a device to the mmio bus.
740    #[error("failed to add to mmio bus: {0}")]
741    MmioInsert(BusError),
742    /// Failed to insert device into PCI root.
743    #[error("failed to insert device into PCI root: {0}")]
744    PciRootAddDevice(PciDeviceError),
745    #[cfg(any(target_os = "android", target_os = "linux"))]
746    /// Failed to initialize proxy device for jailed device.
747    #[error("failed to create proxy device: {0}")]
748    ProxyDeviceCreation(devices::ProxyError),
749    #[cfg(any(target_os = "android", target_os = "linux"))]
750    /// Failed to register battery device.
751    #[error("failed to register battery device to VM: {0}")]
752    RegisterBattery(devices::BatteryError),
753    /// Could not register PCI device to pci root bus
754    #[error("failed to register PCI device to pci root bus")]
755    RegisterDevice(SendError<PciRootCommand>),
756    /// Could not register PCI device capabilities.
757    #[error("could not register PCI device capabilities: {0}")]
758    RegisterDeviceCapabilities(PciDeviceError),
759    /// Failed to register ioevent with VM.
760    #[error("failed to register ioevent to VM: {0}")]
761    RegisterIoevent(base::Error),
762    /// Failed to register irq event with VM.
763    #[error("failed to register irq event to VM: {0}")]
764    RegisterIrqfd(base::Error),
765    /// Could not setup VFIO platform IRQ for the device.
766    #[error("Setting up VFIO platform IRQ: {0}")]
767    SetupVfioPlatformIrq(anyhow::Error),
768}
769
/// Configure a PCI device for use by this VM.
771pub fn configure_pci_device<V: VmArch, Vcpu: VcpuArch>(
772    linux: &mut RunnableLinuxVm<V, Vcpu>,
773    mut device: Box<dyn PciDevice>,
774    #[cfg(any(target_os = "android", target_os = "linux"))] jail: Option<Minijail>,
775    resources: &mut SystemAllocator,
776    hp_control_tube: &mpsc::Sender<PciRootCommand>,
777    #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
778) -> Result<PciAddress, DeviceRegistrationError> {
779    // Allocate PCI device address before allocating BARs.
780    let pci_address = device
781        .allocate_address(resources)
782        .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
783
784    // Allocate ranges that may need to be in the low MMIO region (MmioType::Low).
785    let mmio_ranges = device
786        .allocate_io_bars(resources)
787        .map_err(DeviceRegistrationError::AllocateIoAddrs)?;
788
789    // Allocate device ranges that may be in low or high MMIO after low-only ranges.
790    let device_ranges = device
791        .allocate_device_bars(resources)
792        .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
793
    // If the device is a PCIe bridge, add its PCI bus to the PCI root.
795    if let Some(pci_bus) = device.get_new_pci_bus() {
796        hp_control_tube
797            .send(PciRootCommand::AddBridge(pci_bus))
798            .map_err(DeviceRegistrationError::RegisterDevice)?;
799        let bar_ranges = Vec::new();
800        device
801            .configure_bridge_window(resources, &bar_ranges)
802            .map_err(DeviceRegistrationError::ConfigureWindowSize)?;
803    }
804
805    // Do not suggest INTx for hot-plug devices.
806    let intx_event = devices::IrqLevelEvent::new().map_err(DeviceRegistrationError::EventCreate)?;
807
808    if let PreferredIrq::Fixed { pin, gsi } = device.preferred_irq() {
809        resources.reserve_irq(gsi);
810
811        device.assign_irq(
812            intx_event
813                .try_clone()
814                .map_err(DeviceRegistrationError::EventClone)?,
815            pin,
816            gsi,
817        );
818
819        linux
820            .irq_chip
821            .as_irq_chip_mut()
822            .register_level_irq_event(gsi, &intx_event, IrqEventSource::from_device(&device))
823            .map_err(DeviceRegistrationError::RegisterIrqfd)?;
824    }
825
826    let mut keep_rds = device.keep_rds();
827    syslog::push_descriptors(&mut keep_rds);
828    cros_tracing::push_descriptors!(&mut keep_rds);
829    metrics::push_descriptors(&mut keep_rds);
830
831    device
832        .register_device_capabilities()
833        .map_err(DeviceRegistrationError::RegisterDeviceCapabilities)?;
834
835    #[cfg(any(target_os = "android", target_os = "linux"))]
836    let arced_dev: Arc<Mutex<dyn BusDevice>> = if let Some(jail) = jail {
837        let proxy = ProxyDevice::new(
838            device,
839            jail,
840            keep_rds,
841            #[cfg(feature = "swap")]
842            swap_controller,
843        )
844        .map_err(DeviceRegistrationError::ProxyDeviceCreation)?;
845        linux
846            .pid_debug_label_map
847            .insert(proxy.pid() as u32, proxy.debug_label());
848        Arc::new(Mutex::new(proxy))
849    } else {
850        device.on_sandboxed();
851        Arc::new(Mutex::new(device))
852    };
853
854    #[cfg(windows)]
855    let arced_dev = {
856        device.on_sandboxed();
857        Arc::new(Mutex::new(device))
858    };
859
860    #[cfg(any(target_os = "android", target_os = "linux"))]
861    hp_control_tube
862        .send(PciRootCommand::Add(pci_address, arced_dev.clone()))
863        .map_err(DeviceRegistrationError::RegisterDevice)?;
864
865    for range in &mmio_ranges {
866        linux
867            .mmio_bus
868            .insert(arced_dev.clone(), range.addr, range.size)
869            .map_err(DeviceRegistrationError::MmioInsert)?;
870    }
871
872    for range in &device_ranges {
873        linux
874            .mmio_bus
875            .insert(arced_dev.clone(), range.addr, range.size)
876            .map_err(DeviceRegistrationError::MmioInsert)?;
877    }
878
879    Ok(pci_address)
880}
881
// Generate the PCI topology starting from the parent bus.
883fn generate_pci_topology(
884    parent_bus: Arc<Mutex<PciBus>>,
885    resources: &mut SystemAllocator,
886    io_ranges: &mut BTreeMap<usize, Vec<BarRange>>,
887    device_ranges: &mut BTreeMap<usize, Vec<BarRange>>,
888    device_addrs: &[PciAddress],
889    devices: &mut Vec<(Box<dyn PciDevice>, Option<Minijail>)>,
890) -> Result<(Vec<BarRange>, u8), DeviceRegistrationError> {
891    let mut bar_ranges = Vec::new();
892    let bus_num = parent_bus.lock().get_bus_num();
893    let mut subordinate_bus = bus_num;
894    for (dev_idx, addr) in device_addrs.iter().enumerate() {
        // Only process devices that are located on this bus.
896        if addr.bus == bus_num {
            // If this device is a PCI bridge (i.e., it has a PCI bus structure),
            // create its topology recursively.
899            if let Some(child_bus) = devices[dev_idx].0.get_new_pci_bus() {
900                let (child_bar_ranges, child_sub_bus) = generate_pci_topology(
901                    child_bus.clone(),
902                    resources,
903                    io_ranges,
904                    device_ranges,
905                    device_addrs,
906                    devices,
907                )?;
908                let device = &mut devices[dev_idx].0;
909                parent_bus
910                    .lock()
911                    .add_child_bus(child_bus.clone())
912                    .map_err(|_| DeviceRegistrationError::BrokenPciTopology)?;
913                let bridge_window = device
914                    .configure_bridge_window(resources, &child_bar_ranges)
915                    .map_err(DeviceRegistrationError::ConfigureWindowSize)?;
916                bar_ranges.extend(bridge_window);
917
918                let ranges = device
919                    .allocate_io_bars(resources)
920                    .map_err(DeviceRegistrationError::AllocateIoAddrs)?;
921                io_ranges.insert(dev_idx, ranges.clone());
922                bar_ranges.extend(ranges);
923
924                let ranges = device
925                    .allocate_device_bars(resources)
926                    .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
927                device_ranges.insert(dev_idx, ranges.clone());
928                bar_ranges.extend(ranges);
929
930                device.set_subordinate_bus(child_sub_bus);
931
932                subordinate_bus = std::cmp::max(subordinate_bus, child_sub_bus);
933            }
934        }
935    }
936
937    for (dev_idx, addr) in device_addrs.iter().enumerate() {
938        if addr.bus == bus_num {
939            let device = &mut devices[dev_idx].0;
940            // Allocate MMIO for non-bridge devices
941            if device.get_new_pci_bus().is_none() {
942                let ranges = device
943                    .allocate_io_bars(resources)
944                    .map_err(DeviceRegistrationError::AllocateIoAddrs)?;
945                io_ranges.insert(dev_idx, ranges.clone());
946                bar_ranges.extend(ranges);
947
948                let ranges = device
949                    .allocate_device_bars(resources)
950                    .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
951                device_ranges.insert(dev_idx, ranges.clone());
952                bar_ranges.extend(ranges);
953            }
954        }
955    }
956    Ok((bar_ranges, subordinate_bus))
957}
958
959/// Ensure all PCI devices have an assigned PCI address.
960pub fn assign_pci_addresses(
961    devices: &mut [(Box<dyn BusDeviceObj>, Option<Minijail>)],
962    resources: &mut SystemAllocator,
963) -> Result<(), DeviceRegistrationError> {
964    // First allocate devices with a preferred address.
965    for pci_device in devices
966        .iter_mut()
967        .filter_map(|(device, _jail)| device.as_pci_device_mut())
968        .filter(|pci_device| pci_device.preferred_address().is_some())
969    {
970        let _ = pci_device
971            .allocate_address(resources)
972            .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
973    }
974
975    // Then allocate addresses for the remaining devices.
976    for pci_device in devices
977        .iter_mut()
978        .filter_map(|(device, _jail)| device.as_pci_device_mut())
979        .filter(|pci_device| pci_device.preferred_address().is_none())
980    {
981        let _ = pci_device
982            .allocate_address(resources)
983            .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
984    }
985
986    Ok(())
987}
988
989/// Creates a root PCI device for use by this Vm.
990pub fn generate_pci_root(
991    mut devices: Vec<(Box<dyn PciDevice>, Option<Minijail>)>,
992    irq_chip: &mut dyn IrqChip,
993    mmio_bus: Arc<Bus>,
994    mmio_base: GuestAddress,
995    mmio_register_bit_num: usize,
996    io_bus: Arc<Bus>,
997    resources: &mut SystemAllocator,
998    vm: &mut impl Vm,
999    max_irqs: usize,
1000    vcfg_base: Option<u64>,
1001    #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
1002) -> Result<
1003    (
1004        PciRoot,
1005        Vec<(PciAddress, u32, PciInterruptPin)>,
1006        BTreeMap<u32, String>,
1007        BTreeMap<PciAddress, Vec<u8>>,
1008        BTreeMap<PciAddress, Vec<u8>>,
1009    ),
1010    DeviceRegistrationError,
1011> {
1012    let mut device_addrs = Vec::new();
1013
1014    for (device, _jail) in devices.iter_mut() {
1015        let address = device
1016            .allocate_address(resources)
1017            .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
1018        device_addrs.push(address);
1019    }
1020
1021    let mut device_ranges = BTreeMap::new();
1022    let mut io_ranges = BTreeMap::new();
1023    let root_bus = Arc::new(Mutex::new(PciBus::new(0, 0, false)));
1024
1025    generate_pci_topology(
1026        root_bus.clone(),
1027        resources,
1028        &mut io_ranges,
1029        &mut device_ranges,
1030        &device_addrs,
1031        &mut devices,
1032    )?;
1033
1034    let mut root = PciRoot::new(
1035        vm,
1036        Arc::downgrade(&mmio_bus),
1037        mmio_base,
1038        mmio_register_bit_num,
1039        Arc::downgrade(&io_bus),
1040        root_bus,
1041    )
1042    .map_err(DeviceRegistrationError::CreateRoot)?;
1043    #[cfg_attr(windows, allow(unused_mut))]
1044    let mut pid_labels = BTreeMap::new();
1045
1046    // Allocate legacy INTx
1047    let mut pci_irqs = Vec::new();
1048    let mut irqs: Vec<u32> = Vec::new();
1049
1050    // Mapping of (bus, dev, pin) -> IRQ number.
1051    let mut dev_pin_irq = BTreeMap::new();
1052
1053    for (dev_idx, (device, _jail)) in devices.iter_mut().enumerate() {
1054        let pci_address = device_addrs[dev_idx];
1055
1056        let irq = match device.preferred_irq() {
1057            PreferredIrq::Fixed { pin, gsi } => {
1058                // The device reported a preferred IRQ, so use that rather than allocating one.
1059                resources.reserve_irq(gsi);
1060                Some((pin, gsi))
1061            }
1062            PreferredIrq::Any => {
1063                // The device did not provide a preferred IRQ but requested one, so allocate one.
1064
1065                // Choose a pin based on the slot's function number. Function 0 must always use
1066                // INTA# for single-function devices per the PCI spec, and we choose to use INTA#
1067                // for function 0 on multifunction devices and distribute the remaining functions
1068                // evenly across the other pins.
1069                let pin = match pci_address.func % 4 {
1070                    0 => PciInterruptPin::IntA,
1071                    1 => PciInterruptPin::IntB,
1072                    2 => PciInterruptPin::IntC,
1073                    _ => PciInterruptPin::IntD,
1074                };
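                // For example, functions 0 and 4 map to INTA#, 1 and 5 to INTB#, and so on.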
1075
1076                // If an IRQ number has already been assigned for a different function with this
1077                // (bus, device, pin) combination, use it. Otherwise allocate a new one and insert
1078                // it into the map.
1079                let pin_key = (pci_address.bus, pci_address.dev, pin);
1080                let irq_num = if let Some(irq_num) = dev_pin_irq.get(&pin_key) {
1081                    *irq_num
1082                } else {
1083                    // If we have allocated fewer than `max_irqs` total, add a new irq to the `irqs`
1084                    // pool. Otherwise, share one of the existing `irqs`.
1085                    let irq_num = if irqs.len() < max_irqs {
1086                        let irq_num = resources
1087                            .allocate_irq()
1088                            .ok_or(DeviceRegistrationError::AllocateIrq)?;
1089                        irqs.push(irq_num);
1090                        irq_num
1091                    } else {
1092                        // Pick one of the existing IRQs to share, using `dev_idx` to distribute IRQ
1093                        // sharing evenly across devices.
1094                        irqs[dev_idx % max_irqs]
1095                    };
1096
1097                    dev_pin_irq.insert(pin_key, irq_num);
1098                    irq_num
1099                };
1100                Some((pin, irq_num))
1101            }
1102            PreferredIrq::None => {
1103                // The device does not want an INTx# IRQ.
1104                None
1105            }
1106        };
1107
1108        if let Some((pin, gsi)) = irq {
1109            let intx_event =
1110                devices::IrqLevelEvent::new().map_err(DeviceRegistrationError::EventCreate)?;
1111
1112            device.assign_irq(
1113                intx_event
1114                    .try_clone()
1115                    .map_err(DeviceRegistrationError::EventClone)?,
1116                pin,
1117                gsi,
1118            );
1119
1120            irq_chip
1121                .register_level_irq_event(gsi, &intx_event, IrqEventSource::from_device(device))
1122                .map_err(DeviceRegistrationError::RegisterIrqfd)?;
1123
1124            pci_irqs.push((pci_address, gsi, pin));
1125        }
1126    }
1127
    // To prevent issues where a device's `on_sandboxed` may spawn threads before all sandboxed
    // devices are sandboxed, partition the iterator so that sandboxed devices are processed
    // first. This is needed on Linux platforms. On Windows, this is a no-op since jails are
    // always None, even for sandboxed devices.
1132    let devices = {
1133        let (sandboxed, non_sandboxed): (Vec<_>, Vec<_>) = devices
1134            .into_iter()
1135            .enumerate()
1136            .partition(|(_, (_, jail))| jail.is_some());
1137        sandboxed.into_iter().chain(non_sandboxed)
1138    };
1139
1140    let mut amls = BTreeMap::new();
1141    let mut gpe_scope_amls = BTreeMap::new();
1142    for (dev_idx, dev_value) in devices {
1143        #[cfg(any(target_os = "android", target_os = "linux"))]
1144        let (mut device, jail) = dev_value;
1145        #[cfg(windows)]
1146        let (mut device, _) = dev_value;
1147        let address = device_addrs[dev_idx];
1148
1149        let mut keep_rds = device.keep_rds();
1150        syslog::push_descriptors(&mut keep_rds);
1151        cros_tracing::push_descriptors!(&mut keep_rds);
1152        metrics::push_descriptors(&mut keep_rds);
1153        keep_rds.append(&mut vm.get_memory().as_raw_descriptors());
1154
1155        let ranges = io_ranges.remove(&dev_idx).unwrap_or_default();
1156        let device_ranges = device_ranges.remove(&dev_idx).unwrap_or_default();
1157        device
1158            .register_device_capabilities()
1159            .map_err(DeviceRegistrationError::RegisterDeviceCapabilities)?;
1160
1161        if let Some(vcfg_base) = vcfg_base {
1162            let (methods, shm) = device.generate_acpi_methods();
1163            if !methods.is_empty() {
1164                amls.insert(address, methods);
1165            }
1166            if let Some((offset, mmap)) = shm {
1167                let _ = vm.add_memory_region(
1168                    GuestAddress(vcfg_base + offset as u64),
1169                    Box::new(mmap),
1170                    false,
1171                    false,
1172                    MemCacheType::CacheCoherent,
1173                );
1174            }
1175        }
1176        let gpe_nr = device.set_gpe(resources);
1177
1178        #[cfg(any(target_os = "android", target_os = "linux"))]
1179        let arced_dev: Arc<Mutex<dyn BusDevice>> = if let Some(jail) = jail {
1180            let proxy = ProxyDevice::new(
1181                device,
1182                jail,
1183                keep_rds,
1184                #[cfg(feature = "swap")]
1185                swap_controller,
1186            )
1187            .map_err(DeviceRegistrationError::ProxyDeviceCreation)?;
1188            pid_labels.insert(proxy.pid() as u32, proxy.debug_label());
1189            Arc::new(Mutex::new(proxy))
1190        } else {
1191            device.on_sandboxed();
1192            Arc::new(Mutex::new(device))
1193        };
1194        #[cfg(windows)]
1195        let arced_dev = {
1196            device.on_sandboxed();
1197            Arc::new(Mutex::new(device))
1198        };
1199        root.add_device(address, arced_dev.clone(), vm)
1200            .map_err(DeviceRegistrationError::PciRootAddDevice)?;
1201        for range in &ranges {
1202            mmio_bus
1203                .insert(arced_dev.clone(), range.addr, range.size)
1204                .map_err(DeviceRegistrationError::MmioInsert)?;
1205        }
1206
1207        for range in &device_ranges {
1208            mmio_bus
1209                .insert(arced_dev.clone(), range.addr, range.size)
1210                .map_err(DeviceRegistrationError::MmioInsert)?;
1211        }
1212
1213        if let Some(gpe_nr) = gpe_nr {
1214            if let Some(acpi_path) = root.acpi_path(&address) {
1215                let mut gpe_aml = Vec::new();
1216
1217                GpeScope {}.cast_to_aml_bytes(
1218                    &mut gpe_aml,
1219                    gpe_nr,
1220                    format!("\\{acpi_path}").as_str(),
1221                );
1222                if !gpe_aml.is_empty() {
1223                    gpe_scope_amls.insert(address, gpe_aml);
1224                }
1225            }
1226        }
1227    }
1228
1229    Ok((root, pci_irqs, pid_labels, amls, gpe_scope_amls))
1230}
1231
1232/// Errors for image loading.
1233#[sorted]
1234#[derive(Error, Debug)]
1235pub enum LoadImageError {
1236    #[error("Alignment not a power of two: {0}")]
1237    BadAlignment(u64),
1238    #[error("Getting image size failed: {0}")]
1239    GetLen(io::Error),
1240    #[error("GuestMemory get slice failed: {0}")]
1241    GuestMemorySlice(GuestMemoryError),
1242    #[error("Image size too large: {0}")]
1243    ImageSizeTooLarge(u64),
1244    #[error("No suitable memory region found")]
1245    NoSuitableMemoryRegion,
1246    #[error("Reading image into memory failed: {0}")]
1247    ReadToMemory(io::Error),
1248    #[error("Cannot load zero-sized image")]
1249    ZeroSizedImage,
1250}
1251
1252/// Load an image from a file into guest memory.
1253///
1254/// # Arguments
1255///
1256/// * `guest_mem` - The memory to be used by the guest.
1257/// * `guest_addr` - The starting address to load the image in the guest memory.
1258/// * `max_size` - The amount of space in bytes available in the guest memory for the image.
1259/// * `image` - The file containing the image to be loaded.
1260///
1261/// The size in bytes of the loaded image is returned.
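///
/// # Example
///
/// A minimal sketch, assuming a single 1 MiB guest memory region at address 0 and a temporary
/// file as the image source (illustration only):
///
/// ```text
/// let guest_mem = GuestMemory::new(&[(GuestAddress(0), 1 << 20)]).unwrap();
/// let mut image = tempfile::tempfile().unwrap();
/// image.set_len(4096).unwrap();
/// let loaded = load_image(&guest_mem, &mut image, GuestAddress(0x1000), 0x8000).unwrap();
/// assert_eq!(loaded, 4096);
/// ```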
1262pub fn load_image<F>(
1263    guest_mem: &GuestMemory,
1264    image: &mut F,
1265    guest_addr: GuestAddress,
1266    max_size: u64,
1267) -> Result<usize, LoadImageError>
1268where
1269    F: FileReadWriteAtVolatile + FileGetLen,
1270{
1271    let size = image.get_len().map_err(LoadImageError::GetLen)?;
1272
1273    if size > usize::MAX as u64 || size > max_size {
1274        return Err(LoadImageError::ImageSizeTooLarge(size));
1275    }
1276
1277    // This is safe due to the bounds check above.
1278    let size = size as usize;
1279
1280    let guest_slice = guest_mem
1281        .get_slice_at_addr(guest_addr, size)
1282        .map_err(LoadImageError::GuestMemorySlice)?;
1283    image
1284        .read_exact_at_volatile(guest_slice, 0)
1285        .map_err(LoadImageError::ReadToMemory)?;
1286
1287    Ok(size)
1288}
1289
1290/// Load an image from a file into guest memory at the highest possible address.
1291///
1292/// # Arguments
1293///
1294/// * `guest_mem` - The memory to be used by the guest.
1295/// * `image` - The file containing the image to be loaded.
1296/// * `min_guest_addr` - The minimum address of the start of the image.
/// * `max_guest_addr` - The maximum address for the last byte of the image.
1298/// * `region_filter` - The optional filter function for determining if the given guest memory
1299///   region is suitable for loading the image into it.
1300/// * `align` - The minimum alignment of the start address of the image in bytes (must be a power of
1301///   two).
1302///
1303/// The guest address and size in bytes of the loaded image are returned.
1304pub fn load_image_high<F>(
1305    guest_mem: &GuestMemory,
1306    image: &mut F,
1307    min_guest_addr: GuestAddress,
1308    max_guest_addr: GuestAddress,
1309    region_filter: Option<fn(&MemoryRegionInformation) -> bool>,
1310    align: u64,
1311) -> Result<(GuestAddress, usize), LoadImageError>
1312where
1313    F: FileReadWriteAtVolatile + FileGetLen,
1314{
1315    if !align.is_power_of_two() {
1316        return Err(LoadImageError::BadAlignment(align));
1317    }
1318
1319    let max_size = max_guest_addr.offset_from(min_guest_addr) & !(align - 1);
1320    let size = image.get_len().map_err(LoadImageError::GetLen)?;
1321
1322    if size == 0 {
1323        return Err(LoadImageError::ZeroSizedImage);
1324    }
1325
1326    if size > usize::MAX as u64 || size > max_size {
1327        return Err(LoadImageError::ImageSizeTooLarge(size));
1328    }
1329
1330    // Sort the list of guest memory regions by address so we can iterate over them in reverse order
1331    // (high to low).
1332    let mut regions: Vec<_> = guest_mem
1333        .regions()
1334        .filter(region_filter.unwrap_or(|_| true))
1335        .collect();
1336    regions.sort_unstable_by(|a, b| a.guest_addr.cmp(&b.guest_addr));
1337
1338    // Find the highest valid address inside a guest memory region that satisfies the requested
1339    // alignment and min/max address requirements while having enough space for the image.
1340    let guest_addr = regions
1341        .into_iter()
1342        .rev()
1343        .filter_map(|r| {
1344            // Highest address within this region.
1345            let rgn_max_addr = r
1346                .guest_addr
1347                .checked_add((r.size as u64).checked_sub(1)?)?
1348                .min(max_guest_addr);
1349            // Lowest aligned address within this region.
1350            let rgn_start_aligned = r.guest_addr.align(align)?;
1351            // Hypothetical address of the image if loaded at the end of the region.
1352            let image_addr = rgn_max_addr.checked_sub(size - 1)? & !(align - 1);
1353
1354            // Would the image fit within the region?
1355            if image_addr >= rgn_start_aligned {
1356                Some(image_addr)
1357            } else {
1358                None
1359            }
1360        })
1361        .find(|&addr| addr >= min_guest_addr)
1362        .ok_or(LoadImageError::NoSuitableMemoryRegion)?;
1363
1364    // This is safe due to the bounds check above.
1365    let size = size as usize;
1366
1367    let guest_slice = guest_mem
1368        .get_slice_at_addr(guest_addr, size)
1369        .map_err(LoadImageError::GuestMemorySlice)?;
1370    image
1371        .read_exact_at_volatile(guest_slice, 0)
1372        .map_err(LoadImageError::ReadToMemory)?;
1373
1374    Ok((guest_addr, size))
1375}
1376
1377/// SMBIOS table configuration
1378#[derive(Clone, Debug, Default, Serialize, Deserialize, FromKeyValues, PartialEq, Eq)]
1379#[serde(deny_unknown_fields, rename_all = "kebab-case")]
1380pub struct SmbiosOptions {
1381    /// BIOS vendor name.
1382    pub bios_vendor: Option<String>,
1383
1384    /// BIOS version number (free-form string).
1385    pub bios_version: Option<String>,
1386
1387    /// System manufacturer name.
1388    pub manufacturer: Option<String>,
1389
1390    /// System product name.
1391    pub product_name: Option<String>,
1392
1393    /// System serial number (free-form string).
1394    pub serial_number: Option<String>,
1395
1396    /// System UUID.
1397    pub uuid: Option<Uuid>,
1398
1399    /// Additional OEM strings to add to SMBIOS table.
1400    #[serde(default)]
1401    pub oem_strings: Vec<String>,
1402}
1403
1404#[cfg(test)]
1405mod tests {
1406    use serde_keyvalue::from_key_values;
1407    use tempfile::tempfile;
1408
1409    use super::*;
1410
1411    #[test]
1412    fn parse_pstore() {
1413        let res: Pstore = from_key_values("path=/some/path,size=16384").unwrap();
1414        assert_eq!(
1415            res,
1416            Pstore {
1417                path: "/some/path".into(),
1418                size: 16384,
1419            }
1420        );
1421
1422        let res = from_key_values::<Pstore>("path=/some/path");
1423        assert!(res.is_err());
1424
1425        let res = from_key_values::<Pstore>("size=16384");
1426        assert!(res.is_err());
1427
1428        let res = from_key_values::<Pstore>("");
1429        assert!(res.is_err());
1430    }
1431
1432    #[test]
1433    fn deserialize_cpuset_serde_kv() {
1434        let res: CpuSet = from_key_values("[0,4,7]").unwrap();
1435        assert_eq!(res, CpuSet::new(vec![0, 4, 7]));
1436
1437        let res: CpuSet = from_key_values("[9-12]").unwrap();
1438        assert_eq!(res, CpuSet::new(vec![9, 10, 11, 12]));
1439
1440        let res: CpuSet = from_key_values("[0,4,7,9-12,15]").unwrap();
1441        assert_eq!(res, CpuSet::new(vec![0, 4, 7, 9, 10, 11, 12, 15]));
1442    }
1443
1444    #[test]
1445    fn deserialize_serialize_cpuset_json() {
1446        let json_str = "[0,4,7]";
1447        let cpuset = CpuSet::new(vec![0, 4, 7]);
1448        let res: CpuSet = serde_json::from_str(json_str).unwrap();
1449        assert_eq!(res, cpuset);
1450        assert_eq!(serde_json::to_string(&cpuset).unwrap(), json_str);
1451
1452        let json_str = r#"["9-12"]"#;
1453        let cpuset = CpuSet::new(vec![9, 10, 11, 12]);
1454        let res: CpuSet = serde_json::from_str(json_str).unwrap();
1455        assert_eq!(res, cpuset);
1456        assert_eq!(serde_json::to_string(&cpuset).unwrap(), json_str);
1457
1458        let json_str = r#"[0,4,7,"9-12",15]"#;
1459        let cpuset = CpuSet::new(vec![0, 4, 7, 9, 10, 11, 12, 15]);
1460        let res: CpuSet = serde_json::from_str(json_str).unwrap();
1461        assert_eq!(res, cpuset);
1462        assert_eq!(serde_json::to_string(&cpuset).unwrap(), json_str);
1463    }
1464
1465    #[test]
1466    fn load_image_high_max_4g() {
1467        let mem = GuestMemory::new(&[
1468            (GuestAddress(0x0000_0000), 0x4000_0000), // 0x00000000..0x40000000
1469            (GuestAddress(0x8000_0000), 0x4000_0000), // 0x80000000..0xC0000000
1470        ])
1471        .unwrap();
1472
1473        const TEST_IMAGE_SIZE: u64 = 1234;
1474        let mut test_image = tempfile().unwrap();
1475        test_image.set_len(TEST_IMAGE_SIZE).unwrap();
1476
1477        const TEST_ALIGN: u64 = 0x8000;
1478        let (addr, size) = load_image_high(
1479            &mem,
1480            &mut test_image,
1481            GuestAddress(0x8000),
1482            GuestAddress(0xFFFF_FFFF), // max_guest_addr beyond highest guest memory region
1483            None,
1484            TEST_ALIGN,
1485        )
1486        .unwrap();
1487
1488        assert_eq!(addr, GuestAddress(0xBFFF_8000));
1489        assert_eq!(addr.offset() % TEST_ALIGN, 0);
1490        assert_eq!(size, TEST_IMAGE_SIZE as usize);
1491    }
1492}