arch/
lib.rs

1// Copyright 2018 The ChromiumOS Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5//! Virtual machine architecture support code.
6
7pub mod android;
8pub mod fdt;
9pub mod pstore;
10pub mod serial;
11
12pub mod sys;
13
14use std::collections::BTreeMap;
15use std::error::Error as StdError;
16use std::fs::File;
17use std::io;
18use std::ops::Deref;
19use std::path::PathBuf;
20use std::str::FromStr;
21use std::sync::mpsc;
22use std::sync::mpsc::SendError;
23use std::sync::Arc;
24
25use acpi_tables::sdt::SDT;
26use base::syslog;
27use base::AsRawDescriptors;
28use base::FileGetLen;
29use base::FileReadWriteAtVolatile;
30use base::RecvTube;
31use base::SendTube;
32use base::Tube;
33use devices::virtio::VirtioDevice;
34use devices::BarRange;
35use devices::Bus;
36use devices::BusDevice;
37use devices::BusDeviceObj;
38use devices::BusError;
39use devices::BusResumeDevice;
40use devices::FwCfgParameters;
41use devices::GpeScope;
42use devices::HotPlugBus;
43use devices::IrqChip;
44use devices::IrqEventSource;
45use devices::PciAddress;
46use devices::PciBus;
47use devices::PciDevice;
48use devices::PciDeviceError;
49use devices::PciInterruptPin;
50use devices::PciRoot;
51use devices::PciRootCommand;
52use devices::PreferredIrq;
53#[cfg(any(target_os = "android", target_os = "linux"))]
54use devices::ProxyDevice;
55use devices::SerialHardware;
56use devices::SerialParameters;
57pub use fdt::apply_device_tree_overlays;
58pub use fdt::DtbOverlay;
59#[cfg(feature = "gdb")]
60use gdbstub::arch::Arch;
61pub use hypervisor::CpuConfigArch;
62pub use hypervisor::HypervisorArch;
63use hypervisor::MemCacheType;
64pub use hypervisor::VcpuArch;
65pub use hypervisor::VcpuInitArch;
66use hypervisor::Vm;
67pub use hypervisor::VmArch;
68#[cfg(windows)]
69use jail::FakeMinijailStub as Minijail;
70#[cfg(any(target_os = "android", target_os = "linux"))]
71use minijail::Minijail;
72use remain::sorted;
73use resources::SystemAllocator;
74use resources::SystemAllocatorConfig;
75use serde::de::Visitor;
76use serde::Deserialize;
77use serde::Serialize;
78use serde_keyvalue::FromKeyValues;
79pub use serial::add_serial_devices;
80pub use serial::get_serial_cmdline;
81pub use serial::set_default_serial_parameters;
82pub use serial::GetSerialCmdlineError;
83pub use serial::SERIAL_ADDR;
84use sync::Condvar;
85use sync::Mutex;
86use thiserror::Error;
87use uuid::Uuid;
88use vm_control::BatControl;
89use vm_control::BatteryType;
90use vm_control::PmResource;
91use vm_memory::GuestAddress;
92use vm_memory::GuestMemory;
93use vm_memory::GuestMemoryError;
94use vm_memory::MemoryRegionInformation;
95use vm_memory::MemoryRegionOptions;
96
97cfg_if::cfg_if! {
98    if #[cfg(target_arch = "aarch64")] {
99        pub use devices::IrqChipAArch64 as IrqChipArch;
100        #[cfg(feature = "gdb")]
101        pub use gdbstub_arch::aarch64::AArch64 as GdbArch;
102    } else if #[cfg(target_arch = "riscv64")] {
103        pub use devices::IrqChipRiscv64 as IrqChipArch;
104        #[cfg(feature = "gdb")]
105        pub use gdbstub_arch::riscv::Riscv64 as GdbArch;
106    } else if #[cfg(target_arch = "x86_64")] {
107        pub use devices::IrqChipX86_64 as IrqChipArch;
108        #[cfg(feature = "gdb")]
109        pub use gdbstub_arch::x86::X86_64_SSE as GdbArch;
110    }
111}
112
/// Image file a VM is started from, tagged by its kind.
// NOTE(review): the variant descriptions below are inferred from the variant names only —
// confirm against the per-architecture `build_vm` implementations.
pub enum VmImage {
    /// Open file presumably containing a kernel image.
    Kernel(File),
    /// Open file presumably containing a BIOS/firmware image.
    Bios(File),
}
117
/// Pstore settings: a backing path and a size.
///
/// Can be built from key-value input (`FromKeyValues`) or via serde. Field names are
/// kebab-case and unknown fields are rejected.
#[derive(Clone, Debug, Deserialize, Serialize, FromKeyValues, PartialEq, Eq)]
#[serde(deny_unknown_fields, rename_all = "kebab-case")]
pub struct Pstore {
    /// Path associated with the pstore (presumably the backing file — confirm against the
    /// `pstore` module).
    pub path: PathBuf,
    /// Size of the pstore region; presumably in bytes — TODO confirm.
    pub size: u32,
}
124
/// Where the FDT is placed relative to RAM.
///
/// Variant names are kebab-case when (de)serialized (`start`, `end`, `after-payload`).
#[derive(Clone, Copy, Debug, Serialize, Deserialize, FromKeyValues)]
#[serde(deny_unknown_fields, rename_all = "kebab-case")]
pub enum FdtPosition {
    /// At the start of RAM.
    Start,
    /// Near the end of RAM.
    End,
    /// After the payload, with some padding for alignment.
    AfterPayload,
}
135
/// A list of CPU core indices.
///
/// Indices are stored exactly as supplied: the constructor neither sorts nor removes
/// duplicates.
#[derive(Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
pub struct CpuSet(Vec<usize>);

impl CpuSet {
    /// Builds a set from any iterable of core indices, keeping the iteration order.
    pub fn new<I: IntoIterator<Item = usize>>(cpus: I) -> Self {
        let cores: Vec<usize> = cpus.into_iter().collect();
        Self(cores)
    }

    /// Returns a borrowing iterator over the stored core indices.
    pub fn iter(&self) -> std::slice::Iter<'_, usize> {
        let Self(cores) = self;
        cores.iter()
    }
}
149
150impl FromIterator<usize> for CpuSet {
151    fn from_iter<T>(iter: T) -> Self
152    where
153        T: IntoIterator<Item = usize>,
154    {
155        CpuSet::new(iter)
156    }
157}
158
/// Default value for `SveConfig::auto`: automatic SVE detection is on.
///
/// Referenced by name from the `#[serde(default = ...)]` attribute on that field and reused by
/// `SveConfig`'s `Default` impl so both paths agree.
#[cfg(target_arch = "aarch64")]
fn sve_auto_default() -> bool {
    true
}
163
/// The SVE config for Vcpus.
///
/// Field names are kebab-case when (de)serialized and unknown fields are rejected.
#[cfg(target_arch = "aarch64")]
#[derive(Copy, Clone, Debug, PartialEq, Eq, Deserialize, Serialize)]
#[serde(deny_unknown_fields, rename_all = "kebab-case")]
pub struct SveConfig {
    /// Detect if SVE is available and enable accordingly. Defaults to `true` when the field is
    /// omitted from the input.
    // NOTE(review): the previous wording said "`enable` is ignored if auto is true", but this
    // struct has no `enable` field; `auto` is the only setting here.
    #[serde(default = "sve_auto_default")]
    pub auto: bool,
}
173
#[cfg(target_arch = "aarch64")]
impl Default for SveConfig {
    /// Uses the same default for `auto` as deserialization does (`sve_auto_default`).
    fn default() -> Self {
        let auto = sve_auto_default();
        Self { auto }
    }
}
182
/// FFA config.
///
/// Can be built from key-value input (`FromKeyValues`) or via serde. Field names are
/// kebab-case and unknown fields are rejected.
// For now this is limited to android, will be opened to other aarch64 based pVMs after
// corresponding kernel APIs are upstreamed.
#[cfg(all(target_os = "android", target_arch = "aarch64"))]
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Deserialize, Serialize, FromKeyValues)]
#[serde(deny_unknown_fields, rename_all = "kebab-case")]
pub struct FfaConfig {
    /// Just enable FFA, don't care about the negotiated version. Defaults to `false` when
    /// omitted.
    #[serde(default)]
    pub auto: bool,
}
194
/// Parses one CPU specifier and appends the cores it names to `cpuset`.
///
/// `s` is either a single index (`"3"`) or an inclusive, ascending range (`"5-8"`). On
/// success every index in the range is pushed onto `cpuset`; on failure `cpuset` is left
/// untouched and a human-readable message is returned.
fn parse_cpu_range(s: &str, cpuset: &mut Vec<usize>) -> Result<(), String> {
    let parse_index = |text: &str| -> Result<usize, String> {
        text.parse::<usize>().map_err(|_| {
            format!("invalid CPU index {text} - index must be a non-negative integer")
        })
    };

    let cores = if let Some((low, high)) = s.split_once('-') {
        // Both bounds are parsed (low first) before the ordering check.
        let low = parse_index(low)?;
        let high = parse_index(high)?;
        if high < low {
            return Err(format!(
                "invalid CPU range {s} - ranges must be from low to high"
            ));
        }
        low..=high
    } else {
        // No dash: a single index is treated as a one-element range.
        let only = parse_index(s)?;
        only..=only
    };

    cpuset.extend(cores);
    Ok(())
}
223
224impl FromStr for CpuSet {
225    type Err = String;
226
227    fn from_str(s: &str) -> Result<Self, Self::Err> {
228        let mut cpuset = Vec::new();
229        for part in s.split(',') {
230            parse_cpu_range(part, &mut cpuset)?;
231        }
232        Ok(CpuSet::new(cpuset))
233    }
234}
235
236impl Deref for CpuSet {
237    type Target = Vec<usize>;
238
239    fn deref(&self) -> &Self::Target {
240        &self.0
241    }
242}
243
244impl IntoIterator for CpuSet {
245    type Item = usize;
246    type IntoIter = std::vec::IntoIter<Self::Item>;
247
248    fn into_iter(self) -> Self::IntoIter {
249        self.0.into_iter()
250    }
251}
252
/// Selects the interface for guest-controlled power management of assigned devices.
///
/// The `FromStr` impl for this type accepts the string `"pkvm-hvc"` for
/// [`DevicePowerManagerConfig::PkvmHvc`].
#[derive(Clone, Copy, Debug, Deserialize, PartialEq, Eq, Serialize)]
pub enum DevicePowerManagerConfig {
    /// Uses the protected KVM hypercall interface.
    PkvmHvc,
}
259
260impl FromStr for DevicePowerManagerConfig {
261    type Err = String;
262
263    fn from_str(s: &str) -> Result<Self, Self::Err> {
264        match s {
265            "pkvm-hvc" => Ok(Self::PkvmHvc),
266            _ => Err(format!("DevicePowerManagerConfig '{s}' not supported")),
267        }
268    }
269}
270
/// Deserializes a `CpuSet` from a sequence whose elements can either be integers, or strings
/// representing CPU ranges (e.g. `5-8`).
///
/// Elements are appended in the order they appear; range strings are expanded with
/// `parse_cpu_range`, and its error message is surfaced as a custom serde error.
impl<'de> Deserialize<'de> for CpuSet {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        // Visitor that only accepts sequences.
        struct CpuSetVisitor;
        impl<'de> Visitor<'de> for CpuSetVisitor {
            type Value = CpuSet;

            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
                formatter.write_str("CpuSet")
            }

            fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
            where
                A: serde::de::SeqAccess<'de>,
            {
                // One sequence element: a bare index or a string.
                // Untagged, so the variant is chosen by shape; `Single` is declared first so
                // that integers are matched before strings are considered.
                // NOTE(review): `Range` borrows its `&str` from the input, so this presumably
                // requires a deserializer that can hand out borrowed strings — confirm.
                #[derive(Deserialize)]
                #[serde(untagged)]
                enum CpuSetValue<'a> {
                    Single(usize),
                    Range(&'a str),
                }

                let mut cpus = Vec::new();
                while let Some(cpuset) = seq.next_element::<CpuSetValue>()? {
                    match cpuset {
                        CpuSetValue::Single(cpu) => cpus.push(cpu),
                        CpuSetValue::Range(range) => {
                            // Accepts "a-b" as well as a plain "a" string.
                            parse_cpu_range(range, &mut cpus).map_err(serde::de::Error::custom)?;
                        }
                    }
                }

                Ok(CpuSet::new(cpus))
            }
        }

        deserializer.deserialize_seq(CpuSetVisitor)
    }
}
314
315/// Serializes a `CpuSet` into a sequence of integers and strings representing CPU ranges.
316impl Serialize for CpuSet {
317    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
318    where
319        S: serde::Serializer,
320    {
321        use serde::ser::SerializeSeq;
322
323        let mut seq = serializer.serialize_seq(None)?;
324
325        // Factorize ranges into "a-b" strings.
326        let mut serialize_range = |start: usize, end: usize| -> Result<(), S::Error> {
327            if start == end {
328                seq.serialize_element(&start)?;
329            } else {
330                seq.serialize_element(&format!("{start}-{end}"))?;
331            }
332
333            Ok(())
334        };
335
336        // Current range.
337        let mut range = None;
338        for core in &self.0 {
339            range = match range {
340                None => Some((core, core)),
341                Some((start, end)) if *end == *core - 1 => Some((start, core)),
342                Some((start, end)) => {
343                    serialize_range(*start, *end)?;
344                    Some((core, core))
345                }
346            };
347        }
348
349        if let Some((start, end)) = range {
350            serialize_range(*start, *end)?;
351        }
352
353        seq.end()
354    }
355}
356
/// Mapping of guest VCPU threads to host CPU cores.
///
/// Either one host core set shared by every VCPU, or an individual set per VCPU index.
#[derive(Clone, Debug, Deserialize, PartialEq, Eq, Serialize)]
pub enum VcpuAffinity {
    /// All VCPU threads will be pinned to the same set of host CPU cores.
    Global(CpuSet),
    /// Each VCPU may be pinned to a set of host CPU cores.
    /// The map key is a guest VCPU index, and the corresponding value is the set of
    /// host CPU indices that the VCPU thread will be allowed to run on.
    /// If a VCPU index is not present in the map, its affinity will not be set.
    PerVcpu(BTreeMap<usize, CpuSet>),
}
368
/// Memory region with optional size.
///
/// Can be built from key-value input (`FromKeyValues`) or via serde.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize, FromKeyValues)]
pub struct MemoryRegionConfig {
    /// Start of the region (presumably a guest physical address — TODO confirm).
    pub start: u64,
    /// Size of the region, if specified. How a missing size is resolved is decided by the
    /// code consuming this config, not here.
    pub size: Option<u64>,
}
375
/// General PCI config.
///
/// Every region is optional and defaults to `None`. The set of available fields depends on
/// the target architecture: `cam` exists only on aarch64 and `ecam` only on x86_64.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize, Deserialize, FromKeyValues)]
pub struct PciConfig {
    /// region for PCI Configuration Access Mechanism
    #[cfg(target_arch = "aarch64")]
    pub cam: Option<MemoryRegionConfig>,
    /// region for PCIe Enhanced Configuration Access Mechanism
    #[cfg(target_arch = "x86_64")]
    pub ecam: Option<MemoryRegionConfig>,
    /// region for non-prefetchable PCI device memory below 4G
    pub mem: Option<MemoryRegionConfig>,
}
388
/// Default CPU capacity value.
// NOTE(review): presumably the value used when no per-vCPU capacity is configured — confirm
// against the users of this constant.
pub const DEFAULT_CPU_CAPACITY: u32 = 1024;
390
/// Per-vCPU properties, as assembled by `derive_vcpu_properties`.
///
/// Optional fields are `None` (and `frequencies` is empty) when nothing was configured for the
/// vCPU. Three fields exist only on aarch64 Linux/Android builds.
// `#[sorted]` comes from the `remain` crate; it is expected to reject out-of-order fields at
// compile time, so new fields should be inserted alphabetically.
#[sorted]
#[derive(Clone, Debug, Deserialize, PartialEq, Eq, Serialize)]
pub struct VcpuProperties {
    /// Configured capacity of this vCPU, if any.
    pub capacity: Option<u32>,
    /// Configured dynamic power coefficient of this vCPU, if any.
    pub dynamic_power_coefficient: Option<u32>,
    /// Configured frequencies of this vCPU; units are not visible here — TODO confirm.
    pub frequencies: Vec<u32>,
    /// Configured normalized CPU IPC ratio of this vCPU, if any.
    #[cfg(all(
        target_arch = "aarch64",
        any(target_os = "android", target_os = "linux")
    ))]
    pub normalized_cpu_ipc_ratio: Option<u32>,
    /// Configured vCPU domain of this vCPU, if any.
    #[cfg(all(
        target_arch = "aarch64",
        any(target_os = "android", target_os = "linux")
    ))]
    pub vcpu_domain: Option<u32>,
    /// Configured vCPU domain path of this vCPU, if any.
    #[cfg(all(
        target_arch = "aarch64",
        any(target_os = "android", target_os = "linux")
    ))]
    pub vcpu_domain_path: Option<PathBuf>,
}
413
414/// Derives base VCPU properties from various config fields.
415pub fn derive_vcpu_properties(
416    vcpu_count: usize,
417    vcpu_capacity: &std::collections::BTreeMap<usize, u32>,
418    dynamic_power_coefficient: &std::collections::BTreeMap<usize, u32>,
419    vcpu_frequencies: &std::collections::BTreeMap<usize, Vec<u32>>,
420    #[cfg(all(
421        target_arch = "aarch64",
422        any(target_os = "android", target_os = "linux")
423    ))]
424    normalized_cpu_ipc_ratio: &std::collections::BTreeMap<usize, u32>,
425    #[cfg(all(
426        target_arch = "aarch64",
427        any(target_os = "android", target_os = "linux")
428    ))]
429    vcpu_domain: &std::collections::BTreeMap<usize, u32>,
430    #[cfg(all(
431        target_arch = "aarch64",
432        any(target_os = "android", target_os = "linux")
433    ))]
434    vcpu_domain_path: &std::collections::BTreeMap<usize, std::path::PathBuf>,
435) -> std::collections::BTreeMap<usize, VcpuProperties> {
436    let mut vcpu_properties = std::collections::BTreeMap::new();
437    for vcpu_id in 0..vcpu_count {
438        let vcpu_prop_capacity = vcpu_capacity.get(&vcpu_id).copied();
439
440        vcpu_properties.insert(
441            vcpu_id,
442            VcpuProperties {
443                capacity: vcpu_prop_capacity,
444                frequencies: vcpu_frequencies.get(&vcpu_id).cloned().unwrap_or_default(),
445                dynamic_power_coefficient: dynamic_power_coefficient.get(&vcpu_id).copied(),
446                #[cfg(all(
447                    target_arch = "aarch64",
448                    any(target_os = "android", target_os = "linux")
449                ))]
450                normalized_cpu_ipc_ratio: normalized_cpu_ipc_ratio.get(&vcpu_id).copied(),
451                #[cfg(all(
452                    target_arch = "aarch64",
453                    any(target_os = "android", target_os = "linux")
454                ))]
455                vcpu_domain: vcpu_domain.get(&vcpu_id).copied(),
456                #[cfg(all(
457                    target_arch = "aarch64",
458                    any(target_os = "android", target_os = "linux")
459                ))]
460                vcpu_domain_path: vcpu_domain_path.get(&vcpu_id).cloned(),
461            },
462        );
463    }
464    vcpu_properties
465}
466
/// Holds the pieces needed to build a VM. Passed to `build_vm` in the `LinuxArch` trait below to
/// create a `RunnableLinuxVm`.
///
/// Several fields are only present for particular target architectures or operating systems
/// (see the `#[cfg]` attributes on individual fields).
// `#[sorted]` comes from the `remain` crate; it is expected to reject out-of-order fields at
// compile time, so new fields should be inserted alphabetically.
#[sorted]
pub struct VmComponents {
    pub acpi_sdts: Vec<SDT>,
    pub android_fstab: Option<File>,
    pub boot_cpu: usize,
    pub bootorder_fw_cfg_blob: Vec<u8>,
    #[cfg(target_arch = "x86_64")]
    pub break_linux_pci_config_io: bool,

    pub delay_rt: bool,
    /// Interface for guest-controlled power management of assigned devices, if enabled.
    pub dev_pm: Option<DevicePowerManagerConfig>,
    pub extra_kernel_params: Vec<String>,
    #[cfg(target_arch = "x86_64")]
    pub force_s2idle: bool,
    pub fw_cfg_enable: bool,
    pub fw_cfg_parameters: Vec<FwCfgParameters>,
    pub host_cpu_topology: bool,
    pub hugepages: bool,
    pub hv_cfg: hypervisor::Config,
    pub initrd_image: Option<File>,
    pub itmt: bool,
    // NOTE(review): presumably in bytes — confirm against the callers.
    pub memory_size: u64,
    pub no_i8042: bool,
    pub no_rtc: bool,
    pub no_smt: bool,

    pub pci_config: PciConfig,
    pub pflash_block_size: u32,
    pub pflash_image: Option<File>,
    pub pstore: Option<Pstore>,
    /// A file to load as pVM firmware. Must be `Some` iff
    /// `hv_cfg.protection_type == ProtectionType::UnprotectedWithFirmware`.
    pub pvm_fw: Option<File>,
    pub rt_cpus: CpuSet,
    #[cfg(target_arch = "x86_64")]
    pub smbios: SmbiosOptions,
    pub smccc_trng: bool,
    #[cfg(target_arch = "aarch64")]
    pub sve_config: SveConfig,
    pub swiotlb: Option<u64>,
    /// Optional pinning of VCPU threads to host CPU cores.
    pub vcpu_affinity: Option<VcpuAffinity>,
    /// List of vCPU clusters, mapped from pCPU clusters.
    pub vcpu_clusters: Vec<CpuSet>,
    /// Per-vCPU properties keyed by vCPU index (see `derive_vcpu_properties`).
    pub vcpu_properties: BTreeMap<usize, VcpuProperties>,
    #[cfg(any(target_os = "android", target_os = "linux"))]
    pub vfio_platform_pm: bool,
    #[cfg(all(
        target_arch = "aarch64",
        any(target_os = "android", target_os = "linux")
    ))]
    pub virt_cpufreq_v2: bool,
    /// The image the VM is started from.
    pub vm_image: VmImage,
}
522
/// Holds the elements needed to run a Linux VM. Created by `build_vm`.
// `#[sorted]` comes from the `remain` crate; it is expected to reject out-of-order fields at
// compile time, so new fields should be inserted alphabetically.
#[sorted]
pub struct RunnableLinuxVm {
    pub bat_control: Option<BatControl>,
    pub delay_rt: bool,
    pub devices_thread: Option<std::thread::JoinHandle<()>>,
    // NOTE(review): the `u8` key is presumably a PCI bus number — confirm.
    pub hotplug_bus: BTreeMap<u8, Arc<Mutex<dyn HotPlugBus>>>,
    pub hypercall_bus: Arc<Bus>,
    pub io_bus: Arc<Bus>,
    pub irq_chip: Arc<dyn IrqChipArch>,
    pub mmio_bus: Arc<Bus>,
    pub no_smt: bool,
    // NOTE(review): presumably maps a process id to a human-readable label — confirm.
    pub pid_debug_label_map: BTreeMap<u32, String>,
    #[cfg(any(target_os = "android", target_os = "linux"))]
    pub platform_devices: Vec<Arc<Mutex<dyn BusDevice>>>,
    pub pm: Option<Arc<Mutex<dyn PmResource + Send>>>,
    /// Devices to be notified before the system resumes from the S3 suspended state.
    pub resume_notify_devices: Vec<Arc<Mutex<dyn BusResumeDevice>>>,
    pub root_config: Arc<Mutex<PciRoot>>,
    pub rt_cpus: CpuSet,
    /// Sending and receiving ends of the suspend tube; the sender is shareable behind a mutex.
    pub suspend_tube: (Arc<Mutex<SendTube>>, RecvTube),
    /// Optional pinning of VCPU threads to host CPU cores.
    pub vcpu_affinity: Option<VcpuAffinity>,
    pub vcpu_count: usize,
    pub vcpu_init: Vec<VcpuInitArch>,
    /// If vcpus is None, then it's the responsibility of the vcpu thread to create vcpus.
    /// If it's Some, then `build_vm` already created the vcpus.
    pub vcpus: Option<Vec<Arc<dyn VcpuArch>>>,
    pub vm: Arc<dyn VmArch>,
    pub vm_request_tubes: Vec<Tube>,
}
553
/// The device and optional jail.
///
/// `Minijail` is the real `minijail::Minijail` on Linux/Android and a stub type on Windows
/// (see the `use` declarations at the top of this file).
pub struct VirtioDeviceStub {
    /// The virtio device itself.
    pub dev: Box<dyn VirtioDevice>,
    /// Jail for the device, if any; `None` presumably means the device runs unjailed —
    /// confirm against the device setup code.
    pub jail: Option<Minijail>,
}
559
/// Trait which is implemented for each Linux Architecture in order to
/// set up the memory, cpus, and system devices and to boot the kernel.
///
/// All items are associated functions (no `self`), so implementors act as a namespace of
/// architecture-specific routines.
pub trait LinuxArch {
    /// Error type returned by every fallible function of this trait.
    type Error: StdError;
    /// Architecture-specific memory layout description produced by `arch_memory_layout` and
    /// passed back into the later setup functions.
    type ArchMemoryLayout;

    /// Decide architecture specific memory layout details to be used by later stages of the VM
    /// setup.
    ///
    /// # Arguments
    ///
    /// * `components` - Parts used to determine the memory layout.
    fn arch_memory_layout(
        components: &VmComponents,
    ) -> std::result::Result<Self::ArchMemoryLayout, Self::Error>;

    /// Returns a Vec of the valid memory regions as tuples of start address, length and
    /// per-region options. These should be used to configure the `GuestMemory` structure for
    /// the platform.
    ///
    /// # Arguments
    ///
    /// * `components` - Parts used to determine the memory layout.
    /// * `arch_memory_layout` - Layout previously computed by `arch_memory_layout`.
    /// * `hypervisor` - The hypervisor in use.
    fn guest_memory_layout(
        components: &VmComponents,
        arch_memory_layout: &Self::ArchMemoryLayout,
        hypervisor: &impl hypervisor::Hypervisor,
    ) -> std::result::Result<Vec<(GuestAddress, u64, MemoryRegionOptions)>, Self::Error>;

    /// Gets the configuration for a new `SystemAllocator` that fits the given `Vm`'s memory layout.
    ///
    /// This is the per-architecture template for constructing the `SystemAllocator`. Platform
    /// agnostic modifications may be made to this configuration, but the final `SystemAllocator`
    /// will be at least as strict as this configuration.
    ///
    /// # Arguments
    ///
    /// * `vm` - The virtual machine to be used as a template for the `SystemAllocator`.
    /// * `arch_memory_layout` - Layout previously computed by `arch_memory_layout`.
    fn get_system_allocator_config(
        vm: &dyn Vm,
        arch_memory_layout: &Self::ArchMemoryLayout,
    ) -> SystemAllocatorConfig;

    /// Takes `VmComponents` and generates a `RunnableLinuxVm`.
    ///
    /// # Arguments
    ///
    /// * `components` - Parts to use to build the VM.
    /// * `arch_memory_layout` - Layout previously computed by `arch_memory_layout`.
    /// * `vm_evt_wrtube` - Tube used by sub-devices to request that crosvm exit because guest wants
    ///   to stop/shut down or requested reset.
    /// * `system_allocator` - Allocator created by this trait's implementation of
    ///   `get_system_allocator_config`.
    /// * `serial_parameters` - Definitions for how the serial devices should be configured.
    /// * `serial_jail` - Jail used for serial devices created here.
    /// * `battery` - Defines what battery device will be created.
    /// * `vm` - A VM implementation to build upon.
    /// * `ramoops_region` - Region allocated for ramoops.
    /// * `devices` - The devices to be built into the VM.
    /// * `irq_chip` - The IRQ chip implementation for the VM.
    /// * `vcpu_ids` - Mutable list of vCPU ids (presumably filled in or adjusted by the
    ///   implementation — confirm per architecture).
    /// * `dump_device_tree_blob` - Optional path (presumably where the generated device tree
    ///   blob is written — confirm per architecture).
    /// * `debugcon_jail` - Jail used for debugcon devices created here.
    /// * `pflash_jail` - Jail used for pflash device created here (x86_64 only).
    /// * `fw_cfg_jail` - Jail used for fw_cfg device created here (x86_64 only).
    /// * `swap_controller` - Optional swap controller (only with the `swap` feature).
    /// * `guest_suspended_cvar` - Optional shared flag and condition variable (presumably
    ///   signalling guest suspension — confirm).
    /// * `device_tree_overlays` - Device tree overlay binaries
    /// * `fdt_position` - Optional placement of the FDT relative to RAM.
    /// * `no_pmu` - Flag whose name suggests the PMU should not be exposed — confirm per
    ///   architecture.
    fn build_vm(
        components: VmComponents,
        arch_memory_layout: &Self::ArchMemoryLayout,
        vm_evt_wrtube: &SendTube,
        system_allocator: &mut SystemAllocator,
        serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
        serial_jail: Option<Minijail>,
        battery: (Option<BatteryType>, Option<Minijail>),
        vm: Arc<dyn VmArch>,
        ramoops_region: Option<pstore::RamoopsRegion>,
        devices: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>,
        irq_chip: Arc<dyn IrqChipArch>,
        vcpu_ids: &mut Vec<usize>,
        dump_device_tree_blob: Option<PathBuf>,
        debugcon_jail: Option<Minijail>,
        #[cfg(target_arch = "x86_64")] pflash_jail: Option<Minijail>,
        #[cfg(target_arch = "x86_64")] fw_cfg_jail: Option<Minijail>,
        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
        guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>,
        device_tree_overlays: Vec<DtbOverlay>,
        fdt_position: Option<FdtPosition>,
        no_pmu: bool,
    ) -> std::result::Result<RunnableLinuxVm, Self::Error>;

    /// Configures the vcpu and should be called once per vcpu from the vcpu's thread.
    ///
    /// # Arguments
    ///
    /// * `vm` - The virtual machine object.
    /// * `hypervisor` - The `Hypervisor` that created the vcpu.
    /// * `irq_chip` - The `IrqChip` associated with this vm.
    /// * `vcpu` - The VCPU object to configure.
    /// * `vcpu_init` - The data required to initialize VCPU registers and other state.
    /// * `vcpu_id` - The id of the given `vcpu`.
    /// * `num_vcpus` - Number of virtual CPUs the guest will have.
    /// * `cpu_config` - CPU feature configurations.
    fn configure_vcpu(
        vm: &dyn Vm,
        hypervisor: &dyn HypervisorArch,
        irq_chip: &dyn IrqChipArch,
        vcpu: &dyn VcpuArch,
        vcpu_init: VcpuInitArch,
        vcpu_id: usize,
        num_vcpus: usize,
        cpu_config: Option<CpuConfigArch>,
    ) -> Result<(), Self::Error>;

    /// Configures a PCI device and adds it to the VM, returning the PCI address it was given.
    ///
    /// # Arguments
    ///
    /// * `linux` - The VM to add the device to.
    /// * `device` - The PCI device to register.
    /// * `minijail` - Optional jail for the device (Linux/Android only).
    /// * `resources` - Allocator to take the device's resources from.
    /// * `hp_control_tube` - Channel for sending `PciRootCommand`s.
    /// * `swap_controller` - Optional swap controller (only with the `swap` feature).
    fn register_pci_device(
        linux: &mut RunnableLinuxVm,
        device: Box<dyn PciDevice>,
        #[cfg(any(target_os = "android", target_os = "linux"))] minijail: Option<Minijail>,
        resources: &mut SystemAllocator,
        hp_control_tube: &mpsc::Sender<PciRootCommand>,
        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
    ) -> Result<PciAddress, Self::Error>;

    /// Returns frequency map for each of the host's logical cores.
    fn get_host_cpu_frequencies_khz() -> Result<BTreeMap<usize, Vec<u32>>, Self::Error>;

    /// Returns max-freq map of the host's logical cores.
    fn get_host_cpu_max_freq_khz() -> Result<BTreeMap<usize, u32>, Self::Error>;

    /// Returns capacity map of the host's logical cores.
    fn get_host_cpu_capacity() -> Result<BTreeMap<usize, u32>, Self::Error>;

    /// Returns cluster masks for each of the host's logical cores.
    fn get_host_cpu_clusters() -> Result<Vec<CpuSet>, Self::Error>;
}
687
/// Architecture-specific vCPU operations needed for GDB debugging.
///
/// Only compiled with the `gdb` feature. Register types come from the `GdbArch` selected for
/// the target architecture.
#[cfg(feature = "gdb")]
pub trait GdbOps {
    /// Error type returned by every function of this trait.
    type Error: StdError;

    /// Reads vCPU's registers.
    fn read_registers(vcpu: &dyn VcpuArch) -> Result<<GdbArch as Arch>::Registers, Self::Error>;

    /// Writes vCPU's registers.
    fn write_registers(
        vcpu: &dyn VcpuArch,
        regs: &<GdbArch as Arch>::Registers,
    ) -> Result<(), Self::Error>;

    /// Reads `len` bytes from the guest memory starting at `vaddr`.
    fn read_memory(
        vcpu: &dyn VcpuArch,
        guest_mem: &GuestMemory,
        vaddr: GuestAddress,
        len: usize,
    ) -> Result<Vec<u8>, Self::Error>;

    /// Writes the bytes of `buf` to the guest memory starting at `vaddr`.
    fn write_memory(
        vcpu: &dyn VcpuArch,
        guest_mem: &GuestMemory,
        vaddr: GuestAddress,
        buf: &[u8],
    ) -> Result<(), Self::Error>;

    /// Reads bytes from the guest register.
    ///
    /// Returns an empty vector if `reg_id` is valid but the register is not available.
    fn read_register(
        vcpu: &dyn VcpuArch,
        reg_id: <GdbArch as Arch>::RegId,
    ) -> Result<Vec<u8>, Self::Error>;

    /// Writes bytes to the specified guest register.
    fn write_register(
        vcpu: &dyn VcpuArch,
        reg_id: <GdbArch as Arch>::RegId,
        data: &[u8],
    ) -> Result<(), Self::Error>;

    /// Make the next vCPU's run single-step.
    fn enable_singlestep(vcpu: &dyn VcpuArch) -> Result<(), Self::Error>;

    /// Get maximum number of hardware breakpoints.
    fn get_max_hw_breakpoints(vcpu: &dyn VcpuArch) -> Result<usize, Self::Error>;

    /// Set hardware breakpoints at the given addresses.
    fn set_hw_breakpoints(
        vcpu: &dyn VcpuArch,
        breakpoints: &[GuestAddress],
    ) -> Result<(), Self::Error>;
}
744
/// Errors for device manager.
///
/// The `#[error(...)]` attribute on each variant supplies its `Display` message via
/// `thiserror`. Some variants exist only on Linux/Android builds.
// `#[sorted]` comes from the `remain` crate; it is expected to reject out-of-order variants at
// compile time, so new variants should be inserted alphabetically.
#[sorted]
#[derive(Error, Debug)]
pub enum DeviceRegistrationError {
    /// No more MMIO space available.
    #[error("no more addresses are available")]
    AddrsExhausted,
    /// Could not allocate device address space for the device.
    #[error("Allocating device addresses: {0}")]
    AllocateDeviceAddrs(PciDeviceError),
    /// Could not allocate IO space for the device.
    #[error("Allocating IO addresses: {0}")]
    AllocateIoAddrs(PciDeviceError),
    /// Could not allocate MMIO or IO resource for the device.
    #[error("Allocating IO resource: {0}")]
    AllocateIoResource(resources::Error),
    /// Could not allocate an IRQ number.
    #[error("Allocating IRQ number")]
    AllocateIrq,
    /// Could not allocate IRQ resource for the device.
    #[cfg(any(target_os = "android", target_os = "linux"))]
    #[error("Allocating IRQ resource: {0}")]
    AllocateIrqResource(devices::vfio::VfioError),
    /// Could not attach the device to its power domain.
    #[error("failed to attach the device to its power domain: {0}")]
    AttachDevicePowerDomain(anyhow::Error),
    /// Broken pci topology
    #[error("pci topology is broken")]
    BrokenPciTopology,
    /// Unable to clone a jail for the device.
    #[cfg(any(target_os = "android", target_os = "linux"))]
    #[error("failed to clone jail: {0}")]
    CloneJail(minijail::Error),
    /// Appending to kernel command line failed.
    #[error("unable to add device to kernel command line: {0}")]
    Cmdline(kernel_cmdline::Error),
    /// Configure window size failed.
    #[error("failed to configure window size: {0}")]
    ConfigureWindowSize(PciDeviceError),
    /// Unable to create a pipe.
    #[error("failed to create pipe: {0}")]
    CreatePipe(base::Error),
    /// Unable to create a root.
    #[error("failed to create pci root: {0}")]
    CreateRoot(anyhow::Error),
    /// Unable to create serial device from serial parameters
    #[error("failed to create serial device: {0}")]
    CreateSerialDevice(devices::SerialError),
    /// Unable to create tube
    #[error("failed to create tube: {0}")]
    CreateTube(base::TubeError),
    /// Could not clone an event.
    #[error("failed to clone event: {0}")]
    EventClone(base::Error),
    /// Could not create an event.
    #[error("failed to create event: {0}")]
    EventCreate(base::Error),
    /// Failed to generate ACPI content.
    #[error("failed to generate ACPI content")]
    GenerateAcpi,
    /// No more IRQs are available.
    #[error("no more IRQs are available")]
    IrqsExhausted,
    /// VFIO device is missing a DT symbol.
    #[error("cannot match VFIO device to DT node due to a missing symbol")]
    MissingDeviceTreeSymbol,
    /// Missing a required serial device.
    #[error("missing required serial device {0}")]
    MissingRequiredSerialDevice(u8),
    /// Could not add a device to the mmio bus.
    #[error("failed to add to mmio bus: {0}")]
    MmioInsert(BusError),
    /// Failed to insert device into PCI root.
    #[error("failed to insert device into PCI root: {0}")]
    PciRootAddDevice(PciDeviceError),
    #[cfg(any(target_os = "android", target_os = "linux"))]
    /// Failed to initialize proxy device for jailed device.
    #[error("failed to create proxy device: {0}")]
    ProxyDeviceCreation(devices::ProxyError),
    #[cfg(any(target_os = "android", target_os = "linux"))]
    /// Failed to register battery device.
    #[error("failed to register battery device to VM: {0}")]
    RegisterBattery(devices::BatteryError),
    /// Could not register PCI device to pci root bus
    #[error("failed to register PCI device to pci root bus")]
    RegisterDevice(SendError<PciRootCommand>),
    /// Could not register PCI device capabilities.
    #[error("could not register PCI device capabilities: {0}")]
    RegisterDeviceCapabilities(PciDeviceError),
    /// Failed to register ioevent with VM.
    #[error("failed to register ioevent to VM: {0}")]
    RegisterIoevent(base::Error),
    /// Failed to register irq event with VM.
    #[error("failed to register irq event to VM: {0}")]
    RegisterIrqfd(base::Error),
    /// Could not setup VFIO platform IRQ for the device.
    #[error("Setting up VFIO platform IRQ: {0}")]
    SetupVfioPlatformIrq(anyhow::Error),
}
843
844/// Config a PCI device for used by this vm.
845pub fn configure_pci_device(
846    linux: &mut RunnableLinuxVm,
847    mut device: Box<dyn PciDevice>,
848    #[cfg(any(target_os = "android", target_os = "linux"))] jail: Option<Minijail>,
849    resources: &mut SystemAllocator,
850    hp_control_tube: &mpsc::Sender<PciRootCommand>,
851    #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
852) -> Result<PciAddress, DeviceRegistrationError> {
853    // Allocate PCI device address before allocating BARs.
854    let pci_address = device
855        .allocate_address(resources)
856        .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
857
858    // Allocate ranges that may need to be in the low MMIO region (MmioType::Low).
859    let mmio_ranges = device
860        .allocate_io_bars(resources)
861        .map_err(DeviceRegistrationError::AllocateIoAddrs)?;
862
863    // Allocate device ranges that may be in low or high MMIO after low-only ranges.
864    let device_ranges = device
865        .allocate_device_bars(resources)
866        .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
867
868    // If device is a pcie bridge, add its pci bus to pci root
869    if let Some(pci_bus) = device.get_new_pci_bus() {
870        hp_control_tube
871            .send(PciRootCommand::AddBridge(pci_bus))
872            .map_err(DeviceRegistrationError::RegisterDevice)?;
873        let bar_ranges = Vec::new();
874        device
875            .configure_bridge_window(resources, &bar_ranges)
876            .map_err(DeviceRegistrationError::ConfigureWindowSize)?;
877    }
878
879    // Do not suggest INTx for hot-plug devices.
880    let intx_event = devices::IrqLevelEvent::new().map_err(DeviceRegistrationError::EventCreate)?;
881
882    if let PreferredIrq::Fixed { pin, gsi } = device.preferred_irq() {
883        resources.reserve_irq(gsi);
884
885        device.assign_irq(
886            intx_event
887                .try_clone()
888                .map_err(DeviceRegistrationError::EventClone)?,
889            pin,
890            gsi,
891        );
892
893        linux
894            .irq_chip
895            .register_level_irq_event(gsi, &intx_event, IrqEventSource::from_device(&device))
896            .map_err(DeviceRegistrationError::RegisterIrqfd)?;
897    }
898
899    let mut keep_rds = device.keep_rds();
900    syslog::push_descriptors(&mut keep_rds);
901    cros_tracing::push_descriptors!(&mut keep_rds);
902    metrics::push_descriptors(&mut keep_rds);
903
904    device
905        .register_device_capabilities()
906        .map_err(DeviceRegistrationError::RegisterDeviceCapabilities)?;
907
908    #[cfg(any(target_os = "android", target_os = "linux"))]
909    let arced_dev: Arc<Mutex<dyn BusDevice>> = if let Some(jail) = jail {
910        let proxy = ProxyDevice::new(
911            device,
912            jail,
913            keep_rds,
914            #[cfg(feature = "swap")]
915            swap_controller,
916        )
917        .map_err(DeviceRegistrationError::ProxyDeviceCreation)?;
918        linux
919            .pid_debug_label_map
920            .insert(proxy.pid() as u32, proxy.debug_label());
921        Arc::new(Mutex::new(proxy))
922    } else {
923        device.on_sandboxed();
924        Arc::new(Mutex::new(device))
925    };
926
927    #[cfg(windows)]
928    let arced_dev = {
929        device.on_sandboxed();
930        Arc::new(Mutex::new(device))
931    };
932
933    #[cfg(any(target_os = "android", target_os = "linux"))]
934    hp_control_tube
935        .send(PciRootCommand::Add(pci_address, arced_dev.clone()))
936        .map_err(DeviceRegistrationError::RegisterDevice)?;
937
938    for range in &mmio_ranges {
939        linux
940            .mmio_bus
941            .insert(arced_dev.clone(), range.addr, range.size)
942            .map_err(DeviceRegistrationError::MmioInsert)?;
943    }
944
945    for range in &device_ranges {
946        linux
947            .mmio_bus
948            .insert(arced_dev.clone(), range.addr, range.size)
949            .map_err(DeviceRegistrationError::MmioInsert)?;
950    }
951
952    Ok(pci_address)
953}
954
955// Generate pci topology starting from parent bus
956fn generate_pci_topology(
957    parent_bus: Arc<Mutex<PciBus>>,
958    resources: &mut SystemAllocator,
959    io_ranges: &mut BTreeMap<usize, Vec<BarRange>>,
960    device_ranges: &mut BTreeMap<usize, Vec<BarRange>>,
961    device_addrs: &[PciAddress],
962    devices: &mut Vec<(Box<dyn PciDevice>, Option<Minijail>)>,
963) -> Result<(Vec<BarRange>, u8), DeviceRegistrationError> {
964    let mut bar_ranges = Vec::new();
965    let bus_num = parent_bus.lock().get_bus_num();
966    let mut subordinate_bus = bus_num;
967    for (dev_idx, addr) in device_addrs.iter().enumerate() {
968        // Only target for devices that located on this bus
969        if addr.bus == bus_num {
970            // If this device is a pci bridge (a.k.a., it has a pci bus structure),
971            // create its topology recursively
972            if let Some(child_bus) = devices[dev_idx].0.get_new_pci_bus() {
973                let (child_bar_ranges, child_sub_bus) = generate_pci_topology(
974                    child_bus.clone(),
975                    resources,
976                    io_ranges,
977                    device_ranges,
978                    device_addrs,
979                    devices,
980                )?;
981                let device = &mut devices[dev_idx].0;
982                parent_bus
983                    .lock()
984                    .add_child_bus(child_bus.clone())
985                    .map_err(|_| DeviceRegistrationError::BrokenPciTopology)?;
986                let bridge_window = device
987                    .configure_bridge_window(resources, &child_bar_ranges)
988                    .map_err(DeviceRegistrationError::ConfigureWindowSize)?;
989                bar_ranges.extend(bridge_window);
990
991                let ranges = device
992                    .allocate_io_bars(resources)
993                    .map_err(DeviceRegistrationError::AllocateIoAddrs)?;
994                io_ranges.insert(dev_idx, ranges.clone());
995                bar_ranges.extend(ranges);
996
997                let ranges = device
998                    .allocate_device_bars(resources)
999                    .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
1000                device_ranges.insert(dev_idx, ranges.clone());
1001                bar_ranges.extend(ranges);
1002
1003                device.set_subordinate_bus(child_sub_bus);
1004
1005                subordinate_bus = std::cmp::max(subordinate_bus, child_sub_bus);
1006            }
1007        }
1008    }
1009
1010    for (dev_idx, addr) in device_addrs.iter().enumerate() {
1011        if addr.bus == bus_num {
1012            let device = &mut devices[dev_idx].0;
1013            // Allocate MMIO for non-bridge devices
1014            if device.get_new_pci_bus().is_none() {
1015                let ranges = device
1016                    .allocate_io_bars(resources)
1017                    .map_err(DeviceRegistrationError::AllocateIoAddrs)?;
1018                io_ranges.insert(dev_idx, ranges.clone());
1019                bar_ranges.extend(ranges);
1020
1021                let ranges = device
1022                    .allocate_device_bars(resources)
1023                    .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
1024                device_ranges.insert(dev_idx, ranges.clone());
1025                bar_ranges.extend(ranges);
1026            }
1027        }
1028    }
1029    Ok((bar_ranges, subordinate_bus))
1030}
1031
1032/// Ensure all PCI devices have an assigned PCI address.
1033pub fn assign_pci_addresses(
1034    devices: &mut [(Box<dyn BusDeviceObj>, Option<Minijail>)],
1035    resources: &mut SystemAllocator,
1036) -> Result<(), DeviceRegistrationError> {
1037    // First allocate devices with a preferred address.
1038    for pci_device in devices
1039        .iter_mut()
1040        .filter_map(|(device, _jail)| device.as_pci_device_mut())
1041        .filter(|pci_device| pci_device.preferred_address().is_some())
1042    {
1043        let _ = pci_device
1044            .allocate_address(resources)
1045            .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
1046    }
1047
1048    // Then allocate addresses for the remaining devices.
1049    for pci_device in devices
1050        .iter_mut()
1051        .filter_map(|(device, _jail)| device.as_pci_device_mut())
1052        .filter(|pci_device| pci_device.preferred_address().is_none())
1053    {
1054        let _ = pci_device
1055            .allocate_address(resources)
1056            .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
1057    }
1058
1059    Ok(())
1060}
1061
1062/// Creates a root PCI device for use by this Vm.
1063pub fn generate_pci_root(
1064    mut devices: Vec<(Box<dyn PciDevice>, Option<Minijail>)>,
1065    irq_chip: &dyn IrqChip,
1066    mmio_bus: Arc<Bus>,
1067    mmio_base: GuestAddress,
1068    mmio_register_bit_num: usize,
1069    io_bus: Arc<Bus>,
1070    resources: &mut SystemAllocator,
1071    mut vm: &dyn Vm,
1072    max_irqs: usize,
1073    vcfg_base: Option<u64>,
1074    #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
1075) -> Result<
1076    (
1077        PciRoot,
1078        Vec<(PciAddress, u32, PciInterruptPin)>,
1079        BTreeMap<u32, String>,
1080        BTreeMap<PciAddress, Vec<u8>>,
1081        BTreeMap<PciAddress, Vec<u8>>,
1082    ),
1083    DeviceRegistrationError,
1084> {
1085    let mut device_addrs = Vec::new();
1086
1087    for (device, _jail) in devices.iter_mut() {
1088        let address = device
1089            .allocate_address(resources)
1090            .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
1091        device_addrs.push(address);
1092    }
1093
1094    let mut device_ranges = BTreeMap::new();
1095    let mut io_ranges = BTreeMap::new();
1096    let root_bus = Arc::new(Mutex::new(PciBus::new(0, 0, false)));
1097
1098    generate_pci_topology(
1099        root_bus.clone(),
1100        resources,
1101        &mut io_ranges,
1102        &mut device_ranges,
1103        &device_addrs,
1104        &mut devices,
1105    )?;
1106
1107    let mut root = PciRoot::new(
1108        vm,
1109        Arc::downgrade(&mmio_bus),
1110        mmio_base,
1111        mmio_register_bit_num,
1112        Arc::downgrade(&io_bus),
1113        root_bus,
1114    )
1115    .map_err(DeviceRegistrationError::CreateRoot)?;
1116    #[cfg_attr(windows, allow(unused_mut))]
1117    let mut pid_labels = BTreeMap::new();
1118
1119    // Allocate legacy INTx
1120    let mut pci_irqs = Vec::new();
1121    let mut irqs: Vec<u32> = Vec::new();
1122
1123    // Mapping of (bus, dev, pin) -> IRQ number.
1124    let mut dev_pin_irq = BTreeMap::new();
1125
1126    for (dev_idx, (device, _jail)) in devices.iter_mut().enumerate() {
1127        let pci_address = device_addrs[dev_idx];
1128
1129        let irq = match device.preferred_irq() {
1130            PreferredIrq::Fixed { pin, gsi } => {
1131                // The device reported a preferred IRQ, so use that rather than allocating one.
1132                resources.reserve_irq(gsi);
1133                Some((pin, gsi))
1134            }
1135            PreferredIrq::Any => {
1136                // The device did not provide a preferred IRQ but requested one, so allocate one.
1137
1138                // Choose a pin based on the slot's function number. Function 0 must always use
1139                // INTA# for single-function devices per the PCI spec, and we choose to use INTA#
1140                // for function 0 on multifunction devices and distribute the remaining functions
1141                // evenly across the other pins.
1142                let pin = match pci_address.func % 4 {
1143                    0 => PciInterruptPin::IntA,
1144                    1 => PciInterruptPin::IntB,
1145                    2 => PciInterruptPin::IntC,
1146                    _ => PciInterruptPin::IntD,
1147                };
1148
1149                // If an IRQ number has already been assigned for a different function with this
1150                // (bus, device, pin) combination, use it. Otherwise allocate a new one and insert
1151                // it into the map.
1152                let pin_key = (pci_address.bus, pci_address.dev, pin);
1153                let irq_num = if let Some(irq_num) = dev_pin_irq.get(&pin_key) {
1154                    *irq_num
1155                } else {
1156                    // If we have allocated fewer than `max_irqs` total, add a new irq to the `irqs`
1157                    // pool. Otherwise, share one of the existing `irqs`.
1158                    let irq_num = if irqs.len() < max_irqs {
1159                        let irq_num = resources
1160                            .allocate_irq()
1161                            .ok_or(DeviceRegistrationError::AllocateIrq)?;
1162                        irqs.push(irq_num);
1163                        irq_num
1164                    } else {
1165                        // Pick one of the existing IRQs to share, using `dev_idx` to distribute IRQ
1166                        // sharing evenly across devices.
1167                        irqs[dev_idx % max_irqs]
1168                    };
1169
1170                    dev_pin_irq.insert(pin_key, irq_num);
1171                    irq_num
1172                };
1173                Some((pin, irq_num))
1174            }
1175            PreferredIrq::None => {
1176                // The device does not want an INTx# IRQ.
1177                None
1178            }
1179        };
1180
1181        if let Some((pin, gsi)) = irq {
1182            let intx_event =
1183                devices::IrqLevelEvent::new().map_err(DeviceRegistrationError::EventCreate)?;
1184
1185            device.assign_irq(
1186                intx_event
1187                    .try_clone()
1188                    .map_err(DeviceRegistrationError::EventClone)?,
1189                pin,
1190                gsi,
1191            );
1192
1193            irq_chip
1194                .register_level_irq_event(gsi, &intx_event, IrqEventSource::from_device(device))
1195                .map_err(DeviceRegistrationError::RegisterIrqfd)?;
1196
1197            pci_irqs.push((pci_address, gsi, pin));
1198        }
1199    }
1200
1201    // To prevent issues where device's on_sandbox may spawn thread before all
1202    // sandboxed devices are sandboxed we partition iterator to go over sandboxed
1203    // first. This is needed on linux platforms. On windows, this is a no-op since
1204    // jails are always None, even for sandboxed devices.
1205    let devices = {
1206        let (sandboxed, non_sandboxed): (Vec<_>, Vec<_>) = devices
1207            .into_iter()
1208            .enumerate()
1209            .partition(|(_, (_, jail))| jail.is_some());
1210        sandboxed.into_iter().chain(non_sandboxed)
1211    };
1212
1213    let mut amls = BTreeMap::new();
1214    let mut gpe_scope_amls = BTreeMap::new();
1215    for (dev_idx, dev_value) in devices {
1216        #[cfg(any(target_os = "android", target_os = "linux"))]
1217        let (mut device, jail) = dev_value;
1218        #[cfg(windows)]
1219        let (mut device, _) = dev_value;
1220        let address = device_addrs[dev_idx];
1221
1222        let mut keep_rds = device.keep_rds();
1223        syslog::push_descriptors(&mut keep_rds);
1224        cros_tracing::push_descriptors!(&mut keep_rds);
1225        metrics::push_descriptors(&mut keep_rds);
1226        keep_rds.append(&mut vm.get_memory().as_raw_descriptors());
1227
1228        let ranges = io_ranges.remove(&dev_idx).unwrap_or_default();
1229        let device_ranges = device_ranges.remove(&dev_idx).unwrap_or_default();
1230        device
1231            .register_device_capabilities()
1232            .map_err(DeviceRegistrationError::RegisterDeviceCapabilities)?;
1233
1234        if let Some(vcfg_base) = vcfg_base {
1235            let (methods, shm) = device.generate_acpi_methods();
1236            if !methods.is_empty() {
1237                amls.insert(address, methods);
1238            }
1239            if let Some((offset, mmap)) = shm {
1240                let _ = vm.add_memory_region(
1241                    GuestAddress(vcfg_base + offset as u64),
1242                    Box::new(mmap),
1243                    false,
1244                    false,
1245                    MemCacheType::CacheCoherent,
1246                );
1247            }
1248        }
1249        let gpe_nr = device.set_gpe(resources);
1250
1251        #[cfg(any(target_os = "android", target_os = "linux"))]
1252        let arced_dev: Arc<Mutex<dyn BusDevice>> = if let Some(jail) = jail {
1253            let proxy = ProxyDevice::new(
1254                device,
1255                jail,
1256                keep_rds,
1257                #[cfg(feature = "swap")]
1258                swap_controller,
1259            )
1260            .map_err(DeviceRegistrationError::ProxyDeviceCreation)?;
1261            pid_labels.insert(proxy.pid() as u32, proxy.debug_label());
1262            Arc::new(Mutex::new(proxy))
1263        } else {
1264            device.on_sandboxed();
1265            Arc::new(Mutex::new(device))
1266        };
1267        #[cfg(windows)]
1268        let arced_dev = {
1269            device.on_sandboxed();
1270            Arc::new(Mutex::new(device))
1271        };
1272        root.add_device(address, arced_dev.clone(), &mut vm)
1273            .map_err(DeviceRegistrationError::PciRootAddDevice)?;
1274        for range in &ranges {
1275            mmio_bus
1276                .insert(arced_dev.clone(), range.addr, range.size)
1277                .map_err(DeviceRegistrationError::MmioInsert)?;
1278        }
1279
1280        for range in &device_ranges {
1281            mmio_bus
1282                .insert(arced_dev.clone(), range.addr, range.size)
1283                .map_err(DeviceRegistrationError::MmioInsert)?;
1284        }
1285
1286        if let Some(gpe_nr) = gpe_nr {
1287            if let Some(acpi_path) = root.acpi_path(&address) {
1288                let mut gpe_aml = Vec::new();
1289
1290                GpeScope {}.cast_to_aml_bytes(
1291                    &mut gpe_aml,
1292                    gpe_nr,
1293                    format!("\\{acpi_path}").as_str(),
1294                );
1295                if !gpe_aml.is_empty() {
1296                    gpe_scope_amls.insert(address, gpe_aml);
1297                }
1298            }
1299        }
1300    }
1301
1302    Ok((root, pci_irqs, pid_labels, amls, gpe_scope_amls))
1303}
1304
/// Errors for image loading.
///
/// Returned by [`load_image`] and [`load_image_high`].
#[sorted]
#[derive(Error, Debug)]
pub enum LoadImageError {
    /// The requested alignment (the payload) is not a power of two.
    #[error("Alignment not a power of two: {0}")]
    BadAlignment(u64),
    /// Querying the length of the image file failed.
    #[error("Getting image size failed: {0}")]
    GetLen(io::Error),
    /// Obtaining the destination slice of guest memory failed.
    #[error("GuestMemory get slice failed: {0}")]
    GuestMemorySlice(GuestMemoryError),
    /// The image size (the payload) exceeds `u32::MAX` or the space available for it.
    #[error("Image size too large: {0}")]
    ImageSizeTooLarge(u64),
    /// No guest memory region satisfies the size, alignment and address constraints.
    #[error("No suitable memory region found")]
    NoSuitableMemoryRegion,
    /// Reading the image file into guest memory failed.
    #[error("Reading image into memory failed: {0}")]
    ReadToMemory(io::Error),
    /// The image file is empty.
    #[error("Cannot load zero-sized image")]
    ZeroSizedImage,
}
1324
1325/// Load an image from a file into guest memory.
1326///
1327/// # Arguments
1328///
1329/// * `guest_mem` - The memory to be used by the guest.
1330/// * `guest_addr` - The starting address to load the image in the guest memory.
1331/// * `max_size` - The amount of space in bytes available in the guest memory for the image.
1332/// * `image` - The file containing the image to be loaded.
1333///
1334/// The size in bytes of the loaded image is returned.
1335pub fn load_image<F>(
1336    guest_mem: &GuestMemory,
1337    image: &mut F,
1338    guest_addr: GuestAddress,
1339    max_size: u64,
1340) -> Result<u32, LoadImageError>
1341where
1342    F: FileReadWriteAtVolatile + FileGetLen,
1343{
1344    let size = image.get_len().map_err(LoadImageError::GetLen)?;
1345
1346    if size > u32::MAX as u64 || size > max_size {
1347        return Err(LoadImageError::ImageSizeTooLarge(size));
1348    }
1349
1350    // This is safe due to the bounds check above.
1351    let size = size as u32;
1352
1353    let guest_slice = guest_mem
1354        .get_slice_at_addr(guest_addr, size as usize)
1355        .map_err(LoadImageError::GuestMemorySlice)?;
1356    image
1357        .read_exact_at_volatile(guest_slice, 0)
1358        .map_err(LoadImageError::ReadToMemory)?;
1359
1360    Ok(size)
1361}
1362
1363/// Load an image from a file into guest memory at the highest possible address.
1364///
1365/// # Arguments
1366///
1367/// * `guest_mem` - The memory to be used by the guest.
1368/// * `image` - The file containing the image to be loaded.
1369/// * `min_guest_addr` - The minimum address of the start of the image.
1370/// * `max_guest_addr` - The address to load the last byte of the image.
1371/// * `region_filter` - The optional filter function for determining if the given guest memory
1372///   region is suitable for loading the image into it.
1373/// * `align` - The minimum alignment of the start address of the image in bytes (must be a power of
1374///   two).
1375///
1376/// The guest address and size in bytes of the loaded image are returned.
1377pub fn load_image_high<F>(
1378    guest_mem: &GuestMemory,
1379    image: &mut F,
1380    min_guest_addr: GuestAddress,
1381    max_guest_addr: GuestAddress,
1382    region_filter: Option<fn(&MemoryRegionInformation) -> bool>,
1383    align: u64,
1384) -> Result<(GuestAddress, u32), LoadImageError>
1385where
1386    F: FileReadWriteAtVolatile + FileGetLen,
1387{
1388    if !align.is_power_of_two() {
1389        return Err(LoadImageError::BadAlignment(align));
1390    }
1391
1392    let max_size = max_guest_addr.offset_from(min_guest_addr) & !(align - 1);
1393    let size = image.get_len().map_err(LoadImageError::GetLen)?;
1394
1395    if size == 0 {
1396        return Err(LoadImageError::ZeroSizedImage);
1397    }
1398
1399    if size > u32::MAX as u64 || size > max_size {
1400        return Err(LoadImageError::ImageSizeTooLarge(size));
1401    }
1402
1403    // Sort the list of guest memory regions by address so we can iterate over them in reverse order
1404    // (high to low).
1405    let mut regions: Vec<_> = guest_mem
1406        .regions()
1407        .filter(region_filter.unwrap_or(|_| true))
1408        .collect();
1409    regions.sort_unstable_by(|a, b| a.guest_addr.cmp(&b.guest_addr));
1410
1411    // Find the highest valid address inside a guest memory region that satisfies the requested
1412    // alignment and min/max address requirements while having enough space for the image.
1413    let guest_addr = regions
1414        .into_iter()
1415        .rev()
1416        .filter_map(|r| {
1417            // Highest address within this region.
1418            let rgn_max_addr = r
1419                .guest_addr
1420                .checked_add((r.size as u64).checked_sub(1)?)?
1421                .min(max_guest_addr);
1422            // Lowest aligned address within this region.
1423            let rgn_start_aligned = r.guest_addr.align(align)?;
1424            // Hypothetical address of the image if loaded at the end of the region.
1425            let image_addr = rgn_max_addr.checked_sub(size - 1)? & !(align - 1);
1426
1427            // Would the image fit within the region?
1428            if image_addr >= rgn_start_aligned {
1429                Some(image_addr)
1430            } else {
1431                None
1432            }
1433        })
1434        .find(|&addr| addr >= min_guest_addr)
1435        .ok_or(LoadImageError::NoSuitableMemoryRegion)?;
1436
1437    // This is safe due to the bounds check above.
1438    let size = size as u32;
1439
1440    let guest_slice = guest_mem
1441        .get_slice_at_addr(guest_addr, size as usize)
1442        .map_err(LoadImageError::GuestMemorySlice)?;
1443    image
1444        .read_exact_at_volatile(guest_slice, 0)
1445        .map_err(LoadImageError::ReadToMemory)?;
1446
1447    Ok((guest_addr, size))
1448}
1449
/// SMBIOS table configuration
///
/// Every field is optional. When (de)serialized, field names use kebab-case (for example
/// `bios-vendor`) and unknown fields are rejected.
#[derive(Clone, Debug, Default, Serialize, Deserialize, FromKeyValues, PartialEq, Eq)]
#[serde(deny_unknown_fields, rename_all = "kebab-case")]
pub struct SmbiosOptions {
    /// BIOS vendor name.
    pub bios_vendor: Option<String>,

    /// BIOS version number (free-form string).
    pub bios_version: Option<String>,

    /// System manufacturer name.
    pub manufacturer: Option<String>,

    /// System product name.
    pub product_name: Option<String>,

    /// System serial number (free-form string).
    pub serial_number: Option<String>,

    /// System UUID.
    pub uuid: Option<Uuid>,

    /// Additional OEM strings to add to SMBIOS table.
    ///
    /// Defaults to an empty list when omitted from the input.
    #[serde(default)]
    pub oem_strings: Vec<String>,
}
1476
#[cfg(test)]
mod tests {
    use serde_keyvalue::from_key_values;
    use tempfile::tempfile;

    use super::*;

    /// `Pstore` parses from key-value text only when both `path` and `size` are present.
    #[test]
    fn parse_pstore() {
        let expected = Pstore {
            path: "/some/path".into(),
            size: 16384,
        };
        let parsed: Pstore = from_key_values("path=/some/path,size=16384").unwrap();
        assert_eq!(parsed, expected);

        // A missing key, or no keys at all, must be rejected.
        for incomplete in ["path=/some/path", "size=16384", ""] {
            assert!(from_key_values::<Pstore>(incomplete).is_err());
        }
    }

    /// `CpuSet` accepts single CPUs, ranges, and a mix of both in key-value syntax.
    #[test]
    fn deserialize_cpuset_serde_kv() {
        let cases = [
            ("[0,4,7]", vec![0, 4, 7]),
            ("[9-12]", vec![9, 10, 11, 12]),
            ("[0,4,7,9-12,15]", vec![0, 4, 7, 9, 10, 11, 12, 15]),
        ];

        for (input, cpus) in cases {
            let parsed: CpuSet = from_key_values(input).unwrap();
            assert_eq!(parsed, CpuSet::new(cpus));
        }
    }

    /// `CpuSet` round-trips through JSON, with ranges written as strings.
    #[test]
    fn deserialize_serialize_cpuset_json() {
        let cases = [
            ("[0,4,7]", vec![0, 4, 7]),
            (r#"["9-12"]"#, vec![9, 10, 11, 12]),
            (r#"[0,4,7,"9-12",15]"#, vec![0, 4, 7, 9, 10, 11, 12, 15]),
        ];

        for (json_str, cpus) in cases {
            let cpuset = CpuSet::new(cpus);
            let parsed: CpuSet = serde_json::from_str(json_str).unwrap();
            assert_eq!(parsed, cpuset);
            assert_eq!(serde_json::to_string(&cpuset).unwrap(), json_str);
        }
    }

    /// With a 4 GiB upper bound the image lands at the aligned top of the highest region.
    #[test]
    fn load_image_high_max_4g() {
        const TEST_IMAGE_SIZE: u64 = 1234;
        const TEST_ALIGN: u64 = 0x8000;

        let mem = GuestMemory::new(&[
            (GuestAddress(0x0000_0000), 0x4000_0000), // 0x00000000..0x40000000
            (GuestAddress(0x8000_0000), 0x4000_0000), // 0x80000000..0xC0000000
        ])
        .unwrap();

        let mut test_image = tempfile().unwrap();
        test_image.set_len(TEST_IMAGE_SIZE).unwrap();

        let loaded = load_image_high(
            &mem,
            &mut test_image,
            GuestAddress(0x8000),
            GuestAddress(0xFFFF_FFFF), // max_guest_addr beyond highest guest memory region
            None,
            TEST_ALIGN,
        );
        let (addr, size) = loaded.unwrap();

        assert_eq!(addr, GuestAddress(0xBFFF_8000));
        assert_eq!(addr.offset() % TEST_ALIGN, 0);
        assert_eq!(size, TEST_IMAGE_SIZE as u32);
    }
}