arch/
lib.rs

1// Copyright 2018 The ChromiumOS Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5//! Virtual machine architecture support code.
6
7pub mod android;
8pub mod fdt;
9pub mod pstore;
10pub mod serial;
11
12pub mod sys;
13
14use std::collections::BTreeMap;
15use std::error::Error as StdError;
16use std::fs::File;
17use std::io;
18use std::ops::Deref;
19use std::path::PathBuf;
20use std::str::FromStr;
21use std::sync::mpsc;
22use std::sync::mpsc::SendError;
23use std::sync::Arc;
24
25use acpi_tables::sdt::SDT;
26use base::syslog;
27use base::AsRawDescriptors;
28use base::FileGetLen;
29use base::FileReadWriteAtVolatile;
30use base::RecvTube;
31use base::SendTube;
32use base::Tube;
33use devices::virtio::VirtioDevice;
34use devices::BarRange;
35use devices::Bus;
36use devices::BusDevice;
37use devices::BusDeviceObj;
38use devices::BusError;
39use devices::BusResumeDevice;
40use devices::FwCfgParameters;
41use devices::GpeScope;
42use devices::HotPlugBus;
43use devices::IrqChip;
44use devices::IrqEventSource;
45use devices::PciAddress;
46use devices::PciBus;
47use devices::PciDevice;
48use devices::PciDeviceError;
49use devices::PciInterruptPin;
50use devices::PciRoot;
51use devices::PciRootCommand;
52use devices::PreferredIrq;
53#[cfg(any(target_os = "android", target_os = "linux"))]
54use devices::ProxyDevice;
55use devices::SerialHardware;
56use devices::SerialParameters;
57pub use fdt::apply_device_tree_overlays;
58pub use fdt::DtbOverlay;
59#[cfg(feature = "gdb")]
60use gdbstub::arch::Arch;
61use hypervisor::MemCacheType;
62use hypervisor::Vm;
63#[cfg(windows)]
64use jail::FakeMinijailStub as Minijail;
65#[cfg(any(target_os = "android", target_os = "linux"))]
66use minijail::Minijail;
67use remain::sorted;
68use resources::SystemAllocator;
69use resources::SystemAllocatorConfig;
70use serde::de::Visitor;
71use serde::Deserialize;
72use serde::Serialize;
73use serde_keyvalue::FromKeyValues;
74pub use serial::add_serial_devices;
75pub use serial::get_serial_cmdline;
76pub use serial::set_default_serial_parameters;
77pub use serial::GetSerialCmdlineError;
78pub use serial::SERIAL_ADDR;
79use sync::Condvar;
80use sync::Mutex;
81#[cfg(any(target_os = "android", target_os = "linux"))]
82pub use sys::linux::PlatformBusResources;
83use thiserror::Error;
84use uuid::Uuid;
85use vm_control::BatControl;
86use vm_control::BatteryType;
87use vm_control::PmResource;
88use vm_memory::GuestAddress;
89use vm_memory::GuestMemory;
90use vm_memory::GuestMemoryError;
91use vm_memory::MemoryRegionInformation;
92use vm_memory::MemoryRegionOptions;
93
// Per-architecture aliases. At most one branch is compiled in, selected by `target_arch`, so
// the rest of the crate can refer to `IrqChipArch`, `CpuConfigArch`, `HypervisorArch`,
// `VcpuArch`, `VcpuInitArch` and `VmArch` without naming the concrete architecture.
// `GdbArch` is only exported when the `gdb` feature is enabled.
cfg_if::cfg_if! {
    if #[cfg(target_arch = "aarch64")] {
        pub use devices::IrqChipAArch64 as IrqChipArch;
        #[cfg(feature = "gdb")]
        pub use gdbstub_arch::aarch64::AArch64 as GdbArch;
        pub use hypervisor::CpuConfigAArch64 as CpuConfigArch;
        pub use hypervisor::Hypervisor as HypervisorArch;
        pub use hypervisor::VcpuAArch64 as VcpuArch;
        pub use hypervisor::VcpuInitAArch64 as VcpuInitArch;
        pub use hypervisor::VmAArch64 as VmArch;
    } else if #[cfg(target_arch = "riscv64")] {
        pub use devices::IrqChipRiscv64 as IrqChipArch;
        #[cfg(feature = "gdb")]
        pub use gdbstub_arch::riscv::Riscv64 as GdbArch;
        pub use hypervisor::CpuConfigRiscv64 as CpuConfigArch;
        pub use hypervisor::Hypervisor as HypervisorArch;
        pub use hypervisor::VcpuInitRiscv64 as VcpuInitArch;
        pub use hypervisor::VcpuRiscv64 as VcpuArch;
        pub use hypervisor::VmRiscv64 as VmArch;
    } else if #[cfg(target_arch = "x86_64")] {
        pub use devices::IrqChipX86_64 as IrqChipArch;
        #[cfg(feature = "gdb")]
        pub use gdbstub_arch::x86::X86_64_SSE as GdbArch;
        pub use hypervisor::CpuConfigX86_64 as CpuConfigArch;
        pub use hypervisor::HypervisorX86_64 as HypervisorArch;
        pub use hypervisor::VcpuInitX86_64 as VcpuInitArch;
        pub use hypervisor::VcpuX86_64 as VcpuArch;
        pub use hypervisor::VmX86_64 as VmArch;
    }
}
124
/// Image file supplied for a VM through `VmComponents::vm_image`: either a kernel or a BIOS.
pub enum VmImage {
    /// A kernel image file.
    Kernel(File),
    /// A BIOS image file.
    Bios(File),
}
129
/// Pstore configuration, made of a path and a size.
///
/// Can be parsed from key-value options (`FromKeyValues`). Keys are kebab-case and unknown keys
/// are rejected.
#[derive(Clone, Debug, Deserialize, Serialize, FromKeyValues, PartialEq, Eq)]
#[serde(deny_unknown_fields, rename_all = "kebab-case")]
pub struct Pstore {
    /// Path associated with the pstore (presumably the backing file; verify against the users
    /// of this struct).
    pub path: PathBuf,
    /// Size of the pstore. NOTE(review): the unit is presumably bytes; confirm.
    pub size: u32,
}
136
/// Where the FDT is placed, relative to guest RAM and the payload.
///
/// Variant names are kebab-case when (de)serialized (e.g. `after-payload`).
#[derive(Clone, Copy, Debug, Serialize, Deserialize, FromKeyValues)]
#[serde(deny_unknown_fields, rename_all = "kebab-case")]
pub enum FdtPosition {
    /// At the start of RAM.
    Start,
    /// Near the end of RAM.
    End,
    /// After the payload, with some padding for alignment.
    AfterPayload,
}
147
/// Set of CPU cores.
///
/// The cores are kept in the order they were supplied; nothing sorts or deduplicates them.
#[derive(Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
pub struct CpuSet(Vec<usize>);

impl CpuSet {
    /// Builds a set from any iterable of CPU indices, preserving iteration order.
    pub fn new<I: IntoIterator<Item = usize>>(cpus: I) -> Self {
        let cores: Vec<usize> = cpus.into_iter().collect();
        Self(cores)
    }

    /// Iterates over the CPU indices by reference, in stored order.
    pub fn iter(&self) -> std::slice::Iter<'_, usize> {
        let Self(cores) = self;
        cores.iter()
    }
}

/// Allows `collect()`-ing an iterator of CPU indices straight into a `CpuSet`.
impl FromIterator<usize> for CpuSet {
    fn from_iter<T>(iter: T) -> Self
    where
        T: IntoIterator<Item = usize>,
    {
        Self::new(iter)
    }
}
170
/// Default value of `SveConfig::auto`, used both by serde (when the key is omitted) and by
/// `SveConfig::default()`.
#[cfg(target_arch = "aarch64")]
fn sve_auto_default() -> bool {
    true
}
175
/// The SVE config for Vcpus.
///
/// Keys are kebab-case and unknown keys are rejected when deserializing.
#[cfg(target_arch = "aarch64")]
#[derive(Copy, Clone, Debug, PartialEq, Eq, Deserialize, Serialize)]
#[serde(deny_unknown_fields, rename_all = "kebab-case")]
pub struct SveConfig {
    /// Detect if SVE is available and enable accordingly. Defaults to `true` when omitted.
    // NOTE(review): this comment used to add "`enable` is ignored if auto is true", but this
    // struct has no `enable` field.
    #[serde(default = "sve_auto_default")]
    pub auto: bool,
}
185
#[cfg(target_arch = "aarch64")]
impl Default for SveConfig {
    /// Returns the configuration used when none is given. `auto` takes the same default that
    /// serde applies when the key is omitted.
    fn default() -> Self {
        let auto = sve_auto_default();
        Self { auto }
    }
}
194
/// FFA config
///
/// Can be parsed from key-value options (`FromKeyValues`). Keys are kebab-case and unknown keys
/// are rejected.
// For now this is limited to android, will be opened to other aarch64 based pVMs after
// corresponding kernel APIs are upstreamed.
#[cfg(all(target_os = "android", target_arch = "aarch64"))]
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Deserialize, Serialize, FromKeyValues)]
#[serde(deny_unknown_fields, rename_all = "kebab-case")]
pub struct FfaConfig {
    /// Just enable FFA, don't care about the negotiated version. Defaults to `false` when
    /// omitted.
    #[serde(default)]
    pub auto: bool,
}
206
/// Parses one CPU specifier and appends the CPUs it denotes to `cpuset`.
///
/// `s` is either a single index (`"3"`) or an inclusive, ascending range (`"2-5"`). The input
/// is split at the first `-`; both sides must parse as `usize`.
///
/// # Errors
///
/// Returns a human-readable message when an index is not a non-negative integer, or when the
/// end of a range is lower than its start. `cpuset` is left untouched on error.
fn parse_cpu_range(s: &str, cpuset: &mut Vec<usize>) -> Result<(), String> {
    let parse_index = |text: &str| -> Result<usize, String> {
        text.parse::<usize>().map_err(|_| {
            format!("invalid CPU index {text} - index must be a non-negative integer")
        })
    };

    let cpus = if let Some((low, high)) = s.split_once('-') {
        let low = parse_index(low)?;
        let high = parse_index(high)?;

        if high < low {
            return Err(format!(
                "invalid CPU range {s} - ranges must be from low to high"
            ));
        }
        low..=high
    } else {
        // No dash: a lone index is the one-element range `cpu..=cpu`.
        let cpu = parse_index(s)?;
        cpu..=cpu
    };

    cpuset.extend(cpus);

    Ok(())
}
235
236impl FromStr for CpuSet {
237    type Err = String;
238
239    fn from_str(s: &str) -> Result<Self, Self::Err> {
240        let mut cpuset = Vec::new();
241        for part in s.split(',') {
242            parse_cpu_range(part, &mut cpuset)?;
243        }
244        Ok(CpuSet::new(cpuset))
245    }
246}
247
248impl Deref for CpuSet {
249    type Target = Vec<usize>;
250
251    fn deref(&self) -> &Self::Target {
252        &self.0
253    }
254}
255
256impl IntoIterator for CpuSet {
257    type Item = usize;
258    type IntoIter = std::vec::IntoIter<Self::Item>;
259
260    fn into_iter(self) -> Self::IntoIter {
261        self.0.into_iter()
262    }
263}
264
/// Selects the interface for guest-controlled power management of assigned devices.
///
/// The `FromStr` implementation accepts the string `pkvm-hvc` for `PkvmHvc`.
#[derive(Clone, Copy, Debug, Deserialize, PartialEq, Eq, Serialize)]
pub enum DevicePowerManagerConfig {
    /// Uses the protected KVM hypercall interface.
    PkvmHvc,
}
271
272impl FromStr for DevicePowerManagerConfig {
273    type Err = String;
274
275    fn from_str(s: &str) -> Result<Self, Self::Err> {
276        match s {
277            "pkvm-hvc" => Ok(Self::PkvmHvc),
278            _ => Err(format!("DevicePowerManagerConfig '{s}' not supported")),
279        }
280    }
281}
282
/// Deserializes a `CpuSet` from a sequence whose elements can either be integers, or strings
/// representing CPU ranges (e.g. `5-8`).
///
/// Elements are appended in sequence order. A string element that `parse_cpu_range` rejects
/// turns into a custom deserialization error carrying that function's message.
impl<'de> Deserialize<'de> for CpuSet {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        // Visitor that flattens every sequence element into one list of CPU indices.
        struct CpuSetVisitor;
        impl<'de> Visitor<'de> for CpuSetVisitor {
            type Value = CpuSet;

            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
                formatter.write_str("CpuSet")
            }

            fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
            where
                A: serde::de::SeqAccess<'de>,
            {
                // One element of the sequence. `untagged` makes serde try the variants in
                // declaration order: an integer becomes `Single`, a string becomes `Range`.
                // NOTE(review): `Range` holds a borrowed `&str`; presumably this needs a
                // deserializer able to hand out borrowed strings; confirm with the formats in
                // use.
                #[derive(Deserialize)]
                #[serde(untagged)]
                enum CpuSetValue<'a> {
                    Single(usize),
                    Range(&'a str),
                }

                let mut cpus = Vec::new();
                while let Some(cpuset) = seq.next_element::<CpuSetValue>()? {
                    match cpuset {
                        CpuSetValue::Single(cpu) => cpus.push(cpu),
                        CpuSetValue::Range(range) => {
                            parse_cpu_range(range, &mut cpus).map_err(serde::de::Error::custom)?;
                        }
                    }
                }

                Ok(CpuSet::new(cpus))
            }
        }

        deserializer.deserialize_seq(CpuSetVisitor)
    }
}
326
327/// Serializes a `CpuSet` into a sequence of integers and strings representing CPU ranges.
328impl Serialize for CpuSet {
329    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
330    where
331        S: serde::Serializer,
332    {
333        use serde::ser::SerializeSeq;
334
335        let mut seq = serializer.serialize_seq(None)?;
336
337        // Factorize ranges into "a-b" strings.
338        let mut serialize_range = |start: usize, end: usize| -> Result<(), S::Error> {
339            if start == end {
340                seq.serialize_element(&start)?;
341            } else {
342                seq.serialize_element(&format!("{start}-{end}"))?;
343            }
344
345            Ok(())
346        };
347
348        // Current range.
349        let mut range = None;
350        for core in &self.0 {
351            range = match range {
352                None => Some((core, core)),
353                Some((start, end)) if *end == *core - 1 => Some((start, core)),
354                Some((start, end)) => {
355                    serialize_range(*start, *end)?;
356                    Some((core, core))
357                }
358            };
359        }
360
361        if let Some((start, end)) = range {
362            serialize_range(*start, *end)?;
363        }
364
365        seq.end()
366    }
367}
368
/// Mapping of guest VCPU threads to host CPU cores.
///
/// Either one `CpuSet` shared by all VCPUs, or an individual `CpuSet` per VCPU index.
#[derive(Clone, Debug, Deserialize, PartialEq, Eq, Serialize)]
pub enum VcpuAffinity {
    /// All VCPU threads will be pinned to the same set of host CPU cores.
    Global(CpuSet),
    /// Each VCPU may be pinned to a set of host CPU cores.
    /// The map key is a guest VCPU index, and the corresponding value is the set of
    /// host CPU indices that the VCPU thread will be allowed to run on.
    /// If a VCPU index is not present in the map, its affinity will not be set.
    PerVcpu(BTreeMap<usize, CpuSet>),
}
380
/// Memory region with optional size.
///
/// Can be parsed from key-value options (`FromKeyValues`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize, FromKeyValues)]
pub struct MemoryRegionConfig {
    /// Start of the region. NOTE(review): presumably a guest physical address; confirm.
    pub start: u64,
    /// Size of the region, or `None` when no size was given.
    pub size: Option<u64>,
}
387
/// General PCI config.
///
/// Every region is optional and the default value leaves all of them unset. The set of fields
/// depends on the target architecture: `cam` exists only on aarch64 and `ecam` only on x86_64.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize, Deserialize, FromKeyValues)]
pub struct PciConfig {
    /// Region for PCI Configuration Access Mechanism.
    #[cfg(target_arch = "aarch64")]
    pub cam: Option<MemoryRegionConfig>,
    /// Region for PCIe Enhanced Configuration Access Mechanism.
    #[cfg(target_arch = "x86_64")]
    pub ecam: Option<MemoryRegionConfig>,
    /// Region for non-prefetchable PCI device memory below 4G.
    pub mem: Option<MemoryRegionConfig>,
}
400
/// Default CPU capacity value.
// NOTE(review): presumably the fallback for `VcpuProperties::capacity` when none is configured;
// the code using it is not in this part of the file, so confirm.
pub const DEFAULT_CPU_CAPACITY: u32 = 1024;
402
/// Properties of a single vCPU, as assembled by `derive_vcpu_properties`.
///
/// Each optional field is `None`, and `frequencies` is empty, when the corresponding
/// configuration map has no entry for the vCPU. Fields are kept alphabetically ordered
/// (`#[sorted]`). The last three fields exist only on aarch64 Linux/Android builds.
// NOTE(review): units and scales of the numeric fields are not visible in this file section.
#[sorted]
#[derive(Clone, Debug, Deserialize, PartialEq, Eq, Serialize)]
pub struct VcpuProperties {
    /// Configured capacity of the vCPU, if any.
    pub capacity: Option<u32>,
    /// Configured dynamic power coefficient of the vCPU, if any.
    pub dynamic_power_coefficient: Option<u32>,
    /// Configured frequencies of the vCPU; empty when none were configured.
    pub frequencies: Vec<u32>,
    /// Configured normalized CPU IPC ratio of the vCPU, if any.
    #[cfg(all(
        target_arch = "aarch64",
        any(target_os = "android", target_os = "linux")
    ))]
    pub normalized_cpu_ipc_ratio: Option<u32>,
    /// Configured domain of the vCPU, if any.
    #[cfg(all(
        target_arch = "aarch64",
        any(target_os = "android", target_os = "linux")
    ))]
    pub vcpu_domain: Option<u32>,
    /// Configured domain path of the vCPU, if any.
    #[cfg(all(
        target_arch = "aarch64",
        any(target_os = "android", target_os = "linux")
    ))]
    pub vcpu_domain_path: Option<PathBuf>,
}
425
426/// Derives base VCPU properties from various config fields.
427pub fn derive_vcpu_properties(
428    vcpu_count: usize,
429    vcpu_capacity: &std::collections::BTreeMap<usize, u32>,
430    dynamic_power_coefficient: &std::collections::BTreeMap<usize, u32>,
431    vcpu_frequencies: &std::collections::BTreeMap<usize, Vec<u32>>,
432    #[cfg(all(
433        target_arch = "aarch64",
434        any(target_os = "android", target_os = "linux")
435    ))]
436    normalized_cpu_ipc_ratio: &std::collections::BTreeMap<usize, u32>,
437    #[cfg(all(
438        target_arch = "aarch64",
439        any(target_os = "android", target_os = "linux")
440    ))]
441    vcpu_domain: &std::collections::BTreeMap<usize, u32>,
442    #[cfg(all(
443        target_arch = "aarch64",
444        any(target_os = "android", target_os = "linux")
445    ))]
446    vcpu_domain_path: &std::collections::BTreeMap<usize, std::path::PathBuf>,
447) -> std::collections::BTreeMap<usize, VcpuProperties> {
448    let mut vcpu_properties = std::collections::BTreeMap::new();
449    for vcpu_id in 0..vcpu_count {
450        let vcpu_prop_capacity = vcpu_capacity.get(&vcpu_id).copied();
451
452        vcpu_properties.insert(
453            vcpu_id,
454            VcpuProperties {
455                capacity: vcpu_prop_capacity,
456                frequencies: vcpu_frequencies.get(&vcpu_id).cloned().unwrap_or_default(),
457                dynamic_power_coefficient: dynamic_power_coefficient.get(&vcpu_id).copied(),
458                #[cfg(all(
459                    target_arch = "aarch64",
460                    any(target_os = "android", target_os = "linux")
461                ))]
462                normalized_cpu_ipc_ratio: normalized_cpu_ipc_ratio.get(&vcpu_id).copied(),
463                #[cfg(all(
464                    target_arch = "aarch64",
465                    any(target_os = "android", target_os = "linux")
466                ))]
467                vcpu_domain: vcpu_domain.get(&vcpu_id).copied(),
468                #[cfg(all(
469                    target_arch = "aarch64",
470                    any(target_os = "android", target_os = "linux")
471                ))]
472                vcpu_domain_path: vcpu_domain_path.get(&vcpu_id).cloned(),
473            },
474        );
475    }
476    vcpu_properties
477}
478
/// Holds the pieces needed to build a VM. Passed to `build_vm` in the `LinuxArch` trait below to
/// create a `RunnableLinuxVm`.
///
/// Fields are kept alphabetically ordered (`#[sorted]`). Several fields only exist for
/// particular target architectures or operating systems, as marked by their `cfg` attributes.
#[sorted]
pub struct VmComponents {
    #[cfg(all(target_arch = "x86_64", unix))]
    pub ac_adapter: bool,
    pub acpi_sdts: Vec<SDT>,
    pub android_fstab: Option<File>,
    pub boot_cpu: usize,
    pub bootorder_fw_cfg_blob: Vec<u8>,
    #[cfg(target_arch = "x86_64")]
    pub break_linux_pci_config_io: bool,

    pub delay_rt: bool,
    pub dev_pm: Option<DevicePowerManagerConfig>,
    pub extra_kernel_params: Vec<String>,
    #[cfg(target_arch = "x86_64")]
    pub force_s2idle: bool,
    pub fw_cfg_enable: bool,
    pub fw_cfg_parameters: Vec<FwCfgParameters>,
    pub host_cpu_topology: bool,
    pub hugepages: bool,
    pub hv_cfg: hypervisor::Config,
    pub initrd_image: Option<File>,
    pub itmt: bool,
    // NOTE(review): unit is presumably bytes; confirm against the code that fills this in.
    pub memory_size: u64,
    pub no_i8042: bool,
    pub no_rtc: bool,
    pub no_smt: bool,

    pub pci_config: PciConfig,
    pub pflash_block_size: u32,
    pub pflash_image: Option<File>,
    pub pstore: Option<Pstore>,
    /// A file to load as pVM firmware. Must be `Some` iff
    /// `hv_cfg.protection_type == ProtectionType::UnprotectedWithFirmware`.
    pub pvm_fw: Option<File>,
    pub rt_cpus: CpuSet,
    #[cfg(target_arch = "x86_64")]
    pub smbios: SmbiosOptions,
    pub smccc_trng: bool,
    #[cfg(target_arch = "aarch64")]
    pub sve_config: SveConfig,
    pub swiotlb: Option<u64>,
    pub vcpu_affinity: Option<VcpuAffinity>,
    /// List of vCPU clusters, mapped from pCPU clusters.
    pub vcpu_clusters: Vec<CpuSet>,
    /// Per-vCPU properties, keyed by vCPU index (see `derive_vcpu_properties`).
    pub vcpu_properties: BTreeMap<usize, VcpuProperties>,
    #[cfg(any(target_os = "android", target_os = "linux"))]
    pub vfio_platform_pm: bool,
    #[cfg(all(
        target_arch = "aarch64",
        any(target_os = "android", target_os = "linux")
    ))]
    pub virt_cpufreq_v2: bool,
    /// Kernel or BIOS image file for the VM (see `VmImage`).
    pub vm_image: VmImage,
}
536
/// Holds the elements needed to run a Linux VM. Created by `build_vm`.
///
/// Generic over the architecture-specific VM type `V` and vCPU type `Vcpu`. Fields are kept
/// alphabetically ordered (`#[sorted]`).
#[sorted]
pub struct RunnableLinuxVm<V: VmArch, Vcpu: VcpuArch> {
    pub bat_control: Option<BatControl>,
    pub delay_rt: bool,
    pub devices_thread: Option<std::thread::JoinHandle<()>>,
    pub hotplug_bus: BTreeMap<u8, Arc<Mutex<dyn HotPlugBus>>>,
    pub hypercall_bus: Arc<Bus>,
    pub io_bus: Arc<Bus>,
    pub irq_chip: Box<dyn IrqChipArch>,
    pub mmio_bus: Arc<Bus>,
    pub no_smt: bool,
    pub pid_debug_label_map: BTreeMap<u32, String>,
    #[cfg(any(target_os = "android", target_os = "linux"))]
    pub platform_devices: Vec<Arc<Mutex<dyn BusDevice>>>,
    pub pm: Option<Arc<Mutex<dyn PmResource + Send>>>,
    /// Devices to be notified before the system resumes from the S3 suspended state.
    pub resume_notify_devices: Vec<Arc<Mutex<dyn BusResumeDevice>>>,
    pub root_config: Arc<Mutex<PciRoot>>,
    pub rt_cpus: CpuSet,
    pub suspend_tube: (Arc<Mutex<SendTube>>, RecvTube),
    pub vcpu_affinity: Option<VcpuAffinity>,
    pub vcpu_count: usize,
    pub vcpu_init: Vec<VcpuInitArch>,
    /// If vcpus is None, then it's the responsibility of the vcpu thread to create vcpus.
    /// If it's Some, then `build_vm` already created the vcpus.
    pub vcpus: Option<Vec<Vcpu>>,
    /// The VM object itself.
    pub vm: V,
    pub vm_request_tubes: Vec<Tube>,
}
567
/// The device and optional jail.
pub struct VirtioDeviceStub {
    /// The virtio device.
    pub dev: Box<dyn VirtioDevice>,
    /// Jail for the device, or `None` when there is no jail.
    pub jail: Option<Minijail>,
}
573
/// Trait which is implemented for each Linux Architecture in order to
/// set up the memory, cpus, and system devices and to boot the kernel.
///
/// All items are associated functions (none take `self`).
pub trait LinuxArch {
    /// Error type returned by every fallible function of this trait.
    type Error: StdError;
    /// Architecture-specific memory layout, produced by `arch_memory_layout` and passed to the
    /// later setup stages.
    type ArchMemoryLayout;

    /// Decide architecture specific memory layout details to be used by later stages of the VM
    /// setup.
    fn arch_memory_layout(
        components: &VmComponents,
    ) -> std::result::Result<Self::ArchMemoryLayout, Self::Error>;

    /// Returns a Vec of the valid memory regions as `(address, length, options)` tuples. These
    /// should be used to configure the `GuestMemory` structure for the platform.
    ///
    /// # Arguments
    ///
    /// * `components` - Parts used to determine the memory layout.
    /// * `arch_memory_layout` - Layout produced by `arch_memory_layout`.
    /// * `hypervisor` - The hypervisor in use.
    fn guest_memory_layout(
        components: &VmComponents,
        arch_memory_layout: &Self::ArchMemoryLayout,
        hypervisor: &impl hypervisor::Hypervisor,
    ) -> std::result::Result<Vec<(GuestAddress, u64, MemoryRegionOptions)>, Self::Error>;

    /// Gets the configuration for a new `SystemAllocator` that fits the given `Vm`'s memory layout.
    ///
    /// This is the per-architecture template for constructing the `SystemAllocator`. Platform
    /// agnostic modifications may be made to this configuration, but the final `SystemAllocator`
    /// will be at least as strict as this configuration.
    ///
    /// # Arguments
    ///
    /// * `vm` - The virtual machine to be used as a template for the `SystemAllocator`.
    /// * `arch_memory_layout` - Layout produced by `arch_memory_layout`.
    fn get_system_allocator_config<V: Vm>(
        vm: &V,
        arch_memory_layout: &Self::ArchMemoryLayout,
    ) -> SystemAllocatorConfig;

    /// Takes `VmComponents` and generates a `RunnableLinuxVm`.
    ///
    /// # Arguments
    ///
    /// * `components` - Parts to use to build the VM.
    /// * `arch_memory_layout` - Layout produced by `arch_memory_layout`.
    /// * `vm_evt_wrtube` - Tube used by sub-devices to request that crosvm exit because guest wants
    ///   to stop/shut down or requested reset.
    /// * `system_allocator` - Allocator created by this trait's implementation of
    ///   `get_system_allocator_config`.
    /// * `serial_parameters` - Definitions for how the serial devices should be configured.
    /// * `serial_jail` - Jail used for serial devices created here.
    /// * `battery` - Defines what battery device will be created.
    /// * `vm` - A VM implementation to build upon.
    /// * `ramoops_region` - Region allocated for ramoops.
    /// * `devices` - The devices to be built into the VM.
    /// * `irq_chip` - The IRQ chip implementation for the VM.
    /// * `debugcon_jail` - Jail used for debugcon devices created here.
    /// * `pflash_jail` - Jail used for pflash device created here (x86_64 only).
    /// * `fw_cfg_jail` - Jail used for fw_cfg device created here (x86_64 only).
    /// * `device_tree_overlays` - Device tree overlay binaries
    ///
    /// NOTE(review): `vcpu_ids`, `dump_device_tree_blob`, `swap_controller` (only with the
    /// `swap` feature), `guest_suspended_cvar`, `fdt_position` and `no_pmu` are not described
    /// here; see the per-architecture implementations for their meaning.
    fn build_vm<V, Vcpu>(
        components: VmComponents,
        arch_memory_layout: &Self::ArchMemoryLayout,
        vm_evt_wrtube: &SendTube,
        system_allocator: &mut SystemAllocator,
        serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
        serial_jail: Option<Minijail>,
        battery: (Option<BatteryType>, Option<Minijail>),
        vm: V,
        ramoops_region: Option<pstore::RamoopsRegion>,
        devices: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>,
        irq_chip: &mut dyn IrqChipArch,
        vcpu_ids: &mut Vec<usize>,
        dump_device_tree_blob: Option<PathBuf>,
        debugcon_jail: Option<Minijail>,
        #[cfg(target_arch = "x86_64")] pflash_jail: Option<Minijail>,
        #[cfg(target_arch = "x86_64")] fw_cfg_jail: Option<Minijail>,
        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
        guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>,
        device_tree_overlays: Vec<DtbOverlay>,
        fdt_position: Option<FdtPosition>,
        no_pmu: bool,
    ) -> std::result::Result<RunnableLinuxVm<V, Vcpu>, Self::Error>
    where
        V: VmArch,
        Vcpu: VcpuArch;

    /// Configures the vcpu and should be called once per vcpu from the vcpu's thread.
    ///
    /// # Arguments
    ///
    /// * `vm` - The virtual machine object.
    /// * `hypervisor` - The `Hypervisor` that created the vcpu.
    /// * `irq_chip` - The `IrqChip` associated with this vm.
    /// * `vcpu` - The VCPU object to configure.
    /// * `vcpu_init` - The data required to initialize VCPU registers and other state.
    /// * `vcpu_id` - The id of the given `vcpu`.
    /// * `num_vcpus` - Number of virtual CPUs the guest will have.
    /// * `cpu_config` - CPU feature configurations.
    fn configure_vcpu<V: Vm>(
        vm: &V,
        hypervisor: &dyn HypervisorArch,
        irq_chip: &mut dyn IrqChipArch,
        vcpu: &mut dyn VcpuArch,
        vcpu_init: VcpuInitArch,
        vcpu_id: usize,
        num_vcpus: usize,
        cpu_config: Option<CpuConfigArch>,
    ) -> Result<(), Self::Error>;

    /// Configures and adds a PCI device to the VM, returning the PCI address of the device.
    ///
    /// The `minijail` argument exists only on Linux/Android, and `swap_controller` only with
    /// the `swap` feature.
    fn register_pci_device<V: VmArch, Vcpu: VcpuArch>(
        linux: &mut RunnableLinuxVm<V, Vcpu>,
        device: Box<dyn PciDevice>,
        #[cfg(any(target_os = "android", target_os = "linux"))] minijail: Option<Minijail>,
        resources: &mut SystemAllocator,
        hp_control_tube: &mpsc::Sender<PciRootCommand>,
        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
    ) -> Result<PciAddress, Self::Error>;

    /// Returns frequency map for each of the host's logical cores.
    fn get_host_cpu_frequencies_khz() -> Result<BTreeMap<usize, Vec<u32>>, Self::Error>;

    /// Returns max-freq map of the host's logical cores.
    fn get_host_cpu_max_freq_khz() -> Result<BTreeMap<usize, u32>, Self::Error>;

    /// Returns capacity map of the host's logical cores.
    fn get_host_cpu_capacity() -> Result<BTreeMap<usize, u32>, Self::Error>;

    /// Returns cluster masks for each of the host's logical cores.
    fn get_host_cpu_clusters() -> Result<Vec<CpuSet>, Self::Error>;
}
704
/// Per-vCPU operations (register and memory access, single-stepping, hardware breakpoints)
/// expressed in terms of the `GdbArch` register types.
///
/// Only compiled when the `gdb` feature is enabled. `T` is the vCPU type operated on.
#[cfg(feature = "gdb")]
pub trait GdbOps<T: VcpuArch> {
    /// Error type returned by every function of this trait.
    type Error: StdError;

    /// Reads vCPU's registers.
    fn read_registers(vcpu: &T) -> Result<<GdbArch as Arch>::Registers, Self::Error>;

    /// Writes vCPU's registers.
    fn write_registers(vcpu: &T, regs: &<GdbArch as Arch>::Registers) -> Result<(), Self::Error>;

    /// Reads `len` bytes from the guest memory, starting at `vaddr`.
    fn read_memory(
        vcpu: &T,
        guest_mem: &GuestMemory,
        vaddr: GuestAddress,
        len: usize,
    ) -> Result<Vec<u8>, Self::Error>;

    /// Writes the bytes of `buf` to the guest memory, starting at `vaddr`.
    fn write_memory(
        vcpu: &T,
        guest_mem: &GuestMemory,
        vaddr: GuestAddress,
        buf: &[u8],
    ) -> Result<(), Self::Error>;

    /// Reads bytes from the guest register.
    ///
    /// Returns an empty vector if `reg_id` is valid but the register is not available.
    fn read_register(vcpu: &T, reg_id: <GdbArch as Arch>::RegId) -> Result<Vec<u8>, Self::Error>;

    /// Writes bytes to the specified guest register.
    fn write_register(
        vcpu: &T,
        reg_id: <GdbArch as Arch>::RegId,
        data: &[u8],
    ) -> Result<(), Self::Error>;

    /// Make the next vCPU's run single-step.
    fn enable_singlestep(vcpu: &T) -> Result<(), Self::Error>;

    /// Get maximum number of hardware breakpoints.
    fn get_max_hw_breakpoints(vcpu: &T) -> Result<usize, Self::Error>;

    /// Set hardware breakpoints at the given addresses.
    fn set_hw_breakpoints(vcpu: &T, breakpoints: &[GuestAddress]) -> Result<(), Self::Error>;
}
752
/// Errors for device manager.
///
/// Variants are kept alphabetically ordered (`#[sorted]`). Some variants only exist on
/// Linux/Android, as marked by their `cfg` attributes.
#[sorted]
#[derive(Error, Debug)]
pub enum DeviceRegistrationError {
    /// No more MMIO space available.
    #[error("no more addresses are available")]
    AddrsExhausted,
    /// Could not allocate device address space for the device.
    #[error("Allocating device addresses: {0}")]
    AllocateDeviceAddrs(PciDeviceError),
    /// Could not allocate IO space for the device.
    #[error("Allocating IO addresses: {0}")]
    AllocateIoAddrs(PciDeviceError),
    /// Could not allocate MMIO or IO resource for the device.
    #[error("Allocating IO resource: {0}")]
    AllocateIoResource(resources::Error),
    /// Could not allocate an IRQ number.
    #[error("Allocating IRQ number")]
    AllocateIrq,
    /// Could not allocate IRQ resource for the device.
    #[cfg(any(target_os = "android", target_os = "linux"))]
    #[error("Allocating IRQ resource: {0}")]
    AllocateIrqResource(devices::vfio::VfioError),
    /// Could not attach the device to its power domain.
    #[error("failed to attach the device to its power domain: {0}")]
    AttachDevicePowerDomain(anyhow::Error),
    /// Broken pci topology
    #[error("pci topology is broken")]
    BrokenPciTopology,
    /// Unable to clone a jail for the device.
    #[cfg(any(target_os = "android", target_os = "linux"))]
    #[error("failed to clone jail: {0}")]
    CloneJail(minijail::Error),
    /// Appending to kernel command line failed.
    #[error("unable to add device to kernel command line: {0}")]
    Cmdline(kernel_cmdline::Error),
    /// Configure window size failed.
    #[error("failed to configure window size: {0}")]
    ConfigureWindowSize(PciDeviceError),
    /// Unable to create a pipe.
    #[error("failed to create pipe: {0}")]
    CreatePipe(base::Error),
    /// Unable to create a root.
    #[error("failed to create pci root: {0}")]
    CreateRoot(anyhow::Error),
    /// Unable to create serial device from serial parameters
    #[error("failed to create serial device: {0}")]
    CreateSerialDevice(devices::SerialError),
    /// Unable to create tube
    #[error("failed to create tube: {0}")]
    CreateTube(base::TubeError),
    /// Could not clone an event.
    #[error("failed to clone event: {0}")]
    EventClone(base::Error),
    /// Could not create an event.
    #[error("failed to create event: {0}")]
    EventCreate(base::Error),
    /// Failed to generate ACPI content.
    #[error("failed to generate ACPI content")]
    GenerateAcpi,
    /// No more IRQs are available.
    #[error("no more IRQs are available")]
    IrqsExhausted,
    /// VFIO device is missing a DT symbol.
    #[error("cannot match VFIO device to DT node due to a missing symbol")]
    MissingDeviceTreeSymbol,
    /// Missing a required serial device.
    #[error("missing required serial device {0}")]
    MissingRequiredSerialDevice(u8),
    /// Could not add a device to the mmio bus.
    #[error("failed to add to mmio bus: {0}")]
    MmioInsert(BusError),
    /// Failed to insert device into PCI root.
    #[error("failed to insert device into PCI root: {0}")]
    PciRootAddDevice(PciDeviceError),
    #[cfg(any(target_os = "android", target_os = "linux"))]
    /// Failed to initialize proxy device for jailed device.
    #[error("failed to create proxy device: {0}")]
    ProxyDeviceCreation(devices::ProxyError),
    #[cfg(any(target_os = "android", target_os = "linux"))]
    /// Failed to register battery device.
    #[error("failed to register battery device to VM: {0}")]
    RegisterBattery(devices::BatteryError),
    /// Could not register PCI device to pci root bus
    #[error("failed to register PCI device to pci root bus")]
    RegisterDevice(SendError<PciRootCommand>),
    /// Could not register PCI device capabilities.
    #[error("could not register PCI device capabilities: {0}")]
    RegisterDeviceCapabilities(PciDeviceError),
    /// Failed to register ioevent with VM.
    #[error("failed to register ioevent to VM: {0}")]
    RegisterIoevent(base::Error),
    /// Failed to register irq event with VM.
    #[error("failed to register irq event to VM: {0}")]
    RegisterIrqfd(base::Error),
    /// Could not setup VFIO platform IRQ for the device.
    #[error("Setting up VFIO platform IRQ: {0}")]
    SetupVfioPlatformIrq(anyhow::Error),
}
851
852/// Config a PCI device for used by this vm.
853pub fn configure_pci_device<V: VmArch, Vcpu: VcpuArch>(
854    linux: &mut RunnableLinuxVm<V, Vcpu>,
855    mut device: Box<dyn PciDevice>,
856    #[cfg(any(target_os = "android", target_os = "linux"))] jail: Option<Minijail>,
857    resources: &mut SystemAllocator,
858    hp_control_tube: &mpsc::Sender<PciRootCommand>,
859    #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
860) -> Result<PciAddress, DeviceRegistrationError> {
861    // Allocate PCI device address before allocating BARs.
862    let pci_address = device
863        .allocate_address(resources)
864        .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
865
866    // Allocate ranges that may need to be in the low MMIO region (MmioType::Low).
867    let mmio_ranges = device
868        .allocate_io_bars(resources)
869        .map_err(DeviceRegistrationError::AllocateIoAddrs)?;
870
871    // Allocate device ranges that may be in low or high MMIO after low-only ranges.
872    let device_ranges = device
873        .allocate_device_bars(resources)
874        .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
875
876    // If device is a pcie bridge, add its pci bus to pci root
877    if let Some(pci_bus) = device.get_new_pci_bus() {
878        hp_control_tube
879            .send(PciRootCommand::AddBridge(pci_bus))
880            .map_err(DeviceRegistrationError::RegisterDevice)?;
881        let bar_ranges = Vec::new();
882        device
883            .configure_bridge_window(resources, &bar_ranges)
884            .map_err(DeviceRegistrationError::ConfigureWindowSize)?;
885    }
886
887    // Do not suggest INTx for hot-plug devices.
888    let intx_event = devices::IrqLevelEvent::new().map_err(DeviceRegistrationError::EventCreate)?;
889
890    if let PreferredIrq::Fixed { pin, gsi } = device.preferred_irq() {
891        resources.reserve_irq(gsi);
892
893        device.assign_irq(
894            intx_event
895                .try_clone()
896                .map_err(DeviceRegistrationError::EventClone)?,
897            pin,
898            gsi,
899        );
900
901        linux
902            .irq_chip
903            .as_irq_chip_mut()
904            .register_level_irq_event(gsi, &intx_event, IrqEventSource::from_device(&device))
905            .map_err(DeviceRegistrationError::RegisterIrqfd)?;
906    }
907
908    let mut keep_rds = device.keep_rds();
909    syslog::push_descriptors(&mut keep_rds);
910    cros_tracing::push_descriptors!(&mut keep_rds);
911    metrics::push_descriptors(&mut keep_rds);
912
913    device
914        .register_device_capabilities()
915        .map_err(DeviceRegistrationError::RegisterDeviceCapabilities)?;
916
917    #[cfg(any(target_os = "android", target_os = "linux"))]
918    let arced_dev: Arc<Mutex<dyn BusDevice>> = if let Some(jail) = jail {
919        let proxy = ProxyDevice::new(
920            device,
921            jail,
922            keep_rds,
923            #[cfg(feature = "swap")]
924            swap_controller,
925        )
926        .map_err(DeviceRegistrationError::ProxyDeviceCreation)?;
927        linux
928            .pid_debug_label_map
929            .insert(proxy.pid() as u32, proxy.debug_label());
930        Arc::new(Mutex::new(proxy))
931    } else {
932        device.on_sandboxed();
933        Arc::new(Mutex::new(device))
934    };
935
936    #[cfg(windows)]
937    let arced_dev = {
938        device.on_sandboxed();
939        Arc::new(Mutex::new(device))
940    };
941
942    #[cfg(any(target_os = "android", target_os = "linux"))]
943    hp_control_tube
944        .send(PciRootCommand::Add(pci_address, arced_dev.clone()))
945        .map_err(DeviceRegistrationError::RegisterDevice)?;
946
947    for range in &mmio_ranges {
948        linux
949            .mmio_bus
950            .insert(arced_dev.clone(), range.addr, range.size)
951            .map_err(DeviceRegistrationError::MmioInsert)?;
952    }
953
954    for range in &device_ranges {
955        linux
956            .mmio_bus
957            .insert(arced_dev.clone(), range.addr, range.size)
958            .map_err(DeviceRegistrationError::MmioInsert)?;
959    }
960
961    Ok(pci_address)
962}
963
964// Generate pci topology starting from parent bus
965fn generate_pci_topology(
966    parent_bus: Arc<Mutex<PciBus>>,
967    resources: &mut SystemAllocator,
968    io_ranges: &mut BTreeMap<usize, Vec<BarRange>>,
969    device_ranges: &mut BTreeMap<usize, Vec<BarRange>>,
970    device_addrs: &[PciAddress],
971    devices: &mut Vec<(Box<dyn PciDevice>, Option<Minijail>)>,
972) -> Result<(Vec<BarRange>, u8), DeviceRegistrationError> {
973    let mut bar_ranges = Vec::new();
974    let bus_num = parent_bus.lock().get_bus_num();
975    let mut subordinate_bus = bus_num;
976    for (dev_idx, addr) in device_addrs.iter().enumerate() {
977        // Only target for devices that located on this bus
978        if addr.bus == bus_num {
979            // If this device is a pci bridge (a.k.a., it has a pci bus structure),
980            // create its topology recursively
981            if let Some(child_bus) = devices[dev_idx].0.get_new_pci_bus() {
982                let (child_bar_ranges, child_sub_bus) = generate_pci_topology(
983                    child_bus.clone(),
984                    resources,
985                    io_ranges,
986                    device_ranges,
987                    device_addrs,
988                    devices,
989                )?;
990                let device = &mut devices[dev_idx].0;
991                parent_bus
992                    .lock()
993                    .add_child_bus(child_bus.clone())
994                    .map_err(|_| DeviceRegistrationError::BrokenPciTopology)?;
995                let bridge_window = device
996                    .configure_bridge_window(resources, &child_bar_ranges)
997                    .map_err(DeviceRegistrationError::ConfigureWindowSize)?;
998                bar_ranges.extend(bridge_window);
999
1000                let ranges = device
1001                    .allocate_io_bars(resources)
1002                    .map_err(DeviceRegistrationError::AllocateIoAddrs)?;
1003                io_ranges.insert(dev_idx, ranges.clone());
1004                bar_ranges.extend(ranges);
1005
1006                let ranges = device
1007                    .allocate_device_bars(resources)
1008                    .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
1009                device_ranges.insert(dev_idx, ranges.clone());
1010                bar_ranges.extend(ranges);
1011
1012                device.set_subordinate_bus(child_sub_bus);
1013
1014                subordinate_bus = std::cmp::max(subordinate_bus, child_sub_bus);
1015            }
1016        }
1017    }
1018
1019    for (dev_idx, addr) in device_addrs.iter().enumerate() {
1020        if addr.bus == bus_num {
1021            let device = &mut devices[dev_idx].0;
1022            // Allocate MMIO for non-bridge devices
1023            if device.get_new_pci_bus().is_none() {
1024                let ranges = device
1025                    .allocate_io_bars(resources)
1026                    .map_err(DeviceRegistrationError::AllocateIoAddrs)?;
1027                io_ranges.insert(dev_idx, ranges.clone());
1028                bar_ranges.extend(ranges);
1029
1030                let ranges = device
1031                    .allocate_device_bars(resources)
1032                    .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
1033                device_ranges.insert(dev_idx, ranges.clone());
1034                bar_ranges.extend(ranges);
1035            }
1036        }
1037    }
1038    Ok((bar_ranges, subordinate_bus))
1039}
1040
1041/// Ensure all PCI devices have an assigned PCI address.
1042pub fn assign_pci_addresses(
1043    devices: &mut [(Box<dyn BusDeviceObj>, Option<Minijail>)],
1044    resources: &mut SystemAllocator,
1045) -> Result<(), DeviceRegistrationError> {
1046    // First allocate devices with a preferred address.
1047    for pci_device in devices
1048        .iter_mut()
1049        .filter_map(|(device, _jail)| device.as_pci_device_mut())
1050        .filter(|pci_device| pci_device.preferred_address().is_some())
1051    {
1052        let _ = pci_device
1053            .allocate_address(resources)
1054            .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
1055    }
1056
1057    // Then allocate addresses for the remaining devices.
1058    for pci_device in devices
1059        .iter_mut()
1060        .filter_map(|(device, _jail)| device.as_pci_device_mut())
1061        .filter(|pci_device| pci_device.preferred_address().is_none())
1062    {
1063        let _ = pci_device
1064            .allocate_address(resources)
1065            .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
1066    }
1067
1068    Ok(())
1069}
1070
1071/// Creates a root PCI device for use by this Vm.
1072pub fn generate_pci_root(
1073    mut devices: Vec<(Box<dyn PciDevice>, Option<Minijail>)>,
1074    irq_chip: &mut dyn IrqChip,
1075    mmio_bus: Arc<Bus>,
1076    mmio_base: GuestAddress,
1077    mmio_register_bit_num: usize,
1078    io_bus: Arc<Bus>,
1079    resources: &mut SystemAllocator,
1080    vm: &mut impl Vm,
1081    max_irqs: usize,
1082    vcfg_base: Option<u64>,
1083    #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
1084) -> Result<
1085    (
1086        PciRoot,
1087        Vec<(PciAddress, u32, PciInterruptPin)>,
1088        BTreeMap<u32, String>,
1089        BTreeMap<PciAddress, Vec<u8>>,
1090        BTreeMap<PciAddress, Vec<u8>>,
1091    ),
1092    DeviceRegistrationError,
1093> {
1094    let mut device_addrs = Vec::new();
1095
1096    for (device, _jail) in devices.iter_mut() {
1097        let address = device
1098            .allocate_address(resources)
1099            .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
1100        device_addrs.push(address);
1101    }
1102
1103    let mut device_ranges = BTreeMap::new();
1104    let mut io_ranges = BTreeMap::new();
1105    let root_bus = Arc::new(Mutex::new(PciBus::new(0, 0, false)));
1106
1107    generate_pci_topology(
1108        root_bus.clone(),
1109        resources,
1110        &mut io_ranges,
1111        &mut device_ranges,
1112        &device_addrs,
1113        &mut devices,
1114    )?;
1115
1116    let mut root = PciRoot::new(
1117        vm,
1118        Arc::downgrade(&mmio_bus),
1119        mmio_base,
1120        mmio_register_bit_num,
1121        Arc::downgrade(&io_bus),
1122        root_bus,
1123    )
1124    .map_err(DeviceRegistrationError::CreateRoot)?;
1125    #[cfg_attr(windows, allow(unused_mut))]
1126    let mut pid_labels = BTreeMap::new();
1127
1128    // Allocate legacy INTx
1129    let mut pci_irqs = Vec::new();
1130    let mut irqs: Vec<u32> = Vec::new();
1131
1132    // Mapping of (bus, dev, pin) -> IRQ number.
1133    let mut dev_pin_irq = BTreeMap::new();
1134
1135    for (dev_idx, (device, _jail)) in devices.iter_mut().enumerate() {
1136        let pci_address = device_addrs[dev_idx];
1137
1138        let irq = match device.preferred_irq() {
1139            PreferredIrq::Fixed { pin, gsi } => {
1140                // The device reported a preferred IRQ, so use that rather than allocating one.
1141                resources.reserve_irq(gsi);
1142                Some((pin, gsi))
1143            }
1144            PreferredIrq::Any => {
1145                // The device did not provide a preferred IRQ but requested one, so allocate one.
1146
1147                // Choose a pin based on the slot's function number. Function 0 must always use
1148                // INTA# for single-function devices per the PCI spec, and we choose to use INTA#
1149                // for function 0 on multifunction devices and distribute the remaining functions
1150                // evenly across the other pins.
1151                let pin = match pci_address.func % 4 {
1152                    0 => PciInterruptPin::IntA,
1153                    1 => PciInterruptPin::IntB,
1154                    2 => PciInterruptPin::IntC,
1155                    _ => PciInterruptPin::IntD,
1156                };
1157
1158                // If an IRQ number has already been assigned for a different function with this
1159                // (bus, device, pin) combination, use it. Otherwise allocate a new one and insert
1160                // it into the map.
1161                let pin_key = (pci_address.bus, pci_address.dev, pin);
1162                let irq_num = if let Some(irq_num) = dev_pin_irq.get(&pin_key) {
1163                    *irq_num
1164                } else {
1165                    // If we have allocated fewer than `max_irqs` total, add a new irq to the `irqs`
1166                    // pool. Otherwise, share one of the existing `irqs`.
1167                    let irq_num = if irqs.len() < max_irqs {
1168                        let irq_num = resources
1169                            .allocate_irq()
1170                            .ok_or(DeviceRegistrationError::AllocateIrq)?;
1171                        irqs.push(irq_num);
1172                        irq_num
1173                    } else {
1174                        // Pick one of the existing IRQs to share, using `dev_idx` to distribute IRQ
1175                        // sharing evenly across devices.
1176                        irqs[dev_idx % max_irqs]
1177                    };
1178
1179                    dev_pin_irq.insert(pin_key, irq_num);
1180                    irq_num
1181                };
1182                Some((pin, irq_num))
1183            }
1184            PreferredIrq::None => {
1185                // The device does not want an INTx# IRQ.
1186                None
1187            }
1188        };
1189
1190        if let Some((pin, gsi)) = irq {
1191            let intx_event =
1192                devices::IrqLevelEvent::new().map_err(DeviceRegistrationError::EventCreate)?;
1193
1194            device.assign_irq(
1195                intx_event
1196                    .try_clone()
1197                    .map_err(DeviceRegistrationError::EventClone)?,
1198                pin,
1199                gsi,
1200            );
1201
1202            irq_chip
1203                .register_level_irq_event(gsi, &intx_event, IrqEventSource::from_device(device))
1204                .map_err(DeviceRegistrationError::RegisterIrqfd)?;
1205
1206            pci_irqs.push((pci_address, gsi, pin));
1207        }
1208    }
1209
1210    // To prevent issues where device's on_sandbox may spawn thread before all
1211    // sandboxed devices are sandboxed we partition iterator to go over sandboxed
1212    // first. This is needed on linux platforms. On windows, this is a no-op since
1213    // jails are always None, even for sandboxed devices.
1214    let devices = {
1215        let (sandboxed, non_sandboxed): (Vec<_>, Vec<_>) = devices
1216            .into_iter()
1217            .enumerate()
1218            .partition(|(_, (_, jail))| jail.is_some());
1219        sandboxed.into_iter().chain(non_sandboxed)
1220    };
1221
1222    let mut amls = BTreeMap::new();
1223    let mut gpe_scope_amls = BTreeMap::new();
1224    for (dev_idx, dev_value) in devices {
1225        #[cfg(any(target_os = "android", target_os = "linux"))]
1226        let (mut device, jail) = dev_value;
1227        #[cfg(windows)]
1228        let (mut device, _) = dev_value;
1229        let address = device_addrs[dev_idx];
1230
1231        let mut keep_rds = device.keep_rds();
1232        syslog::push_descriptors(&mut keep_rds);
1233        cros_tracing::push_descriptors!(&mut keep_rds);
1234        metrics::push_descriptors(&mut keep_rds);
1235        keep_rds.append(&mut vm.get_memory().as_raw_descriptors());
1236
1237        let ranges = io_ranges.remove(&dev_idx).unwrap_or_default();
1238        let device_ranges = device_ranges.remove(&dev_idx).unwrap_or_default();
1239        device
1240            .register_device_capabilities()
1241            .map_err(DeviceRegistrationError::RegisterDeviceCapabilities)?;
1242
1243        if let Some(vcfg_base) = vcfg_base {
1244            let (methods, shm) = device.generate_acpi_methods();
1245            if !methods.is_empty() {
1246                amls.insert(address, methods);
1247            }
1248            if let Some((offset, mmap)) = shm {
1249                let _ = vm.add_memory_region(
1250                    GuestAddress(vcfg_base + offset as u64),
1251                    Box::new(mmap),
1252                    false,
1253                    false,
1254                    MemCacheType::CacheCoherent,
1255                );
1256            }
1257        }
1258        let gpe_nr = device.set_gpe(resources);
1259
1260        #[cfg(any(target_os = "android", target_os = "linux"))]
1261        let arced_dev: Arc<Mutex<dyn BusDevice>> = if let Some(jail) = jail {
1262            let proxy = ProxyDevice::new(
1263                device,
1264                jail,
1265                keep_rds,
1266                #[cfg(feature = "swap")]
1267                swap_controller,
1268            )
1269            .map_err(DeviceRegistrationError::ProxyDeviceCreation)?;
1270            pid_labels.insert(proxy.pid() as u32, proxy.debug_label());
1271            Arc::new(Mutex::new(proxy))
1272        } else {
1273            device.on_sandboxed();
1274            Arc::new(Mutex::new(device))
1275        };
1276        #[cfg(windows)]
1277        let arced_dev = {
1278            device.on_sandboxed();
1279            Arc::new(Mutex::new(device))
1280        };
1281        root.add_device(address, arced_dev.clone(), vm)
1282            .map_err(DeviceRegistrationError::PciRootAddDevice)?;
1283        for range in &ranges {
1284            mmio_bus
1285                .insert(arced_dev.clone(), range.addr, range.size)
1286                .map_err(DeviceRegistrationError::MmioInsert)?;
1287        }
1288
1289        for range in &device_ranges {
1290            mmio_bus
1291                .insert(arced_dev.clone(), range.addr, range.size)
1292                .map_err(DeviceRegistrationError::MmioInsert)?;
1293        }
1294
1295        if let Some(gpe_nr) = gpe_nr {
1296            if let Some(acpi_path) = root.acpi_path(&address) {
1297                let mut gpe_aml = Vec::new();
1298
1299                GpeScope {}.cast_to_aml_bytes(
1300                    &mut gpe_aml,
1301                    gpe_nr,
1302                    format!("\\{acpi_path}").as_str(),
1303                );
1304                if !gpe_aml.is_empty() {
1305                    gpe_scope_amls.insert(address, gpe_aml);
1306                }
1307            }
1308        }
1309    }
1310
1311    Ok((root, pci_irqs, pid_labels, amls, gpe_scope_amls))
1312}
1313
/// Errors for image loading.
///
/// Returned by `load_image` and `load_image_high`.
#[sorted]
#[derive(Error, Debug)]
pub enum LoadImageError {
    /// The requested alignment (carried in the variant) is not a power of two.
    #[error("Alignment not a power of two: {0}")]
    BadAlignment(u64),
    /// Querying the length of the image file failed.
    #[error("Getting image size failed: {0}")]
    GetLen(io::Error),
    /// Obtaining the destination slice in guest memory failed.
    #[error("GuestMemory get slice failed: {0}")]
    GuestMemorySlice(GuestMemoryError),
    /// The image size (carried in the variant) exceeds `u32::MAX` or the space available for it.
    #[error("Image size too large: {0}")]
    ImageSizeTooLarge(u64),
    /// No guest memory region satisfies the address, alignment and size requirements.
    #[error("No suitable memory region found")]
    NoSuitableMemoryRegion,
    /// Reading the image contents into guest memory failed.
    #[error("Reading image into memory failed: {0}")]
    ReadToMemory(io::Error),
    /// The image file is empty.
    #[error("Cannot load zero-sized image")]
    ZeroSizedImage,
}
1333
1334/// Load an image from a file into guest memory.
1335///
1336/// # Arguments
1337///
1338/// * `guest_mem` - The memory to be used by the guest.
1339/// * `guest_addr` - The starting address to load the image in the guest memory.
1340/// * `max_size` - The amount of space in bytes available in the guest memory for the image.
1341/// * `image` - The file containing the image to be loaded.
1342///
1343/// The size in bytes of the loaded image is returned.
1344pub fn load_image<F>(
1345    guest_mem: &GuestMemory,
1346    image: &mut F,
1347    guest_addr: GuestAddress,
1348    max_size: u64,
1349) -> Result<u32, LoadImageError>
1350where
1351    F: FileReadWriteAtVolatile + FileGetLen,
1352{
1353    let size = image.get_len().map_err(LoadImageError::GetLen)?;
1354
1355    if size > u32::MAX as u64 || size > max_size {
1356        return Err(LoadImageError::ImageSizeTooLarge(size));
1357    }
1358
1359    // This is safe due to the bounds check above.
1360    let size = size as u32;
1361
1362    let guest_slice = guest_mem
1363        .get_slice_at_addr(guest_addr, size as usize)
1364        .map_err(LoadImageError::GuestMemorySlice)?;
1365    image
1366        .read_exact_at_volatile(guest_slice, 0)
1367        .map_err(LoadImageError::ReadToMemory)?;
1368
1369    Ok(size)
1370}
1371
1372/// Load an image from a file into guest memory at the highest possible address.
1373///
1374/// # Arguments
1375///
1376/// * `guest_mem` - The memory to be used by the guest.
1377/// * `image` - The file containing the image to be loaded.
1378/// * `min_guest_addr` - The minimum address of the start of the image.
1379/// * `max_guest_addr` - The address to load the last byte of the image.
1380/// * `region_filter` - The optional filter function for determining if the given guest memory
1381///   region is suitable for loading the image into it.
1382/// * `align` - The minimum alignment of the start address of the image in bytes (must be a power of
1383///   two).
1384///
1385/// The guest address and size in bytes of the loaded image are returned.
1386pub fn load_image_high<F>(
1387    guest_mem: &GuestMemory,
1388    image: &mut F,
1389    min_guest_addr: GuestAddress,
1390    max_guest_addr: GuestAddress,
1391    region_filter: Option<fn(&MemoryRegionInformation) -> bool>,
1392    align: u64,
1393) -> Result<(GuestAddress, u32), LoadImageError>
1394where
1395    F: FileReadWriteAtVolatile + FileGetLen,
1396{
1397    if !align.is_power_of_two() {
1398        return Err(LoadImageError::BadAlignment(align));
1399    }
1400
1401    let max_size = max_guest_addr.offset_from(min_guest_addr) & !(align - 1);
1402    let size = image.get_len().map_err(LoadImageError::GetLen)?;
1403
1404    if size == 0 {
1405        return Err(LoadImageError::ZeroSizedImage);
1406    }
1407
1408    if size > u32::MAX as u64 || size > max_size {
1409        return Err(LoadImageError::ImageSizeTooLarge(size));
1410    }
1411
1412    // Sort the list of guest memory regions by address so we can iterate over them in reverse order
1413    // (high to low).
1414    let mut regions: Vec<_> = guest_mem
1415        .regions()
1416        .filter(region_filter.unwrap_or(|_| true))
1417        .collect();
1418    regions.sort_unstable_by(|a, b| a.guest_addr.cmp(&b.guest_addr));
1419
1420    // Find the highest valid address inside a guest memory region that satisfies the requested
1421    // alignment and min/max address requirements while having enough space for the image.
1422    let guest_addr = regions
1423        .into_iter()
1424        .rev()
1425        .filter_map(|r| {
1426            // Highest address within this region.
1427            let rgn_max_addr = r
1428                .guest_addr
1429                .checked_add((r.size as u64).checked_sub(1)?)?
1430                .min(max_guest_addr);
1431            // Lowest aligned address within this region.
1432            let rgn_start_aligned = r.guest_addr.align(align)?;
1433            // Hypothetical address of the image if loaded at the end of the region.
1434            let image_addr = rgn_max_addr.checked_sub(size - 1)? & !(align - 1);
1435
1436            // Would the image fit within the region?
1437            if image_addr >= rgn_start_aligned {
1438                Some(image_addr)
1439            } else {
1440                None
1441            }
1442        })
1443        .find(|&addr| addr >= min_guest_addr)
1444        .ok_or(LoadImageError::NoSuitableMemoryRegion)?;
1445
1446    // This is safe due to the bounds check above.
1447    let size = size as u32;
1448
1449    let guest_slice = guest_mem
1450        .get_slice_at_addr(guest_addr, size as usize)
1451        .map_err(LoadImageError::GuestMemorySlice)?;
1452    image
1453        .read_exact_at_volatile(guest_slice, 0)
1454        .map_err(LoadImageError::ReadToMemory)?;
1455
1456    Ok((guest_addr, size))
1457}
1458
/// SMBIOS table configuration
///
/// All fields are optional. When (de)serialized, field names use kebab-case (for example
/// `bios-vendor`) and unknown fields are rejected.
#[derive(Clone, Debug, Default, Serialize, Deserialize, FromKeyValues, PartialEq, Eq)]
#[serde(deny_unknown_fields, rename_all = "kebab-case")]
pub struct SmbiosOptions {
    /// BIOS vendor name.
    pub bios_vendor: Option<String>,

    /// BIOS version number (free-form string).
    pub bios_version: Option<String>,

    /// System manufacturer name.
    pub manufacturer: Option<String>,

    /// System product name.
    pub product_name: Option<String>,

    /// System serial number (free-form string).
    pub serial_number: Option<String>,

    /// System UUID.
    pub uuid: Option<Uuid>,

    /// Additional OEM strings to add to SMBIOS table.
    ///
    /// Defaults to an empty list when the field is omitted.
    #[serde(default)]
    pub oem_strings: Vec<String>,
}
1485
#[cfg(test)]
mod tests {
    use serde_keyvalue::from_key_values;
    use tempfile::tempfile;

    use super::*;

    /// `Pstore` parses from key-values only when both `path` and `size` are given.
    #[test]
    fn parse_pstore() {
        let parsed: Pstore = from_key_values("path=/some/path,size=16384").unwrap();
        let expected = Pstore {
            path: "/some/path".into(),
            size: 16384,
        };
        assert_eq!(parsed, expected);

        // Leaving out either key (or both) must be rejected.
        for incomplete in ["path=/some/path", "size=16384", ""] {
            let outcome = from_key_values::<Pstore>(incomplete);
            assert!(outcome.is_err(), "{incomplete:?} should not parse");
        }
    }

    /// `CpuSet` parses single CPUs and `a-b` ranges from the key-values syntax.
    #[test]
    fn deserialize_cpuset_serde_kv() {
        let cases = [
            ("[0,4,7]", CpuSet::new(vec![0, 4, 7])),
            ("[9-12]", CpuSet::new(vec![9, 10, 11, 12])),
            (
                "[0,4,7,9-12,15]",
                CpuSet::new(vec![0, 4, 7, 9, 10, 11, 12, 15]),
            ),
        ];

        for (input, expected) in cases {
            let parsed: CpuSet = from_key_values(input).unwrap();
            assert_eq!(parsed, expected);
        }
    }

    /// `CpuSet` round-trips through JSON, with ranges represented as `"a-b"` strings.
    #[test]
    fn deserialize_serialize_cpuset_json() {
        let cases = [
            ("[0,4,7]", CpuSet::new(vec![0, 4, 7])),
            (r#"["9-12"]"#, CpuSet::new(vec![9, 10, 11, 12])),
            (
                r#"[0,4,7,"9-12",15]"#,
                CpuSet::new(vec![0, 4, 7, 9, 10, 11, 12, 15]),
            ),
        ];

        for (json, expected) in cases {
            let parsed: CpuSet = serde_json::from_str(json).unwrap();
            assert_eq!(parsed, expected);
            assert_eq!(serde_json::to_string(&expected).unwrap(), json);
        }
    }

    /// With an upper limit above all guest memory, the image lands at the top of the highest
    /// region, rounded down to the requested alignment.
    #[test]
    fn load_image_high_max_4g() {
        const TEST_IMAGE_SIZE: u64 = 1234;
        const TEST_ALIGN: u64 = 0x8000;

        let guest_mem = GuestMemory::new(&[
            (GuestAddress(0x0000_0000), 0x4000_0000), // 0x00000000..0x40000000
            (GuestAddress(0x8000_0000), 0x4000_0000), // 0x80000000..0xC0000000
        ])
        .unwrap();

        let mut image_file = tempfile().unwrap();
        image_file.set_len(TEST_IMAGE_SIZE).unwrap();

        let (load_addr, load_size) = load_image_high(
            &guest_mem,
            &mut image_file,
            GuestAddress(0x8000),
            GuestAddress(0xFFFF_FFFF), // max_guest_addr beyond highest guest memory region
            None,
            TEST_ALIGN,
        )
        .unwrap();

        assert_eq!(load_addr, GuestAddress(0xBFFF_8000));
        assert_eq!(load_addr.offset() % TEST_ALIGN, 0);
        assert_eq!(load_size, TEST_IMAGE_SIZE as u32);
    }
}