arch/
lib.rs

1// Copyright 2018 The ChromiumOS Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5//! Virtual machine architecture support code.
6
7pub mod android;
8pub mod fdt;
9pub mod pstore;
10pub mod serial;
11
12pub mod sys;
13
14use std::collections::BTreeMap;
15use std::error::Error as StdError;
16use std::fs::File;
17use std::io;
18use std::ops::Deref;
19use std::path::PathBuf;
20use std::str::FromStr;
21use std::sync::mpsc;
22use std::sync::mpsc::SendError;
23use std::sync::Arc;
24
25use acpi_tables::sdt::SDT;
26use base::syslog;
27use base::AsRawDescriptors;
28use base::FileGetLen;
29use base::FileReadWriteAtVolatile;
30use base::RecvTube;
31use base::SendTube;
32use base::Tube;
33use devices::virtio::VirtioDevice;
34use devices::BarRange;
35use devices::Bus;
36use devices::BusDevice;
37use devices::BusDeviceObj;
38use devices::BusError;
39use devices::BusResumeDevice;
40use devices::FwCfgParameters;
41use devices::GpeScope;
42use devices::HotPlugBus;
43use devices::IrqChip;
44use devices::IrqEventSource;
45use devices::PciAddress;
46use devices::PciBus;
47use devices::PciDevice;
48use devices::PciDeviceError;
49use devices::PciInterruptPin;
50use devices::PciRoot;
51use devices::PciRootCommand;
52use devices::PreferredIrq;
53#[cfg(any(target_os = "android", target_os = "linux"))]
54use devices::ProxyDevice;
55use devices::SerialHardware;
56use devices::SerialParameters;
57pub use fdt::apply_device_tree_overlays;
58pub use fdt::DtbOverlay;
59#[cfg(feature = "gdb")]
60use gdbstub::arch::Arch;
61pub use hypervisor::CpuConfigArch;
62pub use hypervisor::HypervisorArch;
63use hypervisor::MemCacheType;
64pub use hypervisor::VcpuArch;
65pub use hypervisor::VcpuInitArch;
66use hypervisor::Vm;
67pub use hypervisor::VmArch;
68#[cfg(windows)]
69use jail::FakeMinijailStub as Minijail;
70#[cfg(any(target_os = "android", target_os = "linux"))]
71use minijail::Minijail;
72use remain::sorted;
73use resources::SystemAllocator;
74use resources::SystemAllocatorConfig;
75use serde::de::Visitor;
76use serde::Deserialize;
77use serde::Serialize;
78use serde_keyvalue::FromKeyValues;
79pub use serial::add_serial_devices;
80pub use serial::get_serial_cmdline;
81pub use serial::set_default_serial_parameters;
82pub use serial::GetSerialCmdlineError;
83pub use serial::SERIAL_ADDR;
84use sync::Condvar;
85use sync::Mutex;
86#[cfg(any(target_os = "android", target_os = "linux"))]
87pub use sys::linux::PlatformBusResources;
88use thiserror::Error;
89use uuid::Uuid;
90use vm_control::BatControl;
91use vm_control::BatteryType;
92use vm_control::PmResource;
93use vm_memory::GuestAddress;
94use vm_memory::GuestMemory;
95use vm_memory::GuestMemoryError;
96use vm_memory::MemoryRegionInformation;
97use vm_memory::MemoryRegionOptions;
98
// Re-export the architecture-specific IRQ chip trait as `IrqChipArch` and, when the `gdb`
// feature is enabled, the matching gdbstub architecture as `GdbArch`. Only aarch64, riscv64 and
// x86_64 are covered; no alias is defined for any other target architecture.
cfg_if::cfg_if! {
    if #[cfg(target_arch = "aarch64")] {
        pub use devices::IrqChipAArch64 as IrqChipArch;
        #[cfg(feature = "gdb")]
        pub use gdbstub_arch::aarch64::AArch64 as GdbArch;
    } else if #[cfg(target_arch = "riscv64")] {
        pub use devices::IrqChipRiscv64 as IrqChipArch;
        #[cfg(feature = "gdb")]
        pub use gdbstub_arch::riscv::Riscv64 as GdbArch;
    } else if #[cfg(target_arch = "x86_64")] {
        pub use devices::IrqChipX86_64 as IrqChipArch;
        #[cfg(feature = "gdb")]
        pub use gdbstub_arch::x86::X86_64_SSE as GdbArch;
    }
}
114
/// An open image file for the VM, tagged with the kind of image it holds.
pub enum VmImage {
    /// A kernel image file.
    Kernel(File),
    /// A BIOS image file.
    Bios(File),
}
119
/// Pstore settings, made of a path and a size.
///
/// (De)serialized with kebab-case field names; unknown fields are rejected.
#[derive(Clone, Debug, Deserialize, Serialize, FromKeyValues, PartialEq, Eq)]
#[serde(deny_unknown_fields, rename_all = "kebab-case")]
pub struct Pstore {
    /// Path associated with the pstore.
    // NOTE(review): presumably the host file backing the pstore region - confirm.
    pub path: PathBuf,
    /// Size of the pstore.
    // NOTE(review): presumably in bytes - confirm.
    pub size: u32,
}
126
/// Placement choices for the FDT relative to guest RAM.
///
/// (De)serialized with kebab-case variant names (e.g. `after-payload`).
// NOTE(review): "FDT" presumably stands for the flattened device tree blob - confirm.
#[derive(Clone, Copy, Debug, Serialize, Deserialize, FromKeyValues)]
#[serde(deny_unknown_fields, rename_all = "kebab-case")]
pub enum FdtPosition {
    /// At the start of RAM.
    Start,
    /// Near the end of RAM.
    End,
    /// After the payload, with some padding for alignment.
    AfterPayload,
}
137
/// Set of CPU cores.
///
/// The indices are stored in the order they were supplied; nothing is sorted or deduplicated.
#[derive(Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
pub struct CpuSet(Vec<usize>);

impl CpuSet {
    /// Builds a set from any iterable of core indices, preserving iteration order.
    pub fn new<I>(cpus: I) -> Self
    where
        I: IntoIterator<Item = usize>,
    {
        let cores: Vec<usize> = cpus.into_iter().collect();
        Self(cores)
    }

    /// Returns a borrowing iterator over the stored core indices.
    pub fn iter(&self) -> std::slice::Iter<'_, usize> {
        let Self(cores) = self;
        cores.iter()
    }
}
151
152impl FromIterator<usize> for CpuSet {
153    fn from_iter<T>(iter: T) -> Self
154    where
155        T: IntoIterator<Item = usize>,
156    {
157        CpuSet::new(iter)
158    }
159}
160
/// Default for `SveConfig::auto`: `true`.
///
/// Used both as the serde default for the field and by `SveConfig`'s `Default` impl.
#[cfg(target_arch = "aarch64")]
fn sve_auto_default() -> bool {
    true
}
165
/// The SVE config for Vcpus.
///
/// Only available on aarch64. (De)serialized with kebab-case field names; unknown fields are
/// rejected.
#[cfg(target_arch = "aarch64")]
#[derive(Copy, Clone, Debug, PartialEq, Eq, Deserialize, Serialize)]
#[serde(deny_unknown_fields, rename_all = "kebab-case")]
pub struct SveConfig {
    /// Detect if SVE is available and enable accordingly. Defaults to `true` when the field is
    /// omitted from the serialized form.
    // NOTE(review): this doc used to say "`enable` is ignored if auto is true", but this struct
    // has no `enable` field - confirm that no other setting interacts with `auto`.
    #[serde(default = "sve_auto_default")]
    pub auto: bool,
}
175
#[cfg(target_arch = "aarch64")]
impl Default for SveConfig {
    /// Returns the configuration with `auto` set to the value of `sve_auto_default()`, the same
    /// value serde uses when the field is missing.
    fn default() -> Self {
        let auto = sve_auto_default();
        Self { auto }
    }
}
184
/// FFA config.
///
/// (De)serialized with kebab-case field names; unknown fields are rejected.
// For now this is limited to android, will be opened to other aarch64 based pVMs after
// corresponding kernel APIs are upstreamed.
#[cfg(all(target_os = "android", target_arch = "aarch64"))]
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Deserialize, Serialize, FromKeyValues)]
#[serde(deny_unknown_fields, rename_all = "kebab-case")]
pub struct FfaConfig {
    /// Just enable FFA, don't care about the negotiated version. Defaults to `false` when
    /// omitted.
    #[serde(default)]
    pub auto: bool,
}
196
/// Parses `s` as either a single CPU index (`"3"`) or an inclusive range (`"2-5"`) and appends
/// every index it denotes to `cpuset`.
///
/// Returns a human-readable message if an index is not a non-negative integer or if the range
/// is written from high to low. `cpuset` is left untouched on error.
fn parse_cpu_range(s: &str, cpuset: &mut Vec<usize>) -> Result<(), String> {
    /// Parses one CPU index, naming the offending text in the error.
    fn cpu_index(s: &str) -> Result<usize, String> {
        match s.parse::<usize>() {
            Ok(index) => Ok(index),
            Err(_) => Err(format!(
                "invalid CPU index {s} - index must be a non-negative integer"
            )),
        }
    }

    let cpus = if let Some((low, high)) = s.split_once('-') {
        let low = cpu_index(low)?;
        let high = cpu_index(high)?;

        if low > high {
            return Err(format!(
                "invalid CPU range {s} - ranges must be from low to high"
            ));
        }
        low..=high
    } else {
        // No dash: a lone index is the one-element range `cpu..=cpu`.
        let cpu = cpu_index(s)?;
        cpu..=cpu
    };

    cpuset.extend(cpus);

    Ok(())
}
225
226impl FromStr for CpuSet {
227    type Err = String;
228
229    fn from_str(s: &str) -> Result<Self, Self::Err> {
230        let mut cpuset = Vec::new();
231        for part in s.split(',') {
232            parse_cpu_range(part, &mut cpuset)?;
233        }
234        Ok(CpuSet::new(cpuset))
235    }
236}
237
238impl Deref for CpuSet {
239    type Target = Vec<usize>;
240
241    fn deref(&self) -> &Self::Target {
242        &self.0
243    }
244}
245
246impl IntoIterator for CpuSet {
247    type Item = usize;
248    type IntoIter = std::vec::IntoIter<Self::Item>;
249
250    fn into_iter(self) -> Self::IntoIter {
251        self.0.into_iter()
252    }
253}
254
/// Selects the interface for guest-controlled power management of assigned devices.
///
/// The `FromStr` impl accepts the string `pkvm-hvc` for `PkvmHvc`.
#[derive(Clone, Copy, Debug, Deserialize, PartialEq, Eq, Serialize)]
pub enum DevicePowerManagerConfig {
    /// Uses the protected KVM hypercall interface.
    PkvmHvc,
}
261
262impl FromStr for DevicePowerManagerConfig {
263    type Err = String;
264
265    fn from_str(s: &str) -> Result<Self, Self::Err> {
266        match s {
267            "pkvm-hvc" => Ok(Self::PkvmHvc),
268            _ => Err(format!("DevicePowerManagerConfig '{s}' not supported")),
269        }
270    }
271}
272
/// Deserializes a `CpuSet` from a sequence whose elements can either be integers, or strings
/// representing CPU ranges (e.g. `5-8`).
///
/// Elements are appended in the order they appear; string elements are expanded with
/// `parse_cpu_range`, whose error message is surfaced as a custom deserialization error.
impl<'de> Deserialize<'de> for CpuSet {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        // Visitor that assembles a `CpuSet` from the elements of a sequence.
        struct CpuSetVisitor;
        impl<'de> Visitor<'de> for CpuSetVisitor {
            type Value = CpuSet;

            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
                formatter.write_str("CpuSet")
            }

            fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
            where
                A: serde::de::SeqAccess<'de>,
            {
                // One element of the sequence. Being `untagged`, serde tries the variants in
                // declaration order: a plain integer first, then a string holding a range.
                // NOTE(review): `Range` borrows its `&str` from the input, which presumably
                // requires a deserializer able to lend string data - confirm for all formats
                // in use.
                #[derive(Deserialize)]
                #[serde(untagged)]
                enum CpuSetValue<'a> {
                    Single(usize),
                    Range(&'a str),
                }

                let mut cpus = Vec::new();
                while let Some(cpuset) = seq.next_element::<CpuSetValue>()? {
                    match cpuset {
                        CpuSetValue::Single(cpu) => cpus.push(cpu),
                        CpuSetValue::Range(range) => {
                            // Convert the `String` error into the deserializer's error type.
                            parse_cpu_range(range, &mut cpus).map_err(serde::de::Error::custom)?;
                        }
                    }
                }

                Ok(CpuSet::new(cpus))
            }
        }

        deserializer.deserialize_seq(CpuSetVisitor)
    }
}
316
317/// Serializes a `CpuSet` into a sequence of integers and strings representing CPU ranges.
318impl Serialize for CpuSet {
319    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
320    where
321        S: serde::Serializer,
322    {
323        use serde::ser::SerializeSeq;
324
325        let mut seq = serializer.serialize_seq(None)?;
326
327        // Factorize ranges into "a-b" strings.
328        let mut serialize_range = |start: usize, end: usize| -> Result<(), S::Error> {
329            if start == end {
330                seq.serialize_element(&start)?;
331            } else {
332                seq.serialize_element(&format!("{start}-{end}"))?;
333            }
334
335            Ok(())
336        };
337
338        // Current range.
339        let mut range = None;
340        for core in &self.0 {
341            range = match range {
342                None => Some((core, core)),
343                Some((start, end)) if *end == *core - 1 => Some((start, core)),
344                Some((start, end)) => {
345                    serialize_range(*start, *end)?;
346                    Some((core, core))
347                }
348            };
349        }
350
351        if let Some((start, end)) = range {
352            serialize_range(*start, *end)?;
353        }
354
355        seq.end()
356    }
357}
358
/// Mapping of guest VCPU threads to host CPU cores.
///
/// Either one `CpuSet` shared by every VCPU, or a per-VCPU map.
#[derive(Clone, Debug, Deserialize, PartialEq, Eq, Serialize)]
pub enum VcpuAffinity {
    /// All VCPU threads will be pinned to the same set of host CPU cores.
    Global(CpuSet),
    /// Each VCPU may be pinned to a set of host CPU cores.
    /// The map key is a guest VCPU index, and the corresponding value is the set of
    /// host CPU indices that the VCPU thread will be allowed to run on.
    /// If a VCPU index is not present in the map, its affinity will not be set.
    PerVcpu(BTreeMap<usize, CpuSet>),
}
370
/// Memory region with optional size.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize, FromKeyValues)]
pub struct MemoryRegionConfig {
    /// Start of the region.
    // NOTE(review): presumably a guest physical address - confirm.
    pub start: u64,
    /// Size of the region, or `None` when no size was given.
    // NOTE(review): presumably in bytes - confirm.
    pub size: Option<u64>,
}
377
/// General PCI config.
///
/// Every region is optional and the `Default` value leaves all of them unset. Which
/// configuration-access field exists depends on the target architecture.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize, Deserialize, FromKeyValues)]
pub struct PciConfig {
    /// Region for the PCI Configuration Access Mechanism (aarch64 only).
    #[cfg(target_arch = "aarch64")]
    pub cam: Option<MemoryRegionConfig>,
    /// Region for the PCIe Enhanced Configuration Access Mechanism (x86_64 only).
    #[cfg(target_arch = "x86_64")]
    pub ecam: Option<MemoryRegionConfig>,
    /// Region for non-prefetchable PCI device memory below 4G.
    pub mem: Option<MemoryRegionConfig>,
}
390
/// Default CPU capacity value.
// NOTE(review): presumably the capacity assumed for a CPU when none is configured; it is not
// referenced in this part of the file - confirm against callers.
pub const DEFAULT_CPU_CAPACITY: u32 = 1024;
392
/// Properties of a single VCPU, as assembled by `derive_vcpu_properties`.
///
/// Fields must stay in sorted order, which `#[sorted]` enforces at compile time. The last three
/// fields only exist on aarch64 Linux/Android builds.
#[sorted]
#[derive(Clone, Debug, Deserialize, PartialEq, Eq, Serialize)]
pub struct VcpuProperties {
    /// Capacity of the VCPU, or `None` when not configured.
    pub capacity: Option<u32>,
    /// Dynamic power coefficient of the VCPU, or `None` when not configured.
    pub dynamic_power_coefficient: Option<u32>,
    /// Frequencies of the VCPU; `derive_vcpu_properties` leaves this empty when none are
    /// configured.
    // NOTE(review): units are not visible here (presumably kHz, as in
    // `get_host_cpu_frequencies_khz`) - confirm.
    pub frequencies: Vec<u32>,
    /// Normalized CPU IPC ratio, or `None` when not configured.
    #[cfg(all(
        target_arch = "aarch64",
        any(target_os = "android", target_os = "linux")
    ))]
    pub normalized_cpu_ipc_ratio: Option<u32>,
    /// VCPU domain, or `None` when not configured.
    #[cfg(all(
        target_arch = "aarch64",
        any(target_os = "android", target_os = "linux")
    ))]
    pub vcpu_domain: Option<u32>,
    /// Path associated with the VCPU domain, or `None` when not configured.
    #[cfg(all(
        target_arch = "aarch64",
        any(target_os = "android", target_os = "linux")
    ))]
    pub vcpu_domain_path: Option<PathBuf>,
}
415
416/// Derives base VCPU properties from various config fields.
417pub fn derive_vcpu_properties(
418    vcpu_count: usize,
419    vcpu_capacity: &std::collections::BTreeMap<usize, u32>,
420    dynamic_power_coefficient: &std::collections::BTreeMap<usize, u32>,
421    vcpu_frequencies: &std::collections::BTreeMap<usize, Vec<u32>>,
422    #[cfg(all(
423        target_arch = "aarch64",
424        any(target_os = "android", target_os = "linux")
425    ))]
426    normalized_cpu_ipc_ratio: &std::collections::BTreeMap<usize, u32>,
427    #[cfg(all(
428        target_arch = "aarch64",
429        any(target_os = "android", target_os = "linux")
430    ))]
431    vcpu_domain: &std::collections::BTreeMap<usize, u32>,
432    #[cfg(all(
433        target_arch = "aarch64",
434        any(target_os = "android", target_os = "linux")
435    ))]
436    vcpu_domain_path: &std::collections::BTreeMap<usize, std::path::PathBuf>,
437) -> std::collections::BTreeMap<usize, VcpuProperties> {
438    let mut vcpu_properties = std::collections::BTreeMap::new();
439    for vcpu_id in 0..vcpu_count {
440        let vcpu_prop_capacity = vcpu_capacity.get(&vcpu_id).copied();
441
442        vcpu_properties.insert(
443            vcpu_id,
444            VcpuProperties {
445                capacity: vcpu_prop_capacity,
446                frequencies: vcpu_frequencies.get(&vcpu_id).cloned().unwrap_or_default(),
447                dynamic_power_coefficient: dynamic_power_coefficient.get(&vcpu_id).copied(),
448                #[cfg(all(
449                    target_arch = "aarch64",
450                    any(target_os = "android", target_os = "linux")
451                ))]
452                normalized_cpu_ipc_ratio: normalized_cpu_ipc_ratio.get(&vcpu_id).copied(),
453                #[cfg(all(
454                    target_arch = "aarch64",
455                    any(target_os = "android", target_os = "linux")
456                ))]
457                vcpu_domain: vcpu_domain.get(&vcpu_id).copied(),
458                #[cfg(all(
459                    target_arch = "aarch64",
460                    any(target_os = "android", target_os = "linux")
461                ))]
462                vcpu_domain_path: vcpu_domain_path.get(&vcpu_id).cloned(),
463            },
464        );
465    }
466    vcpu_properties
467}
468
/// Holds the pieces needed to build a VM. Passed to `build_vm` in the `LinuxArch` trait below to
/// create a `RunnableLinuxVm`.
///
/// Fields must stay in sorted order, which `#[sorted]` enforces at compile time. Several fields
/// only exist for particular target architectures or operating systems (see the `cfg`
/// attributes).
#[sorted]
pub struct VmComponents {
    #[cfg(all(target_arch = "x86_64", unix))]
    pub ac_adapter: bool,
    pub acpi_sdts: Vec<SDT>,
    pub android_fstab: Option<File>,
    pub boot_cpu: usize,
    pub bootorder_fw_cfg_blob: Vec<u8>,
    #[cfg(target_arch = "x86_64")]
    pub break_linux_pci_config_io: bool,

    pub delay_rt: bool,
    /// Interface for guest-controlled power management of assigned devices, if any.
    pub dev_pm: Option<DevicePowerManagerConfig>,
    pub extra_kernel_params: Vec<String>,
    #[cfg(target_arch = "x86_64")]
    pub force_s2idle: bool,
    pub fw_cfg_enable: bool,
    pub fw_cfg_parameters: Vec<FwCfgParameters>,
    pub host_cpu_topology: bool,
    pub hugepages: bool,
    pub hv_cfg: hypervisor::Config,
    pub initrd_image: Option<File>,
    pub itmt: bool,
    // NOTE(review): presumably the guest memory size in bytes - confirm.
    pub memory_size: u64,
    pub no_i8042: bool,
    pub no_rtc: bool,
    pub no_smt: bool,

    /// General PCI config.
    pub pci_config: PciConfig,
    pub pflash_block_size: u32,
    pub pflash_image: Option<File>,
    /// Pstore settings, if any.
    pub pstore: Option<Pstore>,
    /// A file to load as pVM firmware. Must be `Some` iff
    /// `hv_cfg.protection_type == ProtectionType::UnprotectedWithFirmware`.
    pub pvm_fw: Option<File>,
    pub rt_cpus: CpuSet,
    #[cfg(target_arch = "x86_64")]
    pub smbios: SmbiosOptions,
    pub smccc_trng: bool,
    /// The SVE config for Vcpus.
    #[cfg(target_arch = "aarch64")]
    pub sve_config: SveConfig,
    pub swiotlb: Option<u64>,
    /// Mapping of guest VCPU threads to host CPU cores, if any.
    pub vcpu_affinity: Option<VcpuAffinity>,
    /// List of vCPU clusters, mapped from pCPU clusters.
    pub vcpu_clusters: Vec<CpuSet>,
    // NOTE(review): presumably keyed by VCPU index, as produced by `derive_vcpu_properties` -
    // confirm.
    pub vcpu_properties: BTreeMap<usize, VcpuProperties>,
    #[cfg(any(target_os = "android", target_os = "linux"))]
    pub vfio_platform_pm: bool,
    #[cfg(all(
        target_arch = "aarch64",
        any(target_os = "android", target_os = "linux")
    ))]
    pub virt_cpufreq_v2: bool,
    /// The kernel or BIOS image file for the VM.
    pub vm_image: VmImage,
}
526
/// Holds the elements needed to run a Linux VM. Created by `build_vm`.
///
/// Fields must stay in sorted order, which `#[sorted]` enforces at compile time.
#[sorted]
pub struct RunnableLinuxVm {
    pub bat_control: Option<BatControl>,
    pub delay_rt: bool,
    pub devices_thread: Option<std::thread::JoinHandle<()>>,
    // NOTE(review): the `u8` key is presumably a PCI bus number - confirm.
    pub hotplug_bus: BTreeMap<u8, Arc<Mutex<dyn HotPlugBus>>>,
    pub hypercall_bus: Arc<Bus>,
    pub io_bus: Arc<Bus>,
    /// The architecture-specific IRQ chip (see the `IrqChipArch` alias).
    pub irq_chip: Arc<dyn IrqChipArch>,
    pub mmio_bus: Arc<Bus>,
    pub no_smt: bool,
    pub pid_debug_label_map: BTreeMap<u32, String>,
    #[cfg(any(target_os = "android", target_os = "linux"))]
    pub platform_devices: Vec<Arc<Mutex<dyn BusDevice>>>,
    pub pm: Option<Arc<Mutex<dyn PmResource + Send>>>,
    /// Devices to be notified before the system resumes from the S3 suspended state.
    pub resume_notify_devices: Vec<Arc<Mutex<dyn BusResumeDevice>>>,
    pub root_config: Arc<Mutex<PciRoot>>,
    pub rt_cpus: CpuSet,
    pub suspend_tube: (Arc<Mutex<SendTube>>, RecvTube),
    /// Mapping of guest VCPU threads to host CPU cores, if any.
    pub vcpu_affinity: Option<VcpuAffinity>,
    pub vcpu_count: usize,
    pub vcpu_init: Vec<VcpuInitArch>,
    /// If vcpus is None, then it's the responsibility of the vcpu thread to create vcpus.
    /// If it's Some, then `build_vm` already created the vcpus.
    pub vcpus: Option<Vec<Arc<dyn VcpuArch>>>,
    pub vm: Arc<dyn VmArch>,
    pub vm_request_tubes: Vec<Tube>,
}
557
/// The device and optional jail.
pub struct VirtioDeviceStub {
    /// The virtio device.
    pub dev: Box<dyn VirtioDevice>,
    /// Jail for the device, if any. On Windows, `Minijail` is an alias for
    /// `jail::FakeMinijailStub`.
    pub jail: Option<Minijail>,
}
563
/// Trait which is implemented for each Linux Architecture in order to
/// set up the memory, cpus, and system devices and to boot the kernel.
///
/// All items are associated functions (none take `self`), so implementors act as a namespace
/// for the architecture-specific logic.
pub trait LinuxArch {
    /// Error type returned by the fallible functions of this trait.
    type Error: StdError;
    /// Architecture-specific memory layout details, produced by `arch_memory_layout` and passed
    /// back into the later setup functions.
    type ArchMemoryLayout;

    /// Decide architecture specific memory layout details to be used by later stages of the VM
    /// setup.
    fn arch_memory_layout(
        components: &VmComponents,
    ) -> std::result::Result<Self::ArchMemoryLayout, Self::Error>;

    /// Returns a Vec of the valid memory regions as (address, length, options) tuples. These
    /// should be used to configure the `GuestMemory` structure for the platform.
    ///
    /// # Arguments
    ///
    /// * `components` - Parts used to determine the memory layout.
    /// * `arch_memory_layout` - Layout details previously returned by `arch_memory_layout`.
    /// * `hypervisor` - The hypervisor in use.
    fn guest_memory_layout(
        components: &VmComponents,
        arch_memory_layout: &Self::ArchMemoryLayout,
        hypervisor: &impl hypervisor::Hypervisor,
    ) -> std::result::Result<Vec<(GuestAddress, u64, MemoryRegionOptions)>, Self::Error>;

    /// Gets the configuration for a new `SystemAllocator` that fits the given `Vm`'s memory layout.
    ///
    /// This is the per-architecture template for constructing the `SystemAllocator`. Platform
    /// agnostic modifications may be made to this configuration, but the final `SystemAllocator`
    /// will be at least as strict as this configuration.
    ///
    /// # Arguments
    ///
    /// * `vm` - The virtual machine to be used as a template for the `SystemAllocator`.
    /// * `arch_memory_layout` - Layout details previously returned by `arch_memory_layout`.
    fn get_system_allocator_config(
        vm: &dyn Vm,
        arch_memory_layout: &Self::ArchMemoryLayout,
    ) -> SystemAllocatorConfig;

    /// Takes `VmComponents` and generates a `RunnableLinuxVm`.
    ///
    /// # Arguments
    ///
    /// * `components` - Parts to use to build the VM.
    /// * `arch_memory_layout` - Layout details previously returned by `arch_memory_layout`.
    /// * `vm_evt_wrtube` - Tube used by sub-devices to request that crosvm exit because guest wants
    ///   to stop/shut down or requested reset.
    /// * `system_allocator` - Allocator created by this trait's implementation of
    ///   `get_system_allocator_config`.
    /// * `serial_parameters` - Definitions for how the serial devices should be configured.
    /// * `serial_jail` - Jail used for serial devices created here.
    /// * `battery` - Defines what battery device will be created.
    /// * `vm` - A VM implementation to build upon.
    /// * `ramoops_region` - Region allocated for ramoops.
    /// * `devices` - The devices to be built into the VM.
    /// * `irq_chip` - The IRQ chip implementation for the VM.
    /// * `vcpu_ids` - Mutable list of vcpu ids. NOTE(review): presumably filled in by the
    ///   implementation - confirm.
    /// * `dump_device_tree_blob` - Optional path. NOTE(review): presumably where the generated
    ///   device tree blob is dumped - confirm.
    /// * `debugcon_jail` - Jail used for debugcon devices created here.
    /// * `pflash_jail` - Jail used for pflash device created here (x86_64 only).
    /// * `fw_cfg_jail` - Jail used for fw_cfg device created here (x86_64 only).
    /// * `swap_controller` - Optional swap controller (only with the `swap` feature).
    /// * `guest_suspended_cvar` - Optional shared flag and condition variable. NOTE(review):
    ///   presumably signals that the guest has suspended - confirm.
    /// * `device_tree_overlays` - Device tree overlay binaries
    /// * `fdt_position` - Optional `FdtPosition` selecting where the FDT is placed.
    /// * `no_pmu` - NOTE(review): presumably disables the virtual PMU when true - confirm.
    fn build_vm(
        components: VmComponents,
        arch_memory_layout: &Self::ArchMemoryLayout,
        vm_evt_wrtube: &SendTube,
        system_allocator: &mut SystemAllocator,
        serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
        serial_jail: Option<Minijail>,
        battery: (Option<BatteryType>, Option<Minijail>),
        vm: Arc<dyn VmArch>,
        ramoops_region: Option<pstore::RamoopsRegion>,
        devices: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>,
        irq_chip: Arc<dyn IrqChipArch>,
        vcpu_ids: &mut Vec<usize>,
        dump_device_tree_blob: Option<PathBuf>,
        debugcon_jail: Option<Minijail>,
        #[cfg(target_arch = "x86_64")] pflash_jail: Option<Minijail>,
        #[cfg(target_arch = "x86_64")] fw_cfg_jail: Option<Minijail>,
        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
        guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>,
        device_tree_overlays: Vec<DtbOverlay>,
        fdt_position: Option<FdtPosition>,
        no_pmu: bool,
    ) -> std::result::Result<RunnableLinuxVm, Self::Error>;

    /// Configures the vcpu and should be called once per vcpu from the vcpu's thread.
    ///
    /// # Arguments
    ///
    /// * `vm` - The virtual machine object.
    /// * `hypervisor` - The `Hypervisor` that created the vcpu.
    /// * `irq_chip` - The `IrqChip` associated with this vm.
    /// * `vcpu` - The VCPU object to configure.
    /// * `vcpu_init` - The data required to initialize VCPU registers and other state.
    /// * `vcpu_id` - The id of the given `vcpu`.
    /// * `num_vcpus` - Number of virtual CPUs the guest will have.
    /// * `cpu_config` - CPU feature configurations.
    fn configure_vcpu(
        vm: &dyn Vm,
        hypervisor: &dyn HypervisorArch,
        irq_chip: &dyn IrqChipArch,
        vcpu: &dyn VcpuArch,
        vcpu_init: VcpuInitArch,
        vcpu_id: usize,
        num_vcpus: usize,
        cpu_config: Option<CpuConfigArch>,
    ) -> Result<(), Self::Error>;

    /// Configures a PCI device and adds it to the VM, returning the device's PCI address.
    ///
    /// The `minijail` argument only exists on Linux/Android and `swap_controller` only with the
    /// `swap` feature.
    fn register_pci_device(
        linux: &mut RunnableLinuxVm,
        device: Box<dyn PciDevice>,
        #[cfg(any(target_os = "android", target_os = "linux"))] minijail: Option<Minijail>,
        resources: &mut SystemAllocator,
        hp_control_tube: &mpsc::Sender<PciRootCommand>,
        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
    ) -> Result<PciAddress, Self::Error>;

    /// Returns frequency map for each of the host's logical cores.
    fn get_host_cpu_frequencies_khz() -> Result<BTreeMap<usize, Vec<u32>>, Self::Error>;

    /// Returns max-freq map of the host's logical cores.
    fn get_host_cpu_max_freq_khz() -> Result<BTreeMap<usize, u32>, Self::Error>;

    /// Returns capacity map of the host's logical cores.
    fn get_host_cpu_capacity() -> Result<BTreeMap<usize, u32>, Self::Error>;

    /// Returns cluster masks for each of the host's logical cores.
    fn get_host_cpu_clusters() -> Result<Vec<CpuSet>, Self::Error>;
}
691
/// Architecture-specific operations for inspecting and controlling a guest vCPU on behalf of a
/// GDB session. Only compiled when the `gdb` feature is enabled.
///
/// Register types come from the `GdbArch` alias selected for the target architecture.
#[cfg(feature = "gdb")]
pub trait GdbOps {
    /// Error type returned by the operations of this trait.
    type Error: StdError;

    /// Reads vCPU's registers.
    fn read_registers(vcpu: &dyn VcpuArch) -> Result<<GdbArch as Arch>::Registers, Self::Error>;

    /// Writes vCPU's registers.
    fn write_registers(
        vcpu: &dyn VcpuArch,
        regs: &<GdbArch as Arch>::Registers,
    ) -> Result<(), Self::Error>;

    /// Reads `len` bytes from the guest memory starting at `vaddr`.
    fn read_memory(
        vcpu: &dyn VcpuArch,
        guest_mem: &GuestMemory,
        vaddr: GuestAddress,
        len: usize,
    ) -> Result<Vec<u8>, Self::Error>;

    /// Writes the bytes of `buf` to the guest memory starting at `vaddr`.
    fn write_memory(
        vcpu: &dyn VcpuArch,
        guest_mem: &GuestMemory,
        vaddr: GuestAddress,
        buf: &[u8],
    ) -> Result<(), Self::Error>;

    /// Reads bytes from the guest register.
    ///
    /// Returns an empty vector if `reg_id` is valid but the register is not available.
    fn read_register(
        vcpu: &dyn VcpuArch,
        reg_id: <GdbArch as Arch>::RegId,
    ) -> Result<Vec<u8>, Self::Error>;

    /// Writes bytes to the specified guest register.
    fn write_register(
        vcpu: &dyn VcpuArch,
        reg_id: <GdbArch as Arch>::RegId,
        data: &[u8],
    ) -> Result<(), Self::Error>;

    /// Make the next vCPU's run single-step.
    fn enable_singlestep(vcpu: &dyn VcpuArch) -> Result<(), Self::Error>;

    /// Get maximum number of hardware breakpoints.
    fn get_max_hw_breakpoints(vcpu: &dyn VcpuArch) -> Result<usize, Self::Error>;

    /// Set hardware breakpoints at the given addresses.
    fn set_hw_breakpoints(
        vcpu: &dyn VcpuArch,
        breakpoints: &[GuestAddress],
    ) -> Result<(), Self::Error>;
}
748
/// Errors for device manager.
///
/// Variants must stay in sorted order, which `#[sorted]` enforces at compile time. Some variants
/// only exist on Linux/Android builds.
#[sorted]
#[derive(Error, Debug)]
pub enum DeviceRegistrationError {
    /// No more MMIO space available.
    #[error("no more addresses are available")]
    AddrsExhausted,
    /// Could not allocate device address space for the device.
    #[error("Allocating device addresses: {0}")]
    AllocateDeviceAddrs(PciDeviceError),
    /// Could not allocate IO space for the device.
    #[error("Allocating IO addresses: {0}")]
    AllocateIoAddrs(PciDeviceError),
    /// Could not allocate MMIO or IO resource for the device.
    #[error("Allocating IO resource: {0}")]
    AllocateIoResource(resources::Error),
    /// Could not allocate an IRQ number.
    #[error("Allocating IRQ number")]
    AllocateIrq,
    /// Could not allocate IRQ resource for the device.
    #[cfg(any(target_os = "android", target_os = "linux"))]
    #[error("Allocating IRQ resource: {0}")]
    AllocateIrqResource(devices::vfio::VfioError),
    /// Could not attach the device to its power domain.
    #[error("failed to attach the device to its power domain: {0}")]
    AttachDevicePowerDomain(anyhow::Error),
    /// Broken pci topology
    #[error("pci topology is broken")]
    BrokenPciTopology,
    /// Unable to clone a jail for the device.
    #[cfg(any(target_os = "android", target_os = "linux"))]
    #[error("failed to clone jail: {0}")]
    CloneJail(minijail::Error),
    /// Appending to kernel command line failed.
    #[error("unable to add device to kernel command line: {0}")]
    Cmdline(kernel_cmdline::Error),
    /// Configure window size failed.
    #[error("failed to configure window size: {0}")]
    ConfigureWindowSize(PciDeviceError),
    /// Unable to create a pipe.
    #[error("failed to create pipe: {0}")]
    CreatePipe(base::Error),
    /// Unable to create a root.
    #[error("failed to create pci root: {0}")]
    CreateRoot(anyhow::Error),
    /// Unable to create serial device from serial parameters
    #[error("failed to create serial device: {0}")]
    CreateSerialDevice(devices::SerialError),
    /// Unable to create tube
    #[error("failed to create tube: {0}")]
    CreateTube(base::TubeError),
    /// Could not clone an event.
    #[error("failed to clone event: {0}")]
    EventClone(base::Error),
    /// Could not create an event.
    #[error("failed to create event: {0}")]
    EventCreate(base::Error),
    /// Failed to generate ACPI content.
    #[error("failed to generate ACPI content")]
    GenerateAcpi,
    /// No more IRQs are available.
    #[error("no more IRQs are available")]
    IrqsExhausted,
    /// VFIO device is missing a DT symbol.
    #[error("cannot match VFIO device to DT node due to a missing symbol")]
    MissingDeviceTreeSymbol,
    /// Missing a required serial device.
    #[error("missing required serial device {0}")]
    MissingRequiredSerialDevice(u8),
    /// Could not add a device to the mmio bus.
    #[error("failed to add to mmio bus: {0}")]
    MmioInsert(BusError),
    /// Failed to insert device into PCI root.
    #[error("failed to insert device into PCI root: {0}")]
    PciRootAddDevice(PciDeviceError),
    #[cfg(any(target_os = "android", target_os = "linux"))]
    /// Failed to initialize proxy device for jailed device.
    #[error("failed to create proxy device: {0}")]
    ProxyDeviceCreation(devices::ProxyError),
    #[cfg(any(target_os = "android", target_os = "linux"))]
    /// Failed to register battery device.
    #[error("failed to register battery device to VM: {0}")]
    RegisterBattery(devices::BatteryError),
    /// Could not register PCI device to pci root bus
    // NOTE(review): unlike most variants, the message below does not include the wrapped
    // `SendError` (`{0}`) - confirm whether that is intentional.
    #[error("failed to register PCI device to pci root bus")]
    RegisterDevice(SendError<PciRootCommand>),
    /// Could not register PCI device capabilities.
    #[error("could not register PCI device capabilities: {0}")]
    RegisterDeviceCapabilities(PciDeviceError),
    /// Failed to register ioevent with VM.
    #[error("failed to register ioevent to VM: {0}")]
    RegisterIoevent(base::Error),
    /// Failed to register irq event with VM.
    #[error("failed to register irq event to VM: {0}")]
    RegisterIrqfd(base::Error),
    /// Could not setup VFIO platform IRQ for the device.
    #[error("Setting up VFIO platform IRQ: {0}")]
    SetupVfioPlatformIrq(anyhow::Error),
}
847
848/// Config a PCI device for used by this vm.
849pub fn configure_pci_device(
850    linux: &mut RunnableLinuxVm,
851    mut device: Box<dyn PciDevice>,
852    #[cfg(any(target_os = "android", target_os = "linux"))] jail: Option<Minijail>,
853    resources: &mut SystemAllocator,
854    hp_control_tube: &mpsc::Sender<PciRootCommand>,
855    #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
856) -> Result<PciAddress, DeviceRegistrationError> {
857    // Allocate PCI device address before allocating BARs.
858    let pci_address = device
859        .allocate_address(resources)
860        .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
861
862    // Allocate ranges that may need to be in the low MMIO region (MmioType::Low).
863    let mmio_ranges = device
864        .allocate_io_bars(resources)
865        .map_err(DeviceRegistrationError::AllocateIoAddrs)?;
866
867    // Allocate device ranges that may be in low or high MMIO after low-only ranges.
868    let device_ranges = device
869        .allocate_device_bars(resources)
870        .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
871
872    // If device is a pcie bridge, add its pci bus to pci root
873    if let Some(pci_bus) = device.get_new_pci_bus() {
874        hp_control_tube
875            .send(PciRootCommand::AddBridge(pci_bus))
876            .map_err(DeviceRegistrationError::RegisterDevice)?;
877        let bar_ranges = Vec::new();
878        device
879            .configure_bridge_window(resources, &bar_ranges)
880            .map_err(DeviceRegistrationError::ConfigureWindowSize)?;
881    }
882
883    // Do not suggest INTx for hot-plug devices.
884    let intx_event = devices::IrqLevelEvent::new().map_err(DeviceRegistrationError::EventCreate)?;
885
886    if let PreferredIrq::Fixed { pin, gsi } = device.preferred_irq() {
887        resources.reserve_irq(gsi);
888
889        device.assign_irq(
890            intx_event
891                .try_clone()
892                .map_err(DeviceRegistrationError::EventClone)?,
893            pin,
894            gsi,
895        );
896
897        linux
898            .irq_chip
899            .register_level_irq_event(gsi, &intx_event, IrqEventSource::from_device(&device))
900            .map_err(DeviceRegistrationError::RegisterIrqfd)?;
901    }
902
903    let mut keep_rds = device.keep_rds();
904    syslog::push_descriptors(&mut keep_rds);
905    cros_tracing::push_descriptors!(&mut keep_rds);
906    metrics::push_descriptors(&mut keep_rds);
907
908    device
909        .register_device_capabilities()
910        .map_err(DeviceRegistrationError::RegisterDeviceCapabilities)?;
911
912    #[cfg(any(target_os = "android", target_os = "linux"))]
913    let arced_dev: Arc<Mutex<dyn BusDevice>> = if let Some(jail) = jail {
914        let proxy = ProxyDevice::new(
915            device,
916            jail,
917            keep_rds,
918            #[cfg(feature = "swap")]
919            swap_controller,
920        )
921        .map_err(DeviceRegistrationError::ProxyDeviceCreation)?;
922        linux
923            .pid_debug_label_map
924            .insert(proxy.pid() as u32, proxy.debug_label());
925        Arc::new(Mutex::new(proxy))
926    } else {
927        device.on_sandboxed();
928        Arc::new(Mutex::new(device))
929    };
930
931    #[cfg(windows)]
932    let arced_dev = {
933        device.on_sandboxed();
934        Arc::new(Mutex::new(device))
935    };
936
937    #[cfg(any(target_os = "android", target_os = "linux"))]
938    hp_control_tube
939        .send(PciRootCommand::Add(pci_address, arced_dev.clone()))
940        .map_err(DeviceRegistrationError::RegisterDevice)?;
941
942    for range in &mmio_ranges {
943        linux
944            .mmio_bus
945            .insert(arced_dev.clone(), range.addr, range.size)
946            .map_err(DeviceRegistrationError::MmioInsert)?;
947    }
948
949    for range in &device_ranges {
950        linux
951            .mmio_bus
952            .insert(arced_dev.clone(), range.addr, range.size)
953            .map_err(DeviceRegistrationError::MmioInsert)?;
954    }
955
956    Ok(pci_address)
957}
958
959// Generate pci topology starting from parent bus
960fn generate_pci_topology(
961    parent_bus: Arc<Mutex<PciBus>>,
962    resources: &mut SystemAllocator,
963    io_ranges: &mut BTreeMap<usize, Vec<BarRange>>,
964    device_ranges: &mut BTreeMap<usize, Vec<BarRange>>,
965    device_addrs: &[PciAddress],
966    devices: &mut Vec<(Box<dyn PciDevice>, Option<Minijail>)>,
967) -> Result<(Vec<BarRange>, u8), DeviceRegistrationError> {
968    let mut bar_ranges = Vec::new();
969    let bus_num = parent_bus.lock().get_bus_num();
970    let mut subordinate_bus = bus_num;
971    for (dev_idx, addr) in device_addrs.iter().enumerate() {
972        // Only target for devices that located on this bus
973        if addr.bus == bus_num {
974            // If this device is a pci bridge (a.k.a., it has a pci bus structure),
975            // create its topology recursively
976            if let Some(child_bus) = devices[dev_idx].0.get_new_pci_bus() {
977                let (child_bar_ranges, child_sub_bus) = generate_pci_topology(
978                    child_bus.clone(),
979                    resources,
980                    io_ranges,
981                    device_ranges,
982                    device_addrs,
983                    devices,
984                )?;
985                let device = &mut devices[dev_idx].0;
986                parent_bus
987                    .lock()
988                    .add_child_bus(child_bus.clone())
989                    .map_err(|_| DeviceRegistrationError::BrokenPciTopology)?;
990                let bridge_window = device
991                    .configure_bridge_window(resources, &child_bar_ranges)
992                    .map_err(DeviceRegistrationError::ConfigureWindowSize)?;
993                bar_ranges.extend(bridge_window);
994
995                let ranges = device
996                    .allocate_io_bars(resources)
997                    .map_err(DeviceRegistrationError::AllocateIoAddrs)?;
998                io_ranges.insert(dev_idx, ranges.clone());
999                bar_ranges.extend(ranges);
1000
1001                let ranges = device
1002                    .allocate_device_bars(resources)
1003                    .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
1004                device_ranges.insert(dev_idx, ranges.clone());
1005                bar_ranges.extend(ranges);
1006
1007                device.set_subordinate_bus(child_sub_bus);
1008
1009                subordinate_bus = std::cmp::max(subordinate_bus, child_sub_bus);
1010            }
1011        }
1012    }
1013
1014    for (dev_idx, addr) in device_addrs.iter().enumerate() {
1015        if addr.bus == bus_num {
1016            let device = &mut devices[dev_idx].0;
1017            // Allocate MMIO for non-bridge devices
1018            if device.get_new_pci_bus().is_none() {
1019                let ranges = device
1020                    .allocate_io_bars(resources)
1021                    .map_err(DeviceRegistrationError::AllocateIoAddrs)?;
1022                io_ranges.insert(dev_idx, ranges.clone());
1023                bar_ranges.extend(ranges);
1024
1025                let ranges = device
1026                    .allocate_device_bars(resources)
1027                    .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
1028                device_ranges.insert(dev_idx, ranges.clone());
1029                bar_ranges.extend(ranges);
1030            }
1031        }
1032    }
1033    Ok((bar_ranges, subordinate_bus))
1034}
1035
1036/// Ensure all PCI devices have an assigned PCI address.
1037pub fn assign_pci_addresses(
1038    devices: &mut [(Box<dyn BusDeviceObj>, Option<Minijail>)],
1039    resources: &mut SystemAllocator,
1040) -> Result<(), DeviceRegistrationError> {
1041    // First allocate devices with a preferred address.
1042    for pci_device in devices
1043        .iter_mut()
1044        .filter_map(|(device, _jail)| device.as_pci_device_mut())
1045        .filter(|pci_device| pci_device.preferred_address().is_some())
1046    {
1047        let _ = pci_device
1048            .allocate_address(resources)
1049            .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
1050    }
1051
1052    // Then allocate addresses for the remaining devices.
1053    for pci_device in devices
1054        .iter_mut()
1055        .filter_map(|(device, _jail)| device.as_pci_device_mut())
1056        .filter(|pci_device| pci_device.preferred_address().is_none())
1057    {
1058        let _ = pci_device
1059            .allocate_address(resources)
1060            .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
1061    }
1062
1063    Ok(())
1064}
1065
1066/// Creates a root PCI device for use by this Vm.
1067pub fn generate_pci_root(
1068    mut devices: Vec<(Box<dyn PciDevice>, Option<Minijail>)>,
1069    irq_chip: &dyn IrqChip,
1070    mmio_bus: Arc<Bus>,
1071    mmio_base: GuestAddress,
1072    mmio_register_bit_num: usize,
1073    io_bus: Arc<Bus>,
1074    resources: &mut SystemAllocator,
1075    mut vm: &dyn Vm,
1076    max_irqs: usize,
1077    vcfg_base: Option<u64>,
1078    #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
1079) -> Result<
1080    (
1081        PciRoot,
1082        Vec<(PciAddress, u32, PciInterruptPin)>,
1083        BTreeMap<u32, String>,
1084        BTreeMap<PciAddress, Vec<u8>>,
1085        BTreeMap<PciAddress, Vec<u8>>,
1086    ),
1087    DeviceRegistrationError,
1088> {
1089    let mut device_addrs = Vec::new();
1090
1091    for (device, _jail) in devices.iter_mut() {
1092        let address = device
1093            .allocate_address(resources)
1094            .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
1095        device_addrs.push(address);
1096    }
1097
1098    let mut device_ranges = BTreeMap::new();
1099    let mut io_ranges = BTreeMap::new();
1100    let root_bus = Arc::new(Mutex::new(PciBus::new(0, 0, false)));
1101
1102    generate_pci_topology(
1103        root_bus.clone(),
1104        resources,
1105        &mut io_ranges,
1106        &mut device_ranges,
1107        &device_addrs,
1108        &mut devices,
1109    )?;
1110
1111    let mut root = PciRoot::new(
1112        vm,
1113        Arc::downgrade(&mmio_bus),
1114        mmio_base,
1115        mmio_register_bit_num,
1116        Arc::downgrade(&io_bus),
1117        root_bus,
1118    )
1119    .map_err(DeviceRegistrationError::CreateRoot)?;
1120    #[cfg_attr(windows, allow(unused_mut))]
1121    let mut pid_labels = BTreeMap::new();
1122
1123    // Allocate legacy INTx
1124    let mut pci_irqs = Vec::new();
1125    let mut irqs: Vec<u32> = Vec::new();
1126
1127    // Mapping of (bus, dev, pin) -> IRQ number.
1128    let mut dev_pin_irq = BTreeMap::new();
1129
1130    for (dev_idx, (device, _jail)) in devices.iter_mut().enumerate() {
1131        let pci_address = device_addrs[dev_idx];
1132
1133        let irq = match device.preferred_irq() {
1134            PreferredIrq::Fixed { pin, gsi } => {
1135                // The device reported a preferred IRQ, so use that rather than allocating one.
1136                resources.reserve_irq(gsi);
1137                Some((pin, gsi))
1138            }
1139            PreferredIrq::Any => {
1140                // The device did not provide a preferred IRQ but requested one, so allocate one.
1141
1142                // Choose a pin based on the slot's function number. Function 0 must always use
1143                // INTA# for single-function devices per the PCI spec, and we choose to use INTA#
1144                // for function 0 on multifunction devices and distribute the remaining functions
1145                // evenly across the other pins.
1146                let pin = match pci_address.func % 4 {
1147                    0 => PciInterruptPin::IntA,
1148                    1 => PciInterruptPin::IntB,
1149                    2 => PciInterruptPin::IntC,
1150                    _ => PciInterruptPin::IntD,
1151                };
1152
1153                // If an IRQ number has already been assigned for a different function with this
1154                // (bus, device, pin) combination, use it. Otherwise allocate a new one and insert
1155                // it into the map.
1156                let pin_key = (pci_address.bus, pci_address.dev, pin);
1157                let irq_num = if let Some(irq_num) = dev_pin_irq.get(&pin_key) {
1158                    *irq_num
1159                } else {
1160                    // If we have allocated fewer than `max_irqs` total, add a new irq to the `irqs`
1161                    // pool. Otherwise, share one of the existing `irqs`.
1162                    let irq_num = if irqs.len() < max_irqs {
1163                        let irq_num = resources
1164                            .allocate_irq()
1165                            .ok_or(DeviceRegistrationError::AllocateIrq)?;
1166                        irqs.push(irq_num);
1167                        irq_num
1168                    } else {
1169                        // Pick one of the existing IRQs to share, using `dev_idx` to distribute IRQ
1170                        // sharing evenly across devices.
1171                        irqs[dev_idx % max_irqs]
1172                    };
1173
1174                    dev_pin_irq.insert(pin_key, irq_num);
1175                    irq_num
1176                };
1177                Some((pin, irq_num))
1178            }
1179            PreferredIrq::None => {
1180                // The device does not want an INTx# IRQ.
1181                None
1182            }
1183        };
1184
1185        if let Some((pin, gsi)) = irq {
1186            let intx_event =
1187                devices::IrqLevelEvent::new().map_err(DeviceRegistrationError::EventCreate)?;
1188
1189            device.assign_irq(
1190                intx_event
1191                    .try_clone()
1192                    .map_err(DeviceRegistrationError::EventClone)?,
1193                pin,
1194                gsi,
1195            );
1196
1197            irq_chip
1198                .register_level_irq_event(gsi, &intx_event, IrqEventSource::from_device(device))
1199                .map_err(DeviceRegistrationError::RegisterIrqfd)?;
1200
1201            pci_irqs.push((pci_address, gsi, pin));
1202        }
1203    }
1204
1205    // To prevent issues where device's on_sandbox may spawn thread before all
1206    // sandboxed devices are sandboxed we partition iterator to go over sandboxed
1207    // first. This is needed on linux platforms. On windows, this is a no-op since
1208    // jails are always None, even for sandboxed devices.
1209    let devices = {
1210        let (sandboxed, non_sandboxed): (Vec<_>, Vec<_>) = devices
1211            .into_iter()
1212            .enumerate()
1213            .partition(|(_, (_, jail))| jail.is_some());
1214        sandboxed.into_iter().chain(non_sandboxed)
1215    };
1216
1217    let mut amls = BTreeMap::new();
1218    let mut gpe_scope_amls = BTreeMap::new();
1219    for (dev_idx, dev_value) in devices {
1220        #[cfg(any(target_os = "android", target_os = "linux"))]
1221        let (mut device, jail) = dev_value;
1222        #[cfg(windows)]
1223        let (mut device, _) = dev_value;
1224        let address = device_addrs[dev_idx];
1225
1226        let mut keep_rds = device.keep_rds();
1227        syslog::push_descriptors(&mut keep_rds);
1228        cros_tracing::push_descriptors!(&mut keep_rds);
1229        metrics::push_descriptors(&mut keep_rds);
1230        keep_rds.append(&mut vm.get_memory().as_raw_descriptors());
1231
1232        let ranges = io_ranges.remove(&dev_idx).unwrap_or_default();
1233        let device_ranges = device_ranges.remove(&dev_idx).unwrap_or_default();
1234        device
1235            .register_device_capabilities()
1236            .map_err(DeviceRegistrationError::RegisterDeviceCapabilities)?;
1237
1238        if let Some(vcfg_base) = vcfg_base {
1239            let (methods, shm) = device.generate_acpi_methods();
1240            if !methods.is_empty() {
1241                amls.insert(address, methods);
1242            }
1243            if let Some((offset, mmap)) = shm {
1244                let _ = vm.add_memory_region(
1245                    GuestAddress(vcfg_base + offset as u64),
1246                    Box::new(mmap),
1247                    false,
1248                    false,
1249                    MemCacheType::CacheCoherent,
1250                );
1251            }
1252        }
1253        let gpe_nr = device.set_gpe(resources);
1254
1255        #[cfg(any(target_os = "android", target_os = "linux"))]
1256        let arced_dev: Arc<Mutex<dyn BusDevice>> = if let Some(jail) = jail {
1257            let proxy = ProxyDevice::new(
1258                device,
1259                jail,
1260                keep_rds,
1261                #[cfg(feature = "swap")]
1262                swap_controller,
1263            )
1264            .map_err(DeviceRegistrationError::ProxyDeviceCreation)?;
1265            pid_labels.insert(proxy.pid() as u32, proxy.debug_label());
1266            Arc::new(Mutex::new(proxy))
1267        } else {
1268            device.on_sandboxed();
1269            Arc::new(Mutex::new(device))
1270        };
1271        #[cfg(windows)]
1272        let arced_dev = {
1273            device.on_sandboxed();
1274            Arc::new(Mutex::new(device))
1275        };
1276        root.add_device(address, arced_dev.clone(), &mut vm)
1277            .map_err(DeviceRegistrationError::PciRootAddDevice)?;
1278        for range in &ranges {
1279            mmio_bus
1280                .insert(arced_dev.clone(), range.addr, range.size)
1281                .map_err(DeviceRegistrationError::MmioInsert)?;
1282        }
1283
1284        for range in &device_ranges {
1285            mmio_bus
1286                .insert(arced_dev.clone(), range.addr, range.size)
1287                .map_err(DeviceRegistrationError::MmioInsert)?;
1288        }
1289
1290        if let Some(gpe_nr) = gpe_nr {
1291            if let Some(acpi_path) = root.acpi_path(&address) {
1292                let mut gpe_aml = Vec::new();
1293
1294                GpeScope {}.cast_to_aml_bytes(
1295                    &mut gpe_aml,
1296                    gpe_nr,
1297                    format!("\\{acpi_path}").as_str(),
1298                );
1299                if !gpe_aml.is_empty() {
1300                    gpe_scope_amls.insert(address, gpe_aml);
1301                }
1302            }
1303        }
1304    }
1305
1306    Ok((root, pci_irqs, pid_labels, amls, gpe_scope_amls))
1307}
1308
/// Errors for image loading.
///
/// Returned by `load_image` and `load_image_high`.
#[sorted]
#[derive(Error, Debug)]
pub enum LoadImageError {
    /// The requested alignment (the payload) is not a power of two.
    #[error("Alignment not a power of two: {0}")]
    BadAlignment(u64),
    /// Querying the length of the image file failed.
    #[error("Getting image size failed: {0}")]
    GetLen(io::Error),
    /// Obtaining the destination slice of guest memory failed.
    #[error("GuestMemory get slice failed: {0}")]
    GuestMemorySlice(GuestMemoryError),
    /// The image size (the payload, in bytes) exceeds `u32::MAX` or the space available for it.
    #[error("Image size too large: {0}")]
    ImageSizeTooLarge(u64),
    /// No guest memory region satisfies the address, alignment and size requirements.
    #[error("No suitable memory region found")]
    NoSuitableMemoryRegion,
    /// Reading the image file into guest memory failed.
    #[error("Reading image into memory failed: {0}")]
    ReadToMemory(io::Error),
    /// The image file is empty.
    #[error("Cannot load zero-sized image")]
    ZeroSizedImage,
}
1328
1329/// Load an image from a file into guest memory.
1330///
1331/// # Arguments
1332///
1333/// * `guest_mem` - The memory to be used by the guest.
1334/// * `guest_addr` - The starting address to load the image in the guest memory.
1335/// * `max_size` - The amount of space in bytes available in the guest memory for the image.
1336/// * `image` - The file containing the image to be loaded.
1337///
1338/// The size in bytes of the loaded image is returned.
1339pub fn load_image<F>(
1340    guest_mem: &GuestMemory,
1341    image: &mut F,
1342    guest_addr: GuestAddress,
1343    max_size: u64,
1344) -> Result<u32, LoadImageError>
1345where
1346    F: FileReadWriteAtVolatile + FileGetLen,
1347{
1348    let size = image.get_len().map_err(LoadImageError::GetLen)?;
1349
1350    if size > u32::MAX as u64 || size > max_size {
1351        return Err(LoadImageError::ImageSizeTooLarge(size));
1352    }
1353
1354    // This is safe due to the bounds check above.
1355    let size = size as u32;
1356
1357    let guest_slice = guest_mem
1358        .get_slice_at_addr(guest_addr, size as usize)
1359        .map_err(LoadImageError::GuestMemorySlice)?;
1360    image
1361        .read_exact_at_volatile(guest_slice, 0)
1362        .map_err(LoadImageError::ReadToMemory)?;
1363
1364    Ok(size)
1365}
1366
1367/// Load an image from a file into guest memory at the highest possible address.
1368///
1369/// # Arguments
1370///
1371/// * `guest_mem` - The memory to be used by the guest.
1372/// * `image` - The file containing the image to be loaded.
1373/// * `min_guest_addr` - The minimum address of the start of the image.
1374/// * `max_guest_addr` - The address to load the last byte of the image.
1375/// * `region_filter` - The optional filter function for determining if the given guest memory
1376///   region is suitable for loading the image into it.
1377/// * `align` - The minimum alignment of the start address of the image in bytes (must be a power of
1378///   two).
1379///
1380/// The guest address and size in bytes of the loaded image are returned.
1381pub fn load_image_high<F>(
1382    guest_mem: &GuestMemory,
1383    image: &mut F,
1384    min_guest_addr: GuestAddress,
1385    max_guest_addr: GuestAddress,
1386    region_filter: Option<fn(&MemoryRegionInformation) -> bool>,
1387    align: u64,
1388) -> Result<(GuestAddress, u32), LoadImageError>
1389where
1390    F: FileReadWriteAtVolatile + FileGetLen,
1391{
1392    if !align.is_power_of_two() {
1393        return Err(LoadImageError::BadAlignment(align));
1394    }
1395
1396    let max_size = max_guest_addr.offset_from(min_guest_addr) & !(align - 1);
1397    let size = image.get_len().map_err(LoadImageError::GetLen)?;
1398
1399    if size == 0 {
1400        return Err(LoadImageError::ZeroSizedImage);
1401    }
1402
1403    if size > u32::MAX as u64 || size > max_size {
1404        return Err(LoadImageError::ImageSizeTooLarge(size));
1405    }
1406
1407    // Sort the list of guest memory regions by address so we can iterate over them in reverse order
1408    // (high to low).
1409    let mut regions: Vec<_> = guest_mem
1410        .regions()
1411        .filter(region_filter.unwrap_or(|_| true))
1412        .collect();
1413    regions.sort_unstable_by(|a, b| a.guest_addr.cmp(&b.guest_addr));
1414
1415    // Find the highest valid address inside a guest memory region that satisfies the requested
1416    // alignment and min/max address requirements while having enough space for the image.
1417    let guest_addr = regions
1418        .into_iter()
1419        .rev()
1420        .filter_map(|r| {
1421            // Highest address within this region.
1422            let rgn_max_addr = r
1423                .guest_addr
1424                .checked_add((r.size as u64).checked_sub(1)?)?
1425                .min(max_guest_addr);
1426            // Lowest aligned address within this region.
1427            let rgn_start_aligned = r.guest_addr.align(align)?;
1428            // Hypothetical address of the image if loaded at the end of the region.
1429            let image_addr = rgn_max_addr.checked_sub(size - 1)? & !(align - 1);
1430
1431            // Would the image fit within the region?
1432            if image_addr >= rgn_start_aligned {
1433                Some(image_addr)
1434            } else {
1435                None
1436            }
1437        })
1438        .find(|&addr| addr >= min_guest_addr)
1439        .ok_or(LoadImageError::NoSuitableMemoryRegion)?;
1440
1441    // This is safe due to the bounds check above.
1442    let size = size as u32;
1443
1444    let guest_slice = guest_mem
1445        .get_slice_at_addr(guest_addr, size as usize)
1446        .map_err(LoadImageError::GuestMemorySlice)?;
1447    image
1448        .read_exact_at_volatile(guest_slice, 0)
1449        .map_err(LoadImageError::ReadToMemory)?;
1450
1451    Ok((guest_addr, size))
1452}
1453
/// SMBIOS table configuration
///
/// Field names are (de)serialized in kebab-case (e.g. `bios-vendor`) and unknown fields are
/// rejected. All fields are optional; `oem_strings` defaults to an empty list when omitted.
#[derive(Clone, Debug, Default, Serialize, Deserialize, FromKeyValues, PartialEq, Eq)]
#[serde(deny_unknown_fields, rename_all = "kebab-case")]
pub struct SmbiosOptions {
    /// BIOS vendor name.
    pub bios_vendor: Option<String>,

    /// BIOS version number (free-form string).
    pub bios_version: Option<String>,

    /// System manufacturer name.
    pub manufacturer: Option<String>,

    /// System product name.
    pub product_name: Option<String>,

    /// System serial number (free-form string).
    pub serial_number: Option<String>,

    /// System UUID.
    pub uuid: Option<Uuid>,

    /// Additional OEM strings to add to SMBIOS table.
    #[serde(default)]
    pub oem_strings: Vec<String>,
}
1480
#[cfg(test)]
mod tests {
    use serde_keyvalue::from_key_values;
    use tempfile::tempfile;

    use super::*;

    /// `Pstore` requires both `path` and `size`.
    #[test]
    fn parse_pstore() {
        let parsed: Pstore = from_key_values("path=/some/path,size=16384").unwrap();
        let expected = Pstore {
            path: "/some/path".into(),
            size: 16384,
        };
        assert_eq!(parsed, expected);

        // Leaving out either key, or both, must be rejected.
        for incomplete in ["path=/some/path", "size=16384", ""] {
            let res = from_key_values::<Pstore>(incomplete);
            assert!(res.is_err());
        }
    }

    /// Key-value syntax accepts single CPUs, ranges, and a mix of both.
    #[test]
    fn deserialize_cpuset_serde_kv() {
        let cases = [
            ("[0,4,7]", vec![0, 4, 7]),
            ("[9-12]", vec![9, 10, 11, 12]),
            ("[0,4,7,9-12,15]", vec![0, 4, 7, 9, 10, 11, 12, 15]),
        ];

        for (input, cpus) in cases {
            let res: CpuSet = from_key_values(input).unwrap();
            assert_eq!(res, CpuSet::new(cpus));
        }
    }

    /// JSON round-trips: ranges are written as strings, single CPUs as numbers.
    #[test]
    fn deserialize_serialize_cpuset_json() {
        let cases = [
            ("[0,4,7]", vec![0, 4, 7]),
            (r#"["9-12"]"#, vec![9, 10, 11, 12]),
            (r#"[0,4,7,"9-12",15]"#, vec![0, 4, 7, 9, 10, 11, 12, 15]),
        ];

        for (json_str, cpus) in cases {
            let cpuset = CpuSet::new(cpus);
            let res: CpuSet = serde_json::from_str(json_str).unwrap();
            assert_eq!(res, cpuset);
            assert_eq!(serde_json::to_string(&cpuset).unwrap(), json_str);
        }
    }

    /// With `max_guest_addr` above all guest memory, the image lands at the top of the highest
    /// region, rounded down to the requested alignment.
    #[test]
    fn load_image_high_max_4g() {
        const TEST_IMAGE_SIZE: u64 = 1234;
        const TEST_ALIGN: u64 = 0x8000;

        let mem = GuestMemory::new(&[
            (GuestAddress(0x0000_0000), 0x4000_0000), // 0x00000000..0x40000000
            (GuestAddress(0x8000_0000), 0x4000_0000), // 0x80000000..0xC0000000
        ])
        .unwrap();

        let mut test_image = tempfile().unwrap();
        test_image.set_len(TEST_IMAGE_SIZE).unwrap();

        let loaded = load_image_high(
            &mem,
            &mut test_image,
            GuestAddress(0x8000),
            GuestAddress(0xFFFF_FFFF), // max_guest_addr beyond highest guest memory region
            None,
            TEST_ALIGN,
        );
        let (addr, size) = loaded.unwrap();

        assert_eq!(addr, GuestAddress(0xBFFF_8000));
        assert_eq!(addr.offset() % TEST_ALIGN, 0);
        assert_eq!(size, TEST_IMAGE_SIZE as u32);
    }
}