x86_64/
lib.rs

1// Copyright 2017 The ChromiumOS Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5//! x86 architecture support.
6
7#![cfg(target_arch = "x86_64")]
8
9mod fdt;
10
11#[cfg(feature = "gdb")]
12mod gdb;
13
// Type tags stored in the `type_` field of bootparam `setup_data` entries. These values are the
// discriminants of `SetupDataType`.
const SETUP_DTB: u32 = 2;
const SETUP_RNG_SEED: u32 = 9;
16
17#[allow(dead_code)]
18#[allow(non_upper_case_globals)]
19#[allow(non_camel_case_types)]
20#[allow(non_snake_case)]
21pub mod bootparam;
22
23#[allow(dead_code)]
24#[allow(non_upper_case_globals)]
25mod msr_index;
26
27#[allow(dead_code)]
28#[allow(non_upper_case_globals)]
29#[allow(non_camel_case_types)]
30#[allow(clippy::all)]
31mod mpspec;
32
33pub mod multiboot_spec;
34
35pub mod acpi;
36mod bzimage;
37pub mod cpuid;
38mod gdt;
39pub mod interrupts;
40pub mod mptable;
41pub mod regs;
42pub mod smbios;
43
44use std::arch::x86_64::CpuidResult;
45use std::cmp::min;
46use std::collections::BTreeMap;
47use std::fmt;
48use std::fs::File;
49use std::io;
50use std::io::Write;
51use std::mem;
52use std::path::PathBuf;
53use std::sync::mpsc;
54use std::sync::Arc;
55
56use acpi_tables::aml;
57use acpi_tables::aml::Aml;
58use acpi_tables::sdt::SDT;
59use anyhow::Context;
60use arch::get_serial_cmdline;
61use arch::serial::SerialDeviceInfo;
62use arch::CpuSet;
63use arch::DtbOverlay;
64use arch::FdtPosition;
65use arch::GetSerialCmdlineError;
66use arch::IrqChipArch;
67use arch::MemoryRegionConfig;
68use arch::PciConfig;
69use arch::RunnableLinuxVm;
70use arch::VmComponents;
71use arch::VmImage;
72use base::debug;
73use base::info;
74use base::warn;
75#[cfg(any(target_os = "android", target_os = "linux"))]
76use base::AsRawDescriptors;
77use base::Event;
78use base::FileGetLen;
79use base::FileReadWriteAtVolatile;
80use base::SendTube;
81use base::Tube;
82use base::TubeError;
83use chrono::Utc;
84pub use cpuid::adjust_cpuid;
85pub use cpuid::CpuIdContext;
86use devices::acpi::PM_WAKEUP_GPIO;
87use devices::Bus;
88use devices::BusDevice;
89use devices::BusDeviceObj;
90use devices::BusResumeDevice;
91use devices::BusType;
92use devices::Debugcon;
93use devices::FwCfgParameters;
94use devices::IrqChip;
95use devices::IrqChipX86_64;
96use devices::IrqEventSource;
97use devices::PciAddress;
98use devices::PciConfigIo;
99use devices::PciConfigMmio;
100use devices::PciDevice;
101use devices::PciInterruptPin;
102use devices::PciRoot;
103use devices::PciRootCommand;
104use devices::PciVirtualConfigMmio;
105use devices::Pflash;
106#[cfg(any(target_os = "android", target_os = "linux"))]
107use devices::ProxyDevice;
108use devices::Serial;
109use devices::SerialHardware;
110use devices::SerialParameters;
111use devices::VirtualPmc;
112use devices::FW_CFG_BASE_PORT;
113use devices::FW_CFG_MAX_FILE_SLOTS;
114use devices::FW_CFG_WIDTH;
115use hypervisor::CpuConfigX86_64;
116use hypervisor::Hypervisor;
117use hypervisor::HypervisorX86_64;
118use hypervisor::ProtectionType;
119use hypervisor::VcpuInitX86_64;
120use hypervisor::VcpuX86_64;
121use hypervisor::Vm;
122use hypervisor::VmCap;
123use hypervisor::VmX86_64;
124#[cfg(feature = "seccomp_trace")]
125use jail::read_jail_addr;
126#[cfg(windows)]
127use jail::FakeMinijailStub as Minijail;
128#[cfg(any(target_os = "android", target_os = "linux"))]
129use minijail::Minijail;
130use mptable::MPTABLE_RANGE;
131use multiboot_spec::MultibootInfo;
132use multiboot_spec::MultibootMmapEntry;
133use multiboot_spec::MULTIBOOT_BOOTLOADER_MAGIC;
134use remain::sorted;
135use resources::AddressRange;
136use resources::SystemAllocator;
137use resources::SystemAllocatorConfig;
138use sync::Condvar;
139use sync::Mutex;
140use thiserror::Error;
141use vm_control::BatControl;
142use vm_control::BatteryType;
143use vm_memory::GuestAddress;
144use vm_memory::GuestMemory;
145use vm_memory::GuestMemoryError;
146use vm_memory::MemoryRegionOptions;
147use vm_memory::MemoryRegionPurpose;
148use zerocopy::FromBytes;
149use zerocopy::Immutable;
150use zerocopy::IntoBytes;
151use zerocopy::KnownLayout;
152
153use crate::bootparam::boot_params;
154use crate::bootparam::setup_header;
155use crate::bootparam::XLF_CAN_BE_LOADED_ABOVE_4G;
156use crate::cpuid::EDX_HYBRID_CPU_SHIFT;
157
158#[sorted]
159#[derive(Error, Debug)]
160pub enum Error {
161    #[error("error allocating a single gpe")]
162    AllocateGpe,
163    #[error("error allocating IO resource: {0}")]
164    AllocateIOResouce(resources::Error),
165    #[error("error allocating a single irq")]
166    AllocateIrq,
167    #[error("unable to clone an Event: {0}")]
168    CloneEvent(base::Error),
169    #[error("failed to clone IRQ chip: {0}")]
170    CloneIrqChip(base::Error),
171    #[cfg(any(target_os = "android", target_os = "linux"))]
172    #[error("failed to clone jail: {0}")]
173    CloneJail(minijail::Error),
174    #[error("unable to clone a Tube: {0}")]
175    CloneTube(TubeError),
176    #[error("the given kernel command line was invalid: {0}")]
177    Cmdline(kernel_cmdline::Error),
178    #[error("failed writing command line to guest memory")]
179    CommandLineCopy,
180    #[error("command line overflowed guest memory")]
181    CommandLineOverflow,
182    #[error("failed to configure hotplugged pci device: {0}")]
183    ConfigurePciDevice(arch::DeviceRegistrationError),
184    #[error("bad PCI ECAM configuration: {0}")]
185    ConfigurePciEcam(String),
186    #[error("bad PCI mem configuration: {0}")]
187    ConfigurePciMem(String),
188    #[error("failed to configure segment registers: {0}")]
189    ConfigureSegments(regs::Error),
190    #[error("error configuring the system")]
191    ConfigureSystem,
192    #[error("unable to create ACPI tables")]
193    CreateAcpi,
194    #[error("unable to create battery devices: {0}")]
195    CreateBatDevices(arch::DeviceRegistrationError),
196    #[error("could not create debugcon device: {0}")]
197    CreateDebugconDevice(devices::SerialError),
198    #[error("unable to make an Event: {0}")]
199    CreateEvent(base::Error),
200    #[error("failed to create fdt: {0}")]
201    CreateFdt(cros_fdt::Error),
202    #[error("failed to create fw_cfg device: {0}")]
203    CreateFwCfgDevice(devices::FwCfgError),
204    #[error("failed to create IOAPIC device: {0}")]
205    CreateIoapicDevice(base::Error),
206    #[error("failed to create a PCI root hub: {0}")]
207    CreatePciRoot(arch::DeviceRegistrationError),
208    #[error("unable to create PIT: {0}")]
209    CreatePit(base::Error),
210    #[error("unable to make PIT device: {0}")]
211    CreatePitDevice(devices::PitError),
212    #[cfg(any(target_os = "android", target_os = "linux"))]
213    #[error("unable to create proxy device: {0}")]
214    CreateProxyDevice(devices::ProxyError),
215    #[error("unable to create serial devices: {0}")]
216    CreateSerialDevices(arch::DeviceRegistrationError),
217    #[error("failed to create socket: {0}")]
218    CreateSocket(io::Error),
219    #[error("failed to create tube: {0}")]
220    CreateTube(base::TubeError),
221    #[error("failed to create VCPU: {0}")]
222    CreateVcpu(base::Error),
223    #[error("DTB size is larger than the allowed size")]
224    DTBSizeGreaterThanAllowed,
225    #[error("invalid e820 setup params")]
226    E820Configuration,
227    #[error("failed to enable singlestep execution: {0}")]
228    EnableSinglestep(base::Error),
229    #[error("failed to enable split irqchip: {0}")]
230    EnableSplitIrqchip(base::Error),
231    #[error("failed to get serial cmdline: {0}")]
232    GetSerialCmdline(GetSerialCmdlineError),
233    #[error("failed to insert device onto bus: {0}")]
234    InsertBus(devices::BusError),
235    #[error("the kernel extends past the end of RAM")]
236    InvalidCpuConfig,
237    #[error("invalid CPU config parameters")]
238    KernelOffsetPastEnd,
239    #[error("error loading bios: {0}")]
240    LoadBios(io::Error),
241    #[error("error loading kernel bzImage: {0}")]
242    LoadBzImage(bzimage::Error),
243    #[error("error loading custom pVM firmware: {0}")]
244    LoadCustomPvmFw(arch::LoadImageError),
245    #[error("error loading initrd: {0}")]
246    LoadInitrd(arch::LoadImageError),
247    #[error("error loading Kernel: {0}")]
248    LoadKernel(kernel_loader::Error),
249    #[error("error loading pflash: {0}")]
250    LoadPflash(io::Error),
251    #[error("error loading pVM firmware: {0}")]
252    LoadPvmFw(base::Error),
253    #[error("error in multiboot_info setup")]
254    MultibootInfoSetup,
255    #[error("error translating address: Page not present")]
256    PageNotPresent,
257    #[error("pci mmio overlaps with pVM firmware memory")]
258    PciMmioOverlapPvmFw,
259    #[error("pVM firmware not supported when bios is used on x86_64")]
260    PvmFwBiosUnsupported,
261    #[error("error reading guest memory {0}")]
262    ReadingGuestMemory(vm_memory::GuestMemoryError),
263    #[error("single register read not supported on x86_64")]
264    ReadRegIsUnsupported,
265    #[error("error reading CPU registers {0}")]
266    ReadRegs(base::Error),
267    #[error("error registering an IrqFd: {0}")]
268    RegisterIrqfd(base::Error),
269    #[error("error registering virtual socket device: {0}")]
270    RegisterVsock(arch::DeviceRegistrationError),
271    #[error("error reserved pcie config mmio")]
272    ReservePcieCfgMmio(resources::Error),
273    #[error("failed to set a hardware breakpoint: {0}")]
274    SetHwBreakpoint(base::Error),
275    #[error("failed to set identity map addr: {0}")]
276    SetIdentityMapAddr(base::Error),
277    #[error("failed to set interrupts: {0}")]
278    SetLint(interrupts::Error),
279    #[error("failed to set tss addr: {0}")]
280    SetTssAddr(base::Error),
281    #[error("failed to set up cmos: {0}")]
282    SetupCmos(anyhow::Error),
283    #[error("failed to set up cpuid: {0}")]
284    SetupCpuid(cpuid::Error),
285    #[error("setup data too large")]
286    SetupDataTooLarge,
287    #[error("failed to set up FPU: {0}")]
288    SetupFpu(base::Error),
289    #[error("failed to set up guest memory: {0}")]
290    SetupGuestMemory(GuestMemoryError),
291    #[error("failed to set up mptable: {0}")]
292    SetupMptable(mptable::Error),
293    #[error("failed to set up MSRs: {0}")]
294    SetupMsrs(base::Error),
295    #[error("failed to set up page tables: {0}")]
296    SetupPageTables(regs::Error),
297    #[error("failed to set up pflash: {0}")]
298    SetupPflash(anyhow::Error),
299    #[error("failed to set up registers: {0}")]
300    SetupRegs(regs::Error),
301    #[error("failed to set up SMBIOS: {0}")]
302    SetupSmbios(smbios::Error),
303    #[error("failed to set up sregs: {0}")]
304    SetupSregs(base::Error),
305    #[error("too many vCPUs")]
306    TooManyVcpus,
307    #[error("failed to translate virtual address")]
308    TranslatingVirtAddr,
309    #[error("protected VMs not supported on x86_64")]
310    UnsupportedProtectionType,
311    #[error("single register write not supported on x86_64")]
312    WriteRegIsUnsupported,
313    #[error("error writing CPU registers {0}")]
314    WriteRegs(base::Error),
315    #[error("error writing guest memory {0}")]
316    WritingGuestMemory(GuestMemoryError),
317    #[error("error writing setup_data: {0}")]
318    WritingSetupData(GuestMemoryError),
319    #[error("the zero page extends past the end of guest_mem")]
320    ZeroPagePastRamEnd,
321    #[error("error writing the zero page of guest memory")]
322    ZeroPageSetup,
323}
324
/// Result type whose error is this module's [`Error`].
pub type Result<T> = std::result::Result<T, Error>;
326
/// Field-less type used as a namespace for x86_64 architecture functions (for example
/// `X8664arch::get_pcie_vcfg_mmio_range`).
pub struct X8664arch;
328
// Like `bootparam::setup_data` without the incomplete array field at the end, which allows us to
// safely implement Copy, Clone
#[repr(C)]
#[derive(Copy, Clone, Default, FromBytes, Immutable, IntoBytes, KnownLayout)]
struct setup_data_hdr {
    // Guest address of the next entry in the list; 0 terminates the list.
    pub next: u64,
    // Entry type tag (a `SetupDataType` value cast to `u32`).
    pub type_: u32,
    // Length in bytes of the payload that follows this header (the header is not included).
    pub len: u32,
}
338
/// Kind of payload carried by a `setup_data` entry. The discriminant is the value written to the
/// `type_` field of the entry header.
#[repr(u32)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum SetupDataType {
    Dtb = SETUP_DTB,
    RngSeed = SETUP_RNG_SEED,
}
345
/// A single entry to be inserted in the bootparam `setup_data` linked list.
pub struct SetupData {
    /// Raw payload bytes of the entry (the header is generated when the entry is written).
    pub data: Vec<u8>,
    /// Type tag recorded in the entry header.
    pub type_: SetupDataType,
}
351
impl SetupData {
    /// Returns the length of the data in bytes. The `setup_data` header is not counted.
    pub fn size(&self) -> usize {
        self.data.len()
    }
}
358
/// Collection of SetupData entries to be inserted in the
/// bootparam `setup_data` linked list.
pub struct SetupDataEntries {
    // Entries in insertion order; they are written to guest memory in this order.
    entries: Vec<SetupData>,
    // Guest address range into which the entries will be written.
    setup_data_start: usize,
    setup_data_end: usize,
    // Bytes remaining in the range after subtracting the payload size of each inserted entry.
    available_size: usize,
}
367
368impl SetupDataEntries {
369    /// Returns a new instance of SetupDataEntries
370    pub fn new(setup_data_start: usize, setup_data_end: usize) -> SetupDataEntries {
371        SetupDataEntries {
372            entries: Vec::new(),
373            setup_data_start,
374            setup_data_end,
375            available_size: setup_data_end - setup_data_start,
376        }
377    }
378
379    /// Adds a new SetupDataEntry and returns the remaining size available
380    pub fn insert(&mut self, setup_data: SetupData) -> usize {
381        self.available_size -= setup_data.size();
382        self.entries.push(setup_data);
383
384        self.available_size
385    }
386
387    /// Copy setup_data entries to guest memory and link them together with the `next` field.
388    /// Returns the guest address of the first entry in the setup_data list, if any.
389    pub fn write_setup_data(&self, guest_mem: &GuestMemory) -> Result<Option<GuestAddress>> {
390        write_setup_data(
391            guest_mem,
392            GuestAddress(self.setup_data_start as u64),
393            GuestAddress(self.setup_data_end as u64),
394            &self.entries,
395        )
396    }
397}
398
/// Memory type of an e820 map entry. The discriminant is the value written to the `type_`
/// field of the e820 table entry.
#[derive(Copy, Clone, Debug)]
enum E820Type {
    Ram = 0x01,
    Reserved = 0x2,
}
404
/// One region of the e820 memory map before it is copied into the boot parameters.
#[derive(Copy, Clone, Debug)]
struct E820Entry {
    // Guest address where the region starts.
    pub address: GuestAddress,
    // Length of the region in bytes.
    pub len: u64,
    // Whether the region is reported as RAM or reserved.
    pub mem_type: E820Type,
}
411
// Size units, in bytes.
const KB: u64 = 1 << 10;
const MB: u64 = 1 << 20;
const GB: u64 = 1 << 30;

pub const BOOT_STACK_POINTER: u64 = 0x8000;
const FIRST_ADDR_PAST_32BITS: u64 = 1 << 32;
// Keep this aligned to 256 MB for MTRR convenience.
const MEM_32BIT_GAP_SIZE: u64 = 768 * MB;
// Reserved memory for nand_bios/LAPIC/IOAPIC/HPET/etc.
const RESERVED_MEM_SIZE: u64 = 0x800_0000;
const DEFAULT_PCI_MEM_END: u64 = FIRST_ADDR_PAST_32BITS - RESERVED_MEM_SIZE - 1;
// Reserve 64 MB for the PCIe enhanced configuration space.
const DEFAULT_PCIE_CFG_MMIO_SIZE: u64 = 0x400_0000;
const DEFAULT_PCIE_CFG_MMIO_END: u64 = FIRST_ADDR_PAST_32BITS - RESERVED_MEM_SIZE - 1;
const DEFAULT_PCIE_CFG_MMIO_START: u64 = DEFAULT_PCIE_CFG_MMIO_END - DEFAULT_PCIE_CFG_MMIO_SIZE + 1;
// Linux (with 4-level paging) has a physical memory limit of 46 bits (64 TiB).
const HIGH_MMIO_MAX_END: u64 = (1u64 << 46) - 1;
pub const KERNEL_32BIT_ENTRY_OFFSET: u64 = 0x0;
pub const KERNEL_64BIT_ENTRY_OFFSET: u64 = 0x200;
pub const MULTIBOOT_INFO_OFFSET: u64 = 0x6000;
pub const MULTIBOOT_INFO_SIZE: u64 = 0x1000;
pub const ZERO_PAGE_OFFSET: u64 = 0x7000;
// Set the BIOS max size to 16 MB: this is used only when `unrestricted guest` is disabled.
const BIOS_MAX_SIZE: u64 = 0x1000000;

pub const KERNEL_START_OFFSET: u64 = 0x20_0000;
const CMDLINE_OFFSET: u64 = 0x2_0000;
const CMDLINE_MAX_SIZE: u64 = 0x800; // including terminating zero
// The setup_data area starts right after the command line and ends where the MP table begins.
const SETUP_DATA_START: u64 = CMDLINE_OFFSET + CMDLINE_MAX_SIZE;
const SETUP_DATA_END: u64 = MPTABLE_RANGE.start;
const X86_64_FDT_MAX_SIZE: u64 = 0x4000;
const X86_64_SERIAL_1_3_IRQ: u32 = 4;
const X86_64_SERIAL_2_4_IRQ: u32 = 3;
// X86_64_SCI_IRQ is used to fill the ACPI FACP table.
// The SCI IRQ number should preferably be a legacy IRQ
// number below 16 (most platforms actually use the
// fixed IRQ number 9). So we reserve IRQ number 5
// for SCI and let the other devices start from the
// next available number.
pub const X86_64_SCI_IRQ: u32 = 5;
// The CMOS RTC uses IRQ 8; start allocating IRQs at 9.
pub const X86_64_IRQ_BASE: u32 = 9;
const ACPI_HI_RSDP_WINDOW_BASE: u64 = 0x000E_0000;

// pVM firmware memory. Should be within the low 4GB, so that it is identity-mapped
// by setup_page_tables() when a protected VM boots in long mode, since the pVM firmware is
// the VM entry point.
const PROTECTED_VM_FW_MAX_SIZE: u64 = 0x40_0000;
// Load the pVM firmware just below 2 GB to allow use of `-mcmodel=small`.
const PROTECTED_VM_FW_START: u64 = 0x8000_0000 - PROTECTED_VM_FW_MAX_SIZE;
462
/// CPU vendor, as returned by [`get_cpu_manufacturer`].
#[derive(Debug, PartialEq, Eq)]
pub enum CpuManufacturer {
    Intel,
    Amd,
    Unknown,
}
469
/// Returns the CPU manufacturer as determined by `cpuid::cpu_manufacturer()`.
pub fn get_cpu_manufacturer() -> CpuManufacturer {
    cpuid::cpu_manufacturer()
}
473
/// Guest physical address ranges chosen by [`create_arch_memory_layout`].
pub struct ArchMemoryLayout {
    // the pci mmio range below 4G
    pci_mmio_before_32bit: AddressRange,
    // the pcie cfg mmio range
    pcie_cfg_mmio: AddressRange,
    // the pVM firmware memory (if running a protected VM)
    pvmfw_mem: Option<AddressRange>,
}
482
483pub fn create_arch_memory_layout(
484    pci_config: &PciConfig,
485    has_protected_vm_firmware: bool,
486) -> Result<ArchMemoryLayout> {
487    // the max bus number is 256 and each bus occupy 1MB, so the max pcie cfg mmio size = 256M
488    const MAX_PCIE_ECAM_SIZE: u64 = 256 * MB;
489    let pcie_cfg_mmio = match pci_config.ecam {
490        Some(MemoryRegionConfig {
491            start,
492            size: Some(size),
493        }) => AddressRange::from_start_and_size(start, size.min(MAX_PCIE_ECAM_SIZE)).unwrap(),
494        Some(MemoryRegionConfig { start, size: None }) => {
495            AddressRange::from_start_and_end(start, DEFAULT_PCIE_CFG_MMIO_END)
496        }
497        None => {
498            AddressRange::from_start_and_end(DEFAULT_PCIE_CFG_MMIO_START, DEFAULT_PCIE_CFG_MMIO_END)
499        }
500    };
501    if pcie_cfg_mmio.start % pcie_cfg_mmio.len().unwrap() != 0
502        || pcie_cfg_mmio.start % MB != 0
503        || pcie_cfg_mmio.len().unwrap() % MB != 0
504    {
505        return Err(Error::ConfigurePciEcam(
506            "base and len must be aligned to 1MB and base must be a multiple of len".to_string(),
507        ));
508    }
509    if pcie_cfg_mmio.end >= 0x1_0000_0000 {
510        return Err(Error::ConfigurePciEcam(
511            "end address can't go beyond 4G".to_string(),
512        ));
513    }
514
515    let pci_mmio_before_32bit = match pci_config.mem {
516        Some(MemoryRegionConfig {
517            start,
518            size: Some(size),
519        }) => AddressRange::from_start_and_size(start, size)
520            .ok_or(Error::ConfigurePciMem("region overflowed".to_string()))?,
521        Some(MemoryRegionConfig { start, size: None }) => {
522            AddressRange::from_start_and_end(start, DEFAULT_PCI_MEM_END)
523        }
524        None => AddressRange::from_start_and_end(
525            pcie_cfg_mmio
526                .start
527                .min(FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE),
528            DEFAULT_PCI_MEM_END,
529        ),
530    };
531
532    let pvmfw_mem = if has_protected_vm_firmware {
533        let range = AddressRange {
534            start: PROTECTED_VM_FW_START,
535            end: PROTECTED_VM_FW_START + PROTECTED_VM_FW_MAX_SIZE - 1,
536        };
537        if !pci_mmio_before_32bit.intersect(range).is_empty() {
538            return Err(Error::PciMmioOverlapPvmFw);
539        }
540
541        Some(range)
542    } else {
543        None
544    };
545
546    Ok(ArchMemoryLayout {
547        pci_mmio_before_32bit,
548        pcie_cfg_mmio,
549        pvmfw_mem,
550    })
551}
552
553/// The x86 reset vector for i386+ and x86_64 puts the processor into an "unreal mode" where it
554/// can access the last 1 MB of the 32-bit address space in 16-bit mode, and starts the instruction
555/// pointer at the effective physical address 0xFFFF_FFF0.
556fn bios_start(bios_size: u64) -> GuestAddress {
557    GuestAddress(FIRST_ADDR_PAST_32BITS - bios_size)
558}
559
560fn identity_map_addr_start() -> GuestAddress {
561    // Set Identity map address 4 pages before the max BIOS size
562    GuestAddress(FIRST_ADDR_PAST_32BITS - BIOS_MAX_SIZE - 4 * 0x1000)
563}
564
565fn tss_addr_start() -> GuestAddress {
566    // Set TSS address one page after identity map address
567    GuestAddress(identity_map_addr_start().offset() + 0x1000)
568}
569
570fn tss_addr_end() -> GuestAddress {
571    // Set TSS address section to have 3 pages
572    GuestAddress(tss_addr_start().offset() + 0x3000)
573}
574
575fn configure_boot_params(
576    guest_mem: &GuestMemory,
577    cmdline_addr: GuestAddress,
578    setup_data: Option<GuestAddress>,
579    initrd: Option<(GuestAddress, u32)>,
580    mut params: boot_params,
581    e820_entries: &[E820Entry],
582) -> Result<()> {
583    const KERNEL_BOOT_FLAG_MAGIC: u16 = 0xaa55;
584    const KERNEL_HDR_MAGIC: u32 = 0x5372_6448;
585    const KERNEL_LOADER_OTHER: u8 = 0xff;
586    const KERNEL_MIN_ALIGNMENT_BYTES: u32 = 0x100_0000; // Must be non-zero.
587
588    params.hdr.type_of_loader = KERNEL_LOADER_OTHER;
589    params.hdr.boot_flag = KERNEL_BOOT_FLAG_MAGIC;
590    params.hdr.header = KERNEL_HDR_MAGIC;
591    params.hdr.cmd_line_ptr = cmdline_addr.offset() as u32;
592    params.ext_cmd_line_ptr = (cmdline_addr.offset() >> 32) as u32;
593    params.hdr.kernel_alignment = KERNEL_MIN_ALIGNMENT_BYTES;
594    if let Some(setup_data) = setup_data {
595        params.hdr.setup_data = setup_data.offset();
596    }
597    if let Some((initrd_addr, initrd_size)) = initrd {
598        params.hdr.ramdisk_image = initrd_addr.offset() as u32;
599        params.ext_ramdisk_image = (initrd_addr.offset() >> 32) as u32;
600        params.hdr.ramdisk_size = initrd_size;
601        params.ext_ramdisk_size = 0;
602    }
603
604    if e820_entries.len() >= params.e820_table.len() {
605        return Err(Error::E820Configuration);
606    }
607
608    for (src, dst) in e820_entries.iter().zip(params.e820_table.iter_mut()) {
609        dst.addr = src.address.offset();
610        dst.size = src.len;
611        dst.type_ = src.mem_type as u32;
612    }
613    params.e820_entries = e820_entries.len() as u8;
614
615    let zero_page_addr = GuestAddress(ZERO_PAGE_OFFSET);
616    if !guest_mem.is_valid_range(zero_page_addr, mem::size_of::<boot_params>() as u64) {
617        return Err(Error::ZeroPagePastRamEnd);
618    }
619
620    guest_mem
621        .write_obj_at_addr(params, zero_page_addr)
622        .map_err(|_| Error::ZeroPageSetup)?;
623
624    Ok(())
625}
626
627fn configure_multiboot_info(
628    guest_mem: &GuestMemory,
629    cmdline_addr: GuestAddress,
630    e820_entries: &[E820Entry],
631) -> Result<()> {
632    let mut multiboot_info = MultibootInfo {
633        ..Default::default()
634    };
635
636    // Extra Multiboot-related data is added directly after the info structure.
637    let mut multiboot_data_addr =
638        GuestAddress(MULTIBOOT_INFO_OFFSET + mem::size_of_val(&multiboot_info) as u64);
639    multiboot_data_addr = multiboot_data_addr
640        .align(16)
641        .ok_or(Error::MultibootInfoSetup)?;
642
643    // mem_lower is the amount of RAM below 1 MB, in units of KiB.
644    let mem_lower = guest_mem
645        .regions()
646        .filter(|r| {
647            r.options.purpose == MemoryRegionPurpose::GuestMemoryRegion
648                && r.guest_addr.offset() < 1 * MB
649        })
650        .map(|r| r.size as u64)
651        .sum::<u64>()
652        / KB;
653
654    // mem_upper is the amount of RAM above 1 MB up to the first memory hole, in units of KiB.
655    // We don't have the ISA 15-16 MB hole, so this includes all RAM from 1 MB up to the
656    // beginning of the PCI hole just below 4 GB.
657    let mem_upper = guest_mem
658        .regions()
659        .filter(|r| {
660            r.options.purpose == MemoryRegionPurpose::GuestMemoryRegion
661                && r.guest_addr.offset() >= 1 * MB
662                && r.guest_addr.offset() < 4 * GB
663        })
664        .map(|r| r.size as u64)
665        .sum::<u64>()
666        / KB;
667
668    multiboot_info.mem_lower = mem_lower as u32;
669    multiboot_info.mem_upper = mem_upper as u32;
670    multiboot_info.flags |= MultibootInfo::F_MEM;
671
672    // Memory map - convert from params.e820_table to Multiboot format.
673    let multiboot_mmap: Vec<MultibootMmapEntry> = e820_entries
674        .iter()
675        .map(|e820_entry| MultibootMmapEntry {
676            size: 20, // size of the entry, not including the size field itself
677            base_addr: e820_entry.address.offset(),
678            length: e820_entry.len,
679            type_: e820_entry.mem_type as u32,
680        })
681        .collect();
682    let multiboot_mmap_bytes = multiboot_mmap.as_bytes();
683    let multiboot_mmap_addr =
684        append_multiboot_info(guest_mem, &mut multiboot_data_addr, multiboot_mmap_bytes)?;
685    multiboot_info.mmap_addr = multiboot_mmap_addr.offset() as u32;
686    multiboot_info.mmap_length = multiboot_mmap_bytes.len() as u32;
687    multiboot_info.flags |= MultibootInfo::F_MMAP;
688
689    // Command line
690    multiboot_info.cmdline = cmdline_addr.offset() as u32;
691    multiboot_info.flags |= MultibootInfo::F_CMDLINE;
692
693    // Boot loader name
694    let boot_loader_name_addr =
695        append_multiboot_info(guest_mem, &mut multiboot_data_addr, b"crosvm\0")?;
696    multiboot_info.boot_loader_name = boot_loader_name_addr.offset() as u32;
697    multiboot_info.flags |= MultibootInfo::F_BOOT_LOADER_NAME;
698
699    guest_mem
700        .write_obj_at_addr(multiboot_info, GuestAddress(MULTIBOOT_INFO_OFFSET))
701        .map_err(|_| Error::MultibootInfoSetup)?;
702
703    Ok(())
704}
705
706fn append_multiboot_info(
707    guest_mem: &GuestMemory,
708    addr: &mut GuestAddress,
709    data: &[u8],
710) -> Result<GuestAddress> {
711    let data_addr = *addr;
712    let new_addr = addr
713        .checked_add(data.len() as u64)
714        .and_then(|a| a.align(16))
715        .ok_or(Error::MultibootInfoSetup)?;
716
717    // Make sure we don't write beyond the region reserved for Multiboot info.
718    if new_addr.offset() - MULTIBOOT_INFO_OFFSET > MULTIBOOT_INFO_SIZE {
719        return Err(Error::MultibootInfoSetup);
720    }
721
722    guest_mem
723        .write_all_at_addr(data, data_addr)
724        .map_err(|_| Error::MultibootInfoSetup)?;
725
726    *addr = new_addr;
727    Ok(data_addr)
728}
729
730/// Write setup_data entries in guest memory and link them together with the `next` field.
731///
732/// Returns the guest address of the first entry in the setup_data list, if any.
733fn write_setup_data(
734    guest_mem: &GuestMemory,
735    setup_data_start: GuestAddress,
736    setup_data_end: GuestAddress,
737    setup_data: &[SetupData],
738) -> Result<Option<GuestAddress>> {
739    let mut setup_data_list_head = None;
740
741    // Place the first setup_data at the first 64-bit aligned offset following setup_data_start.
742    let mut setup_data_addr = setup_data_start.align(8).ok_or(Error::SetupDataTooLarge)?;
743
744    let mut entry_iter = setup_data.iter().peekable();
745    while let Some(entry) = entry_iter.next() {
746        if setup_data_list_head.is_none() {
747            setup_data_list_head = Some(setup_data_addr);
748        }
749
750        // Ensure the entry (header plus data) fits into guest memory.
751        let entry_size = (mem::size_of::<setup_data_hdr>() + entry.data.len()) as u64;
752        let entry_end = setup_data_addr
753            .checked_add(entry_size)
754            .ok_or(Error::SetupDataTooLarge)?;
755
756        if entry_end >= setup_data_end {
757            return Err(Error::SetupDataTooLarge);
758        }
759
760        let next_setup_data_addr = if entry_iter.peek().is_some() {
761            // Place the next setup_data at a 64-bit aligned address.
762            setup_data_addr
763                .checked_add(entry_size)
764                .and_then(|addr| addr.align(8))
765                .ok_or(Error::SetupDataTooLarge)?
766        } else {
767            // This is the final entry. Terminate the list with next == 0.
768            GuestAddress(0)
769        };
770
771        let hdr = setup_data_hdr {
772            next: next_setup_data_addr.offset(),
773            type_: entry.type_ as u32,
774            len: entry
775                .data
776                .len()
777                .try_into()
778                .map_err(|_| Error::SetupDataTooLarge)?,
779        };
780
781        guest_mem
782            .write_obj_at_addr(hdr, setup_data_addr)
783            .map_err(Error::WritingSetupData)?;
784        guest_mem
785            .write_all_at_addr(
786                &entry.data,
787                setup_data_addr.unchecked_add(mem::size_of::<setup_data_hdr>() as u64),
788            )
789            .map_err(Error::WritingSetupData)?;
790
791        setup_data_addr = next_setup_data_addr;
792    }
793
794    Ok(setup_data_list_head)
795}
796
797/// Find the first `setup_data_hdr` with the given type in guest memory and return its address.
798fn find_setup_data(
799    mem: &GuestMemory,
800    setup_data_start: GuestAddress,
801    setup_data_end: GuestAddress,
802    type_: SetupDataType,
803) -> Option<GuestAddress> {
804    let mut setup_data_addr = setup_data_start.align(8)?;
805    while setup_data_addr < setup_data_end {
806        let hdr: setup_data_hdr = mem.read_obj_from_addr(setup_data_addr).ok()?;
807        if hdr.type_ == type_ as u32 {
808            return Some(setup_data_addr);
809        }
810
811        if hdr.next == 0 {
812            return None;
813        }
814
815        setup_data_addr = GuestAddress(hdr.next);
816    }
817    None
818}
819
820/// Generate a SETUP_RNG_SEED SetupData with random seed data.
821fn setup_data_rng_seed() -> SetupData {
822    let data: [u8; 256] = rand::random();
823    SetupData {
824        data: data.to_vec(),
825        type_: SetupDataType::RngSeed,
826    }
827}
828
829/// Add an e820 region to the e820 map.
830fn add_e820_entry(
831    e820_entries: &mut Vec<E820Entry>,
832    range: AddressRange,
833    mem_type: E820Type,
834) -> Result<()> {
835    e820_entries.push(E820Entry {
836        address: GuestAddress(range.start),
837        len: range.len().ok_or(Error::E820Configuration)?,
838        mem_type,
839    });
840
841    Ok(())
842}
843
844/// Generate a memory map in INT 0x15 AX=0xE820 format.
845fn generate_e820_memory_map(
846    arch_memory_layout: &ArchMemoryLayout,
847    guest_mem: &GuestMemory,
848) -> Result<Vec<E820Entry>> {
849    let mut e820_entries = Vec::new();
850
851    for r in guest_mem.regions() {
852        let range = AddressRange::from_start_and_size(r.guest_addr.offset(), r.size as u64)
853            .expect("invalid guest mem region");
854        let mem_type = match r.options.purpose {
855            MemoryRegionPurpose::Bios => E820Type::Reserved,
856            MemoryRegionPurpose::GuestMemoryRegion => E820Type::Ram,
857            // After the pVM firmware jumped to the guest, the pVM firmware itself is no longer
858            // running, so its memory is reusable by the guest OS. So add this memory as RAM rather
859            // than Reserved.
860            MemoryRegionPurpose::ProtectedFirmwareRegion => E820Type::Ram,
861            MemoryRegionPurpose::ReservedMemory => E820Type::Reserved,
862        };
863        add_e820_entry(&mut e820_entries, range, mem_type)?;
864    }
865
866    let pcie_cfg_mmio_range = arch_memory_layout.pcie_cfg_mmio;
867    add_e820_entry(&mut e820_entries, pcie_cfg_mmio_range, E820Type::Reserved)?;
868
869    add_e820_entry(
870        &mut e820_entries,
871        X8664arch::get_pcie_vcfg_mmio_range(guest_mem, &pcie_cfg_mmio_range),
872        E820Type::Reserved,
873    )?;
874
875    // Reserve memory section for Identity map and TSS
876    add_e820_entry(
877        &mut e820_entries,
878        AddressRange {
879            start: identity_map_addr_start().offset(),
880            end: tss_addr_end().offset() - 1,
881        },
882        E820Type::Reserved,
883    )?;
884
885    Ok(e820_entries)
886}
887
888/// Returns a Vec of the valid memory addresses.
889/// These should be used to configure the GuestMemory structure for the platform.
890/// For x86_64 all addresses are valid from the start of the kernel except a
891/// carve out at the end of 32bit address space.
892pub fn arch_memory_regions(
893    arch_memory_layout: &ArchMemoryLayout,
894    mem_size: u64,
895    bios_size: Option<u64>,
896) -> Vec<(GuestAddress, u64, MemoryRegionOptions)> {
897    let mut regions = Vec::new();
898
899    // Some guest kernels expect a typical PC memory layout where the region between 640 KB and
900    // 1 MB is reserved for device memory/ROMs and get confused if there is a RAM region
901    // spanning this area, so we provide the traditional 640 KB low memory and 1 MB+
902    // high memory regions.
903    let mem_below_1m = 640 * KB;
904    regions.push((
905        GuestAddress(0),
906        mem_below_1m,
907        MemoryRegionOptions::new().purpose(MemoryRegionPurpose::GuestMemoryRegion),
908    ));
909
910    // Reserved/BIOS data area between 640 KB and 1 MB.
911    // This needs to be backed by an actual GuestMemory region so we can write BIOS tables here, but
912    // it should be reported as "reserved" in the e820 memory map to match PC architecture
913    // expectations.
914    regions.push((
915        GuestAddress(640 * KB),
916        (1 * MB) - (640 * KB),
917        MemoryRegionOptions::new().purpose(MemoryRegionPurpose::ReservedMemory),
918    ));
919
920    // RAM between 1 MB and 4 GB
921    let mem_1m_to_4g = arch_memory_layout.pci_mmio_before_32bit.start.min(mem_size) - 1 * MB;
922    regions.push((
923        GuestAddress(1 * MB),
924        mem_1m_to_4g,
925        MemoryRegionOptions::new().purpose(MemoryRegionPurpose::GuestMemoryRegion),
926    ));
927
928    // RAM above 4 GB
929    let mem_above_4g = mem_size.saturating_sub(1 * MB + mem_1m_to_4g);
930    if mem_above_4g > 0 {
931        regions.push((
932            GuestAddress(FIRST_ADDR_PAST_32BITS),
933            mem_above_4g,
934            MemoryRegionOptions::new().purpose(MemoryRegionPurpose::GuestMemoryRegion),
935        ));
936    }
937
938    if let Some(bios_size) = bios_size {
939        regions.push((
940            bios_start(bios_size),
941            bios_size,
942            MemoryRegionOptions::new().purpose(MemoryRegionPurpose::Bios),
943        ));
944    }
945
946    if let Some(pvmfw_mem) = arch_memory_layout.pvmfw_mem {
947        // Remove any areas of guest memory regions that overlap the pVM firmware range.
948        while let Some(overlapping_region_index) = regions.iter().position(|(addr, size, _opts)| {
949            let region_addr_range = AddressRange::from_start_and_size(addr.offset(), *size)
950                .expect("invalid GuestMemory range");
951            region_addr_range.overlaps(pvmfw_mem)
952        }) {
953            let overlapping_region = regions.swap_remove(overlapping_region_index);
954            let overlapping_region_range = AddressRange::from_start_and_size(
955                overlapping_region.0.offset(),
956                overlapping_region.1,
957            )
958            .unwrap();
959            let (first, second) = overlapping_region_range.non_overlapping_ranges(pvmfw_mem);
960            if !first.is_empty() {
961                regions.push((
962                    GuestAddress(first.start),
963                    first.len().unwrap(),
964                    overlapping_region.2.clone(),
965                ));
966            }
967            if !second.is_empty() {
968                regions.push((
969                    GuestAddress(second.start),
970                    second.len().unwrap(),
971                    overlapping_region.2,
972                ));
973            }
974        }
975
976        // Insert a region for the pVM firmware area.
977        regions.push((
978            GuestAddress(pvmfw_mem.start),
979            pvmfw_mem.len().expect("invalid pvmfw region"),
980            MemoryRegionOptions::new().purpose(MemoryRegionPurpose::ProtectedFirmwareRegion),
981        ));
982    }
983
984    regions.sort_unstable_by_key(|(addr, _, _)| *addr);
985
986    for (addr, size, options) in &regions {
987        debug!(
988            "{:#018x}-{:#018x} {:?}",
989            addr.offset(),
990            addr.offset() + size - 1,
991            options.purpose,
992        );
993    }
994
995    regions
996}
997
998impl arch::LinuxArch for X8664arch {
999    type Error = Error;
1000    type ArchMemoryLayout = ArchMemoryLayout;
1001
1002    fn arch_memory_layout(
1003        components: &VmComponents,
1004    ) -> std::result::Result<Self::ArchMemoryLayout, Self::Error> {
1005        create_arch_memory_layout(
1006            &components.pci_config,
1007            components.hv_cfg.protection_type.runs_firmware(),
1008        )
1009    }
1010
1011    fn guest_memory_layout(
1012        components: &VmComponents,
1013        arch_memory_layout: &Self::ArchMemoryLayout,
1014        _hypervisor: &impl Hypervisor,
1015    ) -> std::result::Result<Vec<(GuestAddress, u64, MemoryRegionOptions)>, Self::Error> {
1016        let bios_size = match &components.vm_image {
1017            VmImage::Bios(bios_file) => Some(bios_file.metadata().map_err(Error::LoadBios)?.len()),
1018            VmImage::Kernel(_) => None,
1019        };
1020
1021        Ok(arch_memory_regions(
1022            arch_memory_layout,
1023            components.memory_size,
1024            bios_size,
1025        ))
1026    }
1027
1028    fn get_system_allocator_config(
1029        vm: &dyn Vm,
1030        arch_memory_layout: &Self::ArchMemoryLayout,
1031    ) -> SystemAllocatorConfig {
1032        SystemAllocatorConfig {
1033            io: Some(AddressRange {
1034                start: 0xc000,
1035                end: 0xffff,
1036            }),
1037            low_mmio: arch_memory_layout.pci_mmio_before_32bit,
1038            high_mmio: Self::get_high_mmio_range(vm, arch_memory_layout),
1039            platform_mmio: None,
1040            first_irq: X86_64_IRQ_BASE,
1041        }
1042    }
1043
1044    fn build_vm(
1045        mut components: VmComponents,
1046        arch_memory_layout: &Self::ArchMemoryLayout,
1047        vm_evt_wrtube: &SendTube,
1048        system_allocator: &mut SystemAllocator,
1049        serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
1050        serial_jail: Option<Minijail>,
1051        battery: (Option<BatteryType>, Option<Minijail>),
1052        vm: Arc<dyn VmX86_64>,
1053        ramoops_region: Option<arch::pstore::RamoopsRegion>,
1054        devs: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>,
1055        irq_chip: Arc<dyn IrqChipX86_64>,
1056        vcpu_ids: &mut Vec<usize>,
1057        dump_device_tree_blob: Option<PathBuf>,
1058        debugcon_jail: Option<Minijail>,
1059        pflash_jail: Option<Minijail>,
1060        fw_cfg_jail: Option<Minijail>,
1061        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
1062        guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>,
1063        device_tree_overlays: Vec<DtbOverlay>,
1064        _fdt_position: Option<FdtPosition>,
1065        _no_pmu: bool,
1066    ) -> std::result::Result<RunnableLinuxVm, Self::Error> {
1067        let mem = vm.get_memory().clone();
1068
1069        let vcpu_count = components.vcpu_properties.len();
1070
1071        vm.set_identity_map_addr(identity_map_addr_start())
1072            .map_err(Error::SetIdentityMapAddr)?;
1073
1074        vm.set_tss_addr(tss_addr_start())
1075            .map_err(Error::SetTssAddr)?;
1076
1077        // Use IRQ info in ACPI if provided by the user.
1078        let mut mptable = true;
1079        let mut sci_irq = X86_64_SCI_IRQ;
1080
1081        // punch pcie config mmio from pci low mmio, so that it couldn't be
1082        // allocated to any device.
1083        let pcie_cfg_mmio_range = arch_memory_layout.pcie_cfg_mmio;
1084        system_allocator
1085            .reserve_mmio(pcie_cfg_mmio_range)
1086            .map_err(Error::ReservePcieCfgMmio)?;
1087
1088        for sdt in components.acpi_sdts.iter() {
1089            if sdt.is_signature(b"FACP") {
1090                mptable = false;
1091                let sci_irq_fadt: u16 = sdt.read(acpi::FADT_FIELD_SCI_INTERRUPT);
1092                sci_irq = sci_irq_fadt.into();
1093                if !system_allocator.reserve_irq(sci_irq) {
1094                    warn!("sci irq {} already reserved.", sci_irq);
1095                }
1096            }
1097        }
1098
1099        let pcie_vcfg_range = Self::get_pcie_vcfg_mmio_range(&mem, &pcie_cfg_mmio_range);
1100        let mmio_bus = Arc::new(Bus::new(BusType::Mmio));
1101        let io_bus = Arc::new(Bus::new(BusType::Io));
1102        let hypercall_bus = Arc::new(Bus::new(BusType::Hypercall));
1103
1104        let (pci_devices, _devs): (Vec<_>, Vec<_>) = devs
1105            .into_iter()
1106            .partition(|(dev, _)| dev.as_pci_device().is_some());
1107
1108        let pci_devices = pci_devices
1109            .into_iter()
1110            .map(|(dev, jail_orig)| (dev.into_pci_device().unwrap(), jail_orig))
1111            .collect();
1112
1113        let (pci, pci_irqs, pid_debug_label_map, amls, gpe_scope_amls) = arch::generate_pci_root(
1114            pci_devices,
1115            &*irq_chip,
1116            mmio_bus.clone(),
1117            GuestAddress(pcie_cfg_mmio_range.start),
1118            12,
1119            io_bus.clone(),
1120            system_allocator,
1121            &*vm,
1122            4, // Share the four pin interrupts (INTx#)
1123            Some(pcie_vcfg_range.start),
1124            #[cfg(feature = "swap")]
1125            swap_controller,
1126        )
1127        .map_err(Error::CreatePciRoot)?;
1128
1129        let pci = Arc::new(Mutex::new(pci));
1130        pci.lock().enable_pcie_cfg_mmio(pcie_cfg_mmio_range.start);
1131        let pci_cfg = PciConfigIo::new(
1132            pci.clone(),
1133            components.break_linux_pci_config_io,
1134            vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
1135        );
1136        let pci_bus = Arc::new(Mutex::new(pci_cfg));
1137        io_bus.insert(pci_bus, 0xcf8, 0x8).unwrap();
1138
1139        let pcie_cfg_mmio = Arc::new(Mutex::new(PciConfigMmio::new(pci.clone(), 12)));
1140        let pcie_cfg_mmio_len = pcie_cfg_mmio_range.len().unwrap();
1141        mmio_bus
1142            .insert(pcie_cfg_mmio, pcie_cfg_mmio_range.start, pcie_cfg_mmio_len)
1143            .unwrap();
1144
1145        let pcie_vcfg_mmio = Arc::new(Mutex::new(PciVirtualConfigMmio::new(pci.clone(), 13)));
1146        mmio_bus
1147            .insert(
1148                pcie_vcfg_mmio,
1149                pcie_vcfg_range.start,
1150                pcie_vcfg_range.len().unwrap(),
1151            )
1152            .unwrap();
1153
1154        // Event used to notify crosvm that guest OS is trying to suspend.
1155        let (suspend_tube_send, suspend_tube_recv) =
1156            Tube::directional_pair().map_err(Error::CreateTube)?;
1157        let suspend_tube_send = Arc::new(Mutex::new(suspend_tube_send));
1158
1159        if components.fw_cfg_enable {
1160            Self::setup_fw_cfg_device(
1161                &io_bus,
1162                components.fw_cfg_parameters.clone(),
1163                components.bootorder_fw_cfg_blob.clone(),
1164                fw_cfg_jail,
1165                #[cfg(feature = "swap")]
1166                swap_controller,
1167            )?;
1168        }
1169
1170        if !components.no_i8042 {
1171            Self::setup_legacy_i8042_device(
1172                &io_bus,
1173                irq_chip.pit_uses_speaker_port(),
1174                vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
1175            )?;
1176        }
1177        let mut vm_request_tube = if !components.no_rtc {
1178            let (host_tube, device_tube) = Tube::pair()
1179                .context("create tube")
1180                .map_err(Error::SetupCmos)?;
1181            Self::setup_legacy_cmos_device(
1182                arch_memory_layout,
1183                &io_bus,
1184                irq_chip.clone(),
1185                device_tube,
1186                components.memory_size,
1187            )
1188            .map_err(Error::SetupCmos)?;
1189            Some(host_tube)
1190        } else {
1191            None
1192        };
1193        let serial_devices = Self::setup_serial_devices(
1194            components.hv_cfg.protection_type,
1195            &*irq_chip,
1196            &io_bus,
1197            serial_parameters,
1198            serial_jail,
1199            #[cfg(feature = "swap")]
1200            swap_controller,
1201        )?;
1202        Self::setup_debugcon_devices(
1203            components.hv_cfg.protection_type,
1204            &io_bus,
1205            serial_parameters,
1206            debugcon_jail,
1207            #[cfg(feature = "swap")]
1208            swap_controller,
1209        )?;
1210
1211        let bios_size = if let VmImage::Bios(ref bios) = components.vm_image {
1212            bios.metadata().map_err(Error::LoadBios)?.len()
1213        } else {
1214            0
1215        };
1216        if let Some(pflash_image) = components.pflash_image {
1217            Self::setup_pflash(
1218                pflash_image,
1219                components.pflash_block_size,
1220                bios_size,
1221                &mmio_bus,
1222                pflash_jail,
1223                #[cfg(feature = "swap")]
1224                swap_controller,
1225            )?;
1226        }
1227
1228        // Functions that use/create jails MUST be used before the call to
1229        // setup_acpi_devices below, as this move us into a multiprocessing state
1230        // from which we can no longer fork.
1231
1232        let mut resume_notify_devices = Vec::new();
1233
1234        // each bus occupy 1MB mmio for pcie enhanced configuration
1235        let max_bus = (pcie_cfg_mmio_len / 0x100000 - 1) as u8;
1236        let (mut acpi_dev_resource, bat_control) = Self::setup_acpi_devices(
1237            arch_memory_layout,
1238            pci.clone(),
1239            &mem,
1240            &io_bus,
1241            system_allocator,
1242            suspend_tube_send.clone(),
1243            vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
1244            components.acpi_sdts,
1245            &*irq_chip,
1246            sci_irq,
1247            battery,
1248            &mmio_bus,
1249            max_bus,
1250            &mut resume_notify_devices,
1251            #[cfg(feature = "swap")]
1252            swap_controller,
1253            guest_suspended_cvar,
1254            &pci_irqs,
1255        )?;
1256
1257        // Create customized SSDT table
1258        let sdt = acpi::create_customize_ssdt(pci.clone(), amls, gpe_scope_amls);
1259        if let Some(sdt) = sdt {
1260            acpi_dev_resource.sdts.push(sdt);
1261        }
1262
1263        irq_chip
1264            .clone()
1265            .finalize_devices(system_allocator, &io_bus, &mmio_bus)
1266            .map_err(Error::RegisterIrqfd)?;
1267
1268        // All of these bios generated tables are set manually for the benefit of the kernel boot
1269        // flow (since there's no BIOS to set it) and for the BIOS boot flow since crosvm doesn't
1270        // have a way to pass the BIOS these configs.
1271        // This works right now because the only guest BIOS used with crosvm (u-boot) ignores these
1272        // tables and the guest OS picks them up.
1273        // If another guest does need a way to pass these tables down to it's BIOS, this approach
1274        // should be rethought.
1275
1276        // Make sure the `vcpu_count` casts below and the arithmetic in `setup_mptable` are well
1277        // defined.
1278        if vcpu_count >= u8::MAX.into() {
1279            return Err(Error::TooManyVcpus);
1280        }
1281
1282        if mptable {
1283            mptable::setup_mptable(&mem, vcpu_count as u8, &pci_irqs)
1284                .map_err(Error::SetupMptable)?;
1285        }
1286        smbios::setup_smbios(&mem, &components.smbios, bios_size).map_err(Error::SetupSmbios)?;
1287
1288        let host_cpus = if components.host_cpu_topology {
1289            components.vcpu_affinity.clone()
1290        } else {
1291            None
1292        };
1293
1294        // TODO (tjeznach) Write RSDP to bootconfig before writing to memory
1295        acpi::create_acpi_tables(
1296            &mem,
1297            vcpu_count as u8,
1298            sci_irq,
1299            0xcf9,
1300            6, // RST_CPU|SYS_RST
1301            &acpi_dev_resource,
1302            host_cpus,
1303            vcpu_ids,
1304            &pci_irqs,
1305            pcie_cfg_mmio_range.start,
1306            max_bus,
1307            components.force_s2idle,
1308        )
1309        .ok_or(Error::CreateAcpi)?;
1310
1311        let mut cmdline = Self::get_base_linux_cmdline();
1312
1313        get_serial_cmdline(&mut cmdline, serial_parameters, "io", &serial_devices)
1314            .map_err(Error::GetSerialCmdline)?;
1315
1316        for param in components.extra_kernel_params {
1317            cmdline.insert_str(&param).map_err(Error::Cmdline)?;
1318        }
1319
1320        if let Some(ramoops_region) = ramoops_region {
1321            arch::pstore::add_ramoops_kernel_cmdline(&mut cmdline, &ramoops_region)
1322                .map_err(Error::Cmdline)?;
1323        }
1324
1325        let pci_start = arch_memory_layout.pci_mmio_before_32bit.start;
1326
1327        let mut vcpu_init = vec![VcpuInitX86_64::default(); vcpu_count];
1328        let mut msrs = BTreeMap::new();
1329
1330        let protection_type = components.hv_cfg.protection_type;
1331
1332        match components.vm_image {
1333            VmImage::Bios(ref mut bios) => {
1334                if protection_type.runs_firmware() {
1335                    return Err(Error::PvmFwBiosUnsupported);
1336                }
1337
1338                // Allow a bios to hardcode CMDLINE_OFFSET and read the kernel command line from it.
1339                Self::load_cmdline(
1340                    &mem,
1341                    GuestAddress(CMDLINE_OFFSET),
1342                    cmdline,
1343                    CMDLINE_MAX_SIZE as usize - 1,
1344                )?;
1345                Self::load_bios(&mem, bios)?;
1346                regs::set_default_msrs(&mut msrs);
1347                // The default values for `Regs` and `Sregs` already set up the reset vector.
1348            }
1349            VmImage::Kernel(ref mut kernel_image) => {
1350                let (params, kernel_region, kernel_entry, mut cpu_mode, kernel_type) =
1351                    Self::load_kernel(&mem, kernel_image)?;
1352
1353                info!("Loaded {} kernel", kernel_type);
1354
1355                Self::setup_system_memory(
1356                    arch_memory_layout,
1357                    &mem,
1358                    cmdline,
1359                    components.initrd_image,
1360                    components.android_fstab,
1361                    kernel_region,
1362                    params,
1363                    dump_device_tree_blob,
1364                    device_tree_overlays,
1365                    protection_type,
1366                )?;
1367
1368                if protection_type.needs_firmware_loaded() {
1369                    arch::load_image(
1370                        &mem,
1371                        &mut components
1372                            .pvm_fw
1373                            .expect("pvmfw must be available if ProtectionType loads it"),
1374                        GuestAddress(PROTECTED_VM_FW_START),
1375                        PROTECTED_VM_FW_MAX_SIZE,
1376                    )
1377                    .map_err(Error::LoadCustomPvmFw)?;
1378                } else if protection_type.runs_firmware() {
1379                    // Tell the hypervisor to load the pVM firmware.
1380                    vm.load_protected_vm_firmware(
1381                        GuestAddress(PROTECTED_VM_FW_START),
1382                        PROTECTED_VM_FW_MAX_SIZE,
1383                    )
1384                    .map_err(Error::LoadPvmFw)?;
1385                }
1386
1387                let entry_addr = if protection_type.needs_firmware_loaded() {
1388                    Some(PROTECTED_VM_FW_START)
1389                } else if protection_type.runs_firmware() {
1390                    None // Initial RIP value is set by the hypervisor
1391                } else {
1392                    Some(kernel_entry.offset())
1393                };
1394
1395                if let Some(entry) = entry_addr {
1396                    vcpu_init[0].regs.rip = entry;
1397                }
1398
1399                match kernel_type {
1400                    KernelType::BzImage | KernelType::Elf => {
1401                        // Configure the bootstrap VCPU for the Linux/x86 boot protocol.
1402                        // <https://www.kernel.org/doc/html/latest/x86/boot.html>
1403                        vcpu_init[0].regs.rsp = BOOT_STACK_POINTER;
1404                        vcpu_init[0].regs.rsi = ZERO_PAGE_OFFSET;
1405                    }
1406                    KernelType::Multiboot => {
1407                        // Provide Multiboot-compatible bootloader information.
1408                        vcpu_init[0].regs.rax = MULTIBOOT_BOOTLOADER_MAGIC.into();
1409                        vcpu_init[0].regs.rbx = MULTIBOOT_INFO_OFFSET;
1410                    }
1411                }
1412
1413                if protection_type.runs_firmware() {
1414                    // Pass DTB address to pVM firmware. This is redundant with the DTB entry in the
1415                    // `setup_data` list, but it allows the pVM firmware to know the location of the
1416                    // DTB without having the `setup_data` region mapped yet.
1417                    if let Some(fdt_setup_data_addr) = find_setup_data(
1418                        &mem,
1419                        GuestAddress(SETUP_DATA_START),
1420                        GuestAddress(SETUP_DATA_END),
1421                        SetupDataType::Dtb,
1422                    ) {
1423                        vcpu_init[0].regs.rdx =
1424                            fdt_setup_data_addr.offset() + size_of::<setup_data_hdr>() as u64;
1425                    }
1426
1427                    // Pass pVM payload entry address to pVM firmware.
1428                    // NOTE: this is only for development purposes. An actual pvmfw
1429                    // implementation should not use this value and should instead receive
1430                    // the pVM payload start and size info from crosvm as the DTB properties
1431                    // /config/kernel-address and /config/kernel-size and determine the offset
1432                    // of the entry point on its own, not trust crosvm to provide it.
1433                    vcpu_init[0].regs.rdi = kernel_entry.offset();
1434
1435                    // The pVM firmware itself always starts in 32-bit protected mode
1436                    // with paging disabled, regardless of the type of payload.
1437                    cpu_mode = CpuMode::FlatProtectedMode;
1438                }
1439
1440                match cpu_mode {
1441                    CpuMode::LongMode => {
1442                        regs::set_long_mode_msrs(&mut msrs);
1443
1444                        // Set up long mode and enable paging.
1445                        regs::configure_segments_and_sregs(&mem, &mut vcpu_init[0].sregs)
1446                            .map_err(Error::ConfigureSegments)?;
1447                        regs::setup_page_tables(&mem, &mut vcpu_init[0].sregs)
1448                            .map_err(Error::SetupPageTables)?;
1449                    }
1450                    CpuMode::FlatProtectedMode => {
1451                        regs::set_default_msrs(&mut msrs);
1452
1453                        // Set up 32-bit protected mode with paging disabled.
1454                        regs::configure_segments_and_sregs_flat32(&mem, &mut vcpu_init[0].sregs)
1455                            .map_err(Error::ConfigureSegments)?;
1456                    }
1457                }
1458
1459                regs::set_mtrr_msrs(&mut msrs, &*vm, pci_start);
1460            }
1461        }
1462
1463        // Initialize MSRs for all VCPUs.
1464        for vcpu in vcpu_init.iter_mut() {
1465            vcpu.msrs = msrs.clone();
1466        }
1467
1468        let mut vm_request_tubes = Vec::new();
1469        if let Some(req_tube) = vm_request_tube.take() {
1470            vm_request_tubes.push(req_tube);
1471        }
1472
1473        Ok(RunnableLinuxVm {
1474            vm,
1475            vcpu_count,
1476            vcpus: None,
1477            vcpu_affinity: components.vcpu_affinity,
1478            vcpu_init,
1479            no_smt: components.no_smt,
1480            irq_chip,
1481            hypercall_bus,
1482            io_bus,
1483            mmio_bus,
1484            pid_debug_label_map,
1485            suspend_tube: (suspend_tube_send, suspend_tube_recv),
1486            resume_notify_devices,
1487            rt_cpus: components.rt_cpus,
1488            delay_rt: components.delay_rt,
1489            bat_control,
1490            pm: Some(acpi_dev_resource.pm),
1491            root_config: pci,
1492            #[cfg(any(target_os = "android", target_os = "linux"))]
1493            platform_devices: Vec::new(),
1494            hotplug_bus: BTreeMap::new(),
1495            devices_thread: None,
1496            vm_request_tubes,
1497        })
1498    }
1499
1500    fn configure_vcpu(
1501        vm: &dyn Vm,
1502        hypervisor: &dyn HypervisorX86_64,
1503        irq_chip: &dyn IrqChipX86_64,
1504        vcpu: &dyn VcpuX86_64,
1505        vcpu_init: VcpuInitX86_64,
1506        vcpu_id: usize,
1507        num_vcpus: usize,
1508        cpu_config: Option<CpuConfigX86_64>,
1509    ) -> Result<()> {
1510        let cpu_config = match cpu_config {
1511            Some(config) => config,
1512            None => return Err(Error::InvalidCpuConfig),
1513        };
1514        if !vm.check_capability(VmCap::EarlyInitCpuid) {
1515            cpuid::setup_cpuid(hypervisor, irq_chip, vcpu, vcpu_id, num_vcpus, cpu_config)
1516                .map_err(Error::SetupCpuid)?;
1517        }
1518
1519        vcpu.set_regs(&vcpu_init.regs).map_err(Error::WriteRegs)?;
1520
1521        vcpu.set_sregs(&vcpu_init.sregs)
1522            .map_err(Error::SetupSregs)?;
1523
1524        vcpu.set_fpu(&vcpu_init.fpu).map_err(Error::SetupFpu)?;
1525
1526        let vcpu_supported_var_mtrrs = regs::vcpu_supported_variable_mtrrs(vcpu);
1527        let num_var_mtrrs = regs::count_variable_mtrrs(&vcpu_init.msrs);
1528        let skip_mtrr_msrs = if num_var_mtrrs > vcpu_supported_var_mtrrs {
1529            warn!(
1530                "Too many variable MTRR entries ({} required, {} supported),
1531                please check pci_start addr, guest with pass through device may be very slow",
1532                num_var_mtrrs, vcpu_supported_var_mtrrs,
1533            );
1534            // Filter out the MTRR entries from the MSR list.
1535            true
1536        } else {
1537            false
1538        };
1539
1540        for (msr_index, value) in vcpu_init.msrs.into_iter() {
1541            if skip_mtrr_msrs && regs::is_mtrr_msr(msr_index) {
1542                continue;
1543            }
1544
1545            vcpu.set_msr(msr_index, value).map_err(Error::SetupMsrs)?;
1546        }
1547
1548        interrupts::set_lint(vcpu_id, irq_chip).map_err(Error::SetLint)?;
1549
1550        Ok(())
1551    }
1552
1553    fn register_pci_device(
1554        linux: &mut RunnableLinuxVm,
1555        device: Box<dyn PciDevice>,
1556        #[cfg(any(target_os = "android", target_os = "linux"))] minijail: Option<Minijail>,
1557        resources: &mut SystemAllocator,
1558        hp_control_tube: &mpsc::Sender<PciRootCommand>,
1559        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
1560    ) -> Result<PciAddress> {
1561        arch::configure_pci_device(
1562            linux,
1563            device,
1564            #[cfg(any(target_os = "android", target_os = "linux"))]
1565            minijail,
1566            resources,
1567            hp_control_tube,
1568            #[cfg(feature = "swap")]
1569            swap_controller,
1570        )
1571        .map_err(Error::ConfigurePciDevice)
1572    }
1573
1574    fn get_host_cpu_frequencies_khz() -> Result<BTreeMap<usize, Vec<u32>>> {
1575        Ok(BTreeMap::new())
1576    }
1577
1578    fn get_host_cpu_max_freq_khz() -> Result<BTreeMap<usize, u32>> {
1579        Ok(BTreeMap::new())
1580    }
1581
1582    fn get_host_cpu_capacity() -> Result<BTreeMap<usize, u32>> {
1583        Ok(BTreeMap::new())
1584    }
1585
1586    fn get_host_cpu_clusters() -> Result<Vec<CpuSet>> {
1587        Ok(Vec::new())
1588    }
1589}
1590
// Status bit reported in the _OSC status register (CDW1); the _OSC method of `PciRootOSC` sets
// it when the UUID passed by the caller is not the one it handles.
const OSC_STATUS_UNSUPPORT_UUID: u32 = 0x4;
// Bits of the control register (CDW3) returned by the PCI host bridge _OSC method.
// Only the SHPC hotplug and AER bits are referenced by `PciRootOSC`, which clears them; the
// remaining bits are kept for reference and therefore marked `dead_code`.
#[allow(dead_code)]
const PCI_HB_OSC_CONTROL_PCIE_HP: u32 = 0x1;
const PCI_HB_OSC_CONTROL_SHPC_HP: u32 = 0x2;
#[allow(dead_code)]
const PCI_HB_OSC_CONTROL_PCIE_PME: u32 = 0x4;
const PCI_HB_OSC_CONTROL_PCIE_AER: u32 = 0x8;
#[allow(dead_code)]
const PCI_HB_OSC_CONTROL_PCIE_CAP: u32 = 0x10;
1602
1603struct PciRootOSC {}
1604
1605// Method (_OSC, 4, NotSerialized)  // _OSC: Operating System Capabilities
1606// {
1607//     CreateDWordField (Arg3, Zero, CDW1)  // flag and return value
1608//     If (Arg0 == ToUUID ("33db4d5b-1ff7-401c-9657-7441c03dd766"))
1609//     {
1610//         CreateDWordField (Arg3, 8, CDW3) // control field
1611//         if ( 0 == (CDW1 & 0x01))  // Query flag ?
1612//         {
1613//              CDW3 &= !(SHPC_HP | AER)
1614//         }
1615//     } Else {
1616//         CDW1 |= UNSUPPORT_UUID
1617//     }
1618//     Return (Arg3)
1619// }
1620impl Aml for PciRootOSC {
1621    fn to_aml_bytes(&self, aml: &mut Vec<u8>) {
1622        let osc_uuid = "33DB4D5B-1FF7-401C-9657-7441C03DD766";
1623        // virtual pcie root port supports hotplug, pme, and pcie cap register, clear all
1624        // the other bits.
1625        let mask = !(PCI_HB_OSC_CONTROL_SHPC_HP | PCI_HB_OSC_CONTROL_PCIE_AER);
1626        aml::Method::new(
1627            "_OSC".into(),
1628            4,
1629            false,
1630            vec![
1631                &aml::CreateDWordField::new(
1632                    &aml::Name::new_field_name("CDW1"),
1633                    &aml::Arg(3),
1634                    &aml::ZERO,
1635                ),
1636                &aml::If::new(
1637                    &aml::Equal::new(&aml::Arg(0), &aml::Uuid::new(osc_uuid)),
1638                    vec![
1639                        &aml::CreateDWordField::new(
1640                            &aml::Name::new_field_name("CDW3"),
1641                            &aml::Arg(3),
1642                            &(8_u8),
1643                        ),
1644                        &aml::If::new(
1645                            &aml::Equal::new(
1646                                &aml::ZERO,
1647                                &aml::And::new(
1648                                    &aml::ZERO,
1649                                    &aml::Name::new_field_name("CDW1"),
1650                                    &aml::ONE,
1651                                ),
1652                            ),
1653                            vec![&aml::And::new(
1654                                &aml::Name::new_field_name("CDW3"),
1655                                &mask,
1656                                &aml::Name::new_field_name("CDW3"),
1657                            )],
1658                        ),
1659                    ],
1660                ),
1661                &aml::Else::new(vec![&aml::Or::new(
1662                    &aml::Name::new_field_name("CDW1"),
1663                    &OSC_STATUS_UNSUPPORT_UUID,
1664                    &aml::Name::new_field_name("CDW1"),
1665                )]),
1666                &aml::Return::new(&aml::Arg(3)),
1667            ],
1668        )
1669        .to_aml_bytes(aml)
1670    }
1671}
1672
/// Initial CPU mode reported by the kernel loader for a loaded kernel image.
///
/// Derives the same traits as the neighbouring `KernelType` so values can be copied, compared
/// and printed in logs and tests.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum CpuMode {
    /// 32-bit protected mode with paging disabled.
    FlatProtectedMode,

    /// 64-bit long mode.
    LongMode,
}
1680
/// Format of the kernel image that was loaded.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum KernelType {
    /// Image loaded through the bzImage loader.
    BzImage,
    /// Image loaded through the ELF loader.
    Elf,
    /// Image loaded through the Multiboot loader.
    Multiboot,
}

impl fmt::Display for KernelType {
    /// Writes the human-readable name of the kernel format.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let label = match self {
            KernelType::BzImage => "bzImage",
            KernelType::Elf => "ELF",
            KernelType::Multiboot => "Multiboot",
        };
        f.write_str(label)
    }
}
1697
1698impl X8664arch {
1699    /// Loads the bios from an open file.
1700    ///
1701    /// # Arguments
1702    ///
1703    /// * `mem` - The memory to be used by the guest.
1704    /// * `bios_image` - the File object for the specified bios
1705    fn load_bios(mem: &GuestMemory, bios_image: &mut File) -> Result<()> {
1706        let bios_image_length = bios_image.get_len().map_err(Error::LoadBios)?;
1707        if bios_image_length >= FIRST_ADDR_PAST_32BITS {
1708            return Err(Error::LoadBios(io::Error::new(
1709                io::ErrorKind::InvalidData,
1710                format!(
1711                    "bios was {bios_image_length} bytes, expected less than {FIRST_ADDR_PAST_32BITS}",
1712                ),
1713            )));
1714        }
1715
1716        let guest_slice = mem
1717            .get_slice_at_addr(bios_start(bios_image_length), bios_image_length as usize)
1718            .map_err(Error::SetupGuestMemory)?;
1719        bios_image
1720            .read_exact_at_volatile(guest_slice, 0)
1721            .map_err(Error::LoadBios)?;
1722        Ok(())
1723    }
1724
1725    fn setup_pflash(
1726        pflash_image: File,
1727        block_size: u32,
1728        bios_size: u64,
1729        mmio_bus: &Bus,
1730        jail: Option<Minijail>,
1731        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
1732    ) -> Result<()> {
1733        let size = pflash_image.metadata().map_err(Error::LoadPflash)?.len();
1734        let start = FIRST_ADDR_PAST_32BITS - bios_size - size;
1735        let pflash_image = Box::new(pflash_image);
1736
1737        #[cfg(any(target_os = "android", target_os = "linux"))]
1738        let fds = pflash_image.as_raw_descriptors();
1739
1740        let pflash = Pflash::new(pflash_image, block_size).map_err(Error::SetupPflash)?;
1741        let pflash: Arc<Mutex<dyn BusDevice>> = match jail {
1742            #[cfg(any(target_os = "android", target_os = "linux"))]
1743            Some(jail) => Arc::new(Mutex::new(
1744                ProxyDevice::new(
1745                    pflash,
1746                    jail,
1747                    fds,
1748                    #[cfg(feature = "swap")]
1749                    swap_controller,
1750                )
1751                .map_err(Error::CreateProxyDevice)?,
1752            )),
1753            #[cfg(windows)]
1754            Some(_) => unreachable!(),
1755            None => Arc::new(Mutex::new(pflash)),
1756        };
1757        mmio_bus
1758            .insert(pflash, start, size)
1759            .map_err(Error::InsertBus)?;
1760
1761        Ok(())
1762    }
1763
1764    /// Writes the command line string to the given memory slice.
1765    ///
1766    /// # Arguments
1767    ///
1768    /// * `guest_mem` - A u8 slice that will be partially overwritten by the command line.
1769    /// * `guest_addr` - The address in `guest_mem` at which to load the command line.
1770    /// * `cmdline` - The kernel command line.
1771    /// * `kernel_max_cmdline_len` - The maximum command line length (without NUL terminator)
1772    ///   supported by the kernel.
1773    fn load_cmdline(
1774        guest_mem: &GuestMemory,
1775        guest_addr: GuestAddress,
1776        cmdline: kernel_cmdline::Cmdline,
1777        kernel_max_cmdline_len: usize,
1778    ) -> Result<()> {
1779        let mut cmdline_guest_mem_slice = guest_mem
1780            .get_slice_at_addr(guest_addr, CMDLINE_MAX_SIZE as usize)
1781            .map_err(|_| Error::CommandLineOverflow)?;
1782
1783        let mut cmdline_bytes: Vec<u8> = cmdline
1784            .into_bytes_with_max_len(kernel_max_cmdline_len)
1785            .map_err(Error::Cmdline)?;
1786        cmdline_bytes.push(0u8); // Add NUL terminator.
1787
1788        cmdline_guest_mem_slice
1789            .write_all(&cmdline_bytes)
1790            .map_err(|_| Error::CommandLineOverflow)?;
1791
1792        Ok(())
1793    }
1794
1795    /// Loads the kernel from an open file.
1796    ///
1797    /// # Arguments
1798    ///
1799    /// * `mem` - The memory to be used by the guest.
1800    /// * `kernel_image` - the File object for the specified kernel.
1801    ///
1802    /// # Returns
1803    ///
1804    /// On success, returns the Linux x86_64 boot protocol parameters, the address range containing
1805    /// the kernel, the entry point (initial `RIP` value), the initial CPU mode, and the type of
1806    /// kernel.
1807    fn load_kernel(
1808        mem: &GuestMemory,
1809        kernel_image: &mut File,
1810    ) -> Result<(boot_params, AddressRange, GuestAddress, CpuMode, KernelType)> {
1811        let kernel_start = GuestAddress(KERNEL_START_OFFSET);
1812
1813        let multiboot =
1814            kernel_loader::multiboot_header_from_file(kernel_image).map_err(Error::LoadKernel)?;
1815
1816        if let Some(multiboot_load) = multiboot.as_ref().and_then(|m| m.load.as_ref()) {
1817            let loaded_kernel = kernel_loader::load_multiboot(mem, kernel_image, multiboot_load)
1818                .map_err(Error::LoadKernel)?;
1819
1820            let boot_params = boot_params {
1821                hdr: setup_header {
1822                    cmdline_size: CMDLINE_MAX_SIZE as u32 - 1,
1823                    ..Default::default()
1824                },
1825                ..Default::default()
1826            };
1827            return Ok((
1828                boot_params,
1829                loaded_kernel.address_range,
1830                loaded_kernel.entry,
1831                CpuMode::FlatProtectedMode,
1832                KernelType::Multiboot,
1833            ));
1834        }
1835
1836        match kernel_loader::load_elf(mem, kernel_start, kernel_image, 0) {
1837            Ok(loaded_kernel) => {
1838                // ELF kernels don't contain a `boot_params` structure, so synthesize a default one.
1839                let boot_params = boot_params {
1840                    hdr: setup_header {
1841                        cmdline_size: CMDLINE_MAX_SIZE as u32 - 1,
1842                        ..Default::default()
1843                    },
1844                    ..Default::default()
1845                };
1846                Ok((
1847                    boot_params,
1848                    loaded_kernel.address_range,
1849                    loaded_kernel.entry,
1850                    match loaded_kernel.class {
1851                        kernel_loader::ElfClass::ElfClass32 => CpuMode::FlatProtectedMode,
1852                        kernel_loader::ElfClass::ElfClass64 => CpuMode::LongMode,
1853                    },
1854                    KernelType::Elf,
1855                ))
1856            }
1857            Err(kernel_loader::Error::InvalidMagicNumber) => {
1858                // The image failed to parse as ELF, so try to load it as a bzImage.
1859                let (boot_params, bzimage_region, bzimage_entry, cpu_mode) =
1860                    bzimage::load_bzimage(mem, kernel_start, kernel_image)
1861                        .map_err(Error::LoadBzImage)?;
1862                Ok((
1863                    boot_params,
1864                    bzimage_region,
1865                    bzimage_entry,
1866                    cpu_mode,
1867                    KernelType::BzImage,
1868                ))
1869            }
1870            Err(e) => Err(Error::LoadKernel(e)),
1871        }
1872    }
1873
1874    /// Configures the system memory space should be called once per vm before
1875    /// starting vcpu threads.
1876    ///
1877    /// # Arguments
1878    ///
1879    /// * `mem` - The memory to be used by the guest.
1880    /// * `cmdline` - the kernel commandline
1881    /// * `initrd_file` - an initial ramdisk image
1882    pub fn setup_system_memory(
1883        arch_memory_layout: &ArchMemoryLayout,
1884        mem: &GuestMemory,
1885        cmdline: kernel_cmdline::Cmdline,
1886        initrd_file: Option<File>,
1887        android_fstab: Option<File>,
1888        kernel_region: AddressRange,
1889        params: boot_params,
1890        dump_device_tree_blob: Option<PathBuf>,
1891        device_tree_overlays: Vec<DtbOverlay>,
1892        protection_type: ProtectionType,
1893    ) -> Result<()> {
1894        let e820_entries = generate_e820_memory_map(arch_memory_layout, mem)?;
1895
1896        let kernel_max_cmdline_len = if params.hdr.cmdline_size == 0 {
1897            // Old kernels have a maximum length of 255 bytes, not including the NUL.
1898            255
1899        } else {
1900            params.hdr.cmdline_size as usize
1901        };
1902        debug!("kernel_max_cmdline_len={kernel_max_cmdline_len}");
1903        Self::load_cmdline(
1904            mem,
1905            GuestAddress(CMDLINE_OFFSET),
1906            cmdline,
1907            kernel_max_cmdline_len,
1908        )?;
1909
1910        let initrd = match initrd_file {
1911            Some(mut initrd_file) => {
1912                let initrd_addr_max = if params.hdr.xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G != 0 {
1913                    u64::MAX
1914                } else if params.hdr.initrd_addr_max == 0 {
1915                    // Default initrd_addr_max for old kernels (see Documentation/x86/boot.txt).
1916                    0x37FFFFFF
1917                } else {
1918                    u64::from(params.hdr.initrd_addr_max)
1919                };
1920
1921                let (initrd_start, initrd_size) = arch::load_image_high(
1922                    mem,
1923                    &mut initrd_file,
1924                    GuestAddress(kernel_region.end + 1),
1925                    GuestAddress(initrd_addr_max),
1926                    Some(|region| {
1927                        region.options.purpose != MemoryRegionPurpose::ProtectedFirmwareRegion
1928                    }),
1929                    base::pagesize() as u64,
1930                )
1931                .map_err(Error::LoadInitrd)?;
1932                Some((initrd_start, initrd_size))
1933            }
1934            None => None,
1935        };
1936
1937        let mut setup_data_entries =
1938            SetupDataEntries::new(SETUP_DATA_START as usize, SETUP_DATA_END as usize);
1939
1940        let setup_data_size = setup_data_entries.insert(setup_data_rng_seed());
1941
1942        // SETUP_DTB should be the last one in SETUP_DATA.
1943        // This is to reserve enough space for SETUP_DTB
1944        // without exceeding the size of SETUP_DATA area.
1945        if android_fstab.is_some()
1946            || !device_tree_overlays.is_empty()
1947            || protection_type.runs_firmware()
1948        {
1949            let fdt_max_size = min(X86_64_FDT_MAX_SIZE as usize, setup_data_size);
1950            let mut device_tree_blob = fdt::create_fdt(
1951                mem,
1952                android_fstab,
1953                dump_device_tree_blob,
1954                device_tree_overlays,
1955                kernel_region,
1956                initrd,
1957            )
1958            .map_err(Error::CreateFdt)?;
1959            if device_tree_blob.len() > fdt_max_size {
1960                return Err(Error::DTBSizeGreaterThanAllowed);
1961            }
1962
1963            // Reserve and zero fill dtb memory to maximum allowable size
1964            // so that pvmfw could patch and extend the dtb in-place.
1965            device_tree_blob.resize(fdt_max_size, 0);
1966
1967            setup_data_entries.insert(SetupData {
1968                data: device_tree_blob,
1969                type_: SetupDataType::Dtb,
1970            });
1971        }
1972
1973        let setup_data = setup_data_entries.write_setup_data(mem)?;
1974
1975        configure_boot_params(
1976            mem,
1977            GuestAddress(CMDLINE_OFFSET),
1978            setup_data,
1979            initrd,
1980            params,
1981            &e820_entries,
1982        )?;
1983
1984        configure_multiboot_info(mem, GuestAddress(CMDLINE_OFFSET), &e820_entries)?;
1985
1986        Ok(())
1987    }
1988
1989    fn get_pcie_vcfg_mmio_range(mem: &GuestMemory, pcie_cfg_mmio: &AddressRange) -> AddressRange {
1990        // Put PCIe VCFG region at a 2MB boundary after physical memory or 4gb, whichever is
1991        // greater.
1992        let ram_end_round_2mb = mem.end_addr().offset().next_multiple_of(2 * MB);
1993        let start = std::cmp::max(ram_end_round_2mb, 4 * GB);
1994        // Each pci device's ECAM size is 4kb and its vcfg size is 8kb
1995        let end = start + pcie_cfg_mmio.len().unwrap() * 2 - 1;
1996        AddressRange { start, end }
1997    }
1998
1999    /// Returns the high mmio range
2000    fn get_high_mmio_range(vm: &dyn Vm, arch_memory_layout: &ArchMemoryLayout) -> AddressRange {
2001        let mem = vm.get_memory();
2002        let start = Self::get_pcie_vcfg_mmio_range(mem, &arch_memory_layout.pcie_cfg_mmio).end + 1;
2003
2004        let phys_mem_end = (1u64 << vm.get_guest_phys_addr_bits()) - 1;
2005        let high_mmio_end = std::cmp::min(phys_mem_end, HIGH_MMIO_MAX_END);
2006
2007        AddressRange {
2008            start,
2009            end: high_mmio_end,
2010        }
2011    }
2012
2013    /// This returns a minimal kernel command for this architecture
2014    pub fn get_base_linux_cmdline() -> kernel_cmdline::Cmdline {
2015        let mut cmdline = kernel_cmdline::Cmdline::new();
2016        cmdline.insert_str("panic=-1").unwrap();
2017
2018        cmdline
2019    }
2020
2021    /// Sets up fw_cfg device.
2022    ///  # Arguments
2023    ///
2024    /// * `io_bus` - the IO bus object
2025    /// * `fw_cfg_parameters` - command-line specified data to add to device. May contain all None
2026    ///   fields if user did not specify data to add to the device
2027    fn setup_fw_cfg_device(
2028        io_bus: &Bus,
2029        fw_cfg_parameters: Vec<FwCfgParameters>,
2030        bootorder_fw_cfg_blob: Vec<u8>,
2031        fw_cfg_jail: Option<Minijail>,
2032        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
2033    ) -> Result<()> {
2034        let fw_cfg = match devices::FwCfgDevice::new(FW_CFG_MAX_FILE_SLOTS, fw_cfg_parameters) {
2035            Ok(mut device) => {
2036                // this condition will only be true if the user specified at least one bootindex
2037                // option on the command line. If none were specified, bootorder_fw_cfg_blob will
2038                // only have a null byte (null terminator)
2039                if bootorder_fw_cfg_blob.len() > 1 {
2040                    // Add boot order file to the device. If the file is not present, firmware may
2041                    // not be able to boot.
2042                    if let Err(err) = device.add_file(
2043                        "bootorder",
2044                        bootorder_fw_cfg_blob,
2045                        devices::FwCfgItemType::GenericItem,
2046                    ) {
2047                        return Err(Error::CreateFwCfgDevice(err));
2048                    }
2049                }
2050                device
2051            }
2052            Err(err) => {
2053                return Err(Error::CreateFwCfgDevice(err));
2054            }
2055        };
2056
2057        let fw_cfg: Arc<Mutex<dyn BusDevice>> = match fw_cfg_jail.as_ref() {
2058            #[cfg(any(target_os = "android", target_os = "linux"))]
2059            Some(jail) => {
2060                let jail_clone = jail.try_clone().map_err(Error::CloneJail)?;
2061                #[cfg(feature = "seccomp_trace")]
2062                debug!(
2063                    "seccomp_trace {{\"event\": \"minijail_clone\", \"src_jail_addr\": \"0x{:x}\", \"dst_jail_addr\": \"0x{:x}\"}}",
2064                    read_jail_addr(jail),
2065                    read_jail_addr(&jail_clone)
2066                );
2067                Arc::new(Mutex::new(
2068                    ProxyDevice::new(
2069                        fw_cfg,
2070                        jail_clone,
2071                        Vec::new(),
2072                        #[cfg(feature = "swap")]
2073                        swap_controller,
2074                    )
2075                    .map_err(Error::CreateProxyDevice)?,
2076                ))
2077            }
2078            #[cfg(windows)]
2079            Some(_) => unreachable!(),
2080            None => Arc::new(Mutex::new(fw_cfg)),
2081        };
2082
2083        io_bus
2084            .insert(fw_cfg, FW_CFG_BASE_PORT, FW_CFG_WIDTH)
2085            .map_err(Error::InsertBus)?;
2086
2087        Ok(())
2088    }
2089
2090    /// Sets up the legacy x86 i8042/KBD platform device
2091    ///
2092    /// # Arguments
2093    ///
2094    /// * - `io_bus` - the IO bus object
2095    /// * - `pit_uses_speaker_port` - does the PIT use port 0x61 for the PC speaker
2096    /// * - `vm_evt_wrtube` - the event object which should receive exit events
2097    pub fn setup_legacy_i8042_device(
2098        io_bus: &Bus,
2099        pit_uses_speaker_port: bool,
2100        vm_evt_wrtube: SendTube,
2101    ) -> Result<()> {
2102        let i8042 = Arc::new(Mutex::new(devices::I8042Device::new(
2103            vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
2104        )));
2105
2106        if pit_uses_speaker_port {
2107            io_bus.insert(i8042, 0x062, 0x3).unwrap();
2108        } else {
2109            io_bus.insert(i8042, 0x061, 0x4).unwrap();
2110        }
2111
2112        Ok(())
2113    }
2114
2115    /// Sets up the legacy x86 CMOS/RTC platform device
2116    /// # Arguments
2117    ///
2118    /// * - `io_bus` - the IO bus object
2119    /// * - `mem_size` - the size in bytes of physical ram for the guest
2120    pub fn setup_legacy_cmos_device(
2121        arch_memory_layout: &ArchMemoryLayout,
2122        io_bus: &Bus,
2123        irq_chip: Arc<dyn IrqChipArch>,
2124        vm_control: Tube,
2125        mem_size: u64,
2126    ) -> anyhow::Result<()> {
2127        let mem_regions = arch_memory_regions(arch_memory_layout, mem_size, None);
2128
2129        let mem_below_4g = mem_regions
2130            .iter()
2131            .filter(|r| r.0.offset() < FIRST_ADDR_PAST_32BITS)
2132            .map(|r| r.1)
2133            .sum();
2134
2135        let mem_above_4g = mem_regions
2136            .iter()
2137            .filter(|r| r.0.offset() >= FIRST_ADDR_PAST_32BITS)
2138            .map(|r| r.1)
2139            .sum();
2140
2141        let irq_evt = devices::IrqEdgeEvent::new().context("cmos irq")?;
2142        let cmos = devices::cmos::Cmos::new(
2143            mem_below_4g,
2144            mem_above_4g,
2145            Utc::now,
2146            vm_control,
2147            irq_evt.try_clone().context("cmos irq clone")?,
2148        )
2149        .context("create cmos")?;
2150
2151        irq_chip
2152            .register_edge_irq_event(
2153                devices::cmos::RTC_IRQ as u32,
2154                &irq_evt,
2155                IrqEventSource::from_device(&cmos),
2156            )
2157            .context("cmos register irq")?;
2158        io_bus
2159            .insert(Arc::new(Mutex::new(cmos)), 0x70, 0x2)
2160            .context("cmos insert irq")?;
2161
2162        Ok(())
2163    }
2164
2165    /// Sets up the acpi devices for this platform and
2166    /// return the resources which is used to set the ACPI tables.
2167    ///
2168    /// # Arguments
2169    ///
2170    /// * `io_bus` the I/O bus to add the devices to
2171    /// * `resources` the SystemAllocator to allocate IO and MMIO for acpi devices.
2172    /// * `suspend_tube` the tube object which used to suspend/resume the VM.
2173    /// * `sdts` ACPI system description tables
2174    /// * `irq_chip` the IrqChip object for registering irq events
2175    /// * `battery` indicate whether to create the battery
2176    /// * `mmio_bus` the MMIO bus to add the devices to
2177    /// * `pci_irqs` IRQ assignment of PCI devices. Tuples of (PCI address, gsi, PCI interrupt pin).
2178    ///   Note that this matches one of the return values of generate_pci_root.
2179    pub fn setup_acpi_devices(
2180        arch_memory_layout: &ArchMemoryLayout,
2181        pci_root: Arc<Mutex<PciRoot>>,
2182        mem: &GuestMemory,
2183        io_bus: &Bus,
2184        resources: &mut SystemAllocator,
2185        suspend_tube: Arc<Mutex<SendTube>>,
2186        vm_evt_wrtube: SendTube,
2187        sdts: Vec<SDT>,
2188        irq_chip: &dyn IrqChip,
2189        sci_irq: u32,
2190        battery: (Option<BatteryType>, Option<Minijail>),
2191        #[cfg_attr(windows, allow(unused_variables))] mmio_bus: &Bus,
2192        max_bus: u8,
2193        resume_notify_devices: &mut Vec<Arc<Mutex<dyn BusResumeDevice>>>,
2194        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
2195        guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>,
2196        pci_irqs: &[(PciAddress, u32, PciInterruptPin)],
2197    ) -> Result<(acpi::AcpiDevResource, Option<BatControl>)> {
2198        // The AML data for the acpi devices
2199        let mut amls = Vec::new();
2200
2201        let bat_control = if let Some(battery_type) = battery.0 {
2202            match battery_type {
2203                #[cfg(any(target_os = "android", target_os = "linux"))]
2204                BatteryType::Goldfish => {
2205                    let irq_num = resources.allocate_irq().ok_or(Error::CreateBatDevices(
2206                        arch::DeviceRegistrationError::AllocateIrq,
2207                    ))?;
2208                    let (control_tube, _mmio_base) = arch::sys::linux::add_goldfish_battery(
2209                        &mut amls,
2210                        battery.1,
2211                        mmio_bus,
2212                        irq_chip,
2213                        irq_num,
2214                        resources,
2215                        #[cfg(feature = "swap")]
2216                        swap_controller,
2217                    )
2218                    .map_err(Error::CreateBatDevices)?;
2219                    Some(BatControl {
2220                        type_: BatteryType::Goldfish,
2221                        control_tube,
2222                    })
2223                }
2224                #[cfg(windows)]
2225                _ => None,
2226            }
2227        } else {
2228            None
2229        };
2230
2231        let pm_alloc = resources.get_anon_alloc();
2232        let pm_iobase = match resources.io_allocator() {
2233            Some(io) => io
2234                .allocate_with_align(
2235                    devices::acpi::ACPIPM_RESOURCE_LEN as u64,
2236                    pm_alloc,
2237                    "ACPIPM".to_string(),
2238                    4, // must be 32-bit aligned
2239                )
2240                .map_err(Error::AllocateIOResouce)?,
2241            None => 0x600,
2242        };
2243
2244        let pcie_vcfg = aml::Name::new(
2245            "VCFG".into(),
2246            &Self::get_pcie_vcfg_mmio_range(mem, &arch_memory_layout.pcie_cfg_mmio).start,
2247        );
2248        pcie_vcfg.to_aml_bytes(&mut amls);
2249
2250        let pm_sci_evt = devices::IrqLevelEvent::new().map_err(Error::CreateEvent)?;
2251
2252        //Virtual PMC
2253        if let Some(guest_suspended_cvar) = guest_suspended_cvar {
2254            let alloc = resources.get_anon_alloc();
2255            let mmio_base = resources
2256                .allocate_mmio(
2257                    devices::pmc_virt::VPMC_VIRT_MMIO_SIZE,
2258                    alloc,
2259                    "VirtualPmc".to_string(),
2260                    resources::AllocOptions::new().align(devices::pmc_virt::VPMC_VIRT_MMIO_SIZE),
2261                )
2262                .unwrap();
2263
2264            let pmc_virtio_mmio =
2265                Arc::new(Mutex::new(VirtualPmc::new(mmio_base, guest_suspended_cvar)));
2266            mmio_bus
2267                .insert(
2268                    pmc_virtio_mmio.clone(),
2269                    mmio_base,
2270                    devices::pmc_virt::VPMC_VIRT_MMIO_SIZE,
2271                )
2272                .unwrap();
2273            pmc_virtio_mmio.lock().to_aml_bytes(&mut amls);
2274        }
2275
2276        let mut pmresource = devices::ACPIPMResource::new(
2277            pm_sci_evt.try_clone().map_err(Error::CloneEvent)?,
2278            suspend_tube,
2279            vm_evt_wrtube,
2280        );
2281        pmresource.to_aml_bytes(&mut amls);
2282        irq_chip
2283            .register_level_irq_event(
2284                sci_irq,
2285                &pm_sci_evt,
2286                IrqEventSource::from_device(&pmresource),
2287            )
2288            .map_err(Error::RegisterIrqfd)?;
2289        pmresource.start();
2290
2291        let mut crs_entries: Vec<Box<dyn Aml>> = vec![
2292            Box::new(aml::AddressSpace::new_bus_number(0x0u16, max_bus as u16)),
2293            Box::new(aml::IO::new(0xcf8, 0xcf8, 1, 0x8)),
2294        ];
2295        for r in resources.mmio_pools() {
2296            let entry: Box<dyn Aml> = match (u32::try_from(r.start), u32::try_from(r.end)) {
2297                (Ok(start), Ok(end)) => Box::new(aml::AddressSpace::new_memory(
2298                    aml::AddressSpaceCachable::NotCacheable,
2299                    true,
2300                    start,
2301                    end,
2302                )),
2303                _ => Box::new(aml::AddressSpace::new_memory(
2304                    aml::AddressSpaceCachable::NotCacheable,
2305                    true,
2306                    r.start,
2307                    r.end,
2308                )),
2309            };
2310            crs_entries.push(entry);
2311        }
2312
2313        let prt_entries: Vec<aml::Package> = pci_irqs
2314            .iter()
2315            .map(|(pci_address, gsi, pci_intr_pin)| {
2316                aml::Package::new(vec![
2317                    &pci_address.acpi_adr(),
2318                    &pci_intr_pin.to_mask(),
2319                    &aml::ZERO,
2320                    gsi,
2321                ])
2322            })
2323            .collect();
2324
2325        aml::Device::new(
2326            "_SB_.PC00".into(),
2327            vec![
2328                &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A08")),
2329                &aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A03")),
2330                &aml::Name::new("_ADR".into(), &aml::ZERO),
2331                &aml::Name::new("_SEG".into(), &aml::ZERO),
2332                &aml::Name::new("_UID".into(), &aml::ZERO),
2333                &aml::Name::new("SUPP".into(), &aml::ZERO),
2334                &aml::Name::new(
2335                    "_CRS".into(),
2336                    &aml::ResourceTemplate::new(crs_entries.iter().map(|b| b.as_ref()).collect()),
2337                ),
2338                &PciRootOSC {},
2339                &aml::Name::new(
2340                    "_PRT".into(),
2341                    &aml::Package::new(prt_entries.iter().map(|p| p as &dyn Aml).collect()),
2342                ),
2343            ],
2344        )
2345        .to_aml_bytes(&mut amls);
2346
2347        if let (Some(start), Some(len)) = (
2348            u32::try_from(arch_memory_layout.pcie_cfg_mmio.start).ok(),
2349            arch_memory_layout
2350                .pcie_cfg_mmio
2351                .len()
2352                .and_then(|l| u32::try_from(l).ok()),
2353        ) {
2354            aml::Device::new(
2355                "_SB_.MB00".into(),
2356                vec![
2357                    &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C02")),
2358                    &aml::Name::new(
2359                        "_CRS".into(),
2360                        &aml::ResourceTemplate::new(vec![&aml::Memory32Fixed::new(
2361                            true, start, len,
2362                        )]),
2363                    ),
2364                ],
2365            )
2366            .to_aml_bytes(&mut amls);
2367        } else {
2368            warn!("Failed to create ACPI MMCFG region reservation");
2369        }
2370
2371        let root_bus = pci_root.lock().get_root_bus();
2372        let addresses = root_bus.lock().get_downstream_devices();
2373        for address in addresses {
2374            if let Some(acpi_path) = pci_root.lock().acpi_path(&address) {
2375                const DEEPEST_SLEEP_STATE: u32 = 3;
2376                aml::Device::new(
2377                    (*acpi_path).into(),
2378                    vec![
2379                        &aml::Name::new("_ADR".into(), &address.acpi_adr()),
2380                        &aml::Name::new(
2381                            "_PRW".into(),
2382                            &aml::Package::new(vec![&PM_WAKEUP_GPIO, &DEEPEST_SLEEP_STATE]),
2383                        ),
2384                    ],
2385                )
2386                .to_aml_bytes(&mut amls);
2387            }
2388        }
2389
2390        let pm = Arc::new(Mutex::new(pmresource));
2391        io_bus
2392            .insert(
2393                pm.clone(),
2394                pm_iobase,
2395                devices::acpi::ACPIPM_RESOURCE_LEN as u64,
2396            )
2397            .unwrap();
2398        resume_notify_devices.push(pm.clone());
2399
2400        Ok((
2401            acpi::AcpiDevResource {
2402                amls,
2403                pm_iobase,
2404                pm,
2405                sdts,
2406            },
2407            bat_control,
2408        ))
2409    }
2410
2411    /// Sets up the serial devices for this platform. Returns a list of configured serial devices.
2412    ///
2413    /// # Arguments
2414    ///
2415    /// * - `irq_chip` the IrqChip object for registering irq events
2416    /// * - `io_bus` the I/O bus to add the devices to
2417    /// * - `serial_parameters` - definitions for how the serial devices should be configured
2418    pub fn setup_serial_devices(
2419        protection_type: ProtectionType,
2420        irq_chip: &dyn IrqChip,
2421        io_bus: &Bus,
2422        serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
2423        serial_jail: Option<Minijail>,
2424        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
2425    ) -> Result<Vec<SerialDeviceInfo>> {
2426        let com_evt_1_3 = devices::IrqEdgeEvent::new().map_err(Error::CreateEvent)?;
2427        let com_evt_2_4 = devices::IrqEdgeEvent::new().map_err(Error::CreateEvent)?;
2428
2429        let serial_devices = arch::add_serial_devices(
2430            protection_type,
2431            io_bus,
2432            (X86_64_SERIAL_1_3_IRQ, com_evt_1_3.get_trigger()),
2433            (X86_64_SERIAL_2_4_IRQ, com_evt_2_4.get_trigger()),
2434            serial_parameters,
2435            serial_jail,
2436            #[cfg(feature = "swap")]
2437            swap_controller,
2438        )
2439        .map_err(Error::CreateSerialDevices)?;
2440
2441        let source = IrqEventSource {
2442            device_id: Serial::device_id(),
2443            queue_id: 0,
2444            device_name: Serial::debug_label(),
2445        };
2446        irq_chip
2447            .register_edge_irq_event(X86_64_SERIAL_1_3_IRQ, &com_evt_1_3, source.clone())
2448            .map_err(Error::RegisterIrqfd)?;
2449        irq_chip
2450            .register_edge_irq_event(X86_64_SERIAL_2_4_IRQ, &com_evt_2_4, source)
2451            .map_err(Error::RegisterIrqfd)?;
2452
2453        Ok(serial_devices)
2454    }
2455
2456    fn setup_debugcon_devices(
2457        protection_type: ProtectionType,
2458        io_bus: &Bus,
2459        serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
2460        debugcon_jail: Option<Minijail>,
2461        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
2462    ) -> Result<()> {
2463        for param in serial_parameters.values() {
2464            if param.hardware != SerialHardware::Debugcon {
2465                continue;
2466            }
2467
2468            let mut preserved_fds = Vec::new();
2469            let con = param
2470                .create_serial_device::<Debugcon>(
2471                    protection_type,
2472                    // Debugcon doesn't use the interrupt event
2473                    &Event::new().map_err(Error::CreateEvent)?,
2474                    &mut preserved_fds,
2475                )
2476                .map_err(Error::CreateDebugconDevice)?;
2477
2478            let con: Arc<Mutex<dyn BusDevice>> = match debugcon_jail.as_ref() {
2479                #[cfg(any(target_os = "android", target_os = "linux"))]
2480                Some(jail) => {
2481                    let jail_clone = jail.try_clone().map_err(Error::CloneJail)?;
2482                    #[cfg(feature = "seccomp_trace")]
2483                    debug!(
2484                        "seccomp_trace {{\"event\": \"minijail_clone\", \"src_jail_addr\": \"0x{:x}\", \"dst_jail_addr\": \"0x{:x}\"}}",
2485                        read_jail_addr(jail),
2486                        read_jail_addr(&jail_clone)
2487                    );
2488                    Arc::new(Mutex::new(
2489                        ProxyDevice::new(
2490                            con,
2491                            jail_clone,
2492                            preserved_fds,
2493                            #[cfg(feature = "swap")]
2494                            swap_controller,
2495                        )
2496                        .map_err(Error::CreateProxyDevice)?,
2497                    ))
2498                }
2499                #[cfg(windows)]
2500                Some(_) => unreachable!(),
2501                None => Arc::new(Mutex::new(con)),
2502            };
2503            io_bus
2504                .insert(con.clone(), param.debugcon_port.into(), 1)
2505                .map_err(Error::InsertBus)?;
2506        }
2507
2508        Ok(())
2509    }
2510}
2511
2512#[sorted]
2513#[derive(Error, Debug)]
2514pub enum MsrError {
2515    #[error("CPU not support. Only intel CPUs support ITMT.")]
2516    CpuUnSupport,
2517    #[error("msr must be unique: {0}")]
2518    MsrDuplicate(u32),
2519}
2520
2521#[derive(Error, Debug)]
2522pub enum HybridSupportError {
2523    #[error("Host CPU doesn't support hybrid architecture.")]
2524    UnsupportedHostCpu,
2525}
2526
/// Bundles the two CPUID query functions so callers can substitute fakes in tests.
pub struct CpuIdCall {
    /// Sub-leaf aware query: `__cpuid_count`, or a stand-in function for tests.
    cpuid_count: unsafe fn(u32, u32) -> CpuidResult,
    /// Plain leaf query: `__cpuid`, or a stand-in function for tests.
    cpuid: unsafe fn(u32) -> CpuidResult,
}

impl CpuIdCall {
    /// Creates a wrapper around the given CPUID functions.
    ///
    /// # Arguments
    ///
    /// * `cpuid_count` - function taking two `u32` inputs (e.g. `__cpuid_count`)
    /// * `cpuid` - function taking a single `u32` input (e.g. `__cpuid`)
    pub fn new(
        cpuid_count: unsafe fn(u32, u32) -> CpuidResult,
        cpuid: unsafe fn(u32) -> CpuidResult,
    ) -> Self {
        Self { cpuid, cpuid_count }
    }
}
2543
2544/// Check if host supports hybrid CPU feature. The check include:
2545///     1. Check if CPUID.1AH exists. CPUID.1AH is hybrid information enumeration leaf.
2546///     2. Check if CPUID.07H.00H:EDX[bit 15] sets. This bit means the processor is identified as a
2547///        hybrid part.
2548///     3. Check if CPUID.1AH:EAX sets. The hybrid core type is set in EAX.
2549///
2550/// # Arguments
2551///
2552/// * - `cpuid` the wrapped cpuid functions used to get CPUID info.
2553pub fn check_host_hybrid_support(cpuid: &CpuIdCall) -> std::result::Result<(), HybridSupportError> {
2554    // CPUID.0H.EAX returns maximum input value for basic CPUID information.
2555    //
2556    // SAFETY:
2557    // Safe because we pass 0 for this call and the host supports the
2558    // `cpuid` instruction.
2559    let mut cpuid_entry = unsafe { (cpuid.cpuid)(0x0) };
2560    if cpuid_entry.eax < 0x1A {
2561        return Err(HybridSupportError::UnsupportedHostCpu);
2562    }
2563    // SAFETY:
2564    // Safe because we pass 0x7 and 0 for this call and the host supports the
2565    // `cpuid` instruction.
2566    cpuid_entry = unsafe { (cpuid.cpuid_count)(0x7, 0) };
2567    if cpuid_entry.edx & 1 << EDX_HYBRID_CPU_SHIFT == 0 {
2568        return Err(HybridSupportError::UnsupportedHostCpu);
2569    }
2570    // From SDM, if a value entered for CPUID.EAX is less than or equal to the
2571    // maximum input value and the leaf is not supported on that processor then
2572    // 0 is returned in all the registers.
2573    // For the CPU with hybrid support, its CPUID.1AH.EAX shouldn't be zero.
2574    //
2575    // SAFETY:
2576    // Safe because we pass 0 for this call and the host supports the
2577    // `cpuid` instruction.
2578    cpuid_entry = unsafe { (cpuid.cpuid)(0x1A) };
2579    if cpuid_entry.eax == 0 {
2580        return Err(HybridSupportError::UnsupportedHostCpu);
2581    }
2582    Ok(())
2583}
2584
2585#[cfg(test)]
2586mod tests {
2587    use std::mem::size_of;
2588
2589    use super::*;
2590
2591    fn setup() -> ArchMemoryLayout {
2592        let pci_config = PciConfig {
2593            ecam: Some(MemoryRegionConfig {
2594                start: 3 * GB,
2595                size: Some(256 * MB),
2596            }),
2597            mem: Some(MemoryRegionConfig {
2598                start: 2 * GB,
2599                size: None,
2600            }),
2601        };
2602        create_arch_memory_layout(&pci_config, false).unwrap()
2603    }
2604
2605    #[test]
2606    fn regions_lt_4gb_nobios() {
2607        let arch_memory_layout = setup();
2608        let regions = arch_memory_regions(&arch_memory_layout, 512 * MB, /* bios_size */ None);
2609        assert_eq!(
2610            regions,
2611            [
2612                (
2613                    GuestAddress(0),
2614                    640 * KB,
2615                    MemoryRegionOptions {
2616                        align: 0,
2617                        purpose: MemoryRegionPurpose::GuestMemoryRegion,
2618                        file_backed: None,
2619                    },
2620                ),
2621                (
2622                    GuestAddress(640 * KB),
2623                    384 * KB,
2624                    MemoryRegionOptions {
2625                        align: 0,
2626                        purpose: MemoryRegionPurpose::ReservedMemory,
2627                        file_backed: None,
2628                    },
2629                ),
2630                (
2631                    GuestAddress(1 * MB),
2632                    512 * MB - 1 * MB,
2633                    MemoryRegionOptions {
2634                        align: 0,
2635                        purpose: MemoryRegionPurpose::GuestMemoryRegion,
2636                        file_backed: None,
2637                    },
2638                )
2639            ]
2640        );
2641    }
2642
2643    #[test]
2644    fn regions_gt_4gb_nobios() {
2645        let arch_memory_layout = setup();
2646        let size = 4 * GB + 0x8000;
2647        let regions = arch_memory_regions(&arch_memory_layout, size, /* bios_size */ None);
2648        assert_eq!(
2649            regions,
2650            [
2651                (
2652                    GuestAddress(0),
2653                    640 * KB,
2654                    MemoryRegionOptions {
2655                        align: 0,
2656                        purpose: MemoryRegionPurpose::GuestMemoryRegion,
2657                        file_backed: None,
2658                    },
2659                ),
2660                (
2661                    GuestAddress(640 * KB),
2662                    384 * KB,
2663                    MemoryRegionOptions {
2664                        align: 0,
2665                        purpose: MemoryRegionPurpose::ReservedMemory,
2666                        file_backed: None,
2667                    },
2668                ),
2669                (
2670                    GuestAddress(1 * MB),
2671                    2 * GB - 1 * MB,
2672                    MemoryRegionOptions {
2673                        align: 0,
2674                        purpose: MemoryRegionPurpose::GuestMemoryRegion,
2675                        file_backed: None,
2676                    },
2677                ),
2678                (
2679                    GuestAddress(4 * GB),
2680                    2 * GB + 0x8000,
2681                    MemoryRegionOptions {
2682                        align: 0,
2683                        purpose: MemoryRegionPurpose::GuestMemoryRegion,
2684                        file_backed: None,
2685                    },
2686                ),
2687            ]
2688        );
2689    }
2690
2691    #[test]
2692    fn regions_lt_4gb_bios() {
2693        let arch_memory_layout = setup();
2694        let bios_len = 1 * MB;
2695        let regions = arch_memory_regions(&arch_memory_layout, 512 * MB, Some(bios_len));
2696        assert_eq!(
2697            regions,
2698            [
2699                (
2700                    GuestAddress(0),
2701                    640 * KB,
2702                    MemoryRegionOptions {
2703                        align: 0,
2704                        purpose: MemoryRegionPurpose::GuestMemoryRegion,
2705                        file_backed: None,
2706                    },
2707                ),
2708                (
2709                    GuestAddress(640 * KB),
2710                    384 * KB,
2711                    MemoryRegionOptions {
2712                        align: 0,
2713                        purpose: MemoryRegionPurpose::ReservedMemory,
2714                        file_backed: None,
2715                    },
2716                ),
2717                (
2718                    GuestAddress(1 * MB),
2719                    512 * MB - 1 * MB,
2720                    MemoryRegionOptions {
2721                        align: 0,
2722                        purpose: MemoryRegionPurpose::GuestMemoryRegion,
2723                        file_backed: None,
2724                    },
2725                ),
2726                (
2727                    GuestAddress(4 * GB - bios_len),
2728                    bios_len,
2729                    MemoryRegionOptions {
2730                        align: 0,
2731                        purpose: MemoryRegionPurpose::Bios,
2732                        file_backed: None,
2733                    },
2734                ),
2735            ]
2736        );
2737    }
2738
2739    #[test]
2740    fn regions_gt_4gb_bios() {
2741        let arch_memory_layout = setup();
2742        let bios_len = 1 * MB;
2743        let regions = arch_memory_regions(&arch_memory_layout, 4 * GB + 0x8000, Some(bios_len));
2744        assert_eq!(
2745            regions,
2746            [
2747                (
2748                    GuestAddress(0),
2749                    640 * KB,
2750                    MemoryRegionOptions {
2751                        align: 0,
2752                        purpose: MemoryRegionPurpose::GuestMemoryRegion,
2753                        file_backed: None,
2754                    },
2755                ),
2756                (
2757                    GuestAddress(640 * KB),
2758                    384 * KB,
2759                    MemoryRegionOptions {
2760                        align: 0,
2761                        purpose: MemoryRegionPurpose::ReservedMemory,
2762                        file_backed: None,
2763                    },
2764                ),
2765                (
2766                    GuestAddress(1 * MB),
2767                    2 * GB - 1 * MB,
2768                    MemoryRegionOptions {
2769                        align: 0,
2770                        purpose: MemoryRegionPurpose::GuestMemoryRegion,
2771                        file_backed: None,
2772                    },
2773                ),
2774                (
2775                    GuestAddress(4 * GB - bios_len),
2776                    bios_len,
2777                    MemoryRegionOptions {
2778                        align: 0,
2779                        purpose: MemoryRegionPurpose::Bios,
2780                        file_backed: None,
2781                    },
2782                ),
2783                (
2784                    GuestAddress(4 * GB),
2785                    2 * GB + 0x8000,
2786                    MemoryRegionOptions {
2787                        align: 0,
2788                        purpose: MemoryRegionPurpose::GuestMemoryRegion,
2789                        file_backed: None,
2790                    },
2791                ),
2792            ]
2793        );
2794    }
2795
2796    #[test]
2797    fn regions_eq_4gb_nobios() {
2798        let arch_memory_layout = setup();
2799        // Test with exact size of 4GB - the overhead.
2800        let regions = arch_memory_regions(&arch_memory_layout, 2 * GB, /* bios_size */ None);
2801        assert_eq!(
2802            regions,
2803            [
2804                (
2805                    GuestAddress(0),
2806                    640 * KB,
2807                    MemoryRegionOptions {
2808                        align: 0,
2809                        purpose: MemoryRegionPurpose::GuestMemoryRegion,
2810                        file_backed: None,
2811                    },
2812                ),
2813                (
2814                    GuestAddress(640 * KB),
2815                    384 * KB,
2816                    MemoryRegionOptions {
2817                        align: 0,
2818                        purpose: MemoryRegionPurpose::ReservedMemory,
2819                        file_backed: None,
2820                    },
2821                ),
2822                (
2823                    GuestAddress(1 * MB),
2824                    2 * GB - 1 * MB,
2825                    MemoryRegionOptions {
2826                        align: 0,
2827                        purpose: MemoryRegionPurpose::GuestMemoryRegion,
2828                        file_backed: None,
2829                    },
2830                )
2831            ]
2832        );
2833    }
2834
2835    #[test]
2836    fn regions_eq_4gb_bios() {
2837        let arch_memory_layout = setup();
2838        // Test with exact size of 4GB - the overhead.
2839        let bios_len = 1 * MB;
2840        let regions = arch_memory_regions(&arch_memory_layout, 2 * GB, Some(bios_len));
2841        assert_eq!(
2842            regions,
2843            [
2844                (
2845                    GuestAddress(0),
2846                    640 * KB,
2847                    MemoryRegionOptions {
2848                        align: 0,
2849                        purpose: MemoryRegionPurpose::GuestMemoryRegion,
2850                        file_backed: None,
2851                    },
2852                ),
2853                (
2854                    GuestAddress(640 * KB),
2855                    384 * KB,
2856                    MemoryRegionOptions {
2857                        align: 0,
2858                        purpose: MemoryRegionPurpose::ReservedMemory,
2859                        file_backed: None,
2860                    },
2861                ),
2862                (
2863                    GuestAddress(1 * MB),
2864                    2 * GB - 1 * MB,
2865                    MemoryRegionOptions {
2866                        align: 0,
2867                        purpose: MemoryRegionPurpose::GuestMemoryRegion,
2868                        file_backed: None,
2869                    },
2870                ),
2871                (
2872                    GuestAddress(4 * GB - bios_len),
2873                    bios_len,
2874                    MemoryRegionOptions {
2875                        align: 0,
2876                        purpose: MemoryRegionPurpose::Bios,
2877                        file_backed: None,
2878                    },
2879                ),
2880            ]
2881        );
2882    }
2883
2884    #[test]
2885    fn check_pci_mmio_layout() {
2886        let arch_memory_layout = setup();
2887
2888        assert_eq!(arch_memory_layout.pci_mmio_before_32bit.start, 2 * GB);
2889        assert_eq!(arch_memory_layout.pcie_cfg_mmio.start, 3 * GB);
2890        assert_eq!(arch_memory_layout.pcie_cfg_mmio.len().unwrap(), 256 * MB);
2891    }
2892
2893    #[test]
2894    fn check_32bit_gap_size_alignment() {
2895        let arch_memory_layout = setup();
2896        // pci_mmio_before_32bit is 256 MB aligned to be friendly for MTRR mappings.
2897        assert_eq!(
2898            arch_memory_layout.pci_mmio_before_32bit.start % (256 * MB),
2899            0
2900        );
2901    }
2902
2903    #[test]
2904    fn write_setup_data_empty() {
2905        let mem = GuestMemory::new(&[(GuestAddress(0), 0x2_0000)]).unwrap();
2906        let setup_data = [];
2907        let setup_data_addr = write_setup_data(
2908            &mem,
2909            GuestAddress(0x1000),
2910            GuestAddress(0x2000),
2911            &setup_data,
2912        )
2913        .expect("write_setup_data");
2914        assert_eq!(setup_data_addr, None);
2915    }
2916
2917    #[test]
2918    fn write_setup_data_two_of_them() {
2919        let mem = GuestMemory::new(&[(GuestAddress(0), 0x2_0000)]).unwrap();
2920
2921        let entry1_addr = GuestAddress(0x1000);
2922        let entry1_next_addr = entry1_addr;
2923        let entry1_len_addr = entry1_addr.checked_add(12).unwrap();
2924        let entry1_data_addr = entry1_addr.checked_add(16).unwrap();
2925        let entry1_data = [0x55u8; 13];
2926        let entry1_size = (size_of::<setup_data_hdr>() + entry1_data.len()) as u64;
2927        let entry1_align = 3;
2928
2929        let entry2_addr = GuestAddress(entry1_addr.offset() + entry1_size + entry1_align);
2930        let entry2_next_addr = entry2_addr;
2931        let entry2_len_addr = entry2_addr.checked_add(12).unwrap();
2932        let entry2_data_addr = entry2_addr.checked_add(16).unwrap();
2933        let entry2_data = [0xAAu8; 9];
2934
2935        let setup_data = [
2936            SetupData {
2937                data: entry1_data.to_vec(),
2938                type_: SetupDataType::Dtb,
2939            },
2940            SetupData {
2941                data: entry2_data.to_vec(),
2942                type_: SetupDataType::Dtb,
2943            },
2944        ];
2945
2946        let setup_data_head_addr = write_setup_data(
2947            &mem,
2948            GuestAddress(0x1000),
2949            GuestAddress(0x2000),
2950            &setup_data,
2951        )
2952        .expect("write_setup_data");
2953        assert_eq!(setup_data_head_addr, Some(entry1_addr));
2954
2955        assert_eq!(
2956            mem.read_obj_from_addr::<u64>(entry1_next_addr).unwrap(),
2957            entry2_addr.offset()
2958        );
2959        assert_eq!(
2960            mem.read_obj_from_addr::<u32>(entry1_len_addr).unwrap(),
2961            entry1_data.len() as u32
2962        );
2963        assert_eq!(
2964            mem.read_obj_from_addr::<[u8; 13]>(entry1_data_addr)
2965                .unwrap(),
2966            entry1_data
2967        );
2968
2969        assert_eq!(mem.read_obj_from_addr::<u64>(entry2_next_addr).unwrap(), 0);
2970        assert_eq!(
2971            mem.read_obj_from_addr::<u32>(entry2_len_addr).unwrap(),
2972            entry2_data.len() as u32
2973        );
2974        assert_eq!(
2975            mem.read_obj_from_addr::<[u8; 9]>(entry2_data_addr).unwrap(),
2976            entry2_data
2977        );
2978    }
2979
2980    #[test]
2981    fn cmdline_overflow() {
2982        const MEM_SIZE: u64 = 0x1000;
2983        let gm = GuestMemory::new(&[(GuestAddress(0x0), MEM_SIZE)]).unwrap();
2984        let mut cmdline = kernel_cmdline::Cmdline::new();
2985        cmdline.insert_str("12345").unwrap();
2986        let cmdline_address = GuestAddress(MEM_SIZE - 5);
2987        let err =
2988            X8664arch::load_cmdline(&gm, cmdline_address, cmdline, CMDLINE_MAX_SIZE as usize - 1)
2989                .unwrap_err();
2990        assert!(matches!(err, Error::CommandLineOverflow));
2991    }
2992
2993    #[test]
2994    fn cmdline_write_end() {
2995        const MEM_SIZE: u64 = 0x1000;
2996        let gm = GuestMemory::new(&[(GuestAddress(0x0), MEM_SIZE)]).unwrap();
2997        let mut cmdline = kernel_cmdline::Cmdline::new();
2998        cmdline.insert_str("1234").unwrap();
2999        let mut cmdline_address = GuestAddress(45);
3000        X8664arch::load_cmdline(&gm, cmdline_address, cmdline, CMDLINE_MAX_SIZE as usize - 1)
3001            .unwrap();
3002        let val: u8 = gm.read_obj_from_addr(cmdline_address).unwrap();
3003        assert_eq!(val, b'1');
3004        cmdline_address = cmdline_address.unchecked_add(1);
3005        let val: u8 = gm.read_obj_from_addr(cmdline_address).unwrap();
3006        assert_eq!(val, b'2');
3007        cmdline_address = cmdline_address.unchecked_add(1);
3008        let val: u8 = gm.read_obj_from_addr(cmdline_address).unwrap();
3009        assert_eq!(val, b'3');
3010        cmdline_address = cmdline_address.unchecked_add(1);
3011        let val: u8 = gm.read_obj_from_addr(cmdline_address).unwrap();
3012        assert_eq!(val, b'4');
3013        cmdline_address = cmdline_address.unchecked_add(1);
3014        let val: u8 = gm.read_obj_from_addr(cmdline_address).unwrap();
3015        assert_eq!(val, b'\0');
3016    }
3017}