// crosvm/crosvm/sys/linux.rs

1// Copyright 2022 The ChromiumOS Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#[cfg(target_os = "android")]
6mod android;
7pub mod cmdline;
8pub mod config;
9mod device_helpers;
10pub(crate) mod ext2;
11#[cfg(feature = "gpu")]
12pub(crate) mod gpu;
13#[cfg(feature = "pci-hotplug")]
14pub(crate) mod jail_warden;
15#[cfg(feature = "pci-hotplug")]
16pub(crate) mod pci_hotplug_helpers;
17#[cfg(feature = "pci-hotplug")]
18pub(crate) mod pci_hotplug_manager;
19mod vcpu;
20
21#[cfg(all(feature = "pvclock", target_arch = "aarch64"))]
22use std::arch::asm;
23use std::cmp::max;
24use std::collections::BTreeMap;
25use std::collections::BTreeSet;
26#[cfg(feature = "registered_events")]
27use std::collections::HashMap;
28#[cfg(feature = "registered_events")]
29use std::collections::HashSet;
30use std::convert::TryInto;
31use std::ffi::CString;
32#[cfg(target_arch = "aarch64")]
33use std::fs::create_dir_all;
34use std::fs::File;
35use std::fs::OpenOptions;
36#[cfg(feature = "registered_events")]
37use std::hash::Hash;
38use std::io::stdin;
39use std::iter;
40use std::mem;
41#[cfg(target_arch = "x86_64")]
42use std::ops::RangeInclusive;
43use std::os::unix::process::ExitStatusExt;
44use std::path::Path;
45#[cfg(target_arch = "aarch64")]
46use std::path::PathBuf;
47#[cfg(target_arch = "aarch64")]
48use std::process;
49#[cfg(feature = "registered_events")]
50use std::rc::Rc;
51use std::sync::mpsc;
52use std::sync::Arc;
53use std::sync::Barrier;
54use std::thread::JoinHandle;
55
56#[cfg(target_arch = "aarch64")]
57use aarch64::AArch64 as Arch;
58use acpi_tables::sdt::SDT;
59use anyhow::anyhow;
60use anyhow::bail;
61use anyhow::Context;
62use anyhow::Result;
63use arch::DtbOverlay;
64use arch::IrqChipArch;
65use arch::LinuxArch;
66use arch::RunnableLinuxVm;
67use arch::VcpuAffinity;
68use arch::VcpuArch;
69use arch::VirtioDeviceStub;
70use arch::VmArch;
71use arch::VmComponents;
72use arch::VmImage;
73use arch::DEFAULT_CPU_CAPACITY;
74use argh::FromArgs;
75use base::ReadNotifier;
76#[cfg(feature = "balloon")]
77use base::UnixSeqpacket;
78use base::UnixSeqpacketListener;
79use base::UnlinkUnixSeqpacketListener;
80use base::*;
81use cros_async::Executor;
82use device_helpers::*;
83use devices::create_devices_worker_thread;
84use devices::serial_device::SerialHardware;
85#[cfg(all(feature = "pvclock", target_arch = "x86_64"))]
86use devices::tsc::get_tsc_sync_mitigations;
87use devices::vfio::VfioContainerManager;
88#[cfg(feature = "gpu")]
89use devices::virtio;
90#[cfg(any(feature = "video-decoder", feature = "video-encoder"))]
91use devices::virtio::device_constants::video::VideoDeviceType;
92#[cfg(feature = "gpu")]
93use devices::virtio::gpu::EventDevice;
94#[cfg(target_arch = "x86_64")]
95use devices::virtio::memory_mapper::MemoryMapper;
96use devices::virtio::memory_mapper::MemoryMapperTrait;
97use devices::virtio::vhost_user_backend::VhostUserConnectionTrait;
98use devices::virtio::vhost_user_backend::VhostUserListener;
99#[cfg(feature = "balloon")]
100use devices::virtio::BalloonFeatures;
101#[cfg(feature = "pci-hotplug")]
102use devices::virtio::NetParameters;
103#[cfg(feature = "pci-hotplug")]
104use devices::virtio::NetParametersMode;
105use devices::virtio::VirtioDevice;
106use devices::virtio::VirtioDeviceType;
107use devices::Bus;
108use devices::BusDeviceObj;
109use devices::BusType;
110use devices::CoIommuDev;
111#[cfg(feature = "usb")]
112use devices::DeviceProvider;
113#[cfg(target_arch = "x86_64")]
114use devices::HotPlugBus;
115#[cfg(target_arch = "x86_64")]
116use devices::HotPlugKey;
117use devices::IommuDevType;
118use devices::IrqEventIndex;
119use devices::IrqEventSource;
120#[cfg(feature = "pci-hotplug")]
121use devices::NetResourceCarrier;
122#[cfg(target_arch = "x86_64")]
123use devices::PciAddress;
124#[cfg(target_arch = "x86_64")]
125use devices::PciBridge;
126use devices::PciDevice;
127#[cfg(target_arch = "x86_64")]
128use devices::PciMmioMapper;
129#[cfg(target_arch = "x86_64")]
130use devices::PciRoot;
131#[cfg(target_arch = "x86_64")]
132use devices::PciRootCommand;
133#[cfg(target_arch = "x86_64")]
134use devices::PcieDownstreamPort;
135#[cfg(target_arch = "x86_64")]
136use devices::PcieHostPort;
137#[cfg(target_arch = "x86_64")]
138use devices::PcieRootPort;
139#[cfg(target_arch = "x86_64")]
140use devices::PcieUpstreamPort;
141use devices::PvPanicCode;
142use devices::PvPanicPciDevice;
143#[cfg(feature = "pci-hotplug")]
144use devices::ResourceCarrier;
145use devices::StubPciDevice;
146use devices::VirtioPciDevice;
147#[cfg(feature = "usb")]
148use devices::XhciController;
149#[cfg(feature = "gpu")]
150use gpu::*;
151#[cfg(target_arch = "riscv64")]
152use hypervisor::CpuConfigRiscv64;
153#[cfg(target_arch = "x86_64")]
154use hypervisor::CpuConfigX86_64;
155use hypervisor::Hypervisor;
156use hypervisor::HypervisorCap;
157use hypervisor::MemCacheType;
158use hypervisor::ProtectionType;
159use hypervisor::Vm;
160use hypervisor::VmCap;
161use jail::*;
162#[cfg(feature = "pci-hotplug")]
163use jail_warden::JailWarden;
164#[cfg(feature = "pci-hotplug")]
165use jail_warden::JailWardenImpl;
166#[cfg(feature = "pci-hotplug")]
167use jail_warden::PermissiveJailWarden;
168use libc;
169use metrics::MetricsController;
170use minijail::Minijail;
171#[cfg(feature = "pci-hotplug")]
172use pci_hotplug_manager::PciHotPlugManager;
173use resources::AddressRange;
174use resources::Alloc;
175use resources::SystemAllocator;
176#[cfg(target_arch = "riscv64")]
177use riscv64::Riscv64 as Arch;
178use rutabaga_gfx::RutabagaGralloc;
179use rutabaga_gfx::RutabagaGrallocBackendFlags;
180use smallvec::SmallVec;
181#[cfg(feature = "swap")]
182use swap::SwapController;
183use sync::Condvar;
184use sync::Mutex;
185use vm_control::api::VmMemoryClient;
186use vm_control::*;
187use vm_memory::FileBackedMappingParameters;
188use vm_memory::GuestAddress;
189use vm_memory::GuestMemory;
190use vm_memory::MemoryPolicy;
191use vm_memory::MemoryRegionOptions;
192#[cfg(target_arch = "x86_64")]
193use x86_64::X8664arch as Arch;
194
195use crate::crosvm::config::Config;
196use crate::crosvm::config::Executable;
197use crate::crosvm::config::HypervisorKind;
198use crate::crosvm::config::InputDeviceOption;
199use crate::crosvm::config::IrqChipKind;
200use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_HEIGHT;
201use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_WIDTH;
202#[cfg(feature = "gdb")]
203use crate::crosvm::gdb::gdb_thread;
204#[cfg(feature = "gdb")]
205use crate::crosvm::gdb::GdbStub;
206#[cfg(target_arch = "x86_64")]
207use crate::crosvm::ratelimit::Ratelimit;
208use crate::crosvm::sys::cmdline::DevicesCommand;
209use crate::crosvm::sys::config::SharedDir;
210use crate::crosvm::sys::config::SharedDirKind;
211use crate::crosvm::sys::platform::vcpu::VcpuPidTid;
212
/// Path to the KVM hypervisor device node.
const KVM_PATH: &str = "/dev/kvm";
/// Path to the GenieZone hypervisor device node (aarch64 only).
#[cfg(all(target_arch = "aarch64", feature = "geniezone"))]
const GENIEZONE_PATH: &str = "/dev/gzvm";
/// Path to the Gunyah hypervisor device node (aarch64 only).
/// `const` rather than `static` for consistency with the other path constants;
/// a promoted string literal needs no fixed address.
#[cfg(all(target_arch = "aarch64", feature = "gunyah"))]
const GUNYAH_PATH: &str = "/dev/gunyah";
/// Path to the Halla hypervisor device node (aarch64 only).
#[cfg(target_arch = "aarch64")]
#[cfg(feature = "halla")]
const HALLA_PATH: &str = "/dev/halla";
221
/// Builds the set of virtio devices (each a `VirtioDeviceStub`: boxed device
/// plus optional minijail) requested by `cfg`.
///
/// Host-side control tubes created for individual devices are handed back to
/// the caller via `add_control_tube`. PIDs of worker processes spawned here
/// (currently by the pmem-ext2 device) are recorded in `worker_process_pids`.
///
/// Ordering note: several sections below push tubes into `resource_bridges`
/// (wayland, media adapter, video decoder/encoder) before the GPU section
/// consumes the whole vector in `create_gpu_device`, so the GPU block must
/// stay after those sections.
///
/// Returns an error if any device fails to be constructed.
fn create_virtio_devices(
    cfg: &Config,
    vm: &mut impl VmArch,
    resources: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    #[cfg_attr(not(feature = "gpu"), allow(unused_variables))] vm_evt_wrtube: &SendTube,
    #[cfg(feature = "balloon")] balloon_inflate_tube: Option<Tube>,
    worker_process_pids: &mut BTreeSet<Pid>,
    #[cfg(feature = "gpu")] render_server_fd: Option<SafeDescriptor>,
    #[cfg(feature = "gpu")] has_vfio_gfx_device: bool,
    #[cfg(feature = "registered_events")] registered_evt_q: &SendTube,
) -> DeviceResult<Vec<VirtioDeviceStub>> {
    let mut devs = Vec::new();

    // Tubes over which other devices share resources with the GPU device;
    // populated below and moved into `create_gpu_device`.
    #[cfg(any(feature = "gpu", feature = "video-decoder", feature = "video-encoder"))]
    let mut resource_bridges = Vec::<Tube>::new();

    if !cfg.wayland_socket_paths.is_empty() {
        #[cfg_attr(not(feature = "gpu"), allow(unused_mut))]
        let mut wl_resource_bridge = None::<Tube>;

        // If a GPU is configured, give the wayland device one end of a
        // resource bridge so it can share buffers with the GPU device.
        #[cfg(feature = "gpu")]
        {
            if cfg.gpu_parameters.is_some() {
                let (wl_socket, gpu_socket) = Tube::pair().context("failed to create tube")?;
                resource_bridges.push(gpu_socket);
                wl_resource_bridge = Some(wl_socket);
            }
        }

        devs.push(create_wayland_device(
            cfg.protection_type,
            cfg.jail_config.as_ref(),
            &cfg.wayland_socket_paths,
            wl_resource_bridge,
        )?);
    }

    // For each media decoder, pre-create the (device tube, backend) pair; the
    // GPU-side tube joins `resource_bridges`, the device side is consumed by
    // the media adapter section further down.
    #[cfg(all(feature = "media", feature = "video-decoder"))]
    let media_adapter_cfg = cfg
        .media_decoder
        .iter()
        .map(|config| {
            let (video_tube, gpu_tube) =
                Tube::pair().expect("failed to create tube for media adapter");
            resource_bridges.push(gpu_tube);
            (video_tube, config.backend)
        })
        .collect::<Vec<_>>();

    // Same pattern for video decoder devices.
    #[cfg(feature = "video-decoder")]
    let video_dec_cfg = cfg
        .video_dec
        .iter()
        .map(|config| {
            let (video_tube, gpu_tube) =
                Tube::pair().expect("failed to create tube for video decoder");
            resource_bridges.push(gpu_tube);
            (video_tube, config.backend)
        })
        .collect::<Vec<_>>();

    // Same pattern for video encoder devices.
    #[cfg(feature = "video-encoder")]
    let video_enc_cfg = cfg
        .video_enc
        .iter()
        .map(|config| {
            let (video_tube, gpu_tube) =
                Tube::pair().expect("failed to create tube for video encoder");
            resource_bridges.push(gpu_tube);
            (video_tube, config.backend)
        })
        .collect::<Vec<_>>();

    #[cfg(feature = "gpu")]
    {
        if let Some(gpu_parameters) = &cfg.gpu_parameters {
            let mut event_devices = Vec::new();
            if cfg.display_window_mouse {
                // Size the synthetic multi-touch device to the first display's
                // virtual size, unless an explicit multi-touch option
                // overrides width/height/name below.
                let display_param = if gpu_parameters.display_params.is_empty() {
                    Default::default()
                } else {
                    gpu_parameters.display_params[0].clone()
                };
                let (gpu_display_w, gpu_display_h) = display_param.get_virtual_display_size();

                let (event_device_socket, virtio_dev_socket) =
                    StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte)
                        .context("failed to create socket")?;
                let mut multi_touch_width = gpu_display_w;
                let mut multi_touch_height = gpu_display_h;
                let mut multi_touch_name = None;
                // Only the first MultiTouch entry is consulted (note the
                // `break`); its explicit fields override the display size.
                for input in &cfg.virtio_input {
                    if let InputDeviceOption::MultiTouch {
                        width,
                        height,
                        name,
                        ..
                    } = input
                    {
                        if let Some(width) = width {
                            multi_touch_width = *width;
                        }
                        if let Some(height) = height {
                            multi_touch_height = *height;
                        }
                        if let Some(name) = name {
                            multi_touch_name = Some(name.as_str());
                        }
                        break;
                    }
                }
                let dev = virtio::input::new_multi_touch(
                    // u32::MAX is the least likely to collide with the indices generated above for
                    // the multi_touch options, which begin at 0.
                    u32::MAX,
                    virtio_dev_socket,
                    multi_touch_width,
                    multi_touch_height,
                    multi_touch_name,
                    virtio::base_features(cfg.protection_type),
                )
                .context("failed to set up mouse device")?;
                devs.push(VirtioDeviceStub {
                    dev: Box::new(dev),
                    jail: simple_jail(cfg.jail_config.as_ref(), "input_device")?,
                });
                event_devices.push(EventDevice::touchscreen(event_device_socket));
            }
            if cfg.display_window_keyboard {
                let (event_device_socket, virtio_dev_socket) =
                    StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte)
                        .context("failed to create socket")?;
                let dev = virtio::input::new_keyboard(
                    // u32::MAX is the least likely to collide with the indices generated above for
                    // the multi_touch options, which begin at 0.
                    u32::MAX,
                    virtio_dev_socket,
                    virtio::base_features(cfg.protection_type),
                )
                .context("failed to set up keyboard device")?;
                devs.push(VirtioDeviceStub {
                    dev: Box::new(dev),
                    jail: simple_jail(cfg.jail_config.as_ref(), "input_device")?,
                });
                event_devices.push(EventDevice::keyboard(event_device_socket));
            }

            // The GPU device takes ownership of every resource bridge created
            // above plus the display-window event devices.
            let (gpu_control_host_tube, gpu_control_device_tube) =
                Tube::pair().context("failed to create gpu tube")?;
            add_control_tube(DeviceControlTube::Gpu(gpu_control_host_tube).into());
            devs.push(create_gpu_device(
                cfg,
                vm_evt_wrtube,
                gpu_control_device_tube,
                resource_bridges,
                render_server_fd,
                has_vfio_gfx_device,
                event_devices,
            )?);
        }
    }

    // Virtio-console devices for serial parameters routed to VirtioConsole.
    for (_, param) in cfg
        .serial_parameters
        .iter()
        .filter(|(_k, v)| v.hardware == SerialHardware::VirtioConsole)
    {
        let dev =
            param.create_virtio_device_and_jail(cfg.protection_type, cfg.jail_config.as_ref())?;
        devs.push(dev);
    }

    // Block devices; each gets a control tube for runtime disk commands.
    for disk in &cfg.disks {
        let (disk_host_tube, disk_device_tube) = Tube::pair().context("failed to create tube")?;
        add_control_tube(DeviceControlTube::Disk(disk_host_tube).into());
        let disk_config = DiskConfig::new(disk, Some(disk_device_tube));
        devs.push(
            disk_config
                .create_virtio_device_and_jail(cfg.protection_type, cfg.jail_config.as_ref())?,
        );
    }

    // All SCSI disks are served by a single virtio-scsi controller device.
    if !cfg.scsis.is_empty() {
        let scsi_config = ScsiConfig(&cfg.scsis);
        devs.push(
            scsi_config
                .create_virtio_device_and_jail(cfg.protection_type, cfg.jail_config.as_ref())?,
        );
    }

    // Virtio-pmem devices; the VmMsync tube lets the device request msync on
    // its backing mapping.
    for (index, pmem_disk) in cfg.pmems.iter().enumerate() {
        let (pmem_host_tube, pmem_device_tube) = Tube::pair().context("failed to create tube")?;
        add_control_tube(TaggedControTube::VmMsync(pmem_host_tube).into());
        devs.push(create_pmem_device(
            cfg.protection_type,
            cfg.jail_config.as_ref(),
            vm,
            resources,
            pmem_disk,
            index,
            pmem_device_tube,
        )?);
    }

    for (index, pmem_ext2) in cfg.pmem_ext2.iter().enumerate() {
        // Prepare a `VmMemoryClient` for pmem-ext2 device to send a request for mmap() and memory
        // registration.
        let (pmem_ext2_host_tube, pmem_ext2_device_tube) =
            Tube::pair().context("failed to create tube")?;
        let vm_memory_client = VmMemoryClient::new(pmem_ext2_device_tube);
        add_control_tube(
            VmMemoryTube {
                tube: pmem_ext2_host_tube,
                expose_with_viommu: false,
            }
            .into(),
        );
        let (pmem_host_tube, pmem_device_tube) = Tube::pair().context("failed to create tube")?;
        add_control_tube(TaggedControlTube::VmMsync(pmem_host_tube).into());
        devs.push(create_pmem_ext2_device(
            cfg.protection_type,
            cfg.jail_config.as_ref(),
            resources,
            pmem_ext2,
            index,
            vm_memory_client,
            pmem_device_tube,
            worker_process_pids,
        )?);
    }

    if cfg.rng {
        devs.push(create_virtio_rng_device(
            cfg.protection_type,
            cfg.jail_config.as_ref(),
        )?);
    }

    #[cfg(feature = "pvclock")]
    if cfg.pvclock {
        // pvclock gets a tube for handling suspend/resume requests from the main thread.
        let (host_suspend_tube, suspend_tube) = Tube::pair().context("failed to create tube")?;
        add_control_tube(DeviceControlTube::PvClock(host_suspend_tube).into());

        // Guest clock frequency: the host TSC frequency on x86_64, the
        // generic-timer frequency (CNTFRQ_EL0) on aarch64.
        let frequency: u64;
        #[cfg(target_arch = "x86_64")]
        {
            let tsc_state = devices::tsc::tsc_state()?;
            let tsc_sync_mitigations =
                get_tsc_sync_mitigations(&tsc_state, cfg.vcpu_count.unwrap_or(1));
            if tsc_state.core_grouping.size() > 1 {
                // Host TSCs are not in sync. Log what mitigations are applied.
                warn!(
                    "Host TSCs are not in sync, applying the following mitigations: {:?}",
                    tsc_sync_mitigations
                );
            }
            frequency = tsc_state.frequency;
        }
        #[cfg(target_arch = "aarch64")]
        {
            let mut x: u64;
            // SAFETY: This instruction has no side effect apart from storing the current timestamp
            //         frequency into the specified register.
            unsafe {
                asm!("mrs {x}, cntfrq_el0",
                    x = out(reg) x,
                );
            }
            frequency = x;

            // If unset, KVM defaults to an offset that is calculated from VM boot time. Explicitly
            // set it to zero on boot. When updating the offset, we always set it to the total
            // amount of time the VM has been suspended.
            vm.set_counter_offset(0)
                .context("failed to set up pvclock")?;
        }
        let dev = create_pvclock_device(
            cfg.protection_type,
            cfg.jail_config.as_ref(),
            frequency,
            suspend_tube,
        )?;
        devs.push(dev);
        info!("virtio-pvclock is enabled for this vm");
    }

    #[cfg(feature = "vtpm")]
    {
        if cfg.vtpm_proxy {
            devs.push(create_vtpm_proxy_device(
                cfg.protection_type,
                cfg.jail_config.as_ref(),
            )?);
        }
    }

    // Per-kind running indices so each input device of the same kind gets a
    // distinct, zero-based index.
    let mut keyboard_idx = 0;
    let mut mouse_idx = 0;
    let mut rotary_idx = 0;
    let mut switches_idx = 0;
    let mut multi_touch_idx = 0;
    let mut single_touch_idx = 0;
    let mut trackpad_idx = 0;
    let mut multi_touch_trackpad_idx = 0;
    let mut custom_idx = 0;
    for input in &cfg.virtio_input {
        let input_dev = match input {
            InputDeviceOption::Evdev { path } => create_vinput_device(
                cfg.protection_type,
                cfg.jail_config.as_ref(),
                path.as_path(),
            )?,
            InputDeviceOption::Keyboard { path } => {
                let dev = create_keyboard_device(
                    cfg.protection_type,
                    cfg.jail_config.as_ref(),
                    path.as_path(),
                    keyboard_idx,
                )?;
                keyboard_idx += 1;
                dev
            }
            InputDeviceOption::Mouse { path } => {
                let dev = create_mouse_device(
                    cfg.protection_type,
                    cfg.jail_config.as_ref(),
                    path.as_path(),
                    mouse_idx,
                )?;
                mouse_idx += 1;
                dev
            }
            InputDeviceOption::MultiTouch {
                path,
                width,
                height,
                name,
            } => {
                let mut width = *width;
                let mut height = *height;
                // Only the first multi-touch device inherits the display input
                // size when its own width/height are unspecified.
                if multi_touch_idx == 0 {
                    if width.is_none() {
                        width = cfg.display_input_width;
                    }
                    if height.is_none() {
                        height = cfg.display_input_height;
                    }
                }
                let dev = create_multi_touch_device(
                    cfg.protection_type,
                    cfg.jail_config.as_ref(),
                    path.as_path(),
                    width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
                    height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
                    name.as_deref(),
                    multi_touch_idx,
                )?;
                multi_touch_idx += 1;
                dev
            }
            InputDeviceOption::Rotary { path } => {
                let dev = create_rotary_device(
                    cfg.protection_type,
                    cfg.jail_config.as_ref(),
                    path.as_path(),
                    rotary_idx,
                )?;
                rotary_idx += 1;
                dev
            }
            InputDeviceOption::SingleTouch {
                path,
                width,
                height,
                name,
            } => {
                let mut width = *width;
                let mut height = *height;
                // Same first-device fallback as multi-touch above.
                if single_touch_idx == 0 {
                    if width.is_none() {
                        width = cfg.display_input_width;
                    }
                    if height.is_none() {
                        height = cfg.display_input_height;
                    }
                }
                let dev = create_single_touch_device(
                    cfg.protection_type,
                    cfg.jail_config.as_ref(),
                    path.as_path(),
                    width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
                    height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
                    name.as_deref(),
                    single_touch_idx,
                )?;
                single_touch_idx += 1;
                dev
            }
            InputDeviceOption::Switches { path } => {
                let dev = create_switches_device(
                    cfg.protection_type,
                    cfg.jail_config.as_ref(),
                    path.as_path(),
                    switches_idx,
                )?;
                switches_idx += 1;
                dev
            }
            InputDeviceOption::Trackpad {
                path,
                width,
                height,
                name,
            } => {
                let dev = create_trackpad_device(
                    cfg.protection_type,
                    cfg.jail_config.as_ref(),
                    path.as_path(),
                    width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
                    height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
                    name.as_deref(),
                    trackpad_idx,
                )?;
                trackpad_idx += 1;
                dev
            }
            InputDeviceOption::MultiTouchTrackpad {
                path,
                width,
                height,
                name,
            } => {
                let dev = create_multitouch_trackpad_device(
                    cfg.protection_type,
                    cfg.jail_config.as_ref(),
                    path.as_path(),
                    width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
                    height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
                    name.as_deref(),
                    multi_touch_trackpad_idx,
                )?;
                multi_touch_trackpad_idx += 1;
                dev
            }
            InputDeviceOption::Custom { path, config_path } => {
                let dev = create_custom_device(
                    cfg.protection_type,
                    cfg.jail_config.as_ref(),
                    path.as_path(),
                    custom_idx,
                    config_path.clone(),
                )?;
                custom_idx += 1;
                dev
            }
        };
        devs.push(input_dev);
    }

    #[cfg(feature = "balloon")]
    if cfg.balloon {
        // Balloon control either comes from an external socket (given on the
        // command line) or from a tube pair owned by the main process.
        let balloon_device_tube = if let Some(ref path) = cfg.balloon_control {
            Tube::try_from(UnixSeqpacket::connect(path).with_context(|| {
                format!(
                    "failed to connect to balloon control socket {}",
                    path.display(),
                )
            })?)?
        } else {
            // Balloon gets a special socket so balloon requests can be forwarded
            // from the main process.
            let (host, device) = Tube::pair().context("failed to create tube")?;
            add_control_tube(DeviceControlTube::Balloon(host).into());
            device
        };

        // Pack the optional feature bits into a virtio feature bitmap.
        let balloon_features = (cfg.balloon_page_reporting as u64)
            << BalloonFeatures::PageReporting as u64
            | (cfg.balloon_ws_reporting as u64) << BalloonFeatures::WSReporting as u64;

        let init_balloon_size = if let Some(init_memory) = cfg.init_memory {
            let init_memory_bytes = init_memory.saturating_mul(1024 * 1024);
            let total_memory_bytes = vm.get_memory().memory_size();

            if init_memory_bytes > total_memory_bytes {
                bail!(
                    "initial memory {} cannot be greater than total memory {}",
                    init_memory,
                    total_memory_bytes / (1024 * 1024),
                );
            }

            // The initial balloon size is the total memory size minus the initial memory size.
            total_memory_bytes - init_memory_bytes
        } else {
            // No --init-mem specified; start with balloon completely deflated.
            0
        };

        // The balloon device also needs a tube to communicate back to the main process to
        // handle remapping memory dynamically.
        let (dynamic_mapping_host_tube, dynamic_mapping_device_tube) =
            Tube::pair().context("failed to create tube")?;
        add_control_tube(
            VmMemoryTube {
                tube: dynamic_mapping_host_tube,
                expose_with_viommu: false,
            }
            .into(),
        );

        devs.push(create_balloon_device(
            cfg.protection_type,
            cfg.jail_config.as_ref(),
            balloon_device_tube,
            balloon_inflate_tube,
            init_balloon_size,
            VmMemoryClient::new(dynamic_mapping_device_tube),
            balloon_features,
            #[cfg(feature = "registered_events")]
            Some(
                registered_evt_q
                    .try_clone()
                    .context("failed to clone registered_evt_q tube")?,
            ),
            cfg.balloon_ws_num_bins,
        )?);
    }

    #[cfg(feature = "net")]
    for opt in &cfg.net {
        let dev =
            opt.create_virtio_device_and_jail(cfg.protection_type, cfg.jail_config.as_ref())?;
        devs.push(dev);
    }

    #[cfg(feature = "audio")]
    {
        // Each sound device is assigned its card index from its position in
        // the configuration list.
        for (card_index, virtio_snd) in cfg.virtio_snds.iter().enumerate() {
            let (snd_host_tube, snd_device_tube) =
                Tube::pair().context("failed to create tube for snd")?;
            add_control_tube(DeviceControlTube::Snd(snd_host_tube).into());
            let mut snd_params = virtio_snd.clone();
            snd_params.card_index = card_index;
            devs.push(create_virtio_snd_device(
                cfg.protection_type,
                cfg.jail_config.as_ref(),
                snd_params,
                snd_device_tube,
            )?);
        }
    }

    #[cfg(any(target_os = "android", target_os = "linux"))]
    #[cfg(feature = "media")]
    {
        for v4l2_device in &cfg.v4l2_proxy {
            devs.push(create_v4l2_device(cfg.protection_type, v4l2_device)?);
        }
    }

    #[cfg(feature = "media")]
    if cfg.simple_media_device {
        devs.push(create_simple_media_device(cfg.protection_type)?);
    }

    // Consume the (tube, backend) pairs prepared near the top of the function.
    #[cfg(all(feature = "media", feature = "video-decoder"))]
    {
        for (tube, backend) in media_adapter_cfg {
            devs.push(create_virtio_media_adapter(
                cfg.protection_type,
                cfg.jail_config.as_ref(),
                tube,
                backend,
            )?);
        }
    }

    #[cfg(feature = "video-decoder")]
    {
        for (tube, backend) in video_dec_cfg {
            register_video_device(
                backend,
                &mut devs,
                tube,
                cfg.protection_type,
                cfg.jail_config.as_ref(),
                VideoDeviceType::Decoder,
            )?;
        }
    }

    #[cfg(feature = "video-encoder")]
    {
        for (tube, backend) in video_enc_cfg {
            register_video_device(
                backend,
                &mut devs,
                tube,
                cfg.protection_type,
                cfg.jail_config.as_ref(),
                VideoDeviceType::Encoder,
            )?;
        }
    }

    if let Some(vsock_config) = &cfg.vsock {
        devs.push(
            vsock_config
                .create_virtio_device_and_jail(cfg.protection_type, cfg.jail_config.as_ref())?,
        );
    }

    #[cfg(target_arch = "aarch64")]
    {
        if cfg.vhost_scmi {
            devs.push(create_vhost_scmi_device(
                cfg.protection_type,
                cfg.jail_config.as_ref(),
                cfg.vhost_scmi_device.clone(),
            )?);
        }
    }

    // Shared directories become either virtio-fs or 9p devices.
    for shared_dir in &cfg.shared_dirs {
        let SharedDir {
            src,
            tag,
            kind,
            ugid,
            uid_map,
            gid_map,
            fs_cfg,
            p9_cfg,
        } = shared_dir;

        let dev = match kind {
            SharedDirKind::FS => {
                let (host_tube, device_tube) = Tube::pair().context("failed to create tube")?;
                add_control_tube(TaggedControlTube::Fs(host_tube).into());

                create_fs_device(
                    cfg.protection_type,
                    cfg.jail_config.as_ref(),
                    *ugid,
                    uid_map,
                    gid_map,
                    src,
                    tag,
                    fs_cfg.clone(),
                    device_tube,
                )?
            }
            SharedDirKind::P9 => create_9p_device(
                cfg.protection_type,
                cfg.jail_config.as_ref(),
                *ugid,
                uid_map,
                gid_map,
                src,
                tag,
                p9_cfg.clone(),
            )?,
        };
        devs.push(dev);
    }

    #[cfg(feature = "audio")]
    if let Some(path) = &cfg.sound {
        devs.push(create_sound_device(
            path,
            cfg.protection_type,
            cfg.jail_config.as_ref(),
        )?);
    }

    // Vhost-user frontend devices; each gets a clone of the VM event tube so
    // the backend can report exit events.
    for opt in &cfg.vhost_user {
        devs.push(create_vhost_user_frontend(
            cfg.protection_type,
            opt,
            cfg.vhost_user_connect_timeout_ms,
            vm_evt_wrtube.try_clone()?,
        )?);
    }

    Ok(devs)
}
911
/// Builds the complete list of bus devices (each paired with its optional minijail) for the VM:
/// VFIO passthrough devices, an optional CoIommu device, every virtio device wrapped in a
/// virtio-PCI transport, the xHCI USB controller, stub PCI devices, and a pvpanic device.
///
/// Control tubes created along the way are handed to `add_control_tube` for the main event
/// loop to service. `iova_max_addr` and `iommu_attached_endpoints` are updated as VFIO PCI
/// devices are created.
fn create_devices(
    cfg: &Config,
    vm: &mut impl VmArch,
    resources: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    vm_evt_wrtube: &SendTube,
    iommu_attached_endpoints: &mut BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>>,
    #[cfg(feature = "usb")] usb_provider: DeviceProvider,
    #[cfg(feature = "gpu")] render_server_fd: Option<SafeDescriptor>,
    iova_max_addr: &mut Option<u64>,
    #[cfg(feature = "registered_events")] registered_evt_q: &SendTube,
    vfio_container_manager: &mut VfioContainerManager,
    // Stores a set of PID of child processes that are supposed to exit cleanly.
    worker_process_pids: &mut BTreeSet<Pid>,
) -> DeviceResult<Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>> {
    let mut devices: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)> = Vec::new();
    #[cfg(feature = "balloon")]
    let mut balloon_inflate_tube: Option<Tube> = None;
    #[cfg(feature = "gpu")]
    let mut has_vfio_gfx_device = false;
    // VFIO passthrough devices are created before the virtio devices so that
    // `has_vfio_gfx_device` and `balloon_inflate_tube` can be passed into
    // `create_virtio_devices` below.
    if !cfg.vfio.is_empty() {
        let mut coiommu_attached_endpoints = Vec::new();

        for vfio_dev in &cfg.vfio {
            let (dev, jail, viommu_mapper) = create_vfio_device(
                cfg.jail_config.as_ref(),
                vm,
                resources,
                add_control_tube,
                &vfio_dev.path,
                false,
                None,
                vfio_dev.guest_address,
                Some(&mut coiommu_attached_endpoints),
                vfio_dev.iommu,
                vfio_dev.dt_symbol.clone(),
                vfio_container_manager,
            )?;
            match dev {
                VfioDeviceVariant::Pci(vfio_pci_device) => {
                    // Track the largest IOVA any VFIO PCI device may use.
                    *iova_max_addr = Some(max(
                        vfio_pci_device.get_max_iova(),
                        iova_max_addr.unwrap_or(0),
                    ));

                    #[cfg(feature = "gpu")]
                    if vfio_pci_device.is_gfx() {
                        has_vfio_gfx_device = true;
                    }

                    // Devices using the virtio-iommu are recorded by their PCI address
                    // so the iommu device can be wired up to them later.
                    if let Some(viommu_mapper) = viommu_mapper {
                        iommu_attached_endpoints.insert(
                            vfio_pci_device
                                .pci_address()
                                .context("not initialized")?
                                .to_u32(),
                            Arc::new(Mutex::new(Box::new(viommu_mapper))),
                        );
                    }

                    devices.push((Box::new(vfio_pci_device), jail));
                }
                VfioDeviceVariant::Platform(vfio_plat_dev) => {
                    devices.push((Box::new(vfio_plat_dev), jail));
                }
            }
        }

        // DMA mappings for passthrough devices pin guest memory, so raise the
        // RLIMIT_MEMLOCK soft limit by the guest memory size if it is too low.
        if !coiommu_attached_endpoints.is_empty() || !iommu_attached_endpoints.is_empty() {
            let mut buf = mem::MaybeUninit::<libc::rlimit64>::zeroed();
            // SAFETY: trivially safe
            let res = unsafe { libc::getrlimit64(libc::RLIMIT_MEMLOCK, buf.as_mut_ptr()) };
            if res == 0 {
                // SAFETY: safe because getrlimit64 has returned success.
                let limit = unsafe { buf.assume_init() };
                let rlim_new = limit.rlim_cur.saturating_add(vm.get_memory().memory_size());
                let rlim_max = max(limit.rlim_max, rlim_new);
                if limit.rlim_cur < rlim_new {
                    let limit_arg = libc::rlimit64 {
                        rlim_cur: rlim_new,
                        rlim_max,
                    };
                    // SAFETY: trivially safe
                    let res = unsafe { libc::setrlimit64(libc::RLIMIT_MEMLOCK, &limit_arg) };
                    if res != 0 {
                        bail!("Set rlimit failed");
                    }
                }
            } else {
                bail!("Get rlimit failed");
            }
        }
        // Without the balloon feature there is never a coiommu unpin tube.
        #[cfg(feature = "balloon")]
        let coiommu_tube: Option<Tube>;
        #[cfg(not(feature = "balloon"))]
        let coiommu_tube: Option<Tube> = None;
        if !coiommu_attached_endpoints.is_empty() {
            let vfio_container = vfio_container_manager
                .get_container(IommuDevType::CoIommu, None as Option<&Path>)
                .context("failed to get vfio container")?;
            let (coiommu_host_tube, coiommu_device_tube) =
                Tube::pair().context("failed to create coiommu tube")?;
            add_control_tube(
                VmMemoryTube {
                    tube: coiommu_host_tube,
                    expose_with_viommu: false,
                }
                .into(),
            );
            let vcpu_count = cfg.vcpu_count.unwrap_or(1) as u64;
            // One end goes to the coiommu device; the other is passed to the balloon
            // device (via create_virtio_devices) so it can coordinate page unpinning.
            #[cfg(feature = "balloon")]
            match Tube::pair() {
                Ok((x, y)) => {
                    coiommu_tube = Some(x);
                    balloon_inflate_tube = Some(y);
                }
                Err(x) => return Err(x).context("failed to create coiommu tube"),
            }
            let dev = CoIommuDev::new(
                vm.get_memory().clone(),
                vfio_container,
                VmMemoryClient::new(coiommu_device_tube),
                coiommu_tube,
                coiommu_attached_endpoints,
                vcpu_count,
                cfg.coiommu_param.unwrap_or_default(),
            )
            .context("failed to create coiommu device")?;

            devices.push((
                Box::new(dev),
                simple_jail(cfg.jail_config.as_ref(), "coiommu_device")?,
            ));
        }
    }

    let stubs = create_virtio_devices(
        cfg,
        vm,
        resources,
        add_control_tube,
        vm_evt_wrtube,
        #[cfg(feature = "balloon")]
        balloon_inflate_tube,
        worker_process_pids,
        #[cfg(feature = "gpu")]
        render_server_fd,
        #[cfg(feature = "gpu")]
        has_vfio_gfx_device,
        #[cfg(feature = "registered_events")]
        registered_evt_q,
    )?;

    // Wrap every virtio device stub in a virtio-PCI transport, creating the
    // MSI, shared-memory, ioevent, and VM control tubes each transport needs.
    for stub in stubs {
        let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
        add_control_tube(AnyControlTube::IrqTube(msi_host_tube));

        // Only devices that declare a shared memory region get a memory tube.
        let shared_memory_tube = if stub.dev.get_shared_memory_region().is_some() {
            let (host_tube, device_tube) =
                Tube::pair().context("failed to create shared memory tube")?;
            add_control_tube(
                VmMemoryTube {
                    tube: host_tube,
                    expose_with_viommu: stub.dev.expose_shmem_descriptors_with_viommu(),
                }
                .into(),
            );
            Some(device_tube)
        } else {
            None
        };

        let (ioevent_host_tube, ioevent_device_tube) =
            Tube::pair().context("failed to create ioevent tube")?;
        add_control_tube(
            VmMemoryTube {
                tube: ioevent_host_tube,
                expose_with_viommu: false,
            }
            .into(),
        );

        let (host_tube, device_tube) =
            Tube::pair().context("failed to create device control tube")?;
        add_control_tube(TaggedControlTube::Vm(host_tube).into());

        let dev = VirtioPciDevice::new(
            vm.get_memory().clone(),
            stub.dev,
            msi_device_tube,
            cfg.disable_virtio_intx,
            shared_memory_tube.map(VmMemoryClient::new),
            VmMemoryClient::new(ioevent_device_tube),
            device_tube,
        )
        .context("failed to create virtio pci dev")?;

        devices.push((Box::new(dev) as Box<dyn BusDeviceObj>, stub.jail));
    }

    #[cfg(feature = "usb")]
    if cfg.usb {
        // Create xhci controller.
        let usb_controller = Box::new(XhciController::new(
            vm.get_memory().clone(),
            Box::new(usb_provider),
        ));
        devices.push((
            usb_controller,
            simple_jail(cfg.jail_config.as_ref(), "xhci_device")?,
        ));
    }

    for params in &cfg.stub_pci_devices {
        // Stub devices don't need jailing since they don't do anything.
        devices.push((Box::new(StubPciDevice::new(params)), None));
    }

    // The pvpanic device is always present and gets its own clone of the VM event tube.
    devices.push((
        Box::new(PvPanicPciDevice::new(vm_evt_wrtube.try_clone()?)),
        None,
    ));

    Ok(devices)
}
1137
1138fn create_mmio_file_backed_mappings(
1139    cfg: &Config,
1140    vm: &mut impl Vm,
1141    resources: &mut SystemAllocator,
1142) -> Result<()> {
1143    for mapping in &cfg.file_backed_mappings_mmio {
1144        let file = mapping
1145            .open()
1146            .context("failed to open file for file-backed mapping")?;
1147        let prot = if mapping.writable {
1148            Protection::read_write()
1149        } else {
1150            Protection::read()
1151        };
1152        let size = mapping
1153            .size
1154            .try_into()
1155            .context("Invalid size for file-backed mapping")?;
1156        let memory_mapping = MemoryMappingBuilder::new(size)
1157            .from_file(&file)
1158            .offset(mapping.offset)
1159            .protection(prot)
1160            .build()
1161            .context("failed to map backing file for file-backed mapping")?;
1162
1163        let mapping_range = AddressRange::from_start_and_size(mapping.address, mapping.size)
1164            .context("failed to convert to AddressRange")?;
1165        match resources.mmio_allocator_any().allocate_at(
1166            mapping_range,
1167            Alloc::FileBacked(mapping.address),
1168            "file-backed mapping".to_owned(),
1169        ) {
1170            // OutOfSpace just means that this mapping is not in the MMIO regions at all, so don't
1171            // consider it an error.
1172            // TODO(b/222769529): Reserve this region in a global memory address space allocator
1173            // once we have that so nothing else can accidentally overlap with it.
1174            Ok(()) | Err(resources::Error::OutOfSpace) => {}
1175            e => e.context("failed to allocate guest address for file-backed mapping")?,
1176        }
1177
1178        vm.add_memory_region(
1179            GuestAddress(mapping.address),
1180            Box::new(memory_mapping),
1181            !mapping.writable,
1182            /* log_dirty_pages = */ false,
1183            MemCacheType::CacheCoherent,
1184        )
1185        .context("failed to configure file-backed mapping")?;
1186    }
1187
1188    Ok(())
1189}
1190
#[cfg(target_arch = "x86_64")]
/// Collection of devices related to PCI hotplug.
struct HotPlugStub {
    /// Map from bus index to hotplug bus.
    hotplug_buses: BTreeMap<u8, Arc<Mutex<dyn HotPlugBus>>>,
    /// Bus ranges of devices for virtio-iommu.
    iommu_bus_ranges: Vec<RangeInclusive<u32>>,
    /// Map from gpe index to GpeNotify devices.
    gpe_notify_devs: BTreeMap<u32, Arc<Mutex<dyn GpeNotify>>>,
    /// Map from bus index to PmeNotify devices.
    pme_notify_devs: BTreeMap<u8, Arc<Mutex<dyn PmeNotify>>>,
}
1203
1204#[cfg(target_arch = "x86_64")]
1205impl HotPlugStub {
1206    /// Constructs empty HotPlugStub.
1207    fn new() -> Self {
1208        Self {
1209            hotplug_buses: BTreeMap::new(),
1210            iommu_bus_ranges: Vec::new(),
1211            gpe_notify_devs: BTreeMap::new(),
1212            pme_notify_devs: BTreeMap::new(),
1213        }
1214    }
1215}
1216
#[cfg(target_arch = "x86_64")]
/// Creates PCIe root ports that are purely virtual, i.e. not linked to any host PCIe root
/// port.
///
/// Every occupied non-root PCI bus gets a (non-hotplug) virtual root port in front of it,
/// and `hp_bus_count` empty buses are reserved and given hotplug-capable root ports.
/// Returns the resulting `HotPlugStub` describing the hotplug buses and notify devices.
fn create_pure_virtual_pcie_root_port(
    sys_allocator: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    devices: &mut Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>,
    hp_bus_count: u8,
) -> Result<HotPlugStub> {
    let mut hp_sec_buses = Vec::new();
    let mut hp_stub = HotPlugStub::new();
    // Create Pcie Root Port for non-root buses, each non-root bus device will be
    // connected behind a virtual pcie root port.
    for i in 1..255 {
        if sys_allocator.pci_bus_empty(i) {
            // Remember the first `hp_bus_count` empty buses for hotplug root ports.
            if hp_sec_buses.len() < hp_bus_count.into() {
                hp_sec_buses.push(i);
            }
            continue;
        }
        let pcie_root_port = Arc::new(Mutex::new(PcieRootPort::new(i, false)));
        hp_stub
            .pme_notify_devs
            .insert(i, pcie_root_port.clone() as Arc<Mutex<dyn PmeNotify>>);
        let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
        add_control_tube(AnyControlTube::IrqTube(msi_host_tube));
        let pci_bridge = Box::new(PciBridge::new(pcie_root_port.clone(), msi_device_tube));
        // no ipc is used if the root port disables hotplug
        devices.push((pci_bridge, None));
    }

    // Create Pcie Root Port for hot-plug
    if hp_sec_buses.len() < hp_bus_count.into() {
        return Err(anyhow!("no more addresses are available"));
    }

    for hp_sec_bus in hp_sec_buses {
        let pcie_root_port = Arc::new(Mutex::new(PcieRootPort::new(hp_sec_bus, true)));
        hp_stub.pme_notify_devs.insert(
            hp_sec_bus,
            pcie_root_port.clone() as Arc<Mutex<dyn PmeNotify>>,
        );
        let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
        add_control_tube(AnyControlTube::IrqTube(msi_host_tube));
        let pci_bridge = Box::new(PciBridge::new(pcie_root_port.clone(), msi_device_tube));

        // Record the full bus range behind this bridge (secondary..=subordinate) so the
        // virtio-iommu can cover any device hotplugged under it.
        hp_stub.iommu_bus_ranges.push(RangeInclusive::new(
            PciAddress {
                bus: pci_bridge.get_secondary_num(),
                dev: 0,
                func: 0,
            }
            .to_u32(),
            PciAddress {
                bus: pci_bridge.get_subordinate_num(),
                dev: 32,
                func: 8,
            }
            .to_u32(),
        ));

        devices.push((pci_bridge, None));
        hp_stub
            .hotplug_buses
            .insert(hp_sec_bus, pcie_root_port as Arc<Mutex<dyn HotPlugBus>>);
    }
    Ok(hp_stub)
}
1287
1288/// For `vcpu_id`, return the pcpu that it's affined to. It's considered the "representative"
1289/// pcpu since there could be multiple pcpu's affined to a single vcpu, so arbitrarily return the
1290/// first pcpu we see. This "shouldn't" be an issue since ideally all the affined pcpu's have the
1291/// same capacity, frequency, etc.
1292fn get_representative_pcpu(vcpu_id: usize, vcpu_affinity: &Option<VcpuAffinity>) -> usize {
1293    match vcpu_affinity {
1294        // Default to pcpu 0 to preserve the intent to map all vcpu's to the same cluster of pcpu's.
1295        Some(VcpuAffinity::Global(s)) => s.iter().next().copied().unwrap_or(0),
1296        Some(VcpuAffinity::PerVcpu(m)) => match m.get(&vcpu_id) {
1297            Some(s) => s.iter().next().copied().unwrap_or(vcpu_id),
1298            None => vcpu_id,
1299        },
1300        None => vcpu_id,
1301    }
1302}
1303
1304/// Given `vcpu_affinity` (vcpu->pcpu mapping) and `host_capacity` (pcpu->pcpu capacity mapping),
1305/// return a mapping of vcpu->pcpu's capacity.
1306fn map_vcpu_capacity(
1307    vcpu_count: usize,
1308    vcpu_affinity: &Option<VcpuAffinity>,
1309    host_capacity: &BTreeMap<usize, u32>,
1310) -> anyhow::Result<BTreeMap<usize, u32>> {
1311    let mut mapped_capacity = BTreeMap::new();
1312    for vcpu_id in 0..vcpu_count {
1313        let pcpu_id = get_representative_pcpu(vcpu_id, vcpu_affinity);
1314        let capacity = host_capacity
1315            .get(&pcpu_id)
1316            .copied()
1317            .unwrap_or(DEFAULT_CPU_CAPACITY);
1318        mapped_capacity.insert(vcpu_id, capacity);
1319    }
1320    Ok(mapped_capacity)
1321}
1322
1323/// Given `vcpu_affinity` (vcpu->pcpu mapping) and `host_clusters` (cluster->pcpu mapping),
1324/// return a mapping of cluster->vcpu mapping.
1325fn map_vcpu_clusters(
1326    vcpu_count: usize,
1327    vcpu_affinity: &Option<VcpuAffinity>,
1328    host_clusters: Vec<arch::CpuSet>,
1329) -> anyhow::Result<Vec<arch::CpuSet>> {
1330    let mut pcpu_to_cluster = std::collections::BTreeMap::new();
1331    for (cluster_idx, cluster) in host_clusters.iter().enumerate() {
1332        for pcpu_id in cluster.iter() {
1333            pcpu_to_cluster.insert(*pcpu_id, cluster_idx);
1334        }
1335    }
1336
1337    let mut vcpu_clusters_sets: Vec<std::collections::BTreeSet<usize>> =
1338        vec![std::collections::BTreeSet::new(); host_clusters.len()];
1339
1340    for vcpu_id in 0..vcpu_count {
1341        let pcpu_id = get_representative_pcpu(vcpu_id, vcpu_affinity);
1342
1343        if let Some(&cluster_idx) = pcpu_to_cluster.get(&pcpu_id) {
1344            vcpu_clusters_sets[cluster_idx].insert(vcpu_id);
1345        }
1346    }
1347
1348    Ok(vcpu_clusters_sets
1349        .into_iter()
1350        .filter(|s| !s.is_empty())
1351        .map(arch::CpuSet::new)
1352        .collect())
1353}
1354
/// Assembles the `VmComponents` handed to the architecture-specific VM builder from the
/// parsed command-line `Config`: opens the kernel/BIOS, initrd, pvm_fw and pflash images,
/// sizes the swiotlb, and derives per-vcpu frequency/capacity/cluster/cgroup properties.
fn setup_vm_components(cfg: &Config) -> Result<VmComponents> {
    let initrd_image = if let Some(initrd_path) = &cfg.initrd_path {
        Some(
            open_file_or_duplicate(initrd_path, OpenOptions::new().read(true))
                .with_context(|| format!("failed to open initrd {}", initrd_path.display()))?,
        )
    } else {
        None
    };
    let pvm_fw_image = if let Some(pvm_fw_path) = &cfg.pvm_fw {
        Some(
            open_file_or_duplicate(pvm_fw_path, OpenOptions::new().read(true))
                .with_context(|| format!("failed to open pvm_fw {}", pvm_fw_path.display()))?,
        )
    } else {
        None
    };

    // Config validation elsewhere guarantees exactly one of kernel/BIOS is set, hence the
    // panic in the fallthrough arm.
    let vm_image = match cfg.executable_path {
        Some(Executable::Kernel(ref kernel_path)) => VmImage::Kernel(
            open_file_or_duplicate(kernel_path, OpenOptions::new().read(true)).with_context(
                || format!("failed to open kernel image {}", kernel_path.display()),
            )?,
        ),
        Some(Executable::Bios(ref bios_path)) => VmImage::Bios(
            open_file_or_duplicate(bios_path, OpenOptions::new().read(true))
                .with_context(|| format!("failed to open bios {}", bios_path.display()))?,
        ),
        _ => panic!("Did not receive a bios or kernel, should be impossible."),
    };

    // Swiotlb size is given in MiB; protected VMs default to 64 MiB when none is requested,
    // unprotected VMs get none.
    let swiotlb = if let Some(size) = cfg.swiotlb {
        Some(
            size.checked_mul(1024 * 1024)
                .ok_or_else(|| anyhow!("requested swiotlb size too large"))?,
        )
    } else if matches!(cfg.protection_type, ProtectionType::Unprotected) {
        None
    } else {
        Some(64 * 1024 * 1024)
    };

    let (pflash_image, pflash_block_size) = if let Some(pflash_parameters) = &cfg.pflash_parameters
    {
        (
            Some(
                open_file_or_duplicate(
                    &pflash_parameters.path,
                    OpenOptions::new().read(true).write(true),
                )
                .with_context(|| {
                    format!("failed to open pflash {}", pflash_parameters.path.display())
                })?,
            ),
            pflash_parameters.block_size,
        )
    } else {
        (None, 0)
    };

    // Maps vcpu -> the corresponding vcpu's frequency.
    #[allow(unused_mut)]
    let mut vcpu_frequencies: BTreeMap<usize, Vec<u32>> = BTreeMap::new();
    #[cfg(target_arch = "aarch64")]
    let mut normalized_cpu_ipc_ratios = BTreeMap::new();

    // if --enable-fw-cfg or --fw-cfg was given, we want to enable fw_cfg
    let fw_cfg_enable = cfg.enable_fw_cfg || !cfg.fw_cfg_parameters.is_empty();
    // With --host-cpu-topology, mirror the host's CPU capacity and cluster layout onto the
    // vcpus; otherwise use whatever was given on the command line.
    let (vcpu_clusters, vcpu_capacity) = if cfg.host_cpu_topology {
        let host_capacity = Arch::get_host_cpu_capacity()?;
        let mapped_capacity = map_vcpu_capacity(
            cfg.vcpu_count.unwrap_or(1),
            &cfg.vcpu_affinity,
            &host_capacity,
        )?;

        let host_clusters = Arch::get_host_cpu_clusters()?;
        let mapped_clusters = map_vcpu_clusters(
            cfg.vcpu_count.unwrap_or(1),
            &cfg.vcpu_affinity,
            host_clusters,
        )?;

        (mapped_clusters, mapped_capacity)
    } else {
        (cfg.cpu_clusters.clone(), cfg.cpu_capacity.clone())
    };

    // When mirroring the host topology, the derived per-vcpu capacities double as the IPC
    // ratios used for normalization below.
    #[cfg(target_arch = "aarch64")]
    let cpu_ipc_ratio = if cfg.host_cpu_topology {
        &vcpu_capacity
    } else {
        &cfg.cpu_ipc_ratio
    };

    // vcpu -> cgroup directory and vcpu -> frequency-domain index, filled in only when
    // --cpu-freq-domains is in use.
    #[cfg(target_arch = "aarch64")]
    let mut vcpu_domain_paths = BTreeMap::new();
    #[cfg(target_arch = "aarch64")]
    let mut vcpu_domains = BTreeMap::new();

    #[cfg(target_arch = "aarch64")]
    if cfg.virt_cpufreq || cfg.virt_cpufreq_v2 {
        // Explicit --cpu-frequencies-khz wins; otherwise derive frequency tables from the
        // host CPUs each vcpu is affined to.
        if !cfg.cpu_frequencies_khz.is_empty() {
            vcpu_frequencies = cfg.cpu_frequencies_khz.clone();
        } else {
            match Arch::get_host_cpu_frequencies_khz() {
                Ok(host_cpu_frequencies) => {
                    for vcpu_id in 0..cfg.vcpu_count.unwrap_or(1) {
                        let vcpu_affinity = match cfg.vcpu_affinity.clone() {
                            Some(VcpuAffinity::Global(v)) => v,
                            Some(VcpuAffinity::PerVcpu(mut m)) => {
                                m.remove(&vcpu_id).unwrap_or_default()
                            }
                            None => {
                                panic!("There must be some vcpu_affinity setting with VirtCpufreq enabled!")
                            }
                        };

                        // Check that the physical CPUs that the vCPU is affined to all share the
                        // same frequency domain.
                        if let Some(freq_domain) = host_cpu_frequencies.get(&vcpu_affinity[0]) {
                            for cpu in vcpu_affinity.iter() {
                                if let Some(frequencies) = host_cpu_frequencies.get(cpu) {
                                    if frequencies != freq_domain {
                                        panic!("Affined CPUs do not share a frequency domain!");
                                    }
                                }
                            }
                            vcpu_frequencies.insert(vcpu_id, freq_domain.clone());
                        } else {
                            panic!("No frequency domain for vcpu:{vcpu_id}");
                        }
                    }
                }
                Err(e) => {
                    warn!("Unable to get host cpu frequencies {:#}", e);
                }
            }
        }

        if !vcpu_frequencies.is_empty() {
            let host_max_freqs = Arch::get_host_cpu_max_freq_khz()?;
            // Find the highest maximum frequency over all host CPUs. The guest CPU IPC ratios will
            // be normalized by dividing by this value.
            let host_max_freq = host_max_freqs.values().copied().max().unwrap_or_default();

            normalized_cpu_ipc_ratios = normalize_cpu_ipc_ratios(
                vcpu_frequencies.iter().map(|(vcpu_id, frequencies)| {
                    (
                        *vcpu_id,
                        frequencies.iter().copied().max().unwrap_or_default(),
                    )
                }),
                host_max_freq,
                |vcpu_id| {
                    cpu_ipc_ratio
                        .get(&vcpu_id)
                        .copied()
                        .unwrap_or(DEFAULT_CPU_CAPACITY)
                },
            )?;

            // Frequency domains require cgroup v2: each domain gets a threaded sub-cgroup so
            // vcpu threads can be assigned per-domain.
            if !cfg.cpu_freq_domains.is_empty() {
                let cgroup_path = cfg
                    .vcpu_cgroup_path
                    .clone()
                    .context("cpu_freq_domains requires vcpu_cgroup_path")?;

                if !cgroup_path.join("cgroup.controllers").exists() {
                    panic!("CGroupsV2 must be enabled for cpu freq domain support!");
                }

                // Assign parent crosvm process to top level cgroup
                let cgroup_procs_path = cgroup_path.join("cgroup.procs");
                std::fs::write(
                    cgroup_procs_path.clone(),
                    process::id().to_string().as_bytes(),
                )
                .with_context(|| {
                    format!(
                        "failed to create vcpu-cgroup-path {}",
                        cgroup_procs_path.display(),
                    )
                })?;

                for (freq_domain_idx, cpus) in cfg.cpu_freq_domains.iter().enumerate() {
                    let vcpu_domain_path =
                        cgroup_path.join(format!("vcpu-domain{freq_domain_idx}"));
                    // Create subtree for domain
                    create_dir_all(&vcpu_domain_path)?;

                    // Set vcpu_domain cgroup type as 'threaded' to get thread level granularity
                    // controls
                    // NOTE(review): vcpu_domain_path is already rooted at cgroup_path, so the
                    // outer join is redundant for absolute cgroup paths and would duplicate the
                    // prefix for relative ones — confirm intent.
                    let cgroup_type_path = cgroup_path.join(vcpu_domain_path.join("cgroup.type"));
                    std::fs::write(cgroup_type_path.clone(), b"threaded").with_context(|| {
                        format!(
                            "failed to create vcpu-cgroup-path {}",
                            cgroup_type_path.display(),
                        )
                    })?;
                    for core_idx in cpus.iter() {
                        vcpu_domain_paths.insert(*core_idx, vcpu_domain_path.clone());
                        vcpu_domains.insert(*core_idx, freq_domain_idx as u32);
                    }
                }
            }
        }
    }

    let vcpu_count = cfg.vcpu_count.unwrap_or(1);
    let vcpu_properties = arch::derive_vcpu_properties(
        vcpu_count,
        &vcpu_capacity,
        &cfg.dynamic_power_coefficient,
        &vcpu_frequencies,
        #[cfg(all(
            target_arch = "aarch64",
            any(target_os = "android", target_os = "linux")
        ))]
        &normalized_cpu_ipc_ratios,
        #[cfg(all(
            target_arch = "aarch64",
            any(target_os = "android", target_os = "linux")
        ))]
        &vcpu_domains,
        #[cfg(all(
            target_arch = "aarch64",
            any(target_os = "android", target_os = "linux")
        ))]
        &vcpu_domain_paths,
    );

    Ok(VmComponents {
        #[cfg(target_arch = "x86_64")]
        ac_adapter: cfg.ac_adapter,
        #[cfg(target_arch = "x86_64")]
        break_linux_pci_config_io: cfg.break_linux_pci_config_io,
        // Memory size is given in MiB; default to 256 MiB.
        memory_size: cfg
            .memory
            .unwrap_or(256)
            .checked_mul(1024 * 1024)
            .ok_or_else(|| anyhow!("requested memory size too large"))?,
        swiotlb,
        fw_cfg_enable,
        bootorder_fw_cfg_blob: Vec::new(),
        vcpu_properties,
        vcpu_affinity: cfg.vcpu_affinity.clone(),
        fw_cfg_parameters: cfg.fw_cfg_parameters.clone(),
        vcpu_clusters,
        dev_pm: cfg.dev_pm,
        no_smt: cfg.no_smt,
        hugepages: cfg.hugepages,
        hv_cfg: hypervisor::Config {
            #[cfg(target_arch = "aarch64")]
            mte: cfg.mte,
            protection_type: cfg.protection_type,
            #[cfg(all(target_os = "android", target_arch = "aarch64"))]
            ffa: cfg.ffa.map(|g| g.auto).unwrap_or(false),
            force_disable_readonly_mem: cfg.force_disable_readonly_mem,
        },
        vm_image,
        android_fstab: cfg
            .android_fstab
            .as_ref()
            .map(|x| {
                File::open(x)
                    .with_context(|| format!("failed to open android fstab file {}", x.display()))
            })
            .map_or(Ok(None), |v| v.map(Some))?,
        pstore: cfg.pstore.clone(),
        pflash_block_size,
        pflash_image,
        initrd_image,
        extra_kernel_params: cfg.params.clone(),
        acpi_sdts: cfg
            .acpi_tables
            .iter()
            .map(|path| {
                SDT::from_file(path)
                    .with_context(|| format!("failed to open ACPI file {}", path.display()))
            })
            .collect::<Result<Vec<SDT>>>()?,
        rt_cpus: cfg.rt_cpus.clone(),
        delay_rt: cfg.delay_rt,
        no_i8042: cfg.no_i8042,
        no_rtc: cfg.no_rtc,
        #[cfg(target_arch = "x86_64")]
        smbios: cfg.smbios.clone(),
        host_cpu_topology: cfg.host_cpu_topology,
        itmt: cfg.itmt,
        #[cfg(target_arch = "x86_64")]
        force_s2idle: cfg.force_s2idle,
        pvm_fw: pvm_fw_image,
        pci_config: cfg.pci_config,
        boot_cpu: cfg.boot_cpu,
        vfio_platform_pm: cfg.vfio_platform_pm,
        #[cfg(target_arch = "aarch64")]
        virt_cpufreq_v2: cfg.virt_cpufreq_v2,
        smccc_trng: cfg.smccc_trng,
        #[cfg(target_arch = "aarch64")]
        sve_config: cfg.sve.unwrap_or_default(),
    })
}
1658
#[cfg(target_arch = "aarch64")]
/// Scales each CPU's IPC ratio by that CPU's share of the host maximum
/// frequency, i.e. `ipc_ratio * max_freq / host_max_freq` per CPU.
///
/// Returns an error if `host_max_freq` is zero or if any scaled ratio does not
/// fit in a `u32`.
fn normalize_cpu_ipc_ratios(
    max_frequency_per_cpu: impl Iterator<Item = (usize, u32)>,
    host_max_freq: u32,
    cpu_ipc_ratio: impl Fn(usize) -> u32,
) -> Result<BTreeMap<usize, u32>> {
    if host_max_freq == 0 {
        return Err(anyhow!("invalid host_max_freq 0"));
    }

    let host_max_freq = u64::from(host_max_freq);
    max_frequency_per_cpu
        .map(|(cpu_id, max_freq)| {
            // Do the intermediate math in u64 so the u32 * u32 product cannot overflow.
            let scaled =
                u64::from(cpu_ipc_ratio(cpu_id)) * u64::from(max_freq) / host_max_freq;
            let scaled =
                u32::try_from(scaled).context("normalized CPU IPC ratio out of u32 range")?;
            Ok((cpu_id, scaled))
        })
        .collect()
}
1686
/// Terminal state of the VM, returned by the `run_*` entry points so the caller
/// can decide how the crosvm process should exit.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum ExitState {
    /// The VM stopped and should be reset/restarted.
    Reset,
    /// The VM stopped normally.
    Stop,
    /// The VM stopped due to a crash.
    Crash,
    /// The VM stopped because the guest panicked.
    GuestPanic,
    /// The VM stopped due to a watchdog-triggered reset.
    WatchdogReset,
}
1695
// Replaces ranges in `guest_mem_layout` that overlap with ranges in `file_backed_mappings`.
// Returns the updated guest memory layout.
//
// Each RAM file-backed mapping must lie entirely within a single region of the input layout,
// and every region it overlaps (after earlier mappings' splits) must share the same `purpose`;
// either condition failing is an error. Overlapped regions are split so each mapping ends up
// in its own region marked `file_backed`.
fn punch_holes_in_guest_mem_layout_for_mappings(
    guest_mem_layout: Vec<(GuestAddress, u64, MemoryRegionOptions)>,
    file_backed_mappings_ram: &[FileBackedMappingParameters],
) -> Result<Vec<(GuestAddress, u64, MemoryRegionOptions)>> {
    // Create a set containing (start, end) pairs with exclusive end (end = start + size; the byte
    // at end is not included in the range).
    let mut layout_set = BTreeSet::new();
    for (addr, size, options) in &guest_mem_layout {
        layout_set.insert((addr.offset(), addr.offset() + size, options.clone()));
    }

    // Make sure the RAM mappings are a subset of the RAM memory layout.
    // For simplicity, we currently require each mapping to be fully contained within a single
    // region of the input layout.
    for mapping in file_backed_mappings_ram {
        anyhow::ensure!(
            layout_set
                .iter()
                .any(|(addr, size, _)| *addr <= mapping.address
                    && mapping.address + mapping.size <= *addr + *size),
            "RAM file-backed-mapping must be a subset of a RAM region"
        );
    }

    for mapping in file_backed_mappings_ram.iter().cloned() {
        let mapping_start = mapping.address;
        let mapping_end = mapping_start + mapping.size;
        // Purpose of the first region this mapping overlapped; every subsequently overlapped
        // region must agree with it.
        let mut purpose = None;
        // Repeatedly split overlapping guest memory regions until no overlaps remain.
        while let Some((range_start, range_end, options)) = layout_set
            .iter()
            .find(|&&(range_start, range_end, _)| {
                mapping_start < range_end && mapping_end > range_start
            })
            .cloned()
        {
            let purpose = *purpose.get_or_insert(options.purpose);
            anyhow::ensure!(
                options.purpose == purpose,
                "RAM file-backed-mapping cannot span regions with different purposes: {:?} vs {:?}",
                options.purpose,
                purpose
            );

            // Remove the overlapped region, then re-insert only the parts of it (if any) that
            // lie outside the mapping.
            layout_set.remove(&(range_start, range_end, options.clone()));

            if range_start < mapping_start {
                layout_set.insert((range_start, mapping_start, options.clone()));
            }
            if range_end > mapping_end {
                layout_set.insert((mapping_end, range_end, options));
            }
        }
        // NOTE(review): `unwrap()` relies on the subset check above — every mapping overlaps at
        // least one region (splitting preserves total coverage), so the loop body should always
        // run once and set `purpose`.
        layout_set.insert((
            mapping_start,
            mapping_end,
            MemoryRegionOptions::new()
                .purpose(purpose.unwrap())
                .file_backed(mapping),
        ));
    }

    // Build the final guest memory layout from the modified layout_set.
    Ok(layout_set
        .into_iter()
        .map(|(start, end, options)| (GuestAddress(start), end - start, options))
        .collect())
}
1766
1767fn create_guest_memory(
1768    cfg: &Config,
1769    components: &VmComponents,
1770    arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout,
1771    hypervisor: &impl Hypervisor,
1772) -> Result<GuestMemory> {
1773    let guest_mem_layout = Arch::guest_memory_layout(components, arch_memory_layout, hypervisor)
1774        .context("failed to create guest memory layout")?;
1775
1776    let guest_mem_layout = punch_holes_in_guest_mem_layout_for_mappings(
1777        guest_mem_layout,
1778        &cfg.file_backed_mappings_ram,
1779    )?;
1780
1781    let mut guest_mem = GuestMemory::new_with_options(&guest_mem_layout)
1782        .context("failed to create guest memory")?;
1783    let mut mem_policy = MemoryPolicy::empty();
1784    if components.hugepages {
1785        mem_policy |= MemoryPolicy::USE_HUGEPAGES;
1786    }
1787
1788    if cfg.lock_guest_memory {
1789        mem_policy |= MemoryPolicy::LOCK_GUEST_MEMORY;
1790    }
1791    // When sandboxing is enabled, we can MADV_REMOVE from the balloon process, otherwise, fallback
1792    // to using FALLOC_FL_PUNCH_HOLE.
1793    if cfg.jail_config.is_none() {
1794        mem_policy |= MemoryPolicy::USE_PUNCHHOLE_LOCKED;
1795    }
1796    guest_mem.set_memory_policy(mem_policy);
1797
1798    if cfg.unmap_guest_memory_on_fork {
1799        // Note that this isn't compatible with sandboxing. We could potentially fix that by
1800        // delaying the call until after the sandboxed devices are forked. However, the main use
1801        // for this is in conjunction with protected VMs, where most of the guest memory has been
1802        // unshared with the host. We'd need to be confident that the guest memory is unshared with
1803        // the host only after the `use_dontfork` call and those details will vary by hypervisor.
1804        // So, for now we keep things simple to be safe.
1805        guest_mem.use_dontfork().context("use_dontfork failed")?;
1806    }
1807
1808    Ok(guest_mem)
1809}
1810
#[cfg(all(target_arch = "aarch64", feature = "geniezone"))]
/// Runs the VM on the GenieZone hypervisor: opens the device, builds guest
/// memory, creates the VM and in-kernel IRQ chip, then hands off to `run_vm`.
fn run_gz(device_path: Option<&Path>, cfg: Config, components: VmComponents) -> Result<ExitState> {
    use devices::GeniezoneKernelIrqChip;
    use hypervisor::geniezone::Geniezone;
    use hypervisor::geniezone::GeniezoneVcpu;
    use hypervisor::geniezone::GeniezoneVm;

    let device_path = device_path.unwrap_or(Path::new(GENIEZONE_PATH));
    let gzvm = Geniezone::new_with_path(device_path)
        .with_context(|| format!("failed to open GenieZone device {}", device_path.display()))?;

    let arch_memory_layout =
        Arch::arch_memory_layout(&components).context("failed to create arch memory layout")?;
    let guest_mem = create_guest_memory(&cfg, &components, &arch_memory_layout, &gzvm)?;

    // Launch the vmm-swap monitor process when a swap directory was configured.
    #[cfg(feature = "swap")]
    let swap_controller = cfg
        .swap_dir
        .as_ref()
        .map(|swap_dir| {
            SwapController::launch(guest_mem.clone(), swap_dir, cfg.jail_config.as_ref())
                .context("launch vmm-swap monitor process")
        })
        .transpose()?;

    let vm =
        GeniezoneVm::new(&gzvm, guest_mem, components.hv_cfg).context("failed to create vm")?;

    // Check that the VM was actually created in protected mode as expected.
    if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
        bail!("Failed to create protected VM");
    }
    let vm_clone = vm.try_clone().context("failed to clone vm")?;

    // GenieZone only supports the in-kernel irqchip, which needs no IOAPIC tube.
    let ioapic_host_tube = None;
    let mut irq_chip = match cfg.irq_chip.unwrap_or_default() {
        IrqChipKind::Split => bail!("Geniezone does not support split irqchip mode"),
        IrqChipKind::Userspace => bail!("Geniezone does not support userspace irqchip mode"),
        IrqChipKind::Kernel { allow_vgic_its: _ } => {
            GeniezoneKernelIrqChip::new(vm_clone, components.vcpu_properties.len())
                .context("failed to create IRQ chip")?
        }
    };

    run_vm::<GeniezoneVcpu, GeniezoneVm>(
        cfg,
        components,
        &arch_memory_layout,
        vm,
        &mut irq_chip,
        ioapic_host_tube,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
1867
#[cfg(all(target_arch = "aarch64", feature = "halla"))]
/// Runs the VM on the Halla hypervisor: opens the device, builds guest memory,
/// creates the VM and in-kernel IRQ chip, then hands off to `run_vm`.
fn run_halla(
    device_path: Option<&Path>,
    cfg: Config,
    components: VmComponents,
) -> Result<ExitState> {
    use devices::HallaKernelIrqChip;
    use hypervisor::halla::Halla;
    use hypervisor::halla::HallaVcpu;
    use hypervisor::halla::HallaVm;

    let device_path = device_path.unwrap_or(Path::new(HALLA_PATH));
    let hvm = Halla::new_with_path(device_path)
        .with_context(|| format!("failed to open Halla device {}", device_path.display()))?;

    let arch_memory_layout =
        Arch::arch_memory_layout(&components).context("failed to create arch memory layout")?;
    let guest_mem = create_guest_memory(&cfg, &components, &arch_memory_layout, &hvm)?;

    // Launch the vmm-swap monitor process when a swap directory was configured.
    #[cfg(feature = "swap")]
    let swap_controller = cfg
        .swap_dir
        .as_ref()
        .map(|swap_dir| {
            SwapController::launch(guest_mem.clone(), swap_dir, cfg.jail_config.as_ref())
                .context("launch vmm-swap monitor process")
        })
        .transpose()?;

    let vm = HallaVm::new(&hvm, guest_mem, components.hv_cfg).context("failed to create vm")?;

    // Check that the VM was actually created in protected mode as expected.
    if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
        bail!("Failed to create protected VM");
    }
    let vm_clone = vm.try_clone().context("failed to clone vm")?;

    // Halla only supports the in-kernel irqchip, which needs no IOAPIC tube.
    let ioapic_host_tube = None;
    let mut irq_chip = match cfg.irq_chip.unwrap_or_default() {
        IrqChipKind::Split => bail!("Halla does not support split irqchip mode"),
        IrqChipKind::Userspace => bail!("Halla does not support userspace irqchip mode"),
        IrqChipKind::Kernel { allow_vgic_its: _ } => {
            HallaKernelIrqChip::new(vm_clone, components.vcpu_properties.len())
                .context("failed to create IRQ chip")?
        }
    };

    run_vm::<HallaVcpu, HallaVm>(
        cfg,
        components,
        &arch_memory_layout,
        vm,
        &mut irq_chip,
        ioapic_host_tube,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
1927
/// Runs the VM on the KVM hypervisor.
///
/// Opens the KVM device at `device_path` (defaulting to `KVM_PATH`), builds
/// guest memory, creates the VM and the IRQ chip selected by `cfg.irq_chip`,
/// then transfers control to `run_vm`.
fn run_kvm(device_path: Option<&Path>, cfg: Config, components: VmComponents) -> Result<ExitState> {
    use devices::KvmKernelIrqChip;
    #[cfg(target_arch = "x86_64")]
    use devices::KvmSplitIrqChip;
    use hypervisor::kvm::Kvm;
    use hypervisor::kvm::KvmVcpu;
    use hypervisor::kvm::KvmVm;

    let device_path = device_path.unwrap_or(Path::new(KVM_PATH));
    let kvm = Kvm::new_with_path(device_path)
        .with_context(|| format!("failed to open KVM device {}", device_path.display()))?;

    let arch_memory_layout =
        Arch::arch_memory_layout(&components).context("failed to create arch memory layout")?;
    let guest_mem = create_guest_memory(&cfg, &components, &arch_memory_layout, &kvm)?;

    // Launch the vmm-swap monitor process when a swap directory was configured.
    #[cfg(feature = "swap")]
    let swap_controller = if let Some(swap_dir) = cfg.swap_dir.as_ref() {
        Some(
            SwapController::launch(guest_mem.clone(), swap_dir, cfg.jail_config.as_ref())
                .context("launch vmm-swap monitor process")?,
        )
    } else {
        None
    };

    let vm = KvmVm::new(&kvm, guest_mem, components.hv_cfg).context("failed to create vm")?;

    // With itmt enabled, deny the guest read access to MSR_PLATFORM_INFO.
    #[cfg(target_arch = "x86_64")]
    if cfg.itmt {
        vm.set_platform_info_read_access(false)
            .context("failed to disable MSR_PLATFORM_INFO read access")?;
    }

    // Check that the VM was actually created in protected mode as expected.
    // This check is only needed on aarch64. On x86_64, protected VM creation will fail
    // if protected mode is not supported.
    #[cfg(not(target_arch = "x86_64"))]
    if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
        bail!("Failed to create protected VM");
    }
    let vm_clone = vm.try_clone().context("failed to clone vm")?;

    // Local wrapper so the x86_64-only split irqchip and the in-kernel irqchip can be stored in
    // one variable and handed to run_vm through a single trait object.
    enum KvmIrqChip {
        #[cfg(target_arch = "x86_64")]
        Split(KvmSplitIrqChip),
        Kernel(KvmKernelIrqChip),
    }

    impl KvmIrqChip {
        // Borrows whichever variant is active as the architecture IRQ chip trait object.
        fn as_mut(&mut self) -> &mut dyn IrqChipArch {
            match self {
                #[cfg(target_arch = "x86_64")]
                KvmIrqChip::Split(i) => i,
                KvmIrqChip::Kernel(i) => i,
            }
        }
    }

    // Deferred init: assigned exactly once in whichever match arm doesn't bail.
    let ioapic_host_tube;
    let mut irq_chip = match cfg.irq_chip.unwrap_or_default() {
        IrqChipKind::Userspace => {
            bail!("KVM userspace irqchip mode not implemented");
        }
        IrqChipKind::Split => {
            // On non-x86_64 this arm bails; the x86_64-only block below is compiled out.
            #[cfg(not(target_arch = "x86_64"))]
            bail!("KVM split irqchip mode only supported on x86 processors");
            #[cfg(target_arch = "x86_64")]
            {
                let (host_tube, ioapic_device_tube) =
                    Tube::pair().context("failed to create tube")?;
                ioapic_host_tube = Some(host_tube);
                KvmIrqChip::Split(
                    KvmSplitIrqChip::new(
                        vm_clone,
                        components.vcpu_properties.len(),
                        ioapic_device_tube,
                        // NOTE(review): 24 presumably is the IOAPIC pin count — confirm against
                        // KvmSplitIrqChip::new.
                        Some(24),
                    )
                    .context("failed to create IRQ chip")?,
                )
            }
        }
        IrqChipKind::Kernel {
            #[cfg(target_arch = "aarch64")]
            allow_vgic_its,
        } => {
            // The in-kernel irqchip needs no userspace IOAPIC tube.
            ioapic_host_tube = None;
            KvmIrqChip::Kernel(
                KvmKernelIrqChip::new(
                    vm_clone,
                    components.vcpu_properties.len(),
                    #[cfg(target_arch = "aarch64")]
                    allow_vgic_its,
                )
                .context("failed to create IRQ chip")?,
            )
        }
    };

    run_vm::<KvmVcpu, KvmVm>(
        cfg,
        components,
        &arch_memory_layout,
        vm,
        irq_chip.as_mut(),
        ioapic_host_tube,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
2039
#[cfg(all(target_arch = "aarch64", feature = "gunyah"))]
/// Runs the VM on the Gunyah hypervisor: opens the device, builds guest
/// memory, creates the VM and its IRQ chip, then hands off to `run_vm`.
fn run_gunyah(
    device_path: Option<&Path>,
    qcom_trusted_vm_id: Option<u16>,
    qcom_trusted_vm_pas_id: Option<u32>,
    cfg: Config,
    components: VmComponents,
) -> Result<ExitState> {
    use devices::GunyahIrqChip;
    use hypervisor::gunyah::Gunyah;
    use hypervisor::gunyah::GunyahVcpu;
    use hypervisor::gunyah::GunyahVm;

    let device_path = device_path.unwrap_or(Path::new(GUNYAH_PATH));
    let gunyah = Gunyah::new_with_path(device_path)
        .with_context(|| format!("failed to open Gunyah device {}", device_path.display()))?;

    let arch_memory_layout =
        Arch::arch_memory_layout(&components).context("failed to create arch memory layout")?;
    let guest_mem = create_guest_memory(&cfg, &components, &arch_memory_layout, &gunyah)?;

    // Launch the vmm-swap monitor process when a swap directory was configured.
    #[cfg(feature = "swap")]
    let swap_controller = cfg
        .swap_dir
        .as_ref()
        .map(|swap_dir| {
            SwapController::launch(guest_mem.clone(), swap_dir, cfg.jail_config.as_ref())
                .context("launch vmm-swap monitor process")
        })
        .transpose()?;

    let vm = GunyahVm::new(
        &gunyah,
        qcom_trusted_vm_id,
        qcom_trusted_vm_pas_id,
        guest_mem,
        components.hv_cfg,
    )
    .context("failed to create vm")?;

    // Check that the VM was actually created in protected mode as expected.
    if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
        bail!("Failed to create protected VM");
    }

    let mut irq_chip = GunyahIrqChip::new(vm.try_clone()?)?;

    run_vm::<GunyahVcpu, GunyahVm>(
        cfg,
        components,
        &arch_memory_layout,
        vm,
        &mut irq_chip,
        None,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
2098
2099/// Choose a default hypervisor if no `--hypervisor` option was specified.
2100fn get_default_hypervisor() -> Option<HypervisorKind> {
2101    let kvm_path = Path::new(KVM_PATH);
2102    if kvm_path.exists() {
2103        return Some(HypervisorKind::Kvm {
2104            device: Some(kvm_path.to_path_buf()),
2105        });
2106    }
2107
2108    #[cfg(all(target_arch = "aarch64", feature = "geniezone"))]
2109    {
2110        let gz_path = Path::new(GENIEZONE_PATH);
2111        if gz_path.exists() {
2112            return Some(HypervisorKind::Geniezone {
2113                device: Some(gz_path.to_path_buf()),
2114            });
2115        }
2116    }
2117
2118    #[cfg(target_arch = "aarch64")]
2119    #[cfg(feature = "halla")]
2120    {
2121        let halla_path = Path::new(HALLA_PATH);
2122        if halla_path.exists() {
2123            return Some(HypervisorKind::Halla {
2124                device: Some(halla_path.to_path_buf()),
2125            });
2126        }
2127    }
2128
2129    #[cfg(all(unix, target_arch = "aarch64", feature = "gunyah"))]
2130    {
2131        let gunyah_path = Path::new(GUNYAH_PATH);
2132        if gunyah_path.exists() {
2133            return Some(HypervisorKind::Gunyah {
2134                device: Some(gunyah_path.to_path_buf()),
2135                qcom_trusted_vm_id: None,
2136                qcom_trusted_vm_pas_id: None,
2137            });
2138        }
2139    }
2140
2141    None
2142}
2143
2144pub fn run_config(cfg: Config) -> Result<ExitState> {
2145    let components = setup_vm_components(&cfg)?;
2146
2147    let hypervisor = cfg
2148        .hypervisor
2149        .clone()
2150        .or_else(get_default_hypervisor)
2151        .context("no enabled hypervisor")?;
2152
2153    debug!("creating hypervisor: {:?}", hypervisor);
2154
2155    match hypervisor {
2156        HypervisorKind::Kvm { device } => run_kvm(device.as_deref(), cfg, components),
2157        #[cfg(all(target_arch = "aarch64", feature = "geniezone"))]
2158        HypervisorKind::Geniezone { device } => run_gz(device.as_deref(), cfg, components),
2159        #[cfg(target_arch = "aarch64")]
2160        #[cfg(feature = "halla")]
2161        HypervisorKind::Halla { device } => run_halla(device.as_deref(), cfg, components),
2162        #[cfg(all(unix, target_arch = "aarch64", feature = "gunyah"))]
2163        HypervisorKind::Gunyah {
2164            device,
2165            qcom_trusted_vm_id,
2166            qcom_trusted_vm_pas_id,
2167        } => run_gunyah(
2168            device.as_deref(),
2169            qcom_trusted_vm_id,
2170            qcom_trusted_vm_pas_id,
2171            cfg,
2172            components,
2173        ),
2174    }
2175}
2176
2177fn run_vm<Vcpu, V>(
2178    cfg: Config,
2179    #[allow(unused_mut)] mut components: VmComponents,
2180    arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout,
2181    mut vm: V,
2182    irq_chip: &mut dyn IrqChipArch,
2183    ioapic_host_tube: Option<Tube>,
2184    #[cfg(feature = "swap")] mut swap_controller: Option<SwapController>,
2185) -> Result<ExitState>
2186where
2187    Vcpu: VcpuArch + 'static,
2188    V: VmArch + 'static,
2189{
2190    if cfg.jail_config.is_some() {
2191        // Printing something to the syslog before entering minijail so that libc's syslogger has a
2192        // chance to open files necessary for its operation, like `/etc/localtime`. After jailing,
2193        // access to those files will not be possible.
2194        info!("crosvm entering multiprocess mode");
2195    }
2196
2197    let (metrics_send, metrics_recv) = Tube::directional_pair().context("metrics tube")?;
2198    metrics::initialize(metrics_send);
2199
2200    #[cfg(all(feature = "pci-hotplug", feature = "swap"))]
2201    let swap_device_helper = match &swap_controller {
2202        Some(swap_controller) => Some(swap_controller.create_device_helper()?),
2203        None => None,
2204    };
2205    // pci-hotplug is only implemented for x86_64 for now, attempting to use it on other platform
2206    // would crash.
2207    #[cfg(all(feature = "pci-hotplug", not(target_arch = "x86_64")))]
2208    if cfg.pci_hotplug_slots.is_some() {
2209        bail!("pci-hotplug is not implemented for non x86_64 architecture");
2210    }
2211    // hotplug_manager must be created before vm is started since it forks jail warden process.
2212    #[cfg(feature = "pci-hotplug")]
2213    // TODO(293801301): Remove unused_mut after aarch64 support
2214    #[allow(unused_mut)]
2215    let mut hotplug_manager = if cfg.pci_hotplug_slots.is_some() {
2216        Some(PciHotPlugManager::new(
2217            vm.get_memory().clone(),
2218            &cfg,
2219            #[cfg(feature = "swap")]
2220            swap_device_helper,
2221        )?)
2222    } else {
2223        None
2224    };
2225
2226    #[cfg(feature = "usb")]
2227    let (usb_control_tube, usb_provider) =
2228        DeviceProvider::new().context("failed to create usb provider")?;
2229
2230    // Masking signals is inherently dangerous, since this can persist across clones/execs. Do this
2231    // before any jailed devices have been spawned, so that we can catch any of them that fail very
2232    // quickly.
2233    let sigchld_fd = SignalFd::new(libc::SIGCHLD).context("failed to create signalfd")?;
2234
2235    let control_server_socket = match &cfg.socket_path {
2236        Some(path) => Some(UnlinkUnixSeqpacketListener(
2237            UnixSeqpacketListener::bind(path).context("failed to create control server")?,
2238        )),
2239        None => None,
2240    };
2241
2242    let mut all_control_tubes = Vec::new();
2243    let mut add_control_tube = |t| all_control_tubes.push(t);
2244
2245    if let Some(ioapic_host_tube) = ioapic_host_tube {
2246        add_control_tube(AnyControlTube::IrqTube(ioapic_host_tube));
2247    }
2248
2249    let battery = if cfg.battery_config.is_some() {
2250        #[cfg_attr(
2251            not(feature = "power-monitor-powerd"),
2252            allow(clippy::manual_map, clippy::needless_match, unused_mut)
2253        )]
2254        let jail = if let Some(jail_config) = cfg.jail_config.as_ref() {
2255            let mut config = SandboxConfig::new(jail_config, "battery");
2256            #[cfg(feature = "power-monitor-powerd")]
2257            {
2258                config.bind_mounts = true;
2259            }
2260            let mut jail =
2261                create_sandbox_minijail(&jail_config.pivot_root, MAX_OPEN_FILES_DEFAULT, &config)?;
2262
2263            // Setup a bind mount to the system D-Bus socket if the powerd monitor is used.
2264            #[cfg(feature = "power-monitor-powerd")]
2265            {
2266                let system_bus_socket_path = Path::new("/run/dbus/system_bus_socket");
2267                jail.mount_bind(system_bus_socket_path, system_bus_socket_path, true)?;
2268            }
2269            Some(jail)
2270        } else {
2271            None
2272        };
2273        (cfg.battery_config.as_ref().map(|c| c.type_), jail)
2274    } else {
2275        (cfg.battery_config.as_ref().map(|c| c.type_), None)
2276    };
2277
2278    let (vm_evt_wrtube, vm_evt_rdtube) =
2279        Tube::directional_pair().context("failed to create vm event tube")?;
2280
2281    let pstore_size = components.pstore.as_ref().map(|pstore| pstore.size as u64);
2282    let mut sys_allocator = SystemAllocator::new(
2283        Arch::get_system_allocator_config(&vm, arch_memory_layout),
2284        pstore_size,
2285        &cfg.mmio_address_ranges,
2286    )
2287    .context("failed to create system allocator")?;
2288
2289    let ramoops_region = match &components.pstore {
2290        Some(pstore) => Some(
2291            arch::pstore::create_memory_region(
2292                &mut vm,
2293                sys_allocator.reserved_region().unwrap(),
2294                pstore,
2295            )
2296            .context("failed to allocate pstore region")?,
2297        ),
2298        None => None,
2299    };
2300
2301    create_mmio_file_backed_mappings(&cfg, &mut vm, &mut sys_allocator)?;
2302
2303    #[cfg(feature = "gpu")]
2304    // Hold on to the render server jail so it keeps running until we exit run_vm()
2305    let (_render_server_jail, render_server_fd) =
2306        if let Some(parameters) = &cfg.gpu_render_server_parameters {
2307            let (jail, fd) = start_gpu_render_server(&cfg, parameters)?;
2308            (Some(ScopedMinijail(jail)), Some(fd))
2309        } else {
2310            (None, None)
2311        };
2312
2313    let mut iommu_attached_endpoints: BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>> =
2314        BTreeMap::new();
2315    let mut iova_max_addr: Option<u64> = None;
2316
2317    let mut vfio_container_manager = VfioContainerManager::new();
2318
2319    #[cfg(feature = "registered_events")]
2320    let (reg_evt_wrtube, reg_evt_rdtube) =
2321        Tube::directional_pair().context("failed to create registered event tube")?;
2322
2323    let mut worker_process_pids = BTreeSet::new();
2324
2325    let mut devices = create_devices(
2326        &cfg,
2327        &mut vm,
2328        &mut sys_allocator,
2329        &mut add_control_tube,
2330        &vm_evt_wrtube,
2331        &mut iommu_attached_endpoints,
2332        #[cfg(feature = "usb")]
2333        usb_provider,
2334        #[cfg(feature = "gpu")]
2335        render_server_fd,
2336        &mut iova_max_addr,
2337        #[cfg(feature = "registered_events")]
2338        &reg_evt_wrtube,
2339        &mut vfio_container_manager,
2340        &mut worker_process_pids,
2341    )?;
2342
2343    #[cfg(feature = "pci-hotplug")]
2344    // TODO(293801301): Remove unused_variables after aarch64 support
2345    #[allow(unused_variables)]
2346    let pci_hotplug_slots = cfg.pci_hotplug_slots;
2347    #[cfg(not(feature = "pci-hotplug"))]
2348    #[allow(unused_variables)]
2349    let pci_hotplug_slots: Option<u8> = None;
2350    #[cfg(target_arch = "x86_64")]
2351    let hp_stub = create_pure_virtual_pcie_root_port(
2352        &mut sys_allocator,
2353        &mut add_control_tube,
2354        &mut devices,
2355        pci_hotplug_slots.unwrap_or(1),
2356    )?;
2357
2358    arch::assign_pci_addresses(&mut devices, &mut sys_allocator)?;
2359
2360    let pci_devices: Vec<&dyn PciDevice> = devices
2361        .iter()
2362        .filter_map(|d| (d.0).as_pci_device())
2363        .collect();
2364
2365    let virtio_devices: Vec<(&dyn VirtioDevice, devices::PciAddress)> = pci_devices
2366        .into_iter()
2367        .flat_map(|s| {
2368            if let Some(virtio_pci_device) = s.as_virtio_pci_device() {
2369                std::iter::zip(
2370                    Some(virtio_pci_device.virtio_device()),
2371                    virtio_pci_device.pci_address(),
2372                )
2373                .next()
2374            } else {
2375                None
2376            }
2377        })
2378        .collect();
2379
2380    let mut open_firmware_device_paths: Vec<(Vec<u8>, usize)> = virtio_devices
2381        .iter()
2382        .flat_map(|s| (s.0).bootorder_fw_cfg(s.1.dev))
2383        .collect();
2384
2385    // order the OpenFirmware device paths, in ascending order, by their boot_index
2386    open_firmware_device_paths.sort_by(|a, b| (a.1).cmp(&(b.1)));
2387
2388    // "/pci@iocf8/" is x86 specific and represents the root at the system bus port
2389    let mut bootorder_fw_cfg_blob =
2390        open_firmware_device_paths
2391            .into_iter()
2392            .fold(Vec::new(), |a, b| {
2393                a.into_iter()
2394                    .chain("/pci@i0cf8/".as_bytes().iter().copied())
2395                    .chain(b.0)
2396                    .chain("\n".as_bytes().iter().copied())
2397                    .collect()
2398            });
2399
2400    // the "bootorder" file is expected to end with a null terminator
2401    bootorder_fw_cfg_blob.push(0);
2402
2403    components.bootorder_fw_cfg_blob = bootorder_fw_cfg_blob;
2404
2405    // if the bootindex argument was given, we want to make sure that fw_cfg is enabled so the
2406    // "bootorder" file can be accessed by the guest.
2407    components.fw_cfg_enable |= components.bootorder_fw_cfg_blob.len() > 1;
2408
2409    let (translate_response_senders, request_rx) = setup_virtio_access_platform(
2410        &mut sys_allocator,
2411        &mut iommu_attached_endpoints,
2412        &mut devices,
2413    )?;
2414
2415    #[cfg(target_arch = "x86_64")]
2416    let iommu_bus_ranges = hp_stub.iommu_bus_ranges;
2417    #[cfg(not(target_arch = "x86_64"))]
2418    let iommu_bus_ranges = Vec::new();
2419
2420    let iommu_host_tube = if !iommu_attached_endpoints.is_empty()
2421        || (cfg.vfio_isolate_hotplug && !iommu_bus_ranges.is_empty())
2422    {
2423        let (iommu_host_tube, iommu_device_tube) = Tube::pair().context("failed to create tube")?;
2424        let iommu_dev = create_iommu_device(
2425            cfg.protection_type,
2426            cfg.jail_config.as_ref(),
2427            iova_max_addr.unwrap_or(u64::MAX),
2428            iommu_attached_endpoints,
2429            iommu_bus_ranges,
2430            translate_response_senders,
2431            request_rx,
2432            iommu_device_tube,
2433        )?;
2434
2435        let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
2436        add_control_tube(AnyControlTube::IrqTube(msi_host_tube));
2437        let (ioevent_host_tube, ioevent_device_tube) =
2438            Tube::pair().context("failed to create ioevent tube")?;
2439        add_control_tube(
2440            VmMemoryTube {
2441                tube: ioevent_host_tube,
2442                expose_with_viommu: false,
2443            }
2444            .into(),
2445        );
2446        let (host_tube, device_tube) =
2447            Tube::pair().context("failed to create device control tube")?;
2448        add_control_tube(TaggedControlTube::Vm(host_tube).into());
2449        let mut dev = VirtioPciDevice::new(
2450            vm.get_memory().clone(),
2451            iommu_dev.dev,
2452            msi_device_tube,
2453            cfg.disable_virtio_intx,
2454            None,
2455            VmMemoryClient::new(ioevent_device_tube),
2456            device_tube,
2457        )
2458        .context("failed to create virtio pci dev")?;
2459        // early reservation for viommu.
2460        dev.allocate_address(&mut sys_allocator)
2461            .context("failed to allocate resources early for virtio pci dev")?;
2462        let dev = Box::new(dev);
2463        devices.push((dev, iommu_dev.jail));
2464        Some(iommu_host_tube)
2465    } else {
2466        None
2467    };
2468
2469    #[cfg(target_arch = "x86_64")]
2470    for device in devices
2471        .iter_mut()
2472        .filter_map(|(dev, _)| dev.as_pci_device_mut())
2473    {
2474        device
2475            .generate_acpi(&mut components.acpi_sdts)
2476            .with_context(|| format!("generate_acpi failed for {}", device.debug_label()))?;
2477    }
2478
2479    // KVM_CREATE_VCPU uses apic id for x86 and uses cpu id for others.
2480    let mut vcpu_ids = Vec::new();
2481
2482    let guest_suspended_cvar = if cfg.force_s2idle {
2483        Some(Arc::new((Mutex::new(false), Condvar::new())))
2484    } else {
2485        None
2486    };
2487
2488    let dt_overlays = cfg
2489        .device_tree_overlay
2490        .iter()
2491        .map(|o| {
2492            Ok(DtbOverlay {
2493                file: open_file_or_duplicate(o.path.as_path(), OpenOptions::new().read(true))
2494                    .with_context(|| {
2495                        format!("failed to open device tree overlay {}", o.path.display())
2496                    })?,
2497                do_filter: o.filter_devs,
2498            })
2499        })
2500        .collect::<Result<Vec<DtbOverlay>>>()?;
2501
2502    #[cfg(target_arch = "aarch64")]
2503    let vcpu_domain_paths: BTreeMap<usize, PathBuf> = components
2504        .vcpu_properties
2505        .iter()
2506        .filter_map(|(id, props)| {
2507            props
2508                .vcpu_domain_path
2509                .as_ref()
2510                .map(|path| (*id, path.clone()))
2511        })
2512        .collect();
2513
2514    let mut linux = Arch::build_vm::<V, Vcpu>(
2515        components,
2516        arch_memory_layout,
2517        &vm_evt_wrtube,
2518        &mut sys_allocator,
2519        &cfg.serial_parameters,
2520        simple_jail(cfg.jail_config.as_ref(), "serial_device")?,
2521        battery,
2522        vm,
2523        ramoops_region,
2524        devices,
2525        irq_chip,
2526        &mut vcpu_ids,
2527        cfg.dump_device_tree_blob.clone(),
2528        simple_jail(cfg.jail_config.as_ref(), "serial_device")?,
2529        #[cfg(target_arch = "x86_64")]
2530        simple_jail(cfg.jail_config.as_ref(), "block_device")?,
2531        #[cfg(target_arch = "x86_64")]
2532        simple_jail(cfg.jail_config.as_ref(), "fw_cfg_device")?,
2533        #[cfg(feature = "swap")]
2534        &mut swap_controller,
2535        guest_suspended_cvar.clone(),
2536        dt_overlays,
2537        cfg.fdt_position,
2538        cfg.no_pmu,
2539    )
2540    .context("the architecture failed to build the vm")?;
2541
2542    for tube in linux.vm_request_tubes.drain(..) {
2543        add_control_tube(TaggedControlTube::Vm(tube).into());
2544    }
2545
2546    #[cfg(target_arch = "x86_64")]
2547    let (hp_control_tube, hp_worker_tube) = mpsc::channel();
2548    #[cfg(all(feature = "pci-hotplug", target_arch = "x86_64"))]
2549    if let Some(hotplug_manager) = &mut hotplug_manager {
2550        hotplug_manager.set_rootbus_controller(hp_control_tube.clone())?;
2551    }
2552    #[cfg(target_arch = "x86_64")]
2553    let hp_thread = {
2554        for (bus_num, hp_bus) in hp_stub.hotplug_buses.into_iter() {
2555            #[cfg(feature = "pci-hotplug")]
2556            if let Some(hotplug_manager) = &mut hotplug_manager {
2557                hotplug_manager.add_port(hp_bus)?;
2558            } else {
2559                linux.hotplug_bus.insert(bus_num, hp_bus);
2560            }
2561            #[cfg(not(feature = "pci-hotplug"))]
2562            linux.hotplug_bus.insert(bus_num, hp_bus);
2563        }
2564
2565        if let Some(pm) = &linux.pm {
2566            for (gpe, notify_dev) in hp_stub.gpe_notify_devs.into_iter() {
2567                pm.lock().register_gpe_notify_dev(gpe, notify_dev);
2568            }
2569            for (bus, notify_dev) in hp_stub.pme_notify_devs.into_iter() {
2570                pm.lock().register_pme_notify_dev(bus, notify_dev);
2571            }
2572        }
2573
2574        let (hp_vm_mem_host_tube, hp_vm_mem_worker_tube) =
2575            Tube::pair().context("failed to create tube")?;
2576        add_control_tube(
2577            VmMemoryTube {
2578                tube: hp_vm_mem_host_tube,
2579                expose_with_viommu: false,
2580            }
2581            .into(),
2582        );
2583
2584        let supports_readonly_mapping = linux.vm.supports_readonly_mapping();
2585        let pci_root = linux.root_config.clone();
2586        std::thread::Builder::new()
2587            .name("pci_root".to_string())
2588            .spawn(move || {
2589                start_pci_root_worker(
2590                    supports_readonly_mapping,
2591                    pci_root,
2592                    hp_worker_tube,
2593                    hp_vm_mem_worker_tube,
2594                )
2595            })?
2596    };
2597
2598    let flags = RutabagaGrallocBackendFlags::new().disable_vulkano();
2599    let gralloc = RutabagaGralloc::new(flags).context("failed to create gralloc")?;
2600
2601    run_control(
2602        linux,
2603        sys_allocator,
2604        cfg,
2605        control_server_socket,
2606        all_control_tubes,
2607        #[cfg(feature = "usb")]
2608        usb_control_tube,
2609        vm_evt_rdtube,
2610        vm_evt_wrtube,
2611        sigchld_fd,
2612        gralloc,
2613        vcpu_ids,
2614        iommu_host_tube,
2615        #[cfg(target_arch = "x86_64")]
2616        hp_control_tube,
2617        #[cfg(target_arch = "x86_64")]
2618        hp_thread,
2619        #[cfg(feature = "pci-hotplug")]
2620        hotplug_manager,
2621        #[cfg(feature = "swap")]
2622        swap_controller,
2623        #[cfg(feature = "registered_events")]
2624        reg_evt_rdtube,
2625        guest_suspended_cvar,
2626        metrics_recv,
2627        vfio_container_manager,
2628        worker_process_pids,
2629        #[cfg(target_arch = "aarch64")]
2630        vcpu_domain_paths,
2631    )
2632}
2633
// Hotplug commands can deadlock when they acquire the PCI root lock on the VM
// control thread. The deadlock arises when the VM control thread (thread A) is
// handling a hotplug command and tries to take the PCI root lock while that
// lock is already held by a device on another thread (thread B), which is
// itself sending a VM control request to thread A and blocking on the
// response. Thread A is blocked on the lock, so neither thread can make
// progress. To resolve this, we add this worker thread and push all work that
// locks the PCI root to it.
2642#[cfg(target_arch = "x86_64")]
2643fn start_pci_root_worker(
2644    supports_readonly_mapping: bool,
2645    pci_root: Arc<Mutex<PciRoot>>,
2646    hp_device_tube: mpsc::Receiver<PciRootCommand>,
2647    vm_control_tube: Tube,
2648) {
2649    struct PciMmioMapperTube {
2650        supports_readonly_mapping: bool,
2651        vm_control_tube: Tube,
2652        registered_regions: BTreeMap<u32, VmMemoryRegionId>,
2653        next_id: u32,
2654    }
2655
2656    impl PciMmioMapper for PciMmioMapperTube {
2657        fn supports_readonly_mapping(&self) -> bool {
2658            self.supports_readonly_mapping
2659        }
2660
2661        fn add_mapping(&mut self, addr: GuestAddress, shmem: &SharedMemory) -> anyhow::Result<u32> {
2662            let shmem = shmem
2663                .try_clone()
2664                .context("failed to create new SharedMemory")?;
2665            self.vm_control_tube
2666                .send(&VmMemoryRequest::RegisterMemory {
2667                    source: VmMemorySource::SharedMemory(shmem),
2668                    dest: VmMemoryDestination::GuestPhysicalAddress(addr.0),
2669                    prot: Protection::read(),
2670                    cache: MemCacheType::CacheCoherent,
2671                })
2672                .context("failed to send request")?;
2673            match self.vm_control_tube.recv::<VmMemoryResponse>() {
2674                Ok(VmMemoryResponse::RegisterMemory { region_id, .. }) => {
2675                    let cur_id = self.next_id;
2676                    self.registered_regions.insert(cur_id, region_id);
2677                    self.next_id += 1;
2678                    Ok(cur_id)
2679                }
2680                res => bail!("Bad response: {:?}", res),
2681            }
2682        }
2683    }
2684
2685    let mut mapper = PciMmioMapperTube {
2686        supports_readonly_mapping,
2687        vm_control_tube,
2688        registered_regions: BTreeMap::new(),
2689        next_id: 0,
2690    };
2691
2692    loop {
2693        match hp_device_tube.recv() {
2694            Ok(cmd) => match cmd {
2695                PciRootCommand::Add(addr, device) => {
2696                    if let Err(e) = pci_root.lock().add_device(addr, device, &mut mapper) {
2697                        error!("failed to add hotplugged device to PCI root port: {}", e);
2698                    }
2699                }
2700                PciRootCommand::AddBridge(pci_bus) => {
2701                    if let Err(e) = pci_root.lock().add_bridge(pci_bus) {
2702                        error!("failed to add hotplugged bridge to PCI root port: {}", e);
2703                    }
2704                }
2705                PciRootCommand::Remove(addr) => {
2706                    pci_root.lock().remove_device(addr);
2707                }
2708                PciRootCommand::Kill => break,
2709            },
2710            Err(e) => {
2711                error!("Error: pci root worker channel closed: {}", e);
2712                break;
2713            }
2714        }
2715    }
2716}
2717
2718#[cfg(target_arch = "x86_64")]
2719fn get_hp_bus<V: VmArch, Vcpu: VcpuArch>(
2720    linux: &RunnableLinuxVm<V, Vcpu>,
2721    host_addr: PciAddress,
2722) -> Result<Arc<Mutex<dyn HotPlugBus>>> {
2723    for (_, hp_bus) in linux.hotplug_bus.iter() {
2724        if hp_bus.lock().is_match(host_addr).is_some() {
2725            return Ok(hp_bus.clone());
2726        }
2727    }
2728    Err(anyhow!("Failed to find a suitable hotplug bus"))
2729}
2730
/// Hot-plugs one device, identified by its host PCI address, into a running VM.
///
/// `device.device_type` selects the flow:
/// * `UpstreamPort` / `DownstreamPort`: wraps the host PCIe port in an emulated
///   `PciBridge` and records the bridge's secondary bus as a new hotplug bus.
/// * `EndPoint`: creates a VFIO device for the host PCI function and, when
///   `iommu_host_tube` is present, attaches the endpoint to the virtio-iommu.
///
/// In both cases the device is registered with `Arch::register_pci_device`,
/// recorded on the matching hotplug bus, and — if `device.hp_interrupt` is set
/// — a hot-plug notification is delivered to the guest.
#[cfg(target_arch = "x86_64")]
fn add_hotplug_device<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    cfg: &Config,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    hp_control_tube: &mpsc::Sender<PciRootCommand>,
    iommu_host_tube: Option<&Tube>,
    device: &HotPlugDeviceInfo,
    #[cfg(feature = "swap")] swap_controller: &mut Option<SwapController>,
    vfio_container_manager: &mut VfioContainerManager,
) -> Result<()> {
    // The host PCI address (parsed from the sysfs-style path) identifies the
    // device on both the host and the hotplug bus.
    let host_addr = PciAddress::from_path(&device.path)
        .context("failed to parse hotplug device's PCI address")?;
    let hp_bus = get_hp_bus(linux, host_addr)?;

    let (hotplug_key, pci_address) = match device.device_type {
        HotPlugDeviceType::UpstreamPort | HotPlugDeviceType::DownstreamPort => {
            // Control tubes for the emulated port: one for VM requests, one
            // for MSI interrupts.
            let (vm_host_tube, vm_device_tube) = Tube::pair().context("failed to create tube")?;
            add_control_tube(TaggedControlTube::Vm(vm_host_tube).into());
            let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
            add_control_tube(AnyControlTube::IrqTube(msi_host_tube));
            let pcie_host = PcieHostPort::new(device.path.as_path(), vm_device_tube)?;
            let (hotplug_key, pci_bridge) = match device.device_type {
                HotPlugDeviceType::UpstreamPort => {
                    let hotplug_key = HotPlugKey::HostUpstreamPort { host_addr };
                    let pcie_upstream_port = Arc::new(Mutex::new(PcieUpstreamPort::new_from_host(
                        pcie_host, true,
                    )?));
                    let pci_bridge =
                        Box::new(PciBridge::new(pcie_upstream_port.clone(), msi_device_tube));
                    // The bridge's secondary bus becomes a hotplug bus so
                    // devices below this port can be plugged later.
                    linux
                        .hotplug_bus
                        .insert(pci_bridge.get_secondary_num(), pcie_upstream_port);
                    (hotplug_key, pci_bridge)
                }
                HotPlugDeviceType::DownstreamPort => {
                    let hotplug_key = HotPlugKey::HostDownstreamPort { host_addr };
                    let pcie_downstream_port = Arc::new(Mutex::new(
                        PcieDownstreamPort::new_from_host(pcie_host, true)?,
                    ));
                    let pci_bridge = Box::new(PciBridge::new(
                        pcie_downstream_port.clone(),
                        msi_device_tube,
                    ));
                    linux
                        .hotplug_bus
                        .insert(pci_bridge.get_secondary_num(), pcie_downstream_port);
                    (hotplug_key, pci_bridge)
                }
                // The outer match arm only admits the two port types above.
                _ => {
                    bail!("Impossible to reach here")
                }
            };
            let pci_address = Arch::register_pci_device(
                linux,
                pci_bridge,
                None,
                sys_allocator,
                hp_control_tube,
                #[cfg(feature = "swap")]
                swap_controller,
            )?;

            (hotplug_key, pci_address)
        }
        HotPlugDeviceType::EndPoint => {
            let hotplug_key = HotPlugKey::HostVfio { host_addr };
            let (vfio_device, jail, viommu_mapper) = create_vfio_device(
                cfg.jail_config.as_ref(),
                &linux.vm,
                sys_allocator,
                add_control_tube,
                &device.path,
                true,
                None,
                None,
                None,
                // Route the endpoint through the virtio-iommu only when an
                // IOMMU tube was supplied by the caller.
                if iommu_host_tube.is_some() {
                    IommuDevType::VirtioIommu
                } else {
                    IommuDevType::NoIommu
                },
                None,
                vfio_container_manager,
            )?;
            let vfio_pci_device = match vfio_device {
                VfioDeviceVariant::Pci(pci) => Box::new(pci),
                VfioDeviceVariant::Platform(_) => bail!("vfio platform hotplug not supported"),
            };
            let pci_address = Arch::register_pci_device(
                linux,
                vfio_pci_device,
                jail,
                sys_allocator,
                hp_control_tube,
                #[cfg(feature = "swap")]
                swap_controller,
            )?;
            // Tell the virtio-iommu about the new endpoint so DMA mappings for
            // it are serviced.
            if let Some(iommu_host_tube) = iommu_host_tube {
                let endpoint_addr = pci_address.to_u32();
                let vfio_wrapper = viommu_mapper.context("expected mapper")?;
                let descriptor = vfio_wrapper.clone_as_raw_descriptor()?;
                let request =
                    VirtioIOMMURequest::VfioCommand(VirtioIOMMUVfioCommand::VfioDeviceAdd {
                        endpoint_addr,
                        wrapper_id: vfio_wrapper.id(),
                        container: {
                            // SAFETY:
                            // Safe because the descriptor is uniquely owned by `descriptor`.
                            unsafe { File::from_raw_descriptor(descriptor) }
                        },
                    });
                match virtio_iommu_request(iommu_host_tube, &request)
                    .map_err(|_| VirtioIOMMUVfioError::SocketFailed)?
                {
                    VirtioIOMMUResponse::VfioResponse(VirtioIOMMUVfioResult::Ok) => (),
                    resp => bail!("Unexpected message response: {:?}", resp),
                }
            }

            (hotplug_key, pci_address)
        }
    };
    // Record the device on its bus and optionally notify the guest.
    hp_bus.lock().add_hotplug_device(hotplug_key, pci_address);
    if device.hp_interrupt {
        hp_bus.lock().hot_plug(pci_address)?;
    }
    Ok(())
}
2861
/// Hot-plugs a virtio-net device described by `net_param` and returns the PCI
/// bus number it was placed on.
#[cfg(feature = "pci-hotplug")]
fn add_hotplug_net<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    hotplug_manager: &mut PciHotPlugManager,
    net_param: NetParameters,
) -> Result<u8> {
    // MSI interrupt tube pair.
    let (msi_host, msi_device) = Tube::pair().context("create tube")?;
    add_control_tube(AnyControlTube::IrqTube(msi_host));
    // ioevent tube pair; the device side is wrapped in a VmMemoryClient.
    let (ioevent_host, ioevent_device) = Tube::pair().context("create tube")?;
    let ioevent_vm_memory_client = VmMemoryClient::new(ioevent_device);
    let ioevent_tube = VmMemoryTube {
        tube: ioevent_host,
        expose_with_viommu: false,
    };
    add_control_tube(ioevent_tube.into());
    // VM control tube pair for the device itself.
    let (vm_control_host, vm_control_device) = Tube::pair().context("create tube")?;
    add_control_tube(TaggedControlTube::Vm(vm_control_host).into());
    // Bundle the tubes into a resource carrier and hand it to the hotplug
    // manager, which performs the actual plug-in.
    let carrier = NetResourceCarrier::new(
        net_param,
        msi_device,
        ioevent_vm_memory_client,
        vm_control_device,
    );
    hotplug_manager.hotplug_device(
        vec![ResourceCarrier::VirtioNet(carrier)],
        linux,
        sys_allocator,
    )
}
2895
/// Dispatches a network hotplug control command to the add or remove handler
/// and returns the resulting `VmResponse`.
#[cfg(feature = "pci-hotplug")]
fn handle_hotplug_net_command<V: VmArch, Vcpu: VcpuArch>(
    net_cmd: NetControlCommand,
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    hotplug_manager: &mut PciHotPlugManager,
) -> VmResponse {
    match net_cmd {
        // Plug a new tap-backed virtio-net device.
        NetControlCommand::AddTap(tap_name) => {
            handle_hotplug_net_add(
                linux,
                sys_allocator,
                add_control_tube,
                hotplug_manager,
                &tap_name,
            )
        }
        // Unplug the device on the given bus.
        NetControlCommand::RemoveTap(bus) => {
            handle_hotplug_net_remove(linux, sys_allocator, hotplug_manager, bus)
        }
    }
}
2917
/// Hot-plugs a virtio-net device backed by the named host tap interface,
/// translating the outcome into a `VmResponse` for the control socket.
#[cfg(feature = "pci-hotplug")]
fn handle_hotplug_net_add<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    hotplug_manager: &mut PciHotPlugManager,
    tap_name: &str,
) -> VmResponse {
    // Build default net parameters around the existing tap; everything else
    // is left at its most basic configuration.
    let net_param = NetParameters {
        mode: NetParametersMode::TapName {
            tap_name: tap_name.to_owned(),
            mac: None,
        },
        vhost_net: None,
        vq_pairs: None,
        packed_queue: false,
        pci_address: None,
        mrg_rxbuf: false,
    };
    match add_hotplug_net(
        linux,
        sys_allocator,
        add_control_tube,
        hotplug_manager,
        net_param,
    ) {
        Ok(bus) => VmResponse::PciHotPlugResponse { bus },
        Err(e) => VmResponse::ErrString(format!("{e:?}")),
    }
}
2951
/// Hot-unplugs the virtio-net device on PCI bus `bus`, translating the outcome
/// into a `VmResponse` for the control socket.
#[cfg(feature = "pci-hotplug")]
fn handle_hotplug_net_remove<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    hotplug_manager: &mut PciHotPlugManager,
    bus: u8,
) -> VmResponse {
    if let Err(e) = hotplug_manager.remove_hotplug_device(bus, linux, sys_allocator) {
        return VmResponse::ErrString(format!("{e:?}"));
    }
    VmResponse::Ok
}
2964
2965#[cfg(target_arch = "x86_64")]
2966fn remove_hotplug_bridge<V: VmArch, Vcpu: VcpuArch>(
2967    linux: &RunnableLinuxVm<V, Vcpu>,
2968    sys_allocator: &mut SystemAllocator,
2969    buses_to_remove: &mut Vec<u8>,
2970    hotplug_key: HotPlugKey,
2971    child_bus: u8,
2972) -> Result<()> {
2973    for (bus_num, hp_bus) in linux.hotplug_bus.iter() {
2974        let mut hp_bus_lock = hp_bus.lock();
2975        if let Some(pci_addr) = hp_bus_lock.get_hotplug_device(hotplug_key) {
2976            sys_allocator.release_pci(pci_addr);
2977            hp_bus_lock.hot_unplug(pci_addr)?;
2978            buses_to_remove.push(child_bus);
2979            if hp_bus_lock.is_empty() {
2980                if let Some(hotplug_key) = hp_bus_lock.get_hotplug_key() {
2981                    remove_hotplug_bridge(
2982                        linux,
2983                        sys_allocator,
2984                        buses_to_remove,
2985                        hotplug_key,
2986                        *bus_num,
2987                    )?;
2988                }
2989            }
2990            return Ok(());
2991        }
2992    }
2993
2994    Err(anyhow!(
2995        "Can not find device {:?} on hotplug buses",
2996        hotplug_key
2997    ))
2998}
2999
/// Hot-unplugs the device identified by `device` (upstream port, downstream
/// port, or VFIO endpoint) from the VM.
///
/// Detaches the endpoint from the virtio-iommu when `iommu_host_tube` is
/// present, releases its PCI resources, and tears down any bridges that become
/// empty as a result — including empty sibling downstream ports of the same
/// host bus (see the note on TBT devices below).
#[cfg(target_arch = "x86_64")]
fn remove_hotplug_device<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    iommu_host_tube: Option<&Tube>,
    device: &HotPlugDeviceInfo,
) -> Result<()> {
    let host_addr = PciAddress::from_path(&device.path)?;
    // Reconstruct the hotplug key the device was registered under in
    // add_hotplug_device().
    let hotplug_key = match device.device_type {
        HotPlugDeviceType::UpstreamPort => HotPlugKey::HostUpstreamPort { host_addr },
        HotPlugDeviceType::DownstreamPort => HotPlugKey::HostDownstreamPort { host_addr },
        HotPlugDeviceType::EndPoint => HotPlugKey::HostVfio { host_addr },
    };

    // Find the hotplug bus that currently holds this device.
    let hp_bus = linux
        .hotplug_bus
        .iter()
        .find(|(_, hp_bus)| {
            let hp_bus = hp_bus.lock();
            hp_bus.get_hotplug_device(hotplug_key).is_some()
        })
        .map(|(bus_num, hp_bus)| (*bus_num, hp_bus.clone()));

    if let Some((bus_num, hp_bus)) = hp_bus {
        let mut buses_to_remove = Vec::new();
        let mut removed_key = None;
        let mut hp_bus_lock = hp_bus.lock();
        if let Some(pci_addr) = hp_bus_lock.get_hotplug_device(hotplug_key) {
            // Detach the endpoint from the virtio-iommu first so no DMA
            // mappings outlive the device.
            if let Some(iommu_host_tube) = iommu_host_tube {
                let request =
                    VirtioIOMMURequest::VfioCommand(VirtioIOMMUVfioCommand::VfioDeviceDel {
                        endpoint_addr: pci_addr.to_u32(),
                    });
                match virtio_iommu_request(iommu_host_tube, &request)
                    .map_err(|_| VirtioIOMMUVfioError::SocketFailed)?
                {
                    VirtioIOMMUResponse::VfioResponse(VirtioIOMMUVfioResult::Ok) => (),
                    resp => bail!("Unexpected message response: {:?}", resp),
                }
            }
            // Check whether every sibling downstream port (same host bus,
            // different guest bus) is already empty.
            let mut empty_simbling = true;
            if let Some(HotPlugKey::HostDownstreamPort { host_addr }) =
                hp_bus_lock.get_hotplug_key()
            {
                let addr_alias = host_addr;
                for (simbling_bus_num, hp_bus) in linux.hotplug_bus.iter() {
                    if *simbling_bus_num != bus_num {
                        let hp_bus_lock = hp_bus.lock();
                        let hotplug_key = hp_bus_lock.get_hotplug_key();
                        if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = hotplug_key {
                            if addr_alias.bus == host_addr.bus && !hp_bus_lock.is_empty() {
                                empty_simbling = false;
                                break;
                            }
                        }
                    }
                }
            }

            // If all sibling downstream ports are empty, do not send a hot
            // unplug event for this downstream port: the root port will send
            // one plug-out interrupt and remove all the remaining devices.
            if !empty_simbling {
                hp_bus_lock.hot_unplug(pci_addr)?;
            }

            sys_allocator.release_pci(pci_addr);
            // Tear down this bridge (and, recursively, its now-empty parents)
            // when it no longer hosts any device.
            if empty_simbling || hp_bus_lock.is_empty() {
                if let Some(hotplug_key) = hp_bus_lock.get_hotplug_key() {
                    removed_key = Some(hotplug_key);
                    remove_hotplug_bridge(
                        linux,
                        sys_allocator,
                        &mut buses_to_remove,
                        hotplug_key,
                        bus_num,
                    )?;
                }
            }
        }

        // Some types of TBT (Thunderbolt) device have a few empty downstream
        // ports. The emulated bridges of these ports won't be removed above
        // since no VFIO device is connected to them. So we explicitly check
        // all sibling bridges of the removed bridge here, and remove any that
        // have no child device connected.
        if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = removed_key {
            let addr_alias = host_addr;
            for (simbling_bus_num, hp_bus) in linux.hotplug_bus.iter() {
                if *simbling_bus_num != bus_num {
                    let hp_bus_lock = hp_bus.lock();
                    let hotplug_key = hp_bus_lock.get_hotplug_key();
                    if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = hotplug_key {
                        if addr_alias.bus == host_addr.bus && hp_bus_lock.is_empty() {
                            remove_hotplug_bridge(
                                linux,
                                sys_allocator,
                                &mut buses_to_remove,
                                hotplug_key.unwrap(),
                                *simbling_bus_num,
                            )?;
                        }
                    }
                }
            }
        }
        // Finally drop all buses queued for removal from the bus map.
        for bus in buses_to_remove.iter() {
            linux.hotplug_bus.remove(bus);
        }
        return Ok(());
    }

    Err(anyhow!(
        "Can not find device {:?} on hotplug buses",
        hotplug_key
    ))
}
3116
3117pub fn trigger_vm_suspend_and_wait_for_entry(
3118    guest_suspended_cvar: Arc<(Mutex<bool>, Condvar)>,
3119    tube: &SendTube,
3120    response: vm_control::VmResponse,
3121    suspend_tube: Arc<Mutex<SendTube>>,
3122    pm: Option<Arc<Mutex<dyn PmResource + Send>>>,
3123) {
3124    let (lock, cvar) = &*guest_suspended_cvar;
3125    let mut guest_suspended = lock.lock();
3126
3127    *guest_suspended = false;
3128
3129    // During suspend also emulate sleepbtn, which allows to suspend VM (if running e.g. acpid and
3130    // reacts on sleep button events)
3131    if let Some(pm) = pm {
3132        pm.lock().slpbtn_evt();
3133    } else {
3134        error!("generating sleepbtn during suspend not supported");
3135    }
3136
3137    // Wait for notification about guest suspension, if not received after 15sec,
3138    // proceed anyway.
3139    let result = cvar.wait_timeout(guest_suspended, std::time::Duration::from_secs(15));
3140    guest_suspended = result.0;
3141
3142    if result.1.timed_out() {
3143        warn!("Guest suspension timeout - proceeding anyway");
3144    } else if *guest_suspended {
3145        info!("Guest suspended");
3146    }
3147
3148    if let Err(e) = suspend_tube.lock().send(&true) {
3149        error!("failed to trigger suspend event: {}", e);
3150    }
3151    // Now we ready to send response over the tube and communicate that VM suspend has finished
3152    if let Err(e) = tube.send(&response) {
3153        error!("failed to send VmResponse: {}", e);
3154    }
3155}
3156
#[cfg(feature = "pvclock")]
#[derive(Debug)]
/// The action requested by the pvclock device to perform on the main thread.
enum PvClockAction {
    #[cfg(target_arch = "aarch64")]
    /// Update the counter offset with VmAarch64::set_counter_offset.
    /// The payload is the total number of ticks the VM spent suspended, as
    /// reported by `PvClockCommandResponse::Resumed`.
    SetCounterOffset(u64),
}
3165
#[cfg(feature = "pvclock")]
/// Sends a `PvClockCommand` to the pvclock device over `tube` and interprets
/// the response.
///
/// Returns `Ok(Some(action))` when the main thread has follow-up work to do
/// (on AArch64, applying the suspended-tick counter offset after a resume),
/// `Ok(None)` when the command completed or the device was inactive, and an
/// error when the transfer failed or the device reported an error.
fn send_pvclock_cmd(tube: &Tube, command: PvClockCommand) -> Result<Option<PvClockAction>> {
    tube.send(&command)
        .with_context(|| format!("failed to send pvclock command {command:?}"))?;
    let resp = tube
        .recv::<PvClockCommandResponse>()
        .context("failed to receive pvclock command response")?;
    match resp {
        PvClockCommandResponse::Err(e) => {
            bail!("pvclock encountered error on {:?}: {}", command, e);
        }
        // An inactive device is treated as a no-op (logged, not fatal).
        PvClockCommandResponse::DeviceInactive => {
            warn!("Tried to send {command:?} but pvclock device was inactive");
            Ok(None)
        }
        PvClockCommandResponse::Resumed {
            total_suspended_ticks,
        } => {
            info!("{command:?} completed with {total_suspended_ticks} total_suspended_ticks");
            cfg_if::cfg_if! {
                if #[cfg(target_arch = "aarch64")] {
                    Ok(Some(PvClockAction::SetCounterOffset(total_suspended_ticks)))
                } else {
                    // For non-AArch64 platforms this is handled by directly updating the offset in
                    // shared memory in the pvclock device worker.
                    Ok(None)
                }
            }
        }
        PvClockCommandResponse::Ok => {
            info!("{command:?} completed with {resp:?}");
            Ok(None)
        }
    }
}
3201
3202#[cfg(target_arch = "x86_64")]
3203fn handle_hotplug_command<V: VmArch, Vcpu: VcpuArch>(
3204    linux: &mut RunnableLinuxVm<V, Vcpu>,
3205    sys_allocator: &mut SystemAllocator,
3206    cfg: &Config,
3207    add_control_tube: &mut impl FnMut(AnyControlTube),
3208    hp_control_tube: &mpsc::Sender<PciRootCommand>,
3209    iommu_host_tube: Option<&Tube>,
3210    device: &HotPlugDeviceInfo,
3211    add: bool,
3212    #[cfg(feature = "swap")] swap_controller: &mut Option<SwapController>,
3213    vfio_container_manager: &mut VfioContainerManager,
3214) -> VmResponse {
3215    let iommu_host_tube = if cfg.vfio_isolate_hotplug {
3216        iommu_host_tube
3217    } else {
3218        None
3219    };
3220
3221    let ret = if add {
3222        add_hotplug_device(
3223            linux,
3224            sys_allocator,
3225            cfg,
3226            add_control_tube,
3227            hp_control_tube,
3228            iommu_host_tube,
3229            device,
3230            #[cfg(feature = "swap")]
3231            swap_controller,
3232            vfio_container_manager,
3233        )
3234    } else {
3235        remove_hotplug_device(linux, sys_allocator, iommu_host_tube, device)
3236    };
3237
3238    match ret {
3239        Ok(()) => VmResponse::Ok,
3240        Err(e) => {
3241            error!("handle_hotplug_command failure: {}", e);
3242            VmResponse::Err(base::Error::new(libc::EINVAL))
3243        }
3244    }
3245}
3246
/// Borrowed state threaded through the VM control loop and used by
/// `process_vm_request`/`process_vm_control_event` to service control
/// messages. All references borrow from `run_control`'s locals.
struct ControlLoopState<'a, V: VmArch, Vcpu: VcpuArch> {
    // The running VM and its devices/buses.
    linux: &'a mut RunnableLinuxVm<V, Vcpu>,
    // Immutable crosvm configuration.
    cfg: &'a Config,
    // System resource allocator, shared behind a mutex with other threads.
    sys_allocator: &'a Arc<Mutex<SystemAllocator>>,
    // Registered control tubes keyed by their wait-context id.
    control_tubes: &'a BTreeMap<usize, TaggedControlTube>,
    disk_host_tubes: &'a [Tube],
    #[cfg(feature = "audio")]
    snd_host_tubes: &'a [Tube],
    #[cfg(feature = "gpu")]
    gpu_control_tube: Option<&'a Tube>,
    #[cfg(feature = "usb")]
    usb_control_tube: &'a Tube,
    #[cfg(target_arch = "x86_64")]
    iommu_host_tube: &'a Option<Arc<Mutex<Tube>>>,
    // Channel to the PCI root for hotplug commands.
    #[cfg(target_arch = "x86_64")]
    hp_control_tube: &'a mpsc::Sender<PciRootCommand>,
    // Condvar signalled when the guest suspends; used by the s2idle wait thread.
    guest_suspended_cvar: &'a Option<Arc<(Mutex<bool>, Condvar)>>,
    #[cfg(feature = "pci-hotplug")]
    hotplug_manager: &'a mut Option<PciHotPlugManager>,
    #[cfg(feature = "swap")]
    swap_controller: &'a mut Option<SwapController>,
    // Per-vCPU thread join handles plus their control channels.
    vcpu_handles: &'a [(JoinHandle<()>, mpsc::Sender<vm_control::VcpuControl>)],
    #[cfg(feature = "balloon")]
    balloon_tube: Option<&'a mut BalloonTube>,
    device_ctrl_tube: &'a Tube,
    irq_handler_control: &'a Tube,
    #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
    vm_memory_handler_control: &'a Tube,
    // Listener registrations: event -> set of subscriber tubes.
    #[cfg(feature = "registered_events")]
    registered_evt_tubes: &'a mut HashMap<RegisteredEvent, HashSet<AddressedProtoTube>>,
    #[cfg(feature = "pvclock")]
    pvclock_host_tube: Option<Arc<Tube>>,
    vfio_container_manager: &'a mut VfioContainerManager,
    // Clock state captured at suspend time; NOTE(review): presumably consumed
    // on resume by request.execute — confirm against vm_control.
    suspended_pvclock_state: &'a mut Option<hypervisor::ClockState>,
    // vCPU index -> (pid, tid), served back for VmRequest::VcpuPidTid.
    vcpus_pid_tid: &'a BTreeMap<usize, (u32, u32)>,
    vm_memory_control_client: &'a VmMemoryClient,
}
3284
3285struct VmRequestResult {
3286    response: Option<VmResponse>,
3287    exit: bool,
3288}
3289
3290impl VmRequestResult {
3291    fn new(response: Option<VmResponse>, exit: bool) -> Self {
3292        VmRequestResult { response, exit }
3293    }
3294}
3295
/// Dispatches a single `VmRequest` received on control tube `id`/`tube`.
///
/// Returns a `VmRequestResult` whose `exit` flag is true only for
/// `VmRequest::Exit`; a `None` response means no reply should be sent (or it
/// will be sent later, e.g. by the s2idle wait thread). On hotplug-capable
/// builds, any control tubes created while servicing the request are either
/// appended to `add_tubes` (for the caller to register) or forwarded to the
/// IRQ / VM-memory handler threads at the end of this function.
fn process_vm_request<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
    state: &mut ControlLoopState<V, Vcpu>,
    id: usize,
    tube: &Tube,
    request: VmRequest,
    #[cfg_attr(
        not(any(target_arch = "x86_64", feature = "pci-hotplug")),
        allow(unused_variables, clippy::ptr_arg)
    )]
    add_tubes: &mut Vec<TaggedControlTube>,
) -> Result<VmRequestResult> {
    // Tubes produced by hotplug operations, batched and flushed to the
    // respective handler threads after the request has been serviced.
    #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
    let mut add_irq_control_tubes = Vec::new();
    #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
    let mut add_vm_memory_control_tubes = Vec::new();

    // Sorts each new control tube into the appropriate per-kind list above.
    #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
    let mut add_control_tube = |t| match t {
        AnyControlTube::DeviceControlTube(_) => {
            panic!("hotplugging DeviceControlTube not supported yet")
        }
        AnyControlTube::IrqTube(t) => add_irq_control_tubes.push(t),
        AnyControlTube::TaggedControlTube(t) => add_tubes.push(t),
        AnyControlTube::VmMemoryTube(t) => add_vm_memory_control_tubes.push(t),
    };

    let response = match request {
        VmRequest::Exit => {
            // Reply Ok and tell the control loop to shut down.
            return Ok(VmRequestResult::new(Some(VmResponse::Ok), true));
        }
        VmRequest::HotPlugVfioCommand { device, add } => {
            // VFIO hotplug is only implemented for x86_64.
            #[cfg(target_arch = "x86_64")]
            {
                handle_hotplug_command(
                    state.linux,
                    &mut state.sys_allocator.lock(),
                    state.cfg,
                    &mut add_control_tube,
                    state.hp_control_tube,
                    state.iommu_host_tube.as_ref().map(|t| t.lock()).as_deref(),
                    &device,
                    add,
                    #[cfg(feature = "swap")]
                    state.swap_controller,
                    state.vfio_container_manager,
                )
            }

            #[cfg(not(target_arch = "x86_64"))]
            {
                // Suppress warnings.
                let _ = (device, add);
                let _ = &state.vfio_container_manager;
                VmResponse::Ok
            }
        }
        #[cfg(feature = "pci-hotplug")]
        VmRequest::HotPlugNetCommand(net_cmd) => {
            if let Some(hotplug_manager) = state.hotplug_manager.as_mut() {
                handle_hotplug_net_command(
                    net_cmd,
                    state.linux,
                    &mut state.sys_allocator.lock(),
                    &mut add_control_tube,
                    hotplug_manager,
                )
            } else {
                VmResponse::ErrString("PCI hotplug is not enabled.".to_owned())
            }
        }
        #[cfg(feature = "registered_events")]
        VmRequest::RegisterListener { socket_addr, event } => {
            // Reuse an existing connection to the same address if one exists;
            // registering the same (addr, event) pair twice is a no-op.
            let (registered_tube, already_registered) =
                find_registered_tube(state.registered_evt_tubes, &socket_addr, event);

            if !already_registered {
                let addr_tube = make_addr_tube_from_maybe_existing(registered_tube, socket_addr)?;

                if let Some(tubes) = state.registered_evt_tubes.get_mut(&event) {
                    tubes.insert(addr_tube);
                } else {
                    state
                        .registered_evt_tubes
                        .insert(event, vec![addr_tube].into_iter().collect());
                }
            }
            VmResponse::Ok
        }
        #[cfg(feature = "registered_events")]
        VmRequest::UnregisterListener { socket_addr, event } => {
            // Drop this subscriber for the one event, then prune empty sets.
            if let Some(tubes) = state.registered_evt_tubes.get_mut(&event) {
                tubes.retain(|t| t.socket_addr != socket_addr);
            }
            state
                .registered_evt_tubes
                .retain(|_, tubes| !tubes.is_empty());
            VmResponse::Ok
        }
        #[cfg(feature = "registered_events")]
        VmRequest::Unregister { socket_addr } => {
            // Drop this subscriber from every event, then prune empty sets.
            for (_, tubes) in state.registered_evt_tubes.iter_mut() {
                tubes.retain(|t| t.socket_addr != socket_addr);
            }
            state
                .registered_evt_tubes
                .retain(|_, tubes| !tubes.is_empty());
            VmResponse::Ok
        }
        #[cfg(feature = "balloon")]
        VmRequest::BalloonCommand(cmd) => {
            if let Some(tube) = state.balloon_tube.as_mut() {
                // send_cmd may return a response destined for a *different*
                // control tube (keyed by `key`); route it there and send no
                // reply on this tube.
                let Some((r, key)) = tube.send_cmd(cmd, Some(id)) else {
                    return Ok(VmRequestResult::new(None, false));
                };
                if key != id {
                    let Some(TaggedControlTube::Vm(tube)) = state.control_tubes.get(&key) else {
                        return Ok(VmRequestResult::new(None, false));
                    };
                    if let Err(e) = tube.send(&r) {
                        error!("failed to send VmResponse: {}", e);
                    }
                    return Ok(VmRequestResult::new(None, false));
                }
                r
            } else {
                VmResponse::Err(base::Error::new(libc::ENOTSUP))
            }
        }
        VmRequest::VcpuPidTid => VmResponse::VcpuPidTidResponse {
            pid_tid_map: state.vcpus_pid_tid.clone(),
        },
        VmRequest::Throttle(vcpu, cycles) => {
            // Fire-and-forget: kick the vCPU with a throttle message and send
            // no reply.
            vcpu::kick_vcpu(
                &state.vcpu_handles.get(vcpu),
                state.linux.irq_chip.as_irq_chip(),
                VcpuControl::Throttle(cycles),
            );
            return Ok(VmRequestResult::new(None, false));
        }
        VmRequest::RegisterMemory {
            fd,
            offset,
            range_start,
            range_end,
            cache_coherent,
        } => {
            // Reject empty or inverted ranges before touching the VM.
            if range_start >= range_end {
                error!("range_start >= range_end");
                return Ok(VmRequestResult::new(
                    Some(VmResponse::Err(base::Error::new(libc::EINVAL))),
                    false,
                ));
            }
            let source = VmMemorySource::Descriptor {
                descriptor: fd,
                offset,
                size: range_end - range_start,
            };
            let dest = VmMemoryDestination::GuestPhysicalAddress(range_start);
            let cache_type = if cache_coherent {
                MemCacheType::CacheCoherent
            } else {
                MemCacheType::CacheNonCoherent
            };
            match state.vm_memory_control_client.register_memory(
                source,
                dest,
                Protection::read_write(),
                cache_type,
            ) {
                Ok(region_id) => VmResponse::RegisterMemory2 {
                    region_id: region_id.0 .0,
                },
                Err(e) => VmResponse::ErrString(format!("register memory failed: {e:?}")),
            }
        }
        VmRequest::UnregisterMemory { region_id } => {
            let mem_region_id = VmMemoryRegionId(GuestAddress(region_id));
            match state
                .vm_memory_control_client
                .unregister_memory(mem_region_id)
            {
                Ok(_) => VmResponse::Ok,
                Err(e) => VmResponse::ErrString(format!("unregister memory failed: {e:?}")),
            }
        }
        // All remaining request kinds are delegated to VmRequest::execute,
        // with pvclock bookkeeping wrapped around vCPU resume/suspend.
        _ => {
            if !state.cfg.force_s2idle {
                #[cfg(feature = "pvclock")]
                if let Some(ref pvclock_host_tube) = state.pvclock_host_tube {
                    // Update clock offset when pvclock is used. This must
                    // happen before the vCPUs are resumed below by execute().
                    if let VmRequest::ResumeVcpus = request {
                        let cmd = PvClockCommand::Resume;
                        match send_pvclock_cmd(pvclock_host_tube, cmd.clone()) {
                            Ok(action) => {
                                info!("{:?} command successfully processed", cmd);
                                if let Some(action) = action {
                                    match action {
                                        #[cfg(target_arch = "aarch64")]
                                        PvClockAction::SetCounterOffset(offset) => {
                                            state.linux.vm.set_counter_offset(offset)?;
                                        }
                                    }
                                }
                            }
                            Err(e) => error!("{:?} command failed: {:#}", cmd, e),
                        };
                    }
                }
            }
            // Notify resume-aware devices just before the vCPUs start running
            // again, then kick all vCPUs with the new run state.
            let kick_all_vcpus = |msg| {
                if let VcpuControl::RunState(VmRunMode::Running) = msg {
                    for dev in &state.linux.resume_notify_devices {
                        dev.lock().resume_imminent();
                    }
                }
                vcpu::kick_all_vcpus(state.vcpu_handles, state.linux.irq_chip.as_irq_chip(), msg);
            };
            let response = request.execute(
                &state.linux.vm,
                state.disk_host_tubes,
                #[cfg(feature = "audio")]
                state.snd_host_tubes,
                #[cfg(not(feature = "audio"))]
                &[],
                &mut state.linux.pm,
                #[cfg(feature = "gpu")]
                state.gpu_control_tube,
                #[cfg(not(feature = "gpu"))]
                None,
                #[cfg(feature = "usb")]
                Some(state.usb_control_tube),
                #[cfg(not(feature = "usb"))]
                None,
                &mut state.linux.bat_control,
                kick_all_vcpus,
                |index, msg| {
                    vcpu::kick_vcpu(
                        &state.vcpu_handles.get(index),
                        state.linux.irq_chip.as_irq_chip(),
                        msg,
                    )
                },
                state.cfg.force_s2idle,
                #[cfg(feature = "swap")]
                state.swap_controller.as_ref(),
                state.device_ctrl_tube,
                state.vcpu_handles.len(),
                state.irq_handler_control,
                || state.linux.irq_chip.snapshot(state.linux.vcpu_count),
                state.suspended_pvclock_state,
            );
            if state.cfg.force_s2idle {
                if let VmRequest::SuspendVcpus = request {
                    // Spawn s2idle wait thread.
                    let send_tube = tube.try_clone_send_tube().unwrap();
                    let suspend_tube = state.linux.suspend_tube.0.clone();
                    let guest_suspended_cvar = state.guest_suspended_cvar.clone();
                    let pm = state.linux.pm.clone();

                    std::thread::Builder::new()
                        .name("s2idle_wait".to_owned())
                        .spawn(move || {
                            trigger_vm_suspend_and_wait_for_entry(
                                guest_suspended_cvar.unwrap(),
                                &send_tube,
                                response,
                                suspend_tube,
                                pm,
                            )
                        })
                        .context("failed to spawn s2idle_wait thread")?;

                    // For s2idle, omit the response since it will be sent by
                    // s2idle_wait thread when suspension actually happens.
                    return Ok(VmRequestResult::new(None, false));
                }
            } else {
                #[cfg(feature = "pvclock")]
                if let Some(ref pvclock_host_tube) = state.pvclock_host_tube {
                    // Record the time after VCPUs are suspended to track suspension duration.
                    if let VmRequest::SuspendVcpus = request {
                        let cmd = PvClockCommand::Suspend;
                        match send_pvclock_cmd(pvclock_host_tube, cmd.clone()) {
                            Ok(action) => {
                                info!("{:?} command successfully processed", cmd);
                                if let Some(action) = action {
                                    error!("Unexpected action {:?} requested for suspend", action);
                                }
                            }
                            Err(e) => error!("{:?} command failed: {:#}", cmd, e),
                        };
                    }
                }
            }
            response
        }
    };

    // Flush any control tubes created during hotplug to the handler threads.
    cfg_if::cfg_if! {
        if #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))] {
            if !add_irq_control_tubes.is_empty() {
                state
                    .irq_handler_control
                    .send(&IrqHandlerRequest::AddIrqControlTubes(
                        add_irq_control_tubes,
                    ))?;
            }
            if !add_vm_memory_control_tubes.is_empty() {
                state
                    .vm_memory_handler_control
                    .send(&VmMemoryHandlerRequest::AddControlTubes(
                        add_vm_memory_control_tubes,
                    ))?;
            }
        }
    }

    Ok(VmRequestResult::new(Some(response), false))
}
3616
3617fn process_vm_control_event<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
3618    state: &mut ControlLoopState<V, Vcpu>,
3619    id: usize,
3620    socket: &TaggedControlTube,
3621) -> Result<(bool, Vec<usize>, Vec<TaggedControlTube>)> {
3622    let mut vm_control_ids_to_remove = Vec::new();
3623    let mut add_tubes = Vec::new();
3624    match socket {
3625        TaggedControlTube::Vm(tube) => match tube.recv::<VmRequest>() {
3626            Ok(request) => {
3627                let res = process_vm_request(state, id, tube, request, &mut add_tubes)?;
3628
3629                if let Some(response) = res.response {
3630                    if let Err(e) = tube.send(&response) {
3631                        error!("failed to send VmResponse: {}", e);
3632                    }
3633                }
3634
3635                if res.exit {
3636                    return Ok((true, Vec::new(), Vec::new()));
3637                }
3638            }
3639            Err(e) => {
3640                if let TubeError::Disconnected = e {
3641                    vm_control_ids_to_remove.push(id);
3642                } else {
3643                    error!("failed to recv VmRequest: {}", e);
3644                }
3645            }
3646        },
3647        TaggedControlTube::VmMsync(tube) => match tube.recv::<VmMemoryMappingRequest>() {
3648            Ok(request) => {
3649                let response = request.execute(&mut state.linux.vm);
3650                if let Err(e) = tube.send(&response) {
3651                    error!("failed to send VmMsyncResponse: {}", e);
3652                }
3653            }
3654            Err(e) => {
3655                if let TubeError::Disconnected = e {
3656                    vm_control_ids_to_remove.push(id);
3657                } else {
3658                    error!("failed to recv VmMsyncRequest: {}", e);
3659                }
3660            }
3661        },
3662        TaggedControlTube::Fs(tube) => match tube.recv::<FsMappingRequest>() {
3663            Ok(request) => {
3664                let response =
3665                    request.execute(&mut state.linux.vm, &mut state.sys_allocator.lock());
3666                if let Err(e) = tube.send(&response) {
3667                    error!("failed to send VmResponse: {}", e);
3668                }
3669            }
3670            Err(e) => {
3671                if let TubeError::Disconnected = e {
3672                    vm_control_ids_to_remove.push(id);
3673                } else {
3674                    error!("failed to recv VmResponse: {}", e);
3675                }
3676            }
3677        },
3678    }
3679
3680    Ok((false, vm_control_ids_to_remove, add_tubes))
3681}
3682
/// A shared `ProtoTube` paired with the socket address it is connected to.
/// Multiple registrations for the same address share one underlying tube via
/// the `Rc`; identity is determined solely by `socket_addr` (see the
/// `PartialEq`/`Hash` impls below).
#[cfg(feature = "registered_events")]
struct AddressedProtoTube {
    // Shared connection to the listener at `socket_addr`.
    tube: Rc<ProtoTube>,
    // The address the tube was connected to; used as the identity key.
    socket_addr: String,
}
3688
// Equality is by socket address only; the `tube` field is deliberately
// ignored (tubes for the same address are shared `Rc`s anyway).
#[cfg(feature = "registered_events")]
impl PartialEq for AddressedProtoTube {
    fn eq(&self, other: &Self) -> bool {
        self.socket_addr == other.socket_addr
    }
}
3695
// `Eq` holds because `String` equality on `socket_addr` is a total
// equivalence relation.
#[cfg(feature = "registered_events")]
impl Eq for AddressedProtoTube {}
3698
// Hashes only `socket_addr`, consistent with the `PartialEq` impl above —
// required for correct behavior in `HashSet<AddressedProtoTube>`.
#[cfg(feature = "registered_events")]
impl Hash for AddressedProtoTube {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        self.socket_addr.hash(state);
    }
}
3705
#[cfg(feature = "registered_events")]
impl AddressedProtoTube {
    /// Sends a protobuf message over the underlying tube.
    pub fn send<M: protobuf::Message>(&self, msg: &M) -> Result<(), base::TubeError> {
        self.tube.send_proto(msg)
    }
}
3712
/// Scans the registration map for `socket_addr`.
///
/// Returns `(tube, already_registered)` where `tube` is an existing connection
/// to `socket_addr` (if any — all registrations for one address share the same
/// `Rc`, so any hit will do) and `already_registered` is true when the exact
/// `(socket_addr, event)` pair is already present.
#[cfg(feature = "registered_events")]
fn find_registered_tube<'a>(
    registered_tubes: &'a HashMap<RegisteredEvent, HashSet<AddressedProtoTube>>,
    socket_addr: &str,
    event: RegisteredEvent,
) -> (Option<&'a Rc<ProtoTube>>, bool) {
    let mut existing_tube: Option<&Rc<ProtoTube>> = None;
    for (evt, addr_tubes) in registered_tubes {
        // Only tubes connected to the requested address are of interest.
        for addr_tube in addr_tubes.iter().filter(|t| t.socket_addr == socket_addr) {
            if *evt == event {
                // Exact (addr, event) match — nothing more to learn.
                return (existing_tube, true);
            }
            // Remember a reusable connection, but keep scanning in case the
            // current event is registered under another entry.
            existing_tube = Some(&addr_tube.tube);
        }
    }
    (existing_tube, false)
}
3739
/// Builds an `AddressedProtoTube` for `addr`, reusing `tube` when a connection
/// to that address already exists, otherwise connecting a fresh seqpacket
/// socket to the listener.
///
/// # Errors
/// Fails when no existing tube was supplied and connecting to `addr` (or
/// wrapping the socket in a `Tube`) fails.
#[cfg(feature = "registered_events")]
fn make_addr_tube_from_maybe_existing(
    tube: Option<&Rc<ProtoTube>>,
    addr: String,
) -> Result<AddressedProtoTube> {
    match tube {
        Some(existing) => Ok(AddressedProtoTube {
            tube: existing.clone(),
            socket_addr: addr,
        }),
        None => {
            let sock = UnixSeqpacket::connect(addr.clone())
                .with_context(|| format!("failed to connect to registered listening socket {addr}"))?;
            Ok(AddressedProtoTube {
                tube: Rc::new(ProtoTube::from(Tube::try_from(sock)?)),
                socket_addr: addr,
            })
        }
    }
}
3760
3761fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
3762    mut linux: RunnableLinuxVm<V, Vcpu>,
3763    sys_allocator: SystemAllocator,
3764    cfg: Config,
3765    control_server_socket: Option<UnlinkUnixSeqpacketListener>,
3766    all_control_tubes: Vec<AnyControlTube>,
3767    #[cfg(feature = "usb")] usb_control_tube: Tube,
3768    vm_evt_rdtube: RecvTube,
3769    vm_evt_wrtube: SendTube,
3770    sigchld_fd: SignalFd,
3771    gralloc: RutabagaGralloc,
3772    vcpu_ids: Vec<usize>,
3773    iommu_host_tube: Option<Tube>,
3774    #[cfg(target_arch = "x86_64")] hp_control_tube: mpsc::Sender<PciRootCommand>,
3775    #[cfg(target_arch = "x86_64")] hp_thread: std::thread::JoinHandle<()>,
3776    #[cfg(feature = "pci-hotplug")] mut hotplug_manager: Option<PciHotPlugManager>,
3777    #[allow(unused_mut)] // mut is required x86 only
3778    #[cfg(feature = "swap")]
3779    mut swap_controller: Option<SwapController>,
3780    #[cfg(feature = "registered_events")] reg_evt_rdtube: RecvTube,
3781    guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>,
3782    metrics_tube: RecvTube,
3783    mut vfio_container_manager: VfioContainerManager,
3784    // A set of PID of child processes whose clean exit is expected and can be ignored.
3785    mut worker_process_pids: BTreeSet<Pid>,
3786    #[cfg(target_arch = "aarch64")] vcpu_domain_paths: BTreeMap<usize, PathBuf>,
3787) -> Result<ExitState> {
3788    // Split up `all_control_tubes`.
3789    #[cfg(feature = "balloon")]
3790    let mut balloon_host_tube = None;
3791    let mut disk_host_tubes = Vec::new();
3792    #[cfg(feature = "gpu")]
3793    let mut gpu_control_tube = None;
3794    #[cfg(feature = "pvclock")]
3795    let mut pvclock_host_tube = None;
3796    #[cfg(feature = "audio")]
3797    let mut snd_host_tubes = Vec::new();
3798    let mut irq_control_tubes = Vec::new();
3799    let mut vm_memory_control_tubes = Vec::new();
3800    let mut control_tubes = Vec::new();
3801    for t in all_control_tubes {
3802        match t {
3803            #[cfg(feature = "balloon")]
3804            AnyControlTube::DeviceControlTube(DeviceControlTube::Balloon(t)) => {
3805                assert!(balloon_host_tube.is_none());
3806                balloon_host_tube = Some(t)
3807            }
3808            AnyControlTube::DeviceControlTube(DeviceControlTube::Disk(t)) => {
3809                disk_host_tubes.push(t)
3810            }
3811            #[cfg(feature = "gpu")]
3812            AnyControlTube::DeviceControlTube(DeviceControlTube::Gpu(t)) => {
3813                assert!(gpu_control_tube.is_none());
3814                gpu_control_tube = Some(t)
3815            }
3816            #[cfg(feature = "pvclock")]
3817            AnyControlTube::DeviceControlTube(DeviceControlTube::PvClock(t)) => {
3818                assert!(pvclock_host_tube.is_none());
3819                pvclock_host_tube = Some(Arc::new(t))
3820            }
3821            #[cfg(feature = "audio")]
3822            AnyControlTube::DeviceControlTube(DeviceControlTube::Snd(t)) => {
3823                snd_host_tubes.push(t);
3824            }
3825            AnyControlTube::IrqTube(t) => irq_control_tubes.push(t),
3826            AnyControlTube::TaggedControlTube(t) => control_tubes.push(t),
3827            AnyControlTube::VmMemoryTube(t) => vm_memory_control_tubes.push(t),
3828        }
3829    }
3830
3831    #[cfg(feature = "gdb")]
3832    let (to_gdb_channel, gdb) = if let Some(port) = cfg.gdb {
3833        // GDB needs a control socket to interrupt vcpus.
3834        let (gdb_host_tube, gdb_control_tube) = Tube::pair().context("failed to create tube")?;
3835        control_tubes.push(TaggedControlTube::Vm(gdb_host_tube));
3836        // Create a channel for GDB thread.
3837        let (to_gdb_channel, from_vcpu_channel) = mpsc::channel();
3838        (
3839            Some(to_gdb_channel),
3840            Some((port, gdb_control_tube, from_vcpu_channel)),
3841        )
3842    } else {
3843        (None, None)
3844    };
3845
3846    #[derive(EventToken)]
3847    enum Token {
3848        VmEvent,
3849        Suspend,
3850        ChildSignal,
3851        VmControlServer,
3852        VmControl {
3853            id: usize,
3854        },
3855        #[cfg(feature = "registered_events")]
3856        RegisteredEvent,
3857        #[cfg(feature = "balloon")]
3858        BalloonTube,
3859    }
3860    stdin()
3861        .set_raw_mode()
3862        .expect("failed to set terminal raw mode");
3863
3864    let sys_allocator_mutex = Arc::new(Mutex::new(sys_allocator));
3865    let iommu_host_tube = iommu_host_tube.map(|t| Arc::new(Mutex::new(t)));
3866
3867    let wait_ctx = WaitContext::build_with(&[
3868        (&linux.suspend_tube.1, Token::Suspend),
3869        (&sigchld_fd, Token::ChildSignal),
3870        (&vm_evt_rdtube, Token::VmEvent),
3871        #[cfg(feature = "registered_events")]
3872        (&reg_evt_rdtube, Token::RegisteredEvent),
3873    ])
3874    .context("failed to build wait context")?;
3875
3876    if let Some(socket_server) = &control_server_socket {
3877        wait_ctx
3878            .add(socket_server, Token::VmControlServer)
3879            .context("failed to add descriptor to wait context")?;
3880    }
3881    let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
3882    let mut next_control_id = control_tubes.len();
3883    for (id, socket) in control_tubes.iter() {
3884        wait_ctx
3885            .add(socket.as_ref(), Token::VmControl { id: *id })
3886            .context("failed to add descriptor to wait context")?;
3887    }
3888
3889    #[cfg(feature = "balloon")]
3890    let mut balloon_tube = balloon_host_tube
3891        .map(|tube| -> Result<BalloonTube> {
3892            wait_ctx
3893                .add(&tube, Token::BalloonTube)
3894                .context("failed to add descriptor to wait context")?;
3895            Ok(BalloonTube::new(tube))
3896        })
3897        .transpose()
3898        .context("failed to create balloon tube")?;
3899
3900    if cfg.jail_config.is_some() {
3901        // Before starting VCPUs, in case we started with some capabilities, drop them all.
3902        drop_capabilities().context("failed to drop process capabilities")?;
3903    }
3904
3905    let (device_ctrl_tube, device_ctrl_resp) = Tube::pair().context("failed to create tube")?;
3906    // Create devices thread, and restore if a restore file exists.
3907    linux.devices_thread = match create_devices_worker_thread(
3908        linux.io_bus.clone(),
3909        linux.mmio_bus.clone(),
3910        device_ctrl_resp,
3911    ) {
3912        Ok(join_handle) => Some(join_handle),
3913        Err(e) => {
3914            return Err(anyhow!("Failed to start devices thread: {}", e));
3915        }
3916    };
3917
3918    let mut vcpu_handles = Vec::with_capacity(linux.vcpu_count);
3919    let vcpu_thread_barrier = Arc::new(Barrier::new(linux.vcpu_count + 1));
3920
3921    if !linux
3922        .vm
3923        .get_hypervisor()
3924        .check_capability(HypervisorCap::ImmediateExit)
3925    {
3926        return Err(anyhow!(
3927            "missing required hypervisor capability ImmediateExit"
3928        ));
3929    }
3930
3931    vcpu::setup_vcpu_signal_handler()?;
3932
3933    let vcpus: Vec<Option<_>> = match linux.vcpus.take() {
3934        Some(vec) => vec.into_iter().map(Some).collect(),
3935        None => iter::repeat_with(|| None).take(linux.vcpu_count).collect(),
3936    };
3937    // Enable core scheduling before creating vCPUs so that the cookie will be
3938    // shared by all vCPU threads.
3939    // TODO(b/199312402): Avoid enabling core scheduling for the crosvm process
3940    // itself for even better performance. Only vCPUs need the feature.
3941    if cfg.core_scheduling && cfg.per_vm_core_scheduling {
3942        if let Err(e) = enable_core_scheduling() {
3943            error!("Failed to enable core scheduling: {}", e);
3944        }
3945    }
3946
3947    // The tasks file only exist on sysfs if CgroupV1 hierachies are enabled
3948    let vcpu_cgroup_tasks_file = match &cfg.vcpu_cgroup_path {
3949        None => None,
3950        Some(cgroup_path) => {
3951            // Move main process to cgroup_path
3952            match File::create(cgroup_path.join("tasks")) {
3953                Ok(file) => Some(file),
3954                Err(_) => {
3955                    info!(
3956                        "Unable to open tasks file in cgroup: {}, trying CgroupV2",
3957                        cgroup_path.display()
3958                    );
3959                    None
3960                }
3961            }
3962        }
3963    };
3964
3965    // vCPU freq domains are currently only supported with CgroupsV2.
3966    let mut vcpu_cgroup_v2_files: std::collections::BTreeMap<usize, File> = BTreeMap::new();
3967    #[cfg(target_arch = "aarch64")]
3968    for (vcpu_id, vcpu_domain_path) in vcpu_domain_paths.iter() {
3969        let vcpu_cgroup_v2_file = File::create(vcpu_domain_path.join("cgroup.threads"))
3970            .with_context(|| {
3971                format!(
3972                    "failed to create vcpu-cgroup-path {}",
3973                    vcpu_domain_path.join("cgroup.threads").display(),
3974                )
3975            })?;
3976        vcpu_cgroup_v2_files.insert(*vcpu_id, vcpu_cgroup_v2_file);
3977    }
3978
3979    #[cfg(target_arch = "x86_64")]
3980    let bus_lock_ratelimit_ctrl: Arc<Mutex<Ratelimit>> = Arc::new(Mutex::new(Ratelimit::new()));
3981    #[cfg(target_arch = "x86_64")]
3982    if cfg.bus_lock_ratelimit > 0 {
3983        let bus_lock_ratelimit = cfg.bus_lock_ratelimit;
3984        if linux.vm.check_capability(VmCap::BusLockDetect) {
3985            info!("Hypervisor support bus lock detect");
3986            linux
3987                .vm
3988                .enable_capability(VmCap::BusLockDetect, 0)
3989                .expect("kvm: Failed to enable bus lock detection cap");
3990            info!("Hypervisor enabled bus lock detect");
3991            bus_lock_ratelimit_ctrl
3992                .lock()
3993                .ratelimit_set_speed(bus_lock_ratelimit);
3994        } else {
3995            bail!("Kvm: bus lock detection unsuported");
3996        }
3997    }
3998
3999    #[cfg(target_os = "android")]
4000    android::set_process_profiles(&cfg.task_profiles)?;
4001
4002    #[allow(unused_mut)]
4003    let mut run_mode = if cfg.suspended {
4004        // Sleep devices before creating vcpus.
4005        device_ctrl_tube
4006            .send(&DeviceControlCommand::SleepDevices)
4007            .context("send command to devices control socket")?;
4008        match device_ctrl_tube
4009            .recv()
4010            .context("receive from devices control socket")?
4011        {
4012            VmResponse::Ok => (),
4013            resp => bail!("device sleep failed: {}", resp),
4014        }
4015        VmRunMode::Suspending
4016    } else {
4017        VmRunMode::Running
4018    };
4019    #[cfg(feature = "gdb")]
4020    if to_gdb_channel.is_some() {
4021        // Wait until a GDB client attaches
4022        run_mode = VmRunMode::Breakpoint;
4023    }
4024    // If we are restoring from a snapshot, then start suspended.
4025    let (run_mode, post_restore_run_mode) = if cfg.restore_path.is_some() {
4026        (VmRunMode::Suspending, run_mode)
4027    } else {
4028        (run_mode, run_mode)
4029    };
4030
4031    // Architecture-specific code must supply a vcpu_init element for each VCPU.
4032    assert_eq!(vcpus.len(), linux.vcpu_init.len());
4033
4034    let (vcpu_pid_tid_sender, vcpu_pid_tid_receiver) = mpsc::channel();
4035    for ((cpu_id, vcpu), vcpu_init) in vcpus.into_iter().enumerate().zip(linux.vcpu_init.drain(..))
4036    {
4037        let vcpu_cgroup_file: Option<File>;
4038        if let Some(cgroup_file) = &vcpu_cgroup_tasks_file {
4039            vcpu_cgroup_file = Some(cgroup_file.try_clone().unwrap())
4040        } else if !cfg.cpu_freq_domains.is_empty() {
4041            vcpu_cgroup_file = Some(
4042                (vcpu_cgroup_v2_files.remove(&cpu_id).unwrap())
4043                    .try_clone()
4044                    .unwrap(),
4045            )
4046        } else {
4047            vcpu_cgroup_file = None
4048        };
4049
4050        let (to_vcpu_channel, from_main_channel) = mpsc::channel();
4051        let vcpu_affinity = match &linux.vcpu_affinity {
4052            Some(VcpuAffinity::Global(v)) => v.clone(),
4053            Some(VcpuAffinity::PerVcpu(m)) => m.get(&cpu_id).cloned().unwrap_or_default(),
4054            None => Default::default(),
4055        };
4056
4057        #[cfg(target_arch = "x86_64")]
4058        let vcpu_hybrid_type = if !cfg.vcpu_hybrid_type.is_empty() {
4059            Some(*cfg.vcpu_hybrid_type.get(&cpu_id).unwrap())
4060        } else {
4061            None
4062        };
4063
4064        #[cfg(target_arch = "x86_64")]
4065        let cpu_config = Some(CpuConfigX86_64::new(
4066            cfg.force_calibrated_tsc_leaf,
4067            cfg.host_cpu_topology,
4068            cfg.enable_hwp,
4069            cfg.no_smt,
4070            cfg.itmt,
4071            vcpu_hybrid_type,
4072        ));
4073        #[cfg(target_arch = "x86_64")]
4074        let bus_lock_ratelimit_ctrl = Arc::clone(&bus_lock_ratelimit_ctrl);
4075
4076        #[cfg(target_arch = "aarch64")]
4077        let cpu_config = None;
4078
4079        #[cfg(target_arch = "riscv64")]
4080        let cpu_config = Some(CpuConfigRiscv64::new(vcpu_init.fdt_address));
4081
4082        let handle = vcpu::run_vcpu(
4083            cpu_id,
4084            vcpu_ids[cpu_id],
4085            vcpu,
4086            vcpu_init,
4087            linux.vm.try_clone().context("failed to clone vm")?,
4088            linux
4089                .irq_chip
4090                .try_box_clone()
4091                .context("failed to clone irqchip")?,
4092            linux.vcpu_count,
4093            linux.rt_cpus.contains(&cpu_id),
4094            vcpu_affinity,
4095            linux.delay_rt,
4096            vcpu_thread_barrier.clone(),
4097            (*linux.io_bus).clone(),
4098            (*linux.mmio_bus).clone(),
4099            (*linux.hypercall_bus).clone(),
4100            vm_evt_wrtube
4101                .try_clone()
4102                .context("failed to clone vm event tube")?,
4103            from_main_channel,
4104            #[cfg(feature = "gdb")]
4105            to_gdb_channel.clone(),
4106            cfg.core_scheduling,
4107            cfg.per_vm_core_scheduling,
4108            cpu_config,
4109            match vcpu_cgroup_file {
4110                None => None,
4111                Some(ref f) => Some(
4112                    f.try_clone()
4113                        .context("failed to clone vcpu cgroup tasks file")?,
4114                ),
4115            },
4116            #[cfg(target_arch = "x86_64")]
4117            bus_lock_ratelimit_ctrl,
4118            run_mode,
4119            cfg.boost_uclamp,
4120            vcpu_pid_tid_sender.clone(),
4121        )?;
4122        vcpu_handles.push((handle, to_vcpu_channel));
4123    }
4124
4125    let mut vcpus_pid_tid = BTreeMap::new();
4126    for _ in 0..vcpu_handles.len() {
4127        let vcpu_pid_tid: VcpuPidTid = vcpu_pid_tid_receiver
4128            .recv()
4129            .context("failed receiving vcpu pid/tid")?;
4130        if vcpus_pid_tid
4131            .insert(
4132                vcpu_pid_tid.vcpu_id,
4133                (vcpu_pid_tid.process_id, vcpu_pid_tid.thread_id),
4134            )
4135            .is_some()
4136        {
4137            return Err(anyhow!(
4138                "Vcpu {} returned more than 1 PID and TID",
4139                vcpu_pid_tid.vcpu_id
4140            ));
4141        }
4142    }
4143
4144    #[cfg(feature = "gdb")]
4145    // Spawn GDB thread.
4146    if let Some((gdb_port_num, gdb_control_tube, from_vcpu_channel)) = gdb {
4147        let to_vcpu_channels = vcpu_handles
4148            .iter()
4149            .map(|(_handle, channel)| channel.clone())
4150            .collect();
4151        let target = GdbStub::new(gdb_control_tube, to_vcpu_channels, from_vcpu_channel);
4152        std::thread::Builder::new()
4153            .name("gdb".to_owned())
4154            .spawn(move || gdb_thread(target, gdb_port_num))
4155            .context("failed to spawn GDB thread")?;
4156    };
4157
4158    let (irq_handler_control, irq_handler_control_for_thread) = Tube::pair()?;
4159    let sys_allocator_for_thread = sys_allocator_mutex.clone();
4160    let irq_chip_for_thread = linux.irq_chip.try_box_clone()?;
4161    let irq_handler_thread = std::thread::Builder::new()
4162        .name("irq_handler_thread".into())
4163        .spawn(move || {
4164            irq_handler_thread(
4165                irq_control_tubes,
4166                irq_chip_for_thread,
4167                sys_allocator_for_thread,
4168                irq_handler_control_for_thread,
4169            )
4170        })
4171        .unwrap();
4172
4173    let (vm_memory_control_tube1, vm_memory_control_tube_2) = Tube::pair()?;
4174    vm_memory_control_tubes.push(VmMemoryTube {
4175        tube: vm_memory_control_tube1,
4176        expose_with_viommu: false,
4177    });
4178    let vm_memory_control_client = VmMemoryClient::new(vm_memory_control_tube_2);
4179    let (vm_memory_handler_control, vm_memory_handler_control_for_thread) = Tube::pair()?;
4180    let vm_memory_handler_thread = std::thread::Builder::new()
4181        .name("vm_memory_handler_thread".into())
4182        .spawn({
4183            let vm = linux.vm.try_clone().context("failed to clone Vm")?;
4184            let sys_allocator_mutex = sys_allocator_mutex.clone();
4185            let iommu_client = iommu_host_tube
4186                .as_ref()
4187                .map(|t| VmMemoryRequestIommuClient::new(t.clone()));
4188            move || {
4189                vm_memory_handler_thread(
4190                    vm_memory_control_tubes,
4191                    vm,
4192                    sys_allocator_mutex,
4193                    gralloc,
4194                    iommu_client,
4195                    vm_memory_handler_control_for_thread,
4196                )
4197            }
4198        })
4199        .unwrap();
4200
4201    vcpu_thread_barrier.wait();
4202
4203    // See comment on `VmRequest::execute`.
4204    let mut suspended_pvclock_state: Option<hypervisor::ClockState> = None;
4205
4206    // Restore VM (if applicable).
4207    // Must happen after the vCPU barrier to avoid deadlock.
4208    if let Some(path) = &cfg.restore_path {
4209        vm_control::do_restore(
4210            path,
4211            |msg| vcpu::kick_all_vcpus(&vcpu_handles, linux.irq_chip.as_irq_chip(), msg),
4212            |msg, index| {
4213                vcpu::kick_vcpu(&vcpu_handles.get(index), linux.irq_chip.as_irq_chip(), msg)
4214            },
4215            &irq_handler_control,
4216            &device_ctrl_tube,
4217            linux.vcpu_count,
4218            |image| {
4219                linux
4220                    .irq_chip
4221                    .try_box_clone()?
4222                    .restore(image, linux.vcpu_count)
4223            },
4224            /* require_encrypted= */ false,
4225            &mut suspended_pvclock_state,
4226            &linux.vm,
4227        )?;
4228        // Allow the vCPUs to start for real.
4229        vcpu::kick_all_vcpus(
4230            &vcpu_handles,
4231            linux.irq_chip.as_irq_chip(),
4232            VcpuControl::RunState(post_restore_run_mode),
4233        )
4234    }
4235
4236    #[cfg(feature = "swap")]
4237    if let Some(swap_controller) = &swap_controller {
4238        swap_controller
4239            .on_static_devices_setup_complete()
4240            .context("static device setup complete")?;
4241    }
4242
4243    let metrics_thread = if metrics::is_initialized() {
4244        Some(
4245            std::thread::Builder::new()
4246                .name("metrics_thread".into())
4247                .spawn(move || {
4248                    if let Err(e) = MetricsController::new(vec![metrics_tube]).run() {
4249                        error!("Metrics controller error: {:?}", e);
4250                    }
4251                })
4252                .context("metrics thread failed")?,
4253        )
4254    } else {
4255        None
4256    };
4257
4258    let mut exit_state = ExitState::Stop;
4259    let mut pvpanic_code = PvPanicCode::Unknown;
4260    #[cfg(feature = "registered_events")]
4261    let mut registered_evt_tubes: HashMap<RegisteredEvent, HashSet<AddressedProtoTube>> =
4262        HashMap::new();
4263
4264    'wait: loop {
4265        let events = {
4266            match wait_ctx.wait() {
4267                Ok(v) => v,
4268                Err(e) => {
4269                    error!("failed to poll: {}", e);
4270                    break;
4271                }
4272            }
4273        };
4274
4275        let mut vm_control_ids_to_remove = Vec::new();
4276        for event in events.iter().filter(|e| e.is_readable) {
4277            match event.token {
4278                #[cfg(feature = "registered_events")]
4279                Token::RegisteredEvent => match reg_evt_rdtube.recv::<RegisteredEventWithData>() {
4280                    Ok(reg_evt) => {
4281                        let evt = reg_evt.into_event();
4282                        let mut tubes_to_remove: Vec<String> = Vec::new();
4283                        if let Some(tubes) = registered_evt_tubes.get_mut(&evt) {
4284                            for tube in tubes.iter() {
4285                                if let Err(e) = tube.send(&reg_evt.into_proto()) {
4286                                    warn!(
4287                                        "failed to send registered event {:?} to {}, removing from \
4288                                         registrations: {}",
4289                                        reg_evt, tube.socket_addr, e
4290                                    );
4291                                    tubes_to_remove.push(tube.socket_addr.clone());
4292                                }
4293                            }
4294                        }
4295                        for tube_addr in tubes_to_remove {
4296                            for tubes in registered_evt_tubes.values_mut() {
4297                                tubes.retain(|t| t.socket_addr != tube_addr);
4298                            }
4299                        }
4300                        registered_evt_tubes.retain(|_, tubes| !tubes.is_empty());
4301                    }
4302                    Err(e) => {
4303                        warn!("failed to recv RegisteredEvent: {}", e);
4304                    }
4305                },
4306                Token::VmEvent => {
4307                    let mut break_to_wait: bool = true;
4308                    match vm_evt_rdtube.recv::<VmEventType>() {
4309                        Ok(vm_event) => match vm_event {
4310                            VmEventType::Exit => {
4311                                info!("vcpu requested shutdown");
4312                                exit_state = ExitState::Stop;
4313                            }
4314                            VmEventType::Reset => {
4315                                info!("vcpu requested reset");
4316                                exit_state = ExitState::Reset;
4317                            }
4318                            VmEventType::Crash => {
4319                                info!("vcpu crashed");
4320                                exit_state = ExitState::Crash;
4321                            }
4322                            VmEventType::GuestPanic => {
4323                                info!("guest panic event");
4324                                exit_state = ExitState::GuestPanic;
4325                            }
4326                            VmEventType::DeviceCrashed => {
4327                                info!("device crashed");
4328                                exit_state = ExitState::Crash;
4329                            }
4330                            VmEventType::Panic(panic_code) => {
4331                                pvpanic_code = PvPanicCode::from_u8(panic_code);
4332                                info!("Guest reported panic [Code: {}]", pvpanic_code);
4333                                break_to_wait = false;
4334                            }
4335                            VmEventType::WatchdogReset => {
4336                                info!("vcpu stall detected");
4337                                exit_state = ExitState::WatchdogReset;
4338                            }
4339                        },
4340                        Err(e) => {
4341                            warn!("failed to recv VmEvent: {}", e);
4342                        }
4343                    }
4344                    if break_to_wait {
4345                        if pvpanic_code == PvPanicCode::Panicked {
4346                            exit_state = ExitState::GuestPanic;
4347                        }
4348                        break 'wait;
4349                    }
4350                }
4351                Token::Suspend => match linux.suspend_tube.1.recv::<bool>() {
4352                    Ok(is_suspend_request) => {
4353                        let mode = if is_suspend_request {
4354                            VmRunMode::Suspending
4355                        } else {
4356                            for dev in &linux.resume_notify_devices {
4357                                dev.lock().resume_imminent();
4358                            }
4359                            VmRunMode::Running
4360                        };
4361                        info!("VM requested {}", mode);
4362                        vcpu::kick_all_vcpus(
4363                            &vcpu_handles,
4364                            linux.irq_chip.as_irq_chip(),
4365                            VcpuControl::RunState(mode),
4366                        );
4367                    }
4368                    Err(err) => {
4369                        warn!("Failed to read suspend tube {:?}", err);
4370                    }
4371                },
4372                Token::ChildSignal => {
                    // Print all available siginfo structs, then exit the loop if a child process
                    // has exited, except for CLD_STOPPED and CLD_CONTINUED. Those two should be
                    // ignored here since they are used by the vmm-swap feature.
4376                    let mut do_exit = false;
4377                    while let Some(siginfo) =
4378                        sigchld_fd.read().context("failed to read signalfd")?
4379                    {
4380                        let pid = siginfo.ssi_pid;
4381                        let pid_label = match linux.pid_debug_label_map.get(&pid) {
4382                            Some(label) => format!("{label} (pid {pid})"),
4383                            None => format!("pid {pid}"),
4384                        };
4385
4386                        // TODO(kawasin): this is a temporary exception until device suspension.
4387                        #[cfg(feature = "swap")]
4388                        if siginfo.ssi_code == libc::CLD_STOPPED
4389                            || siginfo.ssi_code == libc::CLD_CONTINUED
4390                        {
4391                            continue;
4392                        }
4393
4394                        // Ignore clean exits of non-tracked child processes when running without
4395                        // sandboxing. The virtio gpu process launches a render server for
4396                        // pass-through graphics. Host GPU drivers have been observed to fork
4397                        // child processes that exit cleanly which should not be considered a
4398                        // crash. When running with sandboxing, this should be handled by the
4399                        // device's process handler.
4400                        if cfg.jail_config.is_none()
4401                            && !linux.pid_debug_label_map.contains_key(&pid)
4402                            && siginfo.ssi_signo == libc::SIGCHLD as u32
4403                            && siginfo.ssi_code == libc::CLD_EXITED
4404                            && siginfo.ssi_status == 0
4405                        {
4406                            continue;
4407                        }
4408
4409                        // Allow clean exits of a child process in `worker_process_pids`.
4410                        if siginfo.ssi_signo == libc::SIGCHLD as u32
4411                            && siginfo.ssi_code == libc::CLD_EXITED
4412                            && siginfo.ssi_status == 0
4413                            && worker_process_pids.remove(&(pid as Pid))
4414                        {
4415                            info!("child {pid} exited successfully");
4416                            continue;
4417                        }
4418
4419                        if siginfo.ssi_signo == libc::SIGCHLD as u32
4420                            && (siginfo.ssi_code == libc::CLD_KILLED
4421                                || siginfo.ssi_code == libc::CLD_DUMPED)
4422                        {
4423                            error!(
4424                                "child {} killed by signal {} ({})",
4425                                pid_label,
4426                                siginfo.ssi_status,
4427                                base::signal::Signal::try_from(siginfo.ssi_status)
4428                                    .map(|s| s.to_string())
4429                                    .unwrap_or("unknown".to_string()),
4430                            );
4431                        } else {
4432                            error!(
4433                                "child {} exited: signo {}, status {}, code {}",
4434                                pid_label, siginfo.ssi_signo, siginfo.ssi_status, siginfo.ssi_code
4435                            );
4436                        }
4437                        do_exit = true;
4438                    }
4439                    if do_exit {
4440                        exit_state = ExitState::Crash;
4441                        break 'wait;
4442                    }
4443                }
4444                Token::VmControlServer => {
4445                    if let Some(socket_server) = &control_server_socket {
4446                        match socket_server.accept() {
4447                            Ok(socket) => {
4448                                let id = next_control_id;
4449                                next_control_id += 1;
4450                                wait_ctx
4451                                    .add(&socket, Token::VmControl { id })
4452                                    .context("failed to add descriptor to wait context")?;
4453                                control_tubes
4454                                    .insert(id, TaggedControlTube::Vm(Tube::try_from(socket)?));
4455                            }
4456                            Err(e) => error!("failed to accept socket: {}", e),
4457                        }
4458                    }
4459                }
4460                Token::VmControl { id } => {
4461                    if let Some(socket) = control_tubes.get(&id) {
4462                        let mut state = ControlLoopState {
4463                            linux: &mut linux,
4464                            cfg: &cfg,
4465                            sys_allocator: &sys_allocator_mutex,
4466                            control_tubes: &control_tubes,
4467                            disk_host_tubes: &disk_host_tubes[..],
4468                            #[cfg(feature = "audio")]
4469                            snd_host_tubes: &snd_host_tubes[..],
4470                            #[cfg(feature = "gpu")]
4471                            gpu_control_tube: gpu_control_tube.as_ref(),
4472                            #[cfg(feature = "usb")]
4473                            usb_control_tube: &usb_control_tube,
4474                            #[cfg(target_arch = "x86_64")]
4475                            iommu_host_tube: &iommu_host_tube,
4476                            #[cfg(target_arch = "x86_64")]
4477                            hp_control_tube: &hp_control_tube,
4478                            guest_suspended_cvar: &guest_suspended_cvar,
4479                            #[cfg(feature = "pci-hotplug")]
4480                            hotplug_manager: &mut hotplug_manager,
4481                            #[cfg(feature = "swap")]
4482                            swap_controller: &mut swap_controller,
4483                            vcpu_handles: &vcpu_handles,
4484                            #[cfg(feature = "balloon")]
4485                            balloon_tube: balloon_tube.as_mut(),
4486                            device_ctrl_tube: &device_ctrl_tube,
4487                            irq_handler_control: &irq_handler_control,
4488                            #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
4489                            vm_memory_handler_control: &vm_memory_handler_control,
4490                            #[cfg(feature = "registered_events")]
4491                            registered_evt_tubes: &mut registered_evt_tubes,
4492                            #[cfg(feature = "pvclock")]
4493                            pvclock_host_tube: pvclock_host_tube.clone(),
4494                            vfio_container_manager: &mut vfio_container_manager,
4495                            suspended_pvclock_state: &mut suspended_pvclock_state,
4496                            vcpus_pid_tid: &vcpus_pid_tid,
4497                            vm_memory_control_client: &vm_memory_control_client,
4498                        };
4499                        let (exit_requested, mut ids_to_remove, add_tubes) =
4500                            process_vm_control_event(&mut state, id, socket)?;
4501                        if exit_requested {
4502                            break 'wait;
4503                        }
4504                        vm_control_ids_to_remove.append(&mut ids_to_remove);
4505                        for socket in add_tubes {
4506                            let id = next_control_id;
4507                            next_control_id += 1;
4508                            wait_ctx
4509                                .add(socket.as_ref(), Token::VmControl { id })
4510                                .context(
4511                                    "failed to add hotplug vfio-pci descriptor to wait context",
4512                                )?;
4513                            control_tubes.insert(id, socket);
4514                        }
4515                    }
4516                }
4517                #[cfg(feature = "balloon")]
4518                Token::BalloonTube => {
4519                    match balloon_tube.as_mut().expect("missing balloon tube").recv() {
4520                        Ok(resp) => {
4521                            for (resp, idx) in resp {
4522                                if let Some(TaggedControlTube::Vm(tube)) = control_tubes.get(&idx) {
4523                                    if let Err(e) = tube.send(&resp) {
4524                                        error!("failed to send VmResponse: {}", e);
4525                                    }
4526                                } else {
4527                                    error!("Bad tube index {}", idx);
4528                                }
4529                            }
4530                        }
4531                        Err(err) => {
4532                            error!("Error processing balloon tube {:?}", err)
4533                        }
4534                    }
4535                }
4536            }
4537        }
4538
4539        remove_hungup_and_drained_tubes(
4540            &events,
4541            &wait_ctx,
4542            &mut control_tubes,
4543            vm_control_ids_to_remove,
4544            |token: &Token| {
4545                if let Token::VmControl { id } = token {
4546                    return Some(*id);
4547                }
4548                None
4549            },
4550        )?;
4551    }
4552
4553    vcpu::kick_all_vcpus(
4554        &vcpu_handles,
4555        linux.irq_chip.as_irq_chip(),
4556        VcpuControl::RunState(VmRunMode::Exiting),
4557    );
4558    for (handle, _) in vcpu_handles {
4559        if let Err(e) = handle.join() {
4560            error!("failed to join vcpu thread: {:?}", e);
4561        }
4562    }
4563
4564    // After joining all vcpu threads, unregister the process-wide signal handler.
4565    if let Err(e) = vcpu::remove_vcpu_signal_handler() {
4566        error!("failed to remove vcpu thread signal handler: {:#}", e);
4567    }
4568
4569    // Stop the vmm-swap monitor process.
4570    #[cfg(feature = "swap")]
4571    drop(swap_controller);
4572
4573    // Stop pci root worker thread
4574    #[cfg(target_arch = "x86_64")]
4575    {
4576        let _ = hp_control_tube.send(PciRootCommand::Kill);
4577        if let Err(e) = hp_thread.join() {
4578            error!("failed to join hotplug thread: {:?}", e);
4579        }
4580    }
4581
4582    if linux.devices_thread.is_some() {
4583        if let Err(e) = device_ctrl_tube.send(&DeviceControlCommand::Exit) {
4584            error!("failed to stop device control loop: {}", e);
4585        };
4586        if let Some(thread) = linux.devices_thread.take() {
4587            if let Err(e) = thread.join() {
4588                error!("failed to exit devices thread: {:?}", e);
4589            }
4590        }
4591    }
4592
4593    // At this point, the only remaining `Arc` references to the `Bus` objects should be the ones
4594    // inside `linux`. If the checks below fail, then some other thread is probably still running
4595    // and needs to be explicitly stopped before dropping `linux` to ensure devices actually get
4596    // cleaned up.
4597    match Arc::try_unwrap(std::mem::replace(
4598        &mut linux.mmio_bus,
4599        Arc::new(Bus::new(BusType::Mmio)),
4600    )) {
4601        Ok(_) => {}
4602        Err(_) => panic!("internal error: mmio_bus had more than one reference at shutdown"),
4603    }
4604    match Arc::try_unwrap(std::mem::replace(
4605        &mut linux.io_bus,
4606        Arc::new(Bus::new(BusType::Io)),
4607    )) {
4608        Ok(_) => {}
4609        Err(_) => panic!("internal error: io_bus had more than one reference at shutdown"),
4610    }
4611
4612    // Explicitly drop the VM structure here to allow the devices to clean up before the
4613    // control sockets are closed when this function exits.
4614    mem::drop(linux);
4615
4616    // Shut down the VM memory handler thread. This must happen after the potential device worker
    // threads (including the vhost device request handler threads) exit, because device worker
4618    // threads can issue VM memory requests. Those device worker threads are supposed to stop after
4619    // the RunnableLinuxVm is dropped.
4620    if let Err(e) = vm_memory_handler_control.send(&VmMemoryHandlerRequest::Exit) {
4621        error!(
4622            "failed to request exit from VM Memory handler thread: {}",
4623            e
4624        );
4625    }
4626    if let Err(e) = vm_memory_handler_thread.join() {
4627        error!("failed to exit VM Memory handler thread: {:?}", e);
4628    }
4629
4630    // Shut down the IRQ handler thread after the devices are dropped.
4631    if let Err(e) = irq_handler_control.send(&IrqHandlerRequest::Exit) {
4632        error!("failed to request exit from IRQ handler thread: {}", e);
4633    }
4634    if let Err(e) = irq_handler_thread.join() {
4635        error!("failed to exit irq handler thread: {:?}", e);
4636    }
4637
4638    // Drop the hotplug manager to tell the warden process to exit before we try to join
4639    // the metrics thread.
4640    #[cfg(feature = "pci-hotplug")]
4641    mem::drop(hotplug_manager);
4642
4643    // All our children should have exited by now, so closing our fd should
4644    // terminate metrics. Then join so that everything gets flushed.
4645    metrics::get_destructor().cleanup();
4646    if let Some(metrics_thread) = metrics_thread {
4647        if let Err(e) = metrics_thread.join() {
4648            error!("failed to exit irq handler thread: {:?}", e);
4649        }
4650    }
4651
4652    stdin()
4653        .set_canon_mode()
4654        .expect("failed to restore canonical mode for terminal");
4655
4656    Ok(exit_state)
4657}
4658
/// Wait-context tokens for the IRQ handler thread, identifying which
/// descriptor became readable in its event loop.
#[derive(EventToken)]
enum IrqHandlerToken {
    /// An irqchip IRQ event fired; `index` selects the entry within the
    /// irqchip's registered IRQ event tokens.
    IrqFd { index: IrqEventIndex },
    /// A request arrived on the IRQ control tube registered under `id`.
    VmIrq { id: usize },
    /// The irqchip's delayed-IRQ event trigger fired (only registered when the
    /// irqchip provides a delayed event token).
    DelayedIrqFd,
    /// A control message (e.g. an exit request) arrived from the main thread
    /// on the handler-control tube.
    HandlerControl,
}
4666
/// Handles IRQs and requests from devices to add additional IRQ lines.
///
/// Runs as a dedicated thread that waits on:
/// - `handler_control` for `IrqHandlerRequest`s from the main thread (exit, adding control
///   tubes, refreshing irq event tokens, wake-and-notify iterations),
/// - each `Tube` in `irq_control_tubes` for device `VmIrqRequest`s,
/// - the irqchip's irq event descriptors (serviced via `service_irq_event`), and
/// - the irqchip's optional delayed-irq trigger.
///
/// Returns when `IrqHandlerRequest::Exit` is received, when the control tube hangs up, or on an
/// unrecoverable wait error.
fn irq_handler_thread(
    irq_control_tubes: Vec<Tube>,
    mut irq_chip: Box<dyn IrqChipArch + 'static>,
    sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
    handler_control: Tube,
) -> anyhow::Result<()> {
    let wait_ctx = WaitContext::build_with(&[(
        handler_control.get_read_notifier(),
        IrqHandlerToken::HandlerControl,
    )])
    .context("failed to build wait context")?;

    // If the irqchip delivers some irqs with a delay, also watch the trigger that signals when
    // those delayed events are ready to be processed.
    if let Some(delayed_ioapic_irq_trigger) = irq_chip.irq_delayed_event_token()? {
        wait_ctx
            .add(&delayed_ioapic_irq_trigger, IrqHandlerToken::DelayedIrqFd)
            .context("failed to add descriptor to wait context")?;
    }

    // (index, gsi, event) triples for every irq event the irqchip currently exposes.
    let mut irq_event_tokens = irq_chip
        .irq_event_tokens()
        .context("failed get event tokens from irqchip")?;

    for (index, _gsi, evt) in irq_event_tokens.iter() {
        wait_ctx
            .add(evt, IrqHandlerToken::IrqFd { index: *index })
            .context("failed to add irq chip event tokens to wait context")?;
    }

    // Key the control tubes by a monotonically increasing id so individual tubes can be removed
    // later; ids for tubes added at runtime continue from the initial count.
    let mut irq_control_tubes = BTreeMap::from_iter(irq_control_tubes.into_iter().enumerate());
    let mut next_control_id = irq_control_tubes.len();
    for (id, socket) in irq_control_tubes.iter() {
        wait_ctx
            .add(
                socket.get_read_notifier(),
                IrqHandlerToken::VmIrq { id: *id },
            )
            .context("irq control tubes to wait context")?;
    }

    'wait: loop {
        let events = {
            match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("failed to poll: {}", e);
                    break 'wait;
                }
            }
        };
        let token_count = events.len();
        // Tubes found disconnected during this iteration; removed after the event loop so we
        // never mutate the map while still dispatching on its ids.
        let mut vm_irq_tubes_to_remove = Vec::new();
        let mut notify_control_on_iteration_end = false;

        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                IrqHandlerToken::HandlerControl => {
                    match handler_control.recv::<IrqHandlerRequest>() {
                        Ok(request) => {
                            match request {
                                IrqHandlerRequest::Exit => break 'wait,
                                IrqHandlerRequest::AddIrqControlTubes(tubes) => {
                                    for socket in tubes {
                                        let id = next_control_id;
                                        next_control_id += 1;
                                        wait_ctx
                                        .add(
                                            socket.get_read_notifier(),
                                            IrqHandlerToken::VmIrq { id },
                                        )
                                        .context("failed to add new IRQ control Tube to wait context")?;
                                        irq_control_tubes.insert(id, socket);
                                    }
                                }
                                IrqHandlerRequest::RefreshIrqEventTokens => {
                                    // Drop the stale irq events from the wait context, re-query
                                    // the irqchip, and register the fresh set.
                                    for (_index, _gsi, evt) in irq_event_tokens.iter() {
                                        wait_ctx.delete(evt).context(
                                            "failed to remove irq chip event \
                                                token from wait context",
                                        )?;
                                    }

                                    irq_event_tokens = irq_chip
                                        .irq_event_tokens()
                                        .context("failed get event tokens from irqchip")?;
                                    for (index, _gsi, evt) in irq_event_tokens.iter() {
                                        wait_ctx
                                            .add(evt, IrqHandlerToken::IrqFd { index: *index })
                                            .context(
                                                "failed to add irq chip event \
                                                tokens to wait context",
                                            )?;
                                    }

                                    // Ack the refresh; a send failure is logged but is not fatal
                                    // to the handler thread itself.
                                    if let Err(e) = handler_control
                                        .send(&IrqHandlerResponse::IrqEventTokenRefreshComplete)
                                    {
                                        error!(
                                            "failed to notify IRQ event token refresh \
                                            was completed: {}",
                                            e
                                        );
                                    }
                                }
                                IrqHandlerRequest::WakeAndNotifyIteration => {
                                    // Defer the reply until the end of this iteration so the
                                    // number of serviced tokens can be reported.
                                    notify_control_on_iteration_end = true;
                                }
                            }
                        }
                        Err(e) => {
                            if let TubeError::Disconnected = e {
                                panic!("irq handler control tube disconnected.");
                            } else {
                                error!("failed to recv IrqHandlerRequest: {}", e);
                            }
                        }
                    }
                }
                IrqHandlerToken::VmIrq { id } => {
                    if let Some(tube) = irq_control_tubes.get(&id) {
                        handle_irq_tube_request(
                            &sys_allocator_mutex,
                            &mut irq_chip,
                            &mut vm_irq_tubes_to_remove,
                            &wait_ctx,
                            tube,
                            id,
                        );
                    }
                }
                IrqHandlerToken::IrqFd { index } => {
                    if let Err(e) = irq_chip.service_irq_event(index) {
                        error!("failed to signal irq {}: {}", index, e);
                    }
                }
                IrqHandlerToken::DelayedIrqFd => {
                    if let Err(e) = irq_chip.process_delayed_irq_events() {
                        warn!("can't deliver delayed irqs: {}", e);
                    }
                }
            }
        }

        if notify_control_on_iteration_end {
            // Report the triggered-token count minus one — presumably excluding the control
            // event that carried the WakeAndNotifyIteration request itself; verify against the
            // snapshotting caller.
            if let Err(e) = handler_control.send(&IrqHandlerResponse::HandlerIterationComplete(
                token_count - 1,
            )) {
                error!(
                    "failed to notify on iteration completion (snapshotting may fail): {}",
                    e
                );
            }
        }

        remove_hungup_and_drained_tubes(
            &events,
            &wait_ctx,
            &mut irq_control_tubes,
            vm_irq_tubes_to_remove,
            |token: &IrqHandlerToken| {
                if let IrqHandlerToken::VmIrq { id } = token {
                    return Some(*id);
                }
                None
            },
        )?;
        // A hangup on the control tube without a prior Exit request means the peer went away
        // unexpectedly; stop the handler rather than spin on a dead tube.
        if events.iter().any(|e| {
            e.is_hungup && !e.is_readable && matches!(e.token, IrqHandlerToken::HandlerControl)
        }) {
            error!("IRQ handler control hung up but did not request an exit.");
            break 'wait;
        }
    }
    Ok(())
}
4842
/// Receives and executes a single `VmIrqRequest` from one device IRQ control tube, sending the
/// resulting `VmIrqResponse` back on the same tube.
///
/// For `IrqSetup::Event`, the irq event is registered with the irqchip and — when the irqchip
/// returns an event index — also added to `wait_ctx` so the handler loop will service it.
/// If the tube has disconnected, `tube_index` is pushed onto `vm_irq_tubes_to_remove` so the
/// caller can clean it up after its event-dispatch loop.
fn handle_irq_tube_request(
    sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>,
    irq_chip: &mut Box<dyn IrqChipArch + 'static>,
    vm_irq_tubes_to_remove: &mut Vec<usize>,
    wait_ctx: &WaitContext<IrqHandlerToken>,
    tube: &Tube,
    tube_index: usize,
) {
    match tube.recv::<VmIrqRequest>() {
        Ok(request) => {
            let response = {
                // `execute` calls back into this closure for each irq setup operation the
                // request requires, holding the system allocator lock for the duration.
                request.execute(
                    |setup| match setup {
                        IrqSetup::Event(irq, ev, device_id, queue_id, device_name) => {
                            let irq_evt = devices::IrqEdgeEvent::from_event(ev.try_clone()?);
                            let source = IrqEventSource {
                                device_id,
                                queue_id,
                                device_name,
                            };
                            // A returned index means the irqchip wants this event serviced by
                            // the handler loop, so watch it in the wait context too.
                            if let Some(event_index) =
                                irq_chip.register_edge_irq_event(irq, &irq_evt, source)?
                            {
                                if let Err(e) =
                                    wait_ctx.add(ev, IrqHandlerToken::IrqFd { index: event_index })
                                {
                                    warn!("failed to add IrqFd to poll context: {}", e);
                                    return Err(e);
                                }
                            }
                            Ok(())
                        }
                        IrqSetup::Route(route) => irq_chip.route_irq(route),
                        IrqSetup::UnRegister(irq, ev) => {
                            let irq_evt = devices::IrqEdgeEvent::from_event(ev.try_clone()?);
                            irq_chip.unregister_edge_irq_event(irq, &irq_evt)
                        }
                    },
                    &mut sys_allocator_mutex.lock(),
                )
            };
            if let Err(e) = tube.send(&response) {
                error!("failed to send VmIrqResponse: {}", e);
            }
        }
        Err(e) => {
            if let TubeError::Disconnected = e {
                // Disconnected tubes are removed by the caller after this iteration.
                vm_irq_tubes_to_remove.push(tube_index);
            } else {
                error!("failed to recv VmIrqRequest: {}", e);
            }
        }
    }
}
4897
/// Commands to control the VM Memory handler thread.
#[derive(serde::Serialize, serde::Deserialize)]
pub enum VmMemoryHandlerRequest {
    /// Adds the given tubes to the set of memory control tubes the handler services.
    /// No response is sent for this command.
    AddControlTubes(Vec<VmMemoryTube>),
    /// Tells the handler thread to leave its wait loop and return.
    /// No response is sent for this command.
    Exit,
}
4906
/// Services `VmMemoryRequest`s sent by devices over their memory control tubes.
///
/// Runs as a dedicated thread that waits on the `handler_control` tube for
/// `VmMemoryHandlerRequest`s (exit, or adding more control tubes at runtime) and on each device
/// tube for `VmMemoryRequest`s, which are executed against `vm`, the system allocator,
/// `gralloc`, and — for tubes flagged `expose_with_viommu` — the IOMMU client.
///
/// Returns when `VmMemoryHandlerRequest::Exit` is received, when the control tube hangs up, or
/// on an unrecoverable wait error.
fn vm_memory_handler_thread(
    control_tubes: Vec<VmMemoryTube>,
    mut vm: impl Vm,
    sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
    mut gralloc: RutabagaGralloc,
    mut iommu_client: Option<VmMemoryRequestIommuClient>,
    handler_control: Tube,
) -> anyhow::Result<()> {
    // Wait-context tokens local to this thread.
    #[derive(EventToken)]
    enum Token {
        VmControl { id: usize },
        HandlerControl,
    }

    let wait_ctx =
        WaitContext::build_with(&[(handler_control.get_read_notifier(), Token::HandlerControl)])
            .context("failed to build wait context")?;
    // Key the control tubes by a monotonically increasing id so individual tubes can be removed
    // later; ids for tubes added at runtime continue from the initial count.
    let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
    let mut next_control_id = control_tubes.len();
    for (id, socket) in control_tubes.iter() {
        wait_ctx
            .add(socket.as_ref(), Token::VmControl { id: *id })
            .context("failed to add descriptor to wait context")?;
    }

    // Shared bookkeeping for memory regions, threaded through every request execution.
    let mut region_state: VmMemoryRegionState = Default::default();

    'wait: loop {
        let events = {
            match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("failed to poll: {}", e);
                    break;
                }
            }
        };

        // Tubes found disconnected during this iteration; removed after the event loop.
        let mut vm_control_ids_to_remove = Vec::new();
        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                Token::HandlerControl => match handler_control.recv::<VmMemoryHandlerRequest>() {
                    Ok(request) => match request {
                        VmMemoryHandlerRequest::Exit => break 'wait,
                        VmMemoryHandlerRequest::AddControlTubes(tubes) => {
                            for socket in tubes {
                                let id = next_control_id;
                                next_control_id += 1;
                                wait_ctx
                                    .add(socket.get_read_notifier(), Token::VmControl { id })
                                    .context(
                                        "failed to add new vm memory control Tube to wait context",
                                    )?;
                                control_tubes.insert(id, socket);
                            }
                        }
                    },
                    Err(e) => {
                        if let TubeError::Disconnected = e {
                            panic!("vm memory control tube disconnected.");
                        } else {
                            error!("failed to recv VmMemoryHandlerRequest: {}", e);
                        }
                    }
                },
                Token::VmControl { id } => {
                    if let Some(VmMemoryTube {
                        tube,
                        expose_with_viommu,
                    }) = control_tubes.get(&id)
                    {
                        match tube.recv::<VmMemoryRequest>() {
                            Ok(request) => {
                                let response = request.execute(
                                    tube,
                                    &mut vm,
                                    &mut sys_allocator_mutex.lock(),
                                    &mut gralloc,
                                    // Only pass the IOMMU client for tubes that requested
                                    // viommu exposure.
                                    if *expose_with_viommu {
                                        iommu_client.as_mut()
                                    } else {
                                        None
                                    },
                                    &mut region_state,
                                );
                                if let Err(e) = tube.send(&response) {
                                    error!("failed to send VmMemoryControlResponse: {}", e);
                                }
                            }
                            Err(e) => {
                                if let TubeError::Disconnected = e {
                                    // Queue the disconnected tube for removal below.
                                    vm_control_ids_to_remove.push(id);
                                } else {
                                    error!("failed to recv VmMemoryControlRequest: {}", e);
                                }
                            }
                        }
                    }
                }
            }
        }

        remove_hungup_and_drained_tubes(
            &events,
            &wait_ctx,
            &mut control_tubes,
            vm_control_ids_to_remove,
            |token: &Token| {
                if let Token::VmControl { id } = token {
                    return Some(*id);
                }
                None
            },
        )?;
        // A hangup on the control tube without a prior Exit request means the peer went away
        // unexpectedly; stop the handler rather than spin on a dead tube.
        if events
            .iter()
            .any(|e| e.is_hungup && !e.is_readable && matches!(e.token, Token::HandlerControl))
        {
            error!("vm memory handler control hung up but did not request an exit.");
            break 'wait;
        }
    }
    Ok(())
}
5031
5032/// When control tubes hang up, we want to make sure that we've fully drained
5033/// the underlying socket before removing it. This function also handles
5034/// removing closed sockets in such a way that avoids phantom events.
5035///
5036/// `tube_ids_to_remove` is the set of ids that we already know should
5037/// be removed (e.g. from getting a disconnect error on read).
5038fn remove_hungup_and_drained_tubes<T, U>(
5039    events: &SmallVec<[TriggeredEvent<T>; 16]>,
5040    wait_ctx: &WaitContext<T>,
5041    tubes: &mut BTreeMap<usize, U>,
5042    mut tube_ids_to_remove: Vec<usize>,
5043    get_tube_id: fn(token: &T) -> Option<usize>,
5044) -> anyhow::Result<()>
5045where
5046    T: EventToken,
5047    U: ReadNotifier,
5048{
5049    // It's possible more data is readable and buffered while the socket is hungup,
5050    // so don't delete the tube from the poll context until we're sure all the
5051    // data is read.
5052    // Below case covers a condition where we have received a hungup event and the tube is not
5053    // readable.
5054    // In case of readable tube, once all data is read, any attempt to read more data on hungup
5055    // tube should fail. On such failure, we get Disconnected error and ids gets added to
5056    // tube_ids_to_remove by the time we reach here.
5057    for event in events.iter().filter(|e| e.is_hungup && !e.is_readable) {
5058        if let Some(id) = get_tube_id(&event.token) {
5059            tube_ids_to_remove.push(id);
5060        }
5061    }
5062
5063    tube_ids_to_remove.dedup();
5064    for id in tube_ids_to_remove {
5065        // Delete the socket from the `wait_ctx` synchronously. Otherwise, the kernel will do
5066        // this automatically when the FD inserted into the `wait_ctx` is closed after this
5067        // if-block, but this removal can be deferred unpredictably. In some instances where the
5068        // system is under heavy load, we can even get events returned by `wait_ctx` for an FD
5069        // that has already been closed. Because the token associated with that spurious event
5070        // now belongs to a different socket, the control loop will start to interact with
5071        // sockets that might not be ready to use. This can cause incorrect hangup detection or
5072        // blocking on a socket that will never be ready. See also: crbug.com/1019986
5073        if let Some(socket) = tubes.remove(&id) {
5074            wait_ctx
5075                .delete(socket.get_read_notifier())
5076                .context("failed to remove descriptor from wait context")?;
5077        }
5078    }
5079    Ok(())
5080}
5081
5082/// Start and jail a vhost-user device according to its configuration and a vhost listener string.
5083///
5084/// The jailing business is nasty and potentially unsafe if done from the wrong context - do not
5085/// call outside of `start_devices`!
5086///
5087/// Returns the pid of the jailed device process.
5088fn jail_and_start_vu_device<T: VirtioDeviceBuilder>(
5089    jail_config: Option<&JailConfig>,
5090    params: T,
5091    vhost: &str,
5092    name: &str,
5093) -> anyhow::Result<(libc::pid_t, Option<Box<dyn std::any::Any>>)> {
5094    let mut keep_rds = Vec::new();
5095
5096    base::syslog::push_descriptors(&mut keep_rds);
5097    cros_tracing::push_descriptors!(&mut keep_rds);
5098    metrics::push_descriptors(&mut keep_rds);
5099
5100    let jail_type = VirtioDeviceType::VhostUser;
5101
5102    // Create a jail from the configuration. If the configuration is `None`, `create_jail` will also
5103    // return `None` so fall back to an empty (i.e. non-constrained) Minijail.
5104    let jail = params
5105        .create_jail(jail_config, jail_type)
5106        .with_context(|| format!("failed to create jail for {name}"))?
5107        .ok_or(())
5108        .or_else(|_| Minijail::new())
5109        .with_context(|| format!("failed to create empty jail for {name}"))?;
5110
5111    // Create the device in the parent process, so the child does not need any privileges necessary
5112    // to do it (only runtime capabilities are required).
5113    let device = params
5114        .create_vhost_user_device(&mut keep_rds)
5115        .context("failed to create vhost-user device")?;
5116    let mut listener =
5117        VhostUserListener::new(vhost).context("failed to create the vhost listener")?;
5118    keep_rds.push(listener.as_raw_descriptor());
5119    let parent_resources = listener.take_parent_process_resources();
5120
5121    // Executor must be created before jail in order to prevent the jailed process from creating
5122    // unrestricted io_urings.
5123    let ex = Executor::new().context("Failed to create an Executor")?;
5124    keep_rds.extend(ex.as_raw_descriptors());
5125
5126    // Deduplicate the FDs since minijail expects them to be unique.
5127    keep_rds.sort_unstable();
5128    keep_rds.dedup();
5129
5130    // SAFETY:
5131    // Safe because we are keeping all the descriptors needed for the child to function.
5132    match unsafe { jail.fork(Some(&keep_rds)).context("error while forking")? } {
5133        0 => {
5134            // In the child process.
5135
5136            // Free memory for the resources managed by the parent, without running drop() on them.
5137            // The parent will do it as we exit.
5138            let _ = std::mem::ManuallyDrop::new(parent_resources);
5139
5140            // Make sure the child process does not survive its parent.
5141            // SAFETY: trivially safe
5142            if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } < 0 {
5143                panic!("call to prctl(PR_SET_DEATHSIG, SIGKILL) failed. Aborting child process.");
5144            }
5145
5146            // Set the name for the thread.
5147            const MAX_LEN: usize = 15; // pthread_setname_np() limit on Linux
5148            let debug_label_trimmed = &name.as_bytes()[..std::cmp::min(MAX_LEN, name.len())];
5149            let thread_name = CString::new(debug_label_trimmed).unwrap();
5150            // SAFETY:
5151            // Safe because we trimmed the name to 15 characters (and pthread_setname_np will return
5152            // an error if we don't anyway).
5153            let _ = unsafe { libc::pthread_setname_np(libc::pthread_self(), thread_name.as_ptr()) };
5154
5155            // Run the device loop and terminate the child process once it exits.
5156            let res = match listener.run_device(ex, device) {
5157                Ok(()) => 0,
5158                Err(e) => {
5159                    error!("error while running device {}: {:#}", name, e);
5160                    1
5161                }
5162            };
5163            // SAFETY: trivially safe
5164            unsafe { libc::exit(res) };
5165        }
5166        pid => {
5167            // In the parent process. We will drop the device and listener when exiting this method.
5168            // This is fine as ownership for both has been transferred to the child process and they
5169            // will keep living there. We just retain `parent_resources` for things we are supposed
5170            // to clean up ourselves.
5171
5172            info!("process for device {} (PID {}) started", &name, pid);
5173            #[cfg(feature = "seccomp_trace")]
5174            debug!(
5175                    "seccomp_trace {{\"event\": \"minijail_fork\", \"pid\": {}, \"name\": \"{}\", \"jail_addr\": \"0x{:x}\"}}",
5176                    pid,
5177                    &name,
5178                    read_jail_addr(&jail)
5179                );
5180            Ok((pid, parent_resources))
5181        }
5182    }
5183}
5184
5185fn process_vhost_user_control_request(tube: Tube, disk_host_tubes: &[Tube]) -> Result<()> {
5186    let command = tube
5187        .recv::<VmRequest>()
5188        .context("failed to receive VmRequest")?;
5189    let resp = match command {
5190        VmRequest::DiskCommand {
5191            disk_index,
5192            ref command,
5193        } => match &disk_host_tubes.get(disk_index) {
5194            Some(tube) => handle_disk_command(command, tube),
5195            None => VmResponse::Err(base::Error::new(libc::ENODEV)),
5196        },
5197        request => {
5198            error!(
5199                "Request {:?} currently not supported in vhost user backend",
5200                request
5201            );
5202            VmResponse::Err(base::Error::new(libc::EPERM))
5203        }
5204    };
5205
5206    tube.send(&resp).context("failed to send VmResponse")?;
5207    Ok(())
5208}
5209
5210fn start_vhost_user_control_server(
5211    control_server_socket: UnlinkUnixSeqpacketListener,
5212    disk_host_tubes: Vec<Tube>,
5213) {
5214    info!("Start vhost-user control server");
5215    loop {
5216        match control_server_socket.accept() {
5217            Ok(socket) => {
5218                let tube = match Tube::try_from(socket) {
5219                    Ok(tube) => tube,
5220                    Err(e) => {
5221                        error!("failed to open tube: {:#}", e);
5222                        return;
5223                    }
5224                };
5225                if let Err(e) = process_vhost_user_control_request(tube, &disk_host_tubes) {
5226                    error!("failed to process control request: {:#}", e);
5227                }
5228            }
5229            Err(e) => {
5230                error!("failed to establish connection: {}", e);
5231            }
5232        }
5233    }
5234}
5235
/// Entry point of the `devices` subcommand: starts each configured vhost-user device in its own
/// jailed child process, optionally serves a control socket, and waits for every child to exit.
///
/// Prints the subcommand help and returns `Ok` if no device was configured at all.
pub fn start_devices(opts: DevicesCommand) -> anyhow::Result<()> {
    if let Some(async_executor) = opts.async_executor {
        Executor::set_default_executor_kind(async_executor)
            .context("Failed to set the default async executor")?;
    }

    // Per-child bookkeeping, keyed by the child's pid in `devices_jails` below.
    struct DeviceJailInfo {
        // Unique name for the device, in the form `foomatic-0`.
        name: String,
        // Parent-side resources that must outlive the device process; dropped when its entry
        // is removed after the child exits.
        _drop_resources: Option<Box<dyn std::any::Any>>,
    }

    // Starts one jailed device process and records it in `devices_jails` by child pid.
    fn add_device<T: VirtioDeviceBuilder>(
        i: usize,
        device_params: T,
        vhost: &str,
        jail_config: Option<&JailConfig>,
        devices_jails: &mut BTreeMap<libc::pid_t, DeviceJailInfo>,
    ) -> anyhow::Result<()> {
        // `i` only counts devices of the same type, so names are e.g. `block-0`, `block-1`.
        let name = format!("{}-{}", T::NAME, i);

        let (pid, _drop_resources) =
            jail_and_start_vu_device::<T>(jail_config, device_params, vhost, &name)?;

        devices_jails.insert(
            pid,
            DeviceJailInfo {
                name,
                _drop_resources,
            },
        );

        Ok(())
    }

    let mut devices_jails: BTreeMap<libc::pid_t, DeviceJailInfo> = BTreeMap::new();

    let jail = if opts.disable_sandbox {
        None
    } else {
        Some(&opts.jail)
    };

    // Create control server socket
    let control_server_socket = opts.control_socket.map(|path| {
        UnlinkUnixSeqpacketListener(
            UnixSeqpacketListener::bind(path).expect("Could not bind socket"),
        )
    });

    // Create serial devices.
    for (i, params) in opts.serial.iter().enumerate() {
        let serial_config = &params.device;
        add_device(i, serial_config, &params.vhost, jail, &mut devices_jails)?;
    }

    let mut disk_host_tubes = Vec::new();
    let control_socket_exists = control_server_socket.is_some();
    // Create block devices.
    for (i, params) in opts.block.iter().enumerate() {
        // Only create the per-disk control tube pair when a control server exists to use it.
        let tube = if control_socket_exists {
            let (host_tube, device_tube) = Tube::pair().context("failed to create tube")?;
            disk_host_tubes.push(host_tube);
            Some(device_tube)
        } else {
            None
        };
        let disk_config = DiskConfig::new(&params.device, tube);
        add_device(i, disk_config, &params.vhost, jail, &mut devices_jails)?;
    }

    // Create vsock devices.
    for (i, params) in opts.vsock.iter().enumerate() {
        add_device(i, &params.device, &params.vhost, jail, &mut devices_jails)?;
    }

    // Create network devices.
    #[cfg(feature = "net")]
    for (i, params) in opts.net.iter().enumerate() {
        add_device(i, &params.device, &params.vhost, jail, &mut devices_jails)?;
    }

    // No device created, that's probably not intended - print the help in that case.
    if devices_jails.is_empty() {
        let err = DevicesCommand::from_args(
            &[&std::env::args().next().unwrap_or(String::from("crosvm"))],
            &["--help"],
        )
        .unwrap_err();
        println!("{}", err.output);
        return Ok(());
    }

    if let Some(control_server_socket) = control_server_socket {
        // Start the control server in the parent process.
        std::thread::spawn(move || {
            start_vhost_user_control_server(control_server_socket, disk_host_tubes)
        });
    }

    // Now wait for all device processes to return.
    while !devices_jails.is_empty() {
        match base::linux::wait_for_pid(-1, 0) {
            Err(e) => panic!("error waiting for child process to complete: {e:#}"),
            Ok((Some(pid), wait_status)) => match devices_jails.remove_entry(&pid) {
                Some((_, info)) => {
                    if let Some(status) = wait_status.code() {
                        info!(
                            "process for device {} (PID {}) exited with code {}",
                            &info.name, pid, status
                        );
                    } else if let Some(signal) = wait_status.signal() {
                        warn!(
                            "process for device {} (PID {}) has been killed by signal {:?}",
                            &info.name, pid, signal,
                        );
                    }
                }
                None => error!("pid {} is not one of our device processes", pid),
            },
            // `wait_for_pid` will necessarily return a PID because we asked to it wait for one to
            // complete.
            Ok((None, _)) => unreachable!(),
        }
    }

    info!("all device processes have exited");

    Ok(())
}
5366
/// Setup crash reporting for a process. Each process MUST provide a unique `product_type` to avoid
/// making crash reports incomprehensible.
///
/// All attributes other than the product type are left unset; returns whatever identifier the
/// crash-report backend produces.
#[cfg(feature = "crash-report")]
pub fn setup_emulator_crash_reporting(_cfg: &Config) -> anyhow::Result<String> {
    let attributes = crash_report::CrashReportAttributes {
        product_type: "emulator".to_owned(),
        pipe_name: None,
        report_uuid: None,
        product_name: None,
        product_version: None,
    };
    crash_report::setup_crash_reporting(attributes)
}
5379
5380#[cfg(test)]
5381mod tests {
5382    use std::path::PathBuf;
5383
5384    use arch::CpuSet;
5385    use vm_memory::MemoryRegionPurpose;
5386
5387    use super::*;
5388
    // Create a file-backed mapping parameters struct with the given `address` and `size` and other
    // parameters set to default values. Used by the guest-memory layout tests below; the path is
    // a placeholder (presumably never opened by these tests — confirm if reusing elsewhere).
    fn test_file_backed_mapping(address: u64, size: u64) -> FileBackedMappingParameters {
        FileBackedMappingParameters {
            address,
            size,
            path: PathBuf::new(),
            offset: 0,
            writable: false,
            sync: false,
            align: false,
            ram: true,
        }
    }
5403
5404    #[test]
5405    fn guest_mem_file_backed_mappings_overlap() {
5406        // Base case: no file mappings; output layout should be identical.
5407        assert_eq!(
5408            punch_holes_in_guest_mem_layout_for_mappings(
5409                vec![
5410                    (GuestAddress(0), 0xD000_0000, Default::default()),
5411                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
5412                ],
5413                &[]
5414            )
5415            .unwrap(),
5416            vec![
5417                (GuestAddress(0), 0xD000_0000, Default::default()),
5418                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
5419            ],
5420        );
5421
5422        // File mapping that does not overlap guest memory.
5423        assert_eq!(
5424            punch_holes_in_guest_mem_layout_for_mappings(
5425                vec![
5426                    (GuestAddress(0), 0xD000_0000, Default::default()),
5427                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
5428                ],
5429                &[test_file_backed_mapping(0xD000_0000, 0x1000)]
5430            )
5431            .unwrap_err()
5432            .to_string(),
5433            "RAM file-backed-mapping must be a subset of a RAM region",
5434        );
5435
5436        // File mapping at the start of the low address space region.
5437        assert_eq!(
5438            punch_holes_in_guest_mem_layout_for_mappings(
5439                vec![
5440                    (GuestAddress(0), 0xD000_0000, Default::default()),
5441                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
5442                ],
5443                &[test_file_backed_mapping(0, 0x2000)]
5444            )
5445            .unwrap(),
5446            vec![
5447                (
5448                    GuestAddress(0),
5449                    0x2000,
5450                    MemoryRegionOptions::new()
5451                        .purpose(MemoryRegionPurpose::GuestMemoryRegion)
5452                        .file_backed(test_file_backed_mapping(0, 0x2000)),
5453                ),
5454                (
5455                    GuestAddress(0x2000),
5456                    0xD000_0000 - 0x2000,
5457                    Default::default()
5458                ),
5459                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
5460            ],
5461        );
5462
5463        // File mapping at the end of the low address space region.
5464        assert_eq!(
5465            punch_holes_in_guest_mem_layout_for_mappings(
5466                vec![
5467                    (GuestAddress(0), 0xD000_0000, Default::default()),
5468                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
5469                ],
5470                &[test_file_backed_mapping(0xD000_0000 - 0x2000, 0x2000)]
5471            )
5472            .unwrap(),
5473            vec![
5474                (GuestAddress(0), 0xD000_0000 - 0x2000, Default::default()),
5475                (
5476                    GuestAddress(0xD000_0000 - 0x2000),
5477                    0x2000,
5478                    MemoryRegionOptions::new()
5479                        .purpose(MemoryRegionPurpose::GuestMemoryRegion)
5480                        .file_backed(test_file_backed_mapping(0xD000_0000 - 0x2000, 0x2000)),
5481                ),
5482                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
5483            ],
5484        );
5485
5486        // File mapping fully contained within the middle of the low address space region.
5487        assert_eq!(
5488            punch_holes_in_guest_mem_layout_for_mappings(
5489                vec![
5490                    (GuestAddress(0), 0xD000_0000, Default::default()),
5491                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
5492                ],
5493                &[test_file_backed_mapping(0x1000, 0x2000)]
5494            )
5495            .unwrap(),
5496            vec![
5497                (GuestAddress(0), 0x1000, Default::default()),
5498                (
5499                    GuestAddress(0x1000),
5500                    0x2000,
5501                    MemoryRegionOptions::new()
5502                        .purpose(MemoryRegionPurpose::GuestMemoryRegion)
5503                        .file_backed(test_file_backed_mapping(0x1000, 0x2000)),
5504                ),
5505                (
5506                    GuestAddress(0x3000),
5507                    0xD000_0000 - 0x3000,
5508                    Default::default()
5509                ),
5510                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
5511            ],
5512        );
5513
5514        // File mapping at the start of the high address space region.
5515        assert_eq!(
5516            punch_holes_in_guest_mem_layout_for_mappings(
5517                vec![
5518                    (GuestAddress(0), 0xD000_0000, Default::default()),
5519                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
5520                ],
5521                &[test_file_backed_mapping(0x1_0000_0000, 0x2000)]
5522            )
5523            .unwrap(),
5524            vec![
5525                (GuestAddress(0), 0xD000_0000, Default::default()),
5526                (
5527                    GuestAddress(0x1_0000_0000),
5528                    0x2000,
5529                    MemoryRegionOptions::new()
5530                        .purpose(MemoryRegionPurpose::GuestMemoryRegion)
5531                        .file_backed(test_file_backed_mapping(0x1_0000_0000, 0x2000)),
5532                ),
5533                (
5534                    GuestAddress(0x1_0000_2000),
5535                    0x8_0000 - 0x2000,
5536                    Default::default()
5537                ),
5538            ],
5539        );
5540
5541        // File mapping at the end of the high address space region.
5542        assert_eq!(
5543            punch_holes_in_guest_mem_layout_for_mappings(
5544                vec![
5545                    (GuestAddress(0), 0xD000_0000, Default::default()),
5546                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
5547                ],
5548                &[test_file_backed_mapping(0x1_0008_0000 - 0x2000, 0x2000)]
5549            )
5550            .unwrap(),
5551            vec![
5552                (GuestAddress(0), 0xD000_0000, Default::default()),
5553                (
5554                    GuestAddress(0x1_0000_0000),
5555                    0x8_0000 - 0x2000,
5556                    Default::default()
5557                ),
5558                (
5559                    GuestAddress(0x1_0008_0000 - 0x2000),
5560                    0x2000,
5561                    MemoryRegionOptions::new()
5562                        .purpose(MemoryRegionPurpose::GuestMemoryRegion)
5563                        .file_backed(test_file_backed_mapping(0x1_0008_0000 - 0x2000, 0x2000)),
5564                ),
5565            ],
5566        );
5567
5568        // File mapping fully contained within the middle of the high address space region.
5569        assert_eq!(
5570            punch_holes_in_guest_mem_layout_for_mappings(
5571                vec![
5572                    (GuestAddress(0), 0xD000_0000, Default::default()),
5573                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
5574                ],
5575                &[test_file_backed_mapping(0x1_0000_1000, 0x2000)]
5576            )
5577            .unwrap(),
5578            vec![
5579                (GuestAddress(0), 0xD000_0000, Default::default()),
5580                (GuestAddress(0x1_0000_0000), 0x1000, Default::default()),
5581                (
5582                    GuestAddress(0x1_0000_1000),
5583                    0x2000,
5584                    MemoryRegionOptions::new()
5585                        .purpose(MemoryRegionPurpose::GuestMemoryRegion)
5586                        .file_backed(test_file_backed_mapping(0x1_0000_1000, 0x2000)),
5587                ),
5588                (
5589                    GuestAddress(0x1_0000_3000),
5590                    0x8_0000 - 0x3000,
5591                    Default::default()
5592                ),
5593            ],
5594        );
5595
5596        // File mapping overlapping two guest memory regions.
5597        assert_eq!(
5598            punch_holes_in_guest_mem_layout_for_mappings(
5599                vec![
5600                    (GuestAddress(0), 0xD000_0000, Default::default()),
5601                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
5602                ],
5603                &[test_file_backed_mapping(0xA000_0000, 0x60002000)]
5604            )
5605            .unwrap_err()
5606            .to_string(),
5607            "RAM file-backed-mapping must be a subset of a RAM region",
5608        );
5609
5610        // File mapping with different region purpose.
5611        assert_eq!(
5612            punch_holes_in_guest_mem_layout_for_mappings(
5613                vec![
5614                    (GuestAddress(0x0000), 0x2000, Default::default()),
5615                    (
5616                        GuestAddress(0x2000),
5617                        0x2000,
5618                        MemoryRegionOptions::new().purpose(MemoryRegionPurpose::Bios)
5619                    ),
5620                ],
5621                &[test_file_backed_mapping(0x2000, 0x2000)]
5622            )
5623            .unwrap(),
5624            vec![
5625                (GuestAddress(0x0000), 0x2000, Default::default()),
5626                (
5627                    GuestAddress(0x2000),
5628                    0x2000,
5629                    MemoryRegionOptions::new()
5630                        .purpose(MemoryRegionPurpose::Bios)
5631                        .file_backed(test_file_backed_mapping(0x2000, 0x2000)),
5632                ),
5633            ],
5634        );
5635    }
5636
5637    #[cfg(target_arch = "aarch64")]
5638    #[test]
5639    fn normalized_cpu_ipc_ratios_simple() {
5640        let host_max_freq = 5000000;
5641        let mut cpu_frequencies = BTreeMap::new();
5642        cpu_frequencies.insert(0, vec![100000, 200000, 500000]);
5643        cpu_frequencies.insert(1, vec![50000, 75000, 200000]);
5644
5645        let mut cpu_ipc_ratio = BTreeMap::new();
5646        cpu_ipc_ratio.insert(0, 1024);
5647        cpu_ipc_ratio.insert(1, 512);
5648
5649        let normalized_cpu_ipc_ratios = normalize_cpu_ipc_ratios(
5650            cpu_frequencies.iter().map(|(cpu_id, frequencies)| {
5651                (
5652                    *cpu_id,
5653                    frequencies.iter().copied().max().unwrap_or_default(),
5654                )
5655            }),
5656            host_max_freq,
5657            |cpu_id| {
5658                cpu_ipc_ratio
5659                    .get(&cpu_id)
5660                    .copied()
5661                    .unwrap_or(DEFAULT_CPU_CAPACITY)
5662            },
5663        )
5664        .expect("normalize_cpu_ipc_ratios failed");
5665
5666        let ratios: Vec<(usize, u32)> = normalized_cpu_ipc_ratios.into_iter().collect();
5667        assert_eq!(ratios, vec![(0, 102), (1, 20)]);
5668    }
5669
5670    #[test]
5671    fn test_get_representative_pcpu() {
5672        use std::collections::BTreeMap;
5673        let mut affinity_map = BTreeMap::new();
5674        affinity_map.insert(0, arch::CpuSet::new(vec![4, 5]));
5675        affinity_map.insert(1, arch::CpuSet::new(vec![6]));
5676        let vcpu_affinity = Some(VcpuAffinity::PerVcpu(affinity_map));
5677
5678        assert_eq!(get_representative_pcpu(0, &vcpu_affinity), 4);
5679        assert_eq!(get_representative_pcpu(1, &vcpu_affinity), 6);
5680        assert_eq!(get_representative_pcpu(2, &vcpu_affinity), 2); // Fallback to vcpu_id on missing vCPU
5681
5682        let global_affinity = Some(VcpuAffinity::Global(arch::CpuSet::new(vec![7, 8])));
5683        assert_eq!(get_representative_pcpu(0, &global_affinity), 7);
5684        assert_eq!(get_representative_pcpu(1, &global_affinity), 7);
5685
5686        assert_eq!(get_representative_pcpu(0, &None), 0);
5687        assert_eq!(get_representative_pcpu(1, &None), 1);
5688    }
5689
5690    #[test]
5691    fn test_map_vcpu_capacity() {
5692        let vcpu_count = 2;
5693        // Assume PCPU 1 is offline or skipped.
5694        // VCPU 0 -> PCPU 0
5695        // VCPU 1 -> PCPU 2
5696        let mut affinity_map = BTreeMap::new();
5697        affinity_map.insert(0, CpuSet::new(vec![0]));
5698        affinity_map.insert(1, CpuSet::new(vec![2]));
5699        let vcpu_affinity = Some(VcpuAffinity::PerVcpu(affinity_map));
5700
5701        let mut host_capacity = BTreeMap::new();
5702        host_capacity.insert(0, 512);
5703        host_capacity.insert(2, 1024);
5704        // PCPU 1 is missing (offline).
5705
5706        let vcpu_capacity = map_vcpu_capacity(vcpu_count, &vcpu_affinity, &host_capacity).unwrap();
5707
5708        // Verify lookup by VCPU ID
5709        assert_eq!(*vcpu_capacity.get(&0).unwrap(), 512);
5710        assert_eq!(*vcpu_capacity.get(&1).unwrap(), 1024);
5711    }
5712
5713    #[test]
5714    fn test_map_vcpu_clusters() {
5715        use std::collections::BTreeMap;
5716        let host_clusters = vec![
5717            arch::CpuSet::new(vec![0, 1, 2, 3]),
5718            arch::CpuSet::new(vec![4, 5, 6, 7]),
5719        ];
5720
5721        let mut affinity_map = BTreeMap::new();
5722        affinity_map.insert(0, arch::CpuSet::new(vec![0])); // in cluster 0
5723        affinity_map.insert(1, arch::CpuSet::new(vec![4])); // in cluster 1
5724        affinity_map.insert(2, arch::CpuSet::new(vec![1])); // in cluster 0
5725        let vcpu_affinity = Some(VcpuAffinity::PerVcpu(affinity_map));
5726
5727        let vcpu_clusters = map_vcpu_clusters(3, &vcpu_affinity, host_clusters.clone()).unwrap();
5728
5729        assert_eq!(vcpu_clusters.len(), 2);
5730        // Cluster 0 should have vCPU 0 and 2
5731        assert!(vcpu_clusters[0].contains(&0));
5732        assert!(vcpu_clusters[0].contains(&2));
5733        assert!(!vcpu_clusters[0].contains(&1));
5734        // Cluster 1 should have vCPU 1
5735        assert!(vcpu_clusters[1].contains(&1));
5736        assert!(!vcpu_clusters[1].contains(&0));
5737        assert!(!vcpu_clusters[1].contains(&2));
5738    }
5739}