devices/pci/vfio_pci.rs

// Copyright 2019 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use std::cmp::max;
use std::cmp::Reverse;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::fs;
use std::path::Path;
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::Arc;

use acpi_tables::aml::Aml;
use base::debug;
use base::error;
use base::pagesize;
use base::warn;
use base::AsRawDescriptor;
use base::AsRawDescriptors;
use base::Event;
use base::EventToken;
use base::MemoryMapping;
use base::Protection;
use base::RawDescriptor;
use base::Tube;
use base::WaitContext;
use base::WorkerThread;
use hypervisor::MemCacheType;
use resources::AddressRange;
use resources::Alloc;
use resources::AllocOptions;
use resources::MmioType;
use resources::SystemAllocator;
use sync::Mutex;
use vfio_sys::vfio::VFIO_PCI_ACPI_NTFY_IRQ_INDEX;
use vfio_sys::*;
use vm_control::api::VmMemoryClient;
use vm_control::HotPlugDeviceInfo;
use vm_control::HotPlugDeviceType;
use vm_control::VmMemoryDestination;
use vm_control::VmMemoryRegionId;
use vm_control::VmMemorySource;
use vm_control::VmRequest;
use vm_control::VmResponse;

use crate::pci::acpi::DeviceVcfgRegister;
use crate::pci::acpi::DsmMethod;
use crate::pci::acpi::PowerResourceMethod;
use crate::pci::acpi::SHM_OFFSET;
use crate::pci::msi::MsiConfig;
use crate::pci::msi::MsiStatus;
use crate::pci::msi::PCI_MSI_FLAGS;
use crate::pci::msi::PCI_MSI_FLAGS_64BIT;
use crate::pci::msi::PCI_MSI_FLAGS_MASKBIT;
use crate::pci::msi::PCI_MSI_NEXT_POINTER;
use crate::pci::msix::MsixConfig;
use crate::pci::msix::MsixStatus;
use crate::pci::msix::BITS_PER_PBA_ENTRY;
use crate::pci::msix::MSIX_PBA_ENTRIES_MODULO;
use crate::pci::msix::MSIX_TABLE_ENTRIES_MODULO;
use crate::pci::pci_device::BarRange;
use crate::pci::pci_device::Error as PciDeviceError;
use crate::pci::pci_device::PciDevice;
use crate::pci::pci_device::PreferredIrq;
use crate::pci::pm::PciPmCap;
use crate::pci::pm::PmConfig;
use crate::pci::pm::PM_CAP_LENGTH;
use crate::pci::PciAddress;
use crate::pci::PciBarConfiguration;
use crate::pci::PciBarIndex;
use crate::pci::PciBarPrefetchable;
use crate::pci::PciBarRegionType;
use crate::pci::PciCapabilityID;
use crate::pci::PciClassCode;
use crate::pci::PciId;
use crate::pci::PciInterruptPin;
use crate::pci::PCI_VCFG_DSM;
use crate::pci::PCI_VCFG_NOTY;
use crate::pci::PCI_VCFG_PM;
use crate::pci::PCI_VENDOR_ID_INTEL;
use crate::vfio::VfioDevice;
use crate::vfio::VfioError;
use crate::vfio::VfioIrqType;
use crate::vfio::VfioPciConfig;
use crate::IrqLevelEvent;
use crate::Suspendable;

const PCI_VENDOR_ID: u32 = 0x0;
const PCI_DEVICE_ID: u32 = 0x2;
const PCI_COMMAND: u32 = 0x4;
const PCI_COMMAND_MEMORY: u8 = 0x2;
const PCI_BASE_CLASS_CODE: u32 = 0x0B;
const PCI_INTERRUPT_NUM: u32 = 0x3C;
const PCI_INTERRUPT_PIN: u32 = 0x3D;

const PCI_CAPABILITY_LIST: u32 = 0x34;
const PCI_CAP_ID_MSI: u8 = 0x05;
const PCI_CAP_ID_MSIX: u8 = 0x11;
const PCI_CAP_ID_PM: u8 = 0x01;

// Size of the standard PCI config space
const PCI_CONFIG_SPACE_SIZE: u32 = 0x100;
// Size of the standard PCIe config space: 4KB
const PCIE_CONFIG_SPACE_SIZE: u32 = 0x1000;

// Extended Capabilities
const PCI_EXT_CAP_ID_CAC: u16 = 0x0C;
const PCI_EXT_CAP_ID_ARI: u16 = 0x0E;
const PCI_EXT_CAP_ID_SRIOV: u16 = 0x10;
const PCI_EXT_CAP_ID_REBAR: u16 = 0x15;
struct VfioPmCap {
    offset: u32,
    capabilities: u32,
    config: PmConfig,
}

impl VfioPmCap {
    fn new(config: &VfioPciConfig, cap_start: u32) -> Self {
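        // The first dword of the PM capability holds the cap ID and next
        // pointer in its low 16 bits and the PMC register in its high 16
        // bits; merge crosvm's default PMC bits into the device-reported
        // value below.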
        let mut capabilities: u32 = config.read_config(cap_start);
        capabilities |= (PciPmCap::default_cap() as u32) << 16;
        VfioPmCap {
            offset: cap_start,
            capabilities,
            config: PmConfig::new(false),
        }
    }

    pub fn should_trigger_pme(&mut self) -> bool {
        self.config.should_trigger_pme()
    }

    fn is_pm_reg(&self, offset: u32) -> bool {
        (offset >= self.offset) && (offset < self.offset + PM_CAP_LENGTH as u32)
    }

    pub fn read(&self, offset: u32) -> u32 {
        let offset = offset - self.offset;
        if offset == 0 {
            self.capabilities
        } else {
            let mut data = 0;
            self.config.read(&mut data);
            data
        }
    }

    pub fn write(&mut self, offset: u64, data: &[u8]) {
        let offset = offset - self.offset as u64;
        if offset >= std::mem::size_of::<u32>() as u64 {
            let offset = offset - std::mem::size_of::<u32>() as u64;
            self.config.write(offset, data);
        }
    }
}

enum VfioMsiChange {
    Disable,
    Enable,
    FunctionChanged,
}

struct VfioMsiCap {
    config: MsiConfig,
    offset: u32,
}

impl VfioMsiCap {
    fn new(
        config: &VfioPciConfig,
        msi_cap_start: u32,
        vm_socket_irq: Tube,
        device_id: u32,
        device_name: String,
    ) -> Self {
        let msi_ctl: u16 = config.read_config(msi_cap_start + PCI_MSI_FLAGS);
        let is_64bit = (msi_ctl & PCI_MSI_FLAGS_64BIT) != 0;
        let mask_cap = (msi_ctl & PCI_MSI_FLAGS_MASKBIT) != 0;

        VfioMsiCap {
            config: MsiConfig::new(is_64bit, mask_cap, vm_socket_irq, device_id, device_name),
            offset: msi_cap_start,
        }
    }

    fn is_msi_reg(&self, index: u64, len: usize) -> bool {
        self.config.is_msi_reg(self.offset, index, len)
    }

    fn write_msi_reg(&mut self, index: u64, data: &[u8]) -> Option<VfioMsiChange> {
        let offset = index as u32 - self.offset;
        match self.config.write_msi_capability(offset, data) {
            MsiStatus::Enabled => Some(VfioMsiChange::Enable),
            MsiStatus::Disabled => Some(VfioMsiChange::Disable),
            MsiStatus::NothingToDo => None,
        }
    }

    fn get_msi_irqfd(&self) -> Option<&Event> {
        self.config.get_irqfd()
    }

    fn destroy(&mut self) {
        self.config.destroy()
    }
}

// MSI-X registers in the MSI-X capability
const PCI_MSIX_FLAGS: u32 = 0x02; // Message Control
const PCI_MSIX_FLAGS_QSIZE: u16 = 0x07FF; // Table size
const PCI_MSIX_TABLE: u32 = 0x04; // Table offset
const PCI_MSIX_TABLE_BIR: u32 = 0x07; // BAR index
const PCI_MSIX_TABLE_OFFSET: u32 = 0xFFFFFFF8; // Offset into the specified BAR
const PCI_MSIX_PBA: u32 = 0x08; // Pending Bit Array offset
const PCI_MSIX_PBA_BIR: u32 = 0x07; // BAR index
const PCI_MSIX_PBA_OFFSET: u32 = 0xFFFFFFF8; // Offset into the specified BAR

struct VfioMsixCap {
    config: MsixConfig,
    offset: u32,
    table_size: u16,
    table_pci_bar: PciBarIndex,
    table_offset: u64,
    table_size_bytes: u64,
    pba_pci_bar: PciBarIndex,
    pba_offset: u64,
    pba_size_bytes: u64,
    msix_interrupt_evt: Vec<Event>,
}

impl VfioMsixCap {
    fn new(
        config: &VfioPciConfig,
        msix_cap_start: u32,
        vm_socket_irq: Tube,
        pci_id: u32,
        device_name: String,
    ) -> Self {
        let msix_ctl: u16 = config.read_config(msix_cap_start + PCI_MSIX_FLAGS);
        let table: u32 = config.read_config(msix_cap_start + PCI_MSIX_TABLE);
        let table_pci_bar = (table & PCI_MSIX_TABLE_BIR) as PciBarIndex;
        let table_offset = (table & PCI_MSIX_TABLE_OFFSET) as u64;
        let pba: u32 = config.read_config(msix_cap_start + PCI_MSIX_PBA);
        let pba_pci_bar = (pba & PCI_MSIX_PBA_BIR) as PciBarIndex;
        let pba_offset = (pba & PCI_MSIX_PBA_OFFSET) as u64;

        let mut table_size = (msix_ctl & PCI_MSIX_FLAGS_QSIZE) as u64 + 1;
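        // Some devices advertise a table size whose span would run into the
        // PBA in the same BAR; clamp the table size so the emulated table
        // and PBA regions stay disjoint.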
        if table_pci_bar == pba_pci_bar
            && pba_offset > table_offset
            && (table_offset + table_size * MSIX_TABLE_ENTRIES_MODULO) > pba_offset
        {
            table_size = (pba_offset - table_offset) / MSIX_TABLE_ENTRIES_MODULO;
        }

        let table_size_bytes = table_size * MSIX_TABLE_ENTRIES_MODULO;
        let pba_size_bytes =
            table_size.div_ceil(BITS_PER_PBA_ENTRY as u64) * MSIX_PBA_ENTRIES_MODULO;
        let mut msix_interrupt_evt = Vec::new();
        for _ in 0..table_size {
            msix_interrupt_evt.push(Event::new().expect("failed to create msix interrupt"));
        }
        VfioMsixCap {
            config: MsixConfig::new(table_size as u16, vm_socket_irq, pci_id, device_name),
            offset: msix_cap_start,
            table_size: table_size as u16,
            table_pci_bar,
            table_offset,
            table_size_bytes,
            pba_pci_bar,
            pba_offset,
            pba_size_bytes,
            msix_interrupt_evt,
        }
    }

    // Only the MSI-X control register is writable and needs special handling
    // in PCI config read/write.
    fn is_msix_control_reg(&self, offset: u32, size: u32) -> bool {
        let control_start = self.offset + PCI_MSIX_FLAGS;
        let control_end = control_start + 2;

        offset < control_end && offset + size > control_start
    }

    fn read_msix_control(&self, data: &mut u32) {
        *data = self.config.read_msix_capability(*data);
    }

    fn write_msix_control(&mut self, data: &[u8]) -> Option<VfioMsiChange> {
        let old_enabled = self.config.enabled();
        let old_masked = self.config.masked();

        self.config
            .write_msix_capability(PCI_MSIX_FLAGS.into(), data);

        let new_enabled = self.config.enabled();
        let new_masked = self.config.masked();

        if !old_enabled && new_enabled {
            Some(VfioMsiChange::Enable)
        } else if old_enabled && !new_enabled {
            Some(VfioMsiChange::Disable)
        } else if new_enabled && old_masked != new_masked {
            Some(VfioMsiChange::FunctionChanged)
        } else {
            None
        }
    }

    fn is_msix_table(&self, bar_index: PciBarIndex, offset: u64) -> bool {
        bar_index == self.table_pci_bar
            && offset >= self.table_offset
            && offset < self.table_offset + self.table_size_bytes
    }

    fn get_msix_table(&self, bar_index: PciBarIndex) -> Option<AddressRange> {
        if bar_index == self.table_pci_bar {
            AddressRange::from_start_and_size(self.table_offset, self.table_size_bytes)
        } else {
            None
        }
    }

    fn read_table(&self, offset: u64, data: &mut [u8]) {
        let offset = offset - self.table_offset;
        self.config.read_msix_table(offset, data);
    }

    fn write_table(&mut self, offset: u64, data: &[u8]) -> MsixStatus {
        let offset = offset - self.table_offset;
        self.config.write_msix_table(offset, data)
    }

    fn is_msix_pba(&self, bar_index: PciBarIndex, offset: u64) -> bool {
        bar_index == self.pba_pci_bar
            && offset >= self.pba_offset
            && offset < self.pba_offset + self.pba_size_bytes
    }

    fn get_msix_pba(&self, bar_index: PciBarIndex) -> Option<AddressRange> {
        if bar_index == self.pba_pci_bar {
            AddressRange::from_start_and_size(self.pba_offset, self.pba_size_bytes)
        } else {
            None
        }
    }

    fn read_pba(&self, offset: u64, data: &mut [u8]) {
        let offset = offset - self.pba_offset;
        self.config.read_pba_entries(offset, data);
    }

    fn write_pba(&mut self, offset: u64, data: &[u8]) {
        let offset = offset - self.pba_offset;
        self.config.write_pba_entries(offset, data);
    }

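    // While a vector is masked, hand VFIO the crosvm-local event instead of
    // the guest irqfd, so the worker thread can observe interrupts and
    // emulate pending-bit (PBA) behavior rather than injecting them directly.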
    fn get_msix_irqfd(&self, index: usize) -> Option<&Event> {
        let irqfd = self.config.get_irqfd(index);
        if let Some(fd) = irqfd {
            if self.msix_vector_masked(index) {
                Some(&self.msix_interrupt_evt[index])
            } else {
                Some(fd)
            }
        } else {
            None
        }
    }

    fn get_msix_irqfds(&self) -> Vec<Option<&Event>> {
        let mut irqfds = Vec::new();

        for i in 0..self.table_size {
            irqfds.push(self.get_msix_irqfd(i as usize));
        }

        irqfds
    }

    fn table_size(&self) -> usize {
        self.table_size.into()
    }

    fn clone_msix_evt(&self) -> Vec<Event> {
        self.msix_interrupt_evt
            .iter()
            .map(|irq| irq.try_clone().unwrap())
            .collect()
    }

    fn msix_vector_masked(&self, index: usize) -> bool {
        !self.config.enabled() || self.config.masked() || self.config.table_masked(index)
    }

    fn trigger(&mut self, index: usize) {
        self.config.trigger(index as u16);
    }

    fn destroy(&mut self) {
        self.config.destroy()
    }
}

impl AsRawDescriptors for VfioMsixCap {
    fn as_raw_descriptors(&self) -> Vec<RawDescriptor> {
        let mut rds = vec![self.config.as_raw_descriptor()];
        rds.extend(
            self.msix_interrupt_evt
                .iter()
                .map(|evt| evt.as_raw_descriptor()),
        );
        rds
    }
}

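// Tracks the not-yet-allocated portions of an address range as a set of
// disjoint free regions; allocations carve pieces out of that set.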
struct VfioResourceAllocator {
    // The regions that have not been allocated yet.
    regions: BTreeSet<AddressRange>,
}

impl VfioResourceAllocator {
    // Creates a new `VfioResourceAllocator` for managing VFIO resources.
    // Returns `Err` if `pool` is empty.
    //
    // * `pool` - The address range to manage.
    fn new(pool: AddressRange) -> Result<Self, PciDeviceError> {
        if pool.is_empty() {
            return Err(PciDeviceError::SizeZero);
        }
        let mut regions = BTreeSet::new();
        regions.insert(pool);
        Ok(VfioResourceAllocator { regions })
    }

    fn internal_allocate_from_slot(
        &mut self,
        slot: AddressRange,
        range: AddressRange,
    ) -> Result<u64, PciDeviceError> {
        let slot_was_present = self.regions.remove(&slot);
        assert!(slot_was_present);

        let (before, after) = slot.non_overlapping_ranges(range);

        if !before.is_empty() {
            self.regions.insert(before);
        }
        if !after.is_empty() {
            self.regions.insert(after);
        }

        Ok(range.start)
    }

    // Allocates a range of addresses from the managed region with a minimum
    // alignment. Overlapping a previous allocation is _not_ allowed.
    // Returns the allocated address.
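    //
    // For example (illustrative numbers): with a single free region starting
    // at 0x1001, a request for size 0x1000 with alignment 0x1000 rounds the
    // start up to 0x2000, returns 0x2000, and leaves [0x1001, 0x1fff] plus
    // everything above 0x2fff in the free set.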
    fn allocate_with_align(&mut self, size: u64, alignment: u64) -> Result<u64, PciDeviceError> {
        if size == 0 {
            return Err(PciDeviceError::SizeZero);
        }
        if !alignment.is_power_of_two() {
            return Err(PciDeviceError::BadAlignment);
        }

        // Find the first free region that can fit `size` bytes at the
        // requested alignment.
        let region = self.regions.iter().find(|range| {
            match range.start % alignment {
                0 => range.start.checked_add(size - 1),
                r => range.start.checked_add(size - 1 + alignment - r),
            }
            .is_some_and(|end| end <= range.end)
        });

        match region {
            Some(&slot) => {
                let start = match slot.start % alignment {
                    0 => slot.start,
                    r => slot.start + alignment - r,
                };
                let end = start + size - 1;
                let range = AddressRange::from_start_and_end(start, end);

                self.internal_allocate_from_slot(slot, range)
            }
            None => Err(PciDeviceError::OutOfSpace),
        }
    }

    // Allocates a range of addresses from the managed region at a required
    // location. Overlapping previous allocations is allowed.
    fn allocate_at_can_overlap(&mut self, range: AddressRange) -> Result<(), PciDeviceError> {
        if range.is_empty() {
            return Err(PciDeviceError::SizeZero);
        }

        while let Some(&slot) = self
            .regions
            .iter()
            .find(|avail_range| avail_range.overlaps(range))
        {
            let _address = self.internal_allocate_from_slot(slot, range)?;
        }
        Ok(())
    }
}

struct VfioPciWorker {
    address: PciAddress,
    sysfs_path: PathBuf,
    vm_socket: Tube,
    name: String,
    pm_cap: Option<Arc<Mutex<VfioPmCap>>>,
    msix_cap: Option<Arc<Mutex<VfioMsixCap>>>,
}

impl VfioPciWorker {
    fn run(
        &mut self,
        req_irq_evt: Event,
        wakeup_evt: Event,
        acpi_notify_evt: Event,
        kill_evt: Event,
        msix_evt: Vec<Event>,
        is_in_low_power: Arc<Mutex<bool>>,
        gpe: Option<u32>,
        notification_val: Arc<Mutex<Vec<u32>>>,
    ) {
        #[derive(EventToken, Debug)]
        enum Token {
            ReqIrq,
            WakeUp,
            AcpiNotifyEvent,
            Kill,
            MsixIrqi { index: usize },
        }

        let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
            (&req_irq_evt, Token::ReqIrq),
            (&wakeup_evt, Token::WakeUp),
            (&acpi_notify_evt, Token::AcpiNotifyEvent),
            (&kill_evt, Token::Kill),
        ]) {
            Ok(pc) => pc,
            Err(e) => {
                error!("{} failed creating vfio WaitContext: {}", self.name, e);
                return;
            }
        };

        for (index, msix_int) in msix_evt.iter().enumerate() {
            wait_ctx
                .add(msix_int, Token::MsixIrqi { index })
                .expect("Failed to create vfio WaitContext for msix interrupt event")
        }

        'wait: loop {
            let events = match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("{} failed polling vfio events: {}", self.name, e);
                    break;
                }
            };

            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::MsixIrqi { index } => {
                        if let Some(msix_cap) = &self.msix_cap {
                            msix_cap.lock().trigger(index);
                        }
                    }
                    Token::ReqIrq => {
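                        // The host requested the device back (VFIO's REQ
                        // irq); respond by hot-unplugging it from the guest.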
                        let device = HotPlugDeviceInfo {
                            device_type: HotPlugDeviceType::EndPoint,
                            path: self.sysfs_path.clone(),
                            hp_interrupt: false,
                        };

                        let request = VmRequest::HotPlugVfioCommand { device, add: false };
                        if self.vm_socket.send(&request).is_ok() {
                            if let Err(e) = self.vm_socket.recv::<VmResponse>() {
                                error!("{} failed to remove vfio_device: {}", self.name, e);
                            } else {
                                break 'wait;
                            }
                        }
                    }
                    Token::WakeUp => {
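                        // A wakeup was signalled. If the guest had put the
                        // device in a low-power state and the PM capability
                        // requests PME, forward a PME to the VM.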
                        let _ = wakeup_evt.wait();

                        if *is_in_low_power.lock() {
                            if let Some(pm_cap) = &self.pm_cap {
                                if pm_cap.lock().should_trigger_pme() {
                                    let request =
                                        VmRequest::PciPme(self.address.pme_requester_id());
                                    if self.vm_socket.send(&request).is_ok() {
                                        if let Err(e) = self.vm_socket.recv::<VmResponse>() {
                                            error!("{} failed to send PME: {}", self.name, e);
                                        }
                                    }
                                }
                            }
                        }
                    }
                    Token::AcpiNotifyEvent => {
                        if let Some(gpe) = gpe {
                            if let Ok(val) = base::EventExt::read_count(&acpi_notify_evt) {
                                notification_val.lock().push(val as u32);
                                let request = VmRequest::Gpe {
                                    gpe,
                                    clear_evt: None,
                                };
                                if self.vm_socket.send(&request).is_ok() {
                                    if let Err(e) = self.vm_socket.recv::<VmResponse>() {
                                        error!("{} failed to send GPE: {}", self.name, e);
                                    }
                                }
                            } else {
                                error!("{} failed to read acpi_notify_evt", self.name);
                            }
                        }
                    }
                    Token::Kill => break 'wait,
                }
            }
        }
    }
}

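// Extracts the "next capability offset" field from a PCIe extended capability
// header: bits 31:20, masked to a dword boundary (0xffc) since capability
// structures must be dword-aligned.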
fn get_next_from_extcap_header(cap_header: u32) -> u32 {
    (cap_header >> 20) & 0xffc
}

fn is_skipped_ext_cap(cap_id: u16) -> bool {
    matches!(
        cap_id,
        // SR-IOV/ARI/Resizable BAR capabilities are not well handled and should not be exposed
        PCI_EXT_CAP_ID_ARI | PCI_EXT_CAP_ID_SRIOV | PCI_EXT_CAP_ID_REBAR
    )
}

enum DeviceData {
    IntelGfxData { opregion_index: u32 },
}

/// PCI Express Extended Capabilities information
#[derive(Copy, Clone)]
struct ExtCap {
    /// Capability offset in config space
    offset: u32,
    /// Capability size
    size: u32,
    /// Next capability offset; for exposed capabilities this is rewritten to
    /// point at the next non-skipped capability
    next: u16,
    /// Whether this capability is hidden from the guest
    is_skipped: bool,
}

/// Implements a VFIO PCI device, so a host PCI device can be added to the VM.
pub struct VfioPciDevice {
    device: Arc<VfioDevice>,
    config: VfioPciConfig,
    hotplug: bool,
    hotplug_bus_number: Option<u8>,
    preferred_address: PciAddress,
    pci_address: Option<PciAddress>,
    interrupt_evt: Option<IrqLevelEvent>,
    acpi_notification_evt: Option<Event>,
    mmio_regions: Vec<PciBarConfiguration>,
    io_regions: Vec<PciBarConfiguration>,
    pm_cap: Option<Arc<Mutex<VfioPmCap>>>,
    msi_cap: Option<VfioMsiCap>,
    msix_cap: Option<Arc<Mutex<VfioMsixCap>>>,
    irq_type: Option<VfioIrqType>,
    vm_memory_client: VmMemoryClient,
    device_data: Option<DeviceData>,
    pm_evt: Option<Event>,
    is_in_low_power: Arc<Mutex<bool>>,
    worker_thread: Option<WorkerThread<VfioPciWorker>>,
    vm_socket_vm: Option<Tube>,
    sysfs_path: PathBuf,
    // PCI Express Extended Capabilities
    ext_caps: Vec<ExtCap>,
    vcfg_shm_mmap: Option<MemoryMapping>,
    mapped_mmio_bars: BTreeMap<PciBarIndex, (u64, Vec<VmMemoryRegionId>)>,
    activated: bool,
    acpi_notifier_val: Arc<Mutex<Vec<u32>>>,
    gpe: Option<u32>,
    base_class_code: PciClassCode,
}

impl VfioPciDevice {
    /// Constructs a new VFIO PCI device for the given VFIO device.
    pub fn new(
        sysfs_path: &Path,
        device: VfioDevice,
        hotplug: bool,
        hotplug_bus_number: Option<u8>,
        guest_address: Option<PciAddress>,
        vfio_device_socket_msi: Tube,
        vfio_device_socket_msix: Tube,
        vm_memory_client: VmMemoryClient,
        vfio_device_socket_vm: Tube,
    ) -> Result<Self, PciDeviceError> {
        let preferred_address = if let Some(bus_num) = hotplug_bus_number {
            debug!("hotplug bus {}", bus_num);
            PciAddress {
                // The caller specifies the PCIe bus number for a hotplug device.
                bus: bus_num,
                // dev and func must be 0, otherwise the PCIe root port can't detect it.
                dev: 0,
                func: 0,
            }
        } else if let Some(guest_address) = guest_address {
            debug!("guest PCI address {}", guest_address);
            guest_address
        } else {
            let addr = PciAddress::from_str(device.device_name()).map_err(|e| {
                PciDeviceError::PciAddressParseFailure(device.device_name().clone(), e)
            })?;
            debug!("parsed device PCI address {}", addr);
            addr
        };

        let dev = Arc::new(device);
        let config = VfioPciConfig::new(Arc::clone(&dev));
        let mut msi_socket = Some(vfio_device_socket_msi);
        let mut msix_socket = Some(vfio_device_socket_msix);
        let mut msi_cap: Option<VfioMsiCap> = None;
        let mut msix_cap: Option<Arc<Mutex<VfioMsixCap>>> = None;
        let mut pm_cap: Option<Arc<Mutex<VfioPmCap>>> = None;

        let mut is_pcie = false;
        let mut cap_next: u32 = config.read_config::<u8>(PCI_CAPABILITY_LIST).into();
        let vendor_id: u16 = config.read_config(PCI_VENDOR_ID);
        let device_id: u16 = config.read_config(PCI_DEVICE_ID);
        let base_class_code = PciClassCode::try_from(config.read_config::<u8>(PCI_BASE_CLASS_CODE))
            .unwrap_or(PciClassCode::Other);

        let pci_id = PciId::new(vendor_id, device_id);

        while cap_next != 0 {
            let cap_id: u8 = config.read_config(cap_next);
            if cap_id == PCI_CAP_ID_PM {
                pm_cap = Some(Arc::new(Mutex::new(VfioPmCap::new(&config, cap_next))));
            } else if cap_id == PCI_CAP_ID_MSI {
                if let Some(msi_socket) = msi_socket.take() {
                    msi_cap = Some(VfioMsiCap::new(
                        &config,
                        cap_next,
                        msi_socket,
                        pci_id.into(),
                        dev.device_name().to_string(),
                    ));
                }
            } else if cap_id == PCI_CAP_ID_MSIX {
                if let Some(msix_socket) = msix_socket.take() {
                    msix_cap = Some(Arc::new(Mutex::new(VfioMsixCap::new(
                        &config,
                        cap_next,
                        msix_socket,
                        pci_id.into(),
                        dev.device_name().to_string(),
                    ))));
                }
            } else if cap_id == PciCapabilityID::PciExpress as u8 {
                is_pcie = true;
            }
            let offset = cap_next + PCI_MSI_NEXT_POINTER;
            cap_next = config.read_config::<u8>(offset).into();
        }

        let mut ext_caps: Vec<ExtCap> = Vec::new();
        if is_pcie {
            let mut ext_cap_next: u32 = PCI_CONFIG_SPACE_SIZE;
            while ext_cap_next != 0 {
                let ext_cap_config: u32 = config.read_config::<u32>(ext_cap_next);
                if ext_cap_config == 0 {
                    break;
                }
                ext_caps.push(ExtCap {
                    offset: ext_cap_next,
                    // Calculated below once the chain is known.
                    size: 0,
                    // Initialized to the device-reported next offset.
                    next: get_next_from_extcap_header(ext_cap_config) as u16,
                    is_skipped: is_skipped_ext_cap((ext_cap_config & 0xffff) as u16),
                });
                ext_cap_next = get_next_from_extcap_header(ext_cap_config);
            }

            // Manage extended caps
            //
            // Extended capabilities are chained with each pointing to the next, so
            // we can drop anything other than the head of the chain simply by
            // modifying the previous next pointer. For the head of the chain, we
            // can modify the capability ID to something that cannot match a valid
            // capability. Capability ID PCI_EXT_CAP_ID_CAC works for this since it
            // is no longer supported.
            //
            // Walk in reverse order by offset:
            ext_caps.sort_by(|a, b| b.offset.cmp(&a.offset));
            let mut next_offset: u32 = PCIE_CONFIG_SPACE_SIZE;
            let mut non_skipped_next: u16 = 0;
            for ext_cap in ext_caps.iter_mut() {
                if !ext_cap.is_skipped {
                    ext_cap.next = non_skipped_next;
                    non_skipped_next = ext_cap.offset as u16;
                } else if ext_cap.offset == PCI_CONFIG_SPACE_SIZE {
                    ext_cap.next = non_skipped_next;
                }
                ext_cap.size = next_offset - ext_cap.offset;
                next_offset = ext_cap.offset;
            }
            // Restore ascending order by offset.
            ext_caps.reverse();
        }

        let is_intel_gfx =
            base_class_code == PciClassCode::DisplayController && vendor_id == PCI_VENDOR_ID_INTEL;
        let device_data = if is_intel_gfx {
            Some(DeviceData::IntelGfxData {
                opregion_index: u32::MAX,
            })
        } else {
            None
        };

        Ok(VfioPciDevice {
            device: dev,
            config,
            hotplug,
            hotplug_bus_number,
            preferred_address,
            pci_address: None,
            interrupt_evt: None,
            acpi_notification_evt: None,
            mmio_regions: Vec::new(),
            io_regions: Vec::new(),
            pm_cap,
            msi_cap,
            msix_cap,
            irq_type: None,
            vm_memory_client,
            device_data,
            pm_evt: None,
            is_in_low_power: Arc::new(Mutex::new(false)),
            worker_thread: None,
            vm_socket_vm: Some(vfio_device_socket_vm),
            sysfs_path: sysfs_path.to_path_buf(),
            ext_caps,
            vcfg_shm_mmap: None,
            mapped_mmio_bars: BTreeMap::new(),
            activated: false,
            acpi_notifier_val: Arc::new(Mutex::new(Vec::new())),
            gpe: None,
            base_class_code,
        })
    }

    /// Gets the PCI address of the device, if one has already been allocated.
    pub fn pci_address(&self) -> Option<PciAddress> {
        self.pci_address
    }

    pub fn is_gfx(&self) -> bool {
        self.base_class_code == PciClassCode::DisplayController
    }

    fn is_intel_gfx(&self) -> bool {
        matches!(self.device_data, Some(DeviceData::IntelGfxData { .. }))
    }

    fn enable_acpi_notification(&mut self) -> Result<(), PciDeviceError> {
        if let Some(ref acpi_notification_evt) = self.acpi_notification_evt {
            return self
                .device
                .acpi_notification_evt_enable(acpi_notification_evt, VFIO_PCI_ACPI_NTFY_IRQ_INDEX)
                .map_err(|_| PciDeviceError::AcpiNotifySetupFailed);
        }
        Err(PciDeviceError::AcpiNotifySetupFailed)
    }

    #[allow(dead_code)]
    fn disable_acpi_notification(&mut self) -> Result<(), PciDeviceError> {
        if let Some(ref _acpi_notification_evt) = self.acpi_notification_evt {
            return self
                .device
                .acpi_notification_disable(VFIO_PCI_ACPI_NTFY_IRQ_INDEX)
                .map_err(|_| PciDeviceError::AcpiNotifyDeactivationFailed);
        }
        Err(PciDeviceError::AcpiNotifyDeactivationFailed)
    }

    #[allow(dead_code)]
    fn test_acpi_notification(&mut self, val: u32) -> Result<(), PciDeviceError> {
        if let Some(ref _acpi_notification_evt) = self.acpi_notification_evt {
            return self
                .device
                .acpi_notification_test(VFIO_PCI_ACPI_NTFY_IRQ_INDEX, val)
                .map_err(|_| PciDeviceError::AcpiNotifyTestFailed);
        }
        Err(PciDeviceError::AcpiNotifyTestFailed)
    }

    fn enable_intx(&mut self) {
        if let Some(ref interrupt_evt) = self.interrupt_evt {
            if let Err(e) = self.device.irq_enable(
                &[Some(interrupt_evt.get_trigger())],
                VFIO_PCI_INTX_IRQ_INDEX,
                0,
            ) {
                error!("{} Intx enable failed: {}", self.debug_label(), e);
                return;
            }
            if let Err(e) = self.device.irq_mask(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("{} Intx mask failed: {}", self.debug_label(), e);
                self.disable_intx();
                return;
            }
            if let Err(e) = self
                .device
                .resample_virq_enable(interrupt_evt.get_resample(), VFIO_PCI_INTX_IRQ_INDEX)
            {
                error!("{} resample enable failed: {}", self.debug_label(), e);
                self.disable_intx();
                return;
            }
            if let Err(e) = self.device.irq_unmask(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("{} Intx unmask failed: {}", self.debug_label(), e);
                self.disable_intx();
                return;
            }
            self.irq_type = Some(VfioIrqType::Intx);
        }
    }

    fn disable_intx(&mut self) {
        if let Err(e) = self.device.irq_disable(VFIO_PCI_INTX_IRQ_INDEX) {
            error!("{} Intx disable failed: {}", self.debug_label(), e);
        }
        self.irq_type = None;
    }

    fn disable_irqs(&mut self) {
        match self.irq_type {
            Some(VfioIrqType::Msi) => self.disable_msi(),
            Some(VfioIrqType::Msix) => self.disable_msix(),
            _ => (),
        }

        // disable_msi() and disable_msix() above re-enable INTx, so disable
        // INTx here as well.
        if let Some(VfioIrqType::Intx) = self.irq_type {
            self.disable_intx();
        }
    }

970
971    fn enable_msi(&mut self) {
972        self.disable_irqs();
973
974        let irqfd = match &self.msi_cap {
975            Some(cap) => {
976                if let Some(fd) = cap.get_msi_irqfd() {
977                    fd
978                } else {
979                    self.enable_intx();
980                    return;
981                }
982            }
983            None => {
984                self.enable_intx();
985                return;
986            }
987        };
988
989        if let Err(e) = self
990            .device
991            .irq_enable(&[Some(irqfd)], VFIO_PCI_MSI_IRQ_INDEX, 0)
992        {
993            error!("{} failed to enable msi: {}", self.debug_label(), e);
994            self.enable_intx();
995            return;
996        }
997
998        self.irq_type = Some(VfioIrqType::Msi);
999    }
1000
1001    fn disable_msi(&mut self) {
1002        if let Err(e) = self.device.irq_disable(VFIO_PCI_MSI_IRQ_INDEX) {
1003            error!("{} failed to disable msi: {}", self.debug_label(), e);
1004            return;
1005        }
1006        self.irq_type = None;
1007
1008        self.enable_intx();
1009    }
1010
    fn enable_msix(&mut self) {
        if self.msix_cap.is_none() {
            return;
        }

        self.disable_irqs();
        let cap = self.msix_cap.as_ref().unwrap().lock();
        let vector_in_use = cap.get_msix_irqfds().iter().any(|&irq| irq.is_some());

        let mut failed = false;
        if !vector_in_use {
            // If no msix vectors are currently in use, explicitly assign a new
            // eventfd to vector 0, then enable it and immediately disable it,
            // so that vfio will activate the physical device. If vectors are
            // already in use, just enable them instead.
            let fd = Event::new().expect("failed to create event");
            let table_size = cap.table_size();
            let mut irqfds = vec![None; table_size];
            irqfds[0] = Some(&fd);
            if let Err(e) = self.device.irq_enable(&irqfds, VFIO_PCI_MSIX_IRQ_INDEX, 0) {
                error!("{} failed to enable msix: {}", self.debug_label(), e);
                failed = true;
            }
            irqfds[0] = None;
            if let Err(e) = self.device.irq_enable(&irqfds, VFIO_PCI_MSIX_IRQ_INDEX, 0) {
                error!("{} failed to enable msix: {}", self.debug_label(), e);
                failed = true;
            }
        } else {
            let result = self
                .device
                .irq_enable(&cap.get_msix_irqfds(), VFIO_PCI_MSIX_IRQ_INDEX, 0);
            if let Err(e) = result {
                error!("{} failed to enable msix: {}", self.debug_label(), e);
                failed = true;
            }
        }

        std::mem::drop(cap);
        if failed {
            self.enable_intx();
            return;
        }
        self.irq_type = Some(VfioIrqType::Msix);
    }

    fn disable_msix(&mut self) {
        if self.msix_cap.is_none() {
            return;
        }
        if let Err(e) = self.device.irq_disable(VFIO_PCI_MSIX_IRQ_INDEX) {
            error!("{} failed to disable msix: {}", self.debug_label(), e);
            return;
        }
        self.irq_type = None;
        self.enable_intx();
    }

    fn msix_vectors_update(&self) -> Result<(), VfioError> {
        if let Some(cap) = &self.msix_cap {
            self.device
                .irq_enable(&cap.lock().get_msix_irqfds(), VFIO_PCI_MSIX_IRQ_INDEX, 0)?;
        }
        Ok(())
    }

    fn msix_vector_update(&self, index: usize, irqfd: Option<&Event>) {
        if let Err(e) = self
            .device
            .irq_enable(&[irqfd], VFIO_PCI_MSIX_IRQ_INDEX, index as u32)
        {
            error!(
                "{} failed to update msix vector {}: {}",
                self.debug_label(),
                index,
                e
            );
        }
    }

    fn adjust_bar_mmap(
        &self,
        bar_mmaps: Vec<vfio_region_sparse_mmap_area>,
        remove_mmaps: &[AddressRange],
    ) -> Vec<vfio_region_sparse_mmap_area> {
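        // Start from the device-reported sparse mmap areas and carve out every
        // range in `remove_mmaps` (e.g. the MSI-X table and PBA), leaving only
        // the ranges that may be mapped directly into the guest.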
        let mut mmaps: Vec<vfio_region_sparse_mmap_area> = Vec::with_capacity(bar_mmaps.len());
        let pgmask = (pagesize() as u64) - 1;

        for mmap in bar_mmaps.iter() {
            let mmap_range = if let Some(mmap_range) =
                AddressRange::from_start_and_size(mmap.offset, mmap.size)
            {
                mmap_range
            } else {
                continue;
            };
            let mut to_mmap = match VfioResourceAllocator::new(mmap_range) {
                Ok(a) => a,
                Err(e) => {
                    error!("{} adjust_bar_mmap failed: {}", self.debug_label(), e);
                    mmaps.clear();
                    return mmaps;
                }
            };

            for &(mut remove_range) in remove_mmaps.iter() {
                remove_range = remove_range.intersect(mmap_range);
                if !remove_range.is_empty() {
                    // Align the removed range outward to page boundaries.
                    let begin = remove_range.start & !pgmask;
                    let end = ((remove_range.end + 1 + pgmask) & !pgmask) - 1;
                    let remove_range = AddressRange::from_start_and_end(begin, end);
                    if let Err(e) = to_mmap.allocate_at_can_overlap(remove_range) {
                        error!("{} adjust_bar_mmap failed: {}", self.debug_label(), e);
                    }
                }
            }

            for mmap in to_mmap.regions {
                mmaps.push(vfio_region_sparse_mmap_area {
                    offset: mmap.start,
                    size: mmap.end - mmap.start + 1,
                });
            }
        }

        mmaps
    }

    fn remove_bar_mmap_msix(
        &self,
        bar_index: PciBarIndex,
        bar_mmaps: Vec<vfio_region_sparse_mmap_area>,
    ) -> Vec<vfio_region_sparse_mmap_area> {
        let msix_cap = &self.msix_cap.as_ref().unwrap().lock();
        let mut msix_regions = Vec::new();

        if let Some(t) = msix_cap.get_msix_table(bar_index) {
            msix_regions.push(t);
        }
        if let Some(p) = msix_cap.get_msix_pba(bar_index) {
            msix_regions.push(p);
        }

        if msix_regions.is_empty() {
            return bar_mmaps;
        }

        self.adjust_bar_mmap(bar_mmaps, &msix_regions)
    }

    fn add_bar_mmap(&self, index: PciBarIndex, bar_addr: u64) -> Vec<VmMemoryRegionId> {
        let mut mmaps_ids: Vec<VmMemoryRegionId> = Vec::new();
        if self.device.get_region_flags(index) & VFIO_REGION_INFO_FLAG_MMAP != 0 {
            // A BAR that holds the MSI-X table or PBA cannot be mmapped
            // directly; those pages must stay trapped so MSI-X can be
            // emulated.
            let mut mmaps = self.device.get_region_mmap(index);

            if self.msix_cap.is_some() && !self.device.get_region_msix_mmappable(index) {
                mmaps = self.remove_bar_mmap_msix(index, mmaps);
            }
            if mmaps.is_empty() {
                return mmaps_ids;
            }

            for mmap in mmaps.iter() {
                let mmap_offset = mmap.offset;
                let mmap_size = mmap.size;
                let guest_map_start = bar_addr + mmap_offset;
                let region_offset = self.device.get_region_offset(index);
                let offset = region_offset + mmap_offset;
                let descriptor = match self.device.device_file().try_clone() {
                    Ok(device_file) => device_file.into(),
                    Err(_) => break,
                };
                match self.vm_memory_client.register_memory(
                    VmMemorySource::Descriptor {
                        descriptor,
                        offset,
                        size: mmap_size,
                    },
                    VmMemoryDestination::GuestPhysicalAddress(guest_map_start),
                    Protection::read_write(),
                    MemCacheType::CacheCoherent,
                ) {
                    Ok(id) => {
                        mmaps_ids.push(id);
                    }
                    Err(e) => {
                        error!("register_memory failed: {}", e);
                        break;
                    }
                }
            }
        }

        mmaps_ids
    }

    fn remove_bar_mmap(&self, mmap_ids: &[VmMemoryRegionId]) {
        for mmap_id in mmap_ids {
            if let Err(e) = self.vm_memory_client.unregister_memory(*mmap_id) {
                error!("unregister_memory failed: {}", e);
            }
        }
    }

    fn disable_bars_mmap(&mut self) {
        for (_, (_, mmap_ids)) in self.mapped_mmio_bars.iter() {
            self.remove_bar_mmap(mmap_ids);
        }
        self.mapped_mmio_bars.clear();
    }

    fn commit_bars_mmap(&mut self) {
        // Unmap all bars before remapping bars, to prevent issues with overlap
        let mut needs_map = Vec::new();
        for mmio_info in self.mmio_regions.iter() {
            let bar_idx = mmio_info.bar_index();
            let addr = mmio_info.address();

            if let Some((cur_addr, ids)) = self.mapped_mmio_bars.remove(&bar_idx) {
                if cur_addr == addr {
                    self.mapped_mmio_bars.insert(bar_idx, (cur_addr, ids));
                    continue;
                } else {
                    self.remove_bar_mmap(&ids);
                }
            }

            if addr != 0 {
                needs_map.push((bar_idx, addr));
            }
        }

        for (bar_idx, addr) in needs_map.iter() {
            let ids = self.add_bar_mmap(*bar_idx, *addr);
            self.mapped_mmio_bars.insert(*bar_idx, (*addr, ids));
        }
    }

    fn close(&mut self) {
        if let Some(msi) = self.msi_cap.as_mut() {
            msi.destroy();
        }
        if let Some(msix) = &self.msix_cap {
            msix.lock().destroy();
        }
        self.disable_bars_mmap();
        self.device.close();
    }

    fn start_work_thread(&mut self) {
        let vm_socket = match self.vm_socket_vm.take() {
            Some(socket) => socket,
            None => return,
        };

        let req_evt = match Event::new() {
            Ok(evt) => {
                if let Err(e) = self
                    .device
                    .irq_enable(&[Some(&evt)], VFIO_PCI_REQ_IRQ_INDEX, 0)
                {
                    error!("{} enable req_irq failed: {}", self.debug_label(), e);
                    return;
                }
                evt
            }
            Err(_) => return,
        };

        let (self_pm_evt, pm_evt) = match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
            Ok(v) => v,
            Err(e) => {
                error!(
                    "{} failed creating PM Event pair: {}",
                    self.debug_label(),
                    e
                );
                return;
            }
        };
        self.pm_evt = Some(self_pm_evt);

        let (self_acpi_notify_evt, acpi_notify_evt) =
            match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
                Ok(v) => v,
                Err(e) => {
                    error!(
                        "{} failed creating ACPI Event pair: {}",
                        self.debug_label(),
                        e
                    );
                    return;
                }
            };
        self.acpi_notification_evt = Some(self_acpi_notify_evt);

        if let Err(e) = self.enable_acpi_notification() {
            error!("{}: {}", self.debug_label(), e);
        }

        let mut msix_evt = Vec::new();
        if let Some(msix_cap) = &self.msix_cap {
            msix_evt = msix_cap.lock().clone_msix_evt();
        }

        let name = self.device.device_name().to_string();
        let address = self.pci_address.expect("Unassigned PCI Address.");
        let sysfs_path = self.sysfs_path.clone();
        let pm_cap = self.pm_cap.clone();
        let msix_cap = self.msix_cap.clone();
        let is_in_low_power = self.is_in_low_power.clone();
        let gpe_nr = self.gpe;
        let notification_val = self.acpi_notifier_val.clone();
        self.worker_thread = Some(WorkerThread::start("vfio_pci", move |kill_evt| {
            let mut worker = VfioPciWorker {
                address,
                sysfs_path,
                vm_socket,
                name,
                pm_cap,
                msix_cap,
            };
            worker.run(
                req_evt,
                pm_evt,
                acpi_notify_evt,
                kill_evt,
                msix_evt,
                is_in_low_power,
                gpe_nr,
                notification_val,
            );
            worker
        }));
        self.activated = true;
    }

    fn collect_bars(&mut self) -> Vec<PciBarConfiguration> {
        let mut i = VFIO_PCI_BAR0_REGION_INDEX;
        let mut mem_bars: Vec<PciBarConfiguration> = Vec::new();

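        // Probe each BAR's size with the standard sizing protocol: write all
        // 1s to the BAR register, read it back, mask off the low flag bits,
        // and take the two's complement of the remaining mask. For example, a
        // 4 KiB 32-bit memory BAR reads back 0xffff_f000 after masking, and
        // !0xffff_f000u32 + 1 == 0x1000.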
        while i <= VFIO_PCI_ROM_REGION_INDEX {
            let mut low: u32 = 0xffffffff;
            let offset: u32 = if i == VFIO_PCI_ROM_REGION_INDEX {
                0x30
            } else {
                0x10 + i * 4
            };
            self.config.write_config(low, offset);
            low = self.config.read_config(offset);

            let low_flag = low & 0xf;
            let is_64bit = low_flag & 0x4 == 0x4;
            if (low_flag & 0x1 == 0 || i == VFIO_PCI_ROM_REGION_INDEX) && low != 0 {
                let mut upper: u32 = 0xffffffff;
                if is_64bit {
                    self.config.write_config(upper, offset + 4);
                    upper = self.config.read_config(offset + 4);
                }

                low &= 0xffff_fff0;
                let mut size: u64 = u64::from(upper);
                size <<= 32;
                size |= u64::from(low);
                size = !size + 1;
                let region_type = if is_64bit {
                    PciBarRegionType::Memory64BitRegion
                } else {
                    PciBarRegionType::Memory32BitRegion
                };
                let prefetch = if low_flag & 0x8 == 0x8 {
                    PciBarPrefetchable::Prefetchable
                } else {
                    PciBarPrefetchable::NotPrefetchable
                };
                mem_bars.push(PciBarConfiguration::new(
                    i as usize,
                    size,
                    region_type,
                    prefetch,
                ));
            } else if low_flag & 0x1 == 0x1 {
                let size = !(low & 0xffff_fffc) + 1;
                self.io_regions.push(PciBarConfiguration::new(
                    i as usize,
                    size.into(),
                    PciBarRegionType::IoRegion,
                    PciBarPrefetchable::NotPrefetchable,
                ));
            }

            if is_64bit {
                i += 2;
            } else {
                i += 1;
            }
        }
        mem_bars
    }

    fn configure_barmem(&mut self, bar_info: &PciBarConfiguration, bar_addr: u64) {
        let offset: u32 = bar_info.reg_index() as u32 * 4;
        let mmio_region = *bar_info;
        self.mmio_regions.push(mmio_region.set_address(bar_addr));

        let val: u32 = self.config.read_config(offset);
        let low = ((bar_addr & !0xf) as u32) | (val & 0xf);
        self.config.write_config(low, offset);
        if bar_info.is_64bit_memory() {
            let upper = (bar_addr >> 32) as u32;
            self.config.write_config(upper, offset + 4);
        }
    }

    fn allocate_root_barmem(
        &mut self,
        mem_bars: &[PciBarConfiguration],
        resources: &mut SystemAllocator,
    ) -> Result<Vec<BarRange>, PciDeviceError> {
        let address = self.pci_address.unwrap();
        let mut ranges: Vec<BarRange> = Vec::new();
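        // PCI BARs are naturally aligned (the base address must be a multiple
        // of the BAR size), so request `bar_size` alignment from the system
        // allocator below.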
        for mem_bar in mem_bars {
            let bar_size = mem_bar.size();
            let mut bar_addr: u64 = 0;
            // Don't allocate mmio for a hotplug device; the guest OS will
            // allocate it from its parent bridge's window.
            if !self.hotplug {
                bar_addr = resources
                    .allocate_mmio(
                        bar_size,
                        Alloc::PciBar {
                            bus: address.bus,
                            dev: address.dev,
                            func: address.func,
                            bar: mem_bar.bar_index() as u8,
                        },
                        "vfio_bar".to_string(),
                        AllocOptions::new()
                            .prefetchable(mem_bar.is_prefetchable())
                            .max_address(if mem_bar.is_64bit_memory() {
                                u64::MAX
                            } else {
                                u32::MAX.into()
                            })
                            .align(bar_size),
                    )
                    .map_err(|e| PciDeviceError::IoAllocationFailed(bar_size, e))?;
                ranges.push(BarRange {
                    addr: bar_addr,
                    size: bar_size,
                    prefetchable: mem_bar.is_prefetchable(),
                });
            }
            self.configure_barmem(mem_bar, bar_addr);
        }
        Ok(ranges)
    }

    fn allocate_nonroot_barmem(
        &mut self,
        mem_bars: &mut [PciBarConfiguration],
        resources: &mut SystemAllocator,
    ) -> Result<Vec<BarRange>, PciDeviceError> {
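        // Two-pass layout: first assign each BAR an offset within a virtual
        // window (index 0: non-prefetchable, index 1: prefetchable) using
        // scratch allocators, accumulating each window's size and alignment;
        // then allocate the real bridge windows from the system allocator.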
1480        const NON_PREFETCHABLE: usize = 0;
1481        const PREFETCHABLE: usize = 1;
1482        const ARRAY_SIZE: usize = 2;
1483        let mut membars: [Vec<PciBarConfiguration>; ARRAY_SIZE] = [Vec::new(), Vec::new()];
1484        let mut allocator: [VfioResourceAllocator; ARRAY_SIZE] = [
1485            match VfioResourceAllocator::new(AddressRange::from_start_and_end(0, u32::MAX as u64)) {
1486                Ok(a) => a,
1487                Err(e) => {
1488                    error!(
1489                        "{} init nonroot VfioResourceAllocator failed: {}",
1490                        self.debug_label(),
1491                        e
1492                    );
1493                    return Err(e);
1494                }
1495            },
1496            match VfioResourceAllocator::new(AddressRange::from_start_and_end(0, u64::MAX)) {
1497                Ok(a) => a,
1498                Err(e) => {
1499                    error!(
1500                        "{} init nonroot VfioResourceAllocator failed: {}",
1501                        self.debug_label(),
1502                        e
1503                    );
1504                    return Err(e);
1505                }
1506            },
1507        ];
1508        let mut memtype: [MmioType; ARRAY_SIZE] = [MmioType::Low, MmioType::High];
1509        // the window must be 1M-aligned as per the PCI spec
1510        let mut window_sz: [u64; ARRAY_SIZE] = [0; 2];
1511        let mut alignment: [u64; ARRAY_SIZE] = [0x100000; 2];
1512
1513        // Descend by bar size, this could reduce allocated size for all the bars.
1514        mem_bars.sort_by_key(|a| Reverse(a.size()));
1515        for mem_bar in mem_bars {
1516            let prefetchable = mem_bar.is_prefetchable();
1517            let is_64bit = mem_bar.is_64bit_memory();
1518
1519            // if one prefetchable bar is 32bit, all the prefetchable bars should be in Low MMIO,
1520            // as all the prefetchable bars should be in one region
1521            if prefetchable && !is_64bit {
1522                memtype[PREFETCHABLE] = MmioType::Low;
1523            }
1524            let i = if prefetchable {
1525                PREFETCHABLE
1526            } else {
1527                NON_PREFETCHABLE
1528            };
1529            let bar_size = mem_bar.size();
1530            let start = match allocator[i].allocate_with_align(bar_size, bar_size) {
1531                Ok(s) => s,
1532                Err(e) => {
1533                    error!(
1534                        "{} nonroot allocate_wit_align failed: {}",
1535                        self.debug_label(),
1536                        e
1537                    );
1538                    return Err(e);
1539                }
1540            };
1541            window_sz[i] = max(window_sz[i], start + bar_size);
1542            alignment[i] = max(alignment[i], bar_size);
1543            let mem_info = (*mem_bar).set_address(start);
1544            membars[i].push(mem_info);
1545        }
1546
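        // For example, BARs of 16MiB and 4MiB packed in descending order land
        // at window offsets 0 and 16MiB (each BAR is aligned to its own size),
        // so the window size is 20MiB and the window alignment is the largest
        // BAR size, 16MiB.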
        let address = self.pci_address.unwrap();
        let mut ranges: Vec<BarRange> = Vec::new();
        // The array index is the window kind: NON_PREFETCHABLE or PREFETCHABLE.
        for (i, bars) in membars.iter().enumerate() {
            if bars.is_empty() {
                continue;
            }

            let mut window_addr: u64 = 0;
            // Don't allocate MMIO for a hotplugged device; the guest OS will
            // allocate it from its parent bridge's window.
            if !self.hotplug {
                // Round the window size up to the required 1MiB boundary.
                window_sz[i] = (window_sz[i] + 0xfffff) & !0xfffff;
                let alloc = if i == NON_PREFETCHABLE {
                    Alloc::PciBridgeWindow {
                        bus: address.bus,
                        dev: address.dev,
                        func: address.func,
                    }
                } else {
                    Alloc::PciBridgePrefetchWindow {
                        bus: address.bus,
                        dev: address.dev,
                        func: address.func,
                    }
                };
                window_addr = resources
                    .mmio_allocator(memtype[i])
                    .allocate_with_align(
                        window_sz[i],
                        alloc,
                        "vfio_bar_window".to_string(),
                        alignment[i],
                    )
                    .map_err(|e| PciDeviceError::IoAllocationFailed(window_sz[i], e))?;
                for mem_info in bars {
                    let bar_addr = window_addr + mem_info.address();
                    ranges.push(BarRange {
                        addr: bar_addr,
                        size: mem_info.size(),
                        prefetchable: mem_info.is_prefetchable(),
                    });
                }
            }

            for mem_info in bars {
                let bar_addr = window_addr + mem_info.address();
                self.configure_barmem(mem_info, bar_addr);
            }
        }
        Ok(ranges)
    }

    /// Returns the maximum supported IOVA address of this VFIO PCI device.
    pub fn get_max_iova(&self) -> u64 {
        self.device.get_max_addr()
    }

    fn get_ext_cap_by_reg(&self, reg: u32) -> Option<ExtCap> {
        self.ext_caps
            .iter()
            .find(|ext_cap| reg >= ext_cap.offset && reg < ext_cap.offset + ext_cap.size)
            .cloned()
    }

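    /// Returns true if `reg` falls inside an extended capability that is
    /// hidden from the guest, in which case writes to it are dropped.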
    fn is_skipped_reg(&self, reg: u32) -> bool {
        // Fast path: nothing is skipped in the standard PCI config space.
        if reg < PCI_CONFIG_SPACE_SIZE {
            return false;
        }

        self.get_ext_cap_by_reg(reg)
            .is_some_and(|cap| cap.is_skipped)
    }
}

impl PciDevice for VfioPciDevice {
    fn debug_label(&self) -> String {
        format!("vfio {} device", self.device.device_name())
    }

    fn preferred_address(&self) -> Option<PciAddress> {
        Some(self.preferred_address)
    }

    fn allocate_address(
        &mut self,
        resources: &mut SystemAllocator,
    ) -> Result<PciAddress, PciDeviceError> {
        if self.pci_address.is_none() {
            let mut address = self.preferred_address;
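            // Reserve the preferred address if it is free; for hotplugged
            // devices (hotplug_bus_number is set), fall back to the next
            // function number on the same device, up to function 7.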
            while address.func < 8 {
                if resources.reserve_pci(address, self.debug_label()) {
                    self.pci_address = Some(address);
                    break;
                } else if self.hotplug_bus_number.is_none() {
                    break;
                } else {
                    address.func += 1;
                }
            }
            if let Some(msi_cap) = &mut self.msi_cap {
                msi_cap.config.set_pci_address(self.pci_address.unwrap());
            }
            if let Some(msix_cap) = &mut self.msix_cap {
                msix_cap
                    .lock()
                    .config
                    .set_pci_address(self.pci_address.unwrap());
            }
        }
        self.pci_address.ok_or(PciDeviceError::PciAllocationFailed)
    }

    fn keep_rds(&self) -> Vec<RawDescriptor> {
        let mut rds = self.device.keep_rds();
        if let Some(ref interrupt_evt) = self.interrupt_evt {
            rds.extend(interrupt_evt.as_raw_descriptors());
        }
        rds.push(self.vm_memory_client.as_raw_descriptor());
        if let Some(vm_socket_vm) = &self.vm_socket_vm {
            rds.push(vm_socket_vm.as_raw_descriptor());
        }
        if let Some(msi_cap) = &self.msi_cap {
            rds.push(msi_cap.config.get_msi_socket());
        }
        if let Some(msix_cap) = &self.msix_cap {
            rds.extend(msix_cap.lock().as_raw_descriptors());
        }
        rds
    }

    fn preferred_irq(&self) -> PreferredIrq {
        // Is INTx configured?
        let pin = match self.config.read_config::<u8>(PCI_INTERRUPT_PIN) {
            1 => PciInterruptPin::IntA,
            2 => PciInterruptPin::IntB,
            3 => PciInterruptPin::IntC,
            4 => PciInterruptPin::IntD,
            _ => return PreferredIrq::None,
        };

        // TODO: replace sysfs/irq value parsing with a vfio interface that
        //       reports the host-allocated interrupt number and type.
        let path = self.sysfs_path.join("irq");
        let gsi = fs::read_to_string(path)
            .map(|v| v.trim().parse::<u32>().unwrap_or(0))
            .unwrap_or(0);

        PreferredIrq::Fixed { pin, gsi }
    }

    fn assign_irq(&mut self, irq_evt: IrqLevelEvent, pin: PciInterruptPin, irq_num: u32) {
        // Keep event/resample event references.
        self.interrupt_evt = Some(irq_evt);

        // Enable INTx.
        self.enable_intx();

        self.config
            .write_config(pin.to_mask() as u8, PCI_INTERRUPT_PIN);
        self.config.write_config(irq_num as u8, PCI_INTERRUPT_NUM);
    }

    fn allocate_io_bars(
        &mut self,
        resources: &mut SystemAllocator,
    ) -> Result<Vec<BarRange>, PciDeviceError> {
        let address = self
            .pci_address
            .expect("allocate_address must be called prior to allocate_io_bars");

        let mut mem_bars = self.collect_bars();

        let ranges = if address.bus == 0 {
            self.allocate_root_barmem(&mem_bars, resources)?
        } else {
            self.allocate_nonroot_barmem(&mut mem_bars, resources)?
        };

        // Quirk: enable IGD memory access for guest VGA arbitration; otherwise
        // the kernel VGA arbiter driver doesn't claim this VGA device and Xorg
        // fails to start.
        if self.is_intel_gfx() {
            let mut cmd = self.config.read_config::<u8>(PCI_COMMAND);
            cmd |= PCI_COMMAND_MEMORY;
            self.config.write_config(cmd, PCI_COMMAND);
        }
        Ok(ranges)
    }

    fn allocate_device_bars(
        &mut self,
        resources: &mut SystemAllocator,
    ) -> Result<Vec<BarRange>, PciDeviceError> {
        let mut ranges: Vec<BarRange> = Vec::new();

        if !self.is_intel_gfx() {
            return Ok(ranges);
        }

        // Expose the Intel graphics OpRegion as an MMIO BAR: allocate a GPA
        // for it, then write that GPA into the PCI config register.
        if let Some((index, size)) = self.device.get_cap_type_info(
            VFIO_REGION_TYPE_PCI_VENDOR_TYPE | (PCI_VENDOR_ID_INTEL as u32),
            VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
        ) {
            let address = self
                .pci_address
                .expect("allocate_address must be called prior to allocate_device_bars");
            let bar_addr = resources
                .allocate_mmio(
                    size,
                    Alloc::PciBar {
                        bus: address.bus,
                        dev: address.dev,
                        func: address.func,
                        bar: (index * 4) as u8,
                    },
                    "vfio_bar".to_string(),
                    AllocOptions::new().max_address(u32::MAX.into()),
                )
                .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;
            ranges.push(BarRange {
                addr: bar_addr,
                size,
                prefetchable: false,
            });
            self.device_data = Some(DeviceData::IntelGfxData {
                opregion_index: index,
            });

            self.mmio_regions.push(
                PciBarConfiguration::new(
                    index as usize,
                    size,
                    PciBarRegionType::Memory32BitRegion,
                    PciBarPrefetchable::NotPrefetchable,
                )
                .set_address(bar_addr),
            );
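            // 0xFC is the ASLS register in IGD config space, which holds the
            // OpRegion base address.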
            self.config.write_config(bar_addr as u32, 0xFC);
        }

        Ok(ranges)
    }

    fn get_bar_configuration(&self, bar_num: usize) -> Option<PciBarConfiguration> {
        for region in self.mmio_regions.iter().chain(self.io_regions.iter()) {
            if region.bar_index() == bar_num {
                let command: u8 = self.config.read_config(PCI_COMMAND);
                if (region.is_memory() && (command & PCI_COMMAND_MEMORY == 0)) || region.is_io() {
                    return None;
                } else {
                    return Some(*region);
                }
            }
        }

        None
    }

    fn register_device_capabilities(&mut self) -> Result<(), PciDeviceError> {
        Ok(())
    }

    fn read_config_register(&self, reg_idx: usize) -> u32 {
        let reg: u32 = (reg_idx * 4) as u32;
        let mut config: u32 = self.config.read_config(reg);

        // See VfioPciDevice::new for details on how extended caps are managed.
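        // An extended capability header is laid out as: bits 15:0 capability
        // ID, bits 19:16 version, bits 31:20 offset of the next capability.
        // The code below rewrites the next pointer to skip hidden caps, and
        // replaces a hidden cap at the mandatory first position (0x100) with
        // a harmless CAC placeholder header.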
        if reg >= PCI_CONFIG_SPACE_SIZE {
            let ext_cap = self.get_ext_cap_by_reg(reg);
            if let Some(ext_cap) = ext_cap {
                if ext_cap.offset == reg {
                    config = (config & !(0xffc << 20)) | (((ext_cap.next & 0xffc) as u32) << 20);
                }

                if ext_cap.is_skipped {
                    if reg == PCI_CONFIG_SPACE_SIZE {
                        config = (config & (0xffc << 20)) | (PCI_EXT_CAP_ID_CAC as u32);
                    } else {
                        config = 0;
                    }
                }
            }
        }

        // Ignore I/O BARs.
        if (0x10..=0x24).contains(&reg) {
            let bar_idx = (reg as usize - 0x10) / 4;
            if let Some(bar) = self.get_bar_configuration(bar_idx) {
                if bar.is_io() {
                    config = 0;
                }
            }
        } else if let Some(msix_cap) = &self.msix_cap {
            let msix_cap = msix_cap.lock();
            if msix_cap.is_msix_control_reg(reg, 4) {
                msix_cap.read_msix_control(&mut config);
            }
        } else if let Some(pm_cap) = &self.pm_cap {
            let pm_cap = pm_cap.lock();
            if pm_cap.is_pm_reg(reg) {
                config = pm_cap.read(reg);
            }
        }

        // Quirk for Intel graphics: report the stolen memory size as 0 in
        // pci_cfg[0x51].
        if self.is_intel_gfx() && reg == 0x50 {
            config &= 0xffff00ff;
        }

        config
    }

    fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
        // Start the worker thread the first time the guest writes a config
        // register.
        if self.worker_thread.is_none() && self.vm_socket_vm.is_some() {
            self.start_work_thread();
        };

        let start = (reg_idx * 4) as u64 + offset;

        if let Some(pm_cap) = self.pm_cap.as_mut() {
            let mut pm_cap = pm_cap.lock();
            if pm_cap.is_pm_reg(start as u32) {
                pm_cap.write(start, data);
            }
        }

        let mut msi_change: Option<VfioMsiChange> = None;
        if let Some(msi_cap) = self.msi_cap.as_mut() {
            if msi_cap.is_msi_reg(start, data.len()) {
                msi_change = msi_cap.write_msi_reg(start, data);
            }
        }

        match msi_change {
            Some(VfioMsiChange::Enable) => self.enable_msi(),
            Some(VfioMsiChange::Disable) => self.disable_msi(),
            _ => (),
        }

        msi_change = None;
        if let Some(msix_cap) = &self.msix_cap {
            let mut msix_cap = msix_cap.lock();
            if msix_cap.is_msix_control_reg(start as u32, data.len() as u32) {
                msi_change = msix_cap.write_msix_control(data);
            }
        }

        match msi_change {
            Some(VfioMsiChange::Enable) => self.enable_msix(),
            Some(VfioMsiChange::Disable) => self.disable_msix(),
            Some(VfioMsiChange::FunctionChanged) => {
                if let Err(e) = self.msix_vectors_update() {
                    error!("update msix vectors failed: {}", e);
                }
            }
            _ => (),
        }

        if !self.is_skipped_reg(start as u32) {
            self.device
                .region_write(VFIO_PCI_CONFIG_REGION_INDEX as usize, data, start);
        }

        // If the guest enables memory access, make the BARs mappable once.
        if start == PCI_COMMAND as u64
            && data.len() == 2
            && data[0] & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY
        {
            self.commit_bars_mmap();
        } else if (0x10..=0x24).contains(&start) && data.len() == 4 {
            let bar_idx = (start as u32 - 0x10) / 4;
            let value: [u8; 4] = [data[0], data[1], data[2], data[3]];
            let val = u32::from_le_bytes(value);
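            // A 64-bit BAR spans two consecutive dwords: the low dword holds
            // address bits 31:4 (bits 3:0 encode the BAR type), and the high
            // dword holds bits 63:32. For example, a guest placing a 64-bit
            // BAR0 at 0x8_0000_0000 writes 0x00000000 to offset 0x10 and
            // 0x00000008 to offset 0x14.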
            let mut modify = false;
            for region in self.mmio_regions.iter_mut() {
                if region.bar_index() == bar_idx as usize {
                    let old_addr = region.address();
                    let new_addr = val & 0xFFFFFFF0;
                    if !region.is_64bit_memory() && (old_addr as u32) != new_addr {
                        // Change a 32-bit BAR address.
                        *region = region.set_address(u64::from(new_addr));
                        modify = true;
                    } else if region.is_64bit_memory() && (old_addr as u32) != new_addr {
                        // Change the low dword of a 64-bit BAR address.
                        *region =
                            region.set_address(u64::from(new_addr) | ((old_addr >> 32) << 32));
                        modify = true;
                    }
                    break;
                } else if region.is_64bit_memory()
                    && ((bar_idx % 2) == 1)
                    && (region.bar_index() + 1 == bar_idx as usize)
                {
                    // Change the high dword of a 64-bit BAR address.
                    let old_addr = region.address();
                    if val != (old_addr >> 32) as u32 {
                        let mut new_addr = (u64::from(val)) << 32;
                        new_addr |= old_addr & 0xFFFFFFFF;
                        *region = region.set_address(new_addr);
                        modify = true;
                    }
                    break;
                }
            }
            if modify {
                // If a BAR changed while memory access is enabled, mmap the
                // new BAR immediately.
                let cmd = self.config.read_config::<u8>(PCI_COMMAND);
                if cmd & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY {
                    self.commit_bars_mmap();
                }
            }
        }
    }

    fn read_virtual_config_register(&self, reg_idx: usize) -> u32 {
        if reg_idx == PCI_VCFG_NOTY {
            let mut q = self.acpi_notifier_val.lock();
            let mut val = 0;
            if !q.is_empty() {
                val = q.remove(0);
            }
            drop(q);
            return val;
        }

        warn!(
            "{} read unsupported vcfg register {}",
            self.debug_label(),
            reg_idx
        );
        0xFFFF_FFFF
    }

    fn write_virtual_config_register(&mut self, reg_idx: usize, value: u32) {
        match reg_idx {
            PCI_VCFG_PM => {
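                // Writing 0 requests low-power entry, arming the wakeup event
                // if one is registered; any other value requests a return to
                // the active state.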
                match value {
                    0 => {
                        if let Some(pm_evt) =
                            self.pm_evt.as_ref().map(|evt| evt.try_clone().unwrap())
                        {
                            *self.is_in_low_power.lock() = true;
                            let _ = self.device.pm_low_power_enter_with_wakeup(pm_evt);
                        } else {
                            let _ = self.device.pm_low_power_enter();
                        }
                    }
                    _ => {
                        *self.is_in_low_power.lock() = false;
                        let _ = self.device.pm_low_power_exit();
                    }
                };
            }
            PCI_VCFG_DSM => {
                if let Some(shm) = &self.vcfg_shm_mmap {
                    let mut args = [0u8; 4096];
                    if let Err(e) = shm.read_slice(&mut args, 0) {
                        error!("failed to read DSM args: {}", e);
                        return;
                    }
                    let res = match self.device.acpi_dsm(&args) {
                        Ok(r) => r,
                        Err(e) => {
                            error!("failed to call DSM: {}", e);
                            return;
                        }
                    };
                    if let Err(e) = shm.write_slice(&res, 0) {
                        error!("failed to write DSM result: {}", e);
                        return;
                    }
                    if let Err(e) = shm.msync() {
                        error!("failed to msync: {}", e)
                    }
                }
            }
            _ => warn!(
                "{} write unsupported vcfg register {}",
                self.debug_label(),
                reg_idx
            ),
        };
    }

    fn read_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &mut [u8]) {
        if let Some(msix_cap) = &self.msix_cap {
            let msix_cap = msix_cap.lock();
            if msix_cap.is_msix_table(bar_index, offset) {
                msix_cap.read_table(offset, data);
                return;
            } else if msix_cap.is_msix_pba(bar_index, offset) {
                msix_cap.read_pba(offset, data);
                return;
            }
        }
        self.device.region_read(bar_index, data, offset);
    }

    fn write_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &[u8]) {
        // Ignore writes to the IGD OpRegion.
        if let Some(device_data) = &self.device_data {
            match *device_data {
                DeviceData::IntelGfxData { opregion_index } => {
                    if opregion_index == bar_index as u32 {
                        return;
                    }
                }
            }
        }

        if let Some(msix_cap) = &self.msix_cap {
            let mut msix_cap = msix_cap.lock();
            if msix_cap.is_msix_table(bar_index, offset) {
                let behavior = msix_cap.write_table(offset, data);
                if let MsixStatus::EntryChanged(index) = behavior {
                    let irqfd = msix_cap.get_msix_irqfd(index);
                    self.msix_vector_update(index, irqfd);
                }
                return;
            } else if msix_cap.is_msix_pba(bar_index, offset) {
                msix_cap.write_pba(offset, data);
                return;
            }
        }

        self.device.region_write(bar_index, data, offset);
    }

    fn destroy_device(&mut self) {
        self.close();
    }

    fn generate_acpi_methods(&mut self) -> (Vec<u8>, Option<(u32, MemoryMapping)>) {
        let mut amls = Vec::new();
        let mut shm = None;
        if let Some(pci_address) = self.pci_address {
            let vcfg_offset = pci_address.to_config_address(0, 13);
            if let Ok(vcfg_register) = DeviceVcfgRegister::new(vcfg_offset) {
                vcfg_register.to_aml_bytes(&mut amls);
                shm = vcfg_register
                    .create_shm_mmap()
                    .map(|shm| (vcfg_offset + SHM_OFFSET, shm));
                self.vcfg_shm_mmap = vcfg_register.create_shm_mmap();
                // All vfio-pci devices should have a virtual _PRx method;
                // otherwise the host cannot tell whether the device has
                // entered a suspend state, always considers it active, and so
                // prevents its parent PCIe switch from suspending.
                PowerResourceMethod {}.to_aml_bytes(&mut amls);
                // TODO: WIP: Ideally, we should generate a _DSM only if the
                // physical device has one; however, Linux does not expose that
                // information. As a temporary workaround, we check whether
                // there is an associated ACPI companion device node and skip
                // generating a guest _DSM if there is none.
                let acpi_path = self.sysfs_path.join("firmware_node/path");
                if acpi_path.exists() {
                    DsmMethod {}.to_aml_bytes(&mut amls);
                }
            }
        }

        (amls, shm)
    }

    fn set_gpe(&mut self, resources: &mut SystemAllocator) -> Option<u32> {
        if let Some(gpe_nr) = resources.allocate_gpe() {
            base::debug!("set_gpe: gpe-nr {} addr {:?}", gpe_nr, self.pci_address);
            self.gpe = Some(gpe_nr);
        }
        self.gpe
    }
}

impl Suspendable for VfioPciDevice {
    fn sleep(&mut self) -> anyhow::Result<()> {
        if let Some(worker_thread) = self.worker_thread.take() {
            let res = worker_thread.stop();
            self.pci_address = Some(res.address);
            self.sysfs_path = res.sysfs_path;
            self.pm_cap = res.pm_cap;
            self.msix_cap = res.msix_cap;
            self.vm_socket_vm = Some(res.vm_socket);
        }
        Ok(())
    }

    fn wake(&mut self) -> anyhow::Result<()> {
        if self.activated {
            self.start_work_thread();
        }
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use resources::AddressRange;

    use super::VfioResourceAllocator;

    #[test]
    fn no_overlap() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(0, 15))
            .unwrap();
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(100, 115))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 95)));
    }

    #[test]
    fn complete_overlap() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 47], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
            .unwrap();
        // regions [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(32, 47))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(64, 95)));
    }

    #[test]
    fn partial_overlap_one() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 47], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
            .unwrap();
        // regions [32, 39], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 55))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 39)));
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(64, 95)));
    }

    #[test]
    fn partial_overlap_two() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 47], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
            .unwrap();
        // regions [32, 39], [72, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 71))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 39)));
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(72, 95)));
    }

    #[test]
    fn partial_overlap_three() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 39], [48, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 47))
            .unwrap();
        // regions [32, 39], [48, 63], [72, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(64, 71))
            .unwrap();
        // regions [32, 35], [76, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(36, 75))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 35)));
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(76, 95)));
    }
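
    // Illustrative sketch: exercises allocate_with_align the way
    // allocate_nonroot_barmem uses it (align == size, descending sizes).
    // This assumes the allocator returns the lowest suitably aligned free
    // address, which is what the window-packing math above relies on.
    #[test]
    fn allocate_with_align_packs_from_bottom() {
        // regions [0, 0xFFFF]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(0, 0xFFFF)).unwrap();
        // Largest allocation first lands at the bottom of the range.
        let a = memory.allocate_with_align(0x1000, 0x1000).unwrap();
        assert_eq!(a, 0);
        // The next allocation is placed at the first free aligned address.
        let b = memory.allocate_with_align(0x100, 0x100).unwrap();
        assert_eq!(b, 0x1000);
    }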
}