use std::collections::VecDeque;
use std::convert::TryInto;
use std::default::Default;
use std::fmt;
use std::mem;
use std::panic;
use std::sync::atomic::fence;
use std::sync::atomic::AtomicU32;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::thread;
use std::time::Duration;
use anyhow::bail;
use anyhow::ensure;
use anyhow::Context;
use anyhow::Result;
use base::error;
use base::info;
use base::AsRawDescriptor;
use base::Event;
use base::EventToken;
use base::MemoryMapping;
use base::MemoryMappingBuilder;
use base::Protection;
use base::RawDescriptor;
use base::SafeDescriptor;
use base::SharedMemory;
use base::Timer;
use base::TimerTrait;
use base::Tube;
use base::TubeError;
use base::WaitContext;
use base::WorkerThread;
use hypervisor::Datamatch;
use hypervisor::MemCacheType;
use resources::Alloc;
use resources::AllocOptions;
use resources::SystemAllocator;
use serde::Deserialize;
use serde::Deserializer;
use serde::Serialize;
use serde_keyvalue::FromKeyValues;
use sync::Mutex;
use thiserror::Error as ThisError;
use vm_control::api::VmMemoryClient;
use vm_control::VmMemoryDestination;
use vm_control::VmMemorySource;
use vm_memory::GuestAddress;
use vm_memory::GuestMemory;
use zerocopy::FromBytes;
use zerocopy::IntoBytes;
use crate::pci::pci_configuration::PciBarConfiguration;
use crate::pci::pci_configuration::PciBarPrefetchable;
use crate::pci::pci_configuration::PciBarRegionType;
use crate::pci::pci_configuration::PciClassCode;
use crate::pci::pci_configuration::PciConfiguration;
use crate::pci::pci_configuration::PciHeaderType;
use crate::pci::pci_configuration::PciOtherSubclass;
use crate::pci::pci_configuration::COMMAND_REG;
use crate::pci::pci_configuration::COMMAND_REG_MEMORY_SPACE_MASK;
use crate::pci::pci_device::BarRange;
use crate::pci::pci_device::PciDevice;
use crate::pci::pci_device::Result as PciResult;
use crate::pci::PciAddress;
use crate::pci::PciBarIndex;
use crate::pci::PciDeviceError;
use crate::vfio::VfioContainer;
use crate::Suspendable;
use crate::UnpinRequest;
use crate::UnpinResponse;
// PCI vendor/device IDs of the CoIommu device; both are passed to `PciConfiguration::new`.
const PCI_VENDOR_ID_COIOMMU: u16 = 0x1234;
const PCI_DEVICE_ID_COIOMMU: u16 = 0xabcd;
// Command values. NOTE(review): the code that interprets them is not in this part of the
// file — presumably they are written to the `cmd` register by the guest; confirm.
const COIOMMU_CMD_DEACTIVATE: u64 = 0;
const COIOMMU_CMD_ACTIVATE: u64 = 1;
const COIOMMU_CMD_PARK_UNPIN: u64 = 2;
const COIOMMU_CMD_UNPARK_UNPIN: u64 = 3;
// Revision ID passed to `PciConfiguration::new`.
const COIOMMU_REVISION_ID: u8 = 0x10;
// BAR indices and region sizes. The notify map and topology map sizes are used to size the
// shared-memory regions created in `CoIommuDev::new`.
const COIOMMU_MMIO_BAR: PciBarIndex = 0;
const COIOMMU_MMIO_BAR_SIZE: u64 = 0x2000;
const COIOMMU_NOTIFYMAP_BAR: PciBarIndex = 2;
const COIOMMU_NOTIFYMAP_SIZE: usize = 0x2000;
const COIOMMU_TOPOLOGYMAP_BAR: u8 = 4;
const COIOMMU_TOPOLOGYMAP_SIZE: usize = 0x2000;
// Size and shift of the 4 KiB pages that are pinned; `gfn << PAGE_SHIFT_4K` gives the gpa.
const PAGE_SIZE_4K: u64 = 4096;
const PAGE_SHIFT_4K: u64 = 12;
// Bit 63 of a pin request: when set, the remaining bits are the guest physical address of a
// `PinPageInfo` batch descriptor instead of a single gfn/bdf pair.
const PIN_PAGES_IN_BATCH: u64 = 1 << 63;
// Leaf DTT entry flags. PINNED is set by `pin_page` after a successful VFIO map and cleared
// by `unpin_page`; ACCESSED is cleared by `unpin_page` (presumably set by the guest — confirm).
const DTTE_PINNED_FLAG: u32 = 1 << 31;
const DTTE_ACCESSED_FLAG: u32 = 1 << 30;
// Non-leaf DTT entry layout: bit 0 marks the entry present, and the page frame number of the
// next-level table starts at bit `DTT_ENTRY_PFN_SHIFT`.
const DTT_ENTRY_PRESENT: u64 = 1;
const DTT_ENTRY_PFN_SHIFT: u64 = 12;
/// Errors attached as context by the CoIommu device code.
#[derive(ThisError, Debug)]
enum Error {
    /// Creating one of the device's shared-memory regions failed.
    #[error("CoIommu failed to create shared memory")]
    CreateSharedMemory,
    /// A DTT entry could not be read from, or located in, guest memory.
    #[error("Failed to get DTT entry")]
    GetDTTEntry,
}
// Default for `CoIommuParameters::unpin_interval`.
const UNPIN_DEFAULT_INTERVAL: Duration = Duration::from_secs(60);
// Default for `CoIommuParameters::unpin_gen_threshold`.
const UNPIN_GEN_DEFAULT_THRES: u64 = 10;
/// Policy used by the unpin worker to release pinned pages in the background.
///
/// Serialized in kebab-case (`off`, `lru`).
#[derive(Debug, Copy, Clone, Default, Eq, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum CoIommuUnpinPolicy {
    /// No periodic unpinning: the unpin timer is not created and pinned pages are not queued.
    #[default]
    Off,
    /// Pinned pages are queued when pinned and the unpin worker periodically tries to unpin
    /// them from the front of the queues.
    Lru,
}
impl fmt::Display for CoIommuUnpinPolicy {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use self::CoIommuUnpinPolicy::*;
        match self {
            Off => write!(f, "off"),
            Lru => write!(f, "lru"),
        }
    }
}
/// Deserializes `unpin_interval` from an unsigned integer number of seconds.
fn deserialize_unpin_interval<'de, D: Deserializer<'de>>(
    deserializer: D,
) -> Result<Duration, D::Error> {
    u64::deserialize(deserializer).map(Duration::from_secs)
}
fn deserialize_unpin_limit<'de, D: Deserializer<'de>>(
    deserializer: D,
) -> Result<Option<u64>, D::Error> {
    let limit = u64::deserialize(deserializer)?;
    match limit {
        0 => Err(serde::de::Error::custom(
            "Please use non-zero unpin_limit value",
        )),
        limit => Ok(Some(limit)),
    }
}
/// Serde default for `CoIommuParameters::unpin_interval`.
fn unpin_interval_default() -> Duration {
    UNPIN_DEFAULT_INTERVAL
}
/// Serde default for `CoIommuParameters::unpin_gen_threshold`.
fn unpin_gen_threshold_default() -> u64 {
    UNPIN_GEN_DEFAULT_THRES
}
/// User-configurable parameters of the CoIommu device.
///
/// Unknown fields are rejected during deserialization.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize, FromKeyValues)]
#[serde(deny_unknown_fields)]
pub struct CoIommuParameters {
    /// Background unpin policy; defaults to `Off`.
    #[serde(default)]
    pub unpin_policy: CoIommuUnpinPolicy,
    /// Period of the repeating unpin timer, deserialized from whole seconds. A zero interval
    /// means the timer is not created.
    #[serde(
        deserialize_with = "deserialize_unpin_interval",
        default = "unpin_interval_default"
    )]
    pub unpin_interval: Duration,
    /// Upper bound on the number of pages unpinned in one LRU pass; `None` means no bound.
    /// Zero is rejected during deserialization.
    #[serde(deserialize_with = "deserialize_unpin_limit", default)]
    pub unpin_limit: Option<u64>,
    /// Number of busy unpin attempts after which a page moves from the new-generation queue
    /// to the old-generation queue; the old-generation queue is scanned once every this many
    /// LRU passes. Zero disables the old-generation handling.
    #[serde(default = "unpin_gen_threshold_default")]
    pub unpin_gen_threshold: u64,
}
impl Default for CoIommuParameters {
    fn default() -> Self {
        Self {
            unpin_policy: CoIommuUnpinPolicy::Off,
            unpin_interval: UNPIN_DEFAULT_INTERVAL,
            unpin_limit: None,
            unpin_gen_threshold: UNPIN_GEN_DEFAULT_THRES,
        }
    }
}
/// Register state of the CoIommu device.
///
/// NOTE(review): the code that reads and writes these fields is outside this part of the
/// file; presumably they mirror guest-programmed MMIO registers — confirm.
#[derive(Default, Debug, Copy, Clone)]
struct CoIommuReg {
    // Presumably the guest physical address of the top-level DTT page (see
    // `gfn_to_dtt_pte`) — confirm.
    dtt_root: u64,
    // Presumably the last `COIOMMU_CMD_*` value written — confirm.
    cmd: u64,
    // Presumably the number of DTT levels (see `gfn_to_dtt_pte`) — confirm.
    dtt_level: u64,
}
/// Bookkeeping record for one pinned guest page tracked by the LRU unpin policy.
#[derive(Default, Debug, Copy, Clone, PartialEq, Eq)]
struct PinnedPageInfo {
    /// Guest frame number of the pinned page.
    gfn: u64,
    /// How many unpin attempts found this page still busy.
    unpin_busy_cnt: u64,
}
impl PinnedPageInfo {
    /// Builds a record for `gfn` with the given busy counter.
    fn new(gfn: u64, unpin_busy_cnt: u64) -> Self {
        Self {
            gfn,
            unpin_busy_cnt,
        }
    }
}
/// Whether unpinning is currently allowed.
#[derive(PartialEq, Debug, Eq)]
enum UnpinThreadState {
    /// Unpinning may proceed.
    Unparked,
    /// Unpinning is suspended: `unpin_page` returns `UnpinResult::UnpinParked` without
    /// touching the page.
    Parked,
}
/// Pin bookkeeping shared (behind a mutex) between the pin worker and the unpin worker.
struct CoIommuPinState {
    // Pages queued by `pin_page` under the LRU policy, with their busy counters.
    new_gen_pinned_pages: VecDeque<PinnedPageInfo>,
    // Gfns of pages moved out of the new generation after repeated busy unpin attempts.
    old_gen_pinned_pages: VecDeque<u64>,
    // When `Parked`, `unpin_page` refuses to unpin.
    unpin_thread_state: UnpinThreadState,
    // NOTE(review): not used in this part of the file; presumably counts outstanding park
    // requests — confirm.
    unpin_park_count: u64,
}
/// Maps `size` bytes at host address `user_addr` to `iova` through the VFIO container.
///
/// Returns `true` when the mapping succeeded or when the OS reported `EEXIST` (the range is
/// treated as already pinned); returns `false` on any other failure. Failures are logged.
///
/// # Safety
///
/// This forwards to `VfioContainer::vfio_dma_map`. NOTE(review): presumably `user_addr` must
/// reference `size` bytes of host memory that stay valid while the mapping exists — confirm
/// against that method's contract.
unsafe fn vfio_map(
    vfio_container: &Arc<Mutex<VfioContainer>>,
    iova: u64,
    size: u64,
    user_addr: u64,
) -> bool {
    match vfio_container
        .lock()
        .vfio_dma_map(iova, size, user_addr, true)
    {
        Ok(_) => true,
        Err(e) => {
            // Read the OS error first, before any other call can overwrite it.
            let os_errno = std::io::Error::last_os_error().raw_os_error();
            if os_errno == Some(libc::EEXIST) {
                error!("CoIommu: iova 0x{:x} already pinned", iova);
                true
            } else {
                error!("CoIommu: failed to map iova 0x{:x}: {}", iova, e);
                false
            }
        }
    }
}
/// Removes the `size`-byte mapping at `iova` from the VFIO container.
///
/// Returns `true` on success; on failure the error is logged and `false` is returned.
fn vfio_unmap(vfio_container: &Arc<Mutex<VfioContainer>>, iova: u64, size: u64) -> bool {
    if let Err(e) = vfio_container.lock().vfio_dma_unmap(iova, size) {
        error!("CoIommu: failed to unmap iova 0x{:x}: {}", iova, e);
        return false;
    }
    true
}
/// Header of a batch pin request, read from guest memory by `pin_pages_in_batch`.
///
/// The header is followed in guest memory by `nr_pages` 64-bit gfns.
#[derive(Default, Debug, Copy, Clone, FromBytes, IntoBytes)]
#[repr(C)]
struct PinPageInfo {
    // Identifier of the requesting endpoint; must be one of the worker's `endpoints`.
    bdf: u16,
    // Unused by the code in view; presumably padding so `nr_pages` is 8-byte aligned.
    pad: [u16; 3],
    // Number of gfns that follow the header.
    nr_pages: u64,
}
// Number of gfn bits that index a non-leaf (level >= 2) DTT table, and the matching mask.
const COIOMMU_UPPER_LEVEL_STRIDE: u64 = 9;
const COIOMMU_UPPER_LEVEL_MASK: u64 = (1 << COIOMMU_UPPER_LEVEL_STRIDE) - 1;
// Number of gfn bits that index the leaf (level 1) DTT table, and the matching mask.
const COIOMMU_PT_LEVEL_STRIDE: u64 = 10;
const COIOMMU_PT_LEVEL_MASK: u64 = (1 << COIOMMU_PT_LEVEL_STRIDE) - 1;
/// Returns the index of `gfn`'s entry inside the DTT table at `level`.
///
/// Level 1 is the leaf table and is indexed by the low `COIOMMU_PT_LEVEL_STRIDE` bits of
/// `gfn`; every level above it consumes a further `COIOMMU_UPPER_LEVEL_STRIDE` bits.
///
/// # Errors
///
/// Fails when `level` is 0, or when `level` is so large that the bits it would select lie
/// beyond the 64 bits of `gfn`.
fn level_to_offset(gfn: u64, level: u64) -> Result<u64> {
    match level {
        0 => bail!("Invalid level for gfn 0x{:x}", gfn),
        1 => Ok(gfn & COIOMMU_PT_LEVEL_MASK),
        _ => {
            // `level` is not validated by the callers in view, so compute the shift with
            // checked arithmetic: an unchecked `gfn >> shift` with `shift >= 64` panics in
            // overflow-checked builds and silently uses a wrapped shift amount otherwise.
            let shift = (level - 2)
                .checked_mul(COIOMMU_UPPER_LEVEL_STRIDE)
                .and_then(|bits| bits.checked_add(COIOMMU_PT_LEVEL_STRIDE))
                .filter(|&bits| bits < u64::from(u64::BITS));
            match shift {
                Some(bits) => Ok((gfn >> bits) & COIOMMU_UPPER_LEVEL_MASK),
                None => bail!("Invalid level {} for gfn 0x{:x}", level, gfn),
            }
        }
    }
}
/// Cursor remembering the most recently resolved leaf DTT entry, so that lookups for nearby
/// gfns can be derived from it instead of walking the table again.
struct DTTIter {
    /// Host pointer to the leaf entry of `gfn`; null until the first successful lookup.
    ptr: *const u8,
    /// The gfn whose leaf entry `ptr` refers to.
    gfn: u64,
}
/// The default cursor is empty: a null pointer and gfn 0.
impl Default for DTTIter {
    fn default() -> Self {
        Self {
            ptr: std::ptr::null::<u8>(),
            gfn: 0,
        }
    }
}
/// Resolves the host pointer to the leaf DTT entry that tracks `gfn`.
///
/// If `dtt_iter` already holds a pointer for a gfn that shares all bits above the low
/// `COIOMMU_PT_LEVEL_STRIDE` bits with `gfn`, the result is derived by offsetting the cached
/// pointer. Otherwise the table is walked through guest memory from `dtt_root`, starting at
/// `dtt_level` and descending to level 1. On success `dtt_iter` is updated to the returned
/// entry.
///
/// # Errors
///
/// Fails if a level offset cannot be computed, if a non-leaf entry cannot be read or is not
/// marked present, or if the host address of the leaf entry cannot be obtained.
fn gfn_to_dtt_pte(
    mem: &GuestMemory,
    dtt_level: u64,
    dtt_root: u64,
    dtt_iter: &mut DTTIter,
    gfn: u64,
) -> Result<*const AtomicU32> {
    // Walk the table when there is no cached pointer or `gfn` belongs to a different leaf
    // table than the cached gfn.
    let ptr = if dtt_iter.ptr.is_null()
        || dtt_iter.gfn >> COIOMMU_PT_LEVEL_STRIDE != gfn >> COIOMMU_PT_LEVEL_STRIDE
    {
        let mut level = dtt_level;
        let mut pt_gpa = dtt_root;
        // Non-leaf entries are 64 bits wide.
        let dtt_nonleaf_entry_size = mem::size_of::<u64>() as u64;
        while level != 1 {
            let index = level_to_offset(gfn, level)? * dtt_nonleaf_entry_size;
            let parent_pt = mem
                .read_obj_from_addr::<u64>(GuestAddress(pt_gpa + index))
                .context(Error::GetDTTEntry)?;
            if (parent_pt & DTT_ENTRY_PRESENT) == 0 {
                bail!("DTT absent at level {} for gfn 0x{:x}", level, gfn);
            }
            // The entry holds the page frame number of the next-level table.
            pt_gpa = (parent_pt >> DTT_ENTRY_PFN_SHIFT) << PAGE_SHIFT_4K;
            level -= 1;
        }
        // Leaf entries are 32 bits wide.
        let index = level_to_offset(gfn, level)? * mem::size_of::<u32>() as u64;
        mem.get_host_address(GuestAddress(pt_gpa + index))
            .context(Error::GetDTTEntry)?
    } else if gfn > dtt_iter.gfn {
        // SAFETY:
        // `dtt_iter.ptr` is non-null here and both gfns index the same leaf table, so the
        // offset is at most the span of that table.
        // NOTE(review): relies on the cached pointer coming from an earlier successful walk
        // and on the leaf table being contiguous in host memory — confirm.
        unsafe {
            dtt_iter
                .ptr
                .add(mem::size_of::<AtomicU32>() * (gfn - dtt_iter.gfn) as usize)
        }
    } else {
        // SAFETY:
        // Same reasoning as the branch above, moving backwards (or by zero) within the same
        // leaf table.
        unsafe {
            dtt_iter
                .ptr
                .sub(mem::size_of::<AtomicU32>() * (dtt_iter.gfn - gfn) as usize)
        }
    };
    // Remember this entry so the next nearby lookup can skip the walk.
    dtt_iter.ptr = ptr;
    dtt_iter.gfn = gfn;
    Ok(ptr as *const AtomicU32)
}
/// Pins the guest page `gfn` by mapping it through VFIO and marking its leaf DTT entry.
///
/// A page whose entry already carries `DTTE_PINNED_FLAG` is left alone. After a successful
/// map the flag is set and, under the LRU policy, the page is appended to the new-generation
/// queue in `pinstate`. A failed map is logged by `vfio_map` and is not reported as an error.
///
/// # Errors
///
/// Fails if the leaf DTT entry cannot be resolved or the page has no host address.
fn pin_page(
    pinstate: &mut CoIommuPinState,
    policy: CoIommuUnpinPolicy,
    vfio_container: &Arc<Mutex<VfioContainer>>,
    mem: &GuestMemory,
    dtt_level: u64,
    dtt_root: u64,
    dtt_iter: &mut DTTIter,
    gfn: u64,
) -> Result<()> {
    let leaf_entry = gfn_to_dtt_pte(mem, dtt_level, dtt_root, dtt_iter, gfn)?;
    let gpa = gfn << PAGE_SHIFT_4K;
    let host_addr = mem
        .get_host_address_range(GuestAddress(gpa), PAGE_SIZE_4K as usize)
        .context("failed to get host address")? as u64;
    // SAFETY:
    // `leaf_entry` was just returned by `gfn_to_dtt_pte`.
    // NOTE(review): assumes that pointer is valid and suitably aligned for `AtomicU32`.
    let current = unsafe { (*leaf_entry).load(Ordering::Relaxed) };
    if current & DTTE_PINNED_FLAG != 0 {
        info!("CoIommu: gfn 0x{:x} already pinned", gfn);
        return Ok(());
    }
    // SAFETY:
    // `host_addr` was obtained from `mem` for exactly one 4 KiB page at `gpa`.
    let mapped = unsafe { vfio_map(vfio_container, gpa, PAGE_SIZE_4K, host_addr) };
    if !mapped {
        return Ok(());
    }
    // SAFETY:
    // Same pointer as the load above.
    unsafe { (*leaf_entry).fetch_or(DTTE_PINNED_FLAG, Ordering::SeqCst) };
    if policy == CoIommuUnpinPolicy::Lru {
        let page = PinnedPageInfo::new(gfn, 0);
        pinstate.new_gen_pinned_pages.push_back(page);
    }
    Ok(())
}
/// Outcome of one unpin attempt.
#[derive(PartialEq, Debug, Eq)]
enum UnpinResult {
    /// The queue that was popped had no page left.
    UnpinlistEmpty,
    /// The entry was exactly `DTTE_PINNED_FLAG`; it was cleared and the VFIO unmap succeeded.
    Unpinned,
    /// The leaf DTT entry could not be resolved, or it was already zero.
    NotPinned,
    /// The entry held something other than just `DTTE_PINNED_FLAG`, so the page was left
    /// pinned.
    NotUnpinned,
    /// The entry was cleared but the VFIO unmap failed; the pinned flag was set again.
    FailedUnpin,
    /// Unpinning is parked; the page was not touched.
    UnpinParked,
}
/// Tries to unpin the guest page `gfn`.
///
/// The page is unpinned only if its leaf DTT entry is exactly `DTTE_PINNED_FLAG`; the entry
/// is then atomically swapped to 0 and the page is unmapped from VFIO. With `force` set, the
/// accessed flag is cleared before that check. See [`UnpinResult`] for the possible outcomes.
fn unpin_page(
    pinstate: &mut CoIommuPinState,
    vfio_container: &Arc<Mutex<VfioContainer>>,
    mem: &GuestMemory,
    dtt_level: u64,
    dtt_root: u64,
    dtt_iter: &mut DTTIter,
    gfn: u64,
    force: bool,
) -> UnpinResult {
    // Do nothing while unpinning is parked.
    if pinstate.unpin_thread_state == UnpinThreadState::Parked {
        return UnpinResult::UnpinParked;
    }
    let leaf_entry = match gfn_to_dtt_pte(mem, dtt_level, dtt_root, dtt_iter, gfn) {
        Ok(v) => v,
        Err(_) => {
            // No resolvable DTT entry is treated as "not pinned"; the lookup error is
            // intentionally not logged.
            return UnpinResult::NotPinned;
        }
    };
    if force {
        // SAFETY:
        // `leaf_entry` was just returned by `gfn_to_dtt_pte`.
        // Clearing ACCESSED first lets the compare-exchange below succeed for an entry that
        // has both PINNED and ACCESSED set.
        unsafe { (*leaf_entry).fetch_and(!DTTE_ACCESSED_FLAG, Ordering::SeqCst) };
    }
    // SAFETY:
    // `leaf_entry` was just returned by `gfn_to_dtt_pte`.
    if let Err(entry) = unsafe {
        (*leaf_entry).compare_exchange(DTTE_PINNED_FLAG, 0, Ordering::SeqCst, Ordering::SeqCst)
    } {
        // The entry was not exactly DTTE_PINNED_FLAG, so nothing was changed.
        if entry == 0 {
            UnpinResult::NotPinned
        } else {
            if !force {
                // SAFETY:
                // `leaf_entry` was just returned by `gfn_to_dtt_pte`.
                // Clear ACCESSED so that a later attempt can succeed if the flag has not
                // been set again in the meantime (presumably by the guest — confirm).
                unsafe { (*leaf_entry).fetch_and(!DTTE_ACCESSED_FLAG, Ordering::SeqCst) };
            } else {
                // ACCESSED was already cleared above, so other bits are keeping the entry
                // from matching; leave the page pinned and report it.
                error!(
                    "CoIommu: force case cannot pin gfn 0x{:x} entry 0x{:x}",
                    gfn, entry
                );
            }
            UnpinResult::NotUnpinned
        }
    } else {
        // The entry is now 0; remove the VFIO mapping.
        let gpa = gfn << PAGE_SHIFT_4K;
        if vfio_unmap(vfio_container, gpa, PAGE_SIZE_4K) {
            UnpinResult::Unpinned
        } else {
            // SAFETY:
            // `leaf_entry` was just returned by `gfn_to_dtt_pte`.
            // The page is still mapped, so restore the pinned flag.
            unsafe { (*leaf_entry).fetch_or(DTTE_PINNED_FLAG, Ordering::SeqCst) };
            UnpinResult::FailedUnpin
        }
    }
}
/// Worker that services pin requests signalled through the notify map.
struct PinWorker {
    // Guest memory used to walk the DTT and to read batch descriptors.
    mem: GuestMemory,
    // Endpoint identifiers (bdf) that are allowed to request pinning.
    endpoints: Vec<u16>,
    // Mapping of the notify map; slot `index` is the u64 at byte offset `index * 8`.
    notifymap_mmap: Arc<MemoryMapping>,
    // Number of DTT levels and guest physical address of the top-level DTT table.
    dtt_level: u64,
    dtt_root: u64,
    // One event per notify-map slot; event `index` announces a request in slot `index`.
    ioevents: Vec<Event>,
    // VFIO container used to map pinned pages.
    vfio_container: Arc<Mutex<VfioContainer>>,
    // Pin bookkeeping shared with the unpin worker.
    pinstate: Arc<Mutex<CoIommuPinState>>,
    // Device parameters; only `unpin_policy` is read by this worker.
    params: CoIommuParameters,
}
impl PinWorker {
    /// Label used as the prefix of this worker's log messages.
    fn debug_label(&self) -> &'static str {
        "CoIommuPinWorker"
    }
    /// Runs the pin worker loop until `kill_evt` is signalled or an unrecoverable error
    /// occurs.
    ///
    /// For every signalled ioevent the matching notify-map slot is read, the request is
    /// handed to `pin_pages`, and the slot is then written back to 0.
    ///
    /// # Panics
    ///
    /// Panics if writing 0 to the notify-map slot fails.
    fn run(&mut self, kill_evt: Event) {
        #[derive(EventToken)]
        enum Token {
            Kill,
            Pin { index: usize },
        }
        let wait_ctx: WaitContext<Token> =
            match WaitContext::build_with(&[(&kill_evt, Token::Kill)]) {
                Ok(pc) => pc,
                Err(e) => {
                    error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
                    return;
                }
            };
        // Register every ioevent, tagged with its slot index.
        for (index, event) in self.ioevents.iter().enumerate() {
            match wait_ctx.add(event, Token::Pin { index }) {
                Ok(_) => {}
                Err(e) => {
                    error!(
                        "{}: failed to add ioevent for index {}: {}",
                        self.debug_label(),
                        index,
                        e
                    );
                    return;
                }
            }
        }
        'wait: loop {
            let events = match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("{}: failed polling for events: {}", self.debug_label(), e);
                    break;
                }
            };
            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::Kill => break 'wait,
                    Token::Pin { index } => {
                        // Byte offset of this slot in the notify map.
                        let offset = index * mem::size_of::<u64>();
                        if let Some(event) = self.ioevents.get(index) {
                            // Consume the event so it does not fire again for this request.
                            if let Err(e) = event.wait() {
                                error!(
                                    "{}: failed reading event {}: {}",
                                    self.debug_label(),
                                    index,
                                    e
                                );
                                // Clear the slot before giving up.
                                self.notifymap_mmap.write_obj::<u64>(0, offset).unwrap();
                                break 'wait;
                            }
                        }
                        // A failed pin is logged; the slot is cleared either way.
                        if let Ok(data) = self.notifymap_mmap.read_obj::<u64>(offset) {
                            if let Err(e) = self.pin_pages(data) {
                                error!("{}: {}", self.debug_label(), e);
                            }
                        }
                        // Order the pinning work before the slot is cleared.
                        // NOTE(review): presumably the requester waits for the slot to
                        // become 0 — confirm.
                        fence(Ordering::SeqCst);
                        self.notifymap_mmap.write_obj::<u64>(0, offset).unwrap();
                    }
                }
            }
        }
    }
    fn pin_pages_in_batch(&mut self, gpa: u64) -> Result<()> {
        let pin_page_info = self
            .mem
            .read_obj_from_addr::<PinPageInfo>(GuestAddress(gpa))
            .context("failed to get pin page info")?;
        let bdf = pin_page_info.bdf;
        ensure!(
            self.endpoints.iter().any(|&x| x == bdf),
            "pin page for unexpected bdf 0x{:x}",
            bdf
        );
        let mut nr_pages = pin_page_info.nr_pages;
        let mut offset = mem::size_of::<PinPageInfo>() as u64;
        let mut dtt_iter: DTTIter = Default::default();
        let mut pinstate = self.pinstate.lock();
        while nr_pages > 0 {
            let gfn = self
                .mem
                .read_obj_from_addr::<u64>(GuestAddress(gpa + offset))
                .context("failed to get pin page gfn")?;
            pin_page(
                &mut pinstate,
                self.params.unpin_policy,
                &self.vfio_container,
                &self.mem,
                self.dtt_level,
                self.dtt_root,
                &mut dtt_iter,
                gfn,
            )?;
            offset += mem::size_of::<u64>() as u64;
            nr_pages -= 1;
        }
        Ok(())
    }
    fn pin_pages(&mut self, gfn_bdf: u64) -> Result<()> {
        if gfn_bdf & PIN_PAGES_IN_BATCH != 0 {
            let gpa = gfn_bdf & !PIN_PAGES_IN_BATCH;
            self.pin_pages_in_batch(gpa)
        } else {
            let bdf = (gfn_bdf & 0xffff) as u16;
            let gfn = gfn_bdf >> 16;
            let mut dtt_iter: DTTIter = Default::default();
            ensure!(
                self.endpoints.iter().any(|&x| x == bdf),
                "pin page for unexpected bdf 0x{:x}",
                bdf
            );
            let mut pinstate = self.pinstate.lock();
            pin_page(
                &mut pinstate,
                self.params.unpin_policy,
                &self.vfio_container,
                &self.mem,
                self.dtt_level,
                self.dtt_root,
                &mut dtt_iter,
                gfn,
            )
        }
    }
}
/// Worker that unpins pages, both periodically (LRU policy) and on explicit request.
struct UnpinWorker {
    // Guest memory used to walk the DTT.
    mem: GuestMemory,
    // Number of DTT levels and guest physical address of the top-level DTT table.
    dtt_level: u64,
    dtt_root: u64,
    // VFIO container used to unmap pages.
    vfio_container: Arc<Mutex<VfioContainer>>,
    // Tube on which `UnpinRequest`s arrive and `UnpinResponse`s are sent back, if any.
    unpin_tube: Option<Tube>,
    // Pin bookkeeping shared with the pin worker.
    pinstate: Arc<Mutex<CoIommuPinState>>,
    // Device parameters controlling policy, interval, limit and generation threshold.
    params: CoIommuParameters,
    // Number of LRU passes since the old-generation queue was last scanned; reset to 0 when
    // it reaches `params.unpin_gen_threshold`.
    unpin_gen_threshold: u64,
}
impl UnpinWorker {
    /// Label used as the prefix of this worker's log messages.
    fn debug_label(&self) -> &'static str {
        "CoIommuUnpinWorker"
    }
    /// Runs the unpin worker loop until `kill_evt` is signalled or an unrecoverable error
    /// occurs.
    ///
    /// The loop reacts to two sources: a repeating timer (created only when the policy is
    /// not `Off` and the interval is non-zero) that triggers `unpin_pages`, and the optional
    /// tube that delivers `UnpinRequest`s. The tube is taken out of `self` for the duration
    /// of the loop and put back when the loop ends.
    fn run(&mut self, kill_evt: Event) {
        #[derive(EventToken)]
        enum Token {
            UnpinTimer,
            UnpinReq,
            Kill,
        }
        let wait_ctx: WaitContext<Token> =
            match WaitContext::build_with(&[(&kill_evt, Token::Kill)]) {
                Ok(pc) => pc,
                Err(e) => {
                    error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
                    return;
                }
            };
        if let Some(tube) = &self.unpin_tube {
            if let Err(e) = wait_ctx.add(tube, Token::UnpinReq) {
                error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
                return;
            }
        }
        // The timer exists only when periodic unpinning is enabled.
        let mut unpin_timer = if self.params.unpin_policy != CoIommuUnpinPolicy::Off
            && !self.params.unpin_interval.is_zero()
        {
            let mut timer = match Timer::new() {
                Ok(t) => t,
                Err(e) => {
                    error!(
                        "{}: failed to create the unpin timer: {}",
                        self.debug_label(),
                        e
                    );
                    return;
                }
            };
            if let Err(e) = timer.reset_repeating(self.params.unpin_interval) {
                error!(
                    "{}: failed to start the unpin timer: {}",
                    self.debug_label(),
                    e
                );
                return;
            }
            if let Err(e) = wait_ctx.add(&timer, Token::UnpinTimer) {
                error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
                return;
            }
            Some(timer)
        } else {
            None
        };
        // Move the tube out of `self` so it can be borrowed while `self` is used mutably.
        let unpin_tube = self.unpin_tube.take();
        'wait: loop {
            let events = match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("{}: failed polling for events: {}", self.debug_label(), e);
                    break;
                }
            };
            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::UnpinTimer => {
                        self.unpin_pages();
                        // Acknowledge the expiration after the unpin pass has run.
                        if let Some(timer) = &mut unpin_timer {
                            if let Err(e) = timer.mark_waited() {
                                error!(
                                    "{}: failed to clear unpin timer: {}",
                                    self.debug_label(),
                                    e
                                );
                                break 'wait;
                            }
                        }
                    }
                    Token::UnpinReq => {
                        if let Some(tube) = &unpin_tube {
                            match tube.recv::<UnpinRequest>() {
                                Ok(req) => {
                                    // Force-unpin each requested range; stop at the first
                                    // range that fails.
                                    let mut unpin_done = true;
                                    for range in req.ranges {
                                        if !self.unpin_pages_in_range(range.0, range.1) {
                                            unpin_done = false;
                                            break;
                                        }
                                    }
                                    let resp = if unpin_done {
                                        UnpinResponse::Success
                                    } else {
                                        UnpinResponse::Failed
                                    };
                                    if let Err(e) = tube.send(&resp) {
                                        error!(
                                            "{}: failed to send unpin response {}",
                                            self.debug_label(),
                                            e
                                        );
                                    }
                                }
                                Err(e) => {
                                    // A disconnected peer is not reported as an error; the
                                    // tube is just removed from the wait set.
                                    if let TubeError::Disconnected = e {
                                        if let Err(e) = wait_ctx.delete(tube) {
                                            error!(
                                                "{}: failed to remove unpin_tube: {}",
                                                self.debug_label(),
                                                e
                                            );
                                        }
                                    } else {
                                        error!(
                                            "{}: failed to recv Unpin Request: {}",
                                            self.debug_label(),
                                            e
                                        );
                                    }
                                }
                            }
                        }
                    }
                    Token::Kill => break 'wait,
                }
            }
        }
        // Hand the tube back to `self`.
        self.unpin_tube = unpin_tube;
    }
    /// Runs one periodic unpin pass according to the configured policy; only the LRU policy
    /// does any work.
    fn unpin_pages(&mut self) {
        match self.params.unpin_policy {
            CoIommuUnpinPolicy::Lru => self.lru_unpin_pages(),
            CoIommuUnpinPolicy::Off => {}
        }
    }
    /// Pops the page at the front of the selected queue and tries to unpin it (not forced).
    ///
    /// `new_gen` selects the new-generation queue; otherwise the old-generation queue is
    /// used and the popped gfn is wrapped with a busy counter of 0. Returns the outcome and
    /// the popped page, or `(UnpinlistEmpty, None)` when the queue was empty. The pin state
    /// lock is held across the pop and the unpin attempt.
    fn lru_unpin_page(
        &mut self,
        dtt_iter: &mut DTTIter,
        new_gen: bool,
    ) -> (UnpinResult, Option<PinnedPageInfo>) {
        let mut pinstate = self.pinstate.lock();
        let candidate = if new_gen {
            pinstate.new_gen_pinned_pages.pop_front()
        } else {
            pinstate
                .old_gen_pinned_pages
                .pop_front()
                .map(|gfn| PinnedPageInfo::new(gfn, 0))
        };
        match candidate {
            None => (UnpinResult::UnpinlistEmpty, None),
            Some(page) => {
                let outcome = unpin_page(
                    &mut pinstate,
                    &self.vfio_container,
                    &self.mem,
                    self.dtt_level,
                    self.dtt_root,
                    dtt_iter,
                    page.gfn,
                    false,
                );
                (outcome, Some(page))
            }
        }
    }
    /// Drains the selected queue, unpinning pages until the queue is empty or `unpin_limit`
    /// pages have been unpinned, and returns the number of pages unpinned.
    ///
    /// `new_gen` selects the new-generation queue, otherwise the old-generation queue. Pages
    /// that could not be unpinned are collected locally and appended back to the shared
    /// queues after the loop, so each page is attempted at most once per call.
    fn lru_unpin_pages_in_loop(&mut self, unpin_limit: Option<u64>, new_gen: bool) -> u64 {
        let mut not_unpinned_new_gen_pages = VecDeque::new();
        let mut not_unpinned_old_gen_pages = VecDeque::new();
        let mut unpinned_count = 0;
        let has_limit = unpin_limit.is_some();
        let limit_count = unpin_limit.unwrap_or(0);
        let mut dtt_iter: DTTIter = Default::default();
        // With a limit of 0 the loop body never runs.
        while !has_limit || unpinned_count != limit_count {
            let (result, pinned_page) = self.lru_unpin_page(&mut dtt_iter, new_gen);
            match result {
                UnpinResult::UnpinlistEmpty => break,
                UnpinResult::Unpinned => unpinned_count += 1,
                // Nothing to track any more: the page is dropped from the queues.
                UnpinResult::NotPinned => {}
                UnpinResult::NotUnpinned => {
                    if let Some(mut page) = pinned_page {
                        // NOTE(review): when the threshold is 0 the page is not re-queued
                        // at all, so it is no longer tracked — confirm this is intended.
                        if self.params.unpin_gen_threshold != 0 {
                            page.unpin_busy_cnt += 1;
                            // A page from the old generation stays there; a page from the
                            // new generation moves to the old generation once its busy
                            // count reaches the threshold.
                            if !new_gen || page.unpin_busy_cnt >= self.params.unpin_gen_threshold {
                                not_unpinned_old_gen_pages.push_back(page.gfn);
                            } else {
                                not_unpinned_new_gen_pages.push_back(page);
                            }
                        }
                    }
                }
                UnpinResult::FailedUnpin | UnpinResult::UnpinParked => {
                    // Put the page back into the generation it came from.
                    if let Some(page) = pinned_page {
                        if new_gen {
                            not_unpinned_new_gen_pages.push_back(page);
                        } else {
                            not_unpinned_old_gen_pages.push_back(page.gfn);
                        }
                    }
                    // Block this thread until it is unparked before trying the next page.
                    if result == UnpinResult::UnpinParked {
                        thread::park();
                    }
                }
            }
        }
        // Re-queue the pages that are still pinned.
        if !not_unpinned_new_gen_pages.is_empty() {
            let mut pinstate = self.pinstate.lock();
            pinstate
                .new_gen_pinned_pages
                .append(&mut not_unpinned_new_gen_pages);
        }
        if !not_unpinned_old_gen_pages.is_empty() {
            let mut pinstate = self.pinstate.lock();
            pinstate
                .old_gen_pinned_pages
                .append(&mut not_unpinned_old_gen_pages);
        }
        unpinned_count
    }
    /// Runs one LRU unpin pass.
    ///
    /// When the generation threshold is non-zero, every `unpin_gen_threshold`-th pass first
    /// scans the old-generation queue. The new-generation queue is then scanned with
    /// whatever is left of `unpin_limit` after the old-generation scan.
    fn lru_unpin_pages(&mut self) {
        let threshold = self.params.unpin_gen_threshold;
        let mut old_gen_unpinned = 0;
        if threshold != 0 {
            self.unpin_gen_threshold += 1;
            if self.unpin_gen_threshold == threshold {
                self.unpin_gen_threshold = 0;
                old_gen_unpinned = self.lru_unpin_pages_in_loop(self.params.unpin_limit, false);
            }
        }
        let remaining_limit = self
            .params
            .unpin_limit
            .map(|limit| limit.saturating_sub(old_gen_unpinned));
        self.lru_unpin_pages_in_loop(remaining_limit, true);
    }
    fn unpin_pages_in_range(&self, gfn: u64, count: u64) -> bool {
        let mut dtt_iter: DTTIter = Default::default();
        let mut index = 0;
        while index != count {
            let mut pinstate = self.pinstate.lock();
            let result = unpin_page(
                &mut pinstate,
                &self.vfio_container,
                &self.mem,
                self.dtt_level,
                self.dtt_root,
                &mut dtt_iter,
                gfn + index,
                true,
            );
            drop(pinstate);
            match result {
                UnpinResult::Unpinned | UnpinResult::NotPinned => {}
                UnpinResult::UnpinParked => {
                    thread::park();
                    continue;
                }
                _ => {
                    error!("coiommu: force unpin failed by {:?}", result);
                    return false;
                }
            }
            index += 1;
        }
        true
    }
}
/// The CoIommu PCI device.
///
/// NOTE(review): most of the code that uses these fields lies outside this part of the file;
/// the field notes below are limited to what the types and `CoIommuDev::new` show.
pub struct CoIommuDev {
    // PCI configuration space, created with the CoIommu vendor/device IDs.
    config_regs: PciConfiguration,
    // PCI address of the device, if one has been set.
    pci_address: Option<PciAddress>,
    // Guest memory.
    mem: GuestMemory,
    // Register state (`dtt_root`, `cmd`, `dtt_level`).
    coiommu_reg: CoIommuReg,
    // Endpoint identifiers (bdf); also written into the topology map by `new`.
    endpoints: Vec<u16>,
    // Descriptor and mapping of the notify map, plus its address once known.
    notifymap_mem: SafeDescriptor,
    notifymap_mmap: Arc<MemoryMapping>,
    notifymap_addr: Option<u64>,
    // Descriptor of the topology map, plus its address once known.
    topologymap_mem: SafeDescriptor,
    topologymap_addr: Option<u64>,
    // Presumably whether the notify/topology maps are currently mapped — confirm.
    mmapped: bool,
    // Client used to issue VM memory requests.
    vm_memory_client: VmMemoryClient,
    // Worker threads, present while running.
    pin_thread: Option<WorkerThread<PinWorker>>,
    unpin_thread: Option<WorkerThread<UnpinWorker>>,
    // Tube handed to the unpin worker for explicit unpin requests, if any.
    unpin_tube: Option<Tube>,
    // Notification events; `new` creates one per vcpu.
    ioevents: Vec<Event>,
    // VFIO container used for mapping and unmapping pinned pages.
    vfio_container: Arc<Mutex<VfioContainer>>,
    // Pin bookkeeping shared with the workers.
    pinstate: Arc<Mutex<CoIommuPinState>>,
    // User-supplied device parameters.
    params: CoIommuParameters,
}
impl CoIommuDev {
    pub fn new(
        mem: GuestMemory,
        vfio_container: Arc<Mutex<VfioContainer>>,
        vm_memory_client: VmMemoryClient,
        unpin_tube: Option<Tube>,
        endpoints: Vec<u16>,
        vcpu_count: u64,
        params: CoIommuParameters,
    ) -> Result<Self> {
        let config_regs = PciConfiguration::new(
            PCI_VENDOR_ID_COIOMMU,
            PCI_DEVICE_ID_COIOMMU,
            PciClassCode::Other,
            &PciOtherSubclass::Other,
            None, PciHeaderType::Device,
            PCI_VENDOR_ID_COIOMMU,
            PCI_DEVICE_ID_COIOMMU,
            COIOMMU_REVISION_ID,
        );
        let notifymap_mem = SharedMemory::new("coiommu_notifymap", COIOMMU_NOTIFYMAP_SIZE as u64)
            .context(Error::CreateSharedMemory)?;
        let notifymap_mmap = Arc::new(
            MemoryMappingBuilder::new(COIOMMU_NOTIFYMAP_SIZE)
                .from_shared_memory(¬ifymap_mem)
                .offset(0)
                .build()?,
        );
        let topologymap_mem =
            SharedMemory::new("coiommu_topologymap", COIOMMU_TOPOLOGYMAP_SIZE as u64)
                .context(Error::CreateSharedMemory)?;
        let topologymap_mmap = Arc::new(
            MemoryMappingBuilder::new(COIOMMU_TOPOLOGYMAP_SIZE)
                .from_shared_memory(&topologymap_mem)
                .offset(0)
                .build()?,
        );
        ensure!(
            (endpoints.len() + 1) * mem::size_of::<u16>() <= COIOMMU_TOPOLOGYMAP_SIZE,
            "Coiommu: too many endpoints"
        );
        topologymap_mmap.write_obj::<u16>(endpoints.len() as u16, 0)?;
        for (index, endpoint) in endpoints.iter().enumerate() {
            topologymap_mmap.write_obj::<u16>(*endpoint, (index + 1) * mem::size_of::<u16>())?;
        }
        let mut ioevents = Vec::new();
        for _ in 0..vcpu_count {
            ioevents.push(Event::new().context("CoIommu failed to create event fd")?);
        }
        Ok(Self {
            config_regs,
            pci_address: None,
            mem,
            coiommu_reg: Default::default(),
            endpoints,
            notifymap_mem: notifymap_mem.into(),
            notifymap_mmap,
            notifymap_addr: None,
            topologymap_mem: topologymap_mem.into(),
            topologymap_addr: None,
            mmapped: false,
            vm_memory_client,
            pin_thread: None,
            unpin_thread: None,
            unpin_tube,
            ioevents,
            vfio_container,
            pinstate: Arc::new(Mutex::new(CoIommuPinState {
                new_gen_pinned_pages: VecDeque::new(),
                old_gen_pinned_pages: VecDeque::new(),
                unpin_thread_state: UnpinThreadState::Unparked,
                unpin_park_count: 0,
            })),
            params,
        })
    }
    /// Registers `size` bytes of `descriptor`, starting at `offset`, as guest memory at
    /// guest physical address `gpa` with protection `prot` and a cache-coherent mapping.
    ///
    /// The region handle returned by the memory client is discarded.
    ///
    /// # Errors
    ///
    /// Returns the memory client's error, with context, if registration fails.
    fn register_mmap(
        &self,
        descriptor: SafeDescriptor,
        size: usize,
        offset: u64,
        gpa: u64,
        prot: Protection,
    ) -> Result<()> {
        let source = VmMemorySource::Descriptor {
            descriptor,
            offset,
            size: size as u64,
        };
        let destination = VmMemoryDestination::GuestPhysicalAddress(gpa);
        self.vm_memory_client
            .register_memory(source, destination, prot, MemCacheType::CacheCoherent)
            .context("register_mmap register_memory failed")
            .map(|_region| ())
    }
    /// Maps the notify map (read-write) and the topology map (read-only) into the guest
    /// at their BAR addresses, if those addresses have been assigned.
    ///
    /// Does nothing once it has completed, as tracked by `self.mmapped`.
    ///
    /// # Panics
    ///
    /// Panics if a descriptor cannot be cloned or if either registration fails.
    fn mmap(&mut self) {
        if self.mmapped {
            return;
        }

        if let Some(gpa) = self.notifymap_addr {
            let descriptor = self.notifymap_mem.try_clone().unwrap();
            let mapped = self.register_mmap(
                descriptor,
                COIOMMU_NOTIFYMAP_SIZE,
                0,
                gpa,
                Protection::read_write(),
            );
            if let Err(e) = mapped {
                panic!("{}: map notifymap failed: {}", self.debug_label(), e);
            }
        }

        if let Some(gpa) = self.topologymap_addr {
            let descriptor = self.topologymap_mem.try_clone().unwrap();
            let mapped = self.register_mmap(
                descriptor,
                COIOMMU_TOPOLOGYMAP_SIZE,
                0,
                gpa,
                Protection::read(),
            );
            if let Err(e) = mapped {
                panic!("{}: map topologymap failed: {}", self.debug_label(), e);
            }
        }

        self.mmapped = true;
    }
    /// Starts the pin and unpin worker threads, skipping any that is already present.
    fn start_workers(&mut self) {
        let needs_pin_worker = self.pin_thread.is_none();
        if needs_pin_worker {
            self.start_pin_thread();
        }

        let needs_unpin_worker = self.unpin_thread.is_none();
        if needs_unpin_worker {
            self.start_unpin_thread();
        }
    }
    /// Registers the notify ioevents and spawns the `coiommu_pin` worker thread.
    ///
    /// Event `i` is registered at `BAR address + size_of::<CoIommuReg>() + i`, i.e. one
    /// byte per event directly after the register block, matching writes of any length.
    /// The worker receives its own clones of the events plus snapshots of `dtt_root` and
    /// `dtt_level` as they are at the time of the call.
    ///
    /// # Panics
    ///
    /// Panics if an event cannot be cloned or an ioevent cannot be registered.
    fn start_pin_thread(&mut self) {
        // Clones owned by the worker; made before anything is registered.
        let worker_events: Vec<Event> = self
            .ioevents
            .iter()
            .map(|evt| evt.try_clone().unwrap())
            .collect();

        let notify_base =
            self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR) + mem::size_of::<CoIommuReg>() as u64;
        for (slot, evt) in self.ioevents.iter().enumerate() {
            let registered_evt = evt.try_clone().expect("failed to clone event");
            self.vm_memory_client
                .register_io_event(
                    registered_evt,
                    notify_base + slot as u64,
                    Datamatch::AnyLength,
                )
                .expect("failed to register ioevent");
        }

        let mut worker = PinWorker {
            mem: self.mem.clone(),
            endpoints: self.endpoints.clone(),
            notifymap_mmap: self.notifymap_mmap.clone(),
            dtt_root: self.coiommu_reg.dtt_root,
            dtt_level: self.coiommu_reg.dtt_level,
            ioevents: worker_events,
            vfio_container: self.vfio_container.clone(),
            pinstate: self.pinstate.clone(),
            params: self.params,
        };
        let handle = WorkerThread::start("coiommu_pin", move |kill_evt| {
            worker.run(kill_evt);
            // Hand the worker back so its state is available when the thread is joined.
            worker
        });
        self.pin_thread = Some(handle);
    }
    /// Spawns the `coiommu_unpin` worker thread.
    ///
    /// The worker takes ownership of `self.unpin_tube` (leaving `None` behind) and gets
    /// snapshots of `dtt_root` and `dtt_level` as they are at the time of the call. Its
    /// `unpin_gen_threshold` starts at zero.
    fn start_unpin_thread(&mut self) {
        let mut worker = UnpinWorker {
            mem: self.mem.clone(),
            dtt_level: self.coiommu_reg.dtt_level,
            dtt_root: self.coiommu_reg.dtt_root,
            vfio_container: self.vfio_container.clone(),
            unpin_tube: self.unpin_tube.take(),
            pinstate: self.pinstate.clone(),
            params: self.params,
            unpin_gen_threshold: 0,
        };
        let handle = WorkerThread::start("coiommu_unpin", move |kill_evt| {
            worker.run(kill_evt);
            // Hand the worker back so its state is available when the thread is joined.
            worker
        });
        self.unpin_thread = Some(handle);
    }
    /// Allocates `size` bytes of prefetchable MMIO space, aligned to `size`, for BAR
    /// `bar_num` of the device at `address`, and records it in the PCI configuration as a
    /// 64-bit prefetchable memory BAR.
    ///
    /// `name` is used as the allocation tag. Returns the allocated address.
    ///
    /// # Errors
    ///
    /// * `PciDeviceError::IoAllocationFailed` if the MMIO allocation fails.
    /// * `PciDeviceError::IoRegistrationFailed` if the BAR cannot be added to the
    ///   configuration space.
    fn allocate_bar_address(
        &mut self,
        resources: &mut SystemAllocator,
        address: PciAddress,
        size: u64,
        bar_num: u8,
        name: &str,
    ) -> PciResult<u64> {
        let owner = Alloc::PciBar {
            bus: address.bus,
            dev: address.dev,
            func: address.func,
            bar: bar_num,
        };
        let addr = resources
            .allocate_mmio(
                size,
                owner,
                name.to_string(),
                AllocOptions::new().prefetchable(true).align(size),
            )
            .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;

        self.config_regs
            .add_pci_bar(
                PciBarConfiguration::new(
                    bar_num as usize,
                    size,
                    PciBarRegionType::Memory64BitRegion,
                    PciBarPrefetchable::Prefetchable,
                )
                .set_address(addr),
            )
            .map_err(|e| PciDeviceError::IoRegistrationFailed(addr, e))?;

        Ok(addr)
    }
    /// Handles a guest read from the register block at the start of the MMIO BAR.
    ///
    /// Only 8-byte reads at 8-byte aligned offsets inside `CoIommuReg` are served:
    /// register 0 is `dtt_root`, register 1 is `cmd`, register 2 is `dtt_level`. The
    /// value is returned in native byte order. Any other access leaves `data` untouched;
    /// out-of-range and misaligned accesses are also logged.
    fn read_mmio(&mut self, offset: u64, data: &mut [u8]) {
        let reg_block_len = mem::size_of::<CoIommuReg>() as u64;
        if offset >= reg_block_len {
            error!(
                "{}: read_mmio: invalid offset 0x{:x}",
                self.debug_label(),
                offset
            );
            return;
        }

        let is_aligned_u64_access = offset % 8 == 0 && data.len() == 8;
        if !is_aligned_u64_access {
            error!(
                "{}: read_mmio: unaligned accessing: offset 0x{:x} actual len {} expect len 8",
                self.debug_label(),
                offset,
                data.len()
            );
            return;
        }

        let regs = &self.coiommu_reg;
        let value = match offset / 8 {
            0 => regs.dtt_root,
            1 => regs.cmd,
            2 => regs.dtt_level,
            _ => return,
        };
        data.copy_from_slice(&value.to_ne_bytes());
    }
    fn write_mmio(&mut self, offset: u64, data: &[u8]) {
        let mmio_len = mem::size_of::<CoIommuReg>() as u64;
        if offset >= mmio_len {
            if data.len() != 1 {
                error!(
                    "{}: write_mmio: unaligned accessing: offset 0x{:x} actual len {} expect len 1",
                    self.debug_label(),
                    offset,
                    data.len()
                );
                return;
            }
            let index = (offset - mmio_len) as usize;
            if let Some(event) = self.ioevents.get(index) {
                let _ = event.signal();
            } else {
                self.notifymap_mmap
                    .write_obj::<u64>(0, index * mem::size_of::<u64>())
                    .unwrap();
                error!(
                    "{}: No page will be pinned as driver is accessing unused trigger register: offset 0x{:x}",
                    self.debug_label(),
                    offset
                );
            }
            return;
        }
        if offset % 8 != 0 || data.len() != 8 {
            error!(
                "{}: write_mmio: unaligned accessing: offset 0x{:x} actual len {} expect len 8",
                self.debug_label(),
                offset,
                data.len()
            );
            return;
        }
        let index = offset / 8;
        let v = u64::from_ne_bytes(data.try_into().unwrap());
        match index {
            0 => {
                if self.coiommu_reg.dtt_root == 0 {
                    self.coiommu_reg.dtt_root = v;
                }
            }
            1 => match v {
                COIOMMU_CMD_DEACTIVATE => {
                    panic!("{}: Deactivate is not supported", self.debug_label())
                }
                COIOMMU_CMD_ACTIVATE => {
                    if self.coiommu_reg.dtt_root != 0 && self.coiommu_reg.dtt_level != 0 {
                        self.start_workers();
                    }
                }
                COIOMMU_CMD_PARK_UNPIN => {
                    let mut pinstate = self.pinstate.lock();
                    pinstate.unpin_thread_state = UnpinThreadState::Parked;
                    if let Some(v) = pinstate.unpin_park_count.checked_add(1) {
                        pinstate.unpin_park_count = v;
                    } else {
                        panic!("{}: Park request overflowing", self.debug_label());
                    }
                }
                COIOMMU_CMD_UNPARK_UNPIN => {
                    let mut pinstate = self.pinstate.lock();
                    if pinstate.unpin_thread_state == UnpinThreadState::Parked {
                        if let Some(v) = pinstate.unpin_park_count.checked_sub(1) {
                            pinstate.unpin_park_count = v;
                            if pinstate.unpin_park_count == 0 {
                                if let Some(worker_thread) = &self.unpin_thread {
                                    worker_thread.thread().unpark();
                                }
                                pinstate.unpin_thread_state = UnpinThreadState::Unparked;
                            }
                        } else {
                            error!("{}: Park count is already reached to 0", self.debug_label());
                        }
                    }
                }
                _ => {}
            },
            2 => {
                if self.coiommu_reg.dtt_level == 0 {
                    self.coiommu_reg.dtt_level = v;
                }
            }
            _ => {}
        }
    }
}
impl PciDevice for CoIommuDev {
    /// Returns the label used for this device in log and panic messages.
    fn debug_label(&self) -> String {
        String::from("CoIommu")
    }
    /// Returns the device's PCI address, allocating one on first use.
    ///
    /// An address that was allocated earlier is returned without touching `resources`.
    ///
    /// # Errors
    ///
    /// Returns `PciDeviceError::PciAllocationFailed` if no address could be allocated.
    fn allocate_address(&mut self, resources: &mut SystemAllocator) -> PciResult<PciAddress> {
        match self.pci_address {
            Some(address) => Ok(address),
            None => {
                // NOTE(review): the first argument is presumably the bus number; confirm
                // against `SystemAllocator::allocate_pci`.
                self.pci_address = resources.allocate_pci(0, self.debug_label());
                self.pci_address.ok_or(PciDeviceError::PciAllocationFailed)
            }
        }
    }
    /// Allocates the MMIO register BAR and returns its range.
    ///
    /// # Panics
    ///
    /// Panics if `allocate_address` has not assigned a PCI address yet.
    ///
    /// # Errors
    ///
    /// Propagates the error from `allocate_bar_address`.
    fn allocate_io_bars(&mut self, resources: &mut SystemAllocator) -> PciResult<Vec<BarRange>> {
        let address = self
            .pci_address
            .expect("allocate_address must be called prior to allocate_io_bars");

        let addr = self.allocate_bar_address(
            resources,
            address,
            COIOMMU_MMIO_BAR_SIZE,
            COIOMMU_MMIO_BAR as u8,
            "coiommu-mmiobar",
        )?;

        Ok(vec![BarRange {
            addr,
            size: COIOMMU_MMIO_BAR_SIZE,
            prefetchable: false,
        }])
    }
    /// Allocates the topology map BAR and then the notify map BAR, remembers their guest
    /// addresses for `mmap`, and returns both ranges in that order.
    ///
    /// If the second allocation fails, the topology map address stays recorded.
    ///
    /// # Panics
    ///
    /// Panics if `allocate_address` has not assigned a PCI address yet.
    ///
    /// # Errors
    ///
    /// Propagates the error from `allocate_bar_address`.
    fn allocate_device_bars(
        &mut self,
        resources: &mut SystemAllocator,
    ) -> PciResult<Vec<BarRange>> {
        let address = self
            .pci_address
            .expect("allocate_address must be called prior to allocate_device_bars");

        let topology_size = COIOMMU_TOPOLOGYMAP_SIZE as u64;
        let topologymap_addr = self.allocate_bar_address(
            resources,
            address,
            topology_size,
            COIOMMU_TOPOLOGYMAP_BAR,
            "coiommu-topology",
        )?;
        self.topologymap_addr = Some(topologymap_addr);

        let notify_size = COIOMMU_NOTIFYMAP_SIZE as u64;
        let notifymap_addr = self.allocate_bar_address(
            resources,
            address,
            notify_size,
            COIOMMU_NOTIFYMAP_BAR as u8,
            "coiommu-notifymap",
        )?;
        self.notifymap_addr = Some(notifymap_addr);

        Ok(vec![
            BarRange {
                addr: topologymap_addr,
                size: topology_size,
                prefetchable: false,
            },
            BarRange {
                addr: notifymap_addr,
                size: notify_size,
                prefetchable: false,
            },
        ])
    }
    /// Returns the configuration space register at index `reg_idx`.
    fn read_config_register(&self, reg_idx: usize) -> u32 {
        let regs = &self.config_regs;
        regs.read_reg(reg_idx)
    }
    /// Applies a configuration space write.
    ///
    /// Before the write is stored, a 2-byte write to the command register whose low byte
    /// has the memory-space bit set triggers `mmap`, unless the regions are already
    /// mapped. Writes of any other length are not inspected.
    fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
        let memory_space_bit = COMMAND_REG_MEMORY_SPACE_MASK as u8;
        let enables_memory_space = reg_idx == COMMAND_REG
            && matches!(data, [low, _] if low & memory_space_bit != 0);

        if enables_memory_space && !self.mmapped {
            self.mmap();
        }

        self.config_regs.write_reg(reg_idx, offset, data);
    }
    fn keep_rds(&self) -> Vec<RawDescriptor> {
        let mut rds = vec![
            self.vfio_container.lock().as_raw_descriptor(),
            self.vm_memory_client.as_raw_descriptor(),
            self.notifymap_mem.as_raw_descriptor(),
            self.topologymap_mem.as_raw_descriptor(),
        ];
        if let Some(unpin_tube) = &self.unpin_tube {
            rds.push(unpin_tube.as_raw_descriptor());
        }
        rds.extend(self.ioevents.iter().map(Event::as_raw_descriptor));
        rds
    }
    fn read_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &mut [u8]) {
        match bar_index {
            COIOMMU_MMIO_BAR => self.read_mmio(offset, data),
            COIOMMU_NOTIFYMAP_BAR => {
                }
            _ => {}
        }
    }
    fn write_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &[u8]) {
        match bar_index {
            COIOMMU_MMIO_BAR => self.write_mmio(offset, data),
            COIOMMU_NOTIFYMAP_BAR => {
                }
            _ => {}
        }
    }
    /// Returns the configuration of BAR `bar_num` as recorded in the PCI configuration
    /// space, or `None` if the configuration space reports none for that index.
    fn get_bar_configuration(&self, bar_num: usize) -> Option<PciBarConfiguration> {
        let regs = &self.config_regs;
        regs.get_bar_configuration(bar_num)
    }
}
// No `Suspendable` method is overridden for this device, so the trait's provided
// defaults apply. NOTE(review): what those defaults do is not visible from here; confirm
// that they are appropriate for a device that owns worker threads.
impl Suspendable for CoIommuDev {}