use std::alloc::Layout;
use std::cell::Cell;
use std::cell::RefCell;
use std::cmp;
use std::cmp::min;
use std::cmp::Ord;
use std::cmp::PartialEq;
use std::cmp::PartialOrd;
use std::collections::btree_set::BTreeSet;
use std::fs::File;
use std::io::Read;
use std::io::Write;
use std::mem;
use std::sync::Arc;
use std::sync::RwLock;
use base::error;
use base::LayoutAllocation;
use kvm::CpuId;
use kvm::Vcpu;
use kvm_sys::kvm_debugregs;
use kvm_sys::kvm_enable_cap;
use kvm_sys::kvm_fpu;
use kvm_sys::kvm_lapic_state;
use kvm_sys::kvm_mp_state;
use kvm_sys::kvm_msr_entry;
use kvm_sys::kvm_msrs;
use kvm_sys::kvm_regs;
use kvm_sys::kvm_sregs;
use kvm_sys::kvm_vcpu_events;
use kvm_sys::kvm_xcrs;
use kvm_sys::KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
use libc::EINVAL;
use libc::ENOENT;
use libc::ENOTTY;
use libc::EPERM;
use libc::EPIPE;
use libc::EPROTO;
use protobuf::CodedOutputStream;
use protobuf::EnumOrUnknown;
use protobuf::Message;
use protos::plugin::*;
use static_assertions::const_assert;
use sync::Mutex;
use zerocopy::AsBytes;
use zerocopy::FromBytes;
use super::*;
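/// The guest address space that an I/O access targets: x86 port I/O or memory-mapped I/O.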
#[derive(Copy, Clone, PartialEq, Eq)]
pub enum IoSpace {
Ioport,
Mmio,
}
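/// A reserved region of guest I/O space: `(start, length, async_write)`.
///
/// Equality and ordering consider only the start address, so a `BTreeSet<Range>` can be
/// searched by address alone; that is why the trait impls below are written by hand instead of
/// derived.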
#[derive(Debug, Copy, Clone)]
struct Range(u64, u64, bool);
impl Eq for Range {}
impl PartialEq for Range {
fn eq(&self, other: &Range) -> bool {
self.0 == other.0
}
}
impl Ord for Range {
fn cmp(&self, other: &Range) -> cmp::Ordering {
self.0.cmp(&other.0)
}
}
impl PartialOrd for Range {
fn partial_cmp(&self, other: &Range) -> Option<cmp::Ordering> {
Some(self.cmp(other))
}
}
fn get_vcpu_state_enum_or_unknown(
vcpu: &Vcpu,
state_set: EnumOrUnknown<vcpu_request::StateSet>,
) -> SysResult<Vec<u8>> {
get_vcpu_state(
vcpu,
state_set.enum_value().map_err(|_| SysError::new(EINVAL))?,
)
}
fn get_vcpu_state(vcpu: &Vcpu, state_set: vcpu_request::StateSet) -> SysResult<Vec<u8>> {
Ok(match state_set {
vcpu_request::StateSet::REGS => vcpu.get_regs()?.as_bytes().to_vec(),
vcpu_request::StateSet::SREGS => vcpu.get_sregs()?.as_bytes().to_vec(),
vcpu_request::StateSet::FPU => vcpu.get_fpu()?.as_bytes().to_vec(),
vcpu_request::StateSet::DEBUGREGS => vcpu.get_debugregs()?.as_bytes().to_vec(),
vcpu_request::StateSet::XCREGS => vcpu.get_xcrs()?.as_bytes().to_vec(),
vcpu_request::StateSet::LAPIC => vcpu.get_lapic()?.as_bytes().to_vec(),
vcpu_request::StateSet::MP => vcpu.get_mp_state()?.as_bytes().to_vec(),
vcpu_request::StateSet::EVENTS => vcpu.get_vcpu_events()?.as_bytes().to_vec(),
})
}
fn set_vcpu_state_enum_or_unknown(
vcpu: &Vcpu,
state_set: EnumOrUnknown<vcpu_request::StateSet>,
state: &[u8],
) -> SysResult<()> {
set_vcpu_state(
vcpu,
state_set.enum_value().map_err(|_| SysError::new(EINVAL))?,
state,
)
}
fn set_vcpu_state(vcpu: &Vcpu, state_set: vcpu_request::StateSet, state: &[u8]) -> SysResult<()> {
match state_set {
vcpu_request::StateSet::REGS => {
let regs = kvm_regs::read_from(state).ok_or(SysError::new(EINVAL))?;
            vcpu.set_regs(&regs)
}
vcpu_request::StateSet::SREGS => {
let sregs = kvm_sregs::read_from(state).ok_or(SysError::new(EINVAL))?;
vcpu.set_sregs(&sregs)
}
vcpu_request::StateSet::FPU => {
let fpu = kvm_fpu::read_from(state).ok_or(SysError::new(EINVAL))?;
vcpu.set_fpu(&fpu)
}
vcpu_request::StateSet::DEBUGREGS => {
let debugregs = kvm_debugregs::read_from(state).ok_or(SysError::new(EINVAL))?;
vcpu.set_debugregs(&debugregs)
}
vcpu_request::StateSet::XCREGS => {
let xcrs = kvm_xcrs::read_from(state).ok_or(SysError::new(EINVAL))?;
vcpu.set_xcrs(&xcrs)
}
vcpu_request::StateSet::LAPIC => {
let lapic_state = kvm_lapic_state::read_from(state).ok_or(SysError::new(EINVAL))?;
vcpu.set_lapic(&lapic_state)
}
vcpu_request::StateSet::MP => {
let mp_state = kvm_mp_state::read_from(state).ok_or(SysError::new(EINVAL))?;
vcpu.set_mp_state(&mp_state)
}
vcpu_request::StateSet::EVENTS => {
let vcpu_events = kvm_vcpu_events::read_from(state).ok_or(SysError::new(EINVAL))?;
vcpu.set_vcpu_events(&vcpu_events)
}
}
}
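/// The register values a call hint must match, and the extra VCPU state to send when it does.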
pub struct CallHintDetails {
pub match_rax: bool,
pub match_rbx: bool,
pub match_rcx: bool,
pub match_rdx: bool,
pub rax: u64,
pub rbx: u64,
pub rcx: u64,
pub rdx: u64,
pub send_sregs: bool,
pub send_debugregs: bool,
}
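/// An I/O address the plugin asked to treat as a call, with per-entry register matchers that
/// decide what extra VCPU state accompanies the exit.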
pub struct CallHint {
io_space: IoSpace,
addr: u64,
on_write: bool,
regs: Vec<CallHintDetails>,
}
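/// State shared between all VCPUs: the reserved regions of each I/O space and the optional call
/// hint.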
#[derive(Default)]
pub struct SharedVcpuState {
ioport_regions: BTreeSet<Range>,
mmio_regions: BTreeSet<Range>,
hint: Option<CallHint>,
}
impl SharedVcpuState {
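    /// Reserves the given range for handling by the plugin process.
    ///
    /// Fails with `EINVAL` if the range is empty or wraps the address space, and with `EPERM` if
    /// it overlaps an existing reservation.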
pub fn reserve_range(
&mut self,
space: IoSpace,
start: u64,
length: u64,
async_write: bool,
) -> SysResult<()> {
if length == 0 {
return Err(SysError::new(EINVAL));
}
if self.is_reserved(space, start) {
return Err(SysError::new(EPERM));
}
let last_address = match start.checked_add(length) {
Some(end) => end - 1,
None => return Err(SysError::new(EINVAL)),
};
let space = match space {
IoSpace::Ioport => &mut self.ioport_regions,
IoSpace::Mmio => &mut self.mmio_regions,
};
        // Reject the new range if an existing reservation begins anywhere inside it. The bound
        // must be inclusive: a range starting exactly at `last_address` also overlaps.
        match space
            .range(..=Range(last_address, 0, false))
.next_back()
.cloned()
{
Some(Range(existing_start, _, _)) if existing_start >= start => {
Err(SysError::new(EPERM))
}
_ => {
space.insert(Range(start, length, async_write));
Ok(())
}
}
}
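    /// Removes the reservation that starts exactly at `start`, failing with `ENOENT` if no such
    /// reservation exists.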
pub fn unreserve_range(&mut self, space: IoSpace, start: u64) -> SysResult<()> {
let range = Range(start, 0, false);
let space = match space {
IoSpace::Ioport => &mut self.ioport_regions,
IoSpace::Mmio => &mut self.mmio_regions,
};
if space.remove(&range) {
Ok(())
} else {
Err(SysError::new(ENOENT))
}
}
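    /// Sets the call hint shared by all VCPUs, or clears it if `addr` is zero.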
pub fn set_hint(
&mut self,
space: IoSpace,
addr: u64,
on_write: bool,
regs: Vec<CallHintDetails>,
) {
if addr == 0 {
self.hint = None;
} else {
let hint = CallHint {
io_space: space,
addr,
on_write,
regs,
};
self.hint = Some(hint);
}
}
fn is_reserved(&self, space: IoSpace, addr: u64) -> bool {
if let Some(Range(start, len, _)) = self.first_before(space, addr) {
let offset = addr - start;
if offset < len {
return true;
}
}
false
}
fn first_before(&self, io_space: IoSpace, addr: u64) -> Option<Range> {
let space = match io_space {
IoSpace::Ioport => &self.ioport_regions,
IoSpace::Mmio => &self.mmio_regions,
};
match addr.checked_add(1) {
Some(next_addr) => space
.range(..Range(next_addr, 0, false))
.next_back()
.cloned(),
None => None,
}
}
fn matches_hint(&self, io_space: IoSpace, addr: u64, is_write: bool) -> bool {
if let Some(hint) = &self.hint {
return io_space == hint.io_space && addr == hint.addr && is_write == hint.on_write;
}
false
}
fn check_hint_details(&self, regs: &kvm_regs) -> (bool, bool) {
if let Some(hint) = &self.hint {
for entry in hint.regs.iter() {
if (!entry.match_rax || entry.rax == regs.rax)
&& (!entry.match_rbx || entry.rbx == regs.rbx)
&& (!entry.match_rcx || entry.rcx == regs.rcx)
&& (!entry.match_rdx || entry.rdx == regs.rdx)
{
return (entry.send_sregs, entry.send_debugregs);
}
}
}
(false, false)
}
}
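/// State private to a single VCPU: an optional pause request carrying user data for the plugin.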
#[derive(Default)]
pub struct PerVcpuState {
pause_request: Option<u64>,
}
impl PerVcpuState {
pub fn request_pause(&mut self, data: u64) {
self.pause_request = Some(data);
}
}
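/// The data payload of an I/O exit: a buffer to fill for reads, or the written bytes for writes.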
enum VcpuRunData<'a> {
Read(&'a mut [u8]),
Write(&'a [u8]),
}
impl<'a> VcpuRunData<'a> {
fn is_write(&self) -> bool {
matches!(self, VcpuRunData::Write(_))
}
fn as_slice(&self) -> &[u8] {
match self {
VcpuRunData::Read(s) => s,
VcpuRunData::Write(s) => s,
}
}
    fn copy_from_slice(&mut self, data: &[u8]) {
        if let VcpuRunData::Read(s) = self {
            // Copy only as many bytes as both slices can hold; `copy_from_slice` panics unless
            // the source and destination lengths match exactly.
            let copy_size = min(s.len(), data.len());
            s[..copy_size].copy_from_slice(&data[..copy_size]);
        }
    }
}
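/// A VCPU's connection to the plugin process: length-delimited protobuf messages are exchanged
/// over a pair of pipes.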
pub struct PluginVcpu {
shared_vcpu_state: Arc<RwLock<SharedVcpuState>>,
per_vcpu_state: Arc<Mutex<PerVcpuState>>,
read_pipe: File,
write_pipe: File,
wait_reason: Cell<Option<vcpu_response::Wait>>,
request_buffer: RefCell<Vec<u8>>,
response_buffer: RefCell<Vec<u8>>,
}
impl PluginVcpu {
pub fn new(
shared_vcpu_state: Arc<RwLock<SharedVcpuState>>,
per_vcpu_state: Arc<Mutex<PerVcpuState>>,
read_pipe: File,
write_pipe: File,
) -> PluginVcpu {
PluginVcpu {
shared_vcpu_state,
per_vcpu_state,
read_pipe,
write_pipe,
wait_reason: Default::default(),
request_buffer: Default::default(),
response_buffer: Default::default(),
}
}
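    /// Tells the plugin that this VCPU has been created, blocking until the plugin resumes it.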
pub fn init(&self, vcpu: &Vcpu) -> SysResult<()> {
let mut wait_reason = vcpu_response::Wait::new();
wait_reason.mut_init();
self.wait_reason.set(Some(wait_reason));
self.handle_until_resume(vcpu)?;
Ok(())
}
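    /// Checks for a pending pause request before the VCPU runs and, if one is set, blocks until
    /// the plugin resumes the VCPU.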
pub fn pre_run(&self, vcpu: &Vcpu) -> SysResult<()> {
let request = {
let mut lock = self.per_vcpu_state.lock();
lock.pause_request.take()
};
if let Some(user_data) = request {
let mut wait_reason = vcpu_response::Wait::new();
wait_reason.mut_user().user = user_data;
self.wait_reason.set(Some(wait_reason));
self.handle_until_resume(vcpu)?;
}
Ok(())
}
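    /// Handles an access to `addr`, returning true if a plugin reservation covers it.
    ///
    /// For asynchronous writes, the wait message is sent without stopping the VCPU; otherwise
    /// the VCPU blocks until the plugin sends a resume request.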
fn process(&self, io_space: IoSpace, addr: u64, mut data: VcpuRunData, vcpu: &Vcpu) -> bool {
let vcpu_state_lock = match self.shared_vcpu_state.read() {
Ok(l) => l,
Err(e) => {
error!("error read locking shared cpu state: {}", e);
return false;
}
};
let first_before_addr = vcpu_state_lock.first_before(io_space, addr);
match first_before_addr {
Some(Range(start, len, async_write)) => {
let offset = addr - start;
if offset >= len {
return false;
}
if async_write && !data.is_write() {
return false;
}
let mut wait_reason = vcpu_response::Wait::new();
let io = wait_reason.mut_io();
io.space = match io_space {
IoSpace::Ioport => AddressSpace::IOPORT,
IoSpace::Mmio => AddressSpace::MMIO,
}
.into();
io.address = addr;
io.is_write = data.is_write();
io.data = data.as_slice().to_vec();
io.no_resume = async_write;
if !async_write && vcpu_state_lock.matches_hint(io_space, addr, io.is_write) {
if let Ok(regs) = vcpu.get_regs() {
                        let (has_sregs, has_debugregs) =
                            vcpu_state_lock.check_hint_details(&regs);
io.regs = regs.as_bytes().to_vec();
if has_sregs {
if let Ok(state) = get_vcpu_state(vcpu, vcpu_request::StateSet::SREGS) {
io.sregs = state;
}
}
if has_debugregs {
if let Ok(state) =
get_vcpu_state(vcpu, vcpu_request::StateSet::DEBUGREGS)
{
io.debugregs = state;
}
}
}
}
drop(vcpu_state_lock);
if async_write {
let mut response = VcpuResponse::new();
response.set_wait(wait_reason);
let mut response_buffer = self.response_buffer.borrow_mut();
response_buffer.clear();
let mut stream = CodedOutputStream::vec(&mut response_buffer);
match response.write_length_delimited_to(&mut stream) {
Ok(_) => {
if let Err(e) = stream.flush() {
error!("failed to flush to vec: {}", e);
}
drop(stream);
let mut write_pipe = &self.write_pipe;
if let Err(e) = write_pipe.write_all(&response_buffer) {
error!("failed to write to pipe: {}", e);
}
}
Err(e) => error!("failed to write to buffer: {}", e),
}
} else {
self.wait_reason.set(Some(wait_reason));
match self.handle_until_resume(vcpu) {
Ok(resume_data) => data.copy_from_slice(&resume_data),
Err(e) if e.errno() == EPIPE => {}
Err(e) => error!("failed to process vcpu requests: {}", e),
}
}
true
}
None => false,
}
}
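    // The four entry points below forward guest port I/O and MMIO accesses to `process` with
    // the matching `IoSpace` and read/write payload.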
pub fn io_read(&self, addr: u64, data: &mut [u8], vcpu: &Vcpu) -> bool {
self.process(IoSpace::Ioport, addr, VcpuRunData::Read(data), vcpu)
}
pub fn io_write(&self, addr: u64, data: &[u8], vcpu: &Vcpu) -> bool {
self.process(IoSpace::Ioport, addr, VcpuRunData::Write(data), vcpu)
}
pub fn mmio_read(&self, addr: u64, data: &mut [u8], vcpu: &Vcpu) -> bool {
self.process(IoSpace::Mmio, addr, VcpuRunData::Read(data), vcpu)
}
pub fn mmio_write(&self, addr: u64, data: &[u8], vcpu: &Vcpu) -> bool {
self.process(IoSpace::Mmio, addr, VcpuRunData::Write(data), vcpu)
}
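    /// Forwards a Hyper-V hypercall exit to the plugin, blocking until it resumes the VCPU and
    /// copying the resume payload back into `data`.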
pub fn hyperv_call(&self, input: u64, params: [u64; 2], data: &mut [u8], vcpu: &Vcpu) -> bool {
let mut wait_reason = vcpu_response::Wait::new();
let hv = wait_reason.mut_hyperv_call();
hv.input = input;
hv.params0 = params[0];
hv.params1 = params[1];
self.wait_reason.set(Some(wait_reason));
match self.handle_until_resume(vcpu) {
Ok(resume_data) => {
data.copy_from_slice(&resume_data);
true
}
Err(e) if e.errno() == EPIPE => false,
Err(e) => {
error!("failed to process hyperv call request: {}", e);
false
}
}
}
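    /// Forwards a Hyper-V synthetic interrupt controller (SynIC) change to the plugin, blocking
    /// until it resumes the VCPU.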
pub fn hyperv_synic(
&self,
msr: u32,
control: u64,
evt_page: u64,
msg_page: u64,
vcpu: &Vcpu,
) -> bool {
let mut wait_reason = vcpu_response::Wait::new();
let hv = wait_reason.mut_hyperv_synic();
hv.msr = msr;
hv.control = control;
hv.evt_page = evt_page;
hv.msg_page = msg_page;
self.wait_reason.set(Some(wait_reason));
match self.handle_until_resume(vcpu) {
Ok(_resume_data) => true,
Err(e) if e.errno() == EPIPE => false,
Err(e) => {
error!("failed to process hyperv synic request: {}", e);
false
}
}
}
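    /// Receives and handles a single request from the plugin, returning the resume payload once
    /// the plugin sends a resume request.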
fn handle_request(&self, vcpu: &Vcpu) -> SysResult<Option<Vec<u8>>> {
let mut wait_reason = self.wait_reason.take();
let mut do_recv = true;
let mut resume_data = None;
let mut response = VcpuResponse::new();
let mut send_response = true;
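        // A response is normally sent for every request received, but if a wait reason is
        // pending it is sent as the reply before the next request is received. The one
        // exception is the first-time init reason, which the plugin must explicitly ask for
        // with a wait request.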
if let Some(reason) = wait_reason {
if reason.has_init() {
wait_reason = Some(reason);
} else {
response.set_wait(reason);
do_recv = false;
wait_reason = None;
}
}
if do_recv {
let mut request_buffer = self.request_buffer.borrow_mut();
request_buffer.resize(MAX_VCPU_DATAGRAM_SIZE, 0);
let mut read_pipe = &self.read_pipe;
let msg_size = read_pipe.read(&mut request_buffer).map_err(io_to_sys_err)?;
let mut request: VcpuRequest =
Message::parse_from_bytes(&request_buffer[..msg_size]).map_err(proto_to_sys_err)?;
let res = if request.has_wait() {
match wait_reason {
Some(wait_reason) => {
response.set_wait(wait_reason);
Ok(())
}
None => Err(SysError::new(EPROTO)),
}
} else if wait_reason.is_some() {
self.wait_reason.set(wait_reason);
Err(SysError::new(EPROTO))
} else if request.has_resume() {
send_response = false;
let resume = request.take_resume();
if !resume.regs.is_empty() {
set_vcpu_state(vcpu, vcpu_request::StateSet::REGS, &resume.regs)?;
}
if !resume.sregs.is_empty() {
set_vcpu_state(vcpu, vcpu_request::StateSet::SREGS, &resume.sregs)?;
}
if !resume.debugregs.is_empty() {
set_vcpu_state(vcpu, vcpu_request::StateSet::DEBUGREGS, &resume.debugregs)?;
}
resume_data = Some(resume.data);
Ok(())
} else if request.has_get_state() {
let response_state = response.mut_get_state();
match get_vcpu_state_enum_or_unknown(vcpu, request.get_state().set) {
Ok(state) => {
response_state.state = state;
Ok(())
}
Err(e) => Err(e),
}
} else if request.has_set_state() {
response.mut_set_state();
let set_state = request.set_state();
set_vcpu_state_enum_or_unknown(vcpu, set_state.set, &set_state.state)
} else if request.has_get_hyperv_cpuid() {
let cpuid_response = &mut response.mut_get_hyperv_cpuid().entries;
match vcpu.get_hyperv_cpuid() {
Ok(mut cpuid) => {
for entry in cpuid.mut_entries_slice() {
cpuid_response.push(cpuid_kvm_to_proto(entry));
}
Ok(())
}
Err(e) => Err(e),
}
} else if request.has_get_msrs() {
let entry_data = &mut response.mut_get_msrs().entry_data;
let entry_indices = &request.get_msrs().entry_indices;
let mut msr_entries = Vec::with_capacity(entry_indices.len());
for &index in entry_indices {
msr_entries.push(kvm_msr_entry {
index,
..Default::default()
});
}
match vcpu.get_msrs(&mut msr_entries) {
Ok(()) => {
for msr_entry in msr_entries {
entry_data.push(msr_entry.data);
}
Ok(())
}
Err(e) => Err(e),
}
} else if request.has_set_msrs() {
const SIZE_OF_MSRS: usize = mem::size_of::<kvm_msrs>();
const SIZE_OF_ENTRY: usize = mem::size_of::<kvm_msr_entry>();
const ALIGN_OF_MSRS: usize = mem::align_of::<kvm_msrs>();
const_assert!(ALIGN_OF_MSRS >= mem::align_of::<kvm_msr_entry>());
response.mut_set_msrs();
let request_entries = &request.set_msrs().entries;
let size = SIZE_OF_MSRS + request_entries.len() * SIZE_OF_ENTRY;
let layout =
Layout::from_size_align(size, ALIGN_OF_MSRS).expect("impossible layout");
let mut allocation = LayoutAllocation::zeroed(layout);
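                // Safe because the allocation was just zeroed, all-zero is a valid bit pattern
                // for kvm_msrs, and no other references to the allocation exist.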
let kvm_msrs = unsafe { allocation.as_mut::<kvm_msrs>() };
unsafe {
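                    // Mapping the variable-length entries array to a slice is unsafe because
                    // the length is unchecked; it is valid here because the allocation was
                    // sized for exactly `request_entries.len()` entries.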
let kvm_msr_entries: &mut [kvm_msr_entry] =
kvm_msrs.entries.as_mut_slice(request_entries.len());
for (msr_entry, entry) in kvm_msr_entries.iter_mut().zip(request_entries) {
msr_entry.index = entry.index;
msr_entry.data = entry.data;
}
}
kvm_msrs.nmsrs = request_entries.len() as u32;
vcpu.set_msrs(kvm_msrs)
} else if request.has_set_cpuid() {
response.mut_set_cpuid();
let request_entries = &request.set_cpuid().entries;
let mut cpuid = CpuId::new(request_entries.len());
let cpuid_entries = cpuid.mut_entries_slice();
for (request_entry, cpuid_entry) in request_entries.iter().zip(cpuid_entries) {
cpuid_entry.function = request_entry.function;
if request_entry.has_index {
cpuid_entry.index = request_entry.index;
cpuid_entry.flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
}
cpuid_entry.eax = request_entry.eax;
cpuid_entry.ebx = request_entry.ebx;
cpuid_entry.ecx = request_entry.ecx;
cpuid_entry.edx = request_entry.edx;
}
vcpu.set_cpuid2(&cpuid)
} else if request.has_enable_capability() {
response.mut_enable_capability();
let capability = request.enable_capability().capability;
if capability != kvm_sys::KVM_CAP_HYPERV_SYNIC
&& capability != kvm_sys::KVM_CAP_HYPERV_SYNIC2
{
Err(SysError::new(EINVAL))
} else {
let cap = kvm_enable_cap {
cap: capability,
..Default::default()
};
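                    // Safe because the allowed capabilities do not change memory mappings.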
unsafe { vcpu.kvm_enable_cap(&cap) }
}
} else if request.has_shutdown() {
return Err(SysError::new(EPIPE));
} else {
Err(SysError::new(ENOTTY))
};
if let Err(e) = res {
response.errno = e.errno();
}
}
if send_response {
let mut response_buffer = self.response_buffer.borrow_mut();
response_buffer.clear();
{
let mut stream = CodedOutputStream::vec(&mut response_buffer);
response
.write_length_delimited_to(&mut stream)
.map_err(proto_to_sys_err)?;
stream.flush().map_err(proto_to_sys_err)?;
}
let mut write_pipe = &self.write_pipe;
write_pipe
.write(&response_buffer[..])
.map_err(io_to_sys_err)?;
}
Ok(resume_data)
}
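    /// Handles plugin requests in a loop until a resume request arrives, returning its data
    /// payload.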
fn handle_until_resume(&self, vcpu: &Vcpu) -> SysResult<Vec<u8>> {
loop {
if let Some(resume_data) = self.handle_request(vcpu)? {
return Ok(resume_data);
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn shared_vcpu_reserve() {
let mut shared_vcpu_state = SharedVcpuState::default();
shared_vcpu_state
.reserve_range(IoSpace::Ioport, 0x10, 0, false)
.unwrap_err();
shared_vcpu_state
.reserve_range(IoSpace::Ioport, 0x10, 0x10, false)
.unwrap();
shared_vcpu_state
.reserve_range(IoSpace::Ioport, 0x0f, 0x10, false)
.unwrap_err();
shared_vcpu_state
.reserve_range(IoSpace::Ioport, 0x10, 0x10, false)
.unwrap_err();
shared_vcpu_state
.reserve_range(IoSpace::Ioport, 0x10, 0x15, false)
.unwrap_err();
shared_vcpu_state
.reserve_range(IoSpace::Ioport, 0x12, 0x15, false)
.unwrap_err();
shared_vcpu_state
.reserve_range(IoSpace::Ioport, 0x12, 0x01, false)
.unwrap_err();
shared_vcpu_state
.reserve_range(IoSpace::Ioport, 0x0, 0x20, false)
.unwrap_err();
shared_vcpu_state
.reserve_range(IoSpace::Ioport, 0x20, 0x05, false)
.unwrap();
shared_vcpu_state
.reserve_range(IoSpace::Ioport, 0x25, 0x05, false)
.unwrap();
shared_vcpu_state
.reserve_range(IoSpace::Ioport, 0x0, 0x10, false)
.unwrap();
}
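    // The two tests below are additional sketches of the `SharedVcpuState` bookkeeping: ranges
    // are keyed by start address alone, and `is_reserved` includes both endpoints of a range.
    #[test]
    fn shared_vcpu_unreserve() {
        let mut shared_vcpu_state = SharedVcpuState::default();
        // Unreserving an address that was never reserved fails with ENOENT.
        shared_vcpu_state
            .unreserve_range(IoSpace::Ioport, 0x10)
            .unwrap_err();
        shared_vcpu_state
            .reserve_range(IoSpace::Ioport, 0x10, 0x10, false)
            .unwrap();
        // Removal is keyed on the start address alone, not the length.
        shared_vcpu_state
            .unreserve_range(IoSpace::Ioport, 0x10)
            .unwrap();
        // Once removed, the same range can be reserved again.
        shared_vcpu_state
            .reserve_range(IoSpace::Ioport, 0x10, 0x10, false)
            .unwrap();
    }
    #[test]
    fn shared_vcpu_is_reserved() {
        let mut shared_vcpu_state = SharedVcpuState::default();
        shared_vcpu_state
            .reserve_range(IoSpace::Ioport, 0x10, 0x10, false)
            .unwrap();
        // Both endpoints of the reserved range [0x10, 0x1f] report as reserved...
        assert!(shared_vcpu_state.is_reserved(IoSpace::Ioport, 0x10));
        assert!(shared_vcpu_state.is_reserved(IoSpace::Ioport, 0x1f));
        // ...while the addresses just outside it do not.
        assert!(!shared_vcpu_state.is_reserved(IoSpace::Ioport, 0x0f));
        assert!(!shared_vcpu_state.is_reserved(IoSpace::Ioport, 0x20));
    }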
}