devices/pci/pcie/
pcie_host.rs

1// Copyright 2021 The ChromiumOS Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use std::fs::read;
6use std::fs::write;
7use std::fs::File;
8use std::fs::OpenOptions;
9use std::os::unix::fs::FileExt;
10use std::path::Path;
11use std::path::PathBuf;
12use std::sync::Arc;
13use std::thread;
14
15use anyhow::anyhow;
16use anyhow::bail;
17use anyhow::Context;
18use anyhow::Result;
19use base::error;
20use base::Tube;
21use sync::Mutex;
22use vm_control::HotPlugDeviceInfo;
23use vm_control::HotPlugDeviceType;
24use vm_control::VmRequest;
25use vm_control::VmResponse;
26use zerocopy::FromBytes;
27use zerocopy::IntoBytes;
28
29use crate::pci::pci_configuration::PciBridgeSubclass;
30use crate::pci::pci_configuration::CAPABILITY_LIST_HEAD_OFFSET;
31use crate::pci::pci_configuration::HEADER_TYPE_REG;
32use crate::pci::pci_configuration::PCI_CAP_NEXT_POINTER;
33use crate::pci::pcie::pci_bridge::PciBridgeBusRange;
34use crate::pci::pcie::pci_bridge::BR_BUS_NUMBER_REG;
35use crate::pci::pcie::pci_bridge::BR_MEM_BASE_MASK;
36use crate::pci::pcie::pci_bridge::BR_MEM_BASE_SHIFT;
37use crate::pci::pcie::pci_bridge::BR_MEM_LIMIT_MASK;
38use crate::pci::pcie::pci_bridge::BR_MEM_MINIMUM;
39use crate::pci::pcie::pci_bridge::BR_MEM_REG;
40use crate::pci::pcie::pci_bridge::BR_PREF_MEM_64BIT;
41use crate::pci::pcie::pci_bridge::BR_PREF_MEM_BASE_HIGH_REG;
42use crate::pci::pcie::pci_bridge::BR_PREF_MEM_LIMIT_HIGH_REG;
43use crate::pci::pcie::pci_bridge::BR_PREF_MEM_LOW_REG;
44use crate::pci::pcie::pci_bridge::BR_WINDOW_ALIGNMENT;
45use crate::pci::pcie::PcieDevicePortType;
46use crate::pci::PciCapabilityID;
47use crate::pci::PciClassCode;
48
49// Host Pci device's sysfs config file
50struct PciHostConfig {
51    config_file: File,
52}
53
54impl PciHostConfig {
55    // Create a new host pci device's sysfs config file
56    fn new(host_sysfs_path: &Path) -> Result<Self> {
57        let mut config_path = PathBuf::new();
58        config_path.push(host_sysfs_path);
59        config_path.push("config");
60        let f = OpenOptions::new()
61            .write(true)
62            .read(true)
63            .open(config_path.as_path())
64            .with_context(|| format!("failed to open: {}", config_path.display()))?;
65        Ok(PciHostConfig { config_file: f })
66    }
67
68    // Read host pci device's config register
69    fn read_config<T: IntoBytes + FromBytes + Copy + Default>(&self, offset: u64) -> T {
70        let length = std::mem::size_of::<T>();
71        let mut val = T::default();
72        if offset % length as u64 != 0 {
73            error!(
74                "read_config, offset {} isn't aligned to length {}",
75                offset, length
76            );
77        } else if let Err(e) = self.config_file.read_exact_at(val.as_mut_bytes(), offset) {
78            error!("failed to read host sysfs config: {}", e);
79        }
80
81        val
82    }
83
84    // write host pci device's config register
85    #[allow(dead_code)]
86    fn write_config(&self, offset: u64, data: &[u8]) {
87        if offset % data.len() as u64 != 0 {
88            error!(
89                "write_config, offset {} isn't aligned to length {}",
90                offset,
91                data.len()
92            );
93            return;
94        }
95        if let Err(e) = self.config_file.write_all_at(data, offset) {
96            error!("failed to write host sysfs config: {}", e);
97        }
98    }
99}
100
101// Find all the added pcie devices
102fn visit_children(dir: &Path, children: &mut Vec<HotPlugDeviceInfo>) -> Result<()> {
103    // Each pci device has a sysfs directory
104    if !dir.is_dir() {
105        bail!("{} isn't directory", dir.display());
106    }
107    // Loop device sysfs subdirectory
108    let entries = dir
109        .read_dir()
110        .with_context(|| format!("failed to read dir {}", dir.display()))?;
111    let mut devices = Vec::new();
112    for entry in entries {
113        let sub_dir = match entry {
114            Ok(sub) => sub,
115            _ => continue,
116        };
117
118        if !sub_dir.path().is_dir() {
119            continue;
120        }
121
122        let name = sub_dir
123            .file_name()
124            .into_string()
125            .map_err(|_| anyhow!("failed to get dir name"))?;
126        // Child pci device has name format 0000:xx:xx.x, length is 12
127        if name.len() != 12 || !name.starts_with("0000:") {
128            continue;
129        }
130        let child_path = dir.join(name);
131        devices.push(child_path);
132    }
133    devices.reverse();
134    let mut iter = devices.iter().peekable();
135    while let Some(device) = iter.next() {
136        let class_path = device.join("class");
137        let class_id = read(class_path.as_path())
138            .with_context(|| format!("failed to read {}", class_path.display()))?;
139        let hp_interrupt = iter.peek().is_none();
140        if !class_id.starts_with("0x0604".as_bytes()) {
141            // If the device isn't pci bridge, this is a pcie endpoint device
142            children.push(HotPlugDeviceInfo {
143                device_type: HotPlugDeviceType::EndPoint,
144                path: device.to_path_buf(),
145                hp_interrupt,
146            });
147            // No need to look further
148            return Ok(());
149        } else {
150            // Find the pci express cap to get the port type of the pcie bridge
151            let host_config = PciHostConfig::new(device)?;
152            let mut cap_pointer: u8 = host_config.read_config(CAPABILITY_LIST_HEAD_OFFSET as u64);
153            while cap_pointer != 0x0 {
154                let cap_id: u8 = host_config.read_config(cap_pointer as u64);
155                if cap_id == PciCapabilityID::PciExpress as u8 {
156                    break;
157                }
158                cap_pointer = host_config.read_config(cap_pointer as u64 + 0x1);
159            }
160            if cap_pointer == 0x0 {
161                bail!(
162                    "Failed to get pcie express capability for {}",
163                    device.display()
164                );
165            }
166            let express_cap_reg: u16 = host_config.read_config(cap_pointer as u64 + 0x2);
167            match (express_cap_reg & 0xf0) >> 4 {
168                x if x == PcieDevicePortType::UpstreamPort as u16 => {
169                    children.push(HotPlugDeviceInfo {
170                        device_type: HotPlugDeviceType::UpstreamPort,
171                        path: device.to_path_buf(),
172                        hp_interrupt,
173                    })
174                }
175                x if x == PcieDevicePortType::DownstreamPort as u16 => {
176                    children.push(HotPlugDeviceInfo {
177                        device_type: HotPlugDeviceType::DownstreamPort,
178                        path: device.to_path_buf(),
179                        hp_interrupt,
180                    })
181                }
182                _ => (),
183            }
184        }
185    }
186    for device in devices.iter() {
187        visit_children(device.as_path(), children)?;
188    }
189    Ok(())
190}
191
192struct HotplugWorker {
193    host_name: String,
194}
195
196impl HotplugWorker {
197    fn run(&self, vm_socket: Arc<Mutex<Tube>>, child_exist: Arc<Mutex<bool>>) -> Result<()> {
198        let mut host_sysfs = PathBuf::new();
199        host_sysfs.push("/sys/bus/pci/devices/");
200        host_sysfs.push(self.host_name.clone());
201        let rescan_path = host_sysfs.join("rescan");
202        // Let pcie root port rescan to find the added or removed children devices
203        write(rescan_path.as_path(), "1")
204            .with_context(|| format!("failed to write {}", rescan_path.display()))?;
205
206        // If child device existed, but code run here again, this means host has a
207        // hotplug out event, after the above rescan, host should find the removed
208        // child device, and host vfio-pci kernel driver should notify crosvm vfio-pci
209        // devie such hotplug out event, so nothing is needed to do here, just return
210        // it now.
211        let mut child_exist = child_exist.lock();
212        if *child_exist {
213            return Ok(());
214        }
215
216        // Probe the new added pcie endpoint devices
217        let mut children: Vec<HotPlugDeviceInfo> = Vec::new();
218        visit_children(host_sysfs.as_path(), &mut children)?;
219
220        // Without reverse children, physical larger BDF device is at the top, it will be
221        // added into guest first with smaller virtual function number, so physical smaller
222        // BDF device has larger virtual function number, phyiscal larger BDF device has
223        // smaller virtual function number. During hotplug out process, host pcie root port
224        // driver remove physical smaller BDF pcie endpoint device first, so host vfio-pci
225        // driver send plug out event first for smaller BDF device and wait for this device
226        // removed from crosvm, when crosvm receives this plug out event, crosvm will remove
227        // all the children devices, crosvm remove smaller virtual function number device
228        // first, this isn't the target device which host vfio-pci driver is waiting for.
229        // Host vfio-pci driver holds a lock when it is waiting, when crosvm remove another
230        // device throgh vfio-pci which try to get the same lock, so deadlock happens in
231        // host kernel.
232        //
233        // In order to fix the deadlock, children is reversed, so physical smaller BDF
234        // device has smaller virtual function number, and it will have the same order
235        // between host kernel and crosvm during hotplug out process.
236        children.reverse();
237        while let Some(child) = children.pop() {
238            if let HotPlugDeviceType::EndPoint = child.device_type {
239                // In order to bind device to vfio-pci driver, get device VID and DID
240                let vendor_path = child.path.join("vendor");
241                let vendor_id = read(vendor_path.as_path())
242                    .with_context(|| format!("failed to read {}", vendor_path.display()))?;
243                // Remove the first two elements 0x
244                let prefix: &str = "0x";
245                let vendor = match vendor_id.strip_prefix(prefix.as_bytes()) {
246                    Some(v) => v.to_vec(),
247                    None => vendor_id,
248                };
249                let device_path = child.path.join("device");
250                let device_id = read(device_path.as_path())
251                    .with_context(|| format!("failed to read {}", device_path.display()))?;
252                // Remove the first two elements 0x
253                let device = match device_id.strip_prefix(prefix.as_bytes()) {
254                    Some(d) => d.to_vec(),
255                    None => device_id,
256                };
257                let new_id = [
258                    String::from_utf8_lossy(&vendor),
259                    String::from_utf8_lossy(&device),
260                ]
261                .join(" ");
262                if Path::new("/sys/bus/pci/drivers/vfio-pci-pm/new_id").exists() {
263                    let _ = write("/sys/bus/pci/drivers/vfio-pci-pm/new_id", &new_id);
264                }
265                // This is normal - either the kernel doesn't support vfio-pci-pm driver,
266                // or the device failed to attach to vfio-pci-pm driver (most likely due to
267                // lack of power management capability).
268                if !child.path.join("driver/unbind").exists() {
269                    write("/sys/bus/pci/drivers/vfio-pci/new_id", &new_id).with_context(|| {
270                        format!("failed to write {new_id} into vfio-pci/new_id")
271                    })?;
272                }
273            }
274            // Request to hotplug the new added pcie device into guest
275            let request = VmRequest::HotPlugVfioCommand {
276                device: child.clone(),
277                add: true,
278            };
279            let vm_socket = vm_socket.lock();
280            vm_socket
281                .send(&request)
282                .with_context(|| format!("failed to send hotplug request for {child:?}"))?;
283            let response = vm_socket
284                .recv::<VmResponse>()
285                .with_context(|| format!("failed to receive hotplug response for {child:?}"))?;
286            match response {
287                VmResponse::Ok => {}
288                _ => bail!("unexpected hotplug response: {response}"),
289            };
290            if !*child_exist {
291                *child_exist = true;
292            }
293        }
294
295        Ok(())
296    }
297}
298
// Byte offsets into a pci device's configuration space.

// Offset of the device id, read as a 16-bit value.
const PCI_CONFIG_DEVICE_ID: u64 = 0x02;
// Offset of the base class code, read as an 8-bit value.
const PCI_BASE_CLASS_CODE: u64 = 0x0B;
// Offset of the sub class code, read as an 8-bit value.
const PCI_SUB_CLASS_CODE: u64 = 0x0A;
302
303/// Pcie root port device has a corresponding host pcie root port.
304pub struct PcieHostPort {
305    host_config: PciHostConfig,
306    host_name: String,
307    hotplug_in_process: Arc<Mutex<bool>>,
308    hotplug_child_exist: Arc<Mutex<bool>>,
309    vm_socket: Arc<Mutex<Tube>>,
310}
311
312impl PcieHostPort {
313    /// Create PcieHostPort, host_syfsfs_patch specify host pcie port
314    /// sysfs path.
315    pub fn new(host_sysfs_path: &Path, socket: Tube) -> Result<Self> {
316        let host_config = PciHostConfig::new(host_sysfs_path)?;
317        let host_name = host_sysfs_path
318            .file_name()
319            .unwrap()
320            .to_str()
321            .unwrap()
322            .to_owned();
323        let base_class: u8 = host_config.read_config(PCI_BASE_CLASS_CODE);
324        if base_class != PciClassCode::BridgeDevice.get_register_value() {
325            return Err(anyhow!("host {} isn't bridge", host_name));
326        }
327        let sub_class: u8 = host_config.read_config(PCI_SUB_CLASS_CODE);
328        if sub_class != PciBridgeSubclass::PciToPciBridge as u8 {
329            return Err(anyhow!("host {} isn't pci to pci bridge", host_name));
330        }
331
332        let mut pcie_cap_reg: u8 = 0;
333
334        let mut cap_next: u8 = host_config.read_config(CAPABILITY_LIST_HEAD_OFFSET as u64);
335        let mut counter: u16 = 0;
336        while cap_next != 0 && counter < 256 {
337            let cap_id: u8 = host_config.read_config(cap_next.into());
338            if cap_id == PciCapabilityID::PciExpress as u8 {
339                pcie_cap_reg = cap_next;
340                break;
341            }
342            let offset = cap_next as u64 + PCI_CAP_NEXT_POINTER as u64;
343            cap_next = host_config.read_config(offset);
344            counter += 1;
345        }
346
347        if pcie_cap_reg == 0 {
348            return Err(anyhow!("host {} isn't pcie device", host_name));
349        }
350
351        Ok(PcieHostPort {
352            host_config,
353            host_name,
354            hotplug_in_process: Arc::new(Mutex::new(false)),
355            hotplug_child_exist: Arc::new(Mutex::new(false)),
356            vm_socket: Arc::new(Mutex::new(socket)),
357        })
358    }
359
360    pub fn get_bus_range(&self) -> PciBridgeBusRange {
361        let bus_num: u32 = self.host_config.read_config((BR_BUS_NUMBER_REG * 4) as u64);
362        let primary = (bus_num & 0xFF) as u8;
363        let secondary = ((bus_num >> 8) & 0xFF) as u8;
364        let subordinate = ((bus_num >> 16) & 0xFF) as u8;
365
366        PciBridgeBusRange {
367            primary,
368            secondary,
369            subordinate,
370        }
371    }
372
373    pub fn read_device_id(&self) -> u16 {
374        self.host_config.read_config::<u16>(PCI_CONFIG_DEVICE_ID)
375    }
376
377    pub fn host_name(&self) -> String {
378        self.host_name.clone()
379    }
380
381    pub fn read_config(&self, reg_idx: usize, data: &mut u32) {
382        if reg_idx == HEADER_TYPE_REG {
383            *data = self.host_config.read_config((HEADER_TYPE_REG as u64) * 4)
384        }
385    }
386
387    pub fn write_config(&mut self, _reg_idx: usize, _offset: u64, _data: &[u8]) {}
388
389    pub fn get_bridge_window_size(&self) -> (u64, u64) {
390        let br_memory: u32 = self.host_config.read_config(BR_MEM_REG as u64 * 4);
391        let mem_base = (br_memory & BR_MEM_BASE_MASK) << BR_MEM_BASE_SHIFT;
392        let mem_limit = br_memory & BR_MEM_LIMIT_MASK;
393        let mem_size = if mem_limit > mem_base {
394            (mem_limit - mem_base) as u64 + BR_WINDOW_ALIGNMENT
395        } else {
396            BR_MEM_MINIMUM
397        };
398        let br_pref_mem_low: u32 = self.host_config.read_config(BR_PREF_MEM_LOW_REG as u64 * 4);
399        let pref_mem_base_low = (br_pref_mem_low & BR_MEM_BASE_MASK) << BR_MEM_BASE_SHIFT;
400        let pref_mem_limit_low = br_pref_mem_low & BR_MEM_LIMIT_MASK;
401        let mut pref_mem_base: u64 = pref_mem_base_low as u64;
402        let mut pref_mem_limit: u64 = pref_mem_limit_low as u64;
403        if br_pref_mem_low & BR_PREF_MEM_64BIT == BR_PREF_MEM_64BIT {
404            // 64bit prefetch memory
405            let pref_mem_base_high: u32 = self
406                .host_config
407                .read_config(BR_PREF_MEM_BASE_HIGH_REG as u64 * 4);
408            let pref_mem_limit_high: u32 = self
409                .host_config
410                .read_config(BR_PREF_MEM_LIMIT_HIGH_REG as u64 * 4);
411            pref_mem_base = ((pref_mem_base_high as u64) << 32) | (pref_mem_base_low as u64);
412            pref_mem_limit = ((pref_mem_limit_high as u64) << 32) | (pref_mem_limit_low as u64);
413        }
414        let pref_mem_size = if pref_mem_limit > pref_mem_base {
415            pref_mem_limit - pref_mem_base + BR_WINDOW_ALIGNMENT
416        } else {
417            BR_MEM_MINIMUM
418        };
419
420        (mem_size, pref_mem_size)
421    }
422
423    pub fn hotplug_probe(&mut self) {
424        if *self.hotplug_in_process.lock() {
425            return;
426        }
427
428        let hotplug_process = self.hotplug_in_process.clone();
429        let child_exist = self.hotplug_child_exist.clone();
430        let socket = self.vm_socket.clone();
431        let name = self.host_name.clone();
432        let _ = thread::Builder::new()
433            .name("pcie_hotplug".to_string())
434            .spawn(move || {
435                let mut hotplug = hotplug_process.lock();
436                *hotplug = true;
437                let hotplug_worker = HotplugWorker { host_name: name };
438                let _ = hotplug_worker.run(socket, child_exist);
439                *hotplug = false;
440            });
441    }
442
443    pub fn hot_unplug(&mut self) {
444        *self.hotplug_child_exist.lock() = false;
445    }
446}