jail/
helpers.rs

1// Copyright 2017 The ChromiumOS Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#![deny(missing_docs)]
6#![allow(dead_code)]
7
8use std::path::Path;
9use std::str;
10use std::sync::LazyLock;
11
12use anyhow::bail;
13use anyhow::Context;
14use anyhow::Result;
15#[cfg(feature = "seccomp_trace")]
16use base::debug;
17use base::getegid;
18use base::geteuid;
19#[cfg(feature = "seccomp_trace")]
20use base::warn;
21use libc::c_ulong;
22use minijail::Minijail;
23#[cfg(feature = "seccomp_trace")]
24use static_assertions::const_assert;
25#[cfg(feature = "seccomp_trace")]
26use zerocopy::Immutable;
27#[cfg(feature = "seccomp_trace")]
28use zerocopy::IntoBytes;
29
30use crate::config::JailConfig;
31
/// Seccomp BPF programs embedded into the binary, keyed by seccomp policy name.
///
/// The map expression is spliced in with `include!` from `bpf_includes.in` under the build's
/// `OUT_DIR` (presumably generated by the build script — confirm), and is only evaluated on
/// first access. [set_embedded_bpf_program] looks policies up here by name.
static EMBEDDED_BPFS: LazyLock<std::collections::HashMap<&str, Vec<u8>>> =
    LazyLock::new(|| include!(concat!(env!("OUT_DIR"), "/bpf_includes.in")));
34
/// Default open-files limit (`RLIMIT_NOFILE`) used by [simple_jail].
///
/// Most devices don't need to open many fds. However, an implementation detail of minijail is
/// that after applying this limit, it opens an additional file descriptor to scan the
/// /proc/self/fd directory to choose which file descriptors to close in the child process. The
/// open files limit therefore has to be higher than the number of file descriptors that the
/// parent thread holds open before the jail is started.
pub const MAX_OPEN_FILES_DEFAULT: u64 = 4096;
/// The max open files for gpu processes (used by [create_gpu_minijail]).
const MAX_OPEN_FILES_FOR_GPU: u64 = 32768;
/// The max open files for jail warden, matching FD_RAW_FAILURE.
///
/// NOTE(review): `FD_RAW_FAILURE` is not defined in this file — confirm the two values still
/// agree.
pub const MAX_OPEN_FILES_FOR_JAIL_WARDEN: u64 = 65536;
45
/// The user in the jail to run as.
///
/// Read by [create_sandbox_minijail] to decide which uid/gid maps to install and whether the
/// jailed process changes its uid/gid.
pub enum RunAsUser {
    /// Do not specify the user.
    ///
    /// The current user is still added to the jail when bind mounts are requested and no
    /// `SandboxConfig::ugid_map` is given, because minijail needs a user/group map to mount
    /// extra directories.
    Unspecified,
    /// Runs as the same user in the jail as the current user.
    CurrentUser,
    /// Runs as the root user in the jail.
    ///
    /// The current effective uid/gid are mapped to id 0 inside the jail.
    Root,
    /// Runs as the specified uid and gid (in that order).
    /// This requires `SandboxConfig::ugid_map` to be set.
    ///
    /// An id of 0 is left unchanged: the uid/gid switch is only requested for non-zero values.
    Specified(u32, u32),
}
58
/// Config for the sandbox to be created by [Minijail].
///
/// Built with [SandboxConfig::new]; the public fields may then be adjusted before the config is
/// passed to [create_sandbox_minijail].
pub struct SandboxConfig<'a> {
    /// Whether or not to drop all capabilities in the sandbox.
    pub limit_caps: bool,
    /// Whether seccomp filter failures are logged. When set, the textual `.policy` file is used
    /// even if a pre-compiled `.bpf` file exists.
    log_failures: bool,
    /// Directory searched for the seccomp policy files. `None` selects the policies embedded in
    /// the binary.
    seccomp_policy_dir: Option<&'a Path>,
    /// Name of the seccomp policy: joined onto `seccomp_policy_dir` (with a `.bpf` or `.policy`
    /// extension applied) or used as the lookup key for embedded policies.
    seccomp_policy_name: &'a str,
    /// The pair of `uid_map` and `gid_map`.
    pub ugid_map: Option<(&'a str, &'a str)>,
    /// The remount mode instead of default MS_PRIVATE.
    pub remount_mode: Option<c_ulong>,
    /// Whether to use empty net namespace. Enabled by default.
    pub namespace_net: bool,
    /// Whether or not to configure the jail to support bind-mounts.
    ///
    /// Note that most device processes deny `open(2)` and `openat(2)` by seccomp policy and just
    /// returns `ENOENT`. Passing opened file descriptors is recommended over opening files in the
    /// sandbox.
    pub bind_mounts: bool,
    /// Specify the user in the jail to run as.
    pub run_as: RunAsUser,
}
81
82impl<'a> SandboxConfig<'a> {
83    /// Creates [SandboxConfig].
84    pub fn new(jail_config: &'a JailConfig, policy: &'a str) -> Self {
85        Self {
86            limit_caps: true,
87            log_failures: jail_config.seccomp_log_failures,
88            seccomp_policy_dir: jail_config.seccomp_policy_dir.as_ref().map(Path::new),
89            seccomp_policy_name: policy,
90            ugid_map: None,
91            remount_mode: None,
92            namespace_net: true,
93            bind_mounts: false,
94            run_as: RunAsUser::Unspecified,
95        }
96    }
97}
98
99/// Wrapper that cleans up a [Minijail] when it is dropped
100pub struct ScopedMinijail(pub Minijail);
101
102impl Drop for ScopedMinijail {
103    fn drop(&mut self) {
104        let _ = self.0.kill();
105    }
106}
107
108/// Creates a [Minijail] instance which just changes the root using pivot_root(2) path and
109/// `max_open_files` using `RLIMIT_NOFILE`.
110///
111/// If `root` path is "/", the minijail don't change the root.
112///
113/// # Arguments
114///
115/// * `root` - The root path to be changed to by minijail.
116/// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
117#[allow(clippy::unnecessary_cast)]
118pub fn create_base_minijail(root: &Path, max_open_files: u64) -> Result<Minijail> {
119    // Validate new root directory. Path::is_dir() also checks the existence.
120    if !root.is_dir() {
121        bail!("{:?} is not a directory, cannot create jail", root);
122    }
123    // chroot accepts absolute path only.
124    if !root.is_absolute() {
125        bail!("{:?} is not absolute path", root);
126    }
127
128    let mut jail = Minijail::new().context("failed to jail device")?;
129
130    // Only pivot_root if we are not re-using the current root directory.
131    if root != Path::new("/") {
132        // Run in a new mount namespace.
133        jail.namespace_vfs();
134        jail.enter_pivot_root(root)
135            .context("failed to pivot root device")?;
136    }
137
138    jail.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
139        .context("error setting max open files")?;
140
141    Ok(jail)
142}
143
144/// Creates a [Minijail] instance which just invokes a jail process and sets
145/// `max_open_files` using `RLIMIT_NOFILE`. This is helpful with crosvm process
146/// runs as a non-root user without SYS_ADMIN capabilities.
147///
148/// Unlike `create_base_minijail`, this function doesn't call `pivot_root`
149/// and `mount namespace`. So, it runs as a non-root user without
150/// SYS_ADMIN capabilities.
151///
152/// Note that since there is no file system isolation provided by this function,
153/// caller of this function should enforce other security mechanisum such as selinux
154/// on the host to protect directories.
155///
156/// # Arguments
157///
158/// * `root` - The root path to checked before the process is jailed
159/// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
160#[allow(clippy::unnecessary_cast)]
161pub fn create_base_minijail_without_pivot_root(
162    root: &Path,
163    max_open_files: u64,
164) -> Result<Minijail> {
165    // Validate new root directory. Path::is_dir() also checks the existence.
166    if !root.is_dir() {
167        bail!("{:?} is not a directory, cannot create jail", root);
168    }
169    if !root.is_absolute() {
170        bail!("{:?} is not absolute path", root);
171    }
172
173    let mut jail = Minijail::new().context("failed to jail device")?;
174    jail.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
175        .context("error setting max open files")?;
176
177    Ok(jail)
178}
179
180/// Creates a [Minijail] instance which creates a sandbox.
181///
182/// # Arguments
183///
184/// * `root` - The root path to be changed to by minijail.
185/// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
186/// * `config` - The [SandboxConfig] to control details of the sandbox.
187pub fn create_sandbox_minijail(
188    root: &Path,
189    max_open_files: u64,
190    config: &SandboxConfig,
191) -> Result<Minijail> {
192    let mut jail = create_base_minijail(root, max_open_files)?;
193
194    jail.namespace_pids();
195    jail.namespace_user();
196    jail.namespace_user_disable_setgroups();
197    if config.limit_caps {
198        // Don't need any capabilities.
199        jail.use_caps(0);
200    }
201    match config.run_as {
202        RunAsUser::Unspecified => {
203            if config.bind_mounts && config.ugid_map.is_none() {
204                // Minijail requires to set user/group map to mount extra directories.
205                add_current_user_to_jail(&mut jail)?;
206            }
207        }
208        RunAsUser::CurrentUser => {
209            add_current_user_to_jail(&mut jail)?;
210        }
211        RunAsUser::Root => {
212            // Add the current user as root in the jail.
213            let crosvm_uid = geteuid();
214            let crosvm_gid = getegid();
215            jail.uidmap(&format!("0 {crosvm_uid} 1"))
216                .context("error setting UID map")?;
217            jail.gidmap(&format!("0 {crosvm_gid} 1"))
218                .context("error setting GID map")?;
219        }
220        RunAsUser::Specified(uid, gid) => {
221            if uid != 0 {
222                jail.change_uid(uid)
223            }
224            if gid != 0 {
225                jail.change_gid(gid)
226            }
227        }
228    }
229    if config.bind_mounts {
230        // Create a tmpfs in the device's root directory so that we can bind mount files.
231        // The size=67108864 is size=64*1024*1024 or size=64MB.
232        // TODO(b/267581374): Use appropriate size for tmpfs.
233        jail.mount_with_data(
234            Path::new("none"),
235            Path::new("/"),
236            "tmpfs",
237            (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
238            "size=67108864",
239        )?;
240
241        #[cfg(feature = "appimage")]
242        if let Ok(appdir) = std::env::var("APPDIR") {
243            let appdir_path = Path::new(&appdir);
244            let canonical_path = appdir_path.canonicalize().with_context(|| {
245                format!("failed to canonicalize APPDIR path: {:?}", appdir_path)
246            })?;
247            if !canonical_path.starts_with("/tmp") {
248                bail!("APPDIR path {:?} is not under /tmp", canonical_path);
249            }
250            jail.mount_bind(&canonical_path, &canonical_path, false)
251                .context("failed to bind mount APPDIR into jail")?;
252        }
253    }
254    if let Some((uid_map, gid_map)) = config.ugid_map {
255        jail.uidmap(uid_map).context("error setting UID map")?;
256        jail.gidmap(gid_map).context("error setting GID map")?;
257    }
258    // Run in a new mount namespace.
259    jail.namespace_vfs();
260
261    if config.namespace_net {
262        // Run in an empty network namespace.
263        jail.namespace_net();
264    }
265
266    // Don't allow the device to gain new privileges.
267    jail.no_new_privs();
268
269    #[cfg(feature = "seccomp_trace")]
270    {
271        #[repr(C)]
272        #[derive(Immutable, IntoBytes)]
273        struct sock_filter {
274            /* Filter block */
275            code: u16, /* Actual filter code */
276            jt: u8,    /* Jump true */
277            jf: u8,    /* Jump false */
278            k: u32,    /* Generic multiuse field */
279        }
280
281        // BPF constant is defined in https://elixir.bootlin.com/linux/latest/source/include/uapi/linux/bpf_common.h
282        // BPF parser/assembler is defined in https://elixir.bootlin.com/linux/v4.9/source/tools/net/bpf_exp.y
283        const SECCOMP_RET_TRACE: u32 = 0x7ff00000;
284        const SECCOMP_RET_LOG: u32 = 0x7ffc0000;
285        const BPF_RET: u16 = 0x06;
286        const BPF_K: u16 = 0x00;
287
288        // return SECCOMP_RET_LOG for all syscalls
289        const FILTER_RET_LOG_BLOCK: sock_filter = sock_filter {
290            code: BPF_RET | BPF_K,
291            jt: 0,
292            jf: 0,
293            k: SECCOMP_RET_LOG,
294        };
295
296        warn!("The running crosvm is compiled with seccomp_trace feature, and is striclty used for debugging purpose only. DO NOT USE IN PRODUCTION!!!");
297        debug!(
298            "seccomp_trace {{\"event\": \"minijail_create\", \"name\": \"{}\", \"jail_addr\": \"0x{:x}\"}}",
299            config.seccomp_policy_name,
300            read_jail_addr(&jail),
301        );
302        jail.parse_seccomp_bytes(FILTER_RET_LOG_BLOCK.as_bytes())
303            .unwrap();
304    }
305
306    #[cfg(not(feature = "seccomp_trace"))]
307    if let Some(seccomp_policy_dir) = config.seccomp_policy_dir {
308        let seccomp_policy_path = seccomp_policy_dir.join(config.seccomp_policy_name);
309        // By default we'll prioritize using the pre-compiled .bpf over the .policy file (the .bpf
310        // is expected to be compiled using "trap" as the failure behavior instead of the default
311        // "kill" behavior) when a policy path is supplied in the command line arugments. Otherwise
312        // the built-in pre-compiled policies will be used.
313        // Refer to the code comment for the "seccomp-log-failures" command-line parameter for an
314        // explanation about why the |log_failures| flag forces the use of .policy files (and the
315        // build-time alternative to this run-time flag).
316        let bpf_policy_file = seccomp_policy_path.with_extension("bpf");
317        if bpf_policy_file.exists() && !config.log_failures {
318            jail.parse_seccomp_program(&bpf_policy_file)
319                .with_context(|| {
320                    format!(
321                        "failed to parse precompiled seccomp policy: {}",
322                        bpf_policy_file.display()
323                    )
324                })?;
325        } else {
326            // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, which will correctly
327            // kill the entire device process if a worker thread commits a seccomp violation.
328            jail.set_seccomp_filter_tsync();
329            if config.log_failures {
330                jail.log_seccomp_filter_failures();
331            }
332            let bpf_policy_file = seccomp_policy_path.with_extension("policy");
333            jail.parse_seccomp_filters(&bpf_policy_file)
334                .with_context(|| {
335                    format!(
336                        "failed to parse seccomp policy: {}",
337                        bpf_policy_file.display()
338                    )
339                })?;
340        }
341    } else {
342        set_embedded_bpf_program(&mut jail, config.seccomp_policy_name)?;
343    }
344
345    jail.use_seccomp_filter();
346    // Don't do init setup.
347    jail.run_as_init();
348    // Set up requested remount mode instead of default MS_PRIVATE.
349    if let Some(mode) = config.remount_mode {
350        jail.set_remount_mode(mode);
351    }
352
353    Ok(jail)
354}
355
356/// Creates a basic [Minijail] if `jail_config` is present.
357///
358/// Returns `None` if `jail_config` is none.
359pub fn simple_jail(jail_config: Option<&JailConfig>, policy: &str) -> Result<Option<Minijail>> {
360    if let Some(jail_config) = jail_config {
361        let config = SandboxConfig::new(jail_config, policy);
362        Ok(Some(create_sandbox_minijail(
363            &jail_config.pivot_root,
364            MAX_OPEN_FILES_DEFAULT,
365            &config,
366        )?))
367    } else {
368        Ok(None)
369    }
370}
371
372/// Creates [Minijail] for gpu processes.
373pub fn create_gpu_minijail(
374    root: &Path,
375    config: &SandboxConfig,
376    render_node_only: bool,
377    snapshot_scratch_directory: Option<&Path>,
378) -> Result<Minijail> {
379    let mut jail = create_sandbox_minijail(root, MAX_OPEN_FILES_FOR_GPU, config)?;
380
381    // Device nodes required for DRM.
382    let sys_dev_char_path = Path::new("/sys/dev/char");
383    jail.mount_bind(sys_dev_char_path, sys_dev_char_path, false)?;
384
385    // Necessary for CGROUP control of the vGPU threads
386    // This is not necessary UNLESS one wants to make use
387    // of the gpu cgroup command line options.
388    let sys_cpuset_path = Path::new("/sys/fs/cgroup/cpuset");
389    if sys_cpuset_path.exists() {
390        jail.mount_bind(sys_cpuset_path, sys_cpuset_path, true)?;
391    }
392
393    let sys_devices_path = Path::new("/sys/devices");
394    jail.mount_bind(sys_devices_path, sys_devices_path, false)?;
395
396    jail_mount_bind_drm(&mut jail, render_node_only)?;
397
398    // If the ARM specific devices exist on the host, bind mount them in.
399    let mali0_path = Path::new("/dev/mali0");
400    if mali0_path.exists() {
401        jail.mount_bind(mali0_path, mali0_path, true)?;
402    }
403
404    let pvr_sync_path = Path::new("/dev/pvr_sync");
405    if pvr_sync_path.exists() {
406        jail.mount_bind(pvr_sync_path, pvr_sync_path, true)?;
407    }
408
409    // If the udmabuf driver exists on the host, bind mount it in.
410    let udmabuf_path = Path::new("/dev/udmabuf");
411    if udmabuf_path.exists() {
412        jail.mount_bind(udmabuf_path, udmabuf_path, true)?;
413    }
414
415    // Libraries that are required when mesa drivers are dynamically loaded.
416    jail_mount_bind_if_exists(
417        &mut jail,
418        &[
419            "/usr/lib",
420            "/usr/lib64",
421            "/lib",
422            "/lib64",
423            "/usr/share/drirc.d",
424            "/usr/share/glvnd",
425            "/usr/share/libdrm",
426            "/usr/share/vulkan",
427        ],
428    )?;
429
430    // pvr driver requires read access to /proc/self/task/*/comm.
431    mount_proc(&mut jail)?;
432
433    // To enable perfetto tracing, we need to give access to the perfetto service IPC
434    // endpoints.
435    let perfetto_path = Path::new("/run/perfetto");
436    if perfetto_path.exists() {
437        jail.mount_bind(perfetto_path, perfetto_path, true)?;
438    }
439
440    // Provide scratch space for the GPU device to build or unpack snapshots.
441    if let Some(snapshot_scratch_directory) = snapshot_scratch_directory {
442        jail.mount_with_data(
443            Path::new("none"),
444            snapshot_scratch_directory,
445            "tmpfs",
446            (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
447            "size=4294967296",
448        )?;
449    }
450
451    Ok(jail)
452}
453
454/// Selectively bind mount drm nodes into `jail` based on `render_node_only`
455///
456/// This function will not return an error if drm nodes don't exist
457pub fn jail_mount_bind_drm(jail: &mut Minijail, render_node_only: bool) -> Result<()> {
458    if render_node_only {
459        const DRM_NUM_NODES: u32 = 63;
460        const DRM_RENDER_NODE_START: u32 = 128;
461        for offset in 0..DRM_NUM_NODES {
462            let path_str = format!("/dev/dri/renderD{}", DRM_RENDER_NODE_START + offset);
463            let drm_dri_path = Path::new(&path_str);
464            if !drm_dri_path.exists() {
465                break;
466            }
467            jail.mount_bind(drm_dri_path, drm_dri_path, false)?;
468        }
469    } else {
470        let drm_dri_path = Path::new("/dev/dri");
471        if drm_dri_path.exists() {
472            jail.mount_bind(drm_dri_path, drm_dri_path, false)?;
473        }
474    }
475
476    Ok(())
477}
478
479/// Mirror-mount all the directories in `dirs` into `jail` on a best-effort basis.
480///
481/// This function will not return an error if any of the directories in `dirs` is missing.
482pub fn jail_mount_bind_if_exists<P: AsRef<std::ffi::OsStr>>(
483    jail: &mut Minijail,
484    dirs: &[P],
485) -> Result<()> {
486    for dir in dirs {
487        let dir_path = Path::new(dir);
488        if dir_path.exists() {
489            jail.mount_bind(dir_path, dir_path, false)?;
490        }
491    }
492
493    Ok(())
494}
495
496/// Mount proc in the sandbox.
497pub fn mount_proc(jail: &mut Minijail) -> Result<()> {
498    jail.mount(
499        Path::new("proc"),
500        Path::new("/proc"),
501        "proc",
502        (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_RDONLY) as usize,
503    )?;
504    Ok(())
505}
506
/// Reads the first pointer-sized word of `jail`, used as an identifier for tracking the jail's
/// lifetime in `seccomp_trace` logs.
///
/// This relies on minijail's Rust object keeping the pointer to the C jail struct as its first
/// field, which cannot be checked from here.
#[cfg(feature = "seccomp_trace")]
pub fn read_jail_addr(jail: &Minijail) -> usize {
    // Compile-time guard: `Minijail` must be at least as large as the word read below.
    const_assert!(std::mem::size_of::<Minijail>() >= std::mem::size_of::<usize>());

    let first_word = jail as *const Minijail as *const usize;
    // SAFETY: `first_word` comes from a live reference, and the read stays within the bounds of
    // `Minijail` as checked by the static assert above.
    // NOTE(review): this also assumes `Minijail` is at least `usize`-aligned — confirm.
    unsafe { *first_word }
}
516
517/// Set the uid/gid for the jailed process and give a basic id map. This is
518/// required for bind mounts to work.
519fn add_current_user_to_jail(jail: &mut Minijail) -> Result<()> {
520    let crosvm_uid = geteuid();
521    let crosvm_gid = getegid();
522
523    jail.uidmap(&format!("{crosvm_uid} {crosvm_uid} 1"))
524        .context("error setting UID map")?;
525    jail.gidmap(&format!("{crosvm_gid} {crosvm_gid} 1"))
526        .context("error setting GID map")?;
527
528    if crosvm_uid != 0 {
529        jail.change_uid(crosvm_uid);
530    }
531    if crosvm_gid != 0 {
532        jail.change_gid(crosvm_gid);
533    }
534    Ok(())
535}
536
537/// Set the seccomp policy for a jail from embedded bpfs
538pub fn set_embedded_bpf_program(jail: &mut Minijail, seccomp_policy_name: &str) -> Result<()> {
539    let bpf_program = EMBEDDED_BPFS.get(seccomp_policy_name).with_context(|| {
540        format!("failed to find embedded seccomp policy: {seccomp_policy_name}")
541    })?;
542    jail.parse_seccomp_bytes(bpf_program).with_context(|| {
543        format!("failed to parse embedded seccomp policy: {seccomp_policy_name}")
544    })?;
545    Ok(())
546}