jail/
helpers.rs

1// Copyright 2017 The ChromiumOS Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#![deny(missing_docs)]
6#![allow(dead_code)]
7
8use std::path::Path;
9use std::str;
10use std::sync::LazyLock;
11
12use anyhow::bail;
13use anyhow::Context;
14use anyhow::Result;
15#[cfg(feature = "seccomp_trace")]
16use base::debug;
17use base::getegid;
18use base::geteuid;
19#[cfg(feature = "seccomp_trace")]
20use base::warn;
21use libc::c_ulong;
22use minijail::Minijail;
23#[cfg(feature = "seccomp_trace")]
24use static_assertions::const_assert;
25#[cfg(feature = "seccomp_trace")]
26use zerocopy::Immutable;
27#[cfg(feature = "seccomp_trace")]
28use zerocopy::IntoBytes;
29
30use crate::config::JailConfig;
31
// Seccomp BPF programs embedded into the binary, keyed by policy name.
//
// The map expression itself comes from the `bpf_includes.in` file located under the build
// script's `OUT_DIR`, and is only evaluated on first access (via `LazyLock`).
static EMBEDDED_BPFS: LazyLock<std::collections::HashMap<&str, Vec<u8>>> =
    LazyLock::new(|| include!(concat!(env!("OUT_DIR"), "/bpf_includes.in")));
34
/// Most devices don't need to open many fds. However, an implementation detail of minijail is that
/// after applying this limit, it opens an additional file descriptor to scan the /proc/self/fd
/// directory to choose which file descriptors to close in the child process. The open files limit
/// therefore has to be higher than the number of file descriptors that the parent thread holds
/// open before the jail is started.
pub const MAX_OPEN_FILES_DEFAULT: u64 = 4096;
/// The max open files for gpu processes.
const MAX_OPEN_FILES_FOR_GPU: u64 = 32768;
/// The max open files for jail warden, matching FD_RAW_FAILURE.
pub const MAX_OPEN_FILES_FOR_JAIL_WARDEN: u64 = 65536;
45
/// The user in the jail to run as.
///
/// This is a small plain-data selector, so it is `Copy` and can be compared and debug-printed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RunAsUser {
    /// Do not specify the user.
    Unspecified,
    /// Runs as the same user in the jail as the current user.
    CurrentUser,
    /// Runs as the root user in the jail.
    Root,
    /// Runs as the specified uid and gid (in that order).
    /// This requires `SandboxConfig::ugid_map` to be set.
    Specified(u32, u32),
}
58
/// Config for the sandbox to be created by [Minijail].
///
/// The borrowed data (`'a`) comes from the `JailConfig` and the policy name passed to
/// [`SandboxConfig::new`].
pub struct SandboxConfig<'a> {
    /// Whether or not to drop all capabilities in the sandbox.
    pub limit_caps: bool,
    /// Whether seccomp filter failures should be logged. When set, the textual `.policy` file is
    /// used even if a precompiled `.bpf` file exists in the policy directory.
    log_failures: bool,
    /// Directory holding the seccomp policy files. When `None`, the embedded policies are used.
    seccomp_policy_dir: Option<&'a Path>,
    /// Name of the seccomp policy: the file stem inside `seccomp_policy_dir`, or the lookup key
    /// for the embedded policies.
    seccomp_policy_name: &'a str,
    /// The pair of `uid_map` and `gid_map`.
    pub ugid_map: Option<(&'a str, &'a str)>,
    /// The remount mode instead of default MS_PRIVATE.
    pub remount_mode: Option<c_ulong>,
    /// Whether to use empty net namespace. Enabled by default.
    pub namespace_net: bool,
    /// Whether or not to configure the jail to support bind-mounts.
    ///
    /// Note that most device processes deny `open(2)` and `openat(2)` by seccomp policy and just
    /// return `ENOENT`. Passing opened file descriptors is recommended over opening files in the
    /// sandbox.
    pub bind_mounts: bool,
    /// Specify the user in the jail to run as.
    pub run_as: RunAsUser,
}
81
82impl<'a> SandboxConfig<'a> {
83    /// Creates [SandboxConfig].
84    pub fn new(jail_config: &'a JailConfig, policy: &'a str) -> Self {
85        Self {
86            limit_caps: true,
87            log_failures: jail_config.seccomp_log_failures,
88            seccomp_policy_dir: jail_config.seccomp_policy_dir.as_ref().map(Path::new),
89            seccomp_policy_name: policy,
90            ugid_map: None,
91            remount_mode: None,
92            namespace_net: true,
93            bind_mounts: false,
94            run_as: RunAsUser::Unspecified,
95        }
96    }
97}
98
/// Wrapper that cleans up a [Minijail] when it is dropped.
///
/// The wrapped jail is reachable through the public tuple field `0`.
pub struct ScopedMinijail(pub Minijail);

impl Drop for ScopedMinijail {
    /// Calls `kill()` on the wrapped jail.
    fn drop(&mut self) {
        // `drop` has no way to report a failure, so the outcome of `kill()` is deliberately
        // discarded.
        let _ = self.0.kill();
    }
}
107
108/// Creates a [Minijail] instance which just changes the root using pivot_root(2) path and
109/// `max_open_files` using `RLIMIT_NOFILE`.
110///
111/// If `root` path is "/", the minijail don't change the root.
112///
113/// # Arguments
114///
115/// * `root` - The root path to be changed to by minijail.
116/// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
117#[allow(clippy::unnecessary_cast)]
118pub fn create_base_minijail(root: &Path, max_open_files: u64) -> Result<Minijail> {
119    // Validate new root directory. Path::is_dir() also checks the existence.
120    if !root.is_dir() {
121        bail!("{:?} is not a directory, cannot create jail", root);
122    }
123    // chroot accepts absolute path only.
124    if !root.is_absolute() {
125        bail!("{:?} is not absolute path", root);
126    }
127
128    let mut jail = Minijail::new().context("failed to jail device")?;
129
130    // Only pivot_root if we are not re-using the current root directory.
131    if root != Path::new("/") {
132        // Run in a new mount namespace.
133        jail.namespace_vfs();
134        jail.enter_pivot_root(root)
135            .context("failed to pivot root device")?;
136    }
137
138    jail.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
139        .context("error setting max open files")?;
140
141    Ok(jail)
142}
143
144/// Creates a [Minijail] instance which just invokes a jail process and sets
145/// `max_open_files` using `RLIMIT_NOFILE`. This is helpful with crosvm process
146/// runs as a non-root user without SYS_ADMIN capabilities.
147///
148/// Unlike `create_base_minijail`, this function doesn't call `pivot_root`
149/// and `mount namespace`. So, it runs as a non-root user without
150/// SYS_ADMIN capabilities.
151///
152/// Note that since there is no file system isolation provided by this function,
153/// caller of this function should enforce other security mechanisum such as selinux
154/// on the host to protect directories.
155///
156/// # Arguments
157///
158/// * `root` - The root path to checked before the process is jailed
159/// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
160#[allow(clippy::unnecessary_cast)]
161pub fn create_base_minijail_without_pivot_root(
162    root: &Path,
163    max_open_files: u64,
164) -> Result<Minijail> {
165    // Validate new root directory. Path::is_dir() also checks the existence.
166    if !root.is_dir() {
167        bail!("{:?} is not a directory, cannot create jail", root);
168    }
169    if !root.is_absolute() {
170        bail!("{:?} is not absolute path", root);
171    }
172
173    let mut jail = Minijail::new().context("failed to jail device")?;
174    jail.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
175        .context("error setting max open files")?;
176
177    Ok(jail)
178}
179
/// Creates a [Minijail] instance which creates a sandbox.
///
/// The jail starts from `create_base_minijail` and is then configured with pid, user and mount
/// namespaces, the requested user mapping, an optional tmpfs root for bind mounts, an optional
/// empty network namespace, `no_new_privs` and a seccomp filter.
///
/// # Arguments
///
/// * `root` - The root path to be changed to by minijail.
/// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
/// * `config` - The [SandboxConfig] to control details of the sandbox.
///
/// # Errors
///
/// Returns an error if the base jail cannot be created, if setting an id map or the tmpfs mount
/// fails, or if the seccomp policy cannot be found or parsed.
pub fn create_sandbox_minijail(
    root: &Path,
    max_open_files: u64,
    config: &SandboxConfig,
) -> Result<Minijail> {
    let mut jail = create_base_minijail(root, max_open_files)?;

    jail.namespace_pids();
    jail.namespace_user();
    jail.namespace_user_disable_setgroups();
    if config.limit_caps {
        // Don't need any capabilities.
        jail.use_caps(0);
    }
    match config.run_as {
        RunAsUser::Unspecified => {
            if config.bind_mounts && config.ugid_map.is_none() {
                // Minijail requires to set user/group map to mount extra directories.
                add_current_user_to_jail(&mut jail)?;
            }
        }
        RunAsUser::CurrentUser => {
            add_current_user_to_jail(&mut jail)?;
        }
        RunAsUser::Root => {
            // Add the current user as root in the jail.
            let crosvm_uid = geteuid();
            let crosvm_gid = getegid();
            jail.uidmap(&format!("0 {crosvm_uid} 1"))
                .context("error setting UID map")?;
            jail.gidmap(&format!("0 {crosvm_gid} 1"))
                .context("error setting GID map")?;
        }
        RunAsUser::Specified(uid, gid) => {
            // Only non-zero ids are applied; no id map is set here, it is expected to come from
            // `config.ugid_map` below.
            if uid != 0 {
                jail.change_uid(uid)
            }
            if gid != 0 {
                jail.change_gid(gid)
            }
        }
    }
    if config.bind_mounts {
        // Create a tmpfs in the device's root directory so that we can bind mount files.
        // The size=67108864 is size=64*1024*1024 or size=64MB.
        // TODO(b/267581374): Use appropriate size for tmpfs.
        jail.mount_with_data(
            Path::new("none"),
            Path::new("/"),
            "tmpfs",
            (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
            "size=67108864",
        )?;
    }
    if let Some((uid_map, gid_map)) = config.ugid_map {
        jail.uidmap(uid_map).context("error setting UID map")?;
        jail.gidmap(gid_map).context("error setting GID map")?;
    }
    // Run in a new mount namespace.
    jail.namespace_vfs();

    if config.namespace_net {
        // Run in an empty network namespace.
        jail.namespace_net();
    }

    // Don't allow the device to gain new privileges.
    jail.no_new_privs();

    // With the `seccomp_trace` feature, the real policy is replaced by a one-instruction filter
    // that returns SECCOMP_RET_LOG for every syscall.
    #[cfg(feature = "seccomp_trace")]
    {
        #[repr(C)]
        #[derive(Immutable, IntoBytes)]
        struct sock_filter {
            /* Filter block */
            code: u16, /* Actual filter code */
            jt: u8,    /* Jump true */
            jf: u8,    /* Jump false */
            k: u32,    /* Generic multiuse field */
        }

        // BPF constant is defined in https://elixir.bootlin.com/linux/latest/source/include/uapi/linux/bpf_common.h
        // BPF parser/assembler is defined in https://elixir.bootlin.com/linux/v4.9/source/tools/net/bpf_exp.y
        const SECCOMP_RET_TRACE: u32 = 0x7ff00000;
        const SECCOMP_RET_LOG: u32 = 0x7ffc0000;
        const BPF_RET: u16 = 0x06;
        const BPF_K: u16 = 0x00;

        // return SECCOMP_RET_LOG for all syscalls
        const FILTER_RET_LOG_BLOCK: sock_filter = sock_filter {
            code: BPF_RET | BPF_K,
            jt: 0,
            jf: 0,
            k: SECCOMP_RET_LOG,
        };

        warn!("The running crosvm is compiled with seccomp_trace feature, and is striclty used for debugging purpose only. DO NOT USE IN PRODUCTION!!!");
        debug!(
            "seccomp_trace {{\"event\": \"minijail_create\", \"name\": \"{}\", \"jail_addr\": \"0x{:x}\"}}",
            config.seccomp_policy_name,
            read_jail_addr(&jail),
        );
        jail.parse_seccomp_bytes(FILTER_RET_LOG_BLOCK.as_bytes())
            .unwrap();
    }

    #[cfg(not(feature = "seccomp_trace"))]
    if let Some(seccomp_policy_dir) = config.seccomp_policy_dir {
        let seccomp_policy_path = seccomp_policy_dir.join(config.seccomp_policy_name);
        // By default we'll prioritize using the pre-compiled .bpf over the .policy file (the .bpf
        // is expected to be compiled using "trap" as the failure behavior instead of the default
        // "kill" behavior) when a policy path is supplied in the command line arguments. Otherwise
        // the built-in pre-compiled policies will be used.
        // Refer to the code comment for the "seccomp-log-failures" command-line parameter for an
        // explanation about why the |log_failures| flag forces the use of .policy files (and the
        // build-time alternative to this run-time flag).
        let bpf_policy_file = seccomp_policy_path.with_extension("bpf");
        if bpf_policy_file.exists() && !config.log_failures {
            jail.parse_seccomp_program(&bpf_policy_file)
                .with_context(|| {
                    format!(
                        "failed to parse precompiled seccomp policy: {}",
                        bpf_policy_file.display()
                    )
                })?;
        } else {
            // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, which will correctly
            // kill the entire device process if a worker thread commits a seccomp violation.
            jail.set_seccomp_filter_tsync();
            if config.log_failures {
                jail.log_seccomp_filter_failures();
            }
            // Despite its name, this binding holds the path of the textual `.policy` file.
            let bpf_policy_file = seccomp_policy_path.with_extension("policy");
            jail.parse_seccomp_filters(&bpf_policy_file)
                .with_context(|| {
                    format!(
                        "failed to parse seccomp policy: {}",
                        bpf_policy_file.display()
                    )
                })?;
        }
    } else {
        // No policy directory was configured: fall back to the policy embedded in the binary.
        set_embedded_bpf_program(&mut jail, config.seccomp_policy_name)?;
    }

    jail.use_seccomp_filter();
    // Don't do init setup.
    jail.run_as_init();
    // Set up requested remount mode instead of default MS_PRIVATE.
    if let Some(mode) = config.remount_mode {
        jail.set_remount_mode(mode);
    }

    Ok(jail)
}
342
343/// Creates a basic [Minijail] if `jail_config` is present.
344///
345/// Returns `None` if `jail_config` is none.
346pub fn simple_jail(jail_config: Option<&JailConfig>, policy: &str) -> Result<Option<Minijail>> {
347    if let Some(jail_config) = jail_config {
348        let config = SandboxConfig::new(jail_config, policy);
349        Ok(Some(create_sandbox_minijail(
350            &jail_config.pivot_root,
351            MAX_OPEN_FILES_DEFAULT,
352            &config,
353        )?))
354    } else {
355        Ok(None)
356    }
357}
358
359/// Creates [Minijail] for gpu processes.
360pub fn create_gpu_minijail(
361    root: &Path,
362    config: &SandboxConfig,
363    render_node_only: bool,
364    snapshot_scratch_directory: Option<&Path>,
365) -> Result<Minijail> {
366    let mut jail = create_sandbox_minijail(root, MAX_OPEN_FILES_FOR_GPU, config)?;
367
368    // Device nodes required for DRM.
369    let sys_dev_char_path = Path::new("/sys/dev/char");
370    jail.mount_bind(sys_dev_char_path, sys_dev_char_path, false)?;
371
372    // Necessary for CGROUP control of the vGPU threads
373    // This is not necessary UNLESS one wants to make use
374    // of the gpu cgroup command line options.
375    let sys_cpuset_path = Path::new("/sys/fs/cgroup/cpuset");
376    if sys_cpuset_path.exists() {
377        jail.mount_bind(sys_cpuset_path, sys_cpuset_path, true)?;
378    }
379
380    let sys_devices_path = Path::new("/sys/devices");
381    jail.mount_bind(sys_devices_path, sys_devices_path, false)?;
382
383    jail_mount_bind_drm(&mut jail, render_node_only)?;
384
385    // If the ARM specific devices exist on the host, bind mount them in.
386    let mali0_path = Path::new("/dev/mali0");
387    if mali0_path.exists() {
388        jail.mount_bind(mali0_path, mali0_path, true)?;
389    }
390
391    let pvr_sync_path = Path::new("/dev/pvr_sync");
392    if pvr_sync_path.exists() {
393        jail.mount_bind(pvr_sync_path, pvr_sync_path, true)?;
394    }
395
396    // If the udmabuf driver exists on the host, bind mount it in.
397    let udmabuf_path = Path::new("/dev/udmabuf");
398    if udmabuf_path.exists() {
399        jail.mount_bind(udmabuf_path, udmabuf_path, true)?;
400    }
401
402    // Libraries that are required when mesa drivers are dynamically loaded.
403    jail_mount_bind_if_exists(
404        &mut jail,
405        &[
406            "/usr/lib",
407            "/usr/lib64",
408            "/lib",
409            "/lib64",
410            "/usr/share/drirc.d",
411            "/usr/share/glvnd",
412            "/usr/share/libdrm",
413            "/usr/share/vulkan",
414        ],
415    )?;
416
417    // pvr driver requires read access to /proc/self/task/*/comm.
418    mount_proc(&mut jail)?;
419
420    // To enable perfetto tracing, we need to give access to the perfetto service IPC
421    // endpoints.
422    let perfetto_path = Path::new("/run/perfetto");
423    if perfetto_path.exists() {
424        jail.mount_bind(perfetto_path, perfetto_path, true)?;
425    }
426
427    // Provide scratch space for the GPU device to build or unpack snapshots.
428    if let Some(snapshot_scratch_directory) = snapshot_scratch_directory {
429        jail.mount_with_data(
430            Path::new("none"),
431            snapshot_scratch_directory,
432            "tmpfs",
433            (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
434            "size=4294967296",
435        )?;
436    }
437
438    Ok(jail)
439}
440
441/// Selectively bind mount drm nodes into `jail` based on `render_node_only`
442///
443/// This function will not return an error if drm nodes don't exist
444pub fn jail_mount_bind_drm(jail: &mut Minijail, render_node_only: bool) -> Result<()> {
445    if render_node_only {
446        const DRM_NUM_NODES: u32 = 63;
447        const DRM_RENDER_NODE_START: u32 = 128;
448        for offset in 0..DRM_NUM_NODES {
449            let path_str = format!("/dev/dri/renderD{}", DRM_RENDER_NODE_START + offset);
450            let drm_dri_path = Path::new(&path_str);
451            if !drm_dri_path.exists() {
452                break;
453            }
454            jail.mount_bind(drm_dri_path, drm_dri_path, false)?;
455        }
456    } else {
457        let drm_dri_path = Path::new("/dev/dri");
458        if drm_dri_path.exists() {
459            jail.mount_bind(drm_dri_path, drm_dri_path, false)?;
460        }
461    }
462
463    Ok(())
464}
465
466/// Mirror-mount all the directories in `dirs` into `jail` on a best-effort basis.
467///
468/// This function will not return an error if any of the directories in `dirs` is missing.
469pub fn jail_mount_bind_if_exists<P: AsRef<std::ffi::OsStr>>(
470    jail: &mut Minijail,
471    dirs: &[P],
472) -> Result<()> {
473    for dir in dirs {
474        let dir_path = Path::new(dir);
475        if dir_path.exists() {
476            jail.mount_bind(dir_path, dir_path, false)?;
477        }
478    }
479
480    Ok(())
481}
482
483/// Mount proc in the sandbox.
484pub fn mount_proc(jail: &mut Minijail) -> Result<()> {
485    jail.mount(
486        Path::new("proc"),
487        Path::new("/proc"),
488        "proc",
489        (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_RDONLY) as usize,
490    )?;
491    Ok(())
492}
493
/// Read minijail internal struct address for uniquely identifying and tracking jail's lifetime.
///
/// Only compiled with the `seccomp_trace` feature. The returned value is the first
/// `usize`-sized word of the `Minijail` object, interpreted as an address.
#[cfg(feature = "seccomp_trace")]
pub fn read_jail_addr(jail: &Minijail) -> usize {
    // We can only hope minijail's rust object will always contain a pointer to C jail struct as the
    // first field.
    const_assert!(std::mem::size_of::<Minijail>() >= std::mem::size_of::<usize>());
    // SAFETY: `jail` is a valid reference, and the static assert above guarantees that
    // `Minijail` is at least `usize`-sized, so the read stays within the object.
    // NOTE(review): this also presumes `Minijail` is at least `usize`-aligned — confirm.
    unsafe { *(jail as *const Minijail as *const usize) }
}
503
504/// Set the uid/gid for the jailed process and give a basic id map. This is
505/// required for bind mounts to work.
506fn add_current_user_to_jail(jail: &mut Minijail) -> Result<()> {
507    let crosvm_uid = geteuid();
508    let crosvm_gid = getegid();
509
510    jail.uidmap(&format!("{crosvm_uid} {crosvm_uid} 1"))
511        .context("error setting UID map")?;
512    jail.gidmap(&format!("{crosvm_gid} {crosvm_gid} 1"))
513        .context("error setting GID map")?;
514
515    if crosvm_uid != 0 {
516        jail.change_uid(crosvm_uid);
517    }
518    if crosvm_gid != 0 {
519        jail.change_gid(crosvm_gid);
520    }
521    Ok(())
522}
523
524/// Set the seccomp policy for a jail from embedded bpfs
525pub fn set_embedded_bpf_program(jail: &mut Minijail, seccomp_policy_name: &str) -> Result<()> {
526    let bpf_program = EMBEDDED_BPFS.get(seccomp_policy_name).with_context(|| {
527        format!("failed to find embedded seccomp policy: {seccomp_policy_name}")
528    })?;
529    jail.parse_seccomp_bytes(bpf_program).with_context(|| {
530        format!("failed to parse embedded seccomp policy: {seccomp_policy_name}")
531    })?;
532    Ok(())
533}