CCv0: Merge main into CCv0 branch

Merge remote-tracking branch 'upstream/main' into CCv0

Fixes: #5905
Signed-off-by: Georgina Kinge <georgina.kinge@ibm.com>
Georgina Kinge
2022-12-14 14:55:23 +00:00
71 changed files with 1531 additions and 308 deletions

.gitignore (vendored)

@@ -4,6 +4,8 @@
**/*.rej **/*.rej
**/target **/target
**/.vscode **/.vscode
**/.idea
**/.fleet
pkg/logging/Cargo.lock pkg/logging/Cargo.lock
src/agent/src/version.rs src/agent/src/version.rs
src/agent/kata-agent.service src/agent/kata-agent.service


@@ -86,6 +86,27 @@ $ sudo sed -i '/^disable_guest_seccomp/ s/true/false/' /etc/kata-containers/conf
This will pass container seccomp profiles to the kata agent.
## Enable SELinux on the guest
> **Note:**
>
> - To enable SELinux on the guest, SELinux MUST also be enabled on the host.
> - You MUST create and build a rootfs image for SELinux in advance.
> See [Create a rootfs image](#create-a-rootfs-image) and [Build a rootfs image](#build-a-rootfs-image).
> - SELinux on the guest is currently supported only with a rootfs image, so
> you cannot enable SELinux with the agent init (`AGENT_INIT=yes`) yet.
Enable guest SELinux in Enforcing mode as follows:
```
$ sudo sed -i '/^disable_guest_selinux/ s/true/false/g' /etc/kata-containers/configuration.toml
```
The runtime will automatically add `selinux=1` to the guest kernel parameters and the `xattr` option to
`virtiofsd` when `disable_guest_selinux` is set to `false`.
If you want to enable SELinux in Permissive mode, add `enforcing=0` to the kernel parameters.
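A minimal sketch of that, assuming your `configuration.toml` contains the usual `kernel_params = "..."` entry, is to append `enforcing=0` with `sed` in the same style as the command above:
```
# Appends enforcing=0 to the existing kernel_params value (assumes the entry is present)
$ sudo sed -i -e 's/^kernel_params = "\(.*\)"/kernel_params = "\1 enforcing=0"/' /etc/kata-containers/configuration.toml
```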
## Enable full debug
Enable full debug as follows:
@@ -256,6 +277,12 @@ If you want to build the agent without seccomp capability, you need to run the `
$ script -fec 'sudo -E AGENT_INIT=yes USE_DOCKER=true SECCOMP=no ./rootfs.sh "${distro}"'
```
If you want to enable SELinux on the guest, you MUST choose `centos` and run the `rootfs.sh` script with `SELINUX=yes` as follows.
```
$ script -fec 'sudo -E GOPATH=$GOPATH USE_DOCKER=true SELINUX=yes ./rootfs.sh centos'
```
> **Note:**
>
> - Check the [compatibility matrix](../tools/osbuilder/README.md#platform-distro-compatibility-matrix) before creating rootfs.
@@ -283,6 +310,19 @@ $ script -fec 'sudo -E USE_DOCKER=true ./image_builder.sh "${ROOTFS_DIR}"'
$ popd
```
If you want to enable SELinux on the guest, you MUST run the `image_builder.sh` script with `SELINUX=yes`
to label the guest image as follows.
To label the image on the host, you need to make sure that SELinux is enabled (`selinuxfs` is mounted) on the host,
and the rootfs MUST have been created by running `rootfs.sh` with `SELINUX=yes`.
```
$ script -fec 'sudo -E USE_DOCKER=true SELINUX=yes ./image_builder.sh ${ROOTFS_DIR}'
```
Currently, `image_builder.sh` uses `chcon` as an interim solution to apply `container_runtime_exec_t`
to the `kata-agent`. Hence, if you run `restorecon` on the guest image after running `image_builder.sh`,
you need to relabel the `kata-agent` with `container_runtime_exec_t` yourself.
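For instance, a minimal relabeling sketch could look like the following, where `${IMAGE_MOUNT_DIR}` and the in-image path `/usr/bin/kata-agent` are illustrative assumptions rather than values prescribed by this guide:
```
# Assumes the guest image is already mounted at ${IMAGE_MOUNT_DIR}
$ sudo chcon -t container_runtime_exec_t "${IMAGE_MOUNT_DIR}/usr/bin/kata-agent"
```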
> **Notes:**
>
> - You must ensure that the *default Docker runtime* is `runc` to make use of

src/agent/Cargo.lock (generated)

@@ -2300,7 +2300,6 @@ dependencies = [
"slog", "slog",
"slog-scope", "slog-scope",
"slog-stdlog", "slog-stdlog",
"sysinfo",
"tempfile", "tempfile",
"test-utils", "test-utils",
"thiserror", "thiserror",
@@ -4076,6 +4075,7 @@ dependencies = [
"tempfile", "tempfile",
"test-utils", "test-utils",
"tokio", "tokio",
"xattr",
"zbus", "zbus",
] ]


@@ -35,6 +35,7 @@ inotify = "0.9.2"
libseccomp = { version = "0.3.0", optional = true } libseccomp = { version = "0.3.0", optional = true }
zbus = "2.3.0" zbus = "2.3.0"
bit-vec= "0.6.3" bit-vec= "0.6.3"
xattr = "0.2.3"
[dev-dependencies] [dev-dependencies]
serial_test = "0.5.0" serial_test = "0.5.0"


@@ -30,6 +30,7 @@ use crate::log_child;
use crate::process::Process; use crate::process::Process;
#[cfg(feature = "seccomp")] #[cfg(feature = "seccomp")]
use crate::seccomp; use crate::seccomp;
use crate::selinux;
use crate::specconv::CreateOpts; use crate::specconv::CreateOpts;
use crate::{mount, validator}; use crate::{mount, validator};
@@ -109,7 +110,6 @@ impl Default for ContainerStatus {
} }
// We might want to change this to thiserror in the future // We might want to change this to thiserror in the future
const MissingCGroupManager: &str = "failed to get container's cgroup Manager";
const MissingLinux: &str = "no linux config"; const MissingLinux: &str = "no linux config";
const InvalidNamespace: &str = "invalid namespace type"; const InvalidNamespace: &str = "invalid namespace type";
@@ -243,7 +243,7 @@ pub struct LinuxContainer {
pub id: String, pub id: String,
pub root: String, pub root: String,
pub config: Config, pub config: Config,
pub cgroup_manager: Option<Box<dyn Manager + Send + Sync>>, pub cgroup_manager: Box<dyn Manager + Send + Sync>,
pub init_process_pid: pid_t, pub init_process_pid: pid_t,
pub init_process_start_time: u64, pub init_process_start_time: u64,
pub uid_map_path: String, pub uid_map_path: String,
@@ -292,16 +292,11 @@ impl Container for LinuxContainer {
)); ));
} }
if self.cgroup_manager.is_some() { self.cgroup_manager.as_ref().freeze(FreezerState::Frozen)?;
self.cgroup_manager
.as_ref()
.unwrap()
.freeze(FreezerState::Frozen)?;
self.status.transition(ContainerState::Paused); self.status.transition(ContainerState::Paused);
return Ok(());
} Ok(())
Err(anyhow!(MissingCGroupManager))
} }
fn resume(&mut self) -> Result<()> { fn resume(&mut self) -> Result<()> {
@@ -310,16 +305,11 @@ impl Container for LinuxContainer {
return Err(anyhow!("container status is: {:?}, not paused", status)); return Err(anyhow!("container status is: {:?}, not paused", status));
} }
if self.cgroup_manager.is_some() { self.cgroup_manager.as_ref().freeze(FreezerState::Thawed)?;
self.cgroup_manager
.as_ref()
.unwrap()
.freeze(FreezerState::Thawed)?;
self.status.transition(ContainerState::Running); self.status.transition(ContainerState::Running);
return Ok(());
} Ok(())
Err(anyhow!(MissingCGroupManager))
} }
} }
@@ -537,6 +527,8 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {
} }
} }
let selinux_enabled = selinux::is_enabled()?;
sched::unshare(to_new & !CloneFlags::CLONE_NEWUSER)?; sched::unshare(to_new & !CloneFlags::CLONE_NEWUSER)?;
if userns { if userns {
@@ -638,6 +630,18 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {
capctl::prctl::set_no_new_privs().map_err(|_| anyhow!("cannot set no new privileges"))?; capctl::prctl::set_no_new_privs().map_err(|_| anyhow!("cannot set no new privileges"))?;
} }
// Set SELinux label
if !oci_process.selinux_label.is_empty() {
if !selinux_enabled {
return Err(anyhow!(
"SELinux label for the process is provided but SELinux is not enabled on the running kernel"
));
}
log_child!(cfd_log, "Set SELinux label to the container process");
selinux::set_exec_label(&oci_process.selinux_label)?;
}
// Log unknown seccomp system calls in advance before the log file descriptor closes. // Log unknown seccomp system calls in advance before the log file descriptor closes.
#[cfg(feature = "seccomp")] #[cfg(feature = "seccomp")]
if let Some(ref scmp) = linux.seccomp { if let Some(ref scmp) = linux.seccomp {
@@ -847,22 +851,17 @@ impl BaseContainer for LinuxContainer {
} }
fn stats(&self) -> Result<StatsContainerResponse> { fn stats(&self) -> Result<StatsContainerResponse> {
let mut r = StatsContainerResponse::default();
if self.cgroup_manager.is_some() {
r.cgroup_stats =
SingularPtrField::some(self.cgroup_manager.as_ref().unwrap().get_stats()?);
}
// what about network interface stats? // what about network interface stats?
Ok(r) Ok(StatsContainerResponse {
cgroup_stats: SingularPtrField::some(self.cgroup_manager.as_ref().get_stats()?),
..Default::default()
})
} }
fn set(&mut self, r: LinuxResources) -> Result<()> { fn set(&mut self, r: LinuxResources) -> Result<()> {
if self.cgroup_manager.is_some() { self.cgroup_manager.as_ref().set(&r, true)?;
self.cgroup_manager.as_ref().unwrap().set(&r, true)?;
}
self.config self.config
.spec .spec
.as_mut() .as_mut()
@@ -1035,7 +1034,7 @@ impl BaseContainer for LinuxContainer {
&logger, &logger,
spec, spec,
&p, &p,
self.cgroup_manager.as_ref().unwrap().as_ref(), self.cgroup_manager.as_ref(),
self.config.use_systemd_cgroup, self.config.use_systemd_cgroup,
&st, &st,
&mut pipe_w, &mut pipe_w,
@@ -1114,19 +1113,19 @@ impl BaseContainer for LinuxContainer {
)?; )?;
fs::remove_dir_all(&self.root)?; fs::remove_dir_all(&self.root)?;
if let Some(cgm) = self.cgroup_manager.as_mut() { let cgm = self.cgroup_manager.as_mut();
// Kill all of the processes created in this container to prevent // Kill all of the processes created in this container to prevent
// the leak of some daemon process when this container shared pidns // the leak of some daemon process when this container shared pidns
// with the sandbox. // with the sandbox.
let pids = cgm.get_pids().context("get cgroup pids")?; let pids = cgm.get_pids().context("get cgroup pids")?;
for i in pids { for i in pids {
if let Err(e) = signal::kill(Pid::from_raw(i), Signal::SIGKILL) { if let Err(e) = signal::kill(Pid::from_raw(i), Signal::SIGKILL) {
warn!(self.logger, "kill the process {} error: {:?}", i, e); warn!(self.logger, "kill the process {} error: {:?}", i, e);
}
} }
cgm.destroy().context("destroy cgroups")?;
} }
cgm.destroy().context("destroy cgroups")?;
Ok(()) Ok(())
} }
@@ -1514,7 +1513,7 @@ impl LinuxContainer {
Ok(LinuxContainer { Ok(LinuxContainer {
id: id.clone(), id: id.clone(),
root, root,
cgroup_manager: Some(cgroup_manager), cgroup_manager,
status: ContainerStatus::new(), status: ContainerStatus::new(),
uid_map_path: String::from(""), uid_map_path: String::from(""),
gid_map_path: "".to_string(), gid_map_path: "".to_string(),
@@ -1980,24 +1979,12 @@ mod tests {
assert!(format!("{:?}", ret).contains("failed to pause container")) assert!(format!("{:?}", ret).contains("failed to pause container"))
} }
#[test]
fn test_linuxcontainer_pause_cgroupmgr_is_none() {
let ret = new_linux_container_and_then(|mut c: LinuxContainer| {
c.cgroup_manager = None;
c.pause().map_err(|e| anyhow!(e))
});
assert!(ret.is_err(), "Expecting error, Got {:?}", ret);
}
#[test] #[test]
fn test_linuxcontainer_pause() { fn test_linuxcontainer_pause() {
let ret = new_linux_container_and_then(|mut c: LinuxContainer| { let ret = new_linux_container_and_then(|mut c: LinuxContainer| {
let cgroup_manager: Box<dyn Manager + Send + Sync> = c.cgroup_manager = Box::new(FsManager::new("").map_err(|e| {
Box::new(FsManager::new("").map_err(|e| { anyhow!(format!("fail to create cgroup manager with path: {:}", e))
anyhow!(format!("fail to create cgroup manager with path: {:}", e)) })?);
})?);
c.cgroup_manager = Some(cgroup_manager);
c.pause().map_err(|e| anyhow!(e)) c.pause().map_err(|e| anyhow!(e))
}); });
@@ -2016,25 +2003,12 @@ mod tests {
assert!(format!("{:?}", ret).contains("not paused")) assert!(format!("{:?}", ret).contains("not paused"))
} }
#[test]
fn test_linuxcontainer_resume_cgroupmgr_is_none() {
let ret = new_linux_container_and_then(|mut c: LinuxContainer| {
c.status.transition(ContainerState::Paused);
c.cgroup_manager = None;
c.resume().map_err(|e| anyhow!(e))
});
assert!(ret.is_err(), "Expecting error, Got {:?}", ret);
}
#[test] #[test]
fn test_linuxcontainer_resume() { fn test_linuxcontainer_resume() {
let ret = new_linux_container_and_then(|mut c: LinuxContainer| { let ret = new_linux_container_and_then(|mut c: LinuxContainer| {
let cgroup_manager: Box<dyn Manager + Send + Sync> = c.cgroup_manager = Box::new(FsManager::new("").map_err(|e| {
Box::new(FsManager::new("").map_err(|e| { anyhow!(format!("fail to create cgroup manager with path: {:}", e))
anyhow!(format!("fail to create cgroup manager with path: {:}", e)) })?);
})?);
c.cgroup_manager = Some(cgroup_manager);
// Change status to paused, this way we can resume it // Change status to paused, this way we can resume it
c.status.transition(ContainerState::Paused); c.status.transition(ContainerState::Paused);
c.resume().map_err(|e| anyhow!(e)) c.resume().map_err(|e| anyhow!(e))


@@ -38,6 +38,7 @@ pub mod pipestream;
pub mod process; pub mod process;
#[cfg(feature = "seccomp")] #[cfg(feature = "seccomp")]
pub mod seccomp; pub mod seccomp;
pub mod selinux;
pub mod specconv; pub mod specconv;
pub mod sync; pub mod sync;
pub mod sync_with_async; pub mod sync_with_async;


@@ -25,6 +25,7 @@ use std::fs::File;
use std::io::{BufRead, BufReader}; use std::io::{BufRead, BufReader};
use crate::container::DEFAULT_DEVICES; use crate::container::DEFAULT_DEVICES;
use crate::selinux;
use crate::sync::write_count; use crate::sync::write_count;
use std::string::ToString; use std::string::ToString;
@@ -181,6 +182,8 @@ pub fn init_rootfs(
None => flags |= MsFlags::MS_SLAVE, None => flags |= MsFlags::MS_SLAVE,
} }
let label = &linux.mount_label;
let root = spec let root = spec
.root .root
.as_ref() .as_ref()
@@ -244,7 +247,7 @@ pub fn init_rootfs(
} }
} }
mount_from(cfd_log, m, rootfs, flags, &data, "")?; mount_from(cfd_log, m, rootfs, flags, &data, label)?;
// bind mount won't change mount options, we need remount to make mount options // bind mount won't change mount options, we need remount to make mount options
// effective. // effective.
// first check that we have non-default options required before attempting a // first check that we have non-default options required before attempting a
@@ -524,7 +527,6 @@ pub fn pivot_rootfs<P: ?Sized + NixPath + std::fmt::Debug>(path: &P) -> Result<(
fn rootfs_parent_mount_private(path: &str) -> Result<()> { fn rootfs_parent_mount_private(path: &str) -> Result<()> {
let mount_infos = parse_mount_table(MOUNTINFO_PATH)?; let mount_infos = parse_mount_table(MOUNTINFO_PATH)?;
let mut max_len = 0; let mut max_len = 0;
let mut mount_point = String::from(""); let mut mount_point = String::from("");
let mut options = String::from(""); let mut options = String::from("");
@@ -767,9 +769,9 @@ fn mount_from(
rootfs: &str, rootfs: &str,
flags: MsFlags, flags: MsFlags,
data: &str, data: &str,
_label: &str, label: &str,
) -> Result<()> { ) -> Result<()> {
let d = String::from(data); let mut d = String::from(data);
let dest = secure_join(rootfs, &m.destination); let dest = secure_join(rootfs, &m.destination);
let src = if m.r#type.as_str() == "bind" { let src = if m.r#type.as_str() == "bind" {
@@ -822,6 +824,37 @@ fn mount_from(
e e
})?; })?;
// Set the SELinux context for the mounts
let mut use_xattr = false;
if !label.is_empty() {
if selinux::is_enabled()? {
let device = Path::new(&m.source)
.file_name()
.ok_or_else(|| anyhow!("invalid device source path: {}", &m.source))?
.to_str()
.ok_or_else(|| anyhow!("failed to convert device source path: {}", &m.source))?;
match device {
// SELinux does not support labeling of /proc or /sys
"proc" | "sysfs" => (),
// SELinux does not support mount labeling against /dev/mqueue,
// so we use setxattr instead
"mqueue" => {
use_xattr = true;
}
_ => {
log_child!(cfd_log, "add SELinux mount label to {}", dest.as_str());
selinux::add_mount_label(&mut d, label);
}
}
} else {
log_child!(
cfd_log,
"SELinux label for the mount is provided but SELinux is not enabled on the running kernel"
);
}
}
mount( mount(
Some(src.as_str()), Some(src.as_str()),
dest.as_str(), dest.as_str(),
@@ -834,6 +867,10 @@ fn mount_from(
e e
})?; })?;
if !label.is_empty() && selinux::is_enabled()? && use_xattr {
xattr::set(dest.as_str(), "security.selinux", label.as_bytes())?;
}
if flags.contains(MsFlags::MS_BIND) if flags.contains(MsFlags::MS_BIND)
&& flags.intersects( && flags.intersects(
!(MsFlags::MS_REC !(MsFlags::MS_REC


@@ -0,0 +1,80 @@
// Copyright 2022 Sony Group Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
use anyhow::{Context, Result};
use nix::unistd::gettid;
use std::fs::{self, OpenOptions};
use std::io::prelude::*;
use std::path::Path;
pub fn is_enabled() -> Result<bool> {
let buf = fs::read_to_string("/proc/mounts")?;
let enabled = buf.contains("selinuxfs");
Ok(enabled)
}
pub fn add_mount_label(data: &mut String, label: &str) {
if data.is_empty() {
let context = format!("context=\"{}\"", label);
data.push_str(&context);
} else {
let context = format!(",context=\"{}\"", label);
data.push_str(&context);
}
}
pub fn set_exec_label(label: &str) -> Result<()> {
let mut attr_path = Path::new("/proc/thread-self/attr/exec").to_path_buf();
if !attr_path.exists() {
// Fall back to the old convention
attr_path = Path::new("/proc/self/task")
.join(gettid().to_string())
.join("attr/exec")
}
let mut file = OpenOptions::new()
.write(true)
.truncate(true)
.open(attr_path)?;
file.write_all(label.as_bytes())
.with_context(|| "failed to apply SELinux label")?;
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
const TEST_LABEL: &str = "system_u:system_r:unconfined_t:s0";
#[test]
fn test_is_enabled() {
let ret = is_enabled();
assert!(ret.is_ok(), "Expecting Ok, Got {:?}", ret);
}
#[test]
fn test_add_mount_label() {
let mut data = String::new();
add_mount_label(&mut data, TEST_LABEL);
assert_eq!(data, format!("context=\"{}\"", TEST_LABEL));
let mut data = String::from("defaults");
add_mount_label(&mut data, TEST_LABEL);
assert_eq!(data, format!("defaults,context=\"{}\"", TEST_LABEL));
}
#[test]
fn test_set_exec_label() {
let ret = set_exec_label(TEST_LABEL);
if is_enabled().unwrap() {
assert!(ret.is_ok(), "Expecting Ok, Got {:?}", ret);
} else {
assert!(ret.is_err(), "Expecting error, Got {:?}", ret);
}
}
}


@@ -6,6 +6,7 @@
use crate::container::Config; use crate::container::Config;
use anyhow::{anyhow, Context, Result}; use anyhow::{anyhow, Context, Result};
use oci::{Linux, LinuxIdMapping, LinuxNamespace, Spec}; use oci::{Linux, LinuxIdMapping, LinuxNamespace, Spec};
use regex::Regex;
use std::collections::HashMap; use std::collections::HashMap;
use std::path::{Component, PathBuf}; use std::path::{Component, PathBuf};
@@ -86,6 +87,23 @@ fn hostname(oci: &Spec) -> Result<()> {
fn security(oci: &Spec) -> Result<()> { fn security(oci: &Spec) -> Result<()> {
let linux = get_linux(oci)?; let linux = get_linux(oci)?;
let label_pattern = r".*_u:.*_r:.*_t:s[0-9]|1[0-5].*";
let label_regex = Regex::new(label_pattern)?;
if let Some(ref process) = oci.process {
if !process.selinux_label.is_empty() && !label_regex.is_match(&process.selinux_label) {
return Err(anyhow!(
"SELinux label for the process is invalid format: {}",
&process.selinux_label
));
}
}
if !linux.mount_label.is_empty() && !label_regex.is_match(&linux.mount_label) {
return Err(anyhow!(
"SELinux label for the mount is invalid format: {}",
&linux.mount_label
));
}
if linux.masked_paths.is_empty() && linux.readonly_paths.is_empty() { if linux.masked_paths.is_empty() && linux.readonly_paths.is_empty() {
return Ok(()); return Ok(());
@@ -95,8 +113,6 @@ fn security(oci: &Spec) -> Result<()> {
return Err(anyhow!("Linux namespace does not contain mount")); return Err(anyhow!("Linux namespace does not contain mount"));
} }
// don't care about selinux at present
Ok(()) Ok(())
} }
@@ -285,7 +301,7 @@ pub fn validate(conf: &Config) -> Result<()> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use oci::Mount; use oci::{Mount, Process};
#[test] #[test]
fn test_namespace() { fn test_namespace() {
@@ -388,6 +404,29 @@ mod tests {
]; ];
spec.linux = Some(linux); spec.linux = Some(linux);
security(&spec).unwrap(); security(&spec).unwrap();
// SELinux
let valid_label = "system_u:system_r:container_t:s0:c123,c456";
let mut process = Process::default();
process.selinux_label = valid_label.to_string();
spec.process = Some(process);
security(&spec).unwrap();
let mut linux = Linux::default();
linux.mount_label = valid_label.to_string();
spec.linux = Some(linux);
security(&spec).unwrap();
let invalid_label = "system_u:system_r:container_t";
let mut process = Process::default();
process.selinux_label = invalid_label.to_string();
spec.process = Some(process);
security(&spec).unwrap_err();
let mut linux = Linux::default();
linux.mount_label = invalid_label.to_string();
spec.linux = Some(linux);
security(&spec).unwrap_err();
} }
#[test] #[test]


@@ -348,14 +348,13 @@ impl AgentService {
} }
// start oom event loop // start oom event loop
if let Some(ref ctr) = ctr.cgroup_manager {
let cg_path = ctr.get_cgroup_path("memory");
if let Ok(cg_path) = cg_path { let cg_path = ctr.cgroup_manager.as_ref().get_cgroup_path("memory");
let rx = notifier::notify_oom(cid.as_str(), cg_path.to_string()).await?;
s.run_oom_event_monitor(rx, cid.clone()).await; if let Ok(cg_path) = cg_path {
} let rx = notifier::notify_oom(cid.as_str(), cg_path.to_string()).await?;
s.run_oom_event_monitor(rx, cid.clone()).await;
} }
Ok(()) Ok(())
@@ -542,11 +541,7 @@ impl AgentService {
let ctr = sandbox let ctr = sandbox
.get_container(cid) .get_container(cid)
.ok_or_else(|| anyhow!("Invalid container id {}", cid))?; .ok_or_else(|| anyhow!("Invalid container id {}", cid))?;
let cm = ctr ctr.cgroup_manager.as_ref().freeze(state)?;
.cgroup_manager
.as_ref()
.ok_or_else(|| anyhow!("cgroup manager not exist"))?;
cm.freeze(state)?;
Ok(()) Ok(())
} }
@@ -556,11 +551,7 @@ impl AgentService {
let ctr = sandbox let ctr = sandbox
.get_container(cid) .get_container(cid)
.ok_or_else(|| anyhow!("Invalid container id {}", cid))?; .ok_or_else(|| anyhow!("Invalid container id {}", cid))?;
let cm = ctr let pids = ctr.cgroup_manager.as_ref().get_pids()?;
.cgroup_manager
.as_ref()
.ok_or_else(|| anyhow!("cgroup manager not exist"))?;
let pids = cm.get_pids()?;
Ok(pids) Ok(pids)
} }


@@ -298,7 +298,6 @@ impl Sandbox {
info!(self.logger, "updating {}", ctr.id.as_str()); info!(self.logger, "updating {}", ctr.id.as_str());
ctr.cgroup_manager ctr.cgroup_manager
.as_ref() .as_ref()
.unwrap()
.update_cpuset_path(guest_cpuset.as_str(), container_cpust)?; .update_cpuset_path(guest_cpuset.as_str(), container_cpust)?;
} }

src/libs/Cargo.lock (generated)

@@ -40,6 +40,12 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "base64"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
[[package]] [[package]]
name = "bitflags" name = "bitflags"
version = "1.2.1" version = "1.2.1"
@@ -420,6 +426,8 @@ dependencies = [
name = "kata-types" name = "kata-types"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"anyhow",
"base64",
"bitmask-enum", "bitmask-enum",
"byte-unit", "byte-unit",
"glob", "glob",


@@ -43,8 +43,6 @@
use std::fmt::Debug; use std::fmt::Debug;
use std::fs; use std::fs;
use std::io::{self, BufRead}; use std::io::{self, BufRead};
use std::os::raw::c_char;
use std::os::unix::ffi::OsStrExt;
use std::os::unix::fs::{DirBuilderExt, OpenOptionsExt}; use std::os::unix::fs::{DirBuilderExt, OpenOptionsExt};
use std::os::unix::io::AsRawFd; use std::os::unix::io::AsRawFd;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
@@ -213,11 +211,11 @@ pub fn create_mount_destination<S: AsRef<Path>, D: AsRef<Path>, R: AsRef<Path>>(
} }
} }
/// Remount a bind mount into readonly mode. /// Remount a bind mount
/// ///
/// # Safety /// # Safety
/// Caller needs to ensure safety of the `dst` to avoid possible file path based attacks. /// Caller needs to ensure safety of the `dst` to avoid possible file path based attacks.
pub fn bind_remount_read_only<P: AsRef<Path>>(dst: P) -> Result<()> { pub fn bind_remount<P: AsRef<Path>>(dst: P, readonly: bool) -> Result<()> {
let dst = dst.as_ref(); let dst = dst.as_ref();
if dst.is_empty() { if dst.is_empty() {
return Err(Error::NullMountPointPath); return Err(Error::NullMountPointPath);
@@ -226,7 +224,7 @@ pub fn bind_remount_read_only<P: AsRef<Path>>(dst: P) -> Result<()> {
.canonicalize() .canonicalize()
.map_err(|_e| Error::InvalidPath(dst.to_path_buf()))?; .map_err(|_e| Error::InvalidPath(dst.to_path_buf()))?;
do_rebind_mount_read_only(dst, MsFlags::empty()) do_rebind_mount(dst, readonly, MsFlags::empty())
} }
/// Bind mount `src` to `dst` in slave mode, optionally in readonly mode if `readonly` is true. /// Bind mount `src` to `dst` in slave mode, optionally in readonly mode if `readonly` is true.
@@ -239,7 +237,7 @@ pub fn bind_remount_read_only<P: AsRef<Path>>(dst: P) -> Result<()> {
pub fn bind_mount_unchecked<S: AsRef<Path>, D: AsRef<Path>>( pub fn bind_mount_unchecked<S: AsRef<Path>, D: AsRef<Path>>(
src: S, src: S,
dst: D, dst: D,
read_only: bool, readonly: bool,
) -> Result<()> { ) -> Result<()> {
fail::fail_point!("bind_mount", |_| { fail::fail_point!("bind_mount", |_| {
Err(Error::FailureInject( Err(Error::FailureInject(
@@ -275,8 +273,8 @@ pub fn bind_mount_unchecked<S: AsRef<Path>, D: AsRef<Path>>(
.map_err(|e| Error::Mount(PathBuf::new(), dst.to_path_buf(), e))?; .map_err(|e| Error::Mount(PathBuf::new(), dst.to_path_buf(), e))?;
// Optionally rebind into readonly mode. // Optionally rebind into readonly mode.
if read_only { if readonly {
do_rebind_mount_read_only(dst, MsFlags::empty())?; do_rebind_mount(dst, readonly, MsFlags::empty())?;
} }
Ok(()) Ok(())
@@ -356,7 +354,7 @@ impl Mounter for kata_types::mount::Mount {
// Bind mount readonly. // Bind mount readonly.
let bro_flag = MsFlags::MS_BIND | MsFlags::MS_RDONLY; let bro_flag = MsFlags::MS_BIND | MsFlags::MS_RDONLY;
if (o_flag & bro_flag) == bro_flag { if (o_flag & bro_flag) == bro_flag {
do_rebind_mount_read_only(target, o_flag)?; do_rebind_mount(target, true, o_flag)?;
} }
Ok(()) Ok(())
@@ -364,12 +362,16 @@ impl Mounter for kata_types::mount::Mount {
} }
#[inline] #[inline]
fn do_rebind_mount_read_only<P: AsRef<Path>>(path: P, flags: MsFlags) -> Result<()> { fn do_rebind_mount<P: AsRef<Path>>(path: P, readonly: bool, flags: MsFlags) -> Result<()> {
mount( mount(
Some(""), Some(""),
path.as_ref(), path.as_ref(),
Some(""), Some(""),
flags | MsFlags::MS_BIND | MsFlags::MS_REMOUNT | MsFlags::MS_RDONLY, if readonly {
flags | MsFlags::MS_BIND | MsFlags::MS_REMOUNT | MsFlags::MS_RDONLY
} else {
flags | MsFlags::MS_BIND | MsFlags::MS_REMOUNT
},
Some(""), Some(""),
) )
.map_err(|e| Error::Remount(path.as_ref().to_path_buf(), e)) .map_err(|e| Error::Remount(path.as_ref().to_path_buf(), e))
@@ -756,18 +758,11 @@ pub fn umount_all<P: AsRef<Path>>(mountpoint: P, lazy_umount: bool) -> Result<()
// Counterpart of nix::umount2, with support of `UMOUNT_FOLLOW`. // Counterpart of nix::umount2, with support of `UMOUNT_FOLLOW`.
fn umount2<P: AsRef<Path>>(path: P, lazy_umount: bool) -> std::io::Result<()> { fn umount2<P: AsRef<Path>>(path: P, lazy_umount: bool) -> std::io::Result<()> {
let path_ptr = path.as_ref().as_os_str().as_bytes().as_ptr() as *const c_char; let mut flags = MntFlags::UMOUNT_NOFOLLOW;
let mut flags = MntFlags::UMOUNT_NOFOLLOW.bits();
if lazy_umount { if lazy_umount {
flags |= MntFlags::MNT_DETACH.bits(); flags |= MntFlags::MNT_DETACH;
}
// Safe because parameter is valid and we have checked the reuslt.
if unsafe { libc::umount2(path_ptr, flags) } < 0 {
Err(io::Error::last_os_error())
} else {
Ok(())
} }
nix::mount::umount2(path.as_ref(), flags).map_err(io::Error::from)
} }
#[cfg(test)] #[cfg(test)]
@@ -820,21 +815,21 @@ mod tests {
#[test] #[test]
#[ignore] #[ignore]
fn test_bind_remount_read_only() { fn test_bind_remount() {
let tmpdir = tempfile::tempdir().unwrap(); let tmpdir = tempfile::tempdir().unwrap();
let tmpdir2 = tempfile::tempdir().unwrap(); let tmpdir2 = tempfile::tempdir().unwrap();
assert!(matches!( assert!(matches!(
bind_remount_read_only(&PathBuf::from("")), bind_remount(&PathBuf::from(""), true),
Err(Error::NullMountPointPath) Err(Error::NullMountPointPath)
)); ));
assert!(matches!( assert!(matches!(
bind_remount_read_only(&PathBuf::from("../______doesn't____exist____nnn")), bind_remount(&PathBuf::from("../______doesn't____exist____nnn"), true),
Err(Error::InvalidPath(_)) Err(Error::InvalidPath(_))
)); ));
bind_mount_unchecked(tmpdir2.path(), tmpdir.path(), true).unwrap(); bind_mount_unchecked(tmpdir2.path(), tmpdir.path(), true).unwrap();
bind_remount_read_only(tmpdir.path()).unwrap(); bind_remount(tmpdir.path(), true).unwrap();
umount_timeout(tmpdir.path().to_str().unwrap(), 0).unwrap(); umount_timeout(tmpdir.path().to_str().unwrap(), 0).unwrap();
} }


@@ -35,7 +35,7 @@ pub const DEFAULT_VHOST_USER_STORE_PATH: &str = "/var/run/vhost-user";
pub const DEFAULT_BLOCK_NVDIMM_MEM_OFFSET: u64 = 0; pub const DEFAULT_BLOCK_NVDIMM_MEM_OFFSET: u64 = 0;
pub const DEFAULT_SHARED_FS_TYPE: &str = "virtio-fs"; pub const DEFAULT_SHARED_FS_TYPE: &str = "virtio-fs";
pub const DEFAULT_VIRTIO_FS_CACHE_MODE: &str = "none"; pub const DEFAULT_VIRTIO_FS_CACHE_MODE: &str = "never";
pub const DEFAULT_VIRTIO_FS_DAX_SIZE_MB: u32 = 1024; pub const DEFAULT_VIRTIO_FS_DAX_SIZE_MB: u32 = 1024;
pub const DEFAULT_SHARED_9PFS_SIZE_MB: u32 = 128 * 1024; pub const DEFAULT_SHARED_9PFS_SIZE_MB: u32 = 128 * 1024;
pub const MIN_SHARED_9PFS_SIZE_MB: u32 = 4 * 1024; pub const MIN_SHARED_9PFS_SIZE_MB: u32 = 4 * 1024;


@@ -864,7 +864,7 @@ impl SharedFsInfo {
)?; )?;
} }
let l = ["none", "auto", "always"]; let l = ["never", "auto", "always"];
if !l.contains(&self.virtio_fs_cache.as_str()) { if !l.contains(&self.virtio_fs_cache.as_str()) {
return Err(eother!( return Err(eother!(


@@ -249,6 +249,12 @@ dependencies = [
"rustc-demangle", "rustc-demangle",
] ]
[[package]]
name = "base64"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
[[package]] [[package]]
name = "bitflags" name = "bitflags"
version = "1.3.2" version = "1.3.2"
@@ -1352,6 +1358,8 @@ dependencies = [
name = "kata-types" name = "kata-types"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"anyhow",
"base64",
"bitmask-enum", "bitmask-enum",
"byte-unit", "byte-unit",
"glob", "glob",
@@ -2288,6 +2296,7 @@ dependencies = [
"rtnetlink", "rtnetlink",
"scopeguard", "scopeguard",
"serde", "serde",
"serde_json",
"slog", "slog",
"slog-scope", "slog-scope",
"test-utils", "test-utils",
@@ -2526,6 +2535,16 @@ dependencies = [
"unix_socket2", "unix_socket2",
] ]
[[package]]
name = "shim-ctl"
version = "0.1.0"
dependencies = [
"common",
"logging",
"runtimes",
"tokio",
]
[[package]] [[package]]
name = "signal-hook-registry" name = "signal-hook-registry"
version = "1.4.0" version = "1.4.0"


@@ -1,4 +1,5 @@
[workspace] [workspace]
members = [ members = [
"crates/shim", "crates/shim",
"crates/shim-ctl",
] ]


@@ -120,6 +120,8 @@ See the
See the
[debugging section of the developer guide](../../docs/Developer-Guide.md#troubleshoot-kata-containers).
An [experimental alternative binary](crates/shim-ctl/README.md) is available that removes containerd dependencies and makes it easier to run the shim proper outside of the runtime's usual deployment environment (e.g. on a developer machine).
## Limitations
For Kata Containers limitations, see the


@@ -14,6 +14,7 @@ pub mod hypervisor_persist;
pub use device::*; pub use device::*;
pub mod dragonball; pub mod dragonball;
mod kernel_param; mod kernel_param;
pub mod qemu;
pub use kernel_param::Param; pub use kernel_param::Param;
mod utils; mod utils;
use std::collections::HashMap; use std::collections::HashMap;
@@ -28,6 +29,8 @@ const VM_ROOTFS_DRIVER_BLK: &str = "virtio-blk";
const VM_ROOTFS_DRIVER_PMEM: &str = "virtio-pmem"; const VM_ROOTFS_DRIVER_PMEM: &str = "virtio-pmem";
pub const HYPERVISOR_DRAGONBALL: &str = "dragonball"; pub const HYPERVISOR_DRAGONBALL: &str = "dragonball";
pub const HYPERVISOR_QEMU: &str = "qemu";
#[derive(PartialEq)] #[derive(PartialEq)]
pub(crate) enum VmmState { pub(crate) enum VmmState {
NotReady, NotReady,


@@ -0,0 +1,142 @@
// Copyright (c) 2022 Red Hat
//
// SPDX-License-Identifier: Apache-2.0
//
use anyhow::Result;
use crate::{HypervisorConfig, VcpuThreadIds};
use kata_types::capabilities::{Capabilities, CapabilityBits};
const VSOCK_SCHEME: &str = "vsock";
const VSOCK_AGENT_CID: u32 = 3;
const VSOCK_AGENT_PORT: u32 = 1024;
unsafe impl Send for QemuInner {}
unsafe impl Sync for QemuInner {}
pub struct QemuInner {
config: HypervisorConfig,
}
impl QemuInner {
pub fn new() -> QemuInner {
QemuInner {
config: Default::default(),
}
}
pub(crate) async fn prepare_vm(&mut self, _id: &str, _netns: Option<String>) -> Result<()> {
info!(sl!(), "Preparing QEMU VM");
Ok(())
}
pub(crate) async fn start_vm(&mut self, _timeout: i32) -> Result<()> {
info!(sl!(), "Starting QEMU VM");
let mut command = std::process::Command::new(&self.config.path);
command
.arg("-kernel")
.arg(&self.config.boot_info.kernel)
.arg("-m")
.arg(format!("{}M", &self.config.memory_info.default_memory))
.arg("-initrd")
.arg(&self.config.boot_info.initrd)
.arg("-vga")
.arg("none")
.arg("-nodefaults")
.arg("-nographic");
command.spawn()?;
Ok(())
}
pub(crate) fn stop_vm(&mut self) -> Result<()> {
info!(sl!(), "Stopping QEMU VM");
todo!()
}
pub(crate) fn pause_vm(&self) -> Result<()> {
info!(sl!(), "Pausing QEMU VM");
todo!()
}
pub(crate) fn resume_vm(&self) -> Result<()> {
info!(sl!(), "Resuming QEMU VM");
todo!()
}
pub(crate) async fn save_vm(&self) -> Result<()> {
todo!()
}
/// TODO: using a single hardcoded CID is clearly not adequate in the long
/// run. Use the recently added VsockConfig infrastructure to fix this.
pub(crate) async fn get_agent_socket(&self) -> Result<String> {
info!(sl!(), "QemuInner::get_agent_socket()");
Ok(format!(
"{}://{}:{}",
VSOCK_SCHEME, VSOCK_AGENT_CID, VSOCK_AGENT_PORT
))
}
pub(crate) async fn disconnect(&mut self) {
info!(sl!(), "QemuInner::disconnect()");
todo!()
}
pub(crate) async fn get_thread_ids(&self) -> Result<VcpuThreadIds> {
info!(sl!(), "QemuInner::get_thread_ids()");
todo!()
}
pub(crate) async fn cleanup(&self) -> Result<()> {
info!(sl!(), "QemuInner::cleanup()");
todo!()
}
pub(crate) async fn get_pids(&self) -> Result<Vec<u32>> {
info!(sl!(), "QemuInner::get_pids()");
todo!()
}
pub(crate) async fn check(&self) -> Result<()> {
todo!()
}
pub(crate) async fn get_jailer_root(&self) -> Result<String> {
todo!()
}
pub(crate) async fn capabilities(&self) -> Result<Capabilities> {
let mut caps = Capabilities::default();
caps.set(CapabilityBits::FsSharingSupport);
Ok(caps)
}
pub fn set_hypervisor_config(&mut self, config: HypervisorConfig) {
self.config = config;
}
pub fn hypervisor_config(&self) -> HypervisorConfig {
info!(sl!(), "QemuInner::hypervisor_config()");
self.config.clone()
}
}
use crate::device::Device;
// device manager part of Hypervisor
impl QemuInner {
pub(crate) async fn add_device(&mut self, device: Device) -> Result<()> {
info!(sl!(), "QemuInner::add_device() {}", device);
todo!()
}
pub(crate) async fn remove_device(&mut self, device: Device) -> Result<()> {
info!(sl!(), "QemuInner::remove_device() {} ", device);
todo!()
}
}


@@ -0,0 +1,134 @@
// Copyright (c) 2022 Red Hat
//
// SPDX-License-Identifier: Apache-2.0
//
mod inner;
use crate::device::Device;
use crate::hypervisor_persist::HypervisorState;
use crate::Hypervisor;
use crate::{HypervisorConfig, VcpuThreadIds};
use inner::QemuInner;
use kata_types::capabilities::Capabilities;
use anyhow::Result;
use async_trait::async_trait;
use std::sync::Arc;
use tokio::sync::RwLock;
pub struct Qemu {
inner: Arc<RwLock<QemuInner>>,
}
impl Default for Qemu {
fn default() -> Self {
Self::new()
}
}
impl Qemu {
pub fn new() -> Self {
Self {
inner: Arc::new(RwLock::new(QemuInner::new())),
}
}
pub async fn set_hypervisor_config(&mut self, config: HypervisorConfig) {
let mut inner = self.inner.write().await;
inner.set_hypervisor_config(config)
}
}
#[async_trait]
impl Hypervisor for Qemu {
async fn prepare_vm(&self, id: &str, netns: Option<String>) -> Result<()> {
let mut inner = self.inner.write().await;
inner.prepare_vm(id, netns).await
}
async fn start_vm(&self, timeout: i32) -> Result<()> {
let mut inner = self.inner.write().await;
inner.start_vm(timeout).await
}
async fn stop_vm(&self) -> Result<()> {
let mut inner = self.inner.write().await;
inner.stop_vm()
}
async fn pause_vm(&self) -> Result<()> {
let inner = self.inner.read().await;
inner.pause_vm()
}
async fn resume_vm(&self) -> Result<()> {
let inner = self.inner.read().await;
inner.resume_vm()
}
async fn save_vm(&self) -> Result<()> {
let inner = self.inner.read().await;
inner.save_vm().await
}
async fn add_device(&self, device: Device) -> Result<()> {
let mut inner = self.inner.write().await;
inner.add_device(device).await
}
async fn remove_device(&self, device: Device) -> Result<()> {
let mut inner = self.inner.write().await;
inner.remove_device(device).await
}
async fn get_agent_socket(&self) -> Result<String> {
let inner = self.inner.read().await;
inner.get_agent_socket().await
}
async fn disconnect(&self) {
let mut inner = self.inner.write().await;
inner.disconnect().await
}
async fn hypervisor_config(&self) -> HypervisorConfig {
let inner = self.inner.read().await;
inner.hypervisor_config()
}
async fn get_thread_ids(&self) -> Result<VcpuThreadIds> {
let inner = self.inner.read().await;
inner.get_thread_ids().await
}
async fn cleanup(&self) -> Result<()> {
let inner = self.inner.read().await;
inner.cleanup().await
}
async fn get_pids(&self) -> Result<Vec<u32>> {
let inner = self.inner.read().await;
inner.get_pids().await
}
async fn check(&self) -> Result<()> {
let inner = self.inner.read().await;
inner.check().await
}
async fn get_jailer_root(&self) -> Result<String> {
let inner = self.inner.read().await;
inner.get_jailer_root().await
}
async fn save_state(&self) -> Result<HypervisorState> {
todo!()
}
async fn capabilities(&self) -> Result<Capabilities> {
let inner = self.inner.read().await;
inner.capabilities().await
}
}


@@ -11,14 +11,15 @@ use share_virtio_fs_inline::ShareVirtioFsInline;
mod share_virtio_fs_standalone; mod share_virtio_fs_standalone;
use share_virtio_fs_standalone::ShareVirtioFsStandalone; use share_virtio_fs_standalone::ShareVirtioFsStandalone;
mod utils; mod utils;
use tokio::sync::Mutex;
pub use utils::{do_get_guest_path, do_get_guest_share_path, get_host_rw_shared_path}; pub use utils::{do_get_guest_path, do_get_guest_share_path, get_host_rw_shared_path};
mod virtio_fs_share_mount; mod virtio_fs_share_mount;
use virtio_fs_share_mount::VirtiofsShareMount; use virtio_fs_share_mount::VirtiofsShareMount;
use std::sync::Arc; use std::{collections::HashMap, fmt::Debug, path::PathBuf, sync::Arc};
use agent::Storage; use agent::Storage;
use anyhow::{anyhow, Context, Result}; use anyhow::{anyhow, Context, Ok, Result};
use async_trait::async_trait; use async_trait::async_trait;
use hypervisor::Hypervisor; use hypervisor::Hypervisor;
use kata_types::config::hypervisor::SharedFsInfo; use kata_types::config::hypervisor::SharedFsInfo;
@@ -43,8 +44,10 @@ pub trait ShareFs: Send + Sync {
async fn setup_device_before_start_vm(&self, h: &dyn Hypervisor) -> Result<()>; async fn setup_device_before_start_vm(&self, h: &dyn Hypervisor) -> Result<()>;
async fn setup_device_after_start_vm(&self, h: &dyn Hypervisor) -> Result<()>; async fn setup_device_after_start_vm(&self, h: &dyn Hypervisor) -> Result<()>;
async fn get_storages(&self) -> Result<Vec<Storage>>; async fn get_storages(&self) -> Result<Vec<Storage>>;
fn mounted_info_set(&self) -> Arc<Mutex<HashMap<String, MountedInfo>>>;
} }
#[derive(Debug)]
pub struct ShareFsRootfsConfig { pub struct ShareFsRootfsConfig {
// TODO: for nydus v5/v6 need to update ShareFsMount // TODO: for nydus v5/v6 need to update ShareFsMount
pub cid: String, pub cid: String,
@@ -54,6 +57,7 @@ pub struct ShareFsRootfsConfig {
pub is_rafs: bool, pub is_rafs: bool,
} }
#[derive(Debug)]
pub struct ShareFsVolumeConfig { pub struct ShareFsVolumeConfig {
pub cid: String, pub cid: String,
pub source: String, pub source: String,
@@ -69,10 +73,61 @@ pub struct ShareFsMountResult {
pub storages: Vec<agent::Storage>, pub storages: Vec<agent::Storage>,
} }
/// Save mounted info for sandbox-level shared files.
#[derive(Clone, Debug)]
pub struct MountedInfo {
// Guest path
pub guest_path: PathBuf,
// Ref count of containers that uses this volume with read only permission
pub ro_ref_count: usize,
// Ref count of containers that uses this volume with read write permission
pub rw_ref_count: usize,
}
impl MountedInfo {
pub fn new(guest_path: PathBuf, readonly: bool) -> Self {
Self {
guest_path,
ro_ref_count: if readonly { 1 } else { 0 },
rw_ref_count: if readonly { 0 } else { 1 },
}
}
/// Check if the mount has read only permission
pub fn readonly(&self) -> bool {
self.rw_ref_count == 0
}
/// Ref count for all permissions
pub fn ref_count(&self) -> usize {
self.ro_ref_count + self.rw_ref_count
}
// File/dir name in the form of "sandbox-<uuid>-<file/dir name>"
pub fn file_name(&self) -> Result<String> {
match self.guest_path.file_name() {
Some(file_name) => match file_name.to_str() {
Some(file_name) => Ok(file_name.to_owned()),
None => Err(anyhow!("failed to get string from {:?}", file_name)),
},
None => Err(anyhow!(
"failed to get file name from the guest_path {:?}",
self.guest_path
)),
}
}
}
#[async_trait] #[async_trait]
pub trait ShareFsMount: Send + Sync { pub trait ShareFsMount: Send + Sync {
async fn share_rootfs(&self, config: ShareFsRootfsConfig) -> Result<ShareFsMountResult>; async fn share_rootfs(&self, config: ShareFsRootfsConfig) -> Result<ShareFsMountResult>;
async fn share_volume(&self, config: ShareFsVolumeConfig) -> Result<ShareFsMountResult>; async fn share_volume(&self, config: ShareFsVolumeConfig) -> Result<ShareFsMountResult>;
/// Upgrade to readwrite permission
async fn upgrade_to_rw(&self, file_name: &str) -> Result<()>;
/// Downgrade to readonly permission
async fn downgrade_to_ro(&self, file_name: &str) -> Result<()>;
/// Umount the volume
async fn umount(&self, file_name: &str) -> Result<()>;
} }
pub fn new(id: &str, config: &SharedFsInfo) -> Result<Arc<dyn ShareFs>> { pub fn new(id: &str, config: &SharedFsInfo) -> Result<Arc<dyn ShareFs>> {

View File

@@ -4,11 +4,14 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// //
use std::collections::HashMap;
use agent::Storage; use agent::Storage;
use anyhow::{Context, Result}; use anyhow::{Context, Result};
use async_trait::async_trait; use async_trait::async_trait;
use hypervisor::Hypervisor; use hypervisor::Hypervisor;
use kata_types::config::hypervisor::SharedFsInfo; use kata_types::config::hypervisor::SharedFsInfo;
use tokio::sync::Mutex;
use super::{ use super::{
share_virtio_fs::{ share_virtio_fs::{
@@ -30,6 +33,7 @@ pub struct ShareVirtioFsInlineConfig {
pub struct ShareVirtioFsInline { pub struct ShareVirtioFsInline {
config: ShareVirtioFsInlineConfig, config: ShareVirtioFsInlineConfig,
share_fs_mount: Arc<dyn ShareFsMount>, share_fs_mount: Arc<dyn ShareFsMount>,
mounted_info_set: Arc<Mutex<HashMap<String, MountedInfo>>>,
} }
impl ShareVirtioFsInline { impl ShareVirtioFsInline {
@@ -37,6 +41,7 @@ impl ShareVirtioFsInline {
Ok(Self { Ok(Self {
config: ShareVirtioFsInlineConfig { id: id.to_string() }, config: ShareVirtioFsInlineConfig { id: id.to_string() },
share_fs_mount: Arc::new(VirtiofsShareMount::new(id)), share_fs_mount: Arc::new(VirtiofsShareMount::new(id)),
mounted_info_set: Arc::new(Mutex::new(HashMap::new())),
}) })
} }
} }
@@ -77,4 +82,8 @@ impl ShareFs for ShareVirtioFsInline {
storages.push(shared_volume); storages.push(shared_volume);
Ok(storages) Ok(storages)
} }
fn mounted_info_set(&self) -> Arc<Mutex<HashMap<String, MountedInfo>>> {
self.mounted_info_set.clone()
}
} }


@@ -4,8 +4,12 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// //
use std::{process::Stdio, sync::Arc}; use std::{collections::HashMap, process::Stdio, sync::Arc};
use crate::share_fs::share_virtio_fs::{
prepare_virtiofs, FS_TYPE_VIRTIO_FS, KATA_VIRTIO_FS_DEV_TYPE, MOUNT_GUEST_TAG,
};
use crate::share_fs::{KATA_GUEST_SHARE_DIR, VIRTIO_FS};
use agent::Storage; use agent::Storage;
use anyhow::{anyhow, Context, Result}; use anyhow::{anyhow, Context, Result};
use async_trait::async_trait; use async_trait::async_trait;
@@ -16,19 +20,18 @@ use tokio::{
process::{Child, Command}, process::{Child, Command},
sync::{ sync::{
mpsc::{channel, Receiver, Sender}, mpsc::{channel, Receiver, Sender},
RwLock, Mutex, RwLock,
}, },
}; };
use super::{ use super::{
share_virtio_fs::generate_sock_path, utils::ensure_dir_exist, utils::get_host_ro_shared_path, share_virtio_fs::generate_sock_path, utils::ensure_dir_exist, utils::get_host_ro_shared_path,
virtio_fs_share_mount::VirtiofsShareMount, ShareFs, ShareFsMount, virtio_fs_share_mount::VirtiofsShareMount, MountedInfo, ShareFs, ShareFsMount,
}; };
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct ShareVirtioFsStandaloneConfig { pub struct ShareVirtioFsStandaloneConfig {
id: String, id: String,
jail_root: String,
// virtio_fs_daemon is the virtio-fs vhost-user daemon path // virtio_fs_daemon is the virtio-fs vhost-user daemon path
pub virtio_fs_daemon: String, pub virtio_fs_daemon: String,
@@ -38,14 +41,16 @@ pub struct ShareVirtioFsStandaloneConfig {
pub virtio_fs_extra_args: Vec<String>, pub virtio_fs_extra_args: Vec<String>,
} }
#[derive(Default)] #[derive(Default, Debug)]
struct ShareVirtioFsStandaloneInner { struct ShareVirtioFsStandaloneInner {
pid: Option<u32>, pid: Option<u32>,
} }
pub(crate) struct ShareVirtioFsStandalone { pub(crate) struct ShareVirtioFsStandalone {
inner: Arc<RwLock<ShareVirtioFsStandaloneInner>>, inner: Arc<RwLock<ShareVirtioFsStandaloneInner>>,
config: ShareVirtioFsStandaloneConfig, config: ShareVirtioFsStandaloneConfig,
share_fs_mount: Arc<dyn ShareFsMount>, share_fs_mount: Arc<dyn ShareFsMount>,
mounted_info_set: Arc<Mutex<HashMap<String, MountedInfo>>>,
} }
impl ShareVirtioFsStandalone { impl ShareVirtioFsStandalone {
@@ -54,26 +59,33 @@ impl ShareVirtioFsStandalone {
inner: Arc::new(RwLock::new(ShareVirtioFsStandaloneInner::default())), inner: Arc::new(RwLock::new(ShareVirtioFsStandaloneInner::default())),
config: ShareVirtioFsStandaloneConfig { config: ShareVirtioFsStandaloneConfig {
id: id.to_string(), id: id.to_string(),
jail_root: "".to_string(),
virtio_fs_daemon: config.virtio_fs_daemon.clone(), virtio_fs_daemon: config.virtio_fs_daemon.clone(),
virtio_fs_cache: config.virtio_fs_cache.clone(), virtio_fs_cache: config.virtio_fs_cache.clone(),
virtio_fs_extra_args: config.virtio_fs_extra_args.clone(), virtio_fs_extra_args: config.virtio_fs_extra_args.clone(),
}, },
share_fs_mount: Arc::new(VirtiofsShareMount::new(id)), share_fs_mount: Arc::new(VirtiofsShareMount::new(id)),
mounted_info_set: Arc::new(Mutex::new(HashMap::new())),
}) })
} }
fn virtiofsd_args(&self, sock_path: &str) -> Result<Vec<String>> { fn virtiofsd_args(&self, sock_path: &str) -> Result<Vec<String>> {
let source_path = get_host_ro_shared_path(&self.config.id); let source_path = get_host_ro_shared_path(&self.config.id);
ensure_dir_exist(&source_path)?; ensure_dir_exist(&source_path)?;
let shared_dir = source_path
.to_str()
.ok_or_else(|| anyhow!("convert source path {:?} to str failed", source_path))?;
let mut args: Vec<String> = vec![ let mut args: Vec<String> = vec![
String::from("-f"), String::from("--socket-path"),
format!("--socket-path={}", sock_path), String::from(sock_path),
String::from("-o"), String::from("--shared-dir"),
format!("source={}", source_path.to_str().unwrap()), String::from(shared_dir),
String::from("-o"), String::from("--cache"),
format!("cache={}", self.config.virtio_fs_cache), self.config.virtio_fs_cache.clone(),
String::from("--sandbox"),
String::from("none"),
String::from("--seccomp"),
String::from("none"),
]; ];
if !self.config.virtio_fs_extra_args.is_empty() { if !self.config.virtio_fs_extra_args.is_empty() {
@@ -84,8 +96,8 @@ impl ShareVirtioFsStandalone {
Ok(args) Ok(args)
} }
async fn setup_virtiofsd(&self) -> Result<()> { async fn setup_virtiofsd(&self, h: &dyn Hypervisor) -> Result<()> {
let sock_path = generate_sock_path(&self.config.jail_root); let sock_path = generate_sock_path(&h.get_jailer_root().await?);
let args = self.virtiofsd_args(&sock_path).context("virtiofsd args")?; let args = self.virtiofsd_args(&sock_path).context("virtiofsd args")?;
let mut cmd = Command::new(&self.config.virtio_fs_daemon); let mut cmd = Command::new(&self.config.virtio_fs_daemon);
@@ -160,8 +172,11 @@ impl ShareFs for ShareVirtioFsStandalone {
self.share_fs_mount.clone() self.share_fs_mount.clone()
} }
async fn setup_device_before_start_vm(&self, _h: &dyn Hypervisor) -> Result<()> { async fn setup_device_before_start_vm(&self, h: &dyn Hypervisor) -> Result<()> {
self.setup_virtiofsd().await.context("setup virtiofsd")?; prepare_virtiofs(h, VIRTIO_FS, &self.config.id, &h.get_jailer_root().await?)
.await
.context("prepare virtiofs")?;
self.setup_virtiofsd(h).await.context("setup virtiofsd")?;
Ok(()) Ok(())
} }
@@ -170,6 +185,23 @@ impl ShareFs for ShareVirtioFsStandalone {
} }
async fn get_storages(&self) -> Result<Vec<Storage>> { async fn get_storages(&self) -> Result<Vec<Storage>> {
Ok(vec![]) let mut storages: Vec<Storage> = Vec::new();
let shared_volume: Storage = Storage {
driver: String::from(KATA_VIRTIO_FS_DEV_TYPE),
driver_options: Vec::new(),
source: String::from(MOUNT_GUEST_TAG),
fs_type: String::from(FS_TYPE_VIRTIO_FS),
fs_group: None,
options: vec![String::from("nodev")],
mount_point: String::from(KATA_GUEST_SHARE_DIR),
};
storages.push(shared_volume);
Ok(storages)
}
fn mounted_info_set(&self) -> Arc<Mutex<HashMap<String, MountedInfo>>> {
self.mounted_info_set.clone()
} }
} }


@@ -18,6 +18,7 @@ pub(crate) fn ensure_dir_exist(path: &Path) -> Result<()> {
Ok(()) Ok(())
} }
/// Bind mount the original path to the runtime directory.
pub(crate) fn share_to_guest( pub(crate) fn share_to_guest(
// absolute path for source // absolute path for source
source: &str, source: &str,
@@ -37,7 +38,7 @@ pub(crate) fn share_to_guest(
// to remount the read only dir mount point directly. // to remount the read only dir mount point directly.
if readonly { if readonly {
let dst = do_get_host_path(target, sid, cid, is_volume, true); let dst = do_get_host_path(target, sid, cid, is_volume, true);
mount::bind_remount_read_only(&dst).context("bind remount readonly")?; mount::bind_remount(&dst, readonly).context("bind remount readonly")?;
} }
Ok(do_get_guest_path(target, cid, is_volume, is_rafs)) Ok(do_get_guest_path(target, cid, is_volume, is_rafs))


@@ -7,6 +7,7 @@
use agent::Storage; use agent::Storage;
use anyhow::{anyhow, Context, Result}; use anyhow::{anyhow, Context, Result};
use async_trait::async_trait; use async_trait::async_trait;
use kata_sys_util::mount::{bind_remount, umount_timeout};
use kata_types::k8s::is_watchable_mount; use kata_types::k8s::is_watchable_mount;
use kata_types::mount; use kata_types::mount;
use nix::sys::stat::stat; use nix::sys::stat::stat;
@@ -19,10 +20,12 @@ const WATCHABLE_BIND_DEV_TYPE: &str = "watchable-bind";
const EPHEMERAL_PATH: &str = "/run/kata-containers/sandbox/ephemeral"; const EPHEMERAL_PATH: &str = "/run/kata-containers/sandbox/ephemeral";
use super::{ use super::{
utils, ShareFsMount, ShareFsMountResult, ShareFsRootfsConfig, ShareFsVolumeConfig, utils::{self, do_get_host_path},
ShareFsMount, ShareFsMountResult, ShareFsRootfsConfig, ShareFsVolumeConfig,
KATA_GUEST_SHARE_DIR, PASSTHROUGH_FS_DIR, KATA_GUEST_SHARE_DIR, PASSTHROUGH_FS_DIR,
}; };
#[derive(Debug)]
pub struct VirtiofsShareMount { pub struct VirtiofsShareMount {
id: String, id: String,
} }
@@ -166,4 +169,35 @@ impl ShareFsMount for VirtiofsShareMount {
storages: vec![], storages: vec![],
}) })
} }
async fn upgrade_to_rw(&self, file_name: &str) -> Result<()> {
// Remount readonly directory with readwrite permission
let host_dest = do_get_host_path(file_name, &self.id, "", true, true);
bind_remount(&host_dest, false)
.context("remount readonly directory with readwrite permission")?;
// Remount readwrite directory with readwrite permission
let host_dest = do_get_host_path(file_name, &self.id, "", true, false);
bind_remount(&host_dest, false)
.context("remount readwrite directory with readwrite permission")?;
Ok(())
}
async fn downgrade_to_ro(&self, file_name: &str) -> Result<()> {
// Remount readwrite directory with readonly permission
let host_dest = do_get_host_path(file_name, &self.id, "", true, false);
bind_remount(&host_dest, true)
.context("remount readwrite directory with readonly permission")?;
// Remount readonly directory with readonly permission
let host_dest = do_get_host_path(file_name, &self.id, "", true, true);
bind_remount(&host_dest, true)
.context("remount readonly directory with readonly permission")?;
Ok(())
}
async fn umount(&self, file_name: &str) -> Result<()> {
let host_dest = do_get_host_path(file_name, &self.id, "", true, true);
umount_timeout(&host_dest, 0).context("Umount readwrite host dest")?;
// Umount event will be propagated to ro directory
Ok(())
}
} }


@@ -5,9 +5,11 @@
// //
use anyhow::Result; use anyhow::Result;
use async_trait::async_trait;
use super::Volume; use super::Volume;
#[derive(Debug)]
pub(crate) struct BlockVolume {} pub(crate) struct BlockVolume {}
/// BlockVolume: block device volume /// BlockVolume: block device volume
@@ -17,6 +19,7 @@ impl BlockVolume {
} }
} }
#[async_trait]
impl Volume for BlockVolume { impl Volume for BlockVolume {
fn get_volume_mount(&self) -> anyhow::Result<Vec<oci::Mount>> { fn get_volume_mount(&self) -> anyhow::Result<Vec<oci::Mount>> {
todo!() todo!()
@@ -26,8 +29,9 @@ impl Volume for BlockVolume {
todo!() todo!()
} }
fn cleanup(&self) -> Result<()> { async fn cleanup(&self) -> Result<()> {
todo!() warn!(sl!(), "Cleaning up BlockVolume is still unimplemented.");
Ok(())
} }
} }


@@ -5,9 +5,11 @@
// //
use anyhow::Result; use anyhow::Result;
use async_trait::async_trait;
use super::Volume; use super::Volume;
#[derive(Debug)]
pub(crate) struct DefaultVolume { pub(crate) struct DefaultVolume {
mount: oci::Mount, mount: oci::Mount,
} }
@@ -21,6 +23,7 @@ impl DefaultVolume {
} }
} }
#[async_trait]
impl Volume for DefaultVolume { impl Volume for DefaultVolume {
fn get_volume_mount(&self) -> anyhow::Result<Vec<oci::Mount>> { fn get_volume_mount(&self) -> anyhow::Result<Vec<oci::Mount>> {
Ok(vec![self.mount.clone()]) Ok(vec![self.mount.clone()])
@@ -30,7 +33,8 @@ impl Volume for DefaultVolume {
Ok(vec![]) Ok(vec![])
} }
fn cleanup(&self) -> Result<()> { async fn cleanup(&self) -> Result<()> {
todo!() warn!(sl!(), "Cleaning up DefaultVolume is still unimplemented.");
Ok(())
} }
} }


@@ -8,6 +8,7 @@ mod block_volume;
mod default_volume; mod default_volume;
mod share_fs_volume; mod share_fs_volume;
mod shm_volume; mod shm_volume;
use async_trait::async_trait;
use std::{sync::Arc, vec::Vec}; use std::{sync::Arc, vec::Vec};
@@ -16,10 +17,11 @@ use tokio::sync::RwLock;
use crate::share_fs::ShareFs; use crate::share_fs::ShareFs;
#[async_trait]
pub trait Volume: Send + Sync { pub trait Volume: Send + Sync {
fn get_volume_mount(&self) -> Result<Vec<oci::Mount>>; fn get_volume_mount(&self) -> Result<Vec<oci::Mount>>;
fn get_storage(&self) -> Result<Vec<agent::Storage>>; fn get_storage(&self) -> Result<Vec<agent::Storage>>;
fn cleanup(&self) -> Result<()>; async fn cleanup(&self) -> Result<()>;
} }
#[derive(Default)] #[derive(Default)]


@@ -4,12 +4,17 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// //
use std::{path::Path, sync::Arc}; use std::{
path::{Path, PathBuf},
str::FromStr,
sync::{Arc, Weak},
};
use anyhow::{anyhow, Context, Result}; use anyhow::{anyhow, Context, Result};
use async_trait::async_trait;
use super::Volume; use super::Volume;
use crate::share_fs::{ShareFs, ShareFsVolumeConfig}; use crate::share_fs::{MountedInfo, ShareFs, ShareFsVolumeConfig};
use kata_types::mount; use kata_types::mount;
// copy file to container's rootfs if filesystem sharing is not supported, otherwise // copy file to container's rootfs if filesystem sharing is not supported, otherwise
@@ -19,6 +24,7 @@ use kata_types::mount;
// device nodes to the guest. // device nodes to the guest.
// skip the volumes whose source had already set to guest share dir. // skip the volumes whose source had already set to guest share dir.
pub(crate) struct ShareFsVolume { pub(crate) struct ShareFsVolume {
share_fs: Option<Weak<dyn ShareFs>>,
mounts: Vec<oci::Mount>, mounts: Vec<oci::Mount>,
storages: Vec<agent::Storage>, storages: Vec<agent::Storage>,
} }
@@ -29,10 +35,12 @@ impl ShareFsVolume {
m: &oci::Mount, m: &oci::Mount,
cid: &str, cid: &str,
) -> Result<Self> { ) -> Result<Self> {
// The file_name is in the format of "sandbox-{uuid}-{file_name}"
let file_name = Path::new(&m.source).file_name().unwrap().to_str().unwrap(); let file_name = Path::new(&m.source).file_name().unwrap().to_str().unwrap();
let file_name = generate_mount_path(cid, file_name); let file_name = generate_mount_path("sandbox", file_name);
let mut volume = Self { let mut volume = Self {
share_fs: share_fs.as_ref().map(Arc::downgrade),
mounts: vec![], mounts: vec![],
storages: vec![], storages: vec![],
}; };
@@ -59,36 +67,87 @@ impl ShareFsVolume {
} }
} }
Some(share_fs) => { Some(share_fs) => {
let readonly = m.options.iter().any(|opt| opt == "ro");
let share_fs_mount = share_fs.get_share_fs_mount(); let share_fs_mount = share_fs.get_share_fs_mount();
let mount_result = share_fs_mount let mounted_info_set = share_fs.mounted_info_set();
.share_volume(ShareFsVolumeConfig { let mut mounted_info_set = mounted_info_set.lock().await;
cid: cid.to_string(), if let Some(mut mounted_info) = mounted_info_set.get(&m.source).cloned() {
source: m.source.clone(), // Mounted at least once
target: file_name, let guest_path = mounted_info
readonly: m.options.iter().any(|o| *o == "ro"), .guest_path
mount_options: m.options.clone(), .clone()
mount: m.clone(), .as_os_str()
is_rafs: false, .to_str()
.unwrap()
.to_owned();
if !readonly && mounted_info.readonly() {
// The current mount should be upgraded to readwrite permission
info!(
sl!(),
"The mount will be upgraded, mount = {:?}, cid = {}", m, cid
);
share_fs_mount
.upgrade_to_rw(
&mounted_info
.file_name()
.context("get name of mounted info")?,
)
.await
.context("upgrade mount")?;
}
if readonly {
mounted_info.ro_ref_count += 1;
} else {
mounted_info.rw_ref_count += 1;
}
mounted_info_set.insert(m.source.clone(), mounted_info);
volume.mounts.push(oci::Mount {
destination: m.destination.clone(),
r#type: "bind".to_string(),
source: guest_path,
options: m.options.clone(),
}) })
.await } else {
.context("share fs volume")?; // Not mounted ever
let mount_result = share_fs_mount
.share_volume(ShareFsVolumeConfig {
// The scope of shared volume is sandbox
cid: String::from(""),
source: m.source.clone(),
target: file_name.clone(),
readonly,
mount_options: m.options.clone(),
mount: m.clone(),
is_rafs: false,
})
.await
.context("mount shared volume")?;
let mounted_info = MountedInfo::new(
PathBuf::from_str(&mount_result.guest_path)
.context("convert guest path")?,
readonly,
);
mounted_info_set.insert(m.source.clone(), mounted_info);
// set storages for the volume
volume.storages = mount_result.storages;
// set storages for the volume // set mount for the volume
volume.storages = mount_result.storages; volume.mounts.push(oci::Mount {
destination: m.destination.clone(),
// set mount for the volume r#type: "bind".to_string(),
volume.mounts.push(oci::Mount { source: mount_result.guest_path,
destination: m.destination.clone(), options: m.options.clone(),
r#type: "bind".to_string(), });
source: mount_result.guest_path, }
options: m.options.clone(),
});
} }
} }
Ok(volume) Ok(volume)
} }
} }
#[async_trait]
impl Volume for ShareFsVolume { impl Volume for ShareFsVolume {
fn get_volume_mount(&self) -> anyhow::Result<Vec<oci::Mount>> { fn get_volume_mount(&self) -> anyhow::Result<Vec<oci::Mount>> {
Ok(self.mounts.clone()) Ok(self.mounts.clone())
@@ -98,8 +157,75 @@ impl Volume for ShareFsVolume {
Ok(self.storages.clone()) Ok(self.storages.clone())
} }
fn cleanup(&self) -> Result<()> { async fn cleanup(&self) -> Result<()> {
todo!() if self.share_fs.is_none() {
return Ok(());
}
let share_fs = match self.share_fs.as_ref().unwrap().upgrade() {
Some(share_fs) => share_fs,
None => return Err(anyhow!("The share_fs was released unexpectedly")),
};
let mounted_info_set = share_fs.mounted_info_set();
let mut mounted_info_set = mounted_info_set.lock().await;
for m in self.mounts.iter() {
let (host_source, mut mounted_info) = match mounted_info_set
.iter()
.find(|entry| entry.1.guest_path.as_os_str().to_str().unwrap() == m.source)
.map(|entry| (entry.0.to_owned(), entry.1.clone()))
{
Some(entry) => entry,
None => {
warn!(
sl!(),
"The mounted info for guest path {} not found", m.source
);
continue;
}
};
let old_readonly = mounted_info.readonly();
if m.options.iter().any(|opt| *opt == "ro") {
mounted_info.ro_ref_count -= 1;
} else {
mounted_info.rw_ref_count -= 1;
}
debug!(
sl!(),
"Ref count for {} was updated to {} due to volume cleanup",
host_source,
mounted_info.ref_count()
);
let share_fs_mount = share_fs.get_share_fs_mount();
let file_name = mounted_info.file_name()?;
if mounted_info.ref_count() > 0 {
// Downgrade to readonly if no container needs readwrite permission
if !old_readonly && mounted_info.readonly() {
info!(sl!(), "Downgrade {} to readonly due to no container that needs readwrite permission", host_source);
share_fs_mount
.downgrade_to_ro(&file_name)
.await
.context("Downgrade volume")?;
}
mounted_info_set.insert(host_source.clone(), mounted_info);
} else {
info!(
sl!(),
"The path will be umounted due to no references, host_source = {}", host_source
);
mounted_info_set.remove(&host_source);
// Umount the volume
share_fs_mount
.umount(&file_name)
.await
.context("Umount volume")?
}
}
Ok(())
} }
} }

View File

@@ -7,6 +7,7 @@
use std::path::Path; use std::path::Path;
use anyhow::Result; use anyhow::Result;
use async_trait::async_trait;
use super::Volume; use super::Volume;
use crate::share_fs::DEFAULT_KATA_GUEST_SANDBOX_DIR; use crate::share_fs::DEFAULT_KATA_GUEST_SANDBOX_DIR;
@@ -19,6 +20,7 @@ pub const DEFAULT_SHM_SIZE: u64 = 65536 * 1024;
// KATA_EPHEMERAL_DEV_TYPE creates a tmpfs backed volume for sharing files between containers. // KATA_EPHEMERAL_DEV_TYPE creates a tmpfs backed volume for sharing files between containers.
pub const KATA_EPHEMERAL_DEV_TYPE: &str = "ephemeral"; pub const KATA_EPHEMERAL_DEV_TYPE: &str = "ephemeral";
#[derive(Debug)]
pub(crate) struct ShmVolume { pub(crate) struct ShmVolume {
mount: oci::Mount, mount: oci::Mount,
storage: Option<agent::Storage>, storage: Option<agent::Storage>,
@@ -82,6 +84,7 @@ impl ShmVolume {
} }
} }
#[async_trait]
impl Volume for ShmVolume { impl Volume for ShmVolume {
fn get_volume_mount(&self) -> anyhow::Result<Vec<oci::Mount>> { fn get_volume_mount(&self) -> anyhow::Result<Vec<oci::Mount>> {
Ok(vec![self.mount.clone()]) Ok(vec![self.mount.clone()])
@@ -96,8 +99,9 @@ impl Volume for ShmVolume {
Ok(s) Ok(s)
} }
fn cleanup(&self) -> Result<()> { async fn cleanup(&self) -> Result<()> {
todo!() warn!(sl!(), "Cleaning up ShmVolume is still unimplemented.");
Ok(())
} }
} }

View File

@@ -4,6 +4,7 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// //
use std::collections::HashMap;
use std::sync::Arc; use std::sync::Arc;
use agent::Agent; use agent::Agent;
@@ -81,8 +82,8 @@ impl Container {
let mut inner = self.inner.write().await; let mut inner = self.inner.write().await;
let toml_config = self.resource_manager.config().await; let toml_config = self.resource_manager.config().await;
let config = &self.config; let config = &self.config;
amend_spec(&mut spec, toml_config.runtime.disable_guest_seccomp).context("amend spec")?;
let sandbox_pidns = is_pid_namespace_enabled(&spec); let sandbox_pidns = is_pid_namespace_enabled(&spec);
amend_spec(&mut spec, toml_config.runtime.disable_guest_seccomp).context("amend spec")?;
// handler rootfs // handler rootfs
let rootfs = self let rootfs = self
@@ -143,13 +144,10 @@ impl Container {
// create container // create container
let r = agent::CreateContainerRequest { let r = agent::CreateContainerRequest {
process_id: agent::ContainerProcessID::new(&config.container_id, ""), process_id: agent::ContainerProcessID::new(&config.container_id, ""),
string_user: None,
devices: vec![],
storages, storages,
oci: Some(spec), oci: Some(spec),
guest_hooks: None,
sandbox_pidns, sandbox_pidns,
rootfs_mounts: vec![], ..Default::default()
}; };
self.agent self.agent
@@ -258,7 +256,7 @@ impl Container {
signal: u32, signal: u32,
all: bool, all: bool,
) -> Result<()> { ) -> Result<()> {
let inner = self.inner.read().await; let mut inner = self.inner.write().await;
inner.signal_process(container_process, signal, all).await inner.signal_process(container_process, signal, all).await
} }
@@ -396,6 +394,7 @@ fn amend_spec(spec: &mut oci::Spec, disable_guest_seccomp: bool) -> Result<()> {
resource.block_io = None; resource.block_io = None;
resource.hugepage_limits = Vec::new(); resource.hugepage_limits = Vec::new();
resource.network = None; resource.network = None;
resource.rdma = HashMap::new();
} }
// Host pidns path does not make sense in kata. Let's just align it with // Host pidns path does not make sense in kata. Let's just align it with
@@ -404,7 +403,10 @@ fn amend_spec(spec: &mut oci::Spec, disable_guest_seccomp: bool) -> Result<()> {
for n in linux.namespaces.iter() { for n in linux.namespaces.iter() {
match n.r#type.as_str() { match n.r#type.as_str() {
oci::PIDNAMESPACE | oci::NETWORKNAMESPACE => continue, oci::PIDNAMESPACE | oci::NETWORKNAMESPACE => continue,
_ => ns.push(n.clone()), _ => ns.push(oci::LinuxNamespace {
r#type: n.r#type.clone(),
path: "".to_string(),
}),
} }
} }

View File

@@ -233,7 +233,7 @@ impl ContainerInner {
} }
pub(crate) async fn signal_process( pub(crate) async fn signal_process(
&self, &mut self,
process: &ContainerProcess, process: &ContainerProcess,
signal: u32, signal: u32,
all: bool, all: bool,
@@ -247,6 +247,9 @@ impl ContainerInner {
self.agent self.agent
.signal_process(agent::SignalProcessRequest { process_id, signal }) .signal_process(agent::SignalProcessRequest { process_id, signal })
.await?; .await?;
self.clean_volumes().await.context("clean volumes")?;
Ok(()) Ok(())
} }
@@ -268,4 +271,23 @@ impl ContainerInner {
Ok(()) Ok(())
} }
async fn clean_volumes(&mut self) -> Result<()> {
let mut unhandled = Vec::new();
for v in self.volumes.iter() {
if let Err(err) = v.cleanup().await {
unhandled.push(Arc::clone(v));
warn!(
sl!(),
"Failed to clean volume {:?}, error = {:?}",
v.get_volume_mount(),
err
);
}
}
if !unhandled.is_empty() {
self.volumes = unhandled;
}
Ok(())
}
} }

View File

@@ -6,7 +6,11 @@
use std::{ use std::{
io, io,
os::unix::{io::FromRawFd, net::UnixStream as StdUnixStream}, os::unix::{
io::{FromRawFd, RawFd},
net::UnixStream as StdUnixStream,
prelude::AsRawFd,
},
pin::Pin, pin::Pin,
task::Context as TaskContext, task::Context as TaskContext,
task::Poll, task::Poll,
@@ -52,8 +56,21 @@ impl ShimIo {
"new shim io stdin {:?} stdout {:?} stderr {:?}", stdin, stdout, stderr "new shim io stdin {:?} stdout {:?} stderr {:?}", stdin, stdout, stderr
); );
let set_flag_with_blocking = |fd: RawFd| {
let flag = unsafe { libc::fcntl(fd, libc::F_GETFL) };
let ret = unsafe { libc::fcntl(fd, libc::F_SETFL, flag & !libc::O_NONBLOCK) };
if ret < 0 {
error!(sl!(), "failed to set fcntl for fd {} error {}", fd, ret);
}
};
let stdin_fd: Option<Box<dyn AsyncRead + Send + Unpin>> = if let Some(stdin) = stdin { let stdin_fd: Option<Box<dyn AsyncRead + Send + Unpin>> = if let Some(stdin) = stdin {
info!(sl!(), "open stdin {:?}", &stdin); info!(sl!(), "open stdin {:?}", &stdin);
// Since the stdin peer endpoint (which is held by containerd) may not be opened
// immediately, opening it in blocking mode could block here, which we want to avoid.
// Therefore it is opened with O_NONBLOCK and then reset to blocking mode for
// tokio async I/O.
match OpenOptions::new() match OpenOptions::new()
.read(true) .read(true)
.write(false) .write(false)
@@ -61,7 +78,11 @@ impl ShimIo {
.open(&stdin) .open(&stdin)
.await .await
{ {
Ok(file) => Some(Box::new(file)), Ok(file) => {
// Set it to blocking to avoid infinitely handling EAGAIN when the reader is empty
set_flag_with_blocking(file.as_raw_fd());
Some(Box::new(file))
}
Err(err) => { Err(err) => {
error!(sl!(), "failed to open {} error {:?}", &stdin, err); error!(sl!(), "failed to open {} error {:?}", &stdin, err);
None None

View File

@@ -133,22 +133,14 @@ impl Process {
let io_name = io_name.to_string(); let io_name = io_name.to_string();
let logger = self.logger.new(o!("io_name" => io_name)); let logger = self.logger.new(o!("io_name" => io_name));
let _ = tokio::spawn(async move { let _ = tokio::spawn(async move {
loop { match tokio::io::copy(&mut reader, &mut writer).await {
match tokio::io::copy(&mut reader, &mut writer).await { Err(e) => {
Err(e) => { warn!(logger, "run_io_copy: failed to copy stream: {}", e);
if let Some(error_code) = e.raw_os_error() { }
if error_code == libc::EAGAIN { Ok(length) => {
continue; info!(logger, "run_io_copy: stop to copy stream length {}", length)
} }
} };
warn!(logger, "run_io_copy: failed to copy stream: {}", e);
}
Ok(length) => {
warn!(logger, "run_io_copy: stop to copy stream length {}", length)
}
};
break;
}
wgw.done(); wgw.done();
}); });

View File

@@ -21,7 +21,10 @@ use anyhow::{anyhow, Context, Result};
use async_trait::async_trait; use async_trait::async_trait;
use common::{message::Message, RuntimeHandler, RuntimeInstance}; use common::{message::Message, RuntimeHandler, RuntimeInstance};
use hypervisor::{dragonball::Dragonball, Hypervisor, HYPERVISOR_DRAGONBALL}; use hypervisor::{dragonball::Dragonball, Hypervisor, HYPERVISOR_DRAGONBALL};
use kata_types::config::{hypervisor::register_hypervisor_plugin, DragonballConfig, TomlConfig}; use hypervisor::{qemu::Qemu, HYPERVISOR_QEMU};
use kata_types::config::{
hypervisor::register_hypervisor_plugin, DragonballConfig, QemuConfig, TomlConfig,
};
use resource::ResourceManager; use resource::ResourceManager;
use sandbox::VIRTCONTAINER; use sandbox::VIRTCONTAINER;
use tokio::sync::mpsc::Sender; use tokio::sync::mpsc::Sender;
@@ -36,6 +39,8 @@ impl RuntimeHandler for VirtContainer {
// register // register
let dragonball_config = Arc::new(DragonballConfig::new()); let dragonball_config = Arc::new(DragonballConfig::new());
register_hypervisor_plugin("dragonball", dragonball_config); register_hypervisor_plugin("dragonball", dragonball_config);
let qemu_config = Arc::new(QemuConfig::new());
register_hypervisor_plugin("qemu", qemu_config);
Ok(()) Ok(())
} }
@@ -106,6 +111,13 @@ async fn new_hypervisor(toml_config: &TomlConfig) -> Result<Arc<dyn Hypervisor>>
.await; .await;
Ok(Arc::new(hypervisor)) Ok(Arc::new(hypervisor))
} }
HYPERVISOR_QEMU => {
let mut hypervisor = Qemu::new();
hypervisor
.set_hypervisor_config(hypervisor_config.clone())
.await;
Ok(Arc::new(hypervisor))
}
_ => Err(anyhow!("Unsupported hypervisor {}", &hypervisor_name)), _ => Err(anyhow!("Unsupported hypervisor {}", &hypervisor_name)),
} }
} }
@@ -149,4 +161,27 @@ agent_name="kata"
let res = new_agent(&toml_config); let res = new_agent(&toml_config);
assert!(res.is_ok()); assert!(res.is_ok());
} }
#[tokio::test]
async fn test_new_hypervisor() {
VirtContainer::init().unwrap();
let toml_config = {
let config_content = r#"
[hypervisor.qemu]
path = "/bin/echo"
kernel = "/bin/echo"
image = "/bin/echo"
firmware = ""
[runtime]
hypervisor_name="qemu"
"#;
TomlConfig::load(config_content).map_err(|e| anyhow!("can not load config toml: {}", e))
}
.unwrap();
let res = new_hypervisor(&toml_config).await;
assert!(res.is_ok());
}
} }

View File

@@ -0,0 +1,14 @@
[package]
name = "shim-ctl"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow = "^1.0"
common = { path = "../runtimes/common" }
logging = { path = "../../../libs/logging"}
runtimes = { path = "../runtimes" }
tokio = { version = "1.8.0", features = [ "rt", "rt-multi-thread" ] }

View File

@@ -0,0 +1,51 @@
### Purpose
`shim-ctl` is a binary to exercise the shim proper without containerd
dependencies.
The actual Kata shim is hard to execute outside of deployment environments due
to its dependency on containerd's shim v2 protocol. Among other things, the
dependency requires having a socket with a remote end that's capable of driving
the shim using the shim v2 `ttrpc` protocol, and a binary for the shim to publish
events to.
Since at least some of the shim v2 protocol dependencies are fairly hard to
mock up, this presents a significant obstacle to development.
`shim-ctl` takes advantage of the fact that due to the shim implementation
architecture, only the outermost couple of shim layers are
containerd-dependent and all of the inner layers that do the actual heavy
lifting don't depend on containerd. This allows `shim-ctl` to replace the
containerd-dependent layers with something that's easier to use on a
developer's machine.
### Usage
After building the binary as usual with `cargo build`, run `shim-ctl` as follows.
Even though `shim-ctl` does away with the containerd dependencies, it still has
some requirements on its execution environment. In particular, it needs a
Kata `configuration.toml` file, the Kata distribution files that several
`configuration.toml` keys point to (such as the hypervisor keys `path`, `kernel`
or `initrd`), and a container bundle; a sketch of the relevant configuration keys
follows below. These are, however, much easier to fulfill than the original
containerd dependencies, and doing so is a one-off task: once set up, they can be
reused for an unlimited number of modify-build-run development cycles.
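For illustration only, the hypervisor section of such a `configuration.toml` might
look like the sketch below; every path is a placeholder that must point at a real
hypervisor binary, guest kernel and guest image on the development machine (the
in-tree unit test uses `/bin/echo` stand-ins for the same keys).

```
# Sketch of the configuration keys shim-ctl reads (all paths are placeholders)
[hypervisor.qemu]
path = "/usr/bin/qemu-system-x86_64"
kernel = "/usr/share/kata-containers/vmlinux.container"
image = "/usr/share/kata-containers/kata-containers.img"
firmware = ""

[runtime]
hypervisor_name = "qemu"
```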
`shim-ctl` also needs to be launched from an exported container bundle
directory. One can be created by running
```
mkdir rootfs
podman export $(podman create busybox) | tar -C ./rootfs -xvf -
runc spec -b .
```
in a suitable directory.
The program can then be launched like this:
```
cd /the/bundle/directory
KATA_CONF_FILE=/path/to/configuration-qemu.toml /path/to/shim-ctl
```

View File

@@ -0,0 +1,45 @@
// Copyright (c) 2022 Red Hat
//
// SPDX-License-Identifier: Apache-2.0
//
use anyhow::{Context, Result};
use common::{
message::Message,
types::{ContainerConfig, Request},
};
use runtimes::RuntimeHandlerManager;
use tokio::sync::mpsc::channel;
const MESSAGE_BUFFER_SIZE: usize = 8;
const WORKER_THREADS: usize = 2;
async fn real_main() {
let (sender, _receiver) = channel::<Message>(MESSAGE_BUFFER_SIZE);
let manager = RuntimeHandlerManager::new("xxx", sender).await.unwrap();
let req = Request::CreateContainer(ContainerConfig {
container_id: "xxx".to_owned(),
bundle: ".".to_owned(),
rootfs_mounts: Vec::new(),
terminal: false,
options: None,
stdin: None,
stdout: None,
stderr: None,
});
manager.handler_message(req).await.ok();
}
fn main() -> Result<(), Box<dyn std::error::Error>> {
let runtime = tokio::runtime::Builder::new_multi_thread()
.worker_threads(WORKER_THREADS)
.enable_all()
.build()
.context("prepare tokio runtime")?;
runtime.block_on(real_main());
Ok(())
}

View File

@@ -197,6 +197,11 @@ DEFDISABLEGUESTEMPTYDIR := false
DEFAULTEXPFEATURES := [] DEFAULTEXPFEATURES := []
DEFDISABLESELINUX := false DEFDISABLESELINUX := false
# Default guest SELinux configuration
DEFDISABLEGUESTSELINUX := true
DEFGUESTSELINUXLABEL := system_u:system_r:container_t
#Default SeccompSandbox param #Default SeccompSandbox param
#The same default policy is used by libvirt #The same default policy is used by libvirt
#More explanation on https://lists.gnu.org/archive/html/qemu-devel/2017-02/msg03348.html #More explanation on https://lists.gnu.org/archive/html/qemu-devel/2017-02/msg03348.html
@@ -562,6 +567,8 @@ USER_VARS += DEFNETWORKMODEL_QEMU
USER_VARS += DEFDISABLEGUESTEMPTYDIR USER_VARS += DEFDISABLEGUESTEMPTYDIR
USER_VARS += DEFDISABLEGUESTSECCOMP USER_VARS += DEFDISABLEGUESTSECCOMP
USER_VARS += DEFDISABLESELINUX USER_VARS += DEFDISABLESELINUX
USER_VARS += DEFDISABLEGUESTSELINUX
USER_VARS += DEFGUESTSELINUXLABEL
USER_VARS += DEFAULTEXPFEATURES USER_VARS += DEFAULTEXPFEATURES
USER_VARS += DEFDISABLEBLOCK USER_VARS += DEFDISABLEBLOCK
USER_VARS += DEFBLOCKSTORAGEDRIVER_ACRN USER_VARS += DEFBLOCKSTORAGEDRIVER_ACRN

View File

@@ -76,6 +76,7 @@ type RuntimeConfigInfo struct {
type RuntimeInfo struct { type RuntimeInfo struct {
Config RuntimeConfigInfo Config RuntimeConfigInfo
Path string Path string
GuestSeLinuxLabel string
Experimental []exp.Feature Experimental []exp.Feature
Version RuntimeVersionInfo Version RuntimeVersionInfo
Debug bool Debug bool
@@ -186,6 +187,7 @@ func getRuntimeInfo(configFile string, config oci.RuntimeConfig) RuntimeInfo {
SandboxCgroupOnly: config.SandboxCgroupOnly, SandboxCgroupOnly: config.SandboxCgroupOnly,
Experimental: config.Experimental, Experimental: config.Experimental,
DisableGuestSeccomp: config.DisableGuestSeccomp, DisableGuestSeccomp: config.DisableGuestSeccomp,
GuestSeLinuxLabel: config.GuestSeLinuxLabel,
} }
} }

View File

@@ -38,6 +38,13 @@ image = "@IMAGEPATH@"
# disable applying SELinux on the VMM process (default false) # disable applying SELinux on the VMM process (default false)
disable_selinux=@DEFDISABLESELINUX@ disable_selinux=@DEFDISABLESELINUX@
# disable applying SELinux on the container process
# If set to false, the type `container_t` is applied to the container process by default.
# Note: To enable guest SELinux, the guest rootfs must be CentOS that is created and built
# with `SELINUX=yes`.
# (default: true)
disable_guest_selinux=@DEFDISABLEGUESTSELINUX@
# Path to the firmware. # Path to the firmware.
# If you want Cloud Hypervisor to use a specific firmware, set its path below. # If you want Cloud Hypervisor to use a specific firmware, set its path below.
# This is option is only used when confidential_guest is enabled. # This is option is only used when confidential_guest is enabled.
@@ -321,6 +328,14 @@ internetworking_model="@DEFNETWORKMODEL_CLH@"
# (default: true) # (default: true)
disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@ disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@
# Apply a custom SELinux security policy to the container process inside the VM.
# This is used when you want to apply a type other than the default `container_t`,
# so general users should not uncomment and apply it.
# (format: "user:role:type")
# Note: You cannot specify MCS policy with the label because the sensitivity levels and
# categories are determined automatically by high-level container runtimes such as containerd.
#guest_selinux_label="@DEFGUESTSELINUXLABEL@"
# If enabled, the runtime will create opentracing.io traces and spans. # If enabled, the runtime will create opentracing.io traces and spans.
# (See https://www.jaegertracing.io/docs/getting-started). # (See https://www.jaegertracing.io/docs/getting-started).
# (default: disabled) # (default: disabled)

View File

@@ -463,6 +463,14 @@ valid_entropy_sources = @DEFVALIDENTROPYSOURCES@
# disable applying SELinux on the VMM process (default false) # disable applying SELinux on the VMM process (default false)
disable_selinux=@DEFDISABLESELINUX@ disable_selinux=@DEFDISABLESELINUX@
# disable applying SELinux on the container process
# If set to false, the type `container_t` is applied to the container process by default.
# Note: To enable guest SELinux, the guest rootfs must be CentOS that is created and built
# with `SELINUX=yes`.
# (default: true)
disable_guest_selinux=@DEFDISABLEGUESTSELINUX@
[factory] [factory]
# VM templating support. Once enabled, new VMs are created from template # VM templating support. Once enabled, new VMs are created from template
# using vm cloning. They will share the same initial kernel, initramfs and # using vm cloning. They will share the same initial kernel, initramfs and
@@ -580,6 +588,14 @@ internetworking_model="@DEFNETWORKMODEL_QEMU@"
# (default: true) # (default: true)
disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@ disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@
# Apply a custom SELinux security policy to the container process inside the VM.
# This is used when you want to apply a type other than the default `container_t`,
# so general users should not uncomment and apply it.
# (format: "user:role:type")
# Note: You cannot specify MCS policy with the label because the sensitivity levels and
# categories are determined automatically by high-level container runtimes such as containerd.
#guest_selinux_label="@DEFGUESTSELINUXLABEL@"
# If enabled, the runtime will create opentracing.io traces and spans. # If enabled, the runtime will create opentracing.io traces and spans.
# (See https://www.jaegertracing.io/docs/getting-started). # (See https://www.jaegertracing.io/docs/getting-started).
# (default: disabled) # (default: disabled)

View File

@@ -90,6 +90,7 @@ const defaultSevSnpGuest = false
const defaultGuestSwap = false const defaultGuestSwap = false
const defaultRootlessHypervisor = false const defaultRootlessHypervisor = false
const defaultDisableSeccomp = false const defaultDisableSeccomp = false
const defaultDisableGuestSeLinux = true
const defaultVfioMode = "guest-kernel" const defaultVfioMode = "guest-kernel"
const defaultLegacySerial = false const defaultLegacySerial = false
const defaultGuestPreAttestation = false const defaultGuestPreAttestation = false

View File

@@ -60,9 +60,9 @@ const (
type tomlConfig struct { type tomlConfig struct {
Hypervisor map[string]hypervisor Hypervisor map[string]hypervisor
Agent map[string]agent Agent map[string]agent
Runtime runtime
Image image Image image
Factory factory Factory factory
Runtime runtime
} }
type image struct { type image struct {
@@ -164,6 +164,7 @@ type hypervisor struct {
Rootless bool `toml:"rootless"` Rootless bool `toml:"rootless"`
DisableSeccomp bool `toml:"disable_seccomp"` DisableSeccomp bool `toml:"disable_seccomp"`
DisableSeLinux bool `toml:"disable_selinux"` DisableSeLinux bool `toml:"disable_selinux"`
DisableGuestSeLinux bool `toml:"disable_guest_selinux"`
LegacySerial bool `toml:"use_legacy_serial"` LegacySerial bool `toml:"use_legacy_serial"`
GuestPreAttestation bool `toml:"guest_pre_attestation"` GuestPreAttestation bool `toml:"guest_pre_attestation"`
EnableVCPUsPinning bool `toml:"enable_vcpus_pinning"` EnableVCPUsPinning bool `toml:"enable_vcpus_pinning"`
@@ -175,12 +176,13 @@ type runtime struct {
JaegerUser string `toml:"jaeger_user"` JaegerUser string `toml:"jaeger_user"`
JaegerPassword string `toml:"jaeger_password"` JaegerPassword string `toml:"jaeger_password"`
VfioMode string `toml:"vfio_mode"` VfioMode string `toml:"vfio_mode"`
GuestSeLinuxLabel string `toml:"guest_selinux_label"`
SandboxBindMounts []string `toml:"sandbox_bind_mounts"` SandboxBindMounts []string `toml:"sandbox_bind_mounts"`
Experimental []string `toml:"experimental"` Experimental []string `toml:"experimental"`
Debug bool `toml:"enable_debug"`
Tracing bool `toml:"enable_tracing"` Tracing bool `toml:"enable_tracing"`
DisableNewNetNs bool `toml:"disable_new_netns"` DisableNewNetNs bool `toml:"disable_new_netns"`
DisableGuestSeccomp bool `toml:"disable_guest_seccomp"` DisableGuestSeccomp bool `toml:"disable_guest_seccomp"`
Debug bool `toml:"enable_debug"`
SandboxCgroupOnly bool `toml:"sandbox_cgroup_only"` SandboxCgroupOnly bool `toml:"sandbox_cgroup_only"`
StaticSandboxResourceMgmt bool `toml:"static_sandbox_resource_mgmt"` StaticSandboxResourceMgmt bool `toml:"static_sandbox_resource_mgmt"`
EnablePprof bool `toml:"enable_pprof"` EnablePprof bool `toml:"enable_pprof"`
@@ -701,6 +703,7 @@ func newFirecrackerHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
TxRateLimiterMaxRate: txRateLimiterMaxRate, TxRateLimiterMaxRate: txRateLimiterMaxRate,
EnableAnnotations: h.EnableAnnotations, EnableAnnotations: h.EnableAnnotations,
DisableSeLinux: h.DisableSeLinux, DisableSeLinux: h.DisableSeLinux,
DisableGuestSeLinux: true, // Guest SELinux is not supported in Firecracker
}, nil }, nil
} }
@@ -854,6 +857,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
SEVGuestPolicy: h.SEVGuestPolicy, SEVGuestPolicy: h.SEVGuestPolicy,
SEVCertChainPath: h.SEVCertChainPath, SEVCertChainPath: h.SEVCertChainPath,
EnableVCPUsPinning: h.EnableVCPUsPinning, EnableVCPUsPinning: h.EnableVCPUsPinning,
DisableGuestSeLinux: h.DisableGuestSeLinux,
}, nil }, nil
} }
@@ -920,6 +924,7 @@ func newAcrnHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
GuestHookPath: h.guestHookPath(), GuestHookPath: h.guestHookPath(),
DisableSeLinux: h.DisableSeLinux, DisableSeLinux: h.DisableSeLinux,
EnableAnnotations: h.EnableAnnotations, EnableAnnotations: h.EnableAnnotations,
DisableGuestSeLinux: true, // Guest SELinux is not supported in ACRN
}, nil }, nil
} }
@@ -1025,6 +1030,7 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
DisableSeccomp: h.DisableSeccomp, DisableSeccomp: h.DisableSeccomp,
ConfidentialGuest: h.ConfidentialGuest, ConfidentialGuest: h.ConfidentialGuest,
DisableSeLinux: h.DisableSeLinux, DisableSeLinux: h.DisableSeLinux,
DisableGuestSeLinux: h.DisableGuestSeLinux,
NetRateLimiterBwMaxRate: h.getNetRateLimiterBwMaxRate(), NetRateLimiterBwMaxRate: h.getNetRateLimiterBwMaxRate(),
NetRateLimiterBwOneTimeBurst: h.getNetRateLimiterBwOneTimeBurst(), NetRateLimiterBwOneTimeBurst: h.getNetRateLimiterBwOneTimeBurst(),
NetRateLimiterOpsMaxRate: h.getNetRateLimiterOpsMaxRate(), NetRateLimiterOpsMaxRate: h.getNetRateLimiterOpsMaxRate(),
@@ -1262,6 +1268,7 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
GuestSwap: defaultGuestSwap, GuestSwap: defaultGuestSwap,
Rootless: defaultRootlessHypervisor, Rootless: defaultRootlessHypervisor,
DisableSeccomp: defaultDisableSeccomp, DisableSeccomp: defaultDisableSeccomp,
DisableGuestSeLinux: defaultDisableGuestSeLinux,
LegacySerial: defaultLegacySerial, LegacySerial: defaultLegacySerial,
GuestPreAttestation: defaultGuestPreAttestation, GuestPreAttestation: defaultGuestPreAttestation,
GuestPreAttestationProxy: defaultGuestPreAttestationProxy, GuestPreAttestationProxy: defaultGuestPreAttestationProxy,
@@ -1356,7 +1363,7 @@ func LoadConfiguration(configPath string, ignoreLogging bool) (resolvedConfigPat
} }
config.DisableGuestSeccomp = tomlConf.Runtime.DisableGuestSeccomp config.DisableGuestSeccomp = tomlConf.Runtime.DisableGuestSeccomp
config.GuestSeLinuxLabel = tomlConf.Runtime.GuestSeLinuxLabel
config.StaticSandboxResourceMgmt = tomlConf.Runtime.StaticSandboxResourceMgmt config.StaticSandboxResourceMgmt = tomlConf.Runtime.StaticSandboxResourceMgmt
config.SandboxCgroupOnly = tomlConf.Runtime.SandboxCgroupOnly config.SandboxCgroupOnly = tomlConf.Runtime.SandboxCgroupOnly
config.DisableNewNetNs = tomlConf.Runtime.DisableNewNetNs config.DisableNewNetNs = tomlConf.Runtime.DisableNewNetNs

View File

@@ -554,6 +554,7 @@ func TestMinimalRuntimeConfig(t *testing.T) {
VhostUserStorePath: defaultVhostUserStorePath, VhostUserStorePath: defaultVhostUserStorePath,
VirtioFSCache: defaultVirtioFSCacheMode, VirtioFSCache: defaultVirtioFSCacheMode,
BlockDeviceAIO: defaultBlockDeviceAIO, BlockDeviceAIO: defaultBlockDeviceAIO,
DisableGuestSeLinux: defaultDisableGuestSeLinux,
} }
expectedAgentConfig := vc.KataAgentConfig{ expectedAgentConfig := vc.KataAgentConfig{

View File

@@ -128,6 +128,9 @@ type RuntimeConfig struct {
//Determines if seccomp should be applied inside guest //Determines if seccomp should be applied inside guest
DisableGuestSeccomp bool DisableGuestSeccomp bool
//SELinux security context applied to the container process inside guest.
GuestSeLinuxLabel string
// Sandbox sizing information which, if provided, indicates the size of // Sandbox sizing information which, if provided, indicates the size of
// the sandbox needed for the workload(s) // the sandbox needed for the workload(s)
SandboxCPUs uint32 SandboxCPUs uint32
@@ -948,6 +951,8 @@ func SandboxConfig(ocispec specs.Spec, runtime RuntimeConfig, bundlePath, cid st
DisableGuestSeccomp: runtime.DisableGuestSeccomp, DisableGuestSeccomp: runtime.DisableGuestSeccomp,
GuestSeLinuxLabel: runtime.GuestSeLinuxLabel,
Experimental: runtime.Experimental, Experimental: runtime.Experimental,
ServiceOffload: runtime.ServiceOffload, ServiceOffload: runtime.ServiceOffload,

View File

@@ -77,6 +77,8 @@ const (
MinHypervisorMemory = 256 MinHypervisorMemory = 256
defaultMsize9p = 8192 defaultMsize9p = 8192
defaultDisableGuestSeLinux = true
) )
var ( var (
@@ -356,6 +358,7 @@ type HypervisorConfig struct {
Rootless bool Rootless bool
DisableSeccomp bool DisableSeccomp bool
DisableSeLinux bool DisableSeLinux bool
DisableGuestSeLinux bool
LegacySerial bool LegacySerial bool
EnableVCPUsPinning bool EnableVCPUsPinning bool
} }

View File

@@ -92,22 +92,24 @@ func TestHypervisorConfigValidTemplateConfig(t *testing.T) {
func TestHypervisorConfigDefaults(t *testing.T) { func TestHypervisorConfigDefaults(t *testing.T) {
assert := assert.New(t) assert := assert.New(t)
hypervisorConfig := &HypervisorConfig{ hypervisorConfig := &HypervisorConfig{
KernelPath: fmt.Sprintf("%s/%s", testDir, testKernel), KernelPath: fmt.Sprintf("%s/%s", testDir, testKernel),
ImagePath: fmt.Sprintf("%s/%s", testDir, testImage), ImagePath: fmt.Sprintf("%s/%s", testDir, testImage),
HypervisorPath: "", HypervisorPath: "",
DisableGuestSeLinux: defaultDisableGuestSeLinux,
} }
testHypervisorConfigValid(t, hypervisorConfig, true) testHypervisorConfigValid(t, hypervisorConfig, true)
hypervisorConfigDefaultsExpected := &HypervisorConfig{ hypervisorConfigDefaultsExpected := &HypervisorConfig{
KernelPath: fmt.Sprintf("%s/%s", testDir, testKernel), KernelPath: fmt.Sprintf("%s/%s", testDir, testKernel),
ImagePath: fmt.Sprintf("%s/%s", testDir, testImage), ImagePath: fmt.Sprintf("%s/%s", testDir, testImage),
HypervisorPath: "", HypervisorPath: "",
NumVCPUs: defaultVCPUs, NumVCPUs: defaultVCPUs,
MemorySize: defaultMemSzMiB, MemorySize: defaultMemSzMiB,
DefaultBridges: defaultBridges, DefaultBridges: defaultBridges,
BlockDeviceDriver: defaultBlockDriver, BlockDeviceDriver: defaultBlockDriver,
DefaultMaxVCPUs: defaultMaxVCPUs, DefaultMaxVCPUs: defaultMaxVCPUs,
Msize9p: defaultMsize9p, Msize9p: defaultMsize9p,
DisableGuestSeLinux: defaultDisableGuestSeLinux,
} }
assert.Exactly(hypervisorConfig, hypervisorConfigDefaultsExpected) assert.Exactly(hypervisorConfig, hypervisorConfigDefaultsExpected)

View File

@@ -37,6 +37,7 @@ import (
"context" "context"
"github.com/gogo/protobuf/proto" "github.com/gogo/protobuf/proto"
"github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/selinux/go-selinux"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
"google.golang.org/grpc/codes" "google.golang.org/grpc/codes"
@@ -70,6 +71,9 @@ const (
kernelParamDebugConsole = "agent.debug_console" kernelParamDebugConsole = "agent.debug_console"
kernelParamDebugConsoleVPort = "agent.debug_console_vport" kernelParamDebugConsoleVPort = "agent.debug_console_vport"
kernelParamDebugConsoleVPortValue = "1026" kernelParamDebugConsoleVPortValue = "1026"
// Default SELinux type applied to the container process inside guest
defaultSeLinuxContainerType = "container_t"
) )
type customRequestTimeoutKeyType struct{} type customRequestTimeoutKeyType struct{}
@@ -906,7 +910,7 @@ func (k *kataAgent) removeIgnoredOCIMount(spec *specs.Spec, ignoredMounts map[st
return nil return nil
} }
func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, stripVfio bool) { func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, disableGuestSeLinux bool, guestSeLinuxLabel string, stripVfio bool) error {
// Disable Hooks since they have been handled on the host and there is // Disable Hooks since they have been handled on the host and there is
// no reason to send them to the agent. It would make no sense to try // no reason to send them to the agent. It would make no sense to try
// to apply them on the guest. // to apply them on the guest.
@@ -918,11 +922,34 @@ func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, str
grpcSpec.Linux.Seccomp = nil grpcSpec.Linux.Seccomp = nil
} }
// Disable SELinux inside of the virtual machine, the label will apply // Pass SELinux label for the container process to the agent.
// to the KVM process
if grpcSpec.Process.SelinuxLabel != "" { if grpcSpec.Process.SelinuxLabel != "" {
k.Logger().Info("SELinux label from config will be applied to the hypervisor process, not the VM workload") if !disableGuestSeLinux {
grpcSpec.Process.SelinuxLabel = "" k.Logger().Info("SELinux label will be applied to the container process inside guest")
var label string
if guestSeLinuxLabel != "" {
label = guestSeLinuxLabel
} else {
label = grpcSpec.Process.SelinuxLabel
}
processContext, err := selinux.NewContext(label)
if err != nil {
return err
}
// Change the type from KVM to container because the type passed from the high-level
// runtime is for KVM process.
if guestSeLinuxLabel == "" {
processContext["type"] = defaultSeLinuxContainerType
}
grpcSpec.Process.SelinuxLabel = processContext.Get()
} else {
k.Logger().Info("Empty SELinux label for the process and the mount because guest SELinux is disabled")
grpcSpec.Process.SelinuxLabel = ""
grpcSpec.Linux.MountLabel = ""
}
} }
// By now only CPU constraints are supported // By now only CPU constraints are supported
@@ -984,6 +1011,8 @@ func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, str
} }
grpcSpec.Linux.Devices = linuxDevices grpcSpec.Linux.Devices = linuxDevices
} }
return nil
} }
func (k *kataAgent) handleShm(mounts []specs.Mount, sandbox *Sandbox) { func (k *kataAgent) handleShm(mounts []specs.Mount, sandbox *Sandbox) {
@@ -1267,9 +1296,20 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co
passSeccomp := !sandbox.config.DisableGuestSeccomp && sandbox.seccompSupported passSeccomp := !sandbox.config.DisableGuestSeccomp && sandbox.seccompSupported
// Currently, guest SELinux can be enabled only when SELinux is enabled on the host side.
if !sandbox.config.HypervisorConfig.DisableGuestSeLinux && !selinux.GetEnabled() {
return nil, fmt.Errorf("Guest SELinux is enabled, but SELinux is disabled on the host side")
}
if sandbox.config.HypervisorConfig.DisableGuestSeLinux && sandbox.config.GuestSeLinuxLabel != "" {
return nil, fmt.Errorf("Custom SELinux security policy is provided, but guest SELinux is disabled")
}
// We need to constrain the spec to make sure we're not // We need to constrain the spec to make sure we're not
// passing irrelevant information to the agent. // passing irrelevant information to the agent.
k.constrainGRPCSpec(grpcSpec, passSeccomp, sandbox.config.VfioMode == config.VFIOModeGuestKernel) err = k.constrainGRPCSpec(grpcSpec, passSeccomp, sandbox.config.HypervisorConfig.DisableGuestSeLinux, sandbox.config.GuestSeLinuxLabel, sandbox.config.VfioMode == config.VFIOModeGuestKernel)
if err != nil {
return nil, err
}
req := &grpc.CreateContainerRequest{ req := &grpc.CreateContainerRequest{
ContainerId: c.id, ContainerId: c.id,

View File

@@ -619,7 +619,7 @@ func TestConstrainGRPCSpec(t *testing.T) {
} }
k := kataAgent{} k := kataAgent{}
k.constrainGRPCSpec(g, true, true) k.constrainGRPCSpec(g, true, true, "", true)
// Check nil fields // Check nil fields
assert.Nil(g.Hooks) assert.Nil(g.Hooks)

View File

@@ -189,6 +189,7 @@ func (s *Sandbox) dumpConfig(ss *persistapi.SandboxState) {
SystemdCgroup: sconfig.SystemdCgroup, SystemdCgroup: sconfig.SystemdCgroup,
SandboxCgroupOnly: sconfig.SandboxCgroupOnly, SandboxCgroupOnly: sconfig.SandboxCgroupOnly,
DisableGuestSeccomp: sconfig.DisableGuestSeccomp, DisableGuestSeccomp: sconfig.DisableGuestSeccomp,
GuestSeLinuxLabel: sconfig.GuestSeLinuxLabel,
} }
ss.Config.SandboxBindMounts = append(ss.Config.SandboxBindMounts, sconfig.SandboxBindMounts...) ss.Config.SandboxBindMounts = append(ss.Config.SandboxBindMounts, sconfig.SandboxBindMounts...)
@@ -429,6 +430,7 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) {
SystemdCgroup: savedConf.SystemdCgroup, SystemdCgroup: savedConf.SystemdCgroup,
SandboxCgroupOnly: savedConf.SandboxCgroupOnly, SandboxCgroupOnly: savedConf.SandboxCgroupOnly,
DisableGuestSeccomp: savedConf.DisableGuestSeccomp, DisableGuestSeccomp: savedConf.DisableGuestSeccomp,
GuestSeLinuxLabel: savedConf.GuestSeLinuxLabel,
} }
sconfig.SandboxBindMounts = append(sconfig.SandboxBindMounts, savedConf.SandboxBindMounts...) sconfig.SandboxBindMounts = append(sconfig.SandboxBindMounts, savedConf.SandboxBindMounts...)

View File

@@ -243,19 +243,6 @@ type ContainerConfig struct {
// SandboxConfig is a sandbox configuration. // SandboxConfig is a sandbox configuration.
// Refs: virtcontainers/sandbox.go:SandboxConfig // Refs: virtcontainers/sandbox.go:SandboxConfig
type SandboxConfig struct { type SandboxConfig struct {
// Information for fields not saved:
// * Annotation: this is kind of casual data, we don't need casual data in persist file,
// if you know this data needs to persist, please gives it
// a specific field
ContainerConfigs []ContainerConfig
// SandboxBindMounts - list of paths to mount into guest
SandboxBindMounts []string
// Experimental enables experimental features
Experimental []string
// Cgroups specifies specific cgroup settings for the various subsystems that the container is // Cgroups specifies specific cgroup settings for the various subsystems that the container is
// placed into to limit the resources the container has available // placed into to limit the resources the container has available
Cgroups *configs.Cgroup `json:"cgroups"` Cgroups *configs.Cgroup `json:"cgroups"`
@@ -265,8 +252,24 @@ type SandboxConfig struct {
KataShimConfig *ShimConfig KataShimConfig *ShimConfig
HypervisorType string // Custom SELinux security policy to the container process inside the VM
NetworkConfig NetworkConfig GuestSeLinuxLabel string
HypervisorType string
// SandboxBindMounts - list of paths to mount into guest
SandboxBindMounts []string
// Experimental enables experimental features
Experimental []string
// Information for fields not saved:
// * Annotation: this is kind of casual data, we don't need casual data in persist file,
// if you know this data needs to persist, please gives it a specific field
ContainerConfigs []ContainerConfig
NetworkConfig NetworkConfig
HypervisorConfig HypervisorConfig HypervisorConfig HypervisorConfig
ShmSize uint64 ShmSize uint64

View File

@@ -247,6 +247,9 @@ const (
// DisableGuestSeccomp is a sandbox annotation that determines if seccomp should be applied inside guest. // DisableGuestSeccomp is a sandbox annotation that determines if seccomp should be applied inside guest.
DisableGuestSeccomp = kataAnnotRuntimePrefix + "disable_guest_seccomp" DisableGuestSeccomp = kataAnnotRuntimePrefix + "disable_guest_seccomp"
// GuestSeLinuxLabel is a SELinux security policy that is applied to a container process inside guest.
GuestSeLinuxLabel = kataAnnotRuntimePrefix + "guest_selinux_label"
// SandboxCgroupOnly is a sandbox annotation that determines if kata processes are managed only in sandbox cgroup. // SandboxCgroupOnly is a sandbox annotation that determines if kata processes are managed only in sandbox cgroup.
SandboxCgroupOnly = kataAnnotRuntimePrefix + "sandbox_cgroup_only" SandboxCgroupOnly = kataAnnotRuntimePrefix + "sandbox_cgroup_only"

View File

@@ -170,6 +170,15 @@ func (q *qemu) kernelParameters() string {
// set the maximum number of vCPUs // set the maximum number of vCPUs
params = append(params, Param{"nr_cpus", fmt.Sprintf("%d", q.config.DefaultMaxVCPUs)}) params = append(params, Param{"nr_cpus", fmt.Sprintf("%d", q.config.DefaultMaxVCPUs)})
// set the SELinux params in accordance with the runtime configuration, disable_guest_selinux.
if q.config.DisableGuestSeLinux {
q.Logger().Info("Set selinux=0 to kernel params because SELinux on the guest is disabled")
params = append(params, Param{"selinux", "0"})
} else {
q.Logger().Info("Set selinux=1 to kernel params because SELinux on the guest is enabled")
params = append(params, Param{"selinux", "1"})
}
// add the params specified by the provided config. As the kernel // add the params specified by the provided config. As the kernel
// honours the last parameter value set and since the config-provided // honours the last parameter value set and since the config-provided
// params are added here, they will take priority over the defaults. // params are added here, they will take priority over the defaults.
@@ -465,6 +474,13 @@ func (q *qemu) createVirtiofsDaemon(sharedPath string) (VirtiofsDaemon, error) {
return nd, nil return nd, nil
} }
// Set the xattr option for virtiofsd daemon to enable extended attributes
// in virtiofs if SELinux on the guest side is enabled.
if !q.config.DisableGuestSeLinux {
q.Logger().Info("Set the xattr option for virtiofsd")
q.config.VirtioFSExtraArgs = append(q.config.VirtioFSExtraArgs, "-o", "xattr")
}
// default use virtiofsd // default use virtiofsd
return &virtiofsd{ return &virtiofsd{
path: q.config.VirtioFSDaemon, path: q.config.VirtioFSDaemon,
@@ -914,7 +930,6 @@ func (q *qemu) StartVM(ctx context.Context, timeout int) error {
// the SELinux label. If these processes require privileged, we do // the SELinux label. If these processes require privileged, we do
// not want to run them under confinement. // not want to run them under confinement.
if !q.config.DisableSeLinux { if !q.config.DisableSeLinux {
if err := label.SetProcessLabel(q.config.SELinuxProcessLabel); err != nil { if err := label.SetProcessLabel(q.config.SELinuxProcessLabel); err != nil {
return err return err
} }

View File

@@ -27,15 +27,16 @@ import (
func newQemuConfig() HypervisorConfig { func newQemuConfig() HypervisorConfig {
return HypervisorConfig{ return HypervisorConfig{
KernelPath: testQemuKernelPath, KernelPath: testQemuKernelPath,
InitrdPath: testQemuInitrdPath, InitrdPath: testQemuInitrdPath,
HypervisorPath: testQemuPath, HypervisorPath: testQemuPath,
NumVCPUs: defaultVCPUs, NumVCPUs: defaultVCPUs,
MemorySize: defaultMemSzMiB, MemorySize: defaultMemSzMiB,
DefaultBridges: defaultBridges, DefaultBridges: defaultBridges,
BlockDeviceDriver: defaultBlockDriver, BlockDeviceDriver: defaultBlockDriver,
DefaultMaxVCPUs: defaultMaxVCPUs, DefaultMaxVCPUs: defaultMaxVCPUs,
Msize9p: defaultMsize9p, Msize9p: defaultMsize9p,
DisableGuestSeLinux: defaultDisableGuestSeLinux,
} }
} }
@@ -58,7 +59,7 @@ func testQemuKernelParameters(t *testing.T, kernelParams []Param, expected strin
} }
func TestQemuKernelParameters(t *testing.T) { func TestQemuKernelParameters(t *testing.T) {
expectedOut := fmt.Sprintf("panic=1 nr_cpus=%d foo=foo bar=bar", govmm.MaxVCPUs()) expectedOut := fmt.Sprintf("panic=1 nr_cpus=%d selinux=0 foo=foo bar=bar", govmm.MaxVCPUs())
params := []Param{ params := []Param{
{ {
Key: "foo", Key: "foo",

View File

@@ -135,6 +135,8 @@ type SandboxConfig struct {
Hostname string Hostname string
ID string ID string
HypervisorType HypervisorType HypervisorType HypervisorType
// Custom SELinux security policy to the container process inside the VM
GuestSeLinuxLabel string
// Volumes is a list of shared volumes between the host and the Sandbox. // Volumes is a list of shared volumes between the host and the Sandbox.
Volumes []types.Volume Volumes []types.Volume
// SandboxBindMounts - list of paths to mount into guest // SandboxBindMounts - list of paths to mount into guest

View File

@@ -25,6 +25,8 @@ const cpBinaryName = "cp"
const fileMode0755 = os.FileMode(0755) const fileMode0755 = os.FileMode(0755)
const maxWaitDelay = 50 * time.Millisecond
// The DefaultRateLimiterRefillTime is used for calculating the rate at // The DefaultRateLimiterRefillTime is used for calculating the rate at
// which a TokenBucket is replenished, in cases where a RateLimiter is // which a TokenBucket is replenished, in cases where a RateLimiter is
// applied to either network or disk I/O. // applied to either network or disk I/O.
@@ -306,6 +308,44 @@ func ConvertAddressFamily(family int32) pbTypes.IPFamily {
} }
} }
func waitProcessUsingWaitLoop(pid int, timeoutSecs uint, logger *logrus.Entry) bool {
secs := time.Duration(timeoutSecs) * time.Second
timeout := time.After(secs)
delay := 1 * time.Millisecond
for {
// Wait4 is used to reap and check that a child terminated.
// Without the Wait4 call, Kill(0) for a child will always exit without
// error because the process isn't reaped.
// Wait4 returns an ECHILD error for non-child processes. Kill(0) is meant
// to address this case: once the process has been reaped by the init process,
// the call will return an ESRCH error.
// "A watched pot never boils" and an unwaited-for process never appears to die!
waitedPid, err := syscall.Wait4(pid, nil, syscall.WNOHANG, nil)
if waitedPid == pid && err == nil {
return false
}
if err := syscall.Kill(pid, syscall.Signal(0)); err != nil {
return false
}
select {
case <-time.After(delay):
delay = delay * 5
if delay > maxWaitDelay {
delay = maxWaitDelay
}
case <-timeout:
logger.Warnf("process %v still running after waiting %ds", pid, timeoutSecs)
return true
}
}
}
// WaitLocalProcess waits for the specified process for up to timeoutSecs seconds. // WaitLocalProcess waits for the specified process for up to timeoutSecs seconds.
// //
// Notes: // Notes:
@@ -334,49 +374,24 @@ func WaitLocalProcess(pid int, timeoutSecs uint, initialSignal syscall.Signal, l
} }
} }
pidRunning := true pidRunning := waitForProcessCompletion(pid, timeoutSecs, logger)
secs := time.Duration(timeoutSecs)
timeout := time.After(secs * time.Second)
// Wait for the VM process to terminate
outer:
for {
select {
case <-time.After(50 * time.Millisecond):
// Check if the process is running periodically to avoid a busy loop
var _status syscall.WaitStatus
var _rusage syscall.Rusage
var waitedPid int
// "A watched pot never boils" and an unwaited-for process never appears to die!
waitedPid, err = syscall.Wait4(pid, &_status, syscall.WNOHANG, &_rusage)
if waitedPid == pid && err == nil {
pidRunning = false
break outer
}
if err = syscall.Kill(pid, syscall.Signal(0)); err != nil {
pidRunning = false
break outer
}
break
case <-timeout:
logger.Warnf("process %v still running after waiting %ds", pid, timeoutSecs)
break outer
}
}
if pidRunning { if pidRunning {
// Force process to die // Force process to die
if err = syscall.Kill(pid, syscall.SIGKILL); err != nil { if err = syscall.Kill(pid, syscall.SIGKILL); err != nil {
if err == syscall.ESRCH {
logger.WithField("pid", pid).Warnf("process already finished")
return nil
}
return fmt.Errorf("Failed to stop process %v: %s", pid, err) return fmt.Errorf("Failed to stop process %v: %s", pid, err)
} }
for {
_, err := syscall.Wait4(pid, nil, 0, nil)
if err != syscall.EINTR {
break
}
}
} }
return nil return nil

View File

@@ -5,6 +5,12 @@
package utils package utils
import "github.com/sirupsen/logrus"
func GetDevicePathAndFsTypeOptions(mountPoint string) (devicePath, fsType string, fsOptions []string, err error) { func GetDevicePathAndFsTypeOptions(mountPoint string) (devicePath, fsType string, fsOptions []string, err error) {
return return
} }
func waitForProcessCompletion(pid int, timeoutSecs uint, logger *logrus.Entry) bool {
return waitProcessUsingWaitLoop(pid, timeoutSecs, logger)
}

View File

@@ -14,8 +14,10 @@ import (
"os" "os"
"strings" "strings"
"syscall" "syscall"
"time"
"unsafe" "unsafe"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
) )
@@ -151,3 +153,60 @@ func IsAPVFIOMediatedDevice(sysfsdev string) bool {
} }
return false return false
} }
func waitProcessUsingPidfd(pid int, timeoutSecs uint, logger *logrus.Entry) (bool, error) {
pidfd, err := unix.PidfdOpen(pid, 0)
if err != nil {
if err == unix.ESRCH {
return false, nil
}
return true, err
}
defer unix.Close(pidfd)
var n int
maxDelay := time.Duration(timeoutSecs) * time.Second
end := time.Now().Add(maxDelay)
for {
remaining := time.Until(end).Milliseconds()
if remaining < 0 {
remaining = 0
}
n, err = unix.Poll([]unix.PollFd{{Fd: int32(pidfd), Events: unix.POLLIN}}, int(remaining))
if err != unix.EINTR {
break
}
}
if err != nil || n != 1 {
logger.Warnf("process %v still running after waiting %ds", pid, timeoutSecs)
return true, err
}
for {
err := unix.Waitid(unix.P_PIDFD, pidfd, nil, unix.WEXITED, nil)
if err == unix.EINVAL {
err = unix.Waitid(unix.P_PID, pid, nil, unix.WEXITED, nil)
}
if err != unix.EINTR {
break
}
}
return false, nil
}
func waitForProcessCompletion(pid int, timeoutSecs uint, logger *logrus.Entry) bool {
pidRunning, err := waitProcessUsingPidfd(pid, timeoutSecs, logger)
if err == unix.ENOSYS {
pidRunning = waitProcessUsingWaitLoop(pid, timeoutSecs, logger)
}
return pidRunning
}

View File

@@ -337,7 +337,6 @@ impl ContainerLauncher {
self.runner self.runner
.cgroup_manager .cgroup_manager
.as_ref() .as_ref()
.unwrap()
.as_any()? .as_any()?
.downcast_ref::<CgroupManager>() .downcast_ref::<CgroupManager>()
.unwrap() .unwrap()

View File

@@ -65,6 +65,8 @@ readonly -a systemd_files=(
# Set a default value # Set a default value
AGENT_INIT=${AGENT_INIT:-no} AGENT_INIT=${AGENT_INIT:-no}
SELINUX=${SELINUX:-no}
SELINUXFS="/sys/fs/selinux"
# Align image to 128M # Align image to 128M
readonly mem_boundary_mb=128 readonly mem_boundary_mb=128
@@ -94,6 +96,10 @@ Extra environment variables:
DEFAULT: not set DEFAULT: not set
USE_PODMAN: If set and USE_DOCKER not set, will build image in a Podman Container (requires podman) USE_PODMAN: If set and USE_DOCKER not set, will build image in a Podman Container (requires podman)
DEFAULT: not set DEFAULT: not set
SELINUX: If set to "yes", the rootfs is labeled for SELinux.
Make sure that selinuxfs is mounted to /sys/fs/selinux on the host
and the rootfs is built with SELINUX=yes.
DEFAULT value: "no"
Following diagram shows how the resulting image will look like Following diagram shows how the resulting image will look like
@@ -135,6 +141,7 @@ build_with_container() {
local nsdax_bin="$9" local nsdax_bin="$9"
local container_image_name="image-builder-osbuilder" local container_image_name="image-builder-osbuilder"
local shared_files="" local shared_files=""
local selinuxfs=""
image_dir=$(readlink -f "$(dirname "${image}")") image_dir=$(readlink -f "$(dirname "${image}")")
image_name=$(basename "${image}") image_name=$(basename "${image}")
@@ -158,6 +165,14 @@ build_with_container() {
shared_files+="-v ${mke2fs_conf}:${mke2fs_conf}:ro " shared_files+="-v ${mke2fs_conf}:${mke2fs_conf}:ro "
fi fi
if [ "${SELINUX}" == "yes" ]; then
if mountpoint $SELINUXFS > /dev/null; then
selinuxfs="-v ${SELINUXFS}:${SELINUXFS}"
else
die "Make sure that SELinux is enabled on the host"
fi
fi
#Make sure we use a compatible runtime to build rootfs #Make sure we use a compatible runtime to build rootfs
# In case Clear Containers Runtime is installed we don't want to hit issue: # In case Clear Containers Runtime is installed we don't want to hit issue:
#https://github.com/clearcontainers/runtime/issues/828 #https://github.com/clearcontainers/runtime/issues/828
@@ -172,12 +187,14 @@ build_with_container() {
--env ROOT_FREE_SPACE="${root_free_space}" \ --env ROOT_FREE_SPACE="${root_free_space}" \
--env NSDAX_BIN="${nsdax_bin}" \ --env NSDAX_BIN="${nsdax_bin}" \
--env KATA_BUILD_CC="${KATA_BUILD_CC}" \ --env KATA_BUILD_CC="${KATA_BUILD_CC}" \
--env SELINUX="${SELINUX}" \
--env DEBUG="${DEBUG}" \ --env DEBUG="${DEBUG}" \
-v /dev:/dev \ -v /dev:/dev \
-v "${script_dir}":"/osbuilder" \ -v "${script_dir}":"/osbuilder" \
-v "${script_dir}/../scripts":"/scripts" \ -v "${script_dir}/../scripts":"/scripts" \
-v "${rootfs}":"/rootfs" \ -v "${rootfs}":"/rootfs" \
-v "${image_dir}":"/image" \ -v "${image_dir}":"/image" \
${selinuxfs} \
${shared_files} \ ${shared_files} \
${container_image_name} \ ${container_image_name} \
bash "/osbuilder/${script_name}" -o "/image/${image_name}" /rootfs bash "/osbuilder/${script_name}" -o "/image/${image_name}" /rootfs
@@ -398,6 +415,7 @@ create_rootfs_image() {
local img_size="$3" local img_size="$3"
local fs_type="$4" local fs_type="$4"
local block_size="$5" local block_size="$5"
local agent_bin="$6"
create_disk "${image}" "${img_size}" "${fs_type}" "${rootfs_start}" create_disk "${image}" "${img_size}" "${fs_type}" "${rootfs_start}"
@@ -416,6 +434,31 @@ create_rootfs_image() {
info "Copying content from rootfs to root partition" info "Copying content from rootfs to root partition"
cp -a "${rootfs}"/* "${mount_dir}" cp -a "${rootfs}"/* "${mount_dir}"
if [ "${SELINUX}" == "yes" ]; then
if [ "${AGENT_INIT}" == "yes" ]; then
die "Guest SELinux with the agent init is not supported yet"
fi
info "Labeling rootfs for SELinux"
selinuxfs_path="${mount_dir}${SELINUXFS}"
mkdir -p $selinuxfs_path
if mountpoint $SELINUXFS > /dev/null && \
chroot "${mount_dir}" command -v restorecon > /dev/null; then
mount -t selinuxfs selinuxfs $selinuxfs_path
chroot "${mount_dir}" restorecon -RF -e ${SELINUXFS} /
# TODO: This operation will be removed after the updated container-selinux that
# includes the following commit is released.
# https://github.com/containers/container-selinux/commit/39f83cc74d50bd10ab6be4d0bdd98bc04857469f
# We use chcon as an interim solution until then.
chroot "${mount_dir}" chcon -t container_runtime_exec_t "/usr/bin/${agent_bin}"
umount $selinuxfs_path
else
die "Could not label the rootfs. Make sure that SELinux is enabled on the host \
and the rootfs is built with SELINUX=yes"
fi
fi
sync sync
OK "rootfs copied" OK "rootfs copied"
@@ -549,7 +592,7 @@ main() {
# consider in calculate_img_size # consider in calculate_img_size
rootfs_img_size=$((img_size - dax_header_sz)) rootfs_img_size=$((img_size - dax_header_sz))
create_rootfs_image "${rootfs}" "${image}" "${rootfs_img_size}" \ create_rootfs_image "${rootfs}" "${image}" "${rootfs_img_size}" \
"${fs_type}" "${block_size}" "${fs_type}" "${block_size}" "${agent_bin}"
# insert at the beginning of the image the MBR + DAX header # insert at the beginning of the image the MBR + DAX header
set_dax_header "${image}" "${img_size}" "${fs_type}" "${nsdax_bin}" set_dax_header "${image}" "${img_size}" "${fs_type}" "${nsdax_bin}"

View File

@@ -8,10 +8,15 @@ OS_VERSION=${OS_VERSION:-stream9}
PACKAGES="chrony iptables" PACKAGES="chrony iptables"
[ "$AGENT_INIT" = no ] && PACKAGES+=" systemd" [ "$AGENT_INIT" = no ] && PACKAGES+=" systemd"
[ "$SECCOMP" = yes ] && PACKAGES+=" libseccomp" [ "$SECCOMP" = yes ] && PACKAGES+=" libseccomp"
[ "$SELINUX" = yes ] && PACKAGES+=" container-selinux"
# Container registry tag is different from metalink repo, e.g. "stream9" => "9-stream" # Container registry tag is different from metalink repo, e.g. "stream9" => "9-stream"
os_repo_version="$(sed -E "s/(stream)(.+)/\2-\1/" <<< "$OS_VERSION")" os_repo_version="$(sed -E "s/(stream)(.+)/\2-\1/" <<< "$OS_VERSION")"
METALINK="https://mirrors.centos.org/metalink?repo=centos-baseos-$os_repo_version&arch=\$basearch" METALINK="https://mirrors.centos.org/metalink?repo=centos-baseos-$os_repo_version&arch=\$basearch"
if [ "$SELINUX" == yes ]; then
# AppStream repository is required for the container-selinux package
METALINK_APPSTREAM="https://mirrors.centos.org/metalink?repo=centos-appstream-$os_repo_version&arch=\$basearch"
fi
GPG_KEY_FILE=RPM-GPG-KEY-CentOS-Official GPG_KEY_FILE=RPM-GPG-KEY-CentOS-Official
GPG_KEY_URL="https://centos.org/keys/$GPG_KEY_FILE" GPG_KEY_URL="https://centos.org/keys/$GPG_KEY_FILE"

View File

@@ -26,6 +26,7 @@ LIBC=${LIBC:-musl}
# The kata agent enables seccomp feature. # The kata agent enables seccomp feature.
# However, it is not enforced by default: you need to enable that in the main configuration file. # However, it is not enforced by default: you need to enable that in the main configuration file.
SECCOMP=${SECCOMP:-"yes"} SECCOMP=${SECCOMP:-"yes"}
SELINUX=${SELINUX:-"no"}
lib_file="${script_dir}/../scripts/lib.sh" lib_file="${script_dir}/../scripts/lib.sh"
source "$lib_file" source "$lib_file"
@@ -143,6 +144,11 @@ ROOTFS_DIR Path to the directory that is populated with the rootfs.
SECCOMP When set to "no", the kata-agent is built without seccomp capability. SECCOMP When set to "no", the kata-agent is built without seccomp capability.
Default value: "yes" Default value: "yes"
SELINUX When set to "yes", build the rootfs with the required packages to
enable SELinux in the VM.
Make sure the guest kernel is compiled with SELinux enabled.
Default value: "no"
USE_DOCKER If set, build the rootfs inside a container (requires USE_DOCKER If set, build the rootfs inside a container (requires
Docker). Docker).
Default value: <not set> Default value: <not set>
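Besides the containerised builds, a direct build is also possible; a minimal sketch, assuming the build host already provides the distro tooling that `rootfs.sh` needs and that `ROOTFS_DIR` is exported:

```
# Sketch: build a CentOS rootfs with the SELinux packages directly on the host
# (no USE_DOCKER/USE_PODMAN set).
$ sudo -E SELINUX=yes ./rootfs.sh centos
```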
@@ -369,6 +375,15 @@ build_rootfs_distro()
echo "Required rust version: $RUST_VERSION" echo "Required rust version: $RUST_VERSION"
if [ "${SELINUX}" == "yes" ]; then
if [ "${AGENT_INIT}" == "yes" ]; then
die "Guest SELinux with the agent init is not supported yet"
fi
if [ "${distro}" != "centos" ]; then
die "The guest rootfs must be CentOS to enable guest SELinux"
fi
fi
if [ -z "${USE_DOCKER}" ] && [ -z "${USE_PODMAN}" ]; then if [ -z "${USE_DOCKER}" ] && [ -z "${USE_PODMAN}" ]; then
info "build directly" info "build directly"
build_rootfs ${ROOTFS_DIR} build_rootfs ${ROOTFS_DIR}
@@ -454,6 +469,7 @@ build_rootfs_distro()
--env AA_KBC="${AA_KBC}" \ --env AA_KBC="${AA_KBC}" \
--env KATA_BUILD_CC="${KATA_BUILD_CC}" \ --env KATA_BUILD_CC="${KATA_BUILD_CC}" \
--env SECCOMP="${SECCOMP}" \ --env SECCOMP="${SECCOMP}" \
--env SELINUX="${SELINUX}" \
--env DEBUG="${DEBUG}" \ --env DEBUG="${DEBUG}" \
--env HOME="/root" \ --env HOME="/root" \
-v "${repo_dir}":"/kata-containers" \ -v "${repo_dir}":"/kata-containers" \

View File

@@ -79,7 +79,23 @@ gpgcheck=1
gpgkey=file://${CONFIG_DIR}/${GPG_KEY_FILE} gpgkey=file://${CONFIG_DIR}/${GPG_KEY_FILE}
EOF EOF
fi fi
if [ "$SELINUX" == "yes" ]; then
cat > "${DNF_CONF}" << EOF
[appstream]
name=${OS_NAME}-${OS_VERSION} upstream
releasever=${OS_VERSION}
EOF
echo "metalink=$METALINK_APPSTREAM" >> "$DNF_CONF"
if [ -n "$GPG_KEY_URL" ]; then
if [ ! -f "${CONFIG_DIR}/${GPG_KEY_FILE}" ]; then
curl -L "${GPG_KEY_URL}" -o "${CONFIG_DIR}/${GPG_KEY_FILE}"
fi
cat >> "${DNF_CONF}" << EOF
gpgcheck=1
gpgkey=file://${CONFIG_DIR}/${GPG_KEY_FILE}
EOF
fi
fi
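With the CentOS Stream 9 defaults shown earlier (`OS_VERSION=stream9`, hence `os_repo_version=9-stream`), the section appended to the generated DNF configuration would look roughly like this; the GPG key path depends on `CONFIG_DIR`:

```
[appstream]
name=centos-stream9 upstream
releasever=stream9
metalink=https://mirrors.centos.org/metalink?repo=centos-appstream-9-stream&arch=$basearch
gpgcheck=1
gpgkey=file://<CONFIG_DIR>/RPM-GPG-KEY-CentOS-Official
```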
} }
build_rootfs() build_rootfs()

View File

@@ -56,7 +56,7 @@ function get_container_runtime() {
function install_artifacts() { function install_artifacts() {
echo "copying kata artifacts onto host" echo "copying kata artifacts onto host"
cp -a /opt/kata-artifacts/opt/kata/* /opt/kata/ cp -au /opt/kata-artifacts/opt/kata/* /opt/kata/
chmod +x /opt/kata/bin/* chmod +x /opt/kata/bin/*
chmod +x /opt/kata/runtime-rs/bin/* chmod +x /opt/kata/runtime-rs/bin/*
} }

View File

@@ -159,12 +159,12 @@ get_kernel() {
major_version=$(echo "${version}" | cut -d. -f1) major_version=$(echo "${version}" | cut -d. -f1)
kernel_tarball="linux-${version}.tar.xz" kernel_tarball="linux-${version}.tar.xz"
if [ ! -f sha256sums.asc ] || ! grep -q "${kernel_tarball}" sha256sums.asc; then if [ ! -f sha256sums.asc ] || ! grep -q "${kernel_tarball}" sha256sums.asc; then
shasum_url="https://cdn.kernel.org/pub/linux/kernel/v${major_version}.x/sha256sums.asc" shasum_url="https://cdn.kernel.org/pub/linux/kernel/v${major_version}.x/sha256sums.asc"
info "Download kernel checksum file: sha256sums.asc from ${shasum_url}" info "Download kernel checksum file: sha256sums.asc from ${shasum_url}"
curl --fail -OL "${shasum_url}" curl --fail -OL "${shasum_url}"
fi fi
grep "${kernel_tarball}" sha256sums.asc >"${kernel_tarball}.sha256" grep "${kernel_tarball}" sha256sums.asc >"${kernel_tarball}.sha256"
if [ -f "${kernel_tarball}" ] && ! sha256sum -c "${kernel_tarball}.sha256"; then if [ -f "${kernel_tarball}" ] && ! sha256sum -c "${kernel_tarball}.sha256"; then
info "invalid kernel tarball ${kernel_tarball} removing " info "invalid kernel tarball ${kernel_tarball} removing "

View File

@@ -39,6 +39,12 @@ CONFIG_ARM64_PMEM=y
CONFIG_ARM64_RAS_EXTN=y CONFIG_ARM64_RAS_EXTN=y
# end of ARMv8.2 architectural feature # end of ARMv8.2 architectural feature
#
# ARMv8.5 architectural features
#
CONFIG_ARCH_RANDOM=y
CONFIG_RANDOM_TRUST_CPU=y
CONFIG_NO_HZ_FULL=y CONFIG_NO_HZ_FULL=y
CONFIG_GENERIC_MSI_IRQ_DOMAIN=y CONFIG_GENERIC_MSI_IRQ_DOMAIN=y
CONFIG_RANDOMIZE_BASE=y CONFIG_RANDOMIZE_BASE=y

View File

@@ -0,0 +1,12 @@
# SELinux support:
CONFIG_AUDIT=y
CONFIG_AUDITSYSCALL=y
CONFIG_LSM_MMAP_MIN_ADDR=65536
CONFIG_NETWORK_SECMARK=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_SELINUX=y
CONFIG_SECURITY_SELINUX_BOOTPARAM=y
CONFIG_SECURITY_SELINUX_DEVELOP=y
CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=0
CONFIG_SECURITY_SELINUX_SIDTAB_HASH_BITS=9
CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE=256
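If this fragment is not picked up automatically by the guest kernel build scripts, it can be folded into an existing `.config` with the kernel's own merge helper; the paths below are illustrative:

```
# Sketch: merge the SELinux fragment into a guest kernel configuration.
$ cd linux-<version>
$ ./scripts/kconfig/merge_config.sh .config path/to/selinux-fragment.conf
```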

View File

@@ -1 +1 @@
96 97

View File

@@ -48,9 +48,9 @@ info "Calling edksetup script"
source edksetup.sh source edksetup.sh
if [ "${ovmf_build}" == "sev" ]; then if [ "${ovmf_build}" == "sev" ]; then
info "Creating dummy grub file" info "Creating dummy grub file"
#required for building AmdSev package without grub #required for building AmdSev package without grub
touch OvmfPkg/AmdSev/Grub/grub.efi touch OvmfPkg/AmdSev/Grub/grub.efi
fi fi
info "Building ovmf" info "Building ovmf"