CCv0: Merge main into CCv0 branch
Merge remote-tracking branch 'upstream/main' into CCv0

Fixes: #3807

Signed-off-by: stevenhorsman <steven@uk.ibm.com>
src/runtime/pkg/containerd-shim-v2/event_forwarder.go (new file, 88 lines)
@@ -0,0 +1,88 @@
// Copyright (c) 2022 Ant Group
//
// SPDX-License-Identifier: Apache-2.0
//

package containerdshim

import (
    "context"
    "os"
    "time"

    "github.com/containerd/containerd/events"
)

type forwarderType string

const (
    forwarderTypeLog        forwarderType = "log"
    forwarderTypeContainerd forwarderType = "containerd"

    // The time span allowed for publishing a containerd event;
    // if publishing takes longer than timeOut, it is canceled.
    timeOut = 5 * time.Second

    // ttrpc address passed from the container runtime.
    // For now containerd passes the address and CRI-O does not.
    ttrpcAddressEnv = "TTRPC_ADDRESS"
)

type eventsForwarder interface {
    forward()
    forwarderType() forwarderType
}

type logForwarder struct {
    s *service
}

func (lf *logForwarder) forward() {
    for e := range lf.s.events {
        shimLog.WithField("topic", getTopic(e)).Infof("post event: %+v", e)
    }
}

func (lf *logForwarder) forwarderType() forwarderType {
    return forwarderTypeLog
}

type containerdForwarder struct {
    s         *service
    ctx       context.Context
    publisher events.Publisher
}

func (cf *containerdForwarder) forward() {
    for e := range cf.s.events {
        ctx, cancel := context.WithTimeout(cf.ctx, timeOut)
        err := cf.publisher.Publish(ctx, getTopic(e), e)
        cancel()
        if err != nil {
            shimLog.WithError(err).Error("post event")
        }
    }
}

func (cf *containerdForwarder) forwarderType() forwarderType {
    return forwarderTypeContainerd
}

func (s *service) newEventsForwarder(ctx context.Context, publisher events.Publisher) eventsForwarder {
    var forwarder eventsForwarder
    ttrpcAddress := os.Getenv(ttrpcAddressEnv)
    if ttrpcAddress == "" {
        // Non-containerd runtimes use the log forwarder to write events to the shim log.
        forwarder = &logForwarder{
            s: s,
        }
    } else {
        forwarder = &containerdForwarder{
            s:         s,
            ctx:       ctx,
            publisher: publisher,
        }
    }

    return forwarder
}
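The forwarder selection above hinges entirely on the TTRPC_ADDRESS environment variable. As a rough standalone sketch (not the shim code itself; the socket value below is hypothetical), the rule reduces to:

package main

import (
    "fmt"
    "os"
)

// forwarderTypeFor mirrors the decision made by newEventsForwarder():
// containerd passes TTRPC_ADDRESS to the shim and CRI-O does not, so the
// presence of that variable picks the forwarder implementation.
func forwarderTypeFor() string {
    if os.Getenv("TTRPC_ADDRESS") == "" {
        return "log" // events are only written to the shim log
    }
    return "containerd" // events are published back over ttrpc
}

func main() {
    fmt.Println(forwarderTypeFor()) // "log" when the variable is unset

    os.Setenv("TTRPC_ADDRESS", "/foo/bar.sock") // hypothetical socket path
    fmt.Println(forwarderTypeFor())             // "containerd"
}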
src/runtime/pkg/containerd-shim-v2/event_forwarder_test.go (new file, 45 lines)
@@ -0,0 +1,45 @@
// Copyright (c) 2022 Ant Group
//
// SPDX-License-Identifier: Apache-2.0
//

package containerdshim

import (
    "context"
    "os"
    "testing"

    "github.com/containerd/containerd/events"
    "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/vcmock"

    "github.com/stretchr/testify/assert"
)

func TestNewEventsForwarder(t *testing.T) {
    assert := assert.New(t)

    sandbox := &vcmock.Sandbox{
        MockID: testSandboxID,
    }

    s := &service{
        id:         testSandboxID,
        sandbox:    sandbox,
        containers: make(map[string]*container),
    }

    // newEventsForwarder will not call publisher to publish events,
    // so here we can use a nil pointer to test newEventsForwarder.
    var publisher events.Publisher

    // check log forwarder
    forwarder := s.newEventsForwarder(context.Background(), publisher)
    assert.Equal(forwarderTypeLog, forwarder.forwarderType())

    // check containerd forwarder
    os.Setenv(ttrpcAddressEnv, "/foo/bar.sock")
    defer os.Setenv(ttrpcAddressEnv, "")
    forwarder = s.newEventsForwarder(context.Background(), publisher)
    assert.Equal(forwarderTypeContainerd, forwarder.forwarderType())
}
@@ -17,7 +17,6 @@ import (
 	eventstypes "github.com/containerd/containerd/api/events"
 	"github.com/containerd/containerd/api/types/task"
 	"github.com/containerd/containerd/errdefs"
-	"github.com/containerd/containerd/events"
 	"github.com/containerd/containerd/namespaces"
 	cdruntime "github.com/containerd/containerd/runtime"
 	cdshim "github.com/containerd/containerd/runtime/v2/shim"
@@ -51,10 +50,6 @@ const (
 	chSize      = 128
 	exitCode255 = 255
-
-	// A time span used to wait for publish a containerd event,
-	// once it costs a longer time than timeOut, it will be canceld.
-	timeOut = 5 * time.Second
 )

 var (
@@ -100,7 +95,8 @@ func New(ctx context.Context, id string, publisher cdshim.Publisher, shutdown fu
 	go s.processExits()

-	go s.forward(ctx, publisher)
+	forwarder := s.newEventsForwarder(ctx, publisher)
+	go forwarder.forward()

 	return s, nil
 }
@@ -256,17 +252,6 @@ func (s *service) StartShim(ctx context.Context, opts cdshim.StartOpts) (_ strin
 	return address, nil
 }

-func (s *service) forward(ctx context.Context, publisher events.Publisher) {
-	for e := range s.events {
-		ctx, cancel := context.WithTimeout(ctx, timeOut)
-		err := publisher.Publish(ctx, getTopic(e), e)
-		cancel()
-		if err != nil {
-			shimLog.WithError(err).Error("post event")
-		}
-	}
-}
-
 func (s *service) send(evt interface{}) {
 	// for unit test, it will not initialize s.events
 	if s.events != nil {
@@ -1,3 +1,4 @@
+//go:build !s390x
 // +build !s390x

 // Copyright contributors to the Virtual Machine Manager for Go project
src/runtime/pkg/govmm/vmm_amd64.go (new file, 12 lines)
@@ -0,0 +1,12 @@
//
// Copyright (c) 2018 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//

package govmm

// MaxVCPUs returns the maximum number of vCPUs supported
func MaxVCPUs() uint32 {
    return uint32(240)
}
src/runtime/pkg/govmm/vmm_arm64.go (new file, 25 lines)
@@ -0,0 +1,25 @@
//
// Copyright (c) 2018 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//

package govmm

// In qemu, the maximum number of vCPUs depends on the GIC version, or on how
// many redistributors can fit into the memory map.
// The related code is under github.com/qemu/qemu/hw/arm/virt.c (lines 135 and 1306 in stable-2.11).
// For now qemu only supports v2 and v3; we treat v4 as v3 for backward
// compatibility.
var gicList = map[uint32]uint32{
    uint32(2): uint32(8),
    uint32(3): uint32(123),
    uint32(4): uint32(123),
}

var defaultGICVersion = uint32(3)

// MaxVCPUs returns the maximum number of vCPUs supported
func MaxVCPUs() uint32 {
    return gicList[defaultGICVersion]
}
src/runtime/pkg/govmm/vmm_ppc64le.go (new file, 12 lines)
@@ -0,0 +1,12 @@
//
// Copyright (c) 2018 IBM
//
// SPDX-License-Identifier: Apache-2.0
//

package govmm

// MaxVCPUs returns the maximum number of vCPUs supported
func MaxVCPUs() uint32 {
    return uint32(128)
}
src/runtime/pkg/govmm/vmm_s390x.go (new file, 15 lines)
@@ -0,0 +1,15 @@
//
// Copyright (c) 2018 IBM
//
// SPDX-License-Identifier: Apache-2.0
//

package govmm

// MaxVCPUs returns the maximum number of vCPUs supported
func MaxVCPUs() uint32 {
    // Maximum number of virtual CPUs defined in qemu. See
    // https://github.com/qemu/qemu/blob/80422b00196a7af4c6efb628fae0ad8b644e98af/target/s390x/cpu.h#L55
    // #define S390_MAX_CPUS 248
    return uint32(248)
}
@@ -142,7 +142,7 @@ func (km *KataMonitor) syncSandboxes(sandboxList []string) ([]string, error) {
 	for _, pod := range r.Items {
 		for _, sandbox := range sandboxList {
 			if pod.Id == sandbox {
-				km.sandboxCache.setMetadata(sandbox, sandboxKubeData{
+				km.sandboxCache.setCRIMetadata(sandbox, sandboxCRIMetadata{
 					uid:       pod.Metadata.Uid,
 					name:      pod.Metadata.Name,
 					namespace: pod.Metadata.Namespace,
@@ -151,9 +151,9 @@ func (km *KataMonitor) syncSandboxes(sandboxList []string) ([]string, error) {
 				sandboxList = removeFromSandboxList(sandboxList, sandbox)

 				monitorLog.WithFields(logrus.Fields{
-					"Pod Name":      pod.Metadata.Name,
-					"Pod Namespace": pod.Metadata.Namespace,
-					"Pod UID":       pod.Metadata.Uid,
+					"cri-name":      pod.Metadata.Name,
+					"cri-namespace": pod.Metadata.Namespace,
+					"cri-uid":       pod.Metadata.Uid,
 				}).Debugf("Synced KATA POD %s", pod.Id)

 				break
@@ -160,12 +160,12 @@ func (km *KataMonitor) aggregateSandboxMetrics(encoder expfmt.Encoder) error {

 	// get metrics from sandbox's shim
 	for _, sandboxID := range sandboxes {
-		sandboxMetadata, ok := km.sandboxCache.getMetadata(sandboxID)
+		sandboxMetadata, ok := km.sandboxCache.getCRIMetadata(sandboxID)
 		if !ok { // likely the sandbox has been just removed
 			continue
 		}
 		wg.Add(1)
-		go func(sandboxID string, sandboxMetadata sandboxKubeData, results chan<- []*dto.MetricFamily) {
+		go func(sandboxID string, sandboxMetadata sandboxCRIMetadata, results chan<- []*dto.MetricFamily) {
 			sandboxMetrics, err := getParsedMetrics(sandboxID, sandboxMetadata)
 			if err != nil {
 				monitorLog.WithError(err).WithField("sandbox_id", sandboxID).Errorf("failed to get metrics for sandbox")
@@ -223,7 +223,7 @@ func (km *KataMonitor) aggregateSandboxMetrics(encoder expfmt.Encoder) error {

 }

-func getParsedMetrics(sandboxID string, sandboxMetadata sandboxKubeData) ([]*dto.MetricFamily, error) {
+func getParsedMetrics(sandboxID string, sandboxMetadata sandboxCRIMetadata) ([]*dto.MetricFamily, error) {
 	body, err := doGet(sandboxID, defaultTimeout, "metrics")
 	if err != nil {
 		return nil, err
@@ -244,7 +244,7 @@ func GetSandboxMetrics(sandboxID string) (string, error) {

 // parsePrometheusMetrics will decode metrics from Prometheus text format
 // and return array of *dto.MetricFamily with an ASC order
-func parsePrometheusMetrics(sandboxID string, sandboxMetadata sandboxKubeData, body []byte) ([]*dto.MetricFamily, error) {
+func parsePrometheusMetrics(sandboxID string, sandboxMetadata sandboxCRIMetadata, body []byte) ([]*dto.MetricFamily, error) {
 	reader := bytes.NewReader(body)
 	decoder := expfmt.NewDecoder(reader, expfmt.FmtText)

@@ -268,15 +268,15 @@ func parsePrometheusMetrics(sandboxID string, sandboxMetadata sandboxKubeData, b
 			Value: mutils.String2Pointer(sandboxID),
 		},
 		&dto.LabelPair{
-			Name:  mutils.String2Pointer("kube_uid"),
+			Name:  mutils.String2Pointer("cri_uid"),
 			Value: mutils.String2Pointer(sandboxMetadata.uid),
 		},
 		&dto.LabelPair{
-			Name:  mutils.String2Pointer("kube_name"),
+			Name:  mutils.String2Pointer("cri_name"),
 			Value: mutils.String2Pointer(sandboxMetadata.name),
 		},
 		&dto.LabelPair{
-			Name:  mutils.String2Pointer("kube_namespace"),
+			Name:  mutils.String2Pointer("cri_namespace"),
 			Value: mutils.String2Pointer(sandboxMetadata.namespace),
 		},
 	)
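For illustration, after this relabeling a per-sandbox metric carries its CRI metadata under cri_* label names instead of kube_*. A minimal, self-contained sketch of the resulting label set (hypothetical sandbox values mirroring the test below; it uses the standard protobuf helper rather than the internal mutils package):

package main

import (
    "fmt"

    dto "github.com/prometheus/client_model/go"
    "google.golang.org/protobuf/proto"
)

func main() {
    // Hypothetical sandbox metadata, shown with the label names introduced above.
    labels := []*dto.LabelPair{
        {Name: proto.String("sandbox_id"), Value: proto.String("sandboxID-abc")},
        {Name: proto.String("cri_uid"), Value: proto.String("123")},
        {Name: proto.String("cri_name"), Value: proto.String("pod-name")},
        {Name: proto.String("cri_namespace"), Value: proto.String("pod-namespace")},
    }

    for _, l := range labels {
        fmt.Printf("%s=%q\n", l.GetName(), l.GetValue())
    }
}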
@@ -40,7 +40,7 @@ ttt 999
 func TestParsePrometheusMetrics(t *testing.T) {
 	assert := assert.New(t)
 	sandboxID := "sandboxID-abc"
-	sandboxMetadata := sandboxKubeData{"123", "pod-name", "pod-namespace"}
+	sandboxMetadata := sandboxCRIMetadata{"123", "pod-name", "pod-namespace"}

 	// parse metrics
 	list, err := parsePrometheusMetrics(sandboxID, sandboxMetadata, []byte(shimMetricBody))
@@ -60,12 +60,12 @@ func TestParsePrometheusMetrics(t *testing.T) {
 	assert.Equal(4, len(m.Label), "should have 4 labels")
 	assert.Equal("sandbox_id", *m.Label[0].Name, "label name should be sandbox_id")
 	assert.Equal(sandboxID, *m.Label[0].Value, "label value should be", sandboxID)
-	assert.Equal("kube_uid", *m.Label[1].Name, "label name should be kube_uid")
+	assert.Equal("cri_uid", *m.Label[1].Name, "label name should be cri_uid")
 	assert.Equal(sandboxMetadata.uid, *m.Label[1].Value, "label value should be", sandboxMetadata.uid)

-	assert.Equal("kube_name", *m.Label[2].Name, "label name should be kube_name")
+	assert.Equal("cri_name", *m.Label[2].Name, "label name should be cri_name")
 	assert.Equal(sandboxMetadata.name, *m.Label[2].Value, "label value should be", sandboxMetadata.name)
-	assert.Equal("kube_namespace", *m.Label[3].Name, "label name should be kube_namespace")
+	assert.Equal("cri_namespace", *m.Label[3].Name, "label name should be cri_namespace")
 	assert.Equal(sandboxMetadata.namespace, *m.Label[3].Value, "label value should be", sandboxMetadata.namespace)

 	summary := m.Summary
@@ -53,7 +53,7 @@ func NewKataMonitor(runtimeEndpoint string) (*KataMonitor, error) {
 		runtimeEndpoint: runtimeEndpoint,
 		sandboxCache: &sandboxCache{
 			Mutex:     &sync.Mutex{},
-			sandboxes: make(map[string]sandboxKubeData),
+			sandboxes: make(map[string]sandboxCRIMetadata),
 		},
 	}

@@ -104,10 +104,14 @@ func (km *KataMonitor) startPodCacheUpdater() {
 		monitorLog.WithError(err).Fatal("cannot read sandboxes fs")
 		os.Exit(1)
 	}
+	for _, sandbox := range sandboxList {
+		km.sandboxCache.putIfNotExists(sandbox, sandboxCRIMetadata{})
+	}
+
 	monitorLog.Debug("initial sync of sbs directory completed")
 	monitorLog.Tracef("pod list from sbs: %v", sandboxList)

-	// We should get kubernetes metadata from the container manager for each new kata sandbox we detect.
+	// We try to get CRI (kubernetes) metadata from the container manager for each new kata sandbox we detect.
 	// It may take a while for data to be available, so we always wait podCacheRefreshDelaySeconds before checking.
 	cacheUpdateTimer := time.NewTimer(podCacheRefreshDelaySeconds * time.Second)
 	cacheUpdateTimerIsSet := true
@@ -123,7 +127,7 @@ func (km *KataMonitor) startPodCacheUpdater() {
 			case fsnotify.Create:
 				splitPath := strings.Split(event.Name, string(os.PathSeparator))
 				id := splitPath[len(splitPath)-1]
-				if !km.sandboxCache.putIfNotExists(id, sandboxKubeData{}) {
+				if !km.sandboxCache.putIfNotExists(id, sandboxCRIMetadata{}) {
 					monitorLog.WithField("pod", id).Warn(
 						"CREATE event but pod already present in the sandbox cache")
 				}
@@ -9,15 +9,15 @@ import (
 	"sync"
 )

-type sandboxKubeData struct {
+type sandboxCRIMetadata struct {
 	uid       string
 	name      string
 	namespace string
 }
 type sandboxCache struct {
 	*sync.Mutex
-	// the sandboxKubeData links the sandbox id from the container manager to the pod metadata of kubernetes
-	sandboxes map[string]sandboxKubeData
+	// the sandboxCRIMetadata links the sandbox id from the container manager to the pod metadata of kubernetes
+	sandboxes map[string]sandboxCRIMetadata
 }

 func (sc *sandboxCache) getSandboxList() []string {
@@ -43,7 +43,7 @@ func (sc *sandboxCache) deleteIfExists(id string) bool {
 	return false
 }

-func (sc *sandboxCache) putIfNotExists(id string, value sandboxKubeData) bool {
+func (sc *sandboxCache) putIfNotExists(id string, value sandboxCRIMetadata) bool {
 	sc.Lock()
 	defer sc.Unlock()

@@ -56,14 +56,14 @@ func (sc *sandboxCache) putIfNotExists(id string, value sandboxKubeData) bool {
 	return false
 }

-func (sc *sandboxCache) setMetadata(id string, value sandboxKubeData) {
+func (sc *sandboxCache) setCRIMetadata(id string, value sandboxCRIMetadata) {
 	sc.Lock()
 	defer sc.Unlock()

 	sc.sandboxes[id] = value
 }

-func (sc *sandboxCache) getMetadata(id string) (sandboxKubeData, bool) {
+func (sc *sandboxCache) getCRIMetadata(id string) (sandboxCRIMetadata, bool) {
 	sc.Lock()
 	defer sc.Unlock()

@@ -16,19 +16,19 @@ func TestSandboxCache(t *testing.T) {
 	assert := assert.New(t)
 	sc := &sandboxCache{
 		Mutex:     &sync.Mutex{},
-		sandboxes: map[string]sandboxKubeData{"111": {"1-2-3", "test-name", "test-namespace"}},
+		sandboxes: map[string]sandboxCRIMetadata{"111": {"1-2-3", "test-name", "test-namespace"}},
 	}

 	assert.Equal(1, len(sc.getSandboxList()))

 	// put new item
 	id := "new-id"
-	b := sc.putIfNotExists(id, sandboxKubeData{})
+	b := sc.putIfNotExists(id, sandboxCRIMetadata{})
 	assert.Equal(true, b)
 	assert.Equal(2, len(sc.getSandboxList()))

 	// put key that already exists
-	b = sc.putIfNotExists(id, sandboxKubeData{})
+	b = sc.putIfNotExists(id, sandboxCRIMetadata{})
 	assert.Equal(false, b)

 	b = sc.deleteIfExists(id)
@@ -15,6 +15,7 @@ import (
 	"strings"

 	"github.com/BurntSushi/toml"
+	"github.com/kata-containers/kata-containers/src/runtime/pkg/govmm"
 	govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
 	"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
 	"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
@@ -135,6 +136,7 @@ type hypervisor struct {
 	GuestSwap      bool `toml:"enable_guest_swap"`
 	Rootless       bool `toml:"rootless"`
 	DisableSeccomp bool `toml:"disable_seccomp"`
+	DisableSeLinux bool `toml:"disable_selinux"`
 }

 type runtime struct {
@@ -343,7 +345,7 @@ func (h hypervisor) defaultVCPUs() uint32 {

 func (h hypervisor) defaultMaxVCPUs() uint32 {
 	numcpus := uint32(goruntime.NumCPU())
-	maxvcpus := vc.MaxQemuVCPUs()
+	maxvcpus := govmm.MaxVCPUs()
 	reqVCPUs := h.DefaultMaxVCPUs

 	//don't exceed the number of physical CPUs. If a default is not provided, use the
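The truncated comment above describes a clamping rule: the effective maximum vCPU count never exceeds the number of physical CPUs and never exceeds what the hypervisor supports, now queried through govmm.MaxVCPUs() instead of vc.MaxQemuVCPUs(). A rough standalone sketch of that idea (hypervisorMaxVCPUs is a stand-in for the architecture-dependent govmm value, not the real defaultMaxVCPUs implementation):

package main

import (
    "fmt"
    "runtime"
)

// Stand-in for govmm.MaxVCPUs(); the real value depends on the
// architecture (see the vmm_*.go files added above).
const hypervisorMaxVCPUs = uint32(240)

// clampMaxVCPUs sketches the clamping behind defaultMaxVCPUs(): fall back
// to the host CPU count when no value is requested, and never exceed
// either the host CPU count or the hypervisor limit.
func clampMaxVCPUs(requested uint32) uint32 {
    numCPUs := uint32(runtime.NumCPU())
    max := requested
    if max == 0 || max > numCPUs {
        max = numCPUs
    }
    if max > hypervisorMaxVCPUs {
        max = hypervisorMaxVCPUs
    }
    return max
}

func main() {
    fmt.Println(clampMaxVCPUs(0))   // defaults to the host CPU count
    fmt.Println(clampMaxVCPUs(512)) // clamped to the host/hypervisor limit
}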
@@ -876,6 +878,8 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
 		SGXEPCSize:        defaultSGXEPCSize,
 		EnableAnnotations: h.EnableAnnotations,
 		DisableSeccomp:    h.DisableSeccomp,
+		ConfidentialGuest: h.ConfidentialGuest,
+		DisableSeLinux:    h.DisableSeLinux,
 	}, nil
 }

@@ -17,6 +17,7 @@ import (
 	"syscall"
 	"testing"

+	"github.com/kata-containers/kata-containers/src/runtime/pkg/govmm"
 	ktu "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils"
 	"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
 	vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
@@ -919,7 +920,7 @@ func TestHypervisorDefaults(t *testing.T) {
 	h.DefaultMaxVCPUs = numCPUs + 1
 	assert.Equal(h.defaultMaxVCPUs(), numCPUs, "default max vCPU number is wrong")

-	maxvcpus := vc.MaxQemuVCPUs()
+	maxvcpus := govmm.MaxVCPUs()
 	h.DefaultMaxVCPUs = maxvcpus + 1
 	assert.Equal(h.defaultMaxVCPUs(), numCPUs, "default max vCPU number is wrong")

@@ -155,11 +155,19 @@ func CreateSandbox(ctx context.Context, vci vc.VC, ociSpec specs.Spec, runtimeCo
 		}
 	}()

-	// Run pre-start OCI hooks.
-	err = EnterNetNS(sandboxConfig.NetworkConfig.NetworkID, func() error {
-		return PreStartHooks(ctx, ociSpec, containerID, bundlePath)
-	})
-	if err != nil {
+	if ociSpec.Annotations == nil {
+		ociSpec.Annotations = make(map[string]string)
+	}
+	ociSpec.Annotations["nerdctl/network-namespace"] = sandboxConfig.NetworkConfig.NetworkID
+	sandboxConfig.Annotations["nerdctl/network-namespace"] = ociSpec.Annotations["nerdctl/network-namespace"]
+
+	// Run pre-start OCI hooks, in the runtime namespace.
+	if err := PreStartHooks(ctx, ociSpec, containerID, bundlePath); err != nil {
+		return nil, vc.Process{}, err
+	}
+
+	// Run create runtime OCI hooks, in the runtime namespace.
+	if err := CreateRuntimeHooks(ctx, ociSpec, containerID, bundlePath); err != nil {
+		return nil, vc.Process{}, err
+	}

@@ -264,6 +264,46 @@ func TestCreateSandboxFail(t *testing.T) {
 	assert.True(vcmock.IsMockError(err))
 }

+func TestCreateSandboxAnnotations(t *testing.T) {
+	if tc.NotValid(ktu.NeedRoot()) {
+		t.Skip(ktu.TestDisabledNeedRoot)
+	}
+
+	assert := assert.New(t)
+
+	tmpdir, bundlePath, _ := ktu.SetupOCIConfigFile(t)
+	defer os.RemoveAll(tmpdir)
+
+	runtimeConfig, err := newTestRuntimeConfig(tmpdir, testConsole, true)
+	assert.NoError(err)
+
+	spec, err := compatoci.ParseConfigJSON(bundlePath)
+	assert.NoError(err)
+
+	rootFs := vc.RootFs{Mounted: true}
+
+	testingImpl.CreateSandboxFunc = func(ctx context.Context, sandboxConfig vc.SandboxConfig) (vc.VCSandbox, error) {
+		return &vcmock.Sandbox{
+			MockID: testSandboxID,
+			MockContainers: []*vcmock.Container{
+				{MockID: testContainerID},
+			},
+			MockAnnotations: sandboxConfig.Annotations,
+		}, nil
+	}
+
+	defer func() {
+		testingImpl.CreateSandboxFunc = nil
+	}()
+
+	sandbox, _, err := CreateSandbox(context.Background(), testingImpl, spec, runtimeConfig, rootFs, testContainerID, bundlePath, testConsole, true, true)
+	assert.NoError(err)
+
+	netNsPath, err := sandbox.Annotations("nerdctl/network-namespace")
+	assert.NoError(err)
+	assert.Equal(path.Dir(netNsPath), "/var/run/netns")
+}
+
 func TestCheckForFips(t *testing.T) {
 	assert := assert.New(t)

@@ -16,6 +16,7 @@ import (
 	"time"

 	"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
+	syscallWrapper "github.com/kata-containers/kata-containers/src/runtime/pkg/syscall"
 	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/sirupsen/logrus"
 )
@@ -32,15 +33,16 @@ func hookLogger() *logrus.Entry {
 	return kataUtilsLogger.WithField("subsystem", "hook")
 }

-func runHook(ctx context.Context, hook specs.Hook, cid, bundlePath string) error {
+func runHook(ctx context.Context, spec specs.Spec, hook specs.Hook, cid, bundlePath string) error {
 	span, _ := katatrace.Trace(ctx, hookLogger(), "runHook", hookTracingTags)
 	defer span.End()
 	katatrace.AddTags(span, "path", hook.Path, "args", hook.Args)

 	state := specs.State{
-		Pid:    syscall.Gettid(),
-		Bundle: bundlePath,
-		ID:     cid,
+		Pid:         syscallWrapper.Gettid(),
+		Bundle:      bundlePath,
+		ID:          cid,
+		Annotations: spec.Annotations,
 	}

 	stateJSON, err := json.Marshal(state)
@@ -90,13 +92,13 @@ func runHook(ctx context.Context, hook specs.Hook, cid, bundlePath string) error
 	return nil
 }

-func runHooks(ctx context.Context, hooks []specs.Hook, cid, bundlePath, hookType string) error {
+func runHooks(ctx context.Context, spec specs.Spec, hooks []specs.Hook, cid, bundlePath, hookType string) error {
 	span, ctx := katatrace.Trace(ctx, hookLogger(), "runHooks", hookTracingTags)
 	katatrace.AddTags(span, "type", hookType)
 	defer span.End()

 	for _, hook := range hooks {
-		if err := runHook(ctx, hook, cid, bundlePath); err != nil {
+		if err := runHook(ctx, spec, hook, cid, bundlePath); err != nil {
 			hookLogger().WithFields(logrus.Fields{
 				"hook-type": hookType,
 				"error":     err,
@@ -109,6 +111,15 @@ func runHooks(ctx context.Context, hooks []specs.Hook, cid, bundlePath, hookType
 	return nil
 }

+func CreateRuntimeHooks(ctx context.Context, spec specs.Spec, cid, bundlePath string) error {
+	// If no hook available, nothing needs to be done.
+	if spec.Hooks == nil {
+		return nil
+	}
+
+	return runHooks(ctx, spec, spec.Hooks.CreateRuntime, cid, bundlePath, "createRuntime")
+}
+
 // PreStartHooks run the hooks before start container
 func PreStartHooks(ctx context.Context, spec specs.Spec, cid, bundlePath string) error {
 	// If no hook available, nothing needs to be done.
@@ -116,7 +127,7 @@ func PreStartHooks(ctx context.Context, spec specs.Spec, cid, bundlePath string)
 		return nil
 	}

-	return runHooks(ctx, spec.Hooks.Prestart, cid, bundlePath, "pre-start")
+	return runHooks(ctx, spec, spec.Hooks.Prestart, cid, bundlePath, "pre-start")
 }

 // PostStartHooks run the hooks just after start container
@@ -126,7 +137,7 @@ func PostStartHooks(ctx context.Context, spec specs.Spec, cid, bundlePath string
 		return nil
 	}

-	return runHooks(ctx, spec.Hooks.Poststart, cid, bundlePath, "post-start")
+	return runHooks(ctx, spec, spec.Hooks.Poststart, cid, bundlePath, "post-start")
 }

 // PostStopHooks run the hooks after stop container
@@ -136,5 +147,5 @@ func PostStopHooks(ctx context.Context, spec specs.Spec, cid, bundlePath string)
 		return nil
 	}

-	return runHooks(ctx, spec.Hooks.Poststop, cid, bundlePath, "post-stop")
+	return runHooks(ctx, spec, spec.Hooks.Poststop, cid, bundlePath, "post-stop")
 }
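With this change, runHook now forwards the OCI spec annotations to the hook binary through the specs.State payload written to the hook's stdin. A standalone sketch of the JSON a hook would now receive (all values below are hypothetical, only the annotation key comes from the change above):

package main

import (
    "encoding/json"
    "fmt"
    "os"

    "github.com/opencontainers/runtime-spec/specs-go"
)

func main() {
    // Hypothetical values; in the runtime, cid, bundlePath and the spec
    // annotations come from the container being created.
    state := specs.State{
        Pid:         os.Getpid(),
        Bundle:      "/run/containerd/io.containerd.runtime.v2.task/default/abc/bundle",
        ID:          "abc",
        Annotations: map[string]string{"nerdctl/network-namespace": "/var/run/netns/nerdctl-1"},
    }

    stateJSON, err := json.Marshal(state)
    if err != nil {
        panic(err)
    }
    // The hook binary reads this JSON from stdin and can now see the
    // annotations as well.
    fmt.Println(string(stateJSON))
}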
@@ -57,26 +57,27 @@ func TestRunHook(t *testing.T) {
 	assert := assert.New(t)

 	ctx := context.Background()
+	spec := specs.Spec{}

 	// Run with timeout 0
 	hook := createHook(0)
-	err := runHook(ctx, hook, testSandboxID, testBundlePath)
+	err := runHook(ctx, spec, hook, testSandboxID, testBundlePath)
 	assert.NoError(err)

 	// Run with timeout 1
 	hook = createHook(1)
-	err = runHook(ctx, hook, testSandboxID, testBundlePath)
+	err = runHook(ctx, spec, hook, testSandboxID, testBundlePath)
 	assert.NoError(err)

 	// Run timeout failure
 	hook = createHook(1)
 	hook.Args = append(hook.Args, "2")
-	err = runHook(ctx, hook, testSandboxID, testBundlePath)
+	err = runHook(ctx, spec, hook, testSandboxID, testBundlePath)
 	assert.Error(err)

 	// Failure due to wrong hook
 	hook = createWrongHook()
-	err = runHook(ctx, hook, testSandboxID, testBundlePath)
+	err = runHook(ctx, spec, hook, testSandboxID, testBundlePath)
 	assert.Error(err)
 }

src/runtime/pkg/katautils/network_darwin.go (new file, 22 lines)
@@ -0,0 +1,22 @@
// Copyright (c) 2022 Apple Inc.
//
// SPDX-License-Identifier: Apache-2.0
//

package katautils

import (
    vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
)

func EnterNetNS(networkID string, cb func() error) error {
    return nil
}

func SetupNetworkNamespace(config *vc.NetworkConfig) error {
    return nil
}

func cleanupNetNS(netNSPath string) error {
    return nil
}
@@ -23,6 +23,7 @@ import (
 	"github.com/sirupsen/logrus"
 	"k8s.io/apimachinery/pkg/api/resource"

+	"github.com/kata-containers/kata-containers/src/runtime/pkg/govmm"
 	vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"

 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/device/config"
@@ -640,8 +641,8 @@ func addHypervisorCPUOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig) e
 		return fmt.Errorf("Number of cpus %d in annotation default_maxvcpus is greater than the number of CPUs %d on the system", max, numCPUs)
 	}

-	if sbConfig.HypervisorType == vc.QemuHypervisor && max > vc.MaxQemuVCPUs() {
-		return fmt.Errorf("Number of cpus %d in annotation default_maxvcpus is greater than max no of CPUs %d supported for qemu", max, vc.MaxQemuVCPUs())
+	if sbConfig.HypervisorType == vc.QemuHypervisor && max > govmm.MaxVCPUs() {
+		return fmt.Errorf("Number of cpus %d in annotation default_maxvcpus is greater than max no of CPUs %d supported for qemu", max, govmm.MaxVCPUs())
 	}
 	sbConfig.HypervisorConfig.DefaultMaxVCPUs = max
 	return nil
src/runtime/pkg/resourcecontrol/cgroups.go (new file, 333 lines)
@@ -0,0 +1,333 @@
//go:build linux
// +build linux

// Copyright (c) 2021-2022 Apple Inc.
//
// SPDX-License-Identifier: Apache-2.0
//

package resourcecontrol

import (
    "fmt"
    "os"
    "path/filepath"
    "sync"

    "github.com/containerd/cgroups"
    v1 "github.com/containerd/cgroups/stats/v1"
    "github.com/opencontainers/runtime-spec/specs-go"
    "github.com/sirupsen/logrus"
)

// CgroupKataPrefix is prepended to the OCI cgroup path to form a different
// cgroup path, so that cAdvisor cannot find the kata containers cgroup path
// on the host and is prevented from grabbing the stats data.
const CgroupKataPrefix = "kata"

func RenameCgroupPath(path string) (string, error) {
    if path == "" {
        path = DefaultResourceControllerID
    }

    cgroupPathDir := filepath.Dir(path)
    cgroupPathName := fmt.Sprintf("%s_%s", CgroupKataPrefix, filepath.Base(path))
    return filepath.Join(cgroupPathDir, cgroupPathName), nil
}

type LinuxCgroup struct {
    cgroup  cgroups.Cgroup
    path    string
    cpusets *specs.LinuxCPU
    devices []specs.LinuxDeviceCgroup

    sync.Mutex
}

func sandboxDevices() []specs.LinuxDeviceCgroup {
    devices := []specs.LinuxDeviceCgroup{}

    defaultDevices := []string{
        "/dev/null",
        "/dev/random",
        "/dev/full",
        "/dev/tty",
        "/dev/zero",
        "/dev/urandom",
        "/dev/console",
    }

    // Processes running in a device cgroup are constrained: they have access
    // only to the devices listed in the devices.list file.
    // In order to run virtual machines and create virtqueues, hypervisors
    // need access to certain character devices in the host, like kvm and vhost-net.
    hypervisorDevices := []string{
        "/dev/kvm",         // To run virtual machines
        "/dev/vhost-net",   // To create virtqueues
        "/dev/vfio/vfio",   // To access VFIO devices
        "/dev/vhost-vsock", // To interact with vsock if
    }

    defaultDevices = append(defaultDevices, hypervisorDevices...)

    for _, device := range defaultDevices {
        ldevice, err := DeviceToLinuxDevice(device)
        if err != nil {
            controllerLogger.WithField("source", "cgroups").Warnf("Could not add %s to the devices cgroup", device)
            continue
        }
        devices = append(devices, ldevice)
    }

    wildcardMajor := int64(-1)
    wildcardMinor := int64(-1)
    ptsMajor := int64(136)
    tunMajor := int64(10)
    tunMinor := int64(200)

    wildcardDevices := []specs.LinuxDeviceCgroup{
        // allow mknod for any device
        {
            Allow:  true,
            Type:   "c",
            Major:  &wildcardMajor,
            Minor:  &wildcardMinor,
            Access: "m",
        },
        {
            Allow:  true,
            Type:   "b",
            Major:  &wildcardMajor,
            Minor:  &wildcardMinor,
            Access: "m",
        },
        // /dev/pts/ - pts namespaces are "coming soon"
        {
            Allow:  true,
            Type:   "c",
            Major:  &ptsMajor,
            Minor:  &wildcardMinor,
            Access: "rwm",
        },
        // tuntap
        {
            Allow:  true,
            Type:   "c",
            Major:  &tunMajor,
            Minor:  &tunMinor,
            Access: "rwm",
        },
    }

    devices = append(devices, wildcardDevices...)

    return devices
}

func NewResourceController(path string, resources *specs.LinuxResources) (ResourceController, error) {
    var err error

    cgroupPath, err := ValidCgroupPath(path, IsSystemdCgroup(path))
    if err != nil {
        return nil, err
    }

    cgroup, err := cgroups.New(cgroups.V1, cgroups.StaticPath(cgroupPath), resources)
    if err != nil {
        return nil, err
    }

    return &LinuxCgroup{
        path:    cgroupPath,
        devices: resources.Devices,
        cpusets: resources.CPU,
        cgroup:  cgroup,
    }, nil
}

func NewSandboxResourceController(path string, resources *specs.LinuxResources, sandboxCgroupOnly bool) (ResourceController, error) {
    var cgroup cgroups.Cgroup
    sandboxResources := *resources
    sandboxResources.Devices = append(sandboxResources.Devices, sandboxDevices()...)

    // Currently we only know how to handle a systemd cgroup path when it is the only cgroup (no overhead group).
    // Hence, if sandboxCgroupOnly is not true, we treat it as a cgroupfs path as before, although it may be incorrect.
    if !IsSystemdCgroup(path) || !sandboxCgroupOnly {
        return NewResourceController(path, &sandboxResources)
    }

    slice, unit, err := getSliceAndUnit(path)
    if err != nil {
        return nil, err
    }

    // github.com/containerd/cgroups doesn't support creating a scope unit with
    // v1 cgroups against systemd; the following interacts directly with systemd
    // to create the cgroup and then loads it using containerd's API.
    err = createCgroupsSystemd(slice, unit, uint32(os.Getpid())) // adding the runtime process makes calling setupCgroups redundant
    if err != nil {
        return nil, err
    }

    cgHierarchy, cgPath, err := cgroupHierarchy(path)
    if err != nil {
        return nil, err
    }

    // load created cgroup and update it with the resources
    if cgroup, err = cgroups.Load(cgHierarchy, cgPath); err == nil {
        err = cgroup.Update(&sandboxResources)
    }

    if err != nil {
        return nil, err
    }

    return &LinuxCgroup{
        path:    path,
        devices: sandboxResources.Devices,
        cpusets: sandboxResources.CPU,
        cgroup:  cgroup,
    }, nil
}

func LoadResourceController(path string) (ResourceController, error) {
    cgHierarchy, cgPath, err := cgroupHierarchy(path)
    if err != nil {
        return nil, err
    }

    cgroup, err := cgroups.Load(cgHierarchy, cgPath)
    if err != nil {
        return nil, err
    }

    return &LinuxCgroup{
        path:   path,
        cgroup: cgroup,
    }, nil
}

func (c *LinuxCgroup) Logger() *logrus.Entry {
    return controllerLogger.WithField("source", "cgroups")
}

func (c *LinuxCgroup) Delete() error {
    return c.cgroup.Delete()
}

func (c *LinuxCgroup) Stat() (*v1.Metrics, error) {
    return c.cgroup.Stat(cgroups.ErrorHandler(cgroups.IgnoreNotExist))
}

func (c *LinuxCgroup) AddProcess(pid int, subsystems ...string) error {
    return c.cgroup.Add(cgroups.Process{Pid: pid})
}

func (c *LinuxCgroup) AddThread(pid int, subsystems ...string) error {
    return c.cgroup.AddTask(cgroups.Process{Pid: pid})
}

func (c *LinuxCgroup) Update(resources *specs.LinuxResources) error {
    return c.cgroup.Update(resources)
}

func (c *LinuxCgroup) MoveTo(path string) error {
    cgHierarchy, cgPath, err := cgroupHierarchy(path)
    if err != nil {
        return err
    }

    newCgroup, err := cgroups.Load(cgHierarchy, cgPath)
    if err != nil {
        return err
    }

    return c.cgroup.MoveTo(newCgroup)
}

func (c *LinuxCgroup) AddDevice(deviceHostPath string) error {
    deviceResource, err := DeviceToLinuxDevice(deviceHostPath)
    if err != nil {
        return err
    }

    c.Lock()
    defer c.Unlock()

    c.devices = append(c.devices, deviceResource)

    if err := c.cgroup.Update(&specs.LinuxResources{
        Devices: c.devices,
    }); err != nil {
        return err
    }

    return nil
}

func (c *LinuxCgroup) RemoveDevice(deviceHostPath string) error {
    deviceResource, err := DeviceToLinuxDevice(deviceHostPath)
    if err != nil {
        return err
    }

    c.Lock()
    defer c.Unlock()

    for i, d := range c.devices {
        if d.Type == deviceResource.Type &&
            d.Major == deviceResource.Major &&
            d.Minor == deviceResource.Minor {
            c.devices = append(c.devices[:i], c.devices[i+1:]...)
        }
    }

    if err := c.cgroup.Update(&specs.LinuxResources{
        Devices: c.devices,
    }); err != nil {
        return err
    }

    return nil
}

func (c *LinuxCgroup) UpdateCpuSet(cpuset, memset string) error {
    c.Lock()
    defer c.Unlock()

    if len(cpuset) > 0 {
        // If we didn't have a cpuset defined, create one now:
        if c.cpusets == nil {
            c.cpusets = &specs.LinuxCPU{}
        }

        c.cpusets.Cpus = cpuset
    }

    if len(memset) > 0 {
        // If we didn't have a cpuset defined, create one now:
        if c.cpusets == nil {
            c.cpusets = &specs.LinuxCPU{}
        }

        c.cpusets.Mems = memset
    }

    return c.cgroup.Update(&specs.LinuxResources{
        CPU: c.cpusets,
    })
}

func (c *LinuxCgroup) Type() ResourceControllerType {
    return LinuxCgroups
}

func (c *LinuxCgroup) ID() string {
    return c.path
}

func (c *LinuxCgroup) Parent() string {
    return filepath.Dir(c.path)
}
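RenameCgroupPath only rewrites the last path element. A minimal standalone sketch of the renaming (the sandbox path below is hypothetical; this is not the package function itself, which also handles the empty-path default):

package main

import (
    "fmt"
    "path/filepath"
)

// renameCgroupPath sketches the prefixing done by RenameCgroupPath: keep
// the parent directory and prefix the leaf with "kata_", so tools scanning
// the original path (e.g. cAdvisor) do not pick up the kata cgroup.
func renameCgroupPath(path string) string {
    dir := filepath.Dir(path)
    name := fmt.Sprintf("kata_%s", filepath.Base(path))
    return filepath.Join(dir, name)
}

func main() {
    // Hypothetical sandbox cgroup path.
    fmt.Println(renameCgroupPath("/kubepods/besteffort/pod1234/sandbox-abc"))
    // Output: /kubepods/besteffort/pod1234/kata_sandbox-abc
}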
src/runtime/pkg/resourcecontrol/controller.go (new file, 82 lines)
@@ -0,0 +1,82 @@
// Copyright (c) 2021 Apple Inc.
//
// SPDX-License-Identifier: Apache-2.0
//

package resourcecontrol

import (
    v1 "github.com/containerd/cgroups/stats/v1"
    "github.com/opencontainers/runtime-spec/specs-go"
    "github.com/sirupsen/logrus"
)

var (
    controllerLogger = logrus.WithField("source", "virtcontainers/pkg/resourcecontrol")
)

// SetLogger sets up a logger for this pkg
func SetLogger(logger *logrus.Entry) {
    fields := controllerLogger.Data

    controllerLogger = logger.WithFields(fields)
}

// ResourceControllerType describes a resource controller type.
type ResourceControllerType string

const (
    LinuxCgroups ResourceControllerType = "cgroups"
)

// String converts a resource controller type to a string.
func (rType *ResourceControllerType) String() string {
    switch *rType {
    case LinuxCgroups:
        return string(LinuxCgroups)
    default:
        return "Unknown controller type"
    }
}

// ResourceController represents a system resources controller.
// On Linux this interface is implemented through the cgroups API.
type ResourceController interface {
    // Type returns the resource controller implementation type.
    Type() ResourceControllerType

    // ID returns the controller identifier, e.g. a Linux cgroups path.
    ID() string

    // Parent returns the parent controller, for hierarchically
    // defined resources (e.g. Linux cgroups).
    Parent() string

    // Delete deletes the controller.
    Delete() error

    // Stat returns the statistics for the controller.
    Stat() (*v1.Metrics, error)

    // AddProcess adds a process to a set of controllers.
    AddProcess(int, ...string) error

    // AddThread adds a process thread to a set of controllers.
    AddThread(int, ...string) error

    // Update updates the set of resources controlled, based on
    // an OCI resources description.
    Update(*specs.LinuxResources) error

    // MoveTo moves a controller to another one.
    MoveTo(string) error

    // AddDevice adds a device resource to the controller.
    AddDevice(string) error

    // RemoveDevice removes a device resource from the controller.
    RemoveDevice(string) error

    // UpdateCpuSet updates the set of controlled CPUs and memory nodes.
    UpdateCpuSet(string, string) error
}
src/runtime/pkg/resourcecontrol/utils.go (new file, 77 lines)
@@ -0,0 +1,77 @@
// Copyright (c) 2020 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//

package resourcecontrol

import (
    "fmt"
    "strings"

    "github.com/opencontainers/runc/libcontainer/devices"
    "github.com/opencontainers/runtime-spec/specs-go"
    "golang.org/x/sys/unix"
)

func DeviceToCgroupDeviceRule(device string) (*devices.Rule, error) {
    var st unix.Stat_t
    deviceRule := devices.Rule{
        Allow:       true,
        Permissions: "rwm",
    }

    if err := unix.Stat(device, &st); err != nil {
        return nil, err
    }

    devType := st.Mode & unix.S_IFMT

    switch devType {
    case unix.S_IFCHR:
        deviceRule.Type = 'c'
    case unix.S_IFBLK:
        deviceRule.Type = 'b'
    default:
        return nil, fmt.Errorf("unsupported device type: %v", devType)
    }

    major := int64(unix.Major(uint64(st.Rdev)))
    minor := int64(unix.Minor(uint64(st.Rdev)))
    deviceRule.Major = major
    deviceRule.Minor = minor

    return &deviceRule, nil
}

func DeviceToLinuxDevice(device string) (specs.LinuxDeviceCgroup, error) {
    dev, err := DeviceToCgroupDeviceRule(device)
    if err != nil {
        return specs.LinuxDeviceCgroup{}, err
    }

    return specs.LinuxDeviceCgroup{
        Allow:  dev.Allow,
        Type:   string(dev.Type),
        Major:  &dev.Major,
        Minor:  &dev.Minor,
        Access: string(dev.Permissions),
    }, nil
}

func IsSystemdCgroup(cgroupPath string) bool {
    // If we are utilizing systemd to manage cgroups, we expect to receive a path
    // in the format slice:scopeprefix:name. A typical example would be:
    //
    // system.slice:docker:6b4c4a4d0cc2a12c529dcb13a2b8e438dfb3b2a6af34d548d7d
    //
    // Based on this, let's split by the ':' delimiter and verify that the first
    // section has .slice as a suffix.
    parts := strings.Split(cgroupPath, ":")
    if len(parts) == 3 && strings.HasSuffix(parts[0], ".slice") {
        return true
    }

    return false
}
src/runtime/pkg/resourcecontrol/utils_linux.go (new file, 114 lines)
@@ -0,0 +1,114 @@
// Copyright (c) 2020 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//

package resourcecontrol

import (
    "context"
    "fmt"
    "path/filepath"
    "strings"

    "github.com/containerd/cgroups"
    systemdDbus "github.com/coreos/go-systemd/v22/dbus"
    "github.com/godbus/dbus/v5"
    "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
)

// DefaultResourceControllerID is the runtime-determined location in the cgroups hierarchy.
const DefaultResourceControllerID = "/vc"

// ValidCgroupPath returns a valid cgroup path.
// see https://github.com/opencontainers/runtime-spec/blob/master/config-linux.md#cgroups-path
func ValidCgroupPath(path string, systemdCgroup bool) (string, error) {
    if IsSystemdCgroup(path) {
        return path, nil
    }

    if systemdCgroup {
        return "", fmt.Errorf("malformed systemd path '%v': expected to be of form 'slice:prefix:name'", path)
    }

    // In the case of an absolute path (starting with /), the runtime MUST
    // take the path to be relative to the cgroups mount point.
    if filepath.IsAbs(path) {
        return filepath.Clean(path), nil
    }

    // In the case of a relative path (not starting with /), the runtime MAY
    // interpret the path relative to a runtime-determined location in the cgroups hierarchy.
    // Clean up the path and return a new path relative to DefaultResourceControllerID.
    return filepath.Join(DefaultResourceControllerID, filepath.Clean("/"+path)), nil
}

func newProperty(name string, units interface{}) systemdDbus.Property {
    return systemdDbus.Property{
        Name:  name,
        Value: dbus.MakeVariant(units),
    }
}

func cgroupHierarchy(path string) (cgroups.Hierarchy, cgroups.Path, error) {
    if !IsSystemdCgroup(path) {
        return cgroups.V1, cgroups.StaticPath(path), nil
    } else {
        slice, unit, err := getSliceAndUnit(path)
        if err != nil {
            return nil, nil, err
        }

        cgroupSlicePath, err := systemd.ExpandSlice(slice)
        if err != nil {
            return nil, nil, err
        }

        return cgroups.Systemd, cgroups.Slice(cgroupSlicePath, unit), nil
    }
}

func createCgroupsSystemd(slice string, unit string, pid uint32) error {
    ctx := context.TODO()
    conn, err := systemdDbus.NewWithContext(ctx)
    if err != nil {
        return err
    }
    defer conn.Close()

    properties := []systemdDbus.Property{
        systemdDbus.PropDescription("cgroup " + unit),
        newProperty("DefaultDependencies", false),
        newProperty("MemoryAccounting", true),
        newProperty("CPUAccounting", true),
        newProperty("IOAccounting", true),
    }

    // https://github.com/opencontainers/runc/blob/master/docs/systemd.md
    if strings.HasSuffix(unit, ".scope") {
        // It's a scope, which we put into a Slice=.
        properties = append(properties, systemdDbus.PropSlice(slice))
        properties = append(properties, newProperty("Delegate", true))
        properties = append(properties, systemdDbus.PropPids(pid))
    } else {
        return fmt.Errorf("Failed to create cgroups with systemd: unit %s is not a scope", unit)
    }

    ch := make(chan string)
    // https://www.freedesktop.org/wiki/Software/systemd/ControlGroupInterface/
    _, err = conn.StartTransientUnitContext(ctx, unit, "replace", properties, ch)
    if err != nil {
        return err
    }
    <-ch
    return nil
}

func getSliceAndUnit(cgroupPath string) (string, string, error) {
    parts := strings.Split(cgroupPath, ":")
    if len(parts) == 3 && strings.HasSuffix(parts[0], ".slice") {
        return parts[0], fmt.Sprintf("%s-%s.scope", parts[1], parts[2]), nil
    }

    return "", "", fmt.Errorf("Path: %s is not valid systemd's cgroups path", cgroupPath)
}
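getSliceAndUnit turns the slice:scopeprefix:name form into a systemd slice plus a scope unit name. A standalone sketch of that mapping (the path below is hypothetical):

package main

import (
    "fmt"
    "strings"
)

func main() {
    // Hypothetical systemd cgroup path, as passed by a container manager.
    path := "system.slice:kata:afhts2e5d4g5s"

    parts := strings.Split(path, ":")
    if len(parts) == 3 && strings.HasSuffix(parts[0], ".slice") {
        slice := parts[0]
        unit := fmt.Sprintf("%s-%s.scope", parts[1], parts[2])
        fmt.Println(slice, unit) // system.slice kata-afhts2e5d4g5s.scope
    }
}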
src/runtime/pkg/resourcecontrol/utils_linux_test.go (new file, 160 lines)
@@ -0,0 +1,160 @@
// Copyright (c) 2020 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//

package resourcecontrol

import (
    "os"
    "path/filepath"
    "strings"
    "testing"

    "github.com/stretchr/testify/assert"
)

func TestIsSystemdCgroup(t *testing.T) {
    assert := assert.New(t)

    tests := []struct {
        path     string
        expected bool
    }{
        {"foo.slice:kata:afhts2e5d4g5s", true},
        {"system.slice:kata:afhts2e5d4g5s", true},
        {"/kata/afhts2e5d4g5s", false},
        {"a:b:c:d", false},
        {":::", false},
        {"", false},
        {":", false},
        {"::", false},
        {":::", false},
        {"a:b", false},
        {"a:b:", false},
        {":a:b", false},
        {"@:@:@", false},
    }

    for _, t := range tests {
        assert.Equal(t.expected, IsSystemdCgroup(t.path), "invalid systemd cgroup path: %v", t.path)
    }
}

func TestValidCgroupPath(t *testing.T) {
    assert := assert.New(t)

    for _, t := range []struct {
        path          string
        systemdCgroup bool
        error         bool
    }{
        // empty paths
        {"../../../", false, false},
        {"../", false, false},
        {".", false, false},
        {"../../../", false, false},
        {"./../", false, false},

        // valid no-systemd paths
        {"../../../foo", false, false},
        {"/../hi", false, false},
        {"/../hi/foo", false, false},
        {"o / m /../ g", false, false},
        {"/overhead/foobar", false, false},
        {"/sys/fs/cgroup/cpu/sandbox/kata_foobar", false, false},

        // invalid systemd paths
        {"o / m /../ g", true, true},
        {"slice:kata", true, true},
        {"/kata/afhts2e5d4g5s", true, true},
        {"a:b:c:d", true, true},
        {":::", true, true},
        {"", true, true},
        {":", true, true},
        {"::", true, true},
        {":::", true, true},
        {"a:b", true, true},
        {"a:b:", true, true},
        {":a:b", true, true},
        {"@:@:@", true, true},

        // valid systemd paths
        {"x.slice:kata:55555", true, false},
        {"system.slice:kata:afhts2e5d4g5s", true, false},
    } {
        path, err := ValidCgroupPath(t.path, t.systemdCgroup)
        if t.error {
            assert.Error(err)
            continue
        } else {
            assert.NoError(err)
        }

        if filepath.IsAbs(t.path) {
            cleanPath := filepath.Dir(filepath.Clean(t.path))
            assert.True(strings.HasPrefix(path, cleanPath),
                "%v should have prefix %v", path, cleanPath)
        } else if t.systemdCgroup {
            assert.Equal(t.path, path)
        } else {
            assert.True(
                strings.HasPrefix(path, DefaultResourceControllerID),
                "%v should have prefix /%v", path, DefaultResourceControllerID)
        }
    }

}

func TestDeviceToCgroupDeviceRule(t *testing.T) {
    assert := assert.New(t)

    f, err := os.CreateTemp("", "device")
    assert.NoError(err)
    f.Close()

    // fail: regular file to device
    dev, err := DeviceToCgroupDeviceRule(f.Name())
    assert.Error(err)
    assert.Nil(dev)

    // fail: no such file
    os.Remove(f.Name())
    dev, err = DeviceToCgroupDeviceRule(f.Name())
    assert.Error(err)
    assert.Nil(dev)

    devPath := "/dev/null"
    if _, err := os.Stat(devPath); os.IsNotExist(err) {
        t.Skipf("no such device: %v", devPath)
        return
    }
    dev, err = DeviceToCgroupDeviceRule(devPath)
    assert.NoError(err)
    assert.NotNil(dev)
    assert.Equal(rune(dev.Type), 'c')
    assert.NotZero(dev.Major)
    assert.NotZero(dev.Minor)
    assert.NotEmpty(dev.Permissions)
    assert.True(dev.Allow)
}

func TestDeviceToLinuxDevice(t *testing.T) {
    assert := assert.New(t)

    devPath := "/dev/null"
    if _, err := os.Stat(devPath); os.IsNotExist(err) {
        t.Skipf("no such device: %v", devPath)
        return
    }
    dev, err := DeviceToLinuxDevice(devPath)
    assert.NoError(err)
    assert.NotNil(dev)
    assert.Equal(dev.Type, "c")
    assert.NotNil(dev.Major)
    assert.NotZero(*dev.Major)
    assert.NotNil(dev.Minor)
    assert.NotZero(*dev.Minor)
    assert.NotEmpty(dev.Access)
    assert.True(dev.Allow)
}
src/runtime/pkg/syscall/syscall_darwin.go (new file, 16 lines)
@@ -0,0 +1,16 @@
// Copyright (c) 2022 Apple Inc.
//
// SPDX-License-Identifier: Apache-2.0
//

package syscall

import (
    "syscall"
)

func Gettid() int {
    // There is no equivalent to a thread ID on Darwin.
    // We use the PID instead.
    return syscall.Getpid()
}
src/runtime/pkg/syscall/syscall_linux.go (new file, 14 lines)
@@ -0,0 +1,14 @@
// Copyright (c) 2022 Apple Inc.
//
// SPDX-License-Identifier: Apache-2.0
//

package syscall

import (
    "syscall"
)

func Gettid() int {
    return syscall.Gettid()
}