gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

manager.go (26798B)


      1 /*
      2    Copyright The containerd Authors.
      3 
      4    Licensed under the Apache License, Version 2.0 (the "License");
      5    you may not use this file except in compliance with the License.
      6    You may obtain a copy of the License at
      7 
      8        http://www.apache.org/licenses/LICENSE-2.0
      9 
     10    Unless required by applicable law or agreed to in writing, software
     11    distributed under the License is distributed on an "AS IS" BASIS,
     12    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13    See the License for the specific language governing permissions and
     14    limitations under the License.
     15 */
     16 
     17 package cgroup2
     18 
     19 import (
     20 	"bufio"
     21 	"context"
     22 	"errors"
     23 	"fmt"
     24 	"io"
     25 	"math"
     26 	"os"
     27 	"path/filepath"
     28 	"strconv"
     29 	"strings"
     30 	"syscall"
     31 	"time"
     32 
     33 	"github.com/containerd/cgroups/v3/cgroup2/stats"
     34 
     35 	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
     36 	"github.com/godbus/dbus/v5"
     37 	"github.com/opencontainers/runtime-spec/specs-go"
     38 	"github.com/sirupsen/logrus"
     39 	"golang.org/x/sys/unix"
     40 )
     41 
// File names inside a cgroup directory and default locations used by this file.
const (
	// subtreeControl is the per-cgroup file written by ToggleControllers.
	subtreeControl     = "cgroup.subtree_control"
	// controllersFile is the per-cgroup file read by Controllers and RootControllers.
	controllersFile    = "cgroup.controllers"
	// killFile is the per-cgroup file written by Kill.
	killFile           = "cgroup.kill"
	// defaultCgroup2Path is the mountpoint used by Load (unless overridden) and by the systemd helpers.
	defaultCgroup2Path = "/sys/fs/cgroup"
	// defaultSlice is the systemd slice used when the caller passes an empty slice name.
	defaultSlice       = "system.slice"
)
     49 
var (
	// canDelegate controls whether NewSystemd adds the "Delegate" property to
	// the transient unit it creates.
	// NOTE(review): it is never assigned in the visible part of this file;
	// presumably it is set elsewhere in the package - confirm.
	canDelegate bool
)
     53 
// Event holds the counters that parseMemoryEvents extracts from the
// key/value pairs of a cgroup's memory.events file. A key that is absent
// from the file leaves the corresponding field at zero.
type Event struct {
	// Low is the value of the "low" key.
	Low     uint64
	// High is the value of the "high" key.
	High    uint64
	// Max is the value of the "max" key.
	Max     uint64
	// OOM is the value of the "oom" key.
	OOM     uint64
	// OOMKill is the value of the "oom_kill" key.
	OOMKill uint64
}
     61 
// Resources for a cgroups v2 unified hierarchy.
//
// Every non-nil controller field contributes its settings to Values and its
// controller name(s) to EnabledControllers; nil fields are skipped.
type Resources struct {
	// CPU, when set, enables both the "cpu" and "cpuset" controllers.
	CPU     *CPU
	// Memory, when set, enables the "memory" controller.
	Memory  *Memory
	// Pids, when set, enables the "pids" controller.
	Pids    *Pids
	// IO, when set, enables the "io" controller.
	IO      *IO
	// RDMA, when set, enables the "rdma" controller.
	RDMA    *RDMA
	// HugeTlb, when set, enables the "hugetlb" controller.
	HugeTlb *HugeTlb
	// When len(Devices) is zero, devices are not controlled.
	// Devices are not part of Values; they are applied separately by setDevices.
	Devices []specs.LinuxDeviceCgroup
}
     73 
     74 // Values returns the raw filenames and values that
     75 // can be written to the unified hierarchy
     76 func (r *Resources) Values() (o []Value) {
     77 	if r.CPU != nil {
     78 		o = append(o, r.CPU.Values()...)
     79 	}
     80 	if r.Memory != nil {
     81 		o = append(o, r.Memory.Values()...)
     82 	}
     83 	if r.Pids != nil {
     84 		o = append(o, r.Pids.Values()...)
     85 	}
     86 	if r.IO != nil {
     87 		o = append(o, r.IO.Values()...)
     88 	}
     89 	if r.RDMA != nil {
     90 		o = append(o, r.RDMA.Values()...)
     91 	}
     92 	if r.HugeTlb != nil {
     93 		o = append(o, r.HugeTlb.Values()...)
     94 	}
     95 	return o
     96 }
     97 
     98 // EnabledControllers returns the list of all not nil resource controllers
     99 func (r *Resources) EnabledControllers() (c []string) {
    100 	if r.CPU != nil {
    101 		c = append(c, "cpu")
    102 		c = append(c, "cpuset")
    103 	}
    104 	if r.Memory != nil {
    105 		c = append(c, "memory")
    106 	}
    107 	if r.Pids != nil {
    108 		c = append(c, "pids")
    109 	}
    110 	if r.IO != nil {
    111 		c = append(c, "io")
    112 	}
    113 	if r.RDMA != nil {
    114 		c = append(c, "rdma")
    115 	}
    116 	if r.HugeTlb != nil {
    117 		c = append(c, "hugetlb")
    118 	}
    119 	return
    120 }
    121 
// Value of a cgroup setting: a file name relative to the cgroup directory
// and the content to write into it.
type Value struct {
	// filename is joined onto the cgroup directory path by write.
	filename string
	// value must be a uint64, uint16, int64, []byte, string or CPUMax;
	// write rejects any other dynamic type with ErrInvalidFormat.
	value    interface{}
}
    127 
    128 // write the value to the full, absolute path, of a unified hierarchy
    129 func (c *Value) write(path string, perm os.FileMode) error {
    130 	var data []byte
    131 	switch t := c.value.(type) {
    132 	case uint64:
    133 		data = []byte(strconv.FormatUint(t, 10))
    134 	case uint16:
    135 		data = []byte(strconv.FormatUint(uint64(t), 10))
    136 	case int64:
    137 		data = []byte(strconv.FormatInt(t, 10))
    138 	case []byte:
    139 		data = t
    140 	case string:
    141 		data = []byte(t)
    142 	case CPUMax:
    143 		data = []byte(t)
    144 	default:
    145 		return ErrInvalidFormat
    146 	}
    147 
    148 	return os.WriteFile(
    149 		filepath.Join(path, c.filename),
    150 		data,
    151 		perm,
    152 	)
    153 }
    154 
    155 func writeValues(path string, values []Value) error {
    156 	for _, o := range values {
    157 		if err := o.write(path, defaultFilePerm); err != nil {
    158 			return err
    159 		}
    160 	}
    161 	return nil
    162 }
    163 
    164 func NewManager(mountpoint string, group string, resources *Resources) (*Manager, error) {
    165 	if resources == nil {
    166 		return nil, errors.New("resources reference is nil")
    167 	}
    168 	if err := VerifyGroupPath(group); err != nil {
    169 		return nil, err
    170 	}
    171 	path := filepath.Join(mountpoint, group)
    172 	if err := os.MkdirAll(path, defaultDirPerm); err != nil {
    173 		return nil, err
    174 	}
    175 	m := Manager{
    176 		unifiedMountpoint: mountpoint,
    177 		path:              path,
    178 	}
    179 	if err := m.ToggleControllers(resources.EnabledControllers(), Enable); err != nil {
    180 		// clean up cgroup dir on failure
    181 		os.Remove(path)
    182 		return nil, err
    183 	}
    184 	if err := setResources(path, resources); err != nil {
    185 		os.Remove(path)
    186 		return nil, err
    187 	}
    188 	return &m, nil
    189 }
    190 
// InitConfig carries the settings that InitOpts functions can adjust before
// Load builds a Manager.
type InitConfig struct {
	// mountpoint is the cgroup2 mountpoint; Load initialises it to defaultCgroup2Path.
	mountpoint string
}
    194 
// InitOpts mutates an InitConfig; Load aborts with the returned error if it is non-nil.
type InitOpts func(c *InitConfig) error
    196 
    197 // WithMountpoint sets the unified mountpoint. The default path is /sys/fs/cgroup.
    198 func WithMountpoint(path string) InitOpts {
    199 	return func(c *InitConfig) error {
    200 		c.mountpoint = path
    201 		return nil
    202 	}
    203 }
    204 
    205 // Load a cgroup.
    206 func Load(group string, opts ...InitOpts) (*Manager, error) {
    207 	c := InitConfig{mountpoint: defaultCgroup2Path}
    208 	for _, opt := range opts {
    209 		if err := opt(&c); err != nil {
    210 			return nil, err
    211 		}
    212 	}
    213 
    214 	if err := VerifyGroupPath(group); err != nil {
    215 		return nil, err
    216 	}
    217 	path := filepath.Join(c.mountpoint, group)
    218 	return &Manager{
    219 		unifiedMountpoint: c.mountpoint,
    220 		path:              path,
    221 	}, nil
    222 }
    223 
// Manager operates on a single cgroup v2 directory.
type Manager struct {
	// unifiedMountpoint is the root of the cgroup2 hierarchy. NewManager,
	// Load and NewChild set it; NewSystemd and LoadSystemd leave it empty.
	unifiedMountpoint string
	// path is the directory of the managed cgroup.
	path              string
}
    228 
    229 func setResources(path string, resources *Resources) error {
    230 	if resources != nil {
    231 		if err := writeValues(path, resources.Values()); err != nil {
    232 			return err
    233 		}
    234 		if err := setDevices(path, resources.Devices); err != nil {
    235 			return err
    236 		}
    237 	}
    238 	return nil
    239 }
    240 
    241 func (c *Manager) RootControllers() ([]string, error) {
    242 	b, err := os.ReadFile(filepath.Join(c.unifiedMountpoint, controllersFile))
    243 	if err != nil {
    244 		return nil, err
    245 	}
    246 	return strings.Fields(string(b)), nil
    247 }
    248 
    249 func (c *Manager) Controllers() ([]string, error) {
    250 	b, err := os.ReadFile(filepath.Join(c.path, controllersFile))
    251 	if err != nil {
    252 		return nil, err
    253 	}
    254 	return strings.Fields(string(b)), nil
    255 }
    256 
// Update writes resources into this cgroup via setResources. A nil resources
// is accepted and changes nothing. Unlike NewManager, it does not toggle any
// controllers on the ancestor groups.
func (c *Manager) Update(resources *Resources) error {
	return setResources(c.path, resources)
}
    260 
// ControllerToggle selects whether ToggleControllers turns controllers on or off.
type ControllerToggle int

const (
	// Enable (value 1) makes writeSubtreeControl prefix every controller with "+".
	Enable ControllerToggle = iota + 1
	// Disable (value 2) makes writeSubtreeControl prefix every controller with "-".
	Disable
)
    267 
    268 func toggleFunc(controllers []string, prefix string) []string {
    269 	out := make([]string, len(controllers))
    270 	for i, c := range controllers {
    271 		out[i] = prefix + c
    272 	}
    273 	return out
    274 }
    275 
    276 func (c *Manager) ToggleControllers(controllers []string, t ControllerToggle) error {
    277 	// when c.path is like /foo/bar/baz, the following files need to be written:
    278 	// * /sys/fs/cgroup/cgroup.subtree_control
    279 	// * /sys/fs/cgroup/foo/cgroup.subtree_control
    280 	// * /sys/fs/cgroup/foo/bar/cgroup.subtree_control
    281 	// Note that /sys/fs/cgroup/foo/bar/baz/cgroup.subtree_control does not need to be written.
    282 	split := strings.Split(c.path, "/")
    283 	var lastErr error
    284 	for i := range split {
    285 		f := strings.Join(split[:i], "/")
    286 		if !strings.HasPrefix(f, c.unifiedMountpoint) || f == c.path {
    287 			continue
    288 		}
    289 		filePath := filepath.Join(f, subtreeControl)
    290 		if err := c.writeSubtreeControl(filePath, controllers, t); err != nil {
    291 			// When running as rootless, the user may face EPERM on parent groups, but it is neglible when the
    292 			// controller is already written.
    293 			// So we only return the last error.
    294 			lastErr = fmt.Errorf("failed to write subtree controllers %+v to %q: %w", controllers, filePath, err)
    295 		} else {
    296 			lastErr = nil
    297 		}
    298 	}
    299 	return lastErr
    300 }
    301 
    302 func (c *Manager) writeSubtreeControl(filePath string, controllers []string, t ControllerToggle) error {
    303 	f, err := os.OpenFile(filePath, os.O_WRONLY, 0)
    304 	if err != nil {
    305 		return err
    306 	}
    307 	defer f.Close()
    308 	switch t {
    309 	case Enable:
    310 		controllers = toggleFunc(controllers, "+")
    311 	case Disable:
    312 		controllers = toggleFunc(controllers, "-")
    313 	}
    314 	_, err = f.WriteString(strings.Join(controllers, " "))
    315 	return err
    316 }
    317 
    318 func (c *Manager) NewChild(name string, resources *Resources) (*Manager, error) {
    319 	if strings.HasPrefix(name, "/") {
    320 		return nil, errors.New("name must be relative")
    321 	}
    322 	path := filepath.Join(c.path, name)
    323 	if err := os.MkdirAll(path, defaultDirPerm); err != nil {
    324 		return nil, err
    325 	}
    326 	m := Manager{
    327 		unifiedMountpoint: c.unifiedMountpoint,
    328 		path:              path,
    329 	}
    330 	if resources != nil {
    331 		if err := m.ToggleControllers(resources.EnabledControllers(), Enable); err != nil {
    332 			// clean up cgroup dir on failure
    333 			os.Remove(path)
    334 			return nil, err
    335 		}
    336 	}
    337 	if err := setResources(path, resources); err != nil {
    338 		// clean up cgroup dir on failure
    339 		os.Remove(path)
    340 		return nil, err
    341 	}
    342 	return &m, nil
    343 }
    344 
    345 func (c *Manager) AddProc(pid uint64) error {
    346 	v := Value{
    347 		filename: cgroupProcs,
    348 		value:    pid,
    349 	}
    350 	return writeValues(c.path, []Value{v})
    351 }
    352 
    353 func (c *Manager) AddThread(tid uint64) error {
    354 	v := Value{
    355 		filename: cgroupThreads,
    356 		value:    tid,
    357 	}
    358 	return writeValues(c.path, []Value{v})
    359 }
    360 
    361 // Kill will try to forcibly exit all of the processes in the cgroup. This is
    362 // equivalent to sending a SIGKILL to every process. On kernels 5.14 and greater
    363 // this will use the cgroup.kill file, on anything that doesn't have the cgroup.kill
    364 // file, a manual process of freezing -> sending a SIGKILL to every process -> thawing
    365 // will be used.
    366 func (c *Manager) Kill() error {
    367 	v := Value{
    368 		filename: killFile,
    369 		value:    "1",
    370 	}
    371 	err := writeValues(c.path, []Value{v})
    372 	if err == nil {
    373 		return nil
    374 	}
    375 	logrus.Warnf("falling back to slower kill implementation: %s", err)
    376 	// Fallback to slow method.
    377 	return c.fallbackKill()
    378 }
    379 
// fallbackKill is a slower fallback to the more modern (kernels 5.14+)
// approach of writing to the cgroup.kill file. This is heavily pulled
// from runc's same approach (in signalAllProcesses), with the only differences
// being this is just tailored to the API exposed in this library, and we don't
// need to care about signals other than SIGKILL.
//
// The sequence is: freeze the cgroup, list its processes recursively, send
// SIGKILL to each, thaw the cgroup, then (only when no child subreaper is
// set) wait on every signalled process. Failures to freeze, signal, thaw or
// wait are logged as warnings; the only error returned is a failure to list
// the processes.
//
// https://github.com/opencontainers/runc/blob/8da0a0b5675764feaaaaad466f6567a9983fcd08/libcontainer/init_linux.go#L523-L529
func (c *Manager) fallbackKill() error {
	// A failed freeze is not fatal: the kill is still attempted.
	if err := c.Freeze(); err != nil {
		logrus.Warn(err)
	}
	pids, err := c.Procs(true)
	if err != nil {
		// Undo the freeze before bailing out so the group is not left frozen.
		if err := c.Thaw(); err != nil {
			logrus.Warn(err)
		}
		return err
	}
	var procs []*os.Process
	for _, pid := range pids {
		p, err := os.FindProcess(int(pid))
		if err != nil {
			logrus.Warn(err)
			continue
		}
		// Remember the process even if signalling fails below; it is still waited on.
		procs = append(procs, p)
		if err := p.Signal(unix.SIGKILL); err != nil {
			logrus.Warn(err)
		}
	}
	if err := c.Thaw(); err != nil {
		logrus.Warn(err)
	}

	subreaper, err := getSubreaper()
	if err != nil {
		// The error here means that PR_GET_CHILD_SUBREAPER is not
		// supported because this code might run on a kernel older
		// than 3.4. We don't want to throw an error in that case,
		// and we simplify things, considering there is no subreaper
		// set.
		subreaper = 0
	}

	for _, p := range procs {
		// In case a subreaper has been setup, this code must not
		// wait for the process. Otherwise, we cannot be sure the
		// current process will be reaped by the subreaper, while
		// the subreaper might be waiting for this process in order
		// to retrieve its exit code.
		if subreaper == 0 {
			if _, err := p.Wait(); err != nil {
				// ECHILD is not logged; any other wait failure is.
				if !errors.Is(err, unix.ECHILD) {
					logrus.Warnf("wait on pid %d failed: %s", p.Pid, err)
				}
			}
		}
	}
	return nil
}
    440 
    441 func (c *Manager) Delete() error {
    442 	// kernel prevents cgroups with running process from being removed, check the tree is empty
    443 	processes, err := c.Procs(true)
    444 	if err != nil {
    445 		return err
    446 	}
    447 	if len(processes) > 0 {
    448 		return fmt.Errorf("cgroups: unable to remove path %q: still contains running processes", c.path)
    449 	}
    450 	return remove(c.path)
    451 }
    452 
    453 func (c *Manager) Procs(recursive bool) ([]uint64, error) {
    454 	var processes []uint64
    455 	err := filepath.Walk(c.path, func(p string, info os.FileInfo, err error) error {
    456 		if err != nil {
    457 			return err
    458 		}
    459 		if !recursive && info.IsDir() {
    460 			if p == c.path {
    461 				return nil
    462 			}
    463 			return filepath.SkipDir
    464 		}
    465 		_, name := filepath.Split(p)
    466 		if name != cgroupProcs {
    467 			return nil
    468 		}
    469 		procs, err := parseCgroupProcsFile(p)
    470 		if err != nil {
    471 			return err
    472 		}
    473 		processes = append(processes, procs...)
    474 		return nil
    475 	})
    476 	return processes, err
    477 }
    478 
    479 func (c *Manager) MoveTo(destination *Manager) error {
    480 	processes, err := c.Procs(true)
    481 	if err != nil {
    482 		return err
    483 	}
    484 	for _, p := range processes {
    485 		if err := destination.AddProc(p); err != nil {
    486 			if strings.Contains(err.Error(), "no such process") {
    487 				continue
    488 			}
    489 			return err
    490 		}
    491 	}
    492 	return nil
    493 }
    494 
// singleValueFiles lists the cgroup files that Stat reads with
// readSingleFile, i.e. files whose whole content is one value.
var singleValueFiles = []string{
	"pids.current",
	"pids.max",
}
    499 
    500 func (c *Manager) Stat() (*stats.Metrics, error) {
    501 	controllers, err := c.Controllers()
    502 	if err != nil {
    503 		return nil, err
    504 	}
    505 	out := make(map[string]interface{})
    506 	for _, controller := range controllers {
    507 		switch controller {
    508 		case "cpu", "memory":
    509 			if err := readKVStatsFile(c.path, controller+".stat", out); err != nil {
    510 				if os.IsNotExist(err) {
    511 					continue
    512 				}
    513 				return nil, err
    514 			}
    515 		}
    516 	}
    517 	for _, name := range singleValueFiles {
    518 		if err := readSingleFile(c.path, name, out); err != nil {
    519 			if os.IsNotExist(err) {
    520 				continue
    521 			}
    522 			return nil, err
    523 		}
    524 	}
    525 	memoryEvents := make(map[string]interface{})
    526 	if err := readKVStatsFile(c.path, "memory.events", memoryEvents); err != nil {
    527 		if !os.IsNotExist(err) {
    528 			return nil, err
    529 		}
    530 	}
    531 	var metrics stats.Metrics
    532 
    533 	metrics.Pids = &stats.PidsStat{
    534 		Current: getPidValue("pids.current", out),
    535 		Limit:   getPidValue("pids.max", out),
    536 	}
    537 	metrics.CPU = &stats.CPUStat{
    538 		UsageUsec:     getUint64Value("usage_usec", out),
    539 		UserUsec:      getUint64Value("user_usec", out),
    540 		SystemUsec:    getUint64Value("system_usec", out),
    541 		NrPeriods:     getUint64Value("nr_periods", out),
    542 		NrThrottled:   getUint64Value("nr_throttled", out),
    543 		ThrottledUsec: getUint64Value("throttled_usec", out),
    544 	}
    545 	metrics.Memory = &stats.MemoryStat{
    546 		Anon:                  getUint64Value("anon", out),
    547 		File:                  getUint64Value("file", out),
    548 		KernelStack:           getUint64Value("kernel_stack", out),
    549 		Slab:                  getUint64Value("slab", out),
    550 		Sock:                  getUint64Value("sock", out),
    551 		Shmem:                 getUint64Value("shmem", out),
    552 		FileMapped:            getUint64Value("file_mapped", out),
    553 		FileDirty:             getUint64Value("file_dirty", out),
    554 		FileWriteback:         getUint64Value("file_writeback", out),
    555 		AnonThp:               getUint64Value("anon_thp", out),
    556 		InactiveAnon:          getUint64Value("inactive_anon", out),
    557 		ActiveAnon:            getUint64Value("active_anon", out),
    558 		InactiveFile:          getUint64Value("inactive_file", out),
    559 		ActiveFile:            getUint64Value("active_file", out),
    560 		Unevictable:           getUint64Value("unevictable", out),
    561 		SlabReclaimable:       getUint64Value("slab_reclaimable", out),
    562 		SlabUnreclaimable:     getUint64Value("slab_unreclaimable", out),
    563 		Pgfault:               getUint64Value("pgfault", out),
    564 		Pgmajfault:            getUint64Value("pgmajfault", out),
    565 		WorkingsetRefault:     getUint64Value("workingset_refault", out),
    566 		WorkingsetActivate:    getUint64Value("workingset_activate", out),
    567 		WorkingsetNodereclaim: getUint64Value("workingset_nodereclaim", out),
    568 		Pgrefill:              getUint64Value("pgrefill", out),
    569 		Pgscan:                getUint64Value("pgscan", out),
    570 		Pgsteal:               getUint64Value("pgsteal", out),
    571 		Pgactivate:            getUint64Value("pgactivate", out),
    572 		Pgdeactivate:          getUint64Value("pgdeactivate", out),
    573 		Pglazyfree:            getUint64Value("pglazyfree", out),
    574 		Pglazyfreed:           getUint64Value("pglazyfreed", out),
    575 		ThpFaultAlloc:         getUint64Value("thp_fault_alloc", out),
    576 		ThpCollapseAlloc:      getUint64Value("thp_collapse_alloc", out),
    577 		Usage:                 getStatFileContentUint64(filepath.Join(c.path, "memory.current")),
    578 		UsageLimit:            getStatFileContentUint64(filepath.Join(c.path, "memory.max")),
    579 		SwapUsage:             getStatFileContentUint64(filepath.Join(c.path, "memory.swap.current")),
    580 		SwapLimit:             getStatFileContentUint64(filepath.Join(c.path, "memory.swap.max")),
    581 	}
    582 	if len(memoryEvents) > 0 {
    583 		metrics.MemoryEvents = &stats.MemoryEvents{
    584 			Low:     getUint64Value("low", memoryEvents),
    585 			High:    getUint64Value("high", memoryEvents),
    586 			Max:     getUint64Value("max", memoryEvents),
    587 			Oom:     getUint64Value("oom", memoryEvents),
    588 			OomKill: getUint64Value("oom_kill", memoryEvents),
    589 		}
    590 	}
    591 	metrics.Io = &stats.IOStat{Usage: readIoStats(c.path)}
    592 	metrics.Rdma = &stats.RdmaStat{
    593 		Current: rdmaStats(filepath.Join(c.path, "rdma.current")),
    594 		Limit:   rdmaStats(filepath.Join(c.path, "rdma.max")),
    595 	}
    596 	metrics.Hugetlb = readHugeTlbStats(c.path)
    597 
    598 	return &metrics, nil
    599 }
    600 
    601 func getUint64Value(key string, out map[string]interface{}) uint64 {
    602 	v, ok := out[key]
    603 	if !ok {
    604 		return 0
    605 	}
    606 	switch t := v.(type) {
    607 	case uint64:
    608 		return t
    609 	}
    610 	return 0
    611 }
    612 
    613 func getPidValue(key string, out map[string]interface{}) uint64 {
    614 	v, ok := out[key]
    615 	if !ok {
    616 		return 0
    617 	}
    618 	switch t := v.(type) {
    619 	case uint64:
    620 		return t
    621 	case string:
    622 		if t == "max" {
    623 			return math.MaxUint64
    624 		}
    625 	}
    626 	return 0
    627 }
    628 
    629 func readSingleFile(path string, file string, out map[string]interface{}) error {
    630 	f, err := os.Open(filepath.Join(path, file))
    631 	if err != nil {
    632 		return err
    633 	}
    634 	defer f.Close()
    635 	data, err := io.ReadAll(f)
    636 	if err != nil {
    637 		return err
    638 	}
    639 	s := strings.TrimSpace(string(data))
    640 	v, err := parseUint(s, 10, 64)
    641 	if err != nil {
    642 		// if we cannot parse as a uint, parse as a string
    643 		out[file] = s
    644 		return nil
    645 	}
    646 	out[file] = v
    647 	return nil
    648 }
    649 
    650 func readKVStatsFile(path string, file string, out map[string]interface{}) error {
    651 	f, err := os.Open(filepath.Join(path, file))
    652 	if err != nil {
    653 		return err
    654 	}
    655 	defer f.Close()
    656 
    657 	s := bufio.NewScanner(f)
    658 	for s.Scan() {
    659 		name, value, err := parseKV(s.Text())
    660 		if err != nil {
    661 			return fmt.Errorf("error while parsing %s (line=%q): %w", filepath.Join(path, file), s.Text(), err)
    662 		}
    663 		out[name] = value
    664 	}
    665 	return s.Err()
    666 }
    667 
// Freeze requests the Frozen state for this cgroup and returns once the
// cgroup reports that state (see freeze).
func (c *Manager) Freeze() error {
	return c.freeze(c.path, Frozen)
}
    671 
// Thaw requests the Thawed state for this cgroup and returns once the
// cgroup reports that state (see freeze).
func (c *Manager) Thaw() error {
	return c.freeze(c.path, Thawed)
}
    675 
    676 func (c *Manager) freeze(path string, state State) error {
    677 	values := state.Values()
    678 	for {
    679 		if err := writeValues(path, values); err != nil {
    680 			return err
    681 		}
    682 		current, err := fetchState(path)
    683 		if err != nil {
    684 			return err
    685 		}
    686 		if current == state {
    687 			return nil
    688 		}
    689 		time.Sleep(1 * time.Millisecond)
    690 	}
    691 }
    692 
    693 func (c *Manager) isCgroupEmpty() bool {
    694 	// In case of any error we return true so that we exit and don't leak resources
    695 	out := make(map[string]interface{})
    696 	if err := readKVStatsFile(c.path, "cgroup.events", out); err != nil {
    697 		return true
    698 	}
    699 	if v, ok := out["populated"]; ok {
    700 		populated, ok := v.(uint64)
    701 		if !ok {
    702 			return true
    703 		}
    704 		return populated == 0
    705 	}
    706 	return true
    707 }
    708 
    709 // MemoryEventFD returns inotify file descriptor and 'memory.events' inotify watch descriptor
    710 func (c *Manager) MemoryEventFD() (int, uint32, error) {
    711 	fpath := filepath.Join(c.path, "memory.events")
    712 	fd, err := syscall.InotifyInit()
    713 	if err != nil {
    714 		return 0, 0, errors.New("failed to create inotify fd")
    715 	}
    716 	wd, err := syscall.InotifyAddWatch(fd, fpath, unix.IN_MODIFY)
    717 	if err != nil {
    718 		syscall.Close(fd)
    719 		return 0, 0, fmt.Errorf("failed to add inotify watch for %q: %w", fpath, err)
    720 	}
    721 	// monitor to detect process exit/cgroup deletion
    722 	evpath := filepath.Join(c.path, "cgroup.events")
    723 	if _, err = syscall.InotifyAddWatch(fd, evpath, unix.IN_MODIFY); err != nil {
    724 		syscall.Close(fd)
    725 		return 0, 0, fmt.Errorf("failed to add inotify watch for %q: %w", evpath, err)
    726 	}
    727 
    728 	return fd, uint32(wd), nil
    729 }
    730 
// EventChan starts a goroutine (waitForEvents) that watches this cgroup's
// memory.events file and returns the two channels it reports on: an
// unbuffered channel of parsed events and an error channel with a buffer of
// one. The goroutine closes the error channel when it exits; the event
// channel is never closed.
func (c *Manager) EventChan() (<-chan Event, <-chan error) {
	ec := make(chan Event)
	errCh := make(chan error, 1)
	go c.waitForEvents(ec, errCh)

	return ec, errCh
}
    738 
    739 func parseMemoryEvents(out map[string]interface{}) (Event, error) {
    740 	e := Event{}
    741 	if v, ok := out["high"]; ok {
    742 		e.High, ok = v.(uint64)
    743 		if !ok {
    744 			return Event{}, fmt.Errorf("cannot convert high to uint64: %+v", v)
    745 		}
    746 	}
    747 	if v, ok := out["low"]; ok {
    748 		e.Low, ok = v.(uint64)
    749 		if !ok {
    750 			return Event{}, fmt.Errorf("cannot convert low to uint64: %+v", v)
    751 		}
    752 	}
    753 	if v, ok := out["max"]; ok {
    754 		e.Max, ok = v.(uint64)
    755 		if !ok {
    756 			return Event{}, fmt.Errorf("cannot convert max to uint64: %+v", v)
    757 		}
    758 	}
    759 	if v, ok := out["oom"]; ok {
    760 		e.OOM, ok = v.(uint64)
    761 		if !ok {
    762 			return Event{}, fmt.Errorf("cannot convert oom to uint64: %+v", v)
    763 		}
    764 	}
    765 	if v, ok := out["oom_kill"]; ok {
    766 		e.OOMKill, ok = v.(uint64)
    767 		if !ok {
    768 			return Event{}, fmt.Errorf("cannot convert oom_kill to uint64: %+v", v)
    769 		}
    770 	}
    771 	return e, nil
    772 }
    773 
// waitForEvents is the goroutine body behind EventChan. It blocks reading
// inotify notifications for the cgroup; after each notification it re-reads
// memory.events, sends the parsed Event on ec and stops once the cgroup is
// reported empty. Errors are sent on errCh, which is closed when the
// function returns. ec is never closed.
//
// NOTE(review): the send on ec is unbuffered and unconditional, so this
// goroutine appears to block indefinitely if the consumer stops receiving -
// confirm callers always drain ec until errCh is closed.
func (c *Manager) waitForEvents(ec chan<- Event, errCh chan<- error) {
	defer close(errCh)

	fd, _, err := c.MemoryEventFD()
	if err != nil {
		errCh <- err
		return
	}
	defer syscall.Close(fd)

	for {
		// Room for ten inotify event headers; the content is not inspected,
		// only the number of bytes read.
		buffer := make([]byte, syscall.SizeofInotifyEvent*10)
		bytesRead, err := syscall.Read(fd, buffer)
		if err != nil {
			errCh <- err
			return
		}
		if bytesRead >= syscall.SizeofInotifyEvent {
			out := make(map[string]interface{})
			if err := readKVStatsFile(c.path, "memory.events", out); err != nil {
				// When cgroup is deleted read may return -ENODEV instead of -ENOENT from open.
				// The error is only reported when memory.events still exists.
				if _, statErr := os.Lstat(filepath.Join(c.path, "memory.events")); !os.IsNotExist(statErr) {
					errCh <- err
				}
				return
			}
			e, err := parseMemoryEvents(out)
			if err != nil {
				errCh <- err
				return
			}
			ec <- e
			// Stop watching once cgroup.events no longer reports the group as populated.
			if c.isCgroupEmpty() {
				return
			}
		}
	}
}
    812 
    813 func setDevices(path string, devices []specs.LinuxDeviceCgroup) error {
    814 	if len(devices) == 0 {
    815 		return nil
    816 	}
    817 	insts, license, err := DeviceFilter(devices)
    818 	if err != nil {
    819 		return err
    820 	}
    821 	dirFD, err := unix.Open(path, unix.O_DIRECTORY|unix.O_RDONLY|unix.O_CLOEXEC, 0600)
    822 	if err != nil {
    823 		return fmt.Errorf("cannot get dir FD for %s", path)
    824 	}
    825 	defer unix.Close(dirFD)
    826 	if _, err := LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {
    827 		if !canSkipEBPFError(devices) {
    828 			return err
    829 		}
    830 	}
    831 	return nil
    832 }
    833 
// getSystemdFullPath returns the full systemd path when creating a systemd slice group.
// This is necessary because the "-" character has a special meaning in
// systemd slice names. For example, when creating a slice called "my-group-112233.slice",
// systemd will create a hierarchy like this:
//
//	/sys/fs/cgroup/my.slice/my-group.slice/my-group-112233.slice
//
// Both slice and group are expanded with dashesToPath and joined below
// defaultCgroup2Path.
func getSystemdFullPath(slice, group string) string {
	return filepath.Join(defaultCgroup2Path, dashesToPath(slice), dashesToPath(group))
}
    843 
    844 // dashesToPath converts a slice name with dashes to it's corresponding systemd filesystem path.
    845 func dashesToPath(in string) string {
    846 	path := ""
    847 	if strings.HasSuffix(in, ".slice") && strings.Contains(in, "-") {
    848 		parts := strings.Split(in, "-")
    849 		for i := range parts {
    850 			s := strings.Join(parts[0:i+1], "-")
    851 			if !strings.HasSuffix(s, ".slice") {
    852 				s += ".slice"
    853 			}
    854 			path = filepath.Join(path, s)
    855 		}
    856 	} else {
    857 		path = filepath.Join(path, in)
    858 	}
    859 	return path
    860 }
    861 
    862 func NewSystemd(slice, group string, pid int, resources *Resources) (*Manager, error) {
    863 	if slice == "" {
    864 		slice = defaultSlice
    865 	}
    866 	ctx := context.TODO()
    867 	path := getSystemdFullPath(slice, group)
    868 	conn, err := systemdDbus.NewWithContext(ctx)
    869 	if err != nil {
    870 		return &Manager{}, err
    871 	}
    872 	defer conn.Close()
    873 
    874 	properties := []systemdDbus.Property{
    875 		systemdDbus.PropDescription("cgroup " + group),
    876 		newSystemdProperty("DefaultDependencies", false),
    877 		newSystemdProperty("MemoryAccounting", true),
    878 		newSystemdProperty("CPUAccounting", true),
    879 		newSystemdProperty("IOAccounting", true),
    880 	}
    881 
    882 	// if we create a slice, the parent is defined via a Wants=
    883 	if strings.HasSuffix(group, ".slice") {
    884 		properties = append(properties, systemdDbus.PropWants(defaultSlice))
    885 	} else {
    886 		// otherwise, we use Slice=
    887 		properties = append(properties, systemdDbus.PropSlice(defaultSlice))
    888 	}
    889 
    890 	// only add pid if its valid, -1 is used w/ general slice creation.
    891 	if pid != -1 {
    892 		properties = append(properties, newSystemdProperty("PIDs", []uint32{uint32(pid)}))
    893 	}
    894 
    895 	if resources.Memory != nil && resources.Memory.Min != nil && *resources.Memory.Min != 0 {
    896 		properties = append(properties,
    897 			newSystemdProperty("MemoryMin", uint64(*resources.Memory.Min)))
    898 	}
    899 
    900 	if resources.Memory != nil && resources.Memory.Max != nil && *resources.Memory.Max != 0 {
    901 		properties = append(properties,
    902 			newSystemdProperty("MemoryMax", uint64(*resources.Memory.Max)))
    903 	}
    904 
    905 	if resources.CPU != nil && resources.CPU.Weight != nil && *resources.CPU.Weight != 0 {
    906 		properties = append(properties,
    907 			newSystemdProperty("CPUWeight", *resources.CPU.Weight))
    908 	}
    909 
    910 	if resources.CPU != nil && resources.CPU.Max != "" {
    911 		quota, period := resources.CPU.Max.extractQuotaAndPeriod()
    912 		// cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd.
    913 		// corresponds to USEC_INFINITY in systemd
    914 		// if USEC_INFINITY is provided, CPUQuota is left unbound by systemd
    915 		// always setting a property value ensures we can apply a quota and remove it later
    916 		cpuQuotaPerSecUSec := uint64(math.MaxUint64)
    917 		if quota > 0 {
    918 			// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
    919 			// (integer percentage of CPU) internally.  This means that if a fractional percent of
    920 			// CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
    921 			// 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
    922 			cpuQuotaPerSecUSec = uint64(quota*1000000) / period
    923 			if cpuQuotaPerSecUSec%10000 != 0 {
    924 				cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
    925 			}
    926 		}
    927 		properties = append(properties,
    928 			newSystemdProperty("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
    929 	}
    930 
    931 	// If we can delegate, we add the property back in
    932 	if canDelegate {
    933 		properties = append(properties, newSystemdProperty("Delegate", true))
    934 	}
    935 
    936 	if resources.Pids != nil && resources.Pids.Max > 0 {
    937 		properties = append(properties,
    938 			newSystemdProperty("TasksAccounting", true),
    939 			newSystemdProperty("TasksMax", uint64(resources.Pids.Max)))
    940 	}
    941 
    942 	statusChan := make(chan string, 1)
    943 	if _, err := conn.StartTransientUnitContext(ctx, group, "replace", properties, statusChan); err == nil {
    944 		select {
    945 		case <-statusChan:
    946 		case <-time.After(time.Second):
    947 			logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", group)
    948 		}
    949 	} else if !isUnitExists(err) {
    950 		return &Manager{}, err
    951 	}
    952 
    953 	return &Manager{
    954 		path: path,
    955 	}, nil
    956 }
    957 
    958 func LoadSystemd(slice, group string) (*Manager, error) {
    959 	if slice == "" {
    960 		slice = defaultSlice
    961 	}
    962 	path := getSystemdFullPath(slice, group)
    963 	return &Manager{
    964 		path: path,
    965 	}, nil
    966 }
    967 
    968 func (c *Manager) DeleteSystemd() error {
    969 	ctx := context.TODO()
    970 	conn, err := systemdDbus.NewWithContext(ctx)
    971 	if err != nil {
    972 		return err
    973 	}
    974 	defer conn.Close()
    975 	group := systemdUnitFromPath(c.path)
    976 	ch := make(chan string)
    977 	_, err = conn.StopUnitContext(ctx, group, "replace", ch)
    978 	if err != nil {
    979 		return err
    980 	}
    981 	<-ch
    982 	return nil
    983 }
    984 
// newSystemdProperty builds a systemd D-Bus property called name whose value
// is units wrapped in a D-Bus variant.
func newSystemdProperty(name string, units interface{}) systemdDbus.Property {
	return systemdDbus.Property{
		Name:  name,
		Value: dbus.MakeVariant(units),
	}
}