manager.go (26798B)
1 /* 2 Copyright The containerd Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package cgroup2 18 19 import ( 20 "bufio" 21 "context" 22 "errors" 23 "fmt" 24 "io" 25 "math" 26 "os" 27 "path/filepath" 28 "strconv" 29 "strings" 30 "syscall" 31 "time" 32 33 "github.com/containerd/cgroups/v3/cgroup2/stats" 34 35 systemdDbus "github.com/coreos/go-systemd/v22/dbus" 36 "github.com/godbus/dbus/v5" 37 "github.com/opencontainers/runtime-spec/specs-go" 38 "github.com/sirupsen/logrus" 39 "golang.org/x/sys/unix" 40 ) 41 42 const ( 43 subtreeControl = "cgroup.subtree_control" 44 controllersFile = "cgroup.controllers" 45 killFile = "cgroup.kill" 46 defaultCgroup2Path = "/sys/fs/cgroup" 47 defaultSlice = "system.slice" 48 ) 49 50 var ( 51 canDelegate bool 52 ) 53 54 type Event struct { 55 Low uint64 56 High uint64 57 Max uint64 58 OOM uint64 59 OOMKill uint64 60 } 61 62 // Resources for a cgroups v2 unified hierarchy 63 type Resources struct { 64 CPU *CPU 65 Memory *Memory 66 Pids *Pids 67 IO *IO 68 RDMA *RDMA 69 HugeTlb *HugeTlb 70 // When len(Devices) is zero, devices are not controlled 71 Devices []specs.LinuxDeviceCgroup 72 } 73 74 // Values returns the raw filenames and values that 75 // can be written to the unified hierarchy 76 func (r *Resources) Values() (o []Value) { 77 if r.CPU != nil { 78 o = append(o, r.CPU.Values()...) 79 } 80 if r.Memory != nil { 81 o = append(o, r.Memory.Values()...) 82 } 83 if r.Pids != nil { 84 o = append(o, r.Pids.Values()...) 85 } 86 if r.IO != nil { 87 o = append(o, r.IO.Values()...) 88 } 89 if r.RDMA != nil { 90 o = append(o, r.RDMA.Values()...) 91 } 92 if r.HugeTlb != nil { 93 o = append(o, r.HugeTlb.Values()...) 94 } 95 return o 96 } 97 98 // EnabledControllers returns the list of all not nil resource controllers 99 func (r *Resources) EnabledControllers() (c []string) { 100 if r.CPU != nil { 101 c = append(c, "cpu") 102 c = append(c, "cpuset") 103 } 104 if r.Memory != nil { 105 c = append(c, "memory") 106 } 107 if r.Pids != nil { 108 c = append(c, "pids") 109 } 110 if r.IO != nil { 111 c = append(c, "io") 112 } 113 if r.RDMA != nil { 114 c = append(c, "rdma") 115 } 116 if r.HugeTlb != nil { 117 c = append(c, "hugetlb") 118 } 119 return 120 } 121 122 // Value of a cgroup setting 123 type Value struct { 124 filename string 125 value interface{} 126 } 127 128 // write the value to the full, absolute path, of a unified hierarchy 129 func (c *Value) write(path string, perm os.FileMode) error { 130 var data []byte 131 switch t := c.value.(type) { 132 case uint64: 133 data = []byte(strconv.FormatUint(t, 10)) 134 case uint16: 135 data = []byte(strconv.FormatUint(uint64(t), 10)) 136 case int64: 137 data = []byte(strconv.FormatInt(t, 10)) 138 case []byte: 139 data = t 140 case string: 141 data = []byte(t) 142 case CPUMax: 143 data = []byte(t) 144 default: 145 return ErrInvalidFormat 146 } 147 148 return os.WriteFile( 149 filepath.Join(path, c.filename), 150 data, 151 perm, 152 ) 153 } 154 155 func writeValues(path string, values []Value) error { 156 for _, o := range values { 157 if err := o.write(path, defaultFilePerm); err != nil { 158 return err 159 } 160 } 161 return nil 162 } 163 164 func NewManager(mountpoint string, group string, resources *Resources) (*Manager, error) { 165 if resources == nil { 166 return nil, errors.New("resources reference is nil") 167 } 168 if err := VerifyGroupPath(group); err != nil { 169 return nil, err 170 } 171 path := filepath.Join(mountpoint, group) 172 if err := os.MkdirAll(path, defaultDirPerm); err != nil { 173 return nil, err 174 } 175 m := Manager{ 176 unifiedMountpoint: mountpoint, 177 path: path, 178 } 179 if err := m.ToggleControllers(resources.EnabledControllers(), Enable); err != nil { 180 // clean up cgroup dir on failure 181 os.Remove(path) 182 return nil, err 183 } 184 if err := setResources(path, resources); err != nil { 185 os.Remove(path) 186 return nil, err 187 } 188 return &m, nil 189 } 190 191 type InitConfig struct { 192 mountpoint string 193 } 194 195 type InitOpts func(c *InitConfig) error 196 197 // WithMountpoint sets the unified mountpoint. The default path is /sys/fs/cgroup. 198 func WithMountpoint(path string) InitOpts { 199 return func(c *InitConfig) error { 200 c.mountpoint = path 201 return nil 202 } 203 } 204 205 // Load a cgroup. 206 func Load(group string, opts ...InitOpts) (*Manager, error) { 207 c := InitConfig{mountpoint: defaultCgroup2Path} 208 for _, opt := range opts { 209 if err := opt(&c); err != nil { 210 return nil, err 211 } 212 } 213 214 if err := VerifyGroupPath(group); err != nil { 215 return nil, err 216 } 217 path := filepath.Join(c.mountpoint, group) 218 return &Manager{ 219 unifiedMountpoint: c.mountpoint, 220 path: path, 221 }, nil 222 } 223 224 type Manager struct { 225 unifiedMountpoint string 226 path string 227 } 228 229 func setResources(path string, resources *Resources) error { 230 if resources != nil { 231 if err := writeValues(path, resources.Values()); err != nil { 232 return err 233 } 234 if err := setDevices(path, resources.Devices); err != nil { 235 return err 236 } 237 } 238 return nil 239 } 240 241 func (c *Manager) RootControllers() ([]string, error) { 242 b, err := os.ReadFile(filepath.Join(c.unifiedMountpoint, controllersFile)) 243 if err != nil { 244 return nil, err 245 } 246 return strings.Fields(string(b)), nil 247 } 248 249 func (c *Manager) Controllers() ([]string, error) { 250 b, err := os.ReadFile(filepath.Join(c.path, controllersFile)) 251 if err != nil { 252 return nil, err 253 } 254 return strings.Fields(string(b)), nil 255 } 256 257 func (c *Manager) Update(resources *Resources) error { 258 return setResources(c.path, resources) 259 } 260 261 type ControllerToggle int 262 263 const ( 264 Enable ControllerToggle = iota + 1 265 Disable 266 ) 267 268 func toggleFunc(controllers []string, prefix string) []string { 269 out := make([]string, len(controllers)) 270 for i, c := range controllers { 271 out[i] = prefix + c 272 } 273 return out 274 } 275 276 func (c *Manager) ToggleControllers(controllers []string, t ControllerToggle) error { 277 // when c.path is like /foo/bar/baz, the following files need to be written: 278 // * /sys/fs/cgroup/cgroup.subtree_control 279 // * /sys/fs/cgroup/foo/cgroup.subtree_control 280 // * /sys/fs/cgroup/foo/bar/cgroup.subtree_control 281 // Note that /sys/fs/cgroup/foo/bar/baz/cgroup.subtree_control does not need to be written. 282 split := strings.Split(c.path, "/") 283 var lastErr error 284 for i := range split { 285 f := strings.Join(split[:i], "/") 286 if !strings.HasPrefix(f, c.unifiedMountpoint) || f == c.path { 287 continue 288 } 289 filePath := filepath.Join(f, subtreeControl) 290 if err := c.writeSubtreeControl(filePath, controllers, t); err != nil { 291 // When running as rootless, the user may face EPERM on parent groups, but it is neglible when the 292 // controller is already written. 293 // So we only return the last error. 294 lastErr = fmt.Errorf("failed to write subtree controllers %+v to %q: %w", controllers, filePath, err) 295 } else { 296 lastErr = nil 297 } 298 } 299 return lastErr 300 } 301 302 func (c *Manager) writeSubtreeControl(filePath string, controllers []string, t ControllerToggle) error { 303 f, err := os.OpenFile(filePath, os.O_WRONLY, 0) 304 if err != nil { 305 return err 306 } 307 defer f.Close() 308 switch t { 309 case Enable: 310 controllers = toggleFunc(controllers, "+") 311 case Disable: 312 controllers = toggleFunc(controllers, "-") 313 } 314 _, err = f.WriteString(strings.Join(controllers, " ")) 315 return err 316 } 317 318 func (c *Manager) NewChild(name string, resources *Resources) (*Manager, error) { 319 if strings.HasPrefix(name, "/") { 320 return nil, errors.New("name must be relative") 321 } 322 path := filepath.Join(c.path, name) 323 if err := os.MkdirAll(path, defaultDirPerm); err != nil { 324 return nil, err 325 } 326 m := Manager{ 327 unifiedMountpoint: c.unifiedMountpoint, 328 path: path, 329 } 330 if resources != nil { 331 if err := m.ToggleControllers(resources.EnabledControllers(), Enable); err != nil { 332 // clean up cgroup dir on failure 333 os.Remove(path) 334 return nil, err 335 } 336 } 337 if err := setResources(path, resources); err != nil { 338 // clean up cgroup dir on failure 339 os.Remove(path) 340 return nil, err 341 } 342 return &m, nil 343 } 344 345 func (c *Manager) AddProc(pid uint64) error { 346 v := Value{ 347 filename: cgroupProcs, 348 value: pid, 349 } 350 return writeValues(c.path, []Value{v}) 351 } 352 353 func (c *Manager) AddThread(tid uint64) error { 354 v := Value{ 355 filename: cgroupThreads, 356 value: tid, 357 } 358 return writeValues(c.path, []Value{v}) 359 } 360 361 // Kill will try to forcibly exit all of the processes in the cgroup. This is 362 // equivalent to sending a SIGKILL to every process. On kernels 5.14 and greater 363 // this will use the cgroup.kill file, on anything that doesn't have the cgroup.kill 364 // file, a manual process of freezing -> sending a SIGKILL to every process -> thawing 365 // will be used. 366 func (c *Manager) Kill() error { 367 v := Value{ 368 filename: killFile, 369 value: "1", 370 } 371 err := writeValues(c.path, []Value{v}) 372 if err == nil { 373 return nil 374 } 375 logrus.Warnf("falling back to slower kill implementation: %s", err) 376 // Fallback to slow method. 377 return c.fallbackKill() 378 } 379 380 // fallbackKill is a slower fallback to the more modern (kernels 5.14+) 381 // approach of writing to the cgroup.kill file. This is heavily pulled 382 // from runc's same approach (in signalAllProcesses), with the only differences 383 // being this is just tailored to the API exposed in this library, and we don't 384 // need to care about signals other than SIGKILL. 385 // 386 // https://github.com/opencontainers/runc/blob/8da0a0b5675764feaaaaad466f6567a9983fcd08/libcontainer/init_linux.go#L523-L529 387 func (c *Manager) fallbackKill() error { 388 if err := c.Freeze(); err != nil { 389 logrus.Warn(err) 390 } 391 pids, err := c.Procs(true) 392 if err != nil { 393 if err := c.Thaw(); err != nil { 394 logrus.Warn(err) 395 } 396 return err 397 } 398 var procs []*os.Process 399 for _, pid := range pids { 400 p, err := os.FindProcess(int(pid)) 401 if err != nil { 402 logrus.Warn(err) 403 continue 404 } 405 procs = append(procs, p) 406 if err := p.Signal(unix.SIGKILL); err != nil { 407 logrus.Warn(err) 408 } 409 } 410 if err := c.Thaw(); err != nil { 411 logrus.Warn(err) 412 } 413 414 subreaper, err := getSubreaper() 415 if err != nil { 416 // The error here means that PR_GET_CHILD_SUBREAPER is not 417 // supported because this code might run on a kernel older 418 // than 3.4. We don't want to throw an error in that case, 419 // and we simplify things, considering there is no subreaper 420 // set. 421 subreaper = 0 422 } 423 424 for _, p := range procs { 425 // In case a subreaper has been setup, this code must not 426 // wait for the process. Otherwise, we cannot be sure the 427 // current process will be reaped by the subreaper, while 428 // the subreaper might be waiting for this process in order 429 // to retrieve its exit code. 430 if subreaper == 0 { 431 if _, err := p.Wait(); err != nil { 432 if !errors.Is(err, unix.ECHILD) { 433 logrus.Warnf("wait on pid %d failed: %s", p.Pid, err) 434 } 435 } 436 } 437 } 438 return nil 439 } 440 441 func (c *Manager) Delete() error { 442 // kernel prevents cgroups with running process from being removed, check the tree is empty 443 processes, err := c.Procs(true) 444 if err != nil { 445 return err 446 } 447 if len(processes) > 0 { 448 return fmt.Errorf("cgroups: unable to remove path %q: still contains running processes", c.path) 449 } 450 return remove(c.path) 451 } 452 453 func (c *Manager) Procs(recursive bool) ([]uint64, error) { 454 var processes []uint64 455 err := filepath.Walk(c.path, func(p string, info os.FileInfo, err error) error { 456 if err != nil { 457 return err 458 } 459 if !recursive && info.IsDir() { 460 if p == c.path { 461 return nil 462 } 463 return filepath.SkipDir 464 } 465 _, name := filepath.Split(p) 466 if name != cgroupProcs { 467 return nil 468 } 469 procs, err := parseCgroupProcsFile(p) 470 if err != nil { 471 return err 472 } 473 processes = append(processes, procs...) 474 return nil 475 }) 476 return processes, err 477 } 478 479 func (c *Manager) MoveTo(destination *Manager) error { 480 processes, err := c.Procs(true) 481 if err != nil { 482 return err 483 } 484 for _, p := range processes { 485 if err := destination.AddProc(p); err != nil { 486 if strings.Contains(err.Error(), "no such process") { 487 continue 488 } 489 return err 490 } 491 } 492 return nil 493 } 494 495 var singleValueFiles = []string{ 496 "pids.current", 497 "pids.max", 498 } 499 500 func (c *Manager) Stat() (*stats.Metrics, error) { 501 controllers, err := c.Controllers() 502 if err != nil { 503 return nil, err 504 } 505 out := make(map[string]interface{}) 506 for _, controller := range controllers { 507 switch controller { 508 case "cpu", "memory": 509 if err := readKVStatsFile(c.path, controller+".stat", out); err != nil { 510 if os.IsNotExist(err) { 511 continue 512 } 513 return nil, err 514 } 515 } 516 } 517 for _, name := range singleValueFiles { 518 if err := readSingleFile(c.path, name, out); err != nil { 519 if os.IsNotExist(err) { 520 continue 521 } 522 return nil, err 523 } 524 } 525 memoryEvents := make(map[string]interface{}) 526 if err := readKVStatsFile(c.path, "memory.events", memoryEvents); err != nil { 527 if !os.IsNotExist(err) { 528 return nil, err 529 } 530 } 531 var metrics stats.Metrics 532 533 metrics.Pids = &stats.PidsStat{ 534 Current: getPidValue("pids.current", out), 535 Limit: getPidValue("pids.max", out), 536 } 537 metrics.CPU = &stats.CPUStat{ 538 UsageUsec: getUint64Value("usage_usec", out), 539 UserUsec: getUint64Value("user_usec", out), 540 SystemUsec: getUint64Value("system_usec", out), 541 NrPeriods: getUint64Value("nr_periods", out), 542 NrThrottled: getUint64Value("nr_throttled", out), 543 ThrottledUsec: getUint64Value("throttled_usec", out), 544 } 545 metrics.Memory = &stats.MemoryStat{ 546 Anon: getUint64Value("anon", out), 547 File: getUint64Value("file", out), 548 KernelStack: getUint64Value("kernel_stack", out), 549 Slab: getUint64Value("slab", out), 550 Sock: getUint64Value("sock", out), 551 Shmem: getUint64Value("shmem", out), 552 FileMapped: getUint64Value("file_mapped", out), 553 FileDirty: getUint64Value("file_dirty", out), 554 FileWriteback: getUint64Value("file_writeback", out), 555 AnonThp: getUint64Value("anon_thp", out), 556 InactiveAnon: getUint64Value("inactive_anon", out), 557 ActiveAnon: getUint64Value("active_anon", out), 558 InactiveFile: getUint64Value("inactive_file", out), 559 ActiveFile: getUint64Value("active_file", out), 560 Unevictable: getUint64Value("unevictable", out), 561 SlabReclaimable: getUint64Value("slab_reclaimable", out), 562 SlabUnreclaimable: getUint64Value("slab_unreclaimable", out), 563 Pgfault: getUint64Value("pgfault", out), 564 Pgmajfault: getUint64Value("pgmajfault", out), 565 WorkingsetRefault: getUint64Value("workingset_refault", out), 566 WorkingsetActivate: getUint64Value("workingset_activate", out), 567 WorkingsetNodereclaim: getUint64Value("workingset_nodereclaim", out), 568 Pgrefill: getUint64Value("pgrefill", out), 569 Pgscan: getUint64Value("pgscan", out), 570 Pgsteal: getUint64Value("pgsteal", out), 571 Pgactivate: getUint64Value("pgactivate", out), 572 Pgdeactivate: getUint64Value("pgdeactivate", out), 573 Pglazyfree: getUint64Value("pglazyfree", out), 574 Pglazyfreed: getUint64Value("pglazyfreed", out), 575 ThpFaultAlloc: getUint64Value("thp_fault_alloc", out), 576 ThpCollapseAlloc: getUint64Value("thp_collapse_alloc", out), 577 Usage: getStatFileContentUint64(filepath.Join(c.path, "memory.current")), 578 UsageLimit: getStatFileContentUint64(filepath.Join(c.path, "memory.max")), 579 SwapUsage: getStatFileContentUint64(filepath.Join(c.path, "memory.swap.current")), 580 SwapLimit: getStatFileContentUint64(filepath.Join(c.path, "memory.swap.max")), 581 } 582 if len(memoryEvents) > 0 { 583 metrics.MemoryEvents = &stats.MemoryEvents{ 584 Low: getUint64Value("low", memoryEvents), 585 High: getUint64Value("high", memoryEvents), 586 Max: getUint64Value("max", memoryEvents), 587 Oom: getUint64Value("oom", memoryEvents), 588 OomKill: getUint64Value("oom_kill", memoryEvents), 589 } 590 } 591 metrics.Io = &stats.IOStat{Usage: readIoStats(c.path)} 592 metrics.Rdma = &stats.RdmaStat{ 593 Current: rdmaStats(filepath.Join(c.path, "rdma.current")), 594 Limit: rdmaStats(filepath.Join(c.path, "rdma.max")), 595 } 596 metrics.Hugetlb = readHugeTlbStats(c.path) 597 598 return &metrics, nil 599 } 600 601 func getUint64Value(key string, out map[string]interface{}) uint64 { 602 v, ok := out[key] 603 if !ok { 604 return 0 605 } 606 switch t := v.(type) { 607 case uint64: 608 return t 609 } 610 return 0 611 } 612 613 func getPidValue(key string, out map[string]interface{}) uint64 { 614 v, ok := out[key] 615 if !ok { 616 return 0 617 } 618 switch t := v.(type) { 619 case uint64: 620 return t 621 case string: 622 if t == "max" { 623 return math.MaxUint64 624 } 625 } 626 return 0 627 } 628 629 func readSingleFile(path string, file string, out map[string]interface{}) error { 630 f, err := os.Open(filepath.Join(path, file)) 631 if err != nil { 632 return err 633 } 634 defer f.Close() 635 data, err := io.ReadAll(f) 636 if err != nil { 637 return err 638 } 639 s := strings.TrimSpace(string(data)) 640 v, err := parseUint(s, 10, 64) 641 if err != nil { 642 // if we cannot parse as a uint, parse as a string 643 out[file] = s 644 return nil 645 } 646 out[file] = v 647 return nil 648 } 649 650 func readKVStatsFile(path string, file string, out map[string]interface{}) error { 651 f, err := os.Open(filepath.Join(path, file)) 652 if err != nil { 653 return err 654 } 655 defer f.Close() 656 657 s := bufio.NewScanner(f) 658 for s.Scan() { 659 name, value, err := parseKV(s.Text()) 660 if err != nil { 661 return fmt.Errorf("error while parsing %s (line=%q): %w", filepath.Join(path, file), s.Text(), err) 662 } 663 out[name] = value 664 } 665 return s.Err() 666 } 667 668 func (c *Manager) Freeze() error { 669 return c.freeze(c.path, Frozen) 670 } 671 672 func (c *Manager) Thaw() error { 673 return c.freeze(c.path, Thawed) 674 } 675 676 func (c *Manager) freeze(path string, state State) error { 677 values := state.Values() 678 for { 679 if err := writeValues(path, values); err != nil { 680 return err 681 } 682 current, err := fetchState(path) 683 if err != nil { 684 return err 685 } 686 if current == state { 687 return nil 688 } 689 time.Sleep(1 * time.Millisecond) 690 } 691 } 692 693 func (c *Manager) isCgroupEmpty() bool { 694 // In case of any error we return true so that we exit and don't leak resources 695 out := make(map[string]interface{}) 696 if err := readKVStatsFile(c.path, "cgroup.events", out); err != nil { 697 return true 698 } 699 if v, ok := out["populated"]; ok { 700 populated, ok := v.(uint64) 701 if !ok { 702 return true 703 } 704 return populated == 0 705 } 706 return true 707 } 708 709 // MemoryEventFD returns inotify file descriptor and 'memory.events' inotify watch descriptor 710 func (c *Manager) MemoryEventFD() (int, uint32, error) { 711 fpath := filepath.Join(c.path, "memory.events") 712 fd, err := syscall.InotifyInit() 713 if err != nil { 714 return 0, 0, errors.New("failed to create inotify fd") 715 } 716 wd, err := syscall.InotifyAddWatch(fd, fpath, unix.IN_MODIFY) 717 if err != nil { 718 syscall.Close(fd) 719 return 0, 0, fmt.Errorf("failed to add inotify watch for %q: %w", fpath, err) 720 } 721 // monitor to detect process exit/cgroup deletion 722 evpath := filepath.Join(c.path, "cgroup.events") 723 if _, err = syscall.InotifyAddWatch(fd, evpath, unix.IN_MODIFY); err != nil { 724 syscall.Close(fd) 725 return 0, 0, fmt.Errorf("failed to add inotify watch for %q: %w", evpath, err) 726 } 727 728 return fd, uint32(wd), nil 729 } 730 731 func (c *Manager) EventChan() (<-chan Event, <-chan error) { 732 ec := make(chan Event) 733 errCh := make(chan error, 1) 734 go c.waitForEvents(ec, errCh) 735 736 return ec, errCh 737 } 738 739 func parseMemoryEvents(out map[string]interface{}) (Event, error) { 740 e := Event{} 741 if v, ok := out["high"]; ok { 742 e.High, ok = v.(uint64) 743 if !ok { 744 return Event{}, fmt.Errorf("cannot convert high to uint64: %+v", v) 745 } 746 } 747 if v, ok := out["low"]; ok { 748 e.Low, ok = v.(uint64) 749 if !ok { 750 return Event{}, fmt.Errorf("cannot convert low to uint64: %+v", v) 751 } 752 } 753 if v, ok := out["max"]; ok { 754 e.Max, ok = v.(uint64) 755 if !ok { 756 return Event{}, fmt.Errorf("cannot convert max to uint64: %+v", v) 757 } 758 } 759 if v, ok := out["oom"]; ok { 760 e.OOM, ok = v.(uint64) 761 if !ok { 762 return Event{}, fmt.Errorf("cannot convert oom to uint64: %+v", v) 763 } 764 } 765 if v, ok := out["oom_kill"]; ok { 766 e.OOMKill, ok = v.(uint64) 767 if !ok { 768 return Event{}, fmt.Errorf("cannot convert oom_kill to uint64: %+v", v) 769 } 770 } 771 return e, nil 772 } 773 774 func (c *Manager) waitForEvents(ec chan<- Event, errCh chan<- error) { 775 defer close(errCh) 776 777 fd, _, err := c.MemoryEventFD() 778 if err != nil { 779 errCh <- err 780 return 781 } 782 defer syscall.Close(fd) 783 784 for { 785 buffer := make([]byte, syscall.SizeofInotifyEvent*10) 786 bytesRead, err := syscall.Read(fd, buffer) 787 if err != nil { 788 errCh <- err 789 return 790 } 791 if bytesRead >= syscall.SizeofInotifyEvent { 792 out := make(map[string]interface{}) 793 if err := readKVStatsFile(c.path, "memory.events", out); err != nil { 794 // When cgroup is deleted read may return -ENODEV instead of -ENOENT from open. 795 if _, statErr := os.Lstat(filepath.Join(c.path, "memory.events")); !os.IsNotExist(statErr) { 796 errCh <- err 797 } 798 return 799 } 800 e, err := parseMemoryEvents(out) 801 if err != nil { 802 errCh <- err 803 return 804 } 805 ec <- e 806 if c.isCgroupEmpty() { 807 return 808 } 809 } 810 } 811 } 812 813 func setDevices(path string, devices []specs.LinuxDeviceCgroup) error { 814 if len(devices) == 0 { 815 return nil 816 } 817 insts, license, err := DeviceFilter(devices) 818 if err != nil { 819 return err 820 } 821 dirFD, err := unix.Open(path, unix.O_DIRECTORY|unix.O_RDONLY|unix.O_CLOEXEC, 0600) 822 if err != nil { 823 return fmt.Errorf("cannot get dir FD for %s", path) 824 } 825 defer unix.Close(dirFD) 826 if _, err := LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil { 827 if !canSkipEBPFError(devices) { 828 return err 829 } 830 } 831 return nil 832 } 833 834 // getSystemdFullPath returns the full systemd path when creating a systemd slice group. 835 // the reason this is necessary is because the "-" character has a special meaning in 836 // systemd slice. For example, when creating a slice called "my-group-112233.slice", 837 // systemd will create a hierarchy like this: 838 // 839 // /sys/fs/cgroup/my.slice/my-group.slice/my-group-112233.slice 840 func getSystemdFullPath(slice, group string) string { 841 return filepath.Join(defaultCgroup2Path, dashesToPath(slice), dashesToPath(group)) 842 } 843 844 // dashesToPath converts a slice name with dashes to it's corresponding systemd filesystem path. 845 func dashesToPath(in string) string { 846 path := "" 847 if strings.HasSuffix(in, ".slice") && strings.Contains(in, "-") { 848 parts := strings.Split(in, "-") 849 for i := range parts { 850 s := strings.Join(parts[0:i+1], "-") 851 if !strings.HasSuffix(s, ".slice") { 852 s += ".slice" 853 } 854 path = filepath.Join(path, s) 855 } 856 } else { 857 path = filepath.Join(path, in) 858 } 859 return path 860 } 861 862 func NewSystemd(slice, group string, pid int, resources *Resources) (*Manager, error) { 863 if slice == "" { 864 slice = defaultSlice 865 } 866 ctx := context.TODO() 867 path := getSystemdFullPath(slice, group) 868 conn, err := systemdDbus.NewWithContext(ctx) 869 if err != nil { 870 return &Manager{}, err 871 } 872 defer conn.Close() 873 874 properties := []systemdDbus.Property{ 875 systemdDbus.PropDescription("cgroup " + group), 876 newSystemdProperty("DefaultDependencies", false), 877 newSystemdProperty("MemoryAccounting", true), 878 newSystemdProperty("CPUAccounting", true), 879 newSystemdProperty("IOAccounting", true), 880 } 881 882 // if we create a slice, the parent is defined via a Wants= 883 if strings.HasSuffix(group, ".slice") { 884 properties = append(properties, systemdDbus.PropWants(defaultSlice)) 885 } else { 886 // otherwise, we use Slice= 887 properties = append(properties, systemdDbus.PropSlice(defaultSlice)) 888 } 889 890 // only add pid if its valid, -1 is used w/ general slice creation. 891 if pid != -1 { 892 properties = append(properties, newSystemdProperty("PIDs", []uint32{uint32(pid)})) 893 } 894 895 if resources.Memory != nil && resources.Memory.Min != nil && *resources.Memory.Min != 0 { 896 properties = append(properties, 897 newSystemdProperty("MemoryMin", uint64(*resources.Memory.Min))) 898 } 899 900 if resources.Memory != nil && resources.Memory.Max != nil && *resources.Memory.Max != 0 { 901 properties = append(properties, 902 newSystemdProperty("MemoryMax", uint64(*resources.Memory.Max))) 903 } 904 905 if resources.CPU != nil && resources.CPU.Weight != nil && *resources.CPU.Weight != 0 { 906 properties = append(properties, 907 newSystemdProperty("CPUWeight", *resources.CPU.Weight)) 908 } 909 910 if resources.CPU != nil && resources.CPU.Max != "" { 911 quota, period := resources.CPU.Max.extractQuotaAndPeriod() 912 // cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd. 913 // corresponds to USEC_INFINITY in systemd 914 // if USEC_INFINITY is provided, CPUQuota is left unbound by systemd 915 // always setting a property value ensures we can apply a quota and remove it later 916 cpuQuotaPerSecUSec := uint64(math.MaxUint64) 917 if quota > 0 { 918 // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota 919 // (integer percentage of CPU) internally. This means that if a fractional percent of 920 // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest 921 // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. 922 cpuQuotaPerSecUSec = uint64(quota*1000000) / period 923 if cpuQuotaPerSecUSec%10000 != 0 { 924 cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 925 } 926 } 927 properties = append(properties, 928 newSystemdProperty("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) 929 } 930 931 // If we can delegate, we add the property back in 932 if canDelegate { 933 properties = append(properties, newSystemdProperty("Delegate", true)) 934 } 935 936 if resources.Pids != nil && resources.Pids.Max > 0 { 937 properties = append(properties, 938 newSystemdProperty("TasksAccounting", true), 939 newSystemdProperty("TasksMax", uint64(resources.Pids.Max))) 940 } 941 942 statusChan := make(chan string, 1) 943 if _, err := conn.StartTransientUnitContext(ctx, group, "replace", properties, statusChan); err == nil { 944 select { 945 case <-statusChan: 946 case <-time.After(time.Second): 947 logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", group) 948 } 949 } else if !isUnitExists(err) { 950 return &Manager{}, err 951 } 952 953 return &Manager{ 954 path: path, 955 }, nil 956 } 957 958 func LoadSystemd(slice, group string) (*Manager, error) { 959 if slice == "" { 960 slice = defaultSlice 961 } 962 path := getSystemdFullPath(slice, group) 963 return &Manager{ 964 path: path, 965 }, nil 966 } 967 968 func (c *Manager) DeleteSystemd() error { 969 ctx := context.TODO() 970 conn, err := systemdDbus.NewWithContext(ctx) 971 if err != nil { 972 return err 973 } 974 defer conn.Close() 975 group := systemdUnitFromPath(c.path) 976 ch := make(chan string) 977 _, err = conn.StopUnitContext(ctx, group, "replace", ch) 978 if err != nil { 979 return err 980 } 981 <-ch 982 return nil 983 } 984 985 func newSystemdProperty(name string, units interface{}) systemdDbus.Property { 986 return systemdDbus.Property{ 987 Name: name, 988 Value: dbus.MakeVariant(units), 989 } 990 }