memory.go (13738B)
1 /* 2 Copyright The containerd Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package cgroup1 18 19 import ( 20 "bufio" 21 "fmt" 22 "io" 23 "os" 24 "path/filepath" 25 "strconv" 26 "strings" 27 28 v1 "github.com/containerd/cgroups/v3/cgroup1/stats" 29 specs "github.com/opencontainers/runtime-spec/specs-go" 30 "golang.org/x/sys/unix" 31 ) 32 33 // MemoryEvent is an interface that V1 memory Cgroup notifications implement. Arg returns the 34 // file name whose fd should be written to "cgroups.event_control". EventFile returns the name of 35 // the file that supports the notification api e.g. "memory.usage_in_bytes". 36 type MemoryEvent interface { 37 Arg() string 38 EventFile() string 39 } 40 41 type memoryThresholdEvent struct { 42 threshold uint64 43 swap bool 44 } 45 46 // MemoryThresholdEvent returns a new [MemoryEvent] representing the memory threshold set. 47 // If swap is true, the event will be registered using memory.memsw.usage_in_bytes 48 func MemoryThresholdEvent(threshold uint64, swap bool) MemoryEvent { 49 return &memoryThresholdEvent{ 50 threshold, 51 swap, 52 } 53 } 54 55 func (m *memoryThresholdEvent) Arg() string { 56 return strconv.FormatUint(m.threshold, 10) 57 } 58 59 func (m *memoryThresholdEvent) EventFile() string { 60 if m.swap { 61 return "memory.memsw.usage_in_bytes" 62 } 63 return "memory.usage_in_bytes" 64 } 65 66 type oomEvent struct{} 67 68 // OOMEvent returns a new oom event to be used with RegisterMemoryEvent. 69 func OOMEvent() MemoryEvent { 70 return &oomEvent{} 71 } 72 73 func (oom *oomEvent) Arg() string { 74 return "" 75 } 76 77 func (oom *oomEvent) EventFile() string { 78 return "memory.oom_control" 79 } 80 81 type memoryPressureEvent struct { 82 pressureLevel MemoryPressureLevel 83 hierarchy EventNotificationMode 84 } 85 86 // MemoryPressureEvent returns a new [MemoryEvent] representing the memory pressure set. 87 func MemoryPressureEvent(pressureLevel MemoryPressureLevel, hierarchy EventNotificationMode) MemoryEvent { 88 return &memoryPressureEvent{ 89 pressureLevel, 90 hierarchy, 91 } 92 } 93 94 func (m *memoryPressureEvent) Arg() string { 95 return string(m.pressureLevel) + "," + string(m.hierarchy) 96 } 97 98 func (m *memoryPressureEvent) EventFile() string { 99 return "memory.pressure_level" 100 } 101 102 // MemoryPressureLevel corresponds to the memory pressure levels defined 103 // for memory cgroups. 104 type MemoryPressureLevel string 105 106 // The three memory pressure levels are as follows. 107 // - The "low" level means that the system is reclaiming memory for new 108 // allocations. Monitoring this reclaiming activity might be useful for 109 // maintaining cache level. Upon notification, the program (typically 110 // "Activity Manager") might analyze vmstat and act in advance (i.e. 111 // prematurely shutdown unimportant services). 112 // - The "medium" level means that the system is experiencing medium memory 113 // pressure, the system might be making swap, paging out active file caches, 114 // etc. Upon this event applications may decide to further analyze 115 // vmstat/zoneinfo/memcg or internal memory usage statistics and free any 116 // resources that can be easily reconstructed or re-read from a disk. 117 // - The "critical" level means that the system is actively thrashing, it is 118 // about to out of memory (OOM) or even the in-kernel OOM killer is on its 119 // way to trigger. Applications should do whatever they can to help the 120 // system. It might be too late to consult with vmstat or any other 121 // statistics, so it is advisable to take an immediate action. 122 // "https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt" Section 11 123 const ( 124 LowPressure MemoryPressureLevel = "low" 125 MediumPressure MemoryPressureLevel = "medium" 126 CriticalPressure MemoryPressureLevel = "critical" 127 ) 128 129 // EventNotificationMode corresponds to the notification modes 130 // for the memory cgroups pressure level notifications. 131 type EventNotificationMode string 132 133 // There are three optional modes that specify different propagation behavior: 134 // - "default": this is the default behavior specified above. This mode is the 135 // same as omitting the optional mode parameter, preserved by backwards 136 // compatibility. 137 // - "hierarchy": events always propagate up to the root, similar to the default 138 // behavior, except that propagation continues regardless of whether there are 139 // event listeners at each level, with the "hierarchy" mode. In the above 140 // example, groups A, B, and C will receive notification of memory pressure. 141 // - "local": events are pass-through, i.e. they only receive notifications when 142 // memory pressure is experienced in the memcg for which the notification is 143 // registered. In the above example, group C will receive notification if 144 // registered for "local" notification and the group experiences memory 145 // pressure. However, group B will never receive notification, regardless if 146 // there is an event listener for group C or not, if group B is registered for 147 // local notification. 148 // "https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt" Section 11 149 const ( 150 DefaultMode EventNotificationMode = "default" 151 LocalMode EventNotificationMode = "local" 152 HierarchyMode EventNotificationMode = "hierarchy" 153 ) 154 155 // NewMemory returns a Memory controller given the root folder of cgroups. 156 // It may optionally accept other configuration options, such as IgnoreModules(...) 157 func NewMemory(root string, options ...func(*memoryController)) *memoryController { 158 mc := &memoryController{ 159 root: filepath.Join(root, string(Memory)), 160 ignored: map[string]struct{}{}, 161 } 162 for _, opt := range options { 163 opt(mc) 164 } 165 return mc 166 } 167 168 // IgnoreModules configure the memory controller to not read memory metrics for some 169 // module names (e.g. passing "memsw" would avoid all the memory.memsw.* entries) 170 func IgnoreModules(names ...string) func(*memoryController) { 171 return func(mc *memoryController) { 172 for _, name := range names { 173 mc.ignored[name] = struct{}{} 174 } 175 } 176 } 177 178 // OptionalSwap allows the memory controller to not fail if cgroups is not accounting 179 // Swap memory (there are no memory.memsw.* entries) 180 func OptionalSwap() func(*memoryController) { 181 return func(mc *memoryController) { 182 _, err := os.Stat(filepath.Join(mc.root, "memory.memsw.usage_in_bytes")) 183 if os.IsNotExist(err) { 184 mc.ignored["memsw"] = struct{}{} 185 } 186 } 187 } 188 189 type memoryController struct { 190 root string 191 ignored map[string]struct{} 192 } 193 194 func (m *memoryController) Name() Name { 195 return Memory 196 } 197 198 func (m *memoryController) Path(path string) string { 199 return filepath.Join(m.root, path) 200 } 201 202 func (m *memoryController) Create(path string, resources *specs.LinuxResources) error { 203 if err := os.MkdirAll(m.Path(path), defaultDirPerm); err != nil { 204 return err 205 } 206 if resources.Memory == nil { 207 return nil 208 } 209 return m.set(path, getMemorySettings(resources)) 210 } 211 212 func (m *memoryController) Update(path string, resources *specs.LinuxResources) error { 213 if resources.Memory == nil { 214 return nil 215 } 216 g := func(v *int64) bool { 217 return v != nil && *v > 0 218 } 219 settings := getMemorySettings(resources) 220 if g(resources.Memory.Limit) && g(resources.Memory.Swap) { 221 // if the updated swap value is larger than the current memory limit set the swap changes first 222 // then set the memory limit as swap must always be larger than the current limit 223 current, err := readUint(filepath.Join(m.Path(path), "memory.limit_in_bytes")) 224 if err != nil { 225 return err 226 } 227 if current < uint64(*resources.Memory.Swap) { 228 settings[0], settings[1] = settings[1], settings[0] 229 } 230 } 231 return m.set(path, settings) 232 } 233 234 func (m *memoryController) Stat(path string, stats *v1.Metrics) error { 235 fMemStat, err := os.Open(filepath.Join(m.Path(path), "memory.stat")) 236 if err != nil { 237 return err 238 } 239 defer fMemStat.Close() 240 stats.Memory = &v1.MemoryStat{ 241 Usage: &v1.MemoryEntry{}, 242 Swap: &v1.MemoryEntry{}, 243 Kernel: &v1.MemoryEntry{}, 244 KernelTCP: &v1.MemoryEntry{}, 245 } 246 if err := m.parseStats(fMemStat, stats.Memory); err != nil { 247 return err 248 } 249 250 fMemOomControl, err := os.Open(filepath.Join(m.Path(path), "memory.oom_control")) 251 if err != nil { 252 return err 253 } 254 defer fMemOomControl.Close() 255 stats.MemoryOomControl = &v1.MemoryOomControl{} 256 if err := m.parseOomControlStats(fMemOomControl, stats.MemoryOomControl); err != nil { 257 return err 258 } 259 for _, t := range []struct { 260 module string 261 entry *v1.MemoryEntry 262 }{ 263 { 264 module: "", 265 entry: stats.Memory.Usage, 266 }, 267 { 268 module: "memsw", 269 entry: stats.Memory.Swap, 270 }, 271 { 272 module: "kmem", 273 entry: stats.Memory.Kernel, 274 }, 275 { 276 module: "kmem.tcp", 277 entry: stats.Memory.KernelTCP, 278 }, 279 } { 280 if _, ok := m.ignored[t.module]; ok { 281 continue 282 } 283 for _, tt := range []struct { 284 name string 285 value *uint64 286 }{ 287 { 288 name: "usage_in_bytes", 289 value: &t.entry.Usage, 290 }, 291 { 292 name: "max_usage_in_bytes", 293 value: &t.entry.Max, 294 }, 295 { 296 name: "failcnt", 297 value: &t.entry.Failcnt, 298 }, 299 { 300 name: "limit_in_bytes", 301 value: &t.entry.Limit, 302 }, 303 } { 304 parts := []string{"memory"} 305 if t.module != "" { 306 parts = append(parts, t.module) 307 } 308 parts = append(parts, tt.name) 309 v, err := readUint(filepath.Join(m.Path(path), strings.Join(parts, "."))) 310 if err != nil { 311 return err 312 } 313 *tt.value = v 314 } 315 } 316 return nil 317 } 318 319 func (m *memoryController) parseStats(r io.Reader, stat *v1.MemoryStat) error { 320 var ( 321 raw = make(map[string]uint64) 322 sc = bufio.NewScanner(r) 323 line int 324 ) 325 for sc.Scan() { 326 key, v, err := parseKV(sc.Text()) 327 if err != nil { 328 return fmt.Errorf("%d: %v", line, err) 329 } 330 raw[key] = v 331 line++ 332 } 333 if err := sc.Err(); err != nil { 334 return err 335 } 336 stat.Cache = raw["cache"] 337 stat.RSS = raw["rss"] 338 stat.RSSHuge = raw["rss_huge"] 339 stat.MappedFile = raw["mapped_file"] 340 stat.Dirty = raw["dirty"] 341 stat.Writeback = raw["writeback"] 342 stat.PgPgIn = raw["pgpgin"] 343 stat.PgPgOut = raw["pgpgout"] 344 stat.PgFault = raw["pgfault"] 345 stat.PgMajFault = raw["pgmajfault"] 346 stat.InactiveAnon = raw["inactive_anon"] 347 stat.ActiveAnon = raw["active_anon"] 348 stat.InactiveFile = raw["inactive_file"] 349 stat.ActiveFile = raw["active_file"] 350 stat.Unevictable = raw["unevictable"] 351 stat.HierarchicalMemoryLimit = raw["hierarchical_memory_limit"] 352 stat.HierarchicalSwapLimit = raw["hierarchical_memsw_limit"] 353 stat.TotalCache = raw["total_cache"] 354 stat.TotalRSS = raw["total_rss"] 355 stat.TotalRSSHuge = raw["total_rss_huge"] 356 stat.TotalMappedFile = raw["total_mapped_file"] 357 stat.TotalDirty = raw["total_dirty"] 358 stat.TotalWriteback = raw["total_writeback"] 359 stat.TotalPgPgIn = raw["total_pgpgin"] 360 stat.TotalPgPgOut = raw["total_pgpgout"] 361 stat.TotalPgFault = raw["total_pgfault"] 362 stat.TotalPgMajFault = raw["total_pgmajfault"] 363 stat.TotalInactiveAnon = raw["total_inactive_anon"] 364 stat.TotalActiveAnon = raw["total_active_anon"] 365 stat.TotalInactiveFile = raw["total_inactive_file"] 366 stat.TotalActiveFile = raw["total_active_file"] 367 stat.TotalUnevictable = raw["total_unevictable"] 368 return nil 369 } 370 371 func (m *memoryController) parseOomControlStats(r io.Reader, stat *v1.MemoryOomControl) error { 372 var ( 373 raw = make(map[string]uint64) 374 sc = bufio.NewScanner(r) 375 line int 376 ) 377 for sc.Scan() { 378 key, v, err := parseKV(sc.Text()) 379 if err != nil { 380 return fmt.Errorf("%d: %v", line, err) 381 } 382 raw[key] = v 383 line++ 384 } 385 if err := sc.Err(); err != nil { 386 return err 387 } 388 stat.OomKillDisable = raw["oom_kill_disable"] 389 stat.UnderOom = raw["under_oom"] 390 stat.OomKill = raw["oom_kill"] 391 return nil 392 } 393 394 func (m *memoryController) set(path string, settings []memorySettings) error { 395 for _, t := range settings { 396 if t.value != nil { 397 if err := os.WriteFile( 398 filepath.Join(m.Path(path), "memory."+t.name), 399 []byte(strconv.FormatInt(*t.value, 10)), 400 defaultFilePerm, 401 ); err != nil { 402 return err 403 } 404 } 405 } 406 return nil 407 } 408 409 type memorySettings struct { 410 name string 411 value *int64 412 } 413 414 func getMemorySettings(resources *specs.LinuxResources) []memorySettings { 415 mem := resources.Memory 416 var swappiness *int64 417 if mem.Swappiness != nil { 418 v := int64(*mem.Swappiness) 419 swappiness = &v 420 } 421 return []memorySettings{ 422 { 423 name: "limit_in_bytes", 424 value: mem.Limit, 425 }, 426 { 427 name: "soft_limit_in_bytes", 428 value: mem.Reservation, 429 }, 430 { 431 name: "memsw.limit_in_bytes", 432 value: mem.Swap, 433 }, 434 { 435 name: "kmem.limit_in_bytes", 436 value: mem.Kernel, 437 }, 438 { 439 name: "kmem.tcp.limit_in_bytes", 440 value: mem.KernelTCP, 441 }, 442 { 443 name: "oom_control", 444 value: getOomControlValue(mem), 445 }, 446 { 447 name: "swappiness", 448 value: swappiness, 449 }, 450 } 451 } 452 453 func getOomControlValue(mem *specs.LinuxMemory) *int64 { 454 if mem.DisableOOMKiller != nil && *mem.DisableOOMKiller { 455 i := int64(1) 456 return &i 457 } 458 return nil 459 } 460 461 func (m *memoryController) memoryEvent(path string, event MemoryEvent) (uintptr, error) { 462 root := m.Path(path) 463 efd, err := unix.Eventfd(0, unix.EFD_CLOEXEC) 464 if err != nil { 465 return 0, err 466 } 467 evtFile, err := os.Open(filepath.Join(root, event.EventFile())) 468 if err != nil { 469 unix.Close(efd) 470 return 0, err 471 } 472 defer evtFile.Close() 473 data := fmt.Sprintf("%d %d %s", efd, evtFile.Fd(), event.Arg()) 474 evctlPath := filepath.Join(root, "cgroup.event_control") 475 if err := os.WriteFile(evctlPath, []byte(data), 0700); err != nil { 476 unix.Close(efd) 477 return 0, err 478 } 479 return uintptr(efd), nil 480 }