kprobe.go (17858B)
1 package link 2 3 import ( 4 "bytes" 5 "crypto/rand" 6 "errors" 7 "fmt" 8 "os" 9 "path/filepath" 10 "runtime" 11 "strings" 12 "sync" 13 "syscall" 14 "unsafe" 15 16 "github.com/cilium/ebpf" 17 "github.com/cilium/ebpf/internal/sys" 18 "github.com/cilium/ebpf/internal/unix" 19 ) 20 21 var ( 22 kprobeEventsPath = filepath.Join(tracefsPath, "kprobe_events") 23 24 kprobeRetprobeBit = struct { 25 once sync.Once 26 value uint64 27 err error 28 }{} 29 ) 30 31 type probeType uint8 32 33 type probeArgs struct { 34 symbol, group, path string 35 offset, refCtrOffset, cookie uint64 36 pid int 37 ret bool 38 } 39 40 // KprobeOptions defines additional parameters that will be used 41 // when loading Kprobes. 42 type KprobeOptions struct { 43 // Arbitrary value that can be fetched from an eBPF program 44 // via `bpf_get_attach_cookie()`. 45 // 46 // Needs kernel 5.15+. 47 Cookie uint64 48 // Offset of the kprobe relative to the traced symbol. 49 // Can be used to insert kprobes at arbitrary offsets in kernel functions, 50 // e.g. in places where functions have been inlined. 51 Offset uint64 52 } 53 54 const ( 55 kprobeType probeType = iota 56 uprobeType 57 ) 58 59 func (pt probeType) String() string { 60 if pt == kprobeType { 61 return "kprobe" 62 } 63 return "uprobe" 64 } 65 66 func (pt probeType) EventsPath() string { 67 if pt == kprobeType { 68 return kprobeEventsPath 69 } 70 return uprobeEventsPath 71 } 72 73 func (pt probeType) PerfEventType(ret bool) perfEventType { 74 if pt == kprobeType { 75 if ret { 76 return kretprobeEvent 77 } 78 return kprobeEvent 79 } 80 if ret { 81 return uretprobeEvent 82 } 83 return uprobeEvent 84 } 85 86 func (pt probeType) RetprobeBit() (uint64, error) { 87 if pt == kprobeType { 88 return kretprobeBit() 89 } 90 return uretprobeBit() 91 } 92 93 // Kprobe attaches the given eBPF program to a perf event that fires when the 94 // given kernel symbol starts executing. See /proc/kallsyms for available 95 // symbols. For example, printk(): 96 // 97 // kp, err := Kprobe("printk", prog, nil) 98 // 99 // Losing the reference to the resulting Link (kp) will close the Kprobe 100 // and prevent further execution of prog. The Link must be Closed during 101 // program shutdown to avoid leaking system resources. 102 func Kprobe(symbol string, prog *ebpf.Program, opts *KprobeOptions) (Link, error) { 103 k, err := kprobe(symbol, prog, opts, false) 104 if err != nil { 105 return nil, err 106 } 107 108 lnk, err := attachPerfEvent(k, prog) 109 if err != nil { 110 k.Close() 111 return nil, err 112 } 113 114 return lnk, nil 115 } 116 117 // Kretprobe attaches the given eBPF program to a perf event that fires right 118 // before the given kernel symbol exits, with the function stack left intact. 119 // See /proc/kallsyms for available symbols. For example, printk(): 120 // 121 // kp, err := Kretprobe("printk", prog, nil) 122 // 123 // Losing the reference to the resulting Link (kp) will close the Kretprobe 124 // and prevent further execution of prog. The Link must be Closed during 125 // program shutdown to avoid leaking system resources. 126 func Kretprobe(symbol string, prog *ebpf.Program, opts *KprobeOptions) (Link, error) { 127 k, err := kprobe(symbol, prog, opts, true) 128 if err != nil { 129 return nil, err 130 } 131 132 lnk, err := attachPerfEvent(k, prog) 133 if err != nil { 134 k.Close() 135 return nil, err 136 } 137 138 return lnk, nil 139 } 140 141 // isValidKprobeSymbol implements the equivalent of a regex match 142 // against "^[a-zA-Z_][0-9a-zA-Z_.]*$". 143 func isValidKprobeSymbol(s string) bool { 144 if len(s) < 1 { 145 return false 146 } 147 148 for i, c := range []byte(s) { 149 switch { 150 case c >= 'a' && c <= 'z': 151 case c >= 'A' && c <= 'Z': 152 case c == '_': 153 case i > 0 && c >= '0' && c <= '9': 154 155 // Allow `.` in symbol name. GCC-compiled kernel may change symbol name 156 // to have a `.isra.$n` suffix, like `udp_send_skb.isra.52`. 157 // See: https://gcc.gnu.org/gcc-10/changes.html 158 case i > 0 && c == '.': 159 160 default: 161 return false 162 } 163 } 164 165 return true 166 } 167 168 // kprobe opens a perf event on the given symbol and attaches prog to it. 169 // If ret is true, create a kretprobe. 170 func kprobe(symbol string, prog *ebpf.Program, opts *KprobeOptions, ret bool) (*perfEvent, error) { 171 if symbol == "" { 172 return nil, fmt.Errorf("symbol name cannot be empty: %w", errInvalidInput) 173 } 174 if prog == nil { 175 return nil, fmt.Errorf("prog cannot be nil: %w", errInvalidInput) 176 } 177 if !isValidKprobeSymbol(symbol) { 178 return nil, fmt.Errorf("symbol '%s' must be a valid symbol in /proc/kallsyms: %w", symbol, errInvalidInput) 179 } 180 if prog.Type() != ebpf.Kprobe { 181 return nil, fmt.Errorf("eBPF program type %s is not a Kprobe: %w", prog.Type(), errInvalidInput) 182 } 183 184 args := probeArgs{ 185 pid: perfAllThreads, 186 symbol: symbol, 187 ret: ret, 188 } 189 190 if opts != nil { 191 args.cookie = opts.Cookie 192 args.offset = opts.Offset 193 } 194 195 // Use kprobe PMU if the kernel has it available. 196 tp, err := pmuKprobe(args) 197 if errors.Is(err, os.ErrNotExist) { 198 args.symbol = platformPrefix(symbol) 199 tp, err = pmuKprobe(args) 200 } 201 if err == nil { 202 return tp, nil 203 } 204 if err != nil && !errors.Is(err, ErrNotSupported) { 205 return nil, fmt.Errorf("creating perf_kprobe PMU: %w", err) 206 } 207 208 // Use tracefs if kprobe PMU is missing. 209 args.symbol = symbol 210 tp, err = tracefsKprobe(args) 211 if errors.Is(err, os.ErrNotExist) { 212 args.symbol = platformPrefix(symbol) 213 tp, err = tracefsKprobe(args) 214 } 215 if err != nil { 216 return nil, fmt.Errorf("creating trace event '%s' in tracefs: %w", symbol, err) 217 } 218 219 return tp, nil 220 } 221 222 // pmuKprobe opens a perf event based on the kprobe PMU. 223 // Returns os.ErrNotExist if the given symbol does not exist in the kernel. 224 func pmuKprobe(args probeArgs) (*perfEvent, error) { 225 return pmuProbe(kprobeType, args) 226 } 227 228 // pmuProbe opens a perf event based on a Performance Monitoring Unit. 229 // 230 // Requires at least a 4.17 kernel. 231 // e12f03d7031a "perf/core: Implement the 'perf_kprobe' PMU" 232 // 33ea4b24277b "perf/core: Implement the 'perf_uprobe' PMU" 233 // 234 // Returns ErrNotSupported if the kernel doesn't support perf_[k,u]probe PMU 235 func pmuProbe(typ probeType, args probeArgs) (*perfEvent, error) { 236 // Getting the PMU type will fail if the kernel doesn't support 237 // the perf_[k,u]probe PMU. 238 et, err := getPMUEventType(typ) 239 if err != nil { 240 return nil, err 241 } 242 243 var config uint64 244 if args.ret { 245 bit, err := typ.RetprobeBit() 246 if err != nil { 247 return nil, err 248 } 249 config |= 1 << bit 250 } 251 252 var ( 253 attr unix.PerfEventAttr 254 sp unsafe.Pointer 255 ) 256 switch typ { 257 case kprobeType: 258 // Create a pointer to a NUL-terminated string for the kernel. 259 sp, err = unsafeStringPtr(args.symbol) 260 if err != nil { 261 return nil, err 262 } 263 264 attr = unix.PerfEventAttr{ 265 // The minimum size required for PMU kprobes is PERF_ATTR_SIZE_VER1, 266 // since it added the config2 (Ext2) field. Use Ext2 as probe_offset. 267 Size: unix.PERF_ATTR_SIZE_VER1, 268 Type: uint32(et), // PMU event type read from sysfs 269 Ext1: uint64(uintptr(sp)), // Kernel symbol to trace 270 Ext2: args.offset, // Kernel symbol offset 271 Config: config, // Retprobe flag 272 } 273 case uprobeType: 274 sp, err = unsafeStringPtr(args.path) 275 if err != nil { 276 return nil, err 277 } 278 279 if args.refCtrOffset != 0 { 280 config |= args.refCtrOffset << uprobeRefCtrOffsetShift 281 } 282 283 attr = unix.PerfEventAttr{ 284 // The minimum size required for PMU uprobes is PERF_ATTR_SIZE_VER1, 285 // since it added the config2 (Ext2) field. The Size field controls the 286 // size of the internal buffer the kernel allocates for reading the 287 // perf_event_attr argument from userspace. 288 Size: unix.PERF_ATTR_SIZE_VER1, 289 Type: uint32(et), // PMU event type read from sysfs 290 Ext1: uint64(uintptr(sp)), // Uprobe path 291 Ext2: args.offset, // Uprobe offset 292 Config: config, // RefCtrOffset, Retprobe flag 293 } 294 } 295 296 rawFd, err := unix.PerfEventOpen(&attr, args.pid, 0, -1, unix.PERF_FLAG_FD_CLOEXEC) 297 298 // On some old kernels, kprobe PMU doesn't allow `.` in symbol names and 299 // return -EINVAL. Return ErrNotSupported to allow falling back to tracefs. 300 // https://github.com/torvalds/linux/blob/94710cac0ef4/kernel/trace/trace_kprobe.c#L340-L343 301 if errors.Is(err, unix.EINVAL) && strings.Contains(args.symbol, ".") { 302 return nil, fmt.Errorf("symbol '%s+%#x': older kernels don't accept dots: %w", args.symbol, args.offset, ErrNotSupported) 303 } 304 // Since commit 97c753e62e6c, ENOENT is correctly returned instead of EINVAL 305 // when trying to create a kretprobe for a missing symbol. Make sure ENOENT 306 // is returned to the caller. 307 if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.EINVAL) { 308 return nil, fmt.Errorf("symbol '%s+%#x' not found: %w", args.symbol, args.offset, os.ErrNotExist) 309 } 310 // Since commit ab105a4fb894, -EILSEQ is returned when a kprobe sym+offset is resolved 311 // to an invalid insn boundary. 312 if errors.Is(err, syscall.EILSEQ) { 313 return nil, fmt.Errorf("symbol '%s+%#x' not found (bad insn boundary): %w", args.symbol, args.offset, os.ErrNotExist) 314 } 315 // Since at least commit cb9a19fe4aa51, ENOTSUPP is returned 316 // when attempting to set a uprobe on a trap instruction. 317 if errors.Is(err, unix.ENOTSUPP) { 318 return nil, fmt.Errorf("failed setting uprobe on offset %#x (possible trap insn): %w", args.offset, err) 319 } 320 if err != nil { 321 return nil, fmt.Errorf("opening perf event: %w", err) 322 } 323 324 // Ensure the string pointer is not collected before PerfEventOpen returns. 325 runtime.KeepAlive(sp) 326 327 fd, err := sys.NewFD(rawFd) 328 if err != nil { 329 return nil, err 330 } 331 332 // Kernel has perf_[k,u]probe PMU available, initialize perf event. 333 return &perfEvent{ 334 typ: typ.PerfEventType(args.ret), 335 name: args.symbol, 336 pmuID: et, 337 cookie: args.cookie, 338 fd: fd, 339 }, nil 340 } 341 342 // tracefsKprobe creates a Kprobe tracefs entry. 343 func tracefsKprobe(args probeArgs) (*perfEvent, error) { 344 return tracefsProbe(kprobeType, args) 345 } 346 347 // tracefsProbe creates a trace event by writing an entry to <tracefs>/[k,u]probe_events. 348 // A new trace event group name is generated on every call to support creating 349 // multiple trace events for the same kernel or userspace symbol. 350 // Path and offset are only set in the case of uprobe(s) and are used to set 351 // the executable/library path on the filesystem and the offset where the probe is inserted. 352 // A perf event is then opened on the newly-created trace event and returned to the caller. 353 func tracefsProbe(typ probeType, args probeArgs) (_ *perfEvent, err error) { 354 // Generate a random string for each trace event we attempt to create. 355 // This value is used as the 'group' token in tracefs to allow creating 356 // multiple kprobe trace events with the same name. 357 group, err := randomGroup("ebpf") 358 if err != nil { 359 return nil, fmt.Errorf("randomizing group name: %w", err) 360 } 361 args.group = group 362 363 // Before attempting to create a trace event through tracefs, 364 // check if an event with the same group and name already exists. 365 // Kernels 4.x and earlier don't return os.ErrExist on writing a duplicate 366 // entry, so we need to rely on reads for detecting uniqueness. 367 _, err = getTraceEventID(group, args.symbol) 368 if err == nil { 369 return nil, fmt.Errorf("trace event already exists: %s/%s", group, args.symbol) 370 } 371 if err != nil && !errors.Is(err, os.ErrNotExist) { 372 return nil, fmt.Errorf("checking trace event %s/%s: %w", group, args.symbol, err) 373 } 374 375 // Create the [k,u]probe trace event using tracefs. 376 if err := createTraceFSProbeEvent(typ, args); err != nil { 377 return nil, fmt.Errorf("creating probe entry on tracefs: %w", err) 378 } 379 defer func() { 380 if err != nil { 381 // Make sure we clean up the created tracefs event when we return error. 382 // If a livepatch handler is already active on the symbol, the write to 383 // tracefs will succeed, a trace event will show up, but creating the 384 // perf event will fail with EBUSY. 385 _ = closeTraceFSProbeEvent(typ, args.group, args.symbol) 386 } 387 }() 388 389 // Get the newly-created trace event's id. 390 tid, err := getTraceEventID(group, args.symbol) 391 if err != nil { 392 return nil, fmt.Errorf("getting trace event id: %w", err) 393 } 394 395 // Kprobes are ephemeral tracepoints and share the same perf event type. 396 fd, err := openTracepointPerfEvent(tid, args.pid) 397 if err != nil { 398 return nil, err 399 } 400 401 return &perfEvent{ 402 typ: typ.PerfEventType(args.ret), 403 group: group, 404 name: args.symbol, 405 tracefsID: tid, 406 cookie: args.cookie, 407 fd: fd, 408 }, nil 409 } 410 411 // createTraceFSProbeEvent creates a new ephemeral trace event by writing to 412 // <tracefs>/[k,u]probe_events. Returns os.ErrNotExist if symbol is not a valid 413 // kernel symbol, or if it is not traceable with kprobes. Returns os.ErrExist 414 // if a probe with the same group and symbol already exists. 415 func createTraceFSProbeEvent(typ probeType, args probeArgs) error { 416 // Open the kprobe_events file in tracefs. 417 f, err := os.OpenFile(typ.EventsPath(), os.O_APPEND|os.O_WRONLY, 0666) 418 if err != nil { 419 return fmt.Errorf("error opening '%s': %w", typ.EventsPath(), err) 420 } 421 defer f.Close() 422 423 var pe, token string 424 switch typ { 425 case kprobeType: 426 // The kprobe_events syntax is as follows (see Documentation/trace/kprobetrace.txt): 427 // p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe 428 // r[MAXACTIVE][:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe 429 // -:[GRP/]EVENT : Clear a probe 430 // 431 // Some examples: 432 // r:ebpf_1234/r_my_kretprobe nf_conntrack_destroy 433 // p:ebpf_5678/p_my_kprobe __x64_sys_execve 434 // 435 // Leaving the kretprobe's MAXACTIVE set to 0 (or absent) will make the 436 // kernel default to NR_CPUS. This is desired in most eBPF cases since 437 // subsampling or rate limiting logic can be more accurately implemented in 438 // the eBPF program itself. 439 // See Documentation/kprobes.txt for more details. 440 token = kprobeToken(args) 441 pe = fmt.Sprintf("%s:%s/%s %s", probePrefix(args.ret), args.group, sanitizeSymbol(args.symbol), token) 442 case uprobeType: 443 // The uprobe_events syntax is as follows: 444 // p[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a probe 445 // r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a return probe 446 // -:[GRP/]EVENT : Clear a probe 447 // 448 // Some examples: 449 // r:ebpf_1234/readline /bin/bash:0x12345 450 // p:ebpf_5678/main_mySymbol /bin/mybin:0x12345(0x123) 451 // 452 // See Documentation/trace/uprobetracer.txt for more details. 453 token = uprobeToken(args) 454 pe = fmt.Sprintf("%s:%s/%s %s", probePrefix(args.ret), args.group, args.symbol, token) 455 } 456 _, err = f.WriteString(pe) 457 // Since commit 97c753e62e6c, ENOENT is correctly returned instead of EINVAL 458 // when trying to create a kretprobe for a missing symbol. Make sure ENOENT 459 // is returned to the caller. 460 // EINVAL is also returned on pre-5.2 kernels when the `SYM[+offs]` token 461 // is resolved to an invalid insn boundary. 462 if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.EINVAL) { 463 return fmt.Errorf("token %s: %w", token, os.ErrNotExist) 464 } 465 // Since commit ab105a4fb894, -EILSEQ is returned when a kprobe sym+offset is resolved 466 // to an invalid insn boundary. 467 if errors.Is(err, syscall.EILSEQ) { 468 return fmt.Errorf("token %s: bad insn boundary: %w", token, os.ErrNotExist) 469 } 470 // ERANGE is returned when the `SYM[+offs]` token is too big and cannot 471 // be resolved. 472 if errors.Is(err, syscall.ERANGE) { 473 return fmt.Errorf("token %s: offset too big: %w", token, os.ErrNotExist) 474 } 475 if err != nil { 476 return fmt.Errorf("writing '%s' to '%s': %w", pe, typ.EventsPath(), err) 477 } 478 479 return nil 480 } 481 482 // closeTraceFSProbeEvent removes the [k,u]probe with the given type, group and symbol 483 // from <tracefs>/[k,u]probe_events. 484 func closeTraceFSProbeEvent(typ probeType, group, symbol string) error { 485 f, err := os.OpenFile(typ.EventsPath(), os.O_APPEND|os.O_WRONLY, 0666) 486 if err != nil { 487 return fmt.Errorf("error opening %s: %w", typ.EventsPath(), err) 488 } 489 defer f.Close() 490 491 // See [k,u]probe_events syntax above. The probe type does not need to be specified 492 // for removals. 493 pe := fmt.Sprintf("-:%s/%s", group, sanitizeSymbol(symbol)) 494 if _, err = f.WriteString(pe); err != nil { 495 return fmt.Errorf("writing '%s' to '%s': %w", pe, typ.EventsPath(), err) 496 } 497 498 return nil 499 } 500 501 // randomGroup generates a pseudorandom string for use as a tracefs group name. 502 // Returns an error when the output string would exceed 63 characters (kernel 503 // limitation), when rand.Read() fails or when prefix contains characters not 504 // allowed by isValidTraceID. 505 func randomGroup(prefix string) (string, error) { 506 if !isValidTraceID(prefix) { 507 return "", fmt.Errorf("prefix '%s' must be alphanumeric or underscore: %w", prefix, errInvalidInput) 508 } 509 510 b := make([]byte, 8) 511 if _, err := rand.Read(b); err != nil { 512 return "", fmt.Errorf("reading random bytes: %w", err) 513 } 514 515 group := fmt.Sprintf("%s_%x", prefix, b) 516 if len(group) > 63 { 517 return "", fmt.Errorf("group name '%s' cannot be longer than 63 characters: %w", group, errInvalidInput) 518 } 519 520 return group, nil 521 } 522 523 func probePrefix(ret bool) string { 524 if ret { 525 return "r" 526 } 527 return "p" 528 } 529 530 // determineRetprobeBit reads a Performance Monitoring Unit's retprobe bit 531 // from /sys/bus/event_source/devices/<pmu>/format/retprobe. 532 func determineRetprobeBit(typ probeType) (uint64, error) { 533 p := filepath.Join("/sys/bus/event_source/devices/", typ.String(), "/format/retprobe") 534 535 data, err := os.ReadFile(p) 536 if err != nil { 537 return 0, err 538 } 539 540 var rp uint64 541 n, err := fmt.Sscanf(string(bytes.TrimSpace(data)), "config:%d", &rp) 542 if err != nil { 543 return 0, fmt.Errorf("parse retprobe bit: %w", err) 544 } 545 if n != 1 { 546 return 0, fmt.Errorf("parse retprobe bit: expected 1 item, got %d", n) 547 } 548 549 return rp, nil 550 } 551 552 func kretprobeBit() (uint64, error) { 553 kprobeRetprobeBit.once.Do(func() { 554 kprobeRetprobeBit.value, kprobeRetprobeBit.err = determineRetprobeBit(kprobeType) 555 }) 556 return kprobeRetprobeBit.value, kprobeRetprobeBit.err 557 } 558 559 // kprobeToken creates the SYM[+offs] token for the tracefs api. 560 func kprobeToken(args probeArgs) string { 561 po := args.symbol 562 563 if args.offset != 0 { 564 po += fmt.Sprintf("+%#x", args.offset) 565 } 566 567 return po 568 }