Skip to content

Commit 813a5fb

Browse files
Use llvm-symbolizer's JSON output for symbolizing (#879)
In some edge cases (e.g. injected JIT symbols), function names can contain newlines. This breaks the llvm-symbolizer output parsing, and makes pprof hang. Conveniently, as of LLVM 13, llvm-symbolizer has a JSON output mode, which is robust against all kinds of weirdness like newlines. We can use this instead of the line-based parsing, and as a bonus we get much simpler handling of multiple frames in a stack, as the JSON output already returns these as an array. This also requires splitting the CODE and DATA processing into separate functions, since their JSON output is incompatible. For now, we keep the DATA output as before, a slightly hacky but functional concatenation of start + size, but this could be improved. Co-authored-by: Alexey Alexandrov <aalexand@users.noreply.github.com>
1 parent 304e4f0 commit 813a5fb

2 files changed

Lines changed: 63 additions & 71 deletions

File tree

internal/binutils/addr2liner_llvm.go

Lines changed: 57 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ package binutils
1616

1717
import (
1818
"bufio"
19+
"encoding/json"
1920
"fmt"
2021
"io"
2122
"os/exec"
@@ -37,6 +38,7 @@ type llvmSymbolizer struct {
3738
filename string
3839
rw lineReaderWriter
3940
base uint64
41+
isData bool
4042
}
4143

4244
type llvmSymbolizerJob struct {
@@ -76,7 +78,7 @@ func newLLVMSymbolizer(cmd, file string, base uint64, isData bool) (*llvmSymboli
7678
}
7779

7880
j := &llvmSymbolizerJob{
79-
cmd: exec.Command(cmd, "--inlining", "-demangle=false"),
81+
cmd: exec.Command(cmd, "--inlining", "-demangle=false", "--output-style=JSON"),
8082
symType: "CODE",
8183
}
8284
if isData {
@@ -102,63 +104,67 @@ func newLLVMSymbolizer(cmd, file string, base uint64, isData bool) (*llvmSymboli
102104
filename: file,
103105
rw: j,
104106
base: base,
107+
isData: isData,
105108
}
106109

107110
return a, nil
108111
}
109112

110-
// readFrame parses the llvm-symbolizer output for a single address. It
111-
// returns a populated plugin.Frame and whether it has reached the end of the
112-
// data.
113-
func (d *llvmSymbolizer) readFrame() (plugin.Frame, bool) {
114-
funcname, err := d.rw.readLine()
113+
// readDataFrames parses the llvm-symbolizer DATA output for a single address. It
114+
// returns a populated plugin.Frame array with a single entry.
115+
func (d *llvmSymbolizer) readDataFrames() ([]plugin.Frame, error) {
116+
line, err := d.rw.readLine()
115117
if err != nil {
116-
return plugin.Frame{}, true
118+
return nil, err
117119
}
118-
119-
switch funcname {
120-
case "":
121-
return plugin.Frame{}, true
122-
case "??":
123-
funcname = ""
120+
var frame struct {
121+
Address string `json:"Address"`
122+
ModuleName string `json:"ModuleName"`
123+
Data struct {
124+
Start string `json:"Start"`
125+
Size string `json:"Size"`
126+
Name string `json:"Name"`
127+
} `json:"Data"`
128+
}
129+
if err := json.Unmarshal([]byte(line), &frame); err != nil {
130+
return nil, err
131+
}
132+
// Match non-JSON output behaviour of stuffing the start/size into the filename of a single frame,
133+
// with the size being a decimal value.
134+
size, err := strconv.ParseInt(frame.Data.Size, 0, 0)
135+
if err != nil {
136+
return nil, err
124137
}
138+
var stack []plugin.Frame
139+
stack = append(stack, plugin.Frame{Func: frame.Data.Name, File: fmt.Sprintf("%s %d", frame.Data.Start, size)})
140+
return stack, nil
141+
}
125142

126-
fileline, err := d.rw.readLine()
143+
// readCodeFrames parses the llvm-symbolizer CODE output for a single address. It
144+
// returns a populated plugin.Frame array.
145+
func (d *llvmSymbolizer) readCodeFrames() ([]plugin.Frame, error) {
146+
line, err := d.rw.readLine()
127147
if err != nil {
128-
return plugin.Frame{Func: funcname}, true
129-
}
130-
131-
linenumber := 0
132-
columnnumber := 0
133-
// The llvm-symbolizer outputs the <file_name>:<line_number>:<column_number>.
134-
// When it cannot identify the source code location, it outputs "??:0:0".
135-
// Older versions output just the filename and line number, so we check for
136-
// both conditions here.
137-
if fileline == "??:0" || fileline == "??:0:0" {
138-
fileline = ""
139-
} else {
140-
switch split := strings.Split(fileline, ":"); len(split) {
141-
case 3:
142-
// filename:line:column
143-
if col, err := strconv.Atoi(split[2]); err == nil {
144-
columnnumber = col
145-
}
146-
fallthrough
147-
case 2:
148-
// filename:line
149-
if line, err := strconv.Atoi(split[1]); err == nil {
150-
linenumber = line
151-
}
152-
fallthrough
153-
case 1:
154-
// filename
155-
fileline = split[0]
156-
default:
157-
// Unrecognized, ignore
158-
}
159-
}
160-
161-
return plugin.Frame{Func: funcname, File: fileline, Line: linenumber, Column: columnnumber}, false
148+
return nil, err
149+
}
150+
var frame struct {
151+
Address string `json:"Address"`
152+
ModuleName string `json:"ModuleName"`
153+
Symbol []struct {
154+
Line int `json:"Line"`
155+
Column int `json:"Column"`
156+
FunctionName string `json:"FunctionName"`
157+
FileName string `json:"FileName"`
158+
} `json:"Symbol"`
159+
}
160+
if err := json.Unmarshal([]byte(line), &frame); err != nil {
161+
return nil, err
162+
}
163+
var stack []plugin.Frame
164+
for _, s := range frame.Symbol {
165+
stack = append(stack, plugin.Frame{Func: s.FunctionName, File: s.FileName, Line: s.Line, Column: s.Column})
166+
}
167+
return stack, nil
162168
}
163169

164170
// addrInfo returns the stack frame information for a specific program
@@ -170,18 +176,8 @@ func (d *llvmSymbolizer) addrInfo(addr uint64) ([]plugin.Frame, error) {
170176
if err := d.rw.write(fmt.Sprintf("%s 0x%x", d.filename, addr-d.base)); err != nil {
171177
return nil, err
172178
}
173-
174-
var stack []plugin.Frame
175-
for {
176-
frame, end := d.readFrame()
177-
if end {
178-
break
179-
}
180-
181-
if frame != (plugin.Frame{}) {
182-
stack = append(stack, frame)
183-
}
179+
if d.isData {
180+
return d.readDataFrames()
184181
}
185-
186-
return stack, nil
182+
return d.readCodeFrames()
187183
}

internal/binutils/testdata/fake-llvm-symbolizer

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,22 +22,18 @@ IFS=" "
2222
while read line; do
2323
# line has form:
2424
# kind filename 0xaddr   (kind is CODE or DATA)
25-
# Emit dummy output that matches llvm-symbolizer output format.
25+
# Emit dummy output that matches llvm-symbolizer JSON output format.
2626
set -- ${line}
2727
kind=$1
2828
fname=$2
2929
addr=$3
3030
case ${kind} in
3131
CODE)
32-
echo "Inlined_${addr}"
33-
echo "${fname}.h"
34-
echo "Func_${addr}"
35-
echo "${fname}.c:2:1"
36-
echo;;
32+
echo "{\"Address\":\"${addr}\",\"ModuleName\":\"${fname}\",\"Symbol\":[{\"Column\":0,\"FileName\":\"${fname}.h\",\"FunctionName\":\"Inlined_${addr}\",\"Line\":0},{\"Column\":1,\"FileName\":\"${fname}.c\",\"FunctionName\":\"Func_${addr}\",\"Line\":2}]}"
33+
;;
3734
DATA)
38-
echo "${fname}_${addr}"
39-
echo "${addr} 8"
40-
echo;;
41-
*) echo ${kind} ${fname} ${addr};;
35+
echo "{\"Address\":\"${addr}\",\"ModuleName\":\"${fname}\",\"Data\":{\"Name\":\"${fname}_${addr}\",\"Size\":\"0x8\",\"Start\":\"${addr}\"}}"
36+
;;
37+
*) exit 1;;
4238
esac
4339
done

0 commit comments

Comments
 (0)