blob: 36c96d7ea055e99fffaee19d77b8ca8d129a8833 [file] [log] [blame]
David Srbecky8bb486a2022-01-12 13:31:00 +00001#!/usr/bin/env python3
2#
3# Copyright (C) 2022 The Android Open Source Project
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16
17"""
18Checks dwarf CFI (unwinding) information by comparing it to disassembly.
19It is only a simple heuristic check of stack pointer adjustments.
20Fully inferring CFI from disassembly is not possible in general.
21"""
22
David Srbeckyc36a86c2023-05-11 23:14:21 +010023import os, re, subprocess, collections, pathlib, bisect, collections, sys
24from dataclasses import dataclass
25from functools import cache
26from pathlib import Path
27from typing import Any, List, Optional, Set, Tuple, Dict
David Srbecky8bb486a2022-01-12 13:31:00 +000028
David Srbeckyc36a86c2023-05-11 23:14:21 +010029arch: str = ""
30ARCHES = ["i386", "x86_64", "arm", "aarch64", "riscv64"]
David Srbecky8bb486a2022-01-12 13:31:00 +000031
David Srbeckyc36a86c2023-05-11 23:14:21 +010032IGNORE : Dict[str, List[str]] = {
33 # Aligns stack.
34 "art_quick_osr_stub": ["i386"],
35 # Intermediate invalid CFI while loading all registers.
36 "art_quick_do_long_jump": ["x86_64"],
37 # Saves/restores SP in other register.
38 "art_quick_generic_jni_trampoline": ["arm", "i386", "x86_64"],
39 # Starts with non-zero offset at the start of the method.
40 "art_quick_throw_null_pointer_exception_from_signal": ["arm", "aarch64", "i386", "x86_64"],
41 # Pops stack without static control flow past the opcode.
42 "nterp_op_return": ["arm", "aarch64", "i386", "x86_64"],
43}
44
45SP = {"arm": "SP", "aarch64": "WSP", "i386": "ESP", "x86_64": "RSP", "riscv64": "X2"}
46INITIAL_OFFSET = {"i386": 4, "x86_64": 8}
47
48@cache
49def get_inst_semantics(arch: str) -> List[Any]:
50 """ List of regex expressions for supported instructions and their behaviour """
51
52 rexprs = []
53 def add(rexpr, adjust_offset=lambda m: 0, adjust_pc=None):
54 rexprs.append((re.compile(rexpr), adjust_offset, adjust_pc))
55 if arch in ["i386", "x86_64"]:
56 ptr_size = {"i386": 4, "x86_64": 8}[arch]
57 add(r"push. .*", lambda m: ptr_size)
58 add(r"pop. .*", lambda m: -ptr_size)
59 add(r"sub. \$(\w+), (?:%esp|%rsp)", lambda m: int(m[1], 0))
60 add(r"add. \$(\w+), (?:%esp|%rsp)", lambda m: -int(m[1], 0))
61 add(r"call. (0x\w+) <.*", lambda m: ptr_size, adjust_pc=lambda m: int(m[1], 0))
62 add(r"j[a-z]* (0x\w+) <.*", adjust_pc=lambda m: int(m[1], 0))
63 if arch in ["arm", "aarch64"]:
64 add(r"sub sp,(?: sp,)? #(\w+)", lambda m: int(m[1], 0))
65 add(r"add sp,(?: sp,)? #(\w+)", lambda m: -int(m[1], 0))
66 add(r"str \w+, \[sp, #-(\d+)\]!", lambda m: int(m[1]))
67 add(r"ldr \w+, \[sp\], #(\d+)", lambda m: -int(m[1]))
68 add(r"stp \w+, \w+, \[sp, #-(\w+)\]!", lambda m: int(m[1], 0))
69 add(r"ldp \w+, \w+, \[sp\], #(\w+)", lambda m: -int(m[1], 0))
70 add(r"vpush \{([d0-9, ]*)\}", lambda m: 8 * len(m[1].split(",")))
71 add(r"vpop \{([d0-9, ]*)\}", lambda m: -8 * len(m[1].split(",")))
72 add(r"v?push(?:\.w)? \{([\w+, ]*)\}", lambda m: 4 * len(m[1].split(",")))
73 add(r"v?pop(?:\.w)? \{([\w+, ]*)\}", lambda m: -4 * len(m[1].split(",")))
74 add(r"cb\w* \w+, (0x\w+).*", adjust_pc=lambda m: int(m[1], 0))
75 add(r"(?:b|bl|b\w\w) (0x\w+).*", adjust_pc=lambda m: int(m[1], 0))
76 return rexprs
77
78@dataclass(frozen=True)
79class Error(Exception):
80 address: int
81 message: str
82
83def get_arch(lib: pathlib.Path) -> str:
84 """ Get architecture of the given library based on the ELF header. """
85
86 proc = subprocess.run(["llvm-objdump", "--file-headers", lib],
87 encoding='utf-8',
88 capture_output=True,
89 check=True)
90
91 m = re.search("^architecture: *(.*)$", proc.stdout, re.MULTILINE)
92 assert m, "Can not find ABI of ELF file " + str(lib)
93 assert m.group(1) in ARCHES, "Unknown arch: " + m.group(1)
94 return m.group(1)
95
96Source = collections.namedtuple("Source", ["pc", "file", "line", "flag"])
97
98def get_src(lib: pathlib.Path) -> List[Source]:
David Srbecky8bb486a2022-01-12 13:31:00 +000099 """ Get source-file and line-number for all hand-written assembly code. """
100
101 proc = subprocess.run(["llvm-dwarfdump", "--debug-line", lib],
102 encoding='utf-8',
103 capture_output=True,
104 check=True)
105
106 section_re = re.compile("^debug_line\[0x[0-9a-f]+\]$", re.MULTILINE)
107 filename_re = re.compile('file_names\[ *(\d)+\]:\n\s*name: "(.*)"', re.MULTILINE)
David Srbeckyc36a86c2023-05-11 23:14:21 +0100108 line_re = re.compile('0x([0-9a-f]{16}) +(\d+) +\d+ +(\d+)' # pc, line, column, file
David Srbecky8bb486a2022-01-12 13:31:00 +0000109 ' +\d+ +\d +(.*)') # isa, discriminator, flag
110
111 results = []
112 for section in section_re.split(proc.stdout):
113 files = {m[1]: m[2] for m in filename_re.finditer(section)}
114 if not any(f.endswith(".S") for f in files.values()):
115 continue
116 lines = line_re.findall(section)
117 results.extend([Source(int(a, 16), files[fn], l, fg) for a, l, fn, fg in lines])
118 return sorted(filter(lambda line: "end_sequence" not in line.flag, results))
119
David Srbeckyc36a86c2023-05-11 23:14:21 +0100120Fde = collections.namedtuple("Fde", ["pc", "end", "data"])
David Srbecky8bb486a2022-01-12 13:31:00 +0000121
122def get_fde(lib: pathlib.Path) -> List[Fde]:
David Srbeckyc36a86c2023-05-11 23:14:21 +0100123 """ Get all FDE blocks (in dumped text-based format) """
David Srbecky8bb486a2022-01-12 13:31:00 +0000124
125 proc = subprocess.run(["llvm-dwarfdump", "--debug-frame", lib],
126 encoding='utf-8',
127 capture_output=True,
128 check=True)
129
130 section_re = re.compile("\n(?! |\n)", re.MULTILINE) # New-line not followed by indent.
131 fda_re = re.compile(".* FDE .* pc=([0-9a-f]+)...([0-9a-f]+)")
132
133 results = []
134 for section in section_re.split(proc.stdout):
135 m = fda_re.match(section)
136 if m:
137 fde = Fde(int(m[1], 16), int(m[2], 16), section)
David Srbeckyc36a86c2023-05-11 23:14:21 +0100138 if fde.pc != 0:
David Srbecky8bb486a2022-01-12 13:31:00 +0000139 results.append(fde)
140 return sorted(results)
141
David Srbeckyc36a86c2023-05-11 23:14:21 +0100142Asm = collections.namedtuple("Asm", ["pc", "name", "data"])
David Srbecky8bb486a2022-01-12 13:31:00 +0000143
144def get_asm(lib: pathlib.Path) -> List[Asm]:
David Srbeckyc36a86c2023-05-11 23:14:21 +0100145 """ Get all ASM blocks (in dumped text-based format) """
David Srbecky8bb486a2022-01-12 13:31:00 +0000146
David Srbeckyc36a86c2023-05-11 23:14:21 +0100147 proc = subprocess.run(["llvm-objdump", "--disassemble", "--no-show-raw-insn", lib],
David Srbecky8bb486a2022-01-12 13:31:00 +0000148 encoding='utf-8',
149 capture_output=True,
150 check=True)
151
152 section_re = re.compile("\n(?! |\n)", re.MULTILINE) # New-line not followed by indent.
153 sym_re = re.compile("([0-9a-f]+) <(.+)>:")
154
155 results = []
156 for section in section_re.split(proc.stdout):
157 sym = sym_re.match(section)
158 if sym:
159 results.append(Asm(int(sym[1], 16), sym[2], section))
160 return sorted(results)
161
David Srbeckyc36a86c2023-05-11 23:14:21 +0100162Inst = collections.namedtuple("Inst", ["pc", "inst"])
David Srbecky8bb486a2022-01-12 13:31:00 +0000163
164def get_instructions(asm: Asm) -> List[Inst]:
165 """ Extract individual instructions from disassembled code block """
166
167 data = re.sub(r"[ \t]+", " ", asm.data)
168 inst_re = re.compile(r"([0-9a-f]+): +(?:[0-9a-f]{2} +)*(.*)")
David Srbeckyc36a86c2023-05-11 23:14:21 +0100169 return [Inst(int(pc, 16), inst) for pc, inst in inst_re.findall(data)]
David Srbecky8bb486a2022-01-12 13:31:00 +0000170
David Srbeckyc36a86c2023-05-11 23:14:21 +0100171# PC -> CFA offset (stack size at given PC; None if it not just trivial SP+<integer>)
172CfaOffsets = Dict[int, Optional[int]]
David Srbecky8bb486a2022-01-12 13:31:00 +0000173
David Srbeckyc36a86c2023-05-11 23:14:21 +0100174def get_dwarf_cfa_offsets(insts: List[Inst], fde: Fde) -> CfaOffsets:
175 """ Get CFA offsets for all instructions from DWARF """
David Srbecky8bb486a2022-01-12 13:31:00 +0000176
David Srbeckyc36a86c2023-05-11 23:14:21 +0100177 # Parse the CFA offset definitions from the FDE.
178 sp = SP[arch]
179 m = re.compile(r"(0x[0-9a-f]+): CFA=(\w*)([^:\n]*)").findall(fde.data)
180 cfa = collections.deque([(int(a, 0), int(o or "0") if r == sp else None) for a, r, o in m])
181 if all(offset is None for add, offset in cfa):
182 # This would create result that never checks anything.
183 raise Error(insts[0].pc, "No trivial CFA offsets. Add function to IGNORE list?")
184
185 # Create map from instruction PCs to corresponding CFA offsets.
186 offset: Optional[int] = INITIAL_OFFSET.get(arch, 0)
187 result: CfaOffsets = {}
188 for pc, inst in insts:
189 while cfa and cfa[0][0] <= pc:
190 offset = cfa.popleft()[1]
191 result[pc] = offset
David Srbecky8bb486a2022-01-12 13:31:00 +0000192 return result
193
David Srbeckyc36a86c2023-05-11 23:14:21 +0100194def get_inferred_cfa_offsets(insts: List[Inst]) -> CfaOffsets:
195 """ Get CFA offsets for all instructions from static analysis """
David Srbecky8bb486a2022-01-12 13:31:00 +0000196
David Srbeckyc36a86c2023-05-11 23:14:21 +0100197 rexprs = get_inst_semantics(arch)
198 offset: Optional[int] = INITIAL_OFFSET.get(arch, 0)
199 result: CfaOffsets = {}
200 for pc, inst in insts:
201 # Set current offset for PC, unless branch already set it.
202 offset = result.setdefault(pc, offset)
David Srbecky8bb486a2022-01-12 13:31:00 +0000203
David Srbeckyc36a86c2023-05-11 23:14:21 +0100204 # Adjust PC and offset based on the current instruction.
205 for rexpr, adjust_offset, adjust_pc in rexprs:
206 m = rexpr.fullmatch(inst)
David Srbecky8bb486a2022-01-12 13:31:00 +0000207 if m:
David Srbeckyc36a86c2023-05-11 23:14:21 +0100208 new_offset = offset + adjust_offset(m)
209 if adjust_pc:
210 new_pc = adjust_pc(m)
211 if insts[0].pc <= new_pc <= insts[-1].pc:
212 if new_pc in result and result[new_pc] != new_offset:
213 raise Error(pc, "Inconsistent branch (old={} new={})"
214 .format(result[new_pc], new_offset))
215 result[new_pc] = new_offset
216 else:
217 offset = new_offset
David Srbecky8bb486a2022-01-12 13:31:00 +0000218 break # First matched pattern wins.
David Srbecky8bb486a2022-01-12 13:31:00 +0000219 return result
220
David Srbeckyc36a86c2023-05-11 23:14:21 +0100221def check_fde(fde: Fde, insts: List[Inst], srcs) -> None:
David Srbecky8bb486a2022-01-12 13:31:00 +0000222 """ Compare DWARF offsets to assembly-inferred offsets. Report differences. """
223
David Srbeckyc36a86c2023-05-11 23:14:21 +0100224 dwarf_cfa_offsets = get_dwarf_cfa_offsets(insts, fde)
225 inferred_cfa_offsets = get_inferred_cfa_offsets(insts)
David Srbecky8bb486a2022-01-12 13:31:00 +0000226
David Srbeckyc36a86c2023-05-11 23:14:21 +0100227 for pc, inst in insts:
228 if dwarf_cfa_offsets[pc] is not None and dwarf_cfa_offsets[pc] != inferred_cfa_offsets[pc]:
229 if pc in srcs: # Only report if it maps to source code (not padding or literals).
230 for inst2 in insts:
231 print("0x{:08x} [{}]: dwarf={} inferred={} {}".format(
232 inst2.pc, srcs.get(inst2.pc, ""),
233 str(dwarf_cfa_offsets[inst2.pc]), str(inferred_cfa_offsets[inst2.pc]),
234 inst2.inst.strip()))
235 raise Error(pc, "DWARF offset does not match inferred offset")
236
237def check_lib(lib: pathlib.Path) -> int:
238 global arch
239 arch = get_arch(lib)
240
David Srbecky8bb486a2022-01-12 13:31:00 +0000241 assert lib.exists()
David Srbecky8bb486a2022-01-12 13:31:00 +0000242 fdes = get_fde(lib)
243 asms = collections.deque(get_asm(lib))
David Srbeckyc36a86c2023-05-11 23:14:21 +0100244 srcs = {src.pc: src.file + ":" + src.line for src in get_src(lib)}
David Srbecky8bb486a2022-01-12 13:31:00 +0000245 seen = set() # Used to verify the we have covered all assembly source lines.
David Srbeckyc36a86c2023-05-11 23:14:21 +0100246 fail = 0
David Srbecky8bb486a2022-01-12 13:31:00 +0000247
248 for fde in fdes:
David Srbeckyc36a86c2023-05-11 23:14:21 +0100249 if fde.pc not in srcs:
David Srbecky8bb486a2022-01-12 13:31:00 +0000250 continue # Ignore if it is not hand-written assembly.
251
252 # Assembly instructions (one FDE can cover several assembly chunks).
David Srbeckyc36a86c2023-05-11 23:14:21 +0100253 all_insts, name = [], ""
254 while asms and asms[0].pc < fde.end:
David Srbecky8bb486a2022-01-12 13:31:00 +0000255 asm = asms.popleft()
David Srbeckyc36a86c2023-05-11 23:14:21 +0100256 if asm.pc < fde.pc:
David Srbecky8bb486a2022-01-12 13:31:00 +0000257 continue
258 insts = get_instructions(asm)
David Srbeckyc36a86c2023-05-11 23:14:21 +0100259 if any(asm.name.startswith(n) and arch in a for n, a in IGNORE.items()):
260 seen.update([inst.pc for inst in insts])
David Srbecky8bb486a2022-01-12 13:31:00 +0000261 continue
262 all_insts.extend(insts)
263 name = name or asm.name
264 if not all_insts:
265 continue # No assembly
266
267 # Compare DWARF data to assembly instructions
David Srbeckyc36a86c2023-05-11 23:14:21 +0100268 try:
269 check_fde(fde, all_insts, srcs)
270 except Error as e:
271 print("0x{:08x} [{}]: ERROR in {}: {}\n"
272 .format(e.address, srcs.get(e.address, ""), name, e.message))
273 fail += 1
274 seen.update([inst.pc for inst in all_insts])
275 for pc in sorted(set(srcs.keys()) - seen):
276 print("ERROR: Missing CFI for {:08x}: {}".format(pc, srcs[pc]))
277 fail += 1
278 return fail
David Srbecky8bb486a2022-01-12 13:31:00 +0000279
280
281def main(argv):
282 """ Check libraries provided on the command line, or use the default build output """
283
284 libs = argv[1:]
285 if not libs:
David Srbeckyc36a86c2023-05-11 23:14:21 +0100286 apex = Path(os.environ["OUT"]) / "symbols/apex/"
287 libs = list(apex.glob("**/libart.so"))
288 assert libs, "Can not find any libart.so in " + str(apex)
David Srbecky8bb486a2022-01-12 13:31:00 +0000289 for lib in libs:
David Srbeckyc36a86c2023-05-11 23:14:21 +0100290 fail = check_lib(pathlib.Path(lib))
291 if fail > 0:
292 print(fail, "ERROR(s) in", str(lib))
293 sys.exit(1)
David Srbecky8bb486a2022-01-12 13:31:00 +0000294
295if __name__ == "__main__":
David Srbeckyc36a86c2023-05-11 23:14:21 +0100296 main(sys.argv)