David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 1 | #!/usr/bin/env python3 |
| 2 | # |
| 3 | # Copyright (C) 2022 The Android Open Source Project |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | # you may not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
| 16 | |
| 17 | """ |
| 18 | Checks dwarf CFI (unwinding) information by comparing it to disassembly. |
| 19 | It is only a simple heuristic check of stack pointer adjustments. |
| 20 | Fully inferring CFI from disassembly is not possible in general. |
| 21 | """ |
| 22 | |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 23 | import os, re, subprocess, collections, pathlib, bisect, collections, sys |
| 24 | from dataclasses import dataclass |
| 25 | from functools import cache |
| 26 | from pathlib import Path |
| 27 | from typing import Any, List, Optional, Set, Tuple, Dict |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 28 | |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 29 | arch: str = "" |
| 30 | ARCHES = ["i386", "x86_64", "arm", "aarch64", "riscv64"] |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 31 | |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 32 | IGNORE : Dict[str, List[str]] = { |
| 33 | # Aligns stack. |
| 34 | "art_quick_osr_stub": ["i386"], |
| 35 | # Intermediate invalid CFI while loading all registers. |
| 36 | "art_quick_do_long_jump": ["x86_64"], |
| 37 | # Saves/restores SP in other register. |
| 38 | "art_quick_generic_jni_trampoline": ["arm", "i386", "x86_64"], |
| 39 | # Starts with non-zero offset at the start of the method. |
| 40 | "art_quick_throw_null_pointer_exception_from_signal": ["arm", "aarch64", "i386", "x86_64"], |
| 41 | # Pops stack without static control flow past the opcode. |
| 42 | "nterp_op_return": ["arm", "aarch64", "i386", "x86_64"], |
| 43 | } |
| 44 | |
| 45 | SP = {"arm": "SP", "aarch64": "WSP", "i386": "ESP", "x86_64": "RSP", "riscv64": "X2"} |
| 46 | INITIAL_OFFSET = {"i386": 4, "x86_64": 8} |
| 47 | |
| 48 | @cache |
| 49 | def get_inst_semantics(arch: str) -> List[Any]: |
| 50 | """ List of regex expressions for supported instructions and their behaviour """ |
| 51 | |
| 52 | rexprs = [] |
| 53 | def add(rexpr, adjust_offset=lambda m: 0, adjust_pc=None): |
| 54 | rexprs.append((re.compile(rexpr), adjust_offset, adjust_pc)) |
| 55 | if arch in ["i386", "x86_64"]: |
| 56 | ptr_size = {"i386": 4, "x86_64": 8}[arch] |
| 57 | add(r"push. .*", lambda m: ptr_size) |
| 58 | add(r"pop. .*", lambda m: -ptr_size) |
| 59 | add(r"sub. \$(\w+), (?:%esp|%rsp)", lambda m: int(m[1], 0)) |
| 60 | add(r"add. \$(\w+), (?:%esp|%rsp)", lambda m: -int(m[1], 0)) |
| 61 | add(r"call. (0x\w+) <.*", lambda m: ptr_size, adjust_pc=lambda m: int(m[1], 0)) |
| 62 | add(r"j[a-z]* (0x\w+) <.*", adjust_pc=lambda m: int(m[1], 0)) |
| 63 | if arch in ["arm", "aarch64"]: |
| 64 | add(r"sub sp,(?: sp,)? #(\w+)", lambda m: int(m[1], 0)) |
| 65 | add(r"add sp,(?: sp,)? #(\w+)", lambda m: -int(m[1], 0)) |
| 66 | add(r"str \w+, \[sp, #-(\d+)\]!", lambda m: int(m[1])) |
| 67 | add(r"ldr \w+, \[sp\], #(\d+)", lambda m: -int(m[1])) |
| 68 | add(r"stp \w+, \w+, \[sp, #-(\w+)\]!", lambda m: int(m[1], 0)) |
| 69 | add(r"ldp \w+, \w+, \[sp\], #(\w+)", lambda m: -int(m[1], 0)) |
| 70 | add(r"vpush \{([d0-9, ]*)\}", lambda m: 8 * len(m[1].split(","))) |
| 71 | add(r"vpop \{([d0-9, ]*)\}", lambda m: -8 * len(m[1].split(","))) |
| 72 | add(r"v?push(?:\.w)? \{([\w+, ]*)\}", lambda m: 4 * len(m[1].split(","))) |
| 73 | add(r"v?pop(?:\.w)? \{([\w+, ]*)\}", lambda m: -4 * len(m[1].split(","))) |
| 74 | add(r"cb\w* \w+, (0x\w+).*", adjust_pc=lambda m: int(m[1], 0)) |
| 75 | add(r"(?:b|bl|b\w\w) (0x\w+).*", adjust_pc=lambda m: int(m[1], 0)) |
| 76 | return rexprs |
| 77 | |
| 78 | @dataclass(frozen=True) |
| 79 | class Error(Exception): |
| 80 | address: int |
| 81 | message: str |
| 82 | |
| 83 | def get_arch(lib: pathlib.Path) -> str: |
| 84 | """ Get architecture of the given library based on the ELF header. """ |
| 85 | |
| 86 | proc = subprocess.run(["llvm-objdump", "--file-headers", lib], |
| 87 | encoding='utf-8', |
| 88 | capture_output=True, |
| 89 | check=True) |
| 90 | |
| 91 | m = re.search("^architecture: *(.*)$", proc.stdout, re.MULTILINE) |
| 92 | assert m, "Can not find ABI of ELF file " + str(lib) |
| 93 | assert m.group(1) in ARCHES, "Unknown arch: " + m.group(1) |
| 94 | return m.group(1) |
| 95 | |
| 96 | Source = collections.namedtuple("Source", ["pc", "file", "line", "flag"]) |
| 97 | |
| 98 | def get_src(lib: pathlib.Path) -> List[Source]: |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 99 | """ Get source-file and line-number for all hand-written assembly code. """ |
| 100 | |
| 101 | proc = subprocess.run(["llvm-dwarfdump", "--debug-line", lib], |
| 102 | encoding='utf-8', |
| 103 | capture_output=True, |
| 104 | check=True) |
| 105 | |
| 106 | section_re = re.compile("^debug_line\[0x[0-9a-f]+\]$", re.MULTILINE) |
| 107 | filename_re = re.compile('file_names\[ *(\d)+\]:\n\s*name: "(.*)"', re.MULTILINE) |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 108 | line_re = re.compile('0x([0-9a-f]{16}) +(\d+) +\d+ +(\d+)' # pc, line, column, file |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 109 | ' +\d+ +\d +(.*)') # isa, discriminator, flag |
| 110 | |
| 111 | results = [] |
| 112 | for section in section_re.split(proc.stdout): |
| 113 | files = {m[1]: m[2] for m in filename_re.finditer(section)} |
| 114 | if not any(f.endswith(".S") for f in files.values()): |
| 115 | continue |
| 116 | lines = line_re.findall(section) |
| 117 | results.extend([Source(int(a, 16), files[fn], l, fg) for a, l, fn, fg in lines]) |
| 118 | return sorted(filter(lambda line: "end_sequence" not in line.flag, results)) |
| 119 | |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 120 | Fde = collections.namedtuple("Fde", ["pc", "end", "data"]) |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 121 | |
| 122 | def get_fde(lib: pathlib.Path) -> List[Fde]: |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 123 | """ Get all FDE blocks (in dumped text-based format) """ |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 124 | |
| 125 | proc = subprocess.run(["llvm-dwarfdump", "--debug-frame", lib], |
| 126 | encoding='utf-8', |
| 127 | capture_output=True, |
| 128 | check=True) |
| 129 | |
| 130 | section_re = re.compile("\n(?! |\n)", re.MULTILINE) # New-line not followed by indent. |
| 131 | fda_re = re.compile(".* FDE .* pc=([0-9a-f]+)...([0-9a-f]+)") |
| 132 | |
| 133 | results = [] |
| 134 | for section in section_re.split(proc.stdout): |
| 135 | m = fda_re.match(section) |
| 136 | if m: |
| 137 | fde = Fde(int(m[1], 16), int(m[2], 16), section) |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 138 | if fde.pc != 0: |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 139 | results.append(fde) |
| 140 | return sorted(results) |
| 141 | |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 142 | Asm = collections.namedtuple("Asm", ["pc", "name", "data"]) |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 143 | |
| 144 | def get_asm(lib: pathlib.Path) -> List[Asm]: |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 145 | """ Get all ASM blocks (in dumped text-based format) """ |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 146 | |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 147 | proc = subprocess.run(["llvm-objdump", "--disassemble", "--no-show-raw-insn", lib], |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 148 | encoding='utf-8', |
| 149 | capture_output=True, |
| 150 | check=True) |
| 151 | |
| 152 | section_re = re.compile("\n(?! |\n)", re.MULTILINE) # New-line not followed by indent. |
| 153 | sym_re = re.compile("([0-9a-f]+) <(.+)>:") |
| 154 | |
| 155 | results = [] |
| 156 | for section in section_re.split(proc.stdout): |
| 157 | sym = sym_re.match(section) |
| 158 | if sym: |
| 159 | results.append(Asm(int(sym[1], 16), sym[2], section)) |
| 160 | return sorted(results) |
| 161 | |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 162 | Inst = collections.namedtuple("Inst", ["pc", "inst"]) |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 163 | |
| 164 | def get_instructions(asm: Asm) -> List[Inst]: |
| 165 | """ Extract individual instructions from disassembled code block """ |
| 166 | |
| 167 | data = re.sub(r"[ \t]+", " ", asm.data) |
| 168 | inst_re = re.compile(r"([0-9a-f]+): +(?:[0-9a-f]{2} +)*(.*)") |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 169 | return [Inst(int(pc, 16), inst) for pc, inst in inst_re.findall(data)] |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 170 | |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 171 | # PC -> CFA offset (stack size at given PC; None if it not just trivial SP+<integer>) |
| 172 | CfaOffsets = Dict[int, Optional[int]] |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 173 | |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 174 | def get_dwarf_cfa_offsets(insts: List[Inst], fde: Fde) -> CfaOffsets: |
| 175 | """ Get CFA offsets for all instructions from DWARF """ |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 176 | |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 177 | # Parse the CFA offset definitions from the FDE. |
| 178 | sp = SP[arch] |
| 179 | m = re.compile(r"(0x[0-9a-f]+): CFA=(\w*)([^:\n]*)").findall(fde.data) |
| 180 | cfa = collections.deque([(int(a, 0), int(o or "0") if r == sp else None) for a, r, o in m]) |
| 181 | if all(offset is None for add, offset in cfa): |
| 182 | # This would create result that never checks anything. |
| 183 | raise Error(insts[0].pc, "No trivial CFA offsets. Add function to IGNORE list?") |
| 184 | |
| 185 | # Create map from instruction PCs to corresponding CFA offsets. |
| 186 | offset: Optional[int] = INITIAL_OFFSET.get(arch, 0) |
| 187 | result: CfaOffsets = {} |
| 188 | for pc, inst in insts: |
| 189 | while cfa and cfa[0][0] <= pc: |
| 190 | offset = cfa.popleft()[1] |
| 191 | result[pc] = offset |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 192 | return result |
| 193 | |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 194 | def get_inferred_cfa_offsets(insts: List[Inst]) -> CfaOffsets: |
| 195 | """ Get CFA offsets for all instructions from static analysis """ |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 196 | |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 197 | rexprs = get_inst_semantics(arch) |
| 198 | offset: Optional[int] = INITIAL_OFFSET.get(arch, 0) |
| 199 | result: CfaOffsets = {} |
| 200 | for pc, inst in insts: |
| 201 | # Set current offset for PC, unless branch already set it. |
| 202 | offset = result.setdefault(pc, offset) |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 203 | |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 204 | # Adjust PC and offset based on the current instruction. |
| 205 | for rexpr, adjust_offset, adjust_pc in rexprs: |
| 206 | m = rexpr.fullmatch(inst) |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 207 | if m: |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 208 | new_offset = offset + adjust_offset(m) |
| 209 | if adjust_pc: |
| 210 | new_pc = adjust_pc(m) |
| 211 | if insts[0].pc <= new_pc <= insts[-1].pc: |
| 212 | if new_pc in result and result[new_pc] != new_offset: |
| 213 | raise Error(pc, "Inconsistent branch (old={} new={})" |
| 214 | .format(result[new_pc], new_offset)) |
| 215 | result[new_pc] = new_offset |
| 216 | else: |
| 217 | offset = new_offset |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 218 | break # First matched pattern wins. |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 219 | return result |
| 220 | |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 221 | def check_fde(fde: Fde, insts: List[Inst], srcs) -> None: |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 222 | """ Compare DWARF offsets to assembly-inferred offsets. Report differences. """ |
| 223 | |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 224 | dwarf_cfa_offsets = get_dwarf_cfa_offsets(insts, fde) |
| 225 | inferred_cfa_offsets = get_inferred_cfa_offsets(insts) |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 226 | |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 227 | for pc, inst in insts: |
| 228 | if dwarf_cfa_offsets[pc] is not None and dwarf_cfa_offsets[pc] != inferred_cfa_offsets[pc]: |
| 229 | if pc in srcs: # Only report if it maps to source code (not padding or literals). |
| 230 | for inst2 in insts: |
| 231 | print("0x{:08x} [{}]: dwarf={} inferred={} {}".format( |
| 232 | inst2.pc, srcs.get(inst2.pc, ""), |
| 233 | str(dwarf_cfa_offsets[inst2.pc]), str(inferred_cfa_offsets[inst2.pc]), |
| 234 | inst2.inst.strip())) |
| 235 | raise Error(pc, "DWARF offset does not match inferred offset") |
| 236 | |
| 237 | def check_lib(lib: pathlib.Path) -> int: |
| 238 | global arch |
| 239 | arch = get_arch(lib) |
| 240 | |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 241 | assert lib.exists() |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 242 | fdes = get_fde(lib) |
| 243 | asms = collections.deque(get_asm(lib)) |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 244 | srcs = {src.pc: src.file + ":" + src.line for src in get_src(lib)} |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 245 | seen = set() # Used to verify the we have covered all assembly source lines. |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 246 | fail = 0 |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 247 | |
| 248 | for fde in fdes: |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 249 | if fde.pc not in srcs: |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 250 | continue # Ignore if it is not hand-written assembly. |
| 251 | |
| 252 | # Assembly instructions (one FDE can cover several assembly chunks). |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 253 | all_insts, name = [], "" |
| 254 | while asms and asms[0].pc < fde.end: |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 255 | asm = asms.popleft() |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 256 | if asm.pc < fde.pc: |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 257 | continue |
| 258 | insts = get_instructions(asm) |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 259 | if any(asm.name.startswith(n) and arch in a for n, a in IGNORE.items()): |
| 260 | seen.update([inst.pc for inst in insts]) |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 261 | continue |
| 262 | all_insts.extend(insts) |
| 263 | name = name or asm.name |
| 264 | if not all_insts: |
| 265 | continue # No assembly |
| 266 | |
| 267 | # Compare DWARF data to assembly instructions |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 268 | try: |
| 269 | check_fde(fde, all_insts, srcs) |
| 270 | except Error as e: |
| 271 | print("0x{:08x} [{}]: ERROR in {}: {}\n" |
| 272 | .format(e.address, srcs.get(e.address, ""), name, e.message)) |
| 273 | fail += 1 |
| 274 | seen.update([inst.pc for inst in all_insts]) |
| 275 | for pc in sorted(set(srcs.keys()) - seen): |
| 276 | print("ERROR: Missing CFI for {:08x}: {}".format(pc, srcs[pc])) |
| 277 | fail += 1 |
| 278 | return fail |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 279 | |
| 280 | |
| 281 | def main(argv): |
| 282 | """ Check libraries provided on the command line, or use the default build output """ |
| 283 | |
| 284 | libs = argv[1:] |
| 285 | if not libs: |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 286 | apex = Path(os.environ["OUT"]) / "symbols/apex/" |
| 287 | libs = list(apex.glob("**/libart.so")) |
| 288 | assert libs, "Can not find any libart.so in " + str(apex) |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 289 | for lib in libs: |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 290 | fail = check_lib(pathlib.Path(lib)) |
| 291 | if fail > 0: |
| 292 | print(fail, "ERROR(s) in", str(lib)) |
| 293 | sys.exit(1) |
David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 294 | |
| 295 | if __name__ == "__main__": |
David Srbecky | c36a86c | 2023-05-11 23:14:21 +0100 | [diff] [blame] | 296 | main(sys.argv) |