CFI: Fix directives for x86 and ARM.

Also add support for multiple architectures to the
check_cfi.py script (x86 was not supported before).

Test: ./art/tools/check_cfi.py (on x86 and ARM targets)
Change-Id: Ida3e0d6468eb67316c156704679050763ab01160
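
check_cfi.py validates hand-written assembly by comparing the DWARF CFA offsets against
offsets inferred from a single forward pass over the disassembly, propagating the current
stack offset across forward branches. Moving CFI_REMEMBER_STATE in front of the conditional
branch and CFI_RESTORE_STATE_AND_DEF_CFA to the branch target keeps the DWARF state in step
with what that pass infers at the slow-path labels. A minimal sketch of the inference,
assuming i386 and a much-reduced pattern table (simplified from the get_inferred_cfa_offsets()
logic added to the script below; the names and error handling here are illustrative, not the
script's exact code):

    import re
    from typing import Dict, List, Tuple

    # Much-reduced, i386-only pattern table: (regex, stack-offset delta, branch target).
    PATTERNS = [
        (re.compile(r"push. .*"),            lambda m: 4,             None),
        (re.compile(r"pop. .*"),             lambda m: -4,            None),
        (re.compile(r"sub. \$(\w+), %esp"),  lambda m: int(m[1], 0),  None),
        (re.compile(r"add. \$(\w+), %esp"),  lambda m: -int(m[1], 0), None),
        (re.compile(r"j[a-z]* (0x\w+) <.*"), lambda m: 0,             lambda m: int(m[1], 0)),
    ]

    def infer_offsets(insts: List[Tuple[int, str]]) -> Dict[int, int]:
      """insts: (pc, disassembly) pairs; returns pc -> inferred CFA offset."""
      offset = 4  # i386: the return address is already on the stack at entry.
      result: Dict[int, int] = {}
      for pc, text in insts:
        offset = result.setdefault(pc, offset)  # a branch may have seeded this pc already
        for rexpr, adjust_offset, adjust_pc in PATTERNS:
          m = rexpr.fullmatch(text)
          if m:
            new_offset = offset + adjust_offset(m)
            if adjust_pc:  # branch: the target must see this offset
              target = adjust_pc(m)
              if result.setdefault(target, new_offset) != new_offset:
                raise ValueError(f"inconsistent CFA offset at {target:#x}")
            else:
              offset = new_offset
            break  # first matching pattern wins
      return result
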
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index f0b9a7a..f060411 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -152,11 +152,9 @@
   mov r1, r0                           // pass the result
   mov r0, rSELF                        // Thread::Current
   bl artDeoptimizeIfNeeded
-  CFI_REMEMBER_STATE
   RESTORE_SAVE_EVERYTHING_FRAME
   REFRESH_MARKING_REGISTER
   bx     lr
-  CFI_RESTORE_STATE_AND_DEF_CFA sp, FRAME_SIZE_SAVE_EVERYTHING
 .endm
 
 .macro DEOPT_OR_RESTORE_SAVE_EVERYTHING_FRAME_AND_RETURN_R0 temp, is_ref
@@ -2509,8 +2507,9 @@
 ENTRY art_quick_method_exit_hook
     SETUP_SAVE_EVERYTHING_FRAME r5
 
-    sub sp, #4                                @ align stack
+    INCREASE_FRAME 4                          @ align stack
     push {r2}                                 @ pass frame_size stack
+    .cfi_adjust_cfa_offset 4
     add r3, sp, #(8 + 8)                      @ store fpr_res pointer, in kSaveEverything frame
     add r2, sp, #(136 + 8)                    @ store gpr_res pointer, in kSaveEverything frame
     add r1, sp, #(FRAME_SIZE_SAVE_EVERYTHING + 8)   @ pass ArtMethod**
@@ -2518,7 +2517,7 @@
     blx artMethodExitHook                     @ (Thread*, ArtMethod**, gpr_res*, fpr_res*,
                                               @ frame_size)
 
-    add sp, #8                                @ pop arguments on stack
+    DECREASE_FRAME 8                          @ pop arguments on stack
     RESTORE_SAVE_EVERYTHING_FRAME
     REFRESH_MARKING_REGISTER
     blx lr
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index a931550..8f613d9 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -213,22 +213,20 @@
   mov x1, x0                        // pass the result
   mov x0, xSELF                     // Thread::Current
   bl artDeoptimizeIfNeeded
-  CFI_REMEMBER_STATE
   RESTORE_SAVE_EVERYTHING_FRAME
   REFRESH_MARKING_REGISTER
   ret
-  CFI_RESTORE_STATE_AND_DEF_CFA sp, FRAME_SIZE_SAVE_EVERYTHING
 .endm
 
 .macro DEOPT_OR_RESTORE_SAVE_EVERYTHING_FRAME_AND_RETURN_X0 temp, is_ref
   ldr \temp, [xSELF, #THREAD_DEOPT_CHECK_REQUIRED_OFFSET]
-  cbnz \temp, 2f
   CFI_REMEMBER_STATE
+  cbnz \temp, 2f
   RESTORE_SAVE_EVERYTHING_FRAME_KEEP_X0
   REFRESH_MARKING_REGISTER
   ret
-  CFI_RESTORE_STATE_AND_DEF_CFA sp, FRAME_SIZE_SAVE_EVERYTHING
 2:
+  CFI_RESTORE_STATE_AND_DEF_CFA sp, FRAME_SIZE_SAVE_EVERYTHING
   str x0, [sp, #SAVE_EVERYTHING_FRAME_X0_OFFSET] // update result in the frame
   mov x2, \is_ref                                // pass if result is a reference
   mov x1, x0                                     // pass the result
diff --git a/runtime/arch/x86/memcmp16_x86.S b/runtime/arch/x86/memcmp16_x86.S
index 636ceb9..45f647c 100644
--- a/runtime/arch/x86/memcmp16_x86.S
+++ b/runtime/arch/x86/memcmp16_x86.S
@@ -844,8 +844,11 @@
     subl       %ebx, %eax
     RETURN
 
-
-    CFI_PUSH (%ebx)
+    # Unreachable, but needed for static analysis in the check_cfi.py script,
+    # which does just a single forward pass, while the code below is only
+    # reachable via a backward branch.
+    CFI_DEF_CFA (esp, 4)
+    PUSH       (%ebx)
 
     .p2align 4
 L(more8bytes):
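
As the new comment says, the CFI_DEF_CFA/PUSH pair is never executed; it is there so that the
checker's forward pass, which falls through past RETURN, arrives at the following label with
the same stack offset that the backward branches from later in the function expect. Continuing
the toy infer_offsets() sketch from the commit message (addresses and mnemonics are made up;
this only illustrates the pass, not the real memcmp16 code):

    insts = [
        (0x00, "pushl %ebx"),            # CFA offset 4 -> 8
        (0x04, "popl %ebx"),             # 8 -> 4
        (0x05, "retl"),                  # fast-path epilogue; offset is 4 here
        (0x06, "pushl %ebx"),            # dead push: the fall-through view becomes 8 ...
        (0x07, "movl (%esi), %eax"),     # ... at this label (think L(more8bytes))
        (0x10, "jmp 0x7 <more8bytes>"),  # backward branch, arrives with offset 8
    ]
    print(infer_offsets(insts)[0x07])    # 8, both views agree

Remove the dead pushl at 0x06 and the fall-through view at 0x07 stays 4, so the backward
branch's 8 trips the consistency check; that is the mismatch the explicit directives above avoid.
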
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index b050822..bff674f 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -750,8 +750,7 @@
     PUSH ecx                                          // pass arg2
     PUSH eax                                          // pass arg1
     call CALLVAR(cxx_name)                            // cxx_name(arg1, arg2, arg3, Thread*)
-    addl LITERAL(16), %esp                            // pop arguments
-    CFI_ADJUST_CFA_OFFSET(-32)
+    DECREASE_FRAME 16                                 // pop arguments
     RESTORE_SAVE_REFS_ONLY_FRAME                      // restore frame up to return address
     CALL_MACRO(return_macro)                          // return or deliver exception
     END_FUNCTION VAR(c_name)
@@ -812,21 +811,19 @@
   pushl %fs:THREAD_SELF_OFFSET      // Pass Thread::Current
   CFI_ADJUST_CFA_OFFSET(4)
   call SYMBOL(artDeoptimizeIfNeeded)
-  addl LITERAL(16), %esp             // pop arguments
-  CFI_REMEMBER_STATE
+  DECREASE_FRAME(16)                // pop arguments
   RESTORE_SAVE_EVERYTHING_FRAME
   ret
-  CFI_RESTORE_STATE_AND_DEF_CFA esp, FRAME_SIZE_SAVE_EVERYTHING
 END_MACRO
 
 MACRO2(DEOPT_OR_RESTORE_SAVE_EVERYTHING_FRAME_AND_RETURN_EAX, temp, is_ref = 0)
   cmpl LITERAL(0), %fs:THREAD_DEOPT_CHECK_REQUIRED_OFFSET
-  jne 2f
   CFI_REMEMBER_STATE
+  jne 2f
   RESTORE_SAVE_EVERYTHING_FRAME_KEEP_EAX
   ret
-  CFI_RESTORE_STATE_AND_DEF_CFA esp, FRAME_SIZE_SAVE_EVERYTHING
 2:
+  CFI_RESTORE_STATE_AND_DEF_CFA esp, FRAME_SIZE_SAVE_EVERYTHING
   movl %eax, SAVE_EVERYTHING_FRAME_EAX_OFFSET(%esp) // update eax in the frame
   INCREASE_FRAME 4                                  // alignment padding
   pushl MACRO_LITERAL(\is_ref)                      // is_ref
@@ -835,7 +832,7 @@
   pushl %fs:THREAD_SELF_OFFSET                      // Pass Thread::Current
   CFI_ADJUST_CFA_OFFSET(4)
   call SYMBOL(artDeoptimizeIfNeeded)
-  addl LITERAL(16), %esp                            // pop arguments
+  DECREASE_FRAME(16)                                // pop arguments
   CFI_REMEMBER_STATE
   RESTORE_SAVE_EVERYTHING_FRAME
   ret
@@ -945,7 +942,7 @@
 .Lslow_path\c_name:
     SETUP_SAVE_REFS_ONLY_FRAME ebx              // save ref containing registers for GC
     // Outgoing argument set up
-    subl LITERAL(8), %esp                       // alignment padding
+    INCREASE_FRAME(8)                           // alignment padding
     pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
     CFI_ADJUST_CFA_OFFSET(4)
     PUSH eax
@@ -1009,8 +1006,10 @@
     // EAX: type
     // EBX, ECX, EDX: free.
     PUSH edi
+    CFI_REMEMBER_STATE
     ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lslow_path\c_name
 .Lslow_path\c_name:
+    CFI_RESTORE_STATE_AND_DEF_CFA esp, 8
     ALLOC_OBJECT_RESOLVED_TLAB_SLOW_PATH RAW_VAR(cxx_name)
     END_FUNCTION VAR(c_name)
 END_MACRO
@@ -1119,8 +1118,10 @@
     // EAX: mirror::Class* klass, ECX: int32_t component_count
     PUSH edi
     CALL_MACRO(size_setup) .Lslow_path\c_entrypoint
+    CFI_REMEMBER_STATE
     ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE .Lslow_path\c_entrypoint
 .Lslow_path\c_entrypoint:
+    CFI_RESTORE_STATE_AND_DEF_CFA esp, 8
     POP edi
     SETUP_SAVE_REFS_ONLY_FRAME ebx                      // save ref containing registers for GC
     // Outgoing argument set up
@@ -1359,6 +1360,7 @@
     UNPOISON_HEAP_REF eax
     call SYMBOL(art_quick_read_barrier_mark_reg00)  // Mark EAX.
     cmpl %eax, %ebx
+    CFI_REMEMBER_STATE
     jne .Laput_obj_check_assignability_gc_marking
     POP_ARG edx                   // Restore `art_quick_aput_obj()` arguments.
     POP_ARG ecx
@@ -1366,6 +1368,7 @@
     jmp .Laput_obj_store
 
 .Laput_obj_check_assignability_gc_marking:
+    CFI_RESTORE_STATE_AND_DEF_CFA esp, 16
     // Prepare arguments in line with `.Laput_obj_check_assignability_call` and jump there.
     // (EAX, ECX and EDX were already saved in the right stack slots.)
     INCREASE_FRAME 8              // Alignment padding.
@@ -1614,9 +1617,11 @@
     addl LITERAL(16), %esp        // pop arguments
     CFI_ADJUST_CFA_OFFSET(-16)
     test %eax, %eax               // if code pointer is null goto deliver the OOME.
+    CFI_REMEMBER_STATE
     jz 1f
     RESTORE_SAVE_REFS_AND_ARGS_FRAME_AND_JUMP
 1:
+    CFI_RESTORE_STATE_AND_DEF_CFA esp, 64
     RESTORE_SAVE_REFS_AND_ARGS_FRAME
     DELIVER_PENDING_EXCEPTION
 END_FUNCTION art_quick_resolution_trampoline
@@ -1905,6 +1910,7 @@
     add LITERAL(LOCK_WORD_STATE_FORWARDING_ADDRESS_OVERFLOW), %eax
     // Jump if overflow, the only case where it overflows should be the forwarding address one.
     // Taken ~25% of the time.
+    CFI_REMEMBER_STATE
     jnae .Lret_forwarding_address\name
 
     // Save all potentially live caller-save core registers.
@@ -1955,6 +1961,7 @@
 .Lret_rb_\name:
     ret
 .Lret_forwarding_address\name:
+    CFI_RESTORE_STATE_AND_DEF_CFA esp, 8
     // The overflow cleared the top bits.
     sall LITERAL(LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT), %eax
     mov %eax, REG_VAR(reg)
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 0d721d1..ae8f4bd 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -768,20 +768,18 @@
   movq %rax, %rsi                      // pass the result
   movq %gs:THREAD_SELF_OFFSET, %rdi    // pass Thread::Current
   call SYMBOL(artDeoptimizeIfNeeded)
-  CFI_REMEMBER_STATE
   RESTORE_SAVE_EVERYTHING_FRAME
   ret
-  CFI_RESTORE_STATE_AND_DEF_CFA rsp, FRAME_SIZE_SAVE_EVERYTHING
 END_MACRO
 
 MACRO1(DEOPT_OR_RESTORE_SAVE_EVERYTHING_FRAME_AND_RETURN_RAX, is_ref = 0)
   cmpl LITERAL(0), %gs:THREAD_DEOPT_CHECK_REQUIRED_OFFSET
-  jne 2f
   CFI_REMEMBER_STATE
+  jne 2f
   RESTORE_SAVE_EVERYTHING_FRAME_KEEP_RAX
   ret
-  CFI_RESTORE_STATE_AND_DEF_CFA rsp, FRAME_SIZE_SAVE_EVERYTHING
 2:
+  CFI_RESTORE_STATE_AND_DEF_CFA rsp, FRAME_SIZE_SAVE_EVERYTHING
   movq %rax, SAVE_EVERYTHING_FRAME_RAX_OFFSET(%rsp) // update result in the frame
   movq LITERAL(\is_ref), %rdx                       // pass if result is a reference
   movq %rax, %rsi                                   // pass the result
@@ -1263,7 +1261,7 @@
 
 #ifdef USE_READ_BARRIER
 .Laput_obj_gc_marking:
-    CFI_RESTORE_STATE_AND_DEF_CFA rsp, 4
+    CFI_RESTORE_STATE_AND_DEF_CFA rsp, 8
     // We need to align stack for `art_quick_read_barrier_mark_regNN`.
     INCREASE_FRAME 8                        // Stack alignment.
     call SYMBOL(art_quick_read_barrier_mark_reg01)  // Mark ECX
@@ -1772,6 +1770,7 @@
     // Jump if the addl caused eax to unsigned overflow. The only case where it overflows is the
     // forwarding address one.
     // Taken ~25% of the time.
+    CFI_REMEMBER_STATE
     jnae .Lret_forwarding_address\name
 
     // Save all potentially live caller-save core registers.
@@ -1839,6 +1838,7 @@
 .Lret_rb_\name:
     ret
 .Lret_forwarding_address\name:
+    CFI_RESTORE_STATE_AND_DEF_CFA rsp, 16
     // The overflow cleared the top bits.
     sall LITERAL(LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT), %eax
     movq %rax, REG_VAR(reg)
diff --git a/runtime/interpreter/mterp/arm64ng/main.S b/runtime/interpreter/mterp/arm64ng/main.S
index 424d060..2492aec 100644
--- a/runtime/interpreter/mterp/arm64ng/main.S
+++ b/runtime/interpreter/mterp/arm64ng/main.S
@@ -1593,6 +1593,7 @@
  */
 
 OAT_ENTRY ExecuteNterpWithClinitImpl, EndExecuteNterpWithClinitImpl
+    .cfi_startproc
     // For simplicity, we don't do a read barrier here, but instead rely
     // on art_quick_resolution_trampoline to always have a suspend point before
     // calling back here.
@@ -1613,6 +1614,7 @@
     b.eq ExecuteNterpImpl
 .Lresolution_trampoline:
     b art_quick_resolution_trampoline
+    .cfi_endproc
 EndExecuteNterpWithClinitImpl:
 
 OAT_ENTRY ExecuteNterpImpl, EndExecuteNterpImpl
diff --git a/runtime/interpreter/mterp/armng/main.S b/runtime/interpreter/mterp/armng/main.S
index 3647f3e..1e528d4 100644
--- a/runtime/interpreter/mterp/armng/main.S
+++ b/runtime/interpreter/mterp/armng/main.S
@@ -1612,6 +1612,7 @@
  */
 
 OAT_ENTRY ExecuteNterpWithClinitImpl, EndExecuteNterpWithClinitImpl
+    .cfi_startproc
     // For simplicity, we don't do a read barrier here, but instead rely
     // on art_quick_resolution_trampoline to always have a suspend point before
     // calling back here.
@@ -1631,6 +1632,7 @@
     cmp r4, ip
     beq ExecuteNterpImpl
     b art_quick_resolution_trampoline
+    .cfi_endproc
 EndExecuteNterpWithClinitImpl:
 
 OAT_ENTRY ExecuteNterpImpl, EndExecuteNterpImpl
diff --git a/runtime/interpreter/mterp/x86_64ng/main.S b/runtime/interpreter/mterp/x86_64ng/main.S
index 9ad7efa..e00980c 100644
--- a/runtime/interpreter/mterp/x86_64ng/main.S
+++ b/runtime/interpreter/mterp/x86_64ng/main.S
@@ -1697,6 +1697,7 @@
  */
 
 OAT_ENTRY ExecuteNterpWithClinitImpl, EndExecuteNterpWithClinitImpl
+    .cfi_startproc
     // For simplicity, we don't do a read barrier here, but instead rely
     // on art_quick_resolution_trampoline to always have a suspend point before
     // calling back here.
@@ -1709,6 +1710,7 @@
     cmpl %r10d, rSELF:THREAD_TID_OFFSET
     je ExecuteNterpImpl
     jmp art_quick_resolution_trampoline
+    .cfi_endproc
 EndExecuteNterpWithClinitImpl:
 
 OAT_ENTRY ExecuteNterpImpl, EndExecuteNterpImpl
diff --git a/runtime/interpreter/mterp/x86ng/main.S b/runtime/interpreter/mterp/x86ng/main.S
index 5b0edd4..4bc57ec 100644
--- a/runtime/interpreter/mterp/x86ng/main.S
+++ b/runtime/interpreter/mterp/x86ng/main.S
@@ -204,6 +204,15 @@
     addl MACRO_LITERAL(SYMBOL(artNterpAsmInstructionStart) - 0b), rIBASE
 .endm
 
+.macro RESTORE_IBASE_WITH_CFA
+    call 0f
+0:
+    CFI_ADJUST_CFA_OFFSET(4)
+    popl rIBASE
+    CFI_ADJUST_CFA_OFFSET(-4)
+    addl MACRO_LITERAL(SYMBOL(artNterpAsmInstructionStart) - 0b), rIBASE
+.endm
+
 .macro SPILL_ALL_CORE_PARAMETERS
     PUSH_ARG eax
     PUSH_ARG ecx
@@ -270,9 +279,8 @@
   PUSH_ARG ecx
   PUSH_ARG eax
   call \helper
-  addl MACRO_LITERAL(16), %esp
-  CFI_ADJUST_CFA_OFFSET(-16)
-  RESTORE_IBASE
+  DECREASE_FRAME 16
+  RESTORE_IBASE_WITH_CFA
   FETCH_INST_CLEAR_OPCODE
   RESTORE_SAVE_REFS_ONLY_FRAME
   cmpl LITERAL(0), %fs:THREAD_EXCEPTION_OFFSET
@@ -1760,7 +1768,8 @@
  */
 
 OAT_ENTRY ExecuteNterpWithClinitImpl, EndExecuteNterpWithClinitImpl
-    push %esi
+    .cfi_startproc
+    PUSH_ARG esi
     // For simplicity, we don't do a read barrier here, but instead rely
     // on art_quick_resolution_trampoline to always have a suspend point before
     // calling back here.
@@ -1771,13 +1780,16 @@
     jb .Linvoke_trampoline
     movl MIRROR_CLASS_CLINIT_THREAD_ID_OFFSET(%esi), %esi
     cmpl %esi, rSELF:THREAD_TID_OFFSET
+    CFI_REMEMBER_STATE
     je .Lcontinue_execute_nterp
 .Linvoke_trampoline:
-    pop %esi
+    POP_ARG esi
     jmp art_quick_resolution_trampoline
 .Lcontinue_execute_nterp:
-    pop %esi
+    CFI_RESTORE_STATE_AND_DEF_CFA esp, 8
+    POP_ARG esi
     jmp ExecuteNterpImpl
+    .cfi_endproc
 EndExecuteNterpWithClinitImpl:
 
 OAT_ENTRY ExecuteNterpImpl, EndExecuteNterpImpl
diff --git a/tools/check_cfi.py b/tools/check_cfi.py
index ac258c2..36c96d7 100755
--- a/tools/check_cfi.py
+++ b/tools/check_cfi.py
@@ -20,12 +20,82 @@
 Fully inferring CFI from disassembly is not possible in general.
 """
 
-import os, re, subprocess, collections, pathlib, bisect, collections
-from typing import List, Optional, Set, Tuple
+import os, re, subprocess, collections, pathlib, bisect, collections, sys
+from dataclasses import dataclass
+from functools import cache
+from pathlib import Path
+from typing import Any, List, Optional, Set, Tuple, Dict
 
-Source = collections.namedtuple("Source", ["addr", "file", "line", "flag"])
+arch: str = ""
+ARCHES = ["i386", "x86_64", "arm", "aarch64", "riscv64"]
 
-def get_source(lib: pathlib.Path) -> List[Source]:
+IGNORE : Dict[str, List[str]] = {
+    # Aligns stack.
+    "art_quick_osr_stub": ["i386"],
+    # Intermediate invalid CFI while loading all registers.
+    "art_quick_do_long_jump": ["x86_64"],
+    # Saves/restores SP in other register.
+    "art_quick_generic_jni_trampoline": ["arm", "i386", "x86_64"],
+    # Has a non-zero CFA offset at the start of the method.
+    "art_quick_throw_null_pointer_exception_from_signal": ["arm", "aarch64", "i386", "x86_64"],
+    # Pops stack without static control flow past the opcode.
+    "nterp_op_return": ["arm", "aarch64", "i386", "x86_64"],
+}
+
+SP = {"arm": "SP", "aarch64": "WSP", "i386": "ESP", "x86_64": "RSP", "riscv64": "X2"}
+INITIAL_OFFSET = {"i386": 4, "x86_64": 8}
+
+@cache
+def get_inst_semantics(arch: str) -> List[Any]:
+  """ List of regex expressions for supported instructions and their behaviour """
+
+  rexprs = []
+  def add(rexpr, adjust_offset=lambda m: 0, adjust_pc=None):
+    rexprs.append((re.compile(rexpr), adjust_offset, adjust_pc))
+  if arch in ["i386", "x86_64"]:
+    ptr_size = {"i386": 4, "x86_64": 8}[arch]
+    add(r"push. .*", lambda m: ptr_size)
+    add(r"pop. .*", lambda m: -ptr_size)
+    add(r"sub. \$(\w+), (?:%esp|%rsp)", lambda m: int(m[1], 0))
+    add(r"add. \$(\w+), (?:%esp|%rsp)", lambda m: -int(m[1], 0))
+    add(r"call. (0x\w+) <.*", lambda m: ptr_size, adjust_pc=lambda m: int(m[1], 0))
+    add(r"j[a-z]* (0x\w+) <.*", adjust_pc=lambda m: int(m[1], 0))
+  if arch in ["arm", "aarch64"]:
+    add(r"sub sp,(?: sp,)? #(\w+)", lambda m: int(m[1], 0))
+    add(r"add sp,(?: sp,)? #(\w+)", lambda m: -int(m[1], 0))
+    add(r"str \w+, \[sp, #-(\d+)\]!", lambda m: int(m[1]))
+    add(r"ldr \w+, \[sp\], #(\d+)", lambda m: -int(m[1]))
+    add(r"stp \w+, \w+, \[sp, #-(\w+)\]!", lambda m: int(m[1], 0))
+    add(r"ldp \w+, \w+, \[sp\], #(\w+)", lambda m: -int(m[1], 0))
+    add(r"vpush \{([d0-9, ]*)\}", lambda m: 8 * len(m[1].split(",")))
+    add(r"vpop \{([d0-9, ]*)\}", lambda m: -8 * len(m[1].split(",")))
+    add(r"v?push(?:\.w)? \{([\w+, ]*)\}", lambda m: 4 * len(m[1].split(",")))
+    add(r"v?pop(?:\.w)? \{([\w+, ]*)\}", lambda m: -4 * len(m[1].split(",")))
+    add(r"cb\w* \w+, (0x\w+).*", adjust_pc=lambda m: int(m[1], 0))
+    add(r"(?:b|bl|b\w\w) (0x\w+).*", adjust_pc=lambda m: int(m[1], 0))
+  return rexprs
+
+@dataclass(frozen=True)
+class Error(Exception):
+  address: int
+  message: str
+
+def get_arch(lib: pathlib.Path) -> str:
+  """ Get architecture of the given library based on the ELF header. """
+
+  proc = subprocess.run(["llvm-objdump", "--file-headers", lib],
+                        encoding='utf-8',
+                        capture_output=True,
+                        check=True)
+
+  m = re.search("^architecture: *(.*)$", proc.stdout, re.MULTILINE)
+  assert m, "Can not find ABI of ELF file " + str(lib)
+  assert m.group(1) in ARCHES, "Unknown arch: " + m.group(1)
+  return m.group(1)
+
+Source = collections.namedtuple("Source", ["pc", "file", "line", "flag"])
+
+def get_src(lib: pathlib.Path) -> List[Source]:
   """ Get source-file and line-number for all hand-written assembly code. """
 
   proc = subprocess.run(["llvm-dwarfdump", "--debug-line", lib],
@@ -35,7 +105,7 @@
 
   section_re = re.compile("^debug_line\[0x[0-9a-f]+\]$", re.MULTILINE)
   filename_re = re.compile('file_names\[ *(\d)+\]:\n\s*name: "(.*)"', re.MULTILINE)
-  line_re = re.compile('0x([0-9a-f]{16}) +(\d+) +\d+ +(\d+)'  # addr, line, column, file
+  line_re = re.compile('0x([0-9a-f]{16}) +(\d+) +\d+ +(\d+)'  # pc, line, column, file
                        ' +\d+ +\d +(.*)')                     # isa, discriminator, flag
 
   results = []
@@ -47,10 +117,10 @@
     results.extend([Source(int(a, 16), files[fn], l, fg) for a, l, fn, fg in lines])
   return sorted(filter(lambda line: "end_sequence" not in line.flag, results))
 
-Fde = collections.namedtuple("Fde", ["addr", "end", "data"])
+Fde = collections.namedtuple("Fde", ["pc", "end", "data"])
 
 def get_fde(lib: pathlib.Path) -> List[Fde]:
-  """ Get all unwinding FDE blocks (in dumped text-based format) """
+  """ Get all FDE blocks (in dumped text-based format) """
 
   proc = subprocess.run(["llvm-dwarfdump", "--debug-frame", lib],
                         encoding='utf-8',
@@ -65,16 +135,16 @@
     m = fda_re.match(section)
     if m:
       fde = Fde(int(m[1], 16), int(m[2], 16), section)
-      if fde.addr != 0:
+      if fde.pc != 0:
         results.append(fde)
   return sorted(results)
 
-Asm = collections.namedtuple("Asm", ["addr", "name", "data"])
+Asm = collections.namedtuple("Asm", ["pc", "name", "data"])
 
 def get_asm(lib: pathlib.Path) -> List[Asm]:
-  """ Get disassembly for all methods (in dumped text-based format) """
+  """ Get all ASM blocks (in dumped text-based format) """
 
-  proc = subprocess.run(["llvm-objdump", "--disassemble", lib],
+  proc = subprocess.run(["llvm-objdump", "--disassemble", "--no-show-raw-insn", lib],
                         encoding='utf-8',
                         capture_output=True,
                         check=True)
@@ -89,132 +159,105 @@
       results.append(Asm(int(sym[1], 16), sym[2], section))
   return sorted(results)
 
-Cfa = collections.namedtuple("Cfa", ["addr", "cfa"])
-
-def get_cfa(fde: Fde) -> List[Cfa]:
-  """ Extract individual CFA (SP+offset) entries from the FDE block """
-
-  cfa_re = re.compile("0x([0-9a-f]+): CFA=([^\s:]+)")
-  return [Cfa(int(addr, 16), cfa) for addr, cfa in cfa_re.findall(fde.data)]
-
-Inst = collections.namedtuple("Inst", ["addr", "inst", "symbol"])
+Inst = collections.namedtuple("Inst", ["pc", "inst"])
 
 def get_instructions(asm: Asm) -> List[Inst]:
   """ Extract individual instructions from disassembled code block """
 
   data = re.sub(r"[ \t]+", " ", asm.data)
   inst_re = re.compile(r"([0-9a-f]+): +(?:[0-9a-f]{2} +)*(.*)")
-  return [Inst(int(addr, 16), inst, asm.name) for addr, inst in inst_re.findall(data)]
+  return [Inst(int(pc, 16), inst) for pc, inst in inst_re.findall(data)]
 
-CfaOffset = collections.namedtuple("CfaOffset", ["addr", "offset"])
+# PC -> CFA offset (stack size at the given PC; None if it is not just a trivial SP+<integer>)
+CfaOffsets = Dict[int, Optional[int]]
 
-def get_dwarf_cfa_offsets(cfas: List[Cfa]) -> List[CfaOffset]:
-  """ Parse textual CFA entries into integer stack offsets """
+def get_dwarf_cfa_offsets(insts: List[Inst], fde: Fde) -> CfaOffsets:
+  """ Get CFA offsets for all instructions from DWARF """
 
-  result = []
-  for addr, cfa in cfas:
-    if cfa == "WSP" or cfa == "SP":
-      result.append(CfaOffset(addr, 0))
-    elif cfa.startswith("WSP+") or cfa.startswith("SP+"):
-      result.append(CfaOffset(addr, int(cfa.split("+")[1])))
-    else:
-      result.append(CfaOffset(addr, None))
+  # Parse the CFA offset definitions from the FDE.
+  sp = SP[arch]
+  m = re.compile(r"(0x[0-9a-f]+): CFA=(\w*)([^:\n]*)").findall(fde.data)
+  cfa = collections.deque([(int(a, 0), int(o or "0") if r == sp else None) for a, r, o in m])
+  if all(offset is None for add, offset in cfa):
+    # This would create a result that never checks anything.
+    raise Error(insts[0].pc, "No trivial CFA offsets. Add function to IGNORE list?")
+
+  # Create map from instruction PCs to corresponding CFA offsets.
+  offset: Optional[int] = INITIAL_OFFSET.get(arch, 0)
+  result: CfaOffsets = {}
+  for pc, inst in insts:
+    while cfa and cfa[0][0] <= pc:
+      offset = cfa.popleft()[1]
+    result[pc] = offset
   return result
 
-def get_infered_cfa_offsets(insts: List[Inst]) -> List[CfaOffset]:
-  """ Heuristic to convert disassembly into stack offsets """
+def get_inferred_cfa_offsets(insts: List[Inst]) -> CfaOffsets:
+  """ Get CFA offsets for all instructions from static analysis """
 
-  # Regular expressions which find instructions that adjust stack pointer.
-  rexprs = []
-  def add(rexpr, adjust_offset):
-    rexprs.append((re.compile(rexpr), adjust_offset))
-  add(r"sub sp,(?: sp,)? #(\d+)", lambda m: int(m[1]))
-  add(r"add sp,(?: sp,)? #(\d+)", lambda m: -int(m[1]))
-  add(r"str \w+, \[sp, #-(\d+)\]!", lambda m: int(m[1]))
-  add(r"ldr \w+, \[sp\], #(\d+)", lambda m: -int(m[1]))
-  add(r"stp \w+, \w+, \[sp, #-(\d+)\]!", lambda m: int(m[1]))
-  add(r"ldp \w+, \w+, \[sp\], #(\d+)", lambda m: -int(m[1]))
-  add(r"vpush \{([d0-9, ]*)\}", lambda m: 8 * len(m[1].split(",")))
-  add(r"vpop \{([d0-9, ]*)\}", lambda m: -8 * len(m[1].split(",")))
-  add(r"v?push(?:\.w)? \{([\w+, ]*)\}", lambda m: 4 * len(m[1].split(",")))
-  add(r"v?pop(?:\.w)? \{([\w+, ]*)\}", lambda m: -4 * len(m[1].split(",")))
+  rexprs = get_inst_semantics(arch)
+  offset: Optional[int] = INITIAL_OFFSET.get(arch, 0)
+  result: CfaOffsets = {}
+  for pc, inst in insts:
+    # Set current offset for PC, unless branch already set it.
+    offset = result.setdefault(pc, offset)
 
-  # Regular expression which identifies branches.
-  jmp_re = re.compile(r"cb\w* \w+, 0x(\w+)|(?:b|bl|b\w\w) 0x(\w+)")
-
-  offset, future_offset = 0, {}
-  result = [CfaOffset(insts[0].addr, offset)]
-  for addr, inst, symbol in insts:
-    # Previous code branched here, so us that offset instead.
-    # This likely identifies slow-path which is after return.
-    if addr in future_offset:
-      offset = future_offset[addr]
-
-    # Add entry to output (only if the offset changed).
-    if result[-1].offset != offset:
-      result.append(CfaOffset(addr, offset))
-
-    # Adjust offset if the instruction modifies stack pointer.
-    for rexpr, adjust_offset in rexprs:
-      m = rexpr.match(inst)
+    # Adjust PC and offset based on the current instruction.
+    for rexpr, adjust_offset, adjust_pc in rexprs:
+      m = rexpr.fullmatch(inst)
       if m:
-        offset += adjust_offset(m)
+        new_offset = offset + adjust_offset(m)
+        if adjust_pc:
+          new_pc = adjust_pc(m)
+          if insts[0].pc <= new_pc <= insts[-1].pc:
+            if new_pc in result and result[new_pc] != new_offset:
+              raise Error(pc, "Inconsistent branch (old={} new={})"
+                              .format(result[new_pc], new_offset))
+            result[new_pc] = new_offset
+        else:
+          offset = new_offset
         break  # First matched pattern wins.
-
-    # Record branches.  We only support forward edges for now.
-    m = jmp_re.match(inst)
-    if m:
-      future_offset[int(m[m.lastindex], 16)] = offset
   return result
 
-def check_fde(fde: Fde, insts: List[Inst], srcs, verbose: bool = False) -> Tuple[str, Set[int]]:
+def check_fde(fde: Fde, insts: List[Inst], srcs) -> None:
   """ Compare DWARF offsets to assembly-inferred offsets. Report differences. """
 
-  error, seen_addrs = None, set()
-  cfas = get_cfa(fde)
-  i, dwarf_cfa = 0, get_dwarf_cfa_offsets(cfas)
-  j, infered_cfa = 0, get_infered_cfa_offsets(insts)
-  for inst in insts:
-    seen_addrs.add(inst.addr)
-    while i+1 < len(dwarf_cfa) and dwarf_cfa[i+1].addr <= inst.addr:
-      i += 1
-    while j+1 < len(infered_cfa) and infered_cfa[j+1].addr <= inst.addr:
-      j += 1
-    if verbose:
-      print("{:08x}: dwarf={:4} infered={:4} {:40} // {}".format(
-                inst.addr, str(dwarf_cfa[i].offset), str(infered_cfa[j].offset),
-                inst.inst.strip(), srcs.get(inst.addr, "")))
-    if dwarf_cfa[i].offset is not None and dwarf_cfa[i].offset != infered_cfa[j].offset:
-      if inst.addr in srcs:  # Only report if it maps to source code (not padding or literals).
-        error = error or "{:08x} {}".format(inst.addr, srcs.get(inst.addr, ""))
-  return error, seen_addrs
+  dwarf_cfa_offsets = get_dwarf_cfa_offsets(insts, fde)
+  inferred_cfa_offsets = get_inferred_cfa_offsets(insts)
 
-def check_lib(lib: pathlib.Path):
+  for pc, inst in insts:
+    if dwarf_cfa_offsets[pc] is not None and dwarf_cfa_offsets[pc] != inferred_cfa_offsets[pc]:
+      if pc in srcs:  # Only report if it maps to source code (not padding or literals).
+        for inst2 in insts:
+          print("0x{:08x} [{}]: dwarf={} inferred={} {}".format(
+                    inst2.pc, srcs.get(inst2.pc, ""),
+                    str(dwarf_cfa_offsets[inst2.pc]), str(inferred_cfa_offsets[inst2.pc]),
+                    inst2.inst.strip()))
+        raise Error(pc, "DWARF offset does not match inferred offset")
+
+def check_lib(lib: pathlib.Path) -> int:
+  global arch
+  arch = get_arch(lib)
+
   assert lib.exists()
-  IGNORE = [
-      "art_quick_throw_null_pointer_exception_from_signal",  # Starts with non-zero offset.
-      "art_quick_generic_jni_trampoline",  # Saves/restores SP in other register.
-      "nterp_op_",  # Uses calculated CFA due to dynamic stack size.
-      "$d.",  # Data (literals) interleaved within code.
-  ]
   fdes = get_fde(lib)
   asms = collections.deque(get_asm(lib))
-  srcs = {src.addr: src.file + ":" + src.line for src in get_source(lib)}
+  srcs = {src.pc: src.file + ":" + src.line for src in get_src(lib)}
   seen = set()  # Used to verify the we have covered all assembly source lines.
+  fail = 0
 
   for fde in fdes:
-    if fde.addr not in srcs:
+    if fde.pc not in srcs:
       continue  # Ignore if it is not hand-written assembly.
 
     # Assembly instructions (one FDE can cover several assembly chunks).
-    all_insts, name = [], None
-    while asms and asms[0].addr < fde.end:
+    all_insts, name = [], ""
+    while asms and asms[0].pc < fde.end:
       asm = asms.popleft()
-      if asm.addr < fde.addr:
+      if asm.pc < fde.pc:
         continue
       insts = get_instructions(asm)
-      if any(asm.name.startswith(i) for i in IGNORE):
-        seen.update([inst.addr for inst in insts])
+      if any(asm.name.startswith(n) and arch in a for n, a in IGNORE.items()):
+        seen.update([inst.pc for inst in insts])
         continue
       all_insts.extend(insts)
       name = name or asm.name
@@ -222,14 +265,17 @@
       continue  # No assembly
 
     # Compare DWARF data to assembly instructions
-    error, seen_addrs = check_fde(fde, all_insts, srcs)
-    if error:
-      print("ERROR at " + name + " " + error)
-      check_fde(fde, all_insts, srcs, True)
-      print("")
-    seen.update(seen_addrs)
-  for addr in sorted(set(srcs.keys()) - seen):
-    print("Missing CFI for {:08x}: {}".format(addr, srcs[addr]))
+    try:
+      check_fde(fde, all_insts, srcs)
+    except Error as e:
+      print("0x{:08x} [{}]: ERROR in {}: {}\n"
+            .format(e.address, srcs.get(e.address, ""), name, e.message))
+      fail += 1
+    seen.update([inst.pc for inst in all_insts])
+  for pc in sorted(set(srcs.keys()) - seen):
+    print("ERROR: Missing CFI for {:08x}: {}".format(pc, srcs[pc]))
+    fail += 1
+  return fail
 
 
 def main(argv):
@@ -237,11 +283,14 @@
 
   libs = argv[1:]
   if not libs:
-    out = os.environ["OUT"]
-    libs.append(out + "/symbols/apex/com.android.art/lib/libart.so")
-    libs.append(out + "/symbols/apex/com.android.art/lib64/libart.so")
+    apex = Path(os.environ["OUT"]) / "symbols/apex/"
+    libs = list(apex.glob("**/libart.so"))
+    assert libs, "Can not find any libart.so in " + str(apex)
   for lib in libs:
-    check_lib(pathlib.Path(lib))
+    fail = check_lib(pathlib.Path(lib))
+    if fail > 0:
+      print(fail, "ERROR(s) in", str(lib))
+      sys.exit(1)
 
 if __name__ == "__main__":
-  main(os.sys.argv)
+  main(sys.argv)
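
The updated script can also be driven from Python rather than the command line; a hypothetical
sketch (the module import and library path are examples, and llvm-objdump/llvm-dwarfdump must
be on PATH):

    import pathlib
    import check_cfi  # tools/check_cfi.py, assuming tools/ is on sys.path

    lib = pathlib.Path("out/symbols/apex/com.android.art/lib64/libart.so")  # example path
    failures = check_cfi.check_lib(lib)  # prints mismatches, returns their count
    print(failures, "ERROR(s) in", lib)
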