Make suspend check test specific flags.

This frees 20 bits in `Thread.tls32_.state_and_flags`
for new uses.

Code size changes per suspend check:
  - x86/x86-64: +3B (CMP r/m32, imm8 -> TEST r/m32, imm32)
  - arm: none (CMP -> TST, both 32-bit with high register)
  - arm64: +4B (CBNZ/CBZ -> TST+B.NE/B.EQ)

Note: Using implicit suspend checks on arm64 would sidestep
this code size increase entirely.
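
For reference, a minimal C++ sketch of the logical change (the flag
bit values below are placeholders; the real ones are defined by
art::ThreadFlag):

  #include <cstdint>

  // Placeholder bit values; art::ThreadFlag defines the real ones.
  constexpr uint32_t kSuspendRequest = 1u << 0;
  constexpr uint32_t kCheckpointRequest = 1u << 1;
  constexpr uint32_t kEmptyCheckpointRequest = 1u << 2;

  // Before: any non-zero bit took the slow path. This relied on
  // ThreadState::kRunnable == 0 and reserved every bit of the word.
  inline bool NeedsSuspendCheckOld(uint32_t state_and_flags) {
    return state_and_flags != 0u;
  }

  // After: only the three request flags are tested (TST/testl), so
  // the remaining bits of tls32_.state_and_flags are freed.
  inline bool NeedsSuspendCheckNew(uint32_t state_and_flags) {
    constexpr uint32_t kMask =
        kSuspendRequest | kCheckpointRequest | kEmptyCheckpointRequest;
    return (state_and_flags & kMask) != 0u;
  }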

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Bug: 172332525
Change-Id: If5b0be0183efba3f397596b22e03a8b7afb87f85
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 933e270..775bfcf 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -1994,12 +1994,12 @@
   Register temp = temps.AcquireW();
 
   __ Ldr(temp, MemOperand(tr, Thread::ThreadFlagsOffset<kArm64PointerSize>().SizeValue()));
-  static_assert(static_cast<std::underlying_type_t<ThreadState>>(ThreadState::kRunnable) == 0u);
+  __ Tst(temp, Thread::SuspendOrCheckpointRequestFlags());
   if (successor == nullptr) {
-    __ Cbnz(temp, slow_path->GetEntryLabel());
+    __ B(ne, slow_path->GetEntryLabel());
     __ Bind(slow_path->GetReturnLabel());
   } else {
-    __ Cbz(temp, codegen_->GetLabelOf(successor));
+    __ B(eq, codegen_->GetLabelOf(successor));
     __ B(slow_path->GetEntryLabel());
     // slow_path will return to GetLabelOf(successor).
   }
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index c514c22..841d59b 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -7168,12 +7168,12 @@
   vixl32::Register temp = temps.Acquire();
   GetAssembler()->LoadFromOffset(
       kLoadWord, temp, tr, Thread::ThreadFlagsOffset<kArmPointerSize>().Int32Value());
-  static_assert(static_cast<std::underlying_type_t<ThreadState>>(ThreadState::kRunnable) == 0u);
+  __ Tst(temp, Thread::SuspendOrCheckpointRequestFlags());
   if (successor == nullptr) {
-    __ CompareAndBranchIfNonZero(temp, slow_path->GetEntryLabel());
+    __ B(ne, slow_path->GetEntryLabel());
     __ Bind(slow_path->GetReturnLabel());
   } else {
-    __ CompareAndBranchIfZero(temp, codegen_->GetLabelOf(successor));
+    __ B(eq, codegen_->GetLabelOf(successor));
     __ B(slow_path->GetEntryLabel());
   }
 }
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index f19eaae..5434407 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -6683,14 +6683,13 @@
     DCHECK_EQ(slow_path->GetSuccessor(), successor);
   }
 
-  static_assert(static_cast<std::underlying_type_t<ThreadState>>(ThreadState::kRunnable) == 0u);
-  __ fs()->cmpl(Address::Absolute(Thread::ThreadFlagsOffset<kX86PointerSize>().Int32Value()),
-                Immediate(0));
+  __ fs()->testl(Address::Absolute(Thread::ThreadFlagsOffset<kX86PointerSize>().Int32Value()),
+                 Immediate(Thread::SuspendOrCheckpointRequestFlags()));
   if (successor == nullptr) {
-    __ j(kNotEqual, slow_path->GetEntryLabel());
+    __ j(kNotZero, slow_path->GetEntryLabel());
     __ Bind(slow_path->GetReturnLabel());
   } else {
-    __ j(kEqual, codegen_->GetLabelOf(successor));
+    __ j(kZero, codegen_->GetLabelOf(successor));
     __ jmp(slow_path->GetEntryLabel());
   }
 }
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index b0bdffe..fa61c67 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -6018,15 +6018,14 @@
     DCHECK_EQ(slow_path->GetSuccessor(), successor);
   }
 
-  static_assert(static_cast<std::underlying_type_t<ThreadState>>(ThreadState::kRunnable) == 0u);
-  __ gs()->cmpl(Address::Absolute(Thread::ThreadFlagsOffset<kX86_64PointerSize>().Int32Value(),
-                                  /* no_rip= */ true),
-                Immediate(0));
+  __ gs()->testl(Address::Absolute(Thread::ThreadFlagsOffset<kX86_64PointerSize>().Int32Value(),
+                                   /* no_rip= */ true),
+                 Immediate(Thread::SuspendOrCheckpointRequestFlags()));
   if (successor == nullptr) {
-    __ j(kNotEqual, slow_path->GetEntryLabel());
+    __ j(kNotZero, slow_path->GetEntryLabel());
     __ Bind(slow_path->GetReturnLabel());
   } else {
-    __ j(kEqual, codegen_->GetLabelOf(successor));
+    __ j(kZero, codegen_->GetLabelOf(successor));
     __ jmp(slow_path->GetEntryLabel());
   }
 }
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
index 2b3c2dd..3d45abd 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
@@ -1058,8 +1058,7 @@
                       tr,
                       Thread::ThreadFlagsOffset<kArmPointerSize>().Int32Value());
 
-  static_assert(static_cast<std::underlying_type_t<ThreadState>>(ThreadState::kRunnable) == 0u);
-  ___ Cmp(scratch, 0);
+  ___ Tst(scratch, Thread::SuspendOrCheckpointRequestFlags());
   ___ BPreferNear(ne, ArmVIXLJNIMacroLabel::Cast(label)->AsArm());
   // TODO: think about using CBNZ here.
 }
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
index e2d29fd..a505db0 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
@@ -893,8 +893,8 @@
   UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
   Register scratch = temps.AcquireW();
   ___ Ldr(scratch, MEM_OP(reg_x(TR), Thread::ThreadFlagsOffset<kArm64PointerSize>().Int32Value()));
-  static_assert(static_cast<std::underlying_type_t<ThreadState>>(ThreadState::kRunnable) == 0u);
-  ___ Cbnz(scratch, Arm64JNIMacroLabel::Cast(label)->AsArm64());
+  ___ Tst(scratch, Thread::SuspendOrCheckpointRequestFlags());
+  ___ B(ne, Arm64JNIMacroLabel::Cast(label)->AsArm64());
 }
 
 void Arm64JNIMacroAssembler::ExceptionPoll(JNIMacroLabel* label) {
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.cc b/compiler/utils/x86/jni_macro_assembler_x86.cc
index 904cca4..4ba3aa1 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.cc
+++ b/compiler/utils/x86/jni_macro_assembler_x86.cc
@@ -590,9 +590,9 @@
 }
 
 void X86JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
-  static_assert(static_cast<std::underlying_type_t<ThreadState>>(ThreadState::kRunnable) == 0u);
-  __ fs()->cmpl(Address::Absolute(Thread::ThreadFlagsOffset<kX86PointerSize>()), Immediate(0));
-  __ j(kNotEqual, X86JNIMacroLabel::Cast(label)->AsX86());
+  __ fs()->testl(Address::Absolute(Thread::ThreadFlagsOffset<kX86PointerSize>()),
+                 Immediate(Thread::SuspendOrCheckpointRequestFlags()));
+  __ j(kNotZero, X86JNIMacroLabel::Cast(label)->AsX86());
 }
 
 void X86JNIMacroAssembler::ExceptionPoll(JNIMacroLabel* label) {
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
index 2fb2797..de99e74 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
@@ -672,10 +672,9 @@
 }
 
 void X86_64JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
-  static_assert(static_cast<std::underlying_type_t<ThreadState>>(ThreadState::kRunnable) == 0u);
-  __ gs()->cmpl(Address::Absolute(Thread::ThreadFlagsOffset<kX86_64PointerSize>(), true),
-                Immediate(0));
-  __ j(kNotEqual, X86_64JNIMacroLabel::Cast(label)->AsX86_64());
+  __ gs()->testl(Address::Absolute(Thread::ThreadFlagsOffset<kX86_64PointerSize>(), true),
+                 Immediate(Thread::SuspendOrCheckpointRequestFlags()));
+  __ j(kNotZero, X86_64JNIMacroLabel::Cast(label)->AsX86_64());
 }
 
 void X86_64JNIMacroAssembler::ExceptionPoll(JNIMacroLabel* label) {
diff --git a/runtime/entrypoints/quick/quick_jni_entrypoints.cc b/runtime/entrypoints/quick/quick_jni_entrypoints.cc
index b3d7f38..3cf7dd7 100644
--- a/runtime/entrypoints/quick/quick_jni_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_jni_entrypoints.cc
@@ -186,11 +186,7 @@
     // When we are in @FastNative, we are already Runnable.
     DCHECK(Locks::mutator_lock_->IsSharedHeld(self));
     // Only do a suspend check on the way out of JNI just like compiled stubs.
-    if (UNLIKELY(self->TestAllFlags())) {
-      // In fast JNI mode we never transitioned out of runnable. Perform a suspend check if there
-      // is a flag raised.
-      self->CheckSuspend();
-    }
+    self->CheckSuspend();
   }
   // We need the mutator lock (i.e., calling GoToRunnable()) before accessing the shorty or the
   // locked object.
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index 3ac1292..7acee5e 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -40,10 +40,7 @@
 }
 
 inline void Thread::AllowThreadSuspension() {
-  DCHECK_EQ(Thread::Current(), this);
-  if (UNLIKELY(TestAllFlags())) {
-    CheckSuspend();
-  }
+  CheckSuspend();
   // Invalidate the current thread's object pointers (ObjPtr) to catch possible moving GC bugs due
   // to missing handles.
   PoisonObjectPointers();
@@ -51,16 +48,17 @@
 
 inline void Thread::CheckSuspend() {
   DCHECK_EQ(Thread::Current(), this);
-  for (;;) {
+  while (true) {
     StateAndFlags state_and_flags(tls32_.state_and_flags.load(std::memory_order_relaxed));
-    if (state_and_flags.IsFlagSet(ThreadFlag::kCheckpointRequest)) {
+    if (LIKELY(!state_and_flags.IsAnyOfFlagsSet(SuspendOrCheckpointRequestFlags()))) {
+      break;
+    } else if (state_and_flags.IsFlagSet(ThreadFlag::kCheckpointRequest)) {
       RunCheckpointFunction();
     } else if (state_and_flags.IsFlagSet(ThreadFlag::kSuspendRequest)) {
       FullSuspendCheck();
-    } else if (state_and_flags.IsFlagSet(ThreadFlag::kEmptyCheckpointRequest)) {
-      RunEmptyCheckpoint();
     } else {
-      break;
+      DCHECK(state_and_flags.IsFlagSet(ThreadFlag::kEmptyCheckpointRequest));
+      RunEmptyCheckpoint();
     }
   }
 }
@@ -256,11 +254,12 @@
     GetMutatorLock()->AssertNotHeld(this);  // Otherwise we starve GC.
     // Optimize for the return from native code case - this is the fast path.
     // Atomically change from suspended to runnable if no suspend request pending.
-    StateAndFlags new_state_and_flags = old_state_and_flags;
-    new_state_and_flags.SetState(ThreadState::kRunnable);
-    static_assert(static_cast<std::underlying_type_t<ThreadState>>(ThreadState::kRunnable) == 0u);
-    if (LIKELY(new_state_and_flags.GetValue() == 0u)) {  // No flags set?
+    constexpr uint32_t kCheckedFlags =
+        SuspendOrCheckpointRequestFlags() | enum_cast<uint32_t>(ThreadFlag::kActiveSuspendBarrier);
+    if (LIKELY(!old_state_and_flags.IsAnyOfFlagsSet(kCheckedFlags))) {
       // CAS the value with a memory barrier.
+      StateAndFlags new_state_and_flags = old_state_and_flags;
+      new_state_and_flags.SetState(ThreadState::kRunnable);
       if (LIKELY(tls32_.state_and_flags.CompareAndSetWeakAcquire(old_state_and_flags.GetValue(),
                                                                  new_state_and_flags.GetValue()))) {
         // Mark the acquisition of a share of the mutator lock.
@@ -272,10 +271,14 @@
     } else if (UNLIKELY(old_state_and_flags.IsFlagSet(ThreadFlag::kCheckpointRequest) ||
                         old_state_and_flags.IsFlagSet(ThreadFlag::kEmptyCheckpointRequest))) {
       // Impossible
+      StateAndFlags flags = old_state_and_flags;
+      static_assert(static_cast<std::underlying_type_t<ThreadState>>(ThreadState::kRunnable) == 0u);
+      flags.SetState(ThreadState::kRunnable);  // Note: Keeping unused bits.
       LOG(FATAL) << "Transitioning to runnable with checkpoint flag, "
-                 << " flags=" << new_state_and_flags.GetValue()  // State set to kRunnable = 0.
+                 << " flags=" << flags.GetValue()  // State set to kRunnable = 0.
                  << " state=" << old_state_and_flags.GetState();
-    } else if (old_state_and_flags.IsFlagSet(ThreadFlag::kSuspendRequest)) {
+    } else {
+      DCHECK(old_state_and_flags.IsFlagSet(ThreadFlag::kSuspendRequest));
       // Wait while our suspend count is non-zero.
 
       // We pass null to the MutexLock as we may be in a situation where the
diff --git a/runtime/thread.h b/runtime/thread.h
index f1dd7b8..2673ef5 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -1110,13 +1110,6 @@
     return state_and_flags.IsFlagSet(flag);
   }
 
-  bool TestAllFlags() const {
-    StateAndFlags state_and_flags(tls32_.state_and_flags.load(std::memory_order_relaxed));
-    static_assert(static_cast<std::underlying_type_t<ThreadState>>(ThreadState::kRunnable) == 0u);
-    state_and_flags.SetState(ThreadState::kRunnable);  // Clear state bits.
-    return state_and_flags.GetValue() != 0u;
-  }
-
   void AtomicSetFlag(ThreadFlag flag) {
     tls32_.state_and_flags.fetch_or(enum_cast<uint32_t>(flag), std::memory_order_seq_cst);
   }
@@ -1316,6 +1309,17 @@
     return WhichPowerOf2(InterpreterCache::kSize);
   }
 
+  static constexpr uint32_t AllThreadFlags() {
+    return enum_cast<uint32_t>(ThreadFlag::kLastFlag) |
+           (enum_cast<uint32_t>(ThreadFlag::kLastFlag) - 1u);
+  }
+
+  static constexpr uint32_t SuspendOrCheckpointRequestFlags() {
+    return enum_cast<uint32_t>(ThreadFlag::kSuspendRequest) |
+           enum_cast<uint32_t>(ThreadFlag::kCheckpointRequest) |
+           enum_cast<uint32_t>(ThreadFlag::kEmptyCheckpointRequest);
+  }
+
  private:
   explicit Thread(bool daemon);
   ~Thread() REQUIRES(!Locks::mutator_lock_, !Locks::thread_suspend_count_lock_);
@@ -1482,6 +1486,11 @@
       value_ = value;
     }
 
+    bool IsAnyOfFlagsSet(uint32_t flags) const {
+      DCHECK_EQ(flags & ~AllThreadFlags(), 0u);
+      return (value_ & flags) != 0u;
+    }
+
     bool IsFlagSet(ThreadFlag flag) const {
       return (value_ & enum_cast<uint32_t>(flag)) != 0u;
     }
diff --git a/test/706-checker-scheduler/src/Main.java b/test/706-checker-scheduler/src/Main.java
index d4d3923..41fee9a 100644
--- a/test/706-checker-scheduler/src/Main.java
+++ b/test/706-checker-scheduler/src/Main.java
@@ -606,7 +606,7 @@
   /// CHECK:     add
   /// CHECK:     adds
   /// CHECK:     ldr
-  /// CHECK:     cmp
+  /// CHECK:     tst
   /// CHECK:     beq
 
   /// CHECK-START-ARM64: void Main.testCrossItersDependencies() disassembly (after)
@@ -614,7 +614,8 @@
   /// CHECK:     add
   /// CHECK:     add
   /// CHECK:     ldr
-  /// CHECK:     cbz
+  /// CHECK:     tst
+  /// CHECK:     b.eq
   private static void testCrossItersDependencies() {
     int[] data = {1, 2, 3, 0};
     int sub = 0;
diff --git a/tools/cpp-define-generator/thread.def b/tools/cpp-define-generator/thread.def
index fff5755..6dc6c0e 100644
--- a/tools/cpp-define-generator/thread.def
+++ b/tools/cpp-define-generator/thread.def
@@ -21,10 +21,6 @@
 
 ASM_DEFINE(THREAD_CARD_TABLE_OFFSET,
            art::Thread::CardTableOffset<art::kRuntimePointerSize>().Int32Value())
-ASM_DEFINE(THREAD_CHECKPOINT_REQUEST,
-           static_cast<uint32_t>(art::ThreadFlag::kCheckpointRequest))
-ASM_DEFINE(THREAD_EMPTY_CHECKPOINT_REQUEST,
-           static_cast<uint32_t>(art::ThreadFlag::kEmptyCheckpointRequest))
 ASM_DEFINE(THREAD_EXCEPTION_OFFSET,
            art::Thread::ExceptionOffset<art::kRuntimePointerSize>().Int32Value())
 ASM_DEFINE(THREAD_FLAGS_OFFSET,
@@ -56,9 +52,7 @@
 ASM_DEFINE(THREAD_SELF_OFFSET,
            art::Thread::SelfOffset<art::kRuntimePointerSize>().Int32Value())
 ASM_DEFINE(THREAD_SUSPEND_OR_CHECKPOINT_REQUEST,
-           static_cast<uint32_t>(art::ThreadFlag::kSuspendRequest) |
-               static_cast<uint32_t>(art::ThreadFlag::kCheckpointRequest) |
-               static_cast<uint32_t>(art::ThreadFlag::kEmptyCheckpointRequest))
+           art::Thread::SuspendOrCheckpointRequestFlags())
 ASM_DEFINE(THREAD_SUSPEND_REQUEST,
            static_cast<uint32_t>(art::ThreadFlag::kSuspendRequest))
 ASM_DEFINE(THREAD_TOP_QUICK_FRAME_OFFSET,
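
As a hedged follow-up (placement hypothetical, names taken from the
diff), the relationship between the two new helpers can be checked at
compile time:

  // Every suspend/checkpoint request bit must be a valid thread flag;
  // this mirrors the DCHECK in StateAndFlags::IsAnyOfFlagsSet().
  static_assert((art::Thread::SuspendOrCheckpointRequestFlags() &
                 ~art::Thread::AllThreadFlags()) == 0u,
                "request flags must fit in AllThreadFlags()");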