Allow mixing of thread offsets between 32-bit and 64-bit architectures.

Begin a fuller implementation of x86-64 REX prefixes.
Does not implement 64-bit thread offset support for the JNI compiler.
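
The core of the change is a ThreadOffset type templated on the target pointer
size: the 32-bit fields are grouped in tls32_ (and tls64_) ahead of the
pointer-sized fields in tlsPtr_, and offsets into tlsPtr_ are scaled by the
requested pointer size rather than the host's. A minimal sketch of the idea
(not the verbatim thread.h change; the assumption that card_table is the first
pointer-sized entry is inferred from the InitCpu checks below):

    // Sketch only: offsets parameterized on the target pointer size.
    template<size_t pointer_size>
    class ThreadOffset : public Offset {
     public:
      explicit ThreadOffset(size_t val) : Offset(val) {}
    };

    // Conceptually a static member of Thread (shown free-standing here).
    // tls32_ and tls64_ precede tlsPtr_, so the tlsPtr_ base is the same for
    // 4- and 8-byte targets; only the index within tlsPtr_ scales with the
    // requested pointer size (card_table assumed to be entry 0).
    template<size_t pointer_size>
    ThreadOffset<pointer_size> CardTableOffset() {
      return ThreadOffset<pointer_size>(
          OFFSETOF_MEMBER(Thread, tlsPtr_) + 0 * pointer_size);
    }

    // A 32-bit compiler can now compute 64-bit offsets and vice versa:
    //   CardTableOffset<4>() == 112    CardTableOffset<8>() == 112
    //   ExceptionOffset<4>() == 116    ExceptionOffset<8>() == 120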

Change-Id: If9af2f08a1833c21ddb4b4077f9b03add1a05147
diff --git a/runtime/Android.mk b/runtime/Android.mk
index 1576905..e8224cd 100644
--- a/runtime/Android.mk
+++ b/runtime/Android.mk
@@ -294,6 +294,7 @@
 
 
 LIBART_ENUM_OPERATOR_OUT_HEADER_FILES := \
+	arch/x86_64/registers_x86_64.h \
 	base/mutex.h \
 	dex_file.h \
 	dex_instruction.h \
diff --git a/runtime/arch/arm/asm_support_arm.h b/runtime/arch/arm/asm_support_arm.h
index cfffbea..4b64076 100644
--- a/runtime/arch/arm/asm_support_arm.h
+++ b/runtime/arch/arm/asm_support_arm.h
@@ -23,13 +23,13 @@
 #define rSUSPEND r4
 // Register holding Thread::Current().
 #define rSELF r9
-// Offset of field Thread::suspend_count_ verified in InitCpu
+// Offset of field Thread::tls32_.state_and_flags verified in InitCpu
 #define THREAD_FLAGS_OFFSET 0
-// Offset of field Thread::card_table_ verified in InitCpu
-#define THREAD_CARD_TABLE_OFFSET 8
-// Offset of field Thread::exception_ verified in InitCpu
-#define THREAD_EXCEPTION_OFFSET 12
-// Offset of field Thread::thin_lock_thread_id_ verified in InitCpu
-#define THREAD_ID_OFFSET 60
+// Offset of field Thread::tls32_.thin_lock_thread_id verified in InitCpu
+#define THREAD_ID_OFFSET 12
+// Offset of field Thread::tlsPtr_.card_table verified in InitCpu
+#define THREAD_CARD_TABLE_OFFSET 112
+// Offset of field Thread::tlsPtr_.exception verified in InitCpu
+#define THREAD_EXCEPTION_OFFSET 116
 
 #endif  // ART_RUNTIME_ARCH_ARM_ASM_SUPPORT_ARM_H_
diff --git a/runtime/arch/arm/fault_handler_arm.cc b/runtime/arch/arm/fault_handler_arm.cc
index abce838..65a4952 100644
--- a/runtime/arch/arm/fault_handler_arm.cc
+++ b/runtime/arch/arm/fault_handler_arm.cc
@@ -109,7 +109,7 @@
 bool SuspensionHandler::Action(int sig, siginfo_t* info, void* context) {
   // These are the instructions to check for.  The first one is the ldr r0,[r9,#xxx]
   // where xxx is the offset of the suspend trigger.
-  uint32_t checkinst1 = 0xf8d90000 + Thread::ThreadSuspendTriggerOffset().Int32Value();
+  uint32_t checkinst1 = 0xf8d90000 + Thread::ThreadSuspendTriggerOffset<4>().Int32Value();
   uint16_t checkinst2 = 0x6800;
 
   struct ucontext *uc = (struct ucontext *)context;
diff --git a/runtime/arch/arm/thread_arm.cc b/runtime/arch/arm/thread_arm.cc
index df4a04a..2a551a8 100644
--- a/runtime/arch/arm/thread_arm.cc
+++ b/runtime/arch/arm/thread_arm.cc
@@ -22,10 +22,10 @@
 namespace art {
 
 void Thread::InitCpu() {
-  CHECK_EQ(THREAD_FLAGS_OFFSET, OFFSETOF_MEMBER(Thread, state_and_flags_));
-  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, OFFSETOF_MEMBER(Thread, card_table_));
-  CHECK_EQ(THREAD_EXCEPTION_OFFSET, OFFSETOF_MEMBER(Thread, exception_));
-  CHECK_EQ(THREAD_ID_OFFSET, OFFSETOF_MEMBER(Thread, thin_lock_thread_id_));
+  CHECK_EQ(THREAD_FLAGS_OFFSET, ThreadFlagsOffset<4>().Int32Value());
+  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, CardTableOffset<4>().Int32Value());
+  CHECK_EQ(THREAD_EXCEPTION_OFFSET, ExceptionOffset<4>().Int32Value());
+  CHECK_EQ(THREAD_ID_OFFSET, ThinLockIdOffset<4>().Int32Value());
 }
 
 void Thread::CleanupCpu() {
diff --git a/runtime/arch/arm64/thread_arm64.cc b/runtime/arch/arm64/thread_arm64.cc
index 4eebb85..564dced 100644
--- a/runtime/arch/arm64/thread_arm64.cc
+++ b/runtime/arch/arm64/thread_arm64.cc
@@ -22,10 +22,10 @@
 namespace art {
 
 void Thread::InitCpu() {
-  CHECK_EQ(THREAD_FLAGS_OFFSET, OFFSETOF_MEMBER(Thread, state_and_flags_));
-  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, OFFSETOF_MEMBER(Thread, card_table_));
-  CHECK_EQ(THREAD_EXCEPTION_OFFSET, OFFSETOF_MEMBER(Thread, exception_));
-  CHECK_EQ(THREAD_ID_OFFSET, OFFSETOF_MEMBER(Thread, thin_lock_thread_id_));
+  CHECK_EQ(THREAD_FLAGS_OFFSET, ThreadFlagsOffset<8>().Int32Value());
+  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, CardTableOffset<8>().Int32Value());
+  CHECK_EQ(THREAD_EXCEPTION_OFFSET, ExceptionOffset<8>().Int32Value());
+  CHECK_EQ(THREAD_ID_OFFSET, ThinLockIdOffset<8>().Int32Value());
 }
 
 void Thread::CleanupCpu() {
diff --git a/runtime/arch/mips/asm_support_mips.h b/runtime/arch/mips/asm_support_mips.h
index 5307997..36ce1b6 100644
--- a/runtime/arch/mips/asm_support_mips.h
+++ b/runtime/arch/mips/asm_support_mips.h
@@ -23,11 +23,11 @@
 #define rSUSPEND $s0
 // Register holding Thread::Current().
 #define rSELF $s1
-// Offset of field Thread::suspend_count_ verified in InitCpu
+// Offset of field Thread::tls32_.state_and_flags verified in InitCpu
 #define THREAD_FLAGS_OFFSET 0
-// Offset of field Thread::card_table_ verified in InitCpu
-#define THREAD_CARD_TABLE_OFFSET 8
-// Offset of field Thread::exception_ verified in InitCpu
-#define THREAD_EXCEPTION_OFFSET 12
+// Offset of field Thread::tlsPtr_.card_table verified in InitCpu
+#define THREAD_CARD_TABLE_OFFSET 112
+// Offset of field Thread::tlsPtr_.exception verified in InitCpu
+#define THREAD_EXCEPTION_OFFSET 116
 
 #endif  // ART_RUNTIME_ARCH_MIPS_ASM_SUPPORT_MIPS_H_
diff --git a/runtime/arch/mips/thread_mips.cc b/runtime/arch/mips/thread_mips.cc
index f5d211f..a451496 100644
--- a/runtime/arch/mips/thread_mips.cc
+++ b/runtime/arch/mips/thread_mips.cc
@@ -22,9 +22,9 @@
 namespace art {
 
 void Thread::InitCpu() {
-  CHECK_EQ(THREAD_FLAGS_OFFSET, OFFSETOF_MEMBER(Thread, state_and_flags_));
-  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, OFFSETOF_MEMBER(Thread, card_table_));
-  CHECK_EQ(THREAD_EXCEPTION_OFFSET, OFFSETOF_MEMBER(Thread, exception_));
+  CHECK_EQ(THREAD_FLAGS_OFFSET, ThreadFlagsOffset<4>().Int32Value());
+  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, CardTableOffset<4>().Int32Value());
+  CHECK_EQ(THREAD_EXCEPTION_OFFSET, ExceptionOffset<4>().Int32Value());
 }
 
 void Thread::CleanupCpu() {
diff --git a/runtime/arch/x86/asm_support_x86.h b/runtime/arch/x86/asm_support_x86.h
index e817ff7..e986c41 100644
--- a/runtime/arch/x86/asm_support_x86.h
+++ b/runtime/arch/x86/asm_support_x86.h
@@ -20,12 +20,12 @@
 #include "asm_support.h"
 
 // Offset of field Thread::self_ verified in InitCpu
-#define THREAD_SELF_OFFSET 40
+#define THREAD_SELF_OFFSET 148
 // Offset of field Thread::card_table_ verified in InitCpu
-#define THREAD_CARD_TABLE_OFFSET 8
+#define THREAD_CARD_TABLE_OFFSET 112
 // Offset of field Thread::exception_ verified in InitCpu
-#define THREAD_EXCEPTION_OFFSET 12
+#define THREAD_EXCEPTION_OFFSET 116
 // Offset of field Thread::thin_lock_thread_id_ verified in InitCpu
-#define THREAD_ID_OFFSET 60
+#define THREAD_ID_OFFSET 12
 
 #endif  // ART_RUNTIME_ARCH_X86_ASM_SUPPORT_X86_H_
diff --git a/runtime/arch/x86/thread_x86.cc b/runtime/arch/x86/thread_x86.cc
index 235da99..26cd864 100644
--- a/runtime/arch/x86/thread_x86.cc
+++ b/runtime/arch/x86/thread_x86.cc
@@ -120,11 +120,11 @@
       :);  // clobber
 
   // Allow easy indirection back to Thread*.
-  self_ = this;
+  tlsPtr_.self = this;
 
   // Sanity check that reads from %fs point to this Thread*.
   Thread* self_check;
-  CHECK_EQ(THREAD_SELF_OFFSET, OFFSETOF_MEMBER(Thread, self_));
+  CHECK_EQ(THREAD_SELF_OFFSET, SelfOffset<4>().Int32Value());
   __asm__ __volatile__("movl %%fs:(%1), %0"
       : "=r"(self_check)  // output
       : "r"(THREAD_SELF_OFFSET)  // input
@@ -132,9 +132,9 @@
   CHECK_EQ(self_check, this);
 
   // Sanity check other offsets.
-  CHECK_EQ(THREAD_EXCEPTION_OFFSET, OFFSETOF_MEMBER(Thread, exception_));
-  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, OFFSETOF_MEMBER(Thread, card_table_));
-  CHECK_EQ(THREAD_ID_OFFSET, OFFSETOF_MEMBER(Thread, thin_lock_thread_id_));
+  CHECK_EQ(THREAD_EXCEPTION_OFFSET, ExceptionOffset<4>().Int32Value());
+  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, CardTableOffset<4>().Int32Value());
+  CHECK_EQ(THREAD_ID_OFFSET, ThinLockIdOffset<4>().Int32Value());
 }
 
 void Thread::CleanupCpu() {
diff --git a/runtime/arch/x86_64/asm_support_x86_64.h b/runtime/arch/x86_64/asm_support_x86_64.h
index 03d9e24..70ef3ef 100644
--- a/runtime/arch/x86_64/asm_support_x86_64.h
+++ b/runtime/arch/x86_64/asm_support_x86_64.h
@@ -27,12 +27,12 @@
 #define RUNTIME_REF_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET 16
 
 // Offset of field Thread::self_ verified in InitCpu
-#define THREAD_SELF_OFFSET 72
+#define THREAD_SELF_OFFSET 184
 // Offset of field Thread::card_table_ verified in InitCpu
-#define THREAD_CARD_TABLE_OFFSET 8
+#define THREAD_CARD_TABLE_OFFSET 112
 // Offset of field Thread::exception_ verified in InitCpu
-#define THREAD_EXCEPTION_OFFSET 16
+#define THREAD_EXCEPTION_OFFSET 120
 // Offset of field Thread::thin_lock_thread_id_ verified in InitCpu
-#define THREAD_ID_OFFSET 112
+#define THREAD_ID_OFFSET 12
 
 #endif  // ART_RUNTIME_ARCH_X86_64_ASM_SUPPORT_X86_64_H_
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 0d75a89..17b8556 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -146,7 +146,6 @@
     // Outgoing argument set up
     mov %rsp, %rdx                    // pass SP
     mov %gs:THREAD_SELF_OFFSET, %rsi  // pass Thread::Current()
-    mov %rax, %rdi                    // pass arg1
     call PLT_VAR(cxx_name, 1)     // cxx_name(arg1, Thread*, SP)
     int3                          // unreached
     END_FUNCTION VAR(c_name, 0)
diff --git a/runtime/arch/x86_64/registers_x86_64.h b/runtime/arch/x86_64/registers_x86_64.h
index b9d06b5..8b0dc07 100644
--- a/runtime/arch/x86_64/registers_x86_64.h
+++ b/runtime/arch/x86_64/registers_x86_64.h
@@ -67,7 +67,7 @@
   XMM15 = 15,
   kNumberOfFloatRegisters = 16
 };
-std::ostream& operator<<(std::ostream& os, const Register& rhs);
+std::ostream& operator<<(std::ostream& os, const FloatRegister& rhs);
 
 }  // namespace x86_64
 }  // namespace art
diff --git a/runtime/arch/x86_64/thread_x86_64.cc b/runtime/arch/x86_64/thread_x86_64.cc
index b74fc5d..de4c56a 100644
--- a/runtime/arch/x86_64/thread_x86_64.cc
+++ b/runtime/arch/x86_64/thread_x86_64.cc
@@ -36,11 +36,11 @@
   arch_prctl(ARCH_SET_GS, this);
 
   // Allow easy indirection back to Thread*.
-  self_ = this;
+  tlsPtr_.self = this;
 
   // Sanity check that reads from %gs point to this Thread*.
   Thread* self_check;
-  CHECK_EQ(THREAD_SELF_OFFSET, OFFSETOF_MEMBER(Thread, self_));
+  CHECK_EQ(THREAD_SELF_OFFSET, SelfOffset<8>().Int32Value());
   __asm__ __volatile__("movq %%gs:(%1), %0"
       : "=r"(self_check)  // output
       : "r"(THREAD_SELF_OFFSET)  // input
@@ -54,15 +54,15 @@
            Runtime::GetCalleeSaveMethodOffset(Runtime::kRefsOnly));
   CHECK_EQ(static_cast<size_t>(RUNTIME_REF_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET),
            Runtime::GetCalleeSaveMethodOffset(Runtime::kRefsAndArgs));
-  CHECK_EQ(THREAD_EXCEPTION_OFFSET, OFFSETOF_MEMBER(Thread, exception_));
-  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, OFFSETOF_MEMBER(Thread, card_table_));
-  CHECK_EQ(THREAD_ID_OFFSET, OFFSETOF_MEMBER(Thread, thin_lock_thread_id_));
+  CHECK_EQ(THREAD_EXCEPTION_OFFSET, ExceptionOffset<8>().Int32Value());
+  CHECK_EQ(THREAD_CARD_TABLE_OFFSET, CardTableOffset<8>().Int32Value());
+  CHECK_EQ(THREAD_ID_OFFSET, ThinLockIdOffset<8>().Int32Value());
 }
 
 void Thread::CleanupCpu() {
   // Sanity check that reads from %gs point to this Thread*.
   Thread* self_check;
-  CHECK_EQ(THREAD_SELF_OFFSET, OFFSETOF_MEMBER(Thread, self_));
+  CHECK_EQ(THREAD_SELF_OFFSET, SelfOffset<8>().Int32Value());
   __asm__ __volatile__("movq %%gs:(%1), %0"
       : "=r"(self_check)  // output
       : "r"(THREAD_SELF_OFFSET)  // input
diff --git a/runtime/debugger.cc b/runtime/debugger.cc
index 024f830..2872a02 100644
--- a/runtime/debugger.cc
+++ b/runtime/debugger.cc
@@ -1924,7 +1924,7 @@
   if (error != JDWP::ERR_NONE) {
     return error;
   }
-  thread->Interrupt();
+  thread->Interrupt(soa.Self());
   return JDWP::ERR_NONE;
 }
 
diff --git a/runtime/entrypoints/interpreter/interpreter_entrypoints.h b/runtime/entrypoints/interpreter/interpreter_entrypoints.h
index c7df4e6..d8b2204 100644
--- a/runtime/entrypoints/interpreter/interpreter_entrypoints.h
+++ b/runtime/entrypoints/interpreter/interpreter_entrypoints.h
@@ -21,9 +21,8 @@
 #include "dex_file.h"
 #include "offsets.h"
 
-#define INTERPRETER_ENTRYPOINT_OFFSET(x) \
-    ThreadOffset(static_cast<uintptr_t>(OFFSETOF_MEMBER(Thread, interpreter_entrypoints_)) + \
-                 static_cast<uintptr_t>(OFFSETOF_MEMBER(InterpreterEntryPoints, x)))
+#define INTERPRETER_ENTRYPOINT_OFFSET(ptr_size, x) \
+    Thread::InterpreterEntryPointOffset<ptr_size>(OFFSETOF_MEMBER(InterpreterEntryPoints, x))
 
 namespace art {
 
diff --git a/runtime/entrypoints/jni/jni_entrypoints.h b/runtime/entrypoints/jni/jni_entrypoints.h
index 0a53447..6fb0560 100644
--- a/runtime/entrypoints/jni/jni_entrypoints.h
+++ b/runtime/entrypoints/jni/jni_entrypoints.h
@@ -20,9 +20,8 @@
 #include "base/macros.h"
 #include "offsets.h"
 
-#define JNI_ENTRYPOINT_OFFSET(x) \
-    ThreadOffset(static_cast<uintptr_t>(OFFSETOF_MEMBER(Thread, jni_entrypoints_)) + \
-                 static_cast<uintptr_t>(OFFSETOF_MEMBER(JniEntryPoints, x)))
+#define JNI_ENTRYPOINT_OFFSET(ptr_size, x) \
+    Thread::JniEntryPointOffset<ptr_size>(OFFSETOF_MEMBER(JniEntryPoints, x))
 
 namespace art {
 
diff --git a/runtime/entrypoints/portable/portable_entrypoints.h b/runtime/entrypoints/portable/portable_entrypoints.h
index dbea707..6f77e1c 100644
--- a/runtime/entrypoints/portable/portable_entrypoints.h
+++ b/runtime/entrypoints/portable/portable_entrypoints.h
@@ -27,9 +27,8 @@
 }  // namespace mirror
 class Thread;
 
-#define PORTABLE_ENTRYPOINT_OFFSET(x) \
-    ThreadOffset(static_cast<uintptr_t>(OFFSETOF_MEMBER(Thread, portable_entrypoints_)) + \
-                 static_cast<uintptr_t>(OFFSETOF_MEMBER(PortableEntryPoints, x)))
+#define PORTABLE_ENTRYPOINT_OFFSET(ptr_size, x) \
+    Thread::PortableEntryPointOffset<ptr_size>(OFFSETOF_MEMBER(PortableEntryPoints, x))
 
 // Pointers to functions that are called by code generated by compiler's adhering to the portable
 // compiler ABI.
diff --git a/runtime/entrypoints/quick/quick_entrypoints.h b/runtime/entrypoints/quick/quick_entrypoints.h
index 5c3b824..ec69e28 100644
--- a/runtime/entrypoints/quick/quick_entrypoints.h
+++ b/runtime/entrypoints/quick/quick_entrypoints.h
@@ -22,9 +22,8 @@
 #include "base/macros.h"
 #include "offsets.h"
 
-#define QUICK_ENTRYPOINT_OFFSET(x) \
-    ThreadOffset(static_cast<uintptr_t>(OFFSETOF_MEMBER(Thread, quick_entrypoints_)) + \
-                 static_cast<uintptr_t>(OFFSETOF_MEMBER(QuickEntryPoints, x)))
+#define QUICK_ENTRYPOINT_OFFSET(ptr_size, x) \
+    Thread::QuickEntryPointOffset<ptr_size>(OFFSETOF_MEMBER(QuickEntryPoints, x))
 
 namespace art {
 
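Call sites of these entrypoint macros now pass the target pointer size
explicitly, so a compiler building for one word size can compute offsets into
the other's Thread layout. An illustrative use, following the macro definition
above (pDeliverException stands in for any QuickEntryPoints member):

    // Offset of the entrypoint within a 32-bit target Thread...
    ThreadOffset<4> off32 = QUICK_ENTRYPOINT_OFFSET(4, pDeliverException);
    // ...and within a 64-bit target Thread.
    ThreadOffset<8> off64 = QUICK_ENTRYPOINT_OFFSET(8, pDeliverException);
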
diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc
index 19fdc63..f5f6f16 100644
--- a/runtime/gc/allocator/rosalloc.cc
+++ b/runtime/gc/allocator/rosalloc.cc
@@ -565,7 +565,7 @@
 
   if (LIKELY(idx <= kMaxThreadLocalSizeBracketIdx)) {
     // Use a thread-local run.
-    Run* thread_local_run = reinterpret_cast<Run*>(self->rosalloc_runs_[idx]);
+    Run* thread_local_run = reinterpret_cast<Run*>(self->GetRosAllocRun(idx));
     if (UNLIKELY(thread_local_run == NULL)) {
       MutexLock mu(self, *size_bracket_locks_[idx]);
       thread_local_run = RefillRun(self, idx);
@@ -575,7 +575,7 @@
       DCHECK(non_full_runs_[idx].find(thread_local_run) == non_full_runs_[idx].end());
       DCHECK(full_runs_[idx].find(thread_local_run) == full_runs_[idx].end());
       thread_local_run->is_thread_local_ = 1;
-      self->rosalloc_runs_[idx] = thread_local_run;
+      self->SetRosAllocRun(idx, thread_local_run);
       DCHECK(!thread_local_run->IsFull());
     }
 
@@ -600,7 +600,7 @@
       } else {
         // No slots got freed. Try to refill the thread-local run.
         DCHECK(thread_local_run->IsFull());
-        self->rosalloc_runs_[idx] = NULL;
+        self->SetRosAllocRun(idx, nullptr);
         thread_local_run->is_thread_local_ = 0;
         if (kIsDebugBuild) {
           full_runs_[idx].insert(thread_local_run);
@@ -619,7 +619,7 @@
         DCHECK(non_full_runs_[idx].find(thread_local_run) == non_full_runs_[idx].end());
         DCHECK(full_runs_[idx].find(thread_local_run) == full_runs_[idx].end());
         thread_local_run->is_thread_local_ = 1;
-        self->rosalloc_runs_[idx] = thread_local_run;
+        self->SetRosAllocRun(idx, thread_local_run);
         DCHECK(!thread_local_run->IsFull());
       }
 
@@ -1602,11 +1602,11 @@
   WriterMutexLock wmu(self, bulk_free_lock_);
   for (size_t idx = 0; idx < kNumOfSizeBrackets; idx++) {
     MutexLock mu(self, *size_bracket_locks_[idx]);
-    Run* thread_local_run = reinterpret_cast<Run*>(thread->rosalloc_runs_[idx]);
+    Run* thread_local_run = reinterpret_cast<Run*>(thread->GetRosAllocRun(idx));
     if (thread_local_run != NULL) {
       DCHECK_EQ(thread_local_run->magic_num_, kMagicNum);
       DCHECK_NE(thread_local_run->is_thread_local_, 0);
-      thread->rosalloc_runs_[idx] = NULL;
+      thread->SetRosAllocRun(idx, nullptr);
       // Note the thread local run may not be full here.
       bool dont_care;
       thread_local_run->MergeThreadLocalFreeBitMapToAllocBitMap(&dont_care);
@@ -1659,7 +1659,7 @@
     WriterMutexLock wmu(self, bulk_free_lock_);
     for (size_t idx = 0; idx < kNumOfSizeBrackets; idx++) {
       MutexLock mu(self, *size_bracket_locks_[idx]);
-      Run* thread_local_run = reinterpret_cast<Run*>(thread->rosalloc_runs_[idx]);
+      Run* thread_local_run = reinterpret_cast<Run*>(thread->GetRosAllocRun(idx));
       DCHECK(thread_local_run == nullptr);
     }
   }
@@ -1924,7 +1924,7 @@
       Thread* thread = *it;
       for (size_t i = 0; i < kNumOfSizeBrackets; i++) {
         MutexLock mu(self, *rosalloc->size_bracket_locks_[i]);
-        Run* thread_local_run = reinterpret_cast<Run*>(thread->rosalloc_runs_[i]);
+        Run* thread_local_run = reinterpret_cast<Run*>(thread->GetRosAllocRun(i));
         if (thread_local_run == this) {
           CHECK(!owner_found)
               << "A thread local run has more than one owner thread " << Dump();
diff --git a/runtime/gc/space/bump_pointer_space.cc b/runtime/gc/space/bump_pointer_space.cc
index 6148894..a955cc8 100644
--- a/runtime/gc/space/bump_pointer_space.cc
+++ b/runtime/gc/space/bump_pointer_space.cc
@@ -213,7 +213,7 @@
   // since there can exist multiple bump pointer spaces which exist at the same time.
   if (num_blocks_ > 0) {
     for (Thread* thread : thread_list) {
-      total += thread->thread_local_pos_ - thread->thread_local_start_;
+      total += thread->GetThreadLocalBytesAllocated();
     }
   }
   return total;
@@ -231,15 +231,15 @@
   // since there can exist multiple bump pointer spaces which exist at the same time.
   if (num_blocks_ > 0) {
     for (Thread* thread : thread_list) {
-      total += thread->thread_local_objects_;
+      total += thread->GetThreadLocalObjectsAllocated();
     }
   }
   return total;
 }
 
 void BumpPointerSpace::RevokeThreadLocalBuffersLocked(Thread* thread) {
-  objects_allocated_.FetchAndAdd(thread->thread_local_objects_);
-  bytes_allocated_.FetchAndAdd(thread->thread_local_pos_ - thread->thread_local_start_);
+  objects_allocated_.FetchAndAdd(thread->GetThreadLocalObjectsAllocated());
+  bytes_allocated_.FetchAndAdd(thread->GetThreadLocalBytesAllocated());
   thread->SetTlab(nullptr, nullptr);
 }
 
diff --git a/runtime/jni_internal.cc b/runtime/jni_internal.cc
index 13aa77f..f7aeffd 100644
--- a/runtime/jni_internal.cc
+++ b/runtime/jni_internal.cc
@@ -2829,7 +2829,7 @@
       local_ref_cookie(IRT_FIRST_SEGMENT),
       locals(kLocalsInitial, kLocalsMax, kLocal),
       check_jni(false),
-      critical(false),
+      critical(0),
       monitors("monitors", kMonitorsInitial, kMonitorsMax) {
   functions = unchecked_functions = &gJniNativeInterface;
   if (vm->check_jni) {
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index 332aef0..2d3d318 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -157,7 +157,7 @@
 void Monitor::AppendToWaitSet(Thread* thread) {
   DCHECK(owner_ == Thread::Current());
   DCHECK(thread != NULL);
-  DCHECK(thread->wait_next_ == NULL) << thread->wait_next_;
+  DCHECK(thread->GetWaitNext() == nullptr) << thread->GetWaitNext();
   if (wait_set_ == NULL) {
     wait_set_ = thread;
     return;
@@ -165,10 +165,10 @@
 
   // push_back.
   Thread* t = wait_set_;
-  while (t->wait_next_ != NULL) {
-    t = t->wait_next_;
+  while (t->GetWaitNext() != nullptr) {
+    t = t->GetWaitNext();
   }
-  t->wait_next_ = thread;
+  t->SetWaitNext(thread);
 }
 
 /*
@@ -182,19 +182,19 @@
     return;
   }
   if (wait_set_ == thread) {
-    wait_set_ = thread->wait_next_;
-    thread->wait_next_ = NULL;
+    wait_set_ = thread->GetWaitNext();
+    thread->SetWaitNext(nullptr);
     return;
   }
 
   Thread* t = wait_set_;
-  while (t->wait_next_ != NULL) {
-    if (t->wait_next_ == thread) {
-      t->wait_next_ = thread->wait_next_;
-      thread->wait_next_ = NULL;
+  while (t->GetWaitNext() != NULL) {
+    if (t->GetWaitNext() == thread) {
+      t->SetWaitNext(thread->GetWaitNext());
+      thread->SetWaitNext(nullptr);
       return;
     }
-    t = t->wait_next_;
+    t = t->GetWaitNext();
   }
 }
 
@@ -226,6 +226,7 @@
     monitor_lock_.Unlock(self);  // Let go of locks in order.
     {
       ScopedThreadStateChange tsc(self, kBlocked);  // Change to blocked and give up mutator_lock_.
+      self->SetMonitorEnterObject(obj_);
       MutexLock mu2(self, monitor_lock_);  // Reacquire monitor_lock_ without mutator_lock_ for Wait.
       if (owner_ != NULL) {  // Did the owner_ give the lock up?
         ++num_waiters_;
@@ -248,6 +249,7 @@
           }
         }
       }
+      self->SetMonitorEnterObject(nullptr);
     }
     monitor_lock_.Lock(self);  // Reacquire locks in order.
   }
@@ -447,33 +449,33 @@
   bool was_interrupted = false;
   {
     // Pseudo-atomically wait on self's wait_cond_ and release the monitor lock.
-    MutexLock mu(self, *self->wait_mutex_);
+    MutexLock mu(self, *self->GetWaitMutex());
 
     // Set wait_monitor_ to the monitor object we will be waiting on. When wait_monitor_ is
     // non-NULL a notifying or interrupting thread must signal the thread's wait_cond_ to wake it
     // up.
-    DCHECK(self->wait_monitor_ == NULL);
-    self->wait_monitor_ = this;
+    DCHECK(self->GetWaitMonitor() == nullptr);
+    self->SetWaitMonitor(this);
 
     // Release the monitor lock.
     monitor_contenders_.Signal(self);
     monitor_lock_.Unlock(self);
 
     // Handle the case where the thread was interrupted before we called wait().
-    if (self->interrupted_) {
+    if (self->IsInterruptedLocked()) {
       was_interrupted = true;
     } else {
       // Wait for a notification or a timeout to occur.
       if (why == kWaiting) {
-        self->wait_cond_->Wait(self);
+        self->GetWaitConditionVariable()->Wait(self);
       } else {
         DCHECK(why == kTimedWaiting || why == kSleeping) << why;
-        self->wait_cond_->TimedWait(self, ms, ns);
+        self->GetWaitConditionVariable()->TimedWait(self, ms, ns);
       }
-      if (self->interrupted_) {
+      if (self->IsInterruptedLocked()) {
         was_interrupted = true;
       }
-      self->interrupted_ = false;
+      self->SetInterruptedLocked(false);
     }
   }
 
@@ -485,15 +487,15 @@
     // that a thread in a waiting/sleeping state has a non-null wait_monitor_ for debugging
     // and diagnostic purposes. (If you reset this earlier, stack dumps will claim that threads
     // are waiting on "null".)
-    MutexLock mu(self, *self->wait_mutex_);
-    DCHECK(self->wait_monitor_ != NULL);
-    self->wait_monitor_ = NULL;
+    MutexLock mu(self, *self->GetWaitMutex());
+    DCHECK(self->GetWaitMonitor() != nullptr);
+    self->SetWaitMonitor(nullptr);
   }
 
   // Re-acquire the monitor and lock.
   Lock(self);
   monitor_lock_.Lock(self);
-  self->wait_mutex_->AssertNotHeld(self);
+  self->GetWaitMutex()->AssertNotHeld(self);
 
   /*
    * We remove our thread from wait set after restoring the count
@@ -516,8 +518,8 @@
      * cleared when this exception is thrown."
      */
     {
-      MutexLock mu(self, *self->wait_mutex_);
-      self->interrupted_ = false;
+      MutexLock mu(self, *self->GetWaitMutex());
+      self->SetInterruptedLocked(false);
     }
     if (interruptShouldThrow) {
       ThrowLocation throw_location = self->GetCurrentLocationForThrow();
@@ -538,13 +540,13 @@
   // Signal the first waiting thread in the wait set.
   while (wait_set_ != NULL) {
     Thread* thread = wait_set_;
-    wait_set_ = thread->wait_next_;
-    thread->wait_next_ = NULL;
+    wait_set_ = thread->GetWaitNext();
+    thread->SetWaitNext(nullptr);
 
     // Check to see if the thread is still waiting.
-    MutexLock mu(self, *thread->wait_mutex_);
-    if (thread->wait_monitor_ != NULL) {
-      thread->wait_cond_->Signal(self);
+    MutexLock mu(self, *thread->GetWaitMutex());
+    if (thread->GetWaitMonitor() != nullptr) {
+      thread->GetWaitConditionVariable()->Signal(self);
       return;
     }
   }
@@ -561,8 +563,8 @@
   // Signal all threads in the wait set.
   while (wait_set_ != NULL) {
     Thread* thread = wait_set_;
-    wait_set_ = thread->wait_next_;
-    thread->wait_next_ = NULL;
+    wait_set_ = thread->GetWaitNext();
+    thread->SetWaitNext(nullptr);
     thread->Notify();
   }
 }
@@ -633,6 +635,7 @@
     ThreadList* thread_list = Runtime::Current()->GetThreadList();
     // Suspend the owner, inflate. First change to blocked and give up mutator_lock_.
     ScopedThreadStateChange tsc(self, kBlocked);
+    self->SetMonitorEnterObject(obj.get());
     if (lock_word == obj->GetLockWord()) {  // If lock word hasn't changed.
       bool timed_out;
       Thread* owner = thread_list->SuspendThreadByThreadId(owner_thread_id, false, &timed_out);
@@ -647,6 +650,7 @@
         thread_list->Resume(owner, false);
       }
     }
+    self->SetMonitorEnterObject(nullptr);
   }
 }
 
@@ -880,8 +884,8 @@
     }
     {
       Thread* self = Thread::Current();
-      MutexLock mu(self, *thread->wait_mutex_);
-      Monitor* monitor = thread->wait_monitor_;
+      MutexLock mu(self, *thread->GetWaitMutex());
+      Monitor* monitor = thread->GetWaitMonitor();
       if (monitor != NULL) {
         mirror::Object* object = monitor->obj_;
         object_identity_hashcode = object->IdentityHashCode();
@@ -890,7 +894,7 @@
     }
   } else if (state == kBlocked) {
     os << "  - waiting to lock ";
-    mirror::Object* object = thread->monitor_enter_object_;
+    mirror::Object* object = thread->GetMonitorEnterObject();
     if (object != NULL) {
       object_identity_hashcode = object->IdentityHashCode();
       lock_owner = object->GetLockOwnerThreadId();
@@ -915,11 +919,11 @@
 mirror::Object* Monitor::GetContendedMonitor(Thread* thread) {
   // This is used to implement JDWP's ThreadReference.CurrentContendedMonitor, and has a bizarre
   // definition of contended that includes a monitor a thread is trying to enter...
-  mirror::Object* result = thread->monitor_enter_object_;
+  mirror::Object* result = thread->GetMonitorEnterObject();
   if (result == NULL) {
     // ...but also a monitor that the thread is waiting on.
-    MutexLock mu(Thread::Current(), *thread->wait_mutex_);
-    Monitor* monitor = thread->wait_monitor_;
+    MutexLock mu(Thread::Current(), *thread->GetWaitMutex());
+    Monitor* monitor = thread->GetWaitMonitor();
     if (monitor != NULL) {
       result = monitor->GetObject();
     }
@@ -1118,7 +1122,7 @@
       Monitor* mon = lock_word.FatLockMonitor();
       owner_ = mon->owner_;
       entry_count_ = 1 + mon->lock_count_;
-      for (Thread* waiter = mon->wait_set_; waiter != NULL; waiter = waiter->wait_next_) {
+      for (Thread* waiter = mon->wait_set_; waiter != NULL; waiter = waiter->GetWaitNext()) {
         waiters_.push_back(waiter);
       }
       break;
diff --git a/runtime/native/java_lang_Thread.cc b/runtime/native/java_lang_Thread.cc
index de1b593..0b84005 100644
--- a/runtime/native/java_lang_Thread.cc
+++ b/runtime/native/java_lang_Thread.cc
@@ -104,11 +104,11 @@
 }
 
 static void Thread_nativeInterrupt(JNIEnv* env, jobject java_thread) {
-  ScopedObjectAccess soa(env);
+  ScopedFastNativeObjectAccess soa(env);
   MutexLock mu(soa.Self(), *Locks::thread_list_lock_);
   Thread* thread = Thread::FromManagedThread(soa, java_thread);
   if (thread != NULL) {
-    thread->Interrupt();
+    thread->Interrupt(soa.Self());
   }
 }
 
@@ -175,7 +175,7 @@
   NATIVE_METHOD(Thread, nativeCreate, "(Ljava/lang/Thread;JZ)V"),
   NATIVE_METHOD(Thread, nativeGetStatus, "(Z)I"),
   NATIVE_METHOD(Thread, nativeHoldsLock, "(Ljava/lang/Object;)Z"),
-  NATIVE_METHOD(Thread, nativeInterrupt, "()V"),
+  NATIVE_METHOD(Thread, nativeInterrupt, "!()V"),
   NATIVE_METHOD(Thread, nativeSetName, "(Ljava/lang/String;)V"),
   NATIVE_METHOD(Thread, nativeSetPriority, "(I)V"),
   NATIVE_METHOD(Thread, sleep, "!(Ljava/lang/Object;JI)V"),
diff --git a/runtime/offsets.h b/runtime/offsets.h
index ed4e49e..72a6b0f 100644
--- a/runtime/offsets.h
+++ b/runtime/offsets.h
@@ -50,6 +50,7 @@
 };
 
 // Offsets relative to the current running thread.
+template<size_t pointer_size>
 class ThreadOffset : public Offset {
  public:
   explicit ThreadOffset(size_t val) : Offset(val) {}
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index a8da2f8..f016189 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -576,7 +576,7 @@
   // objects. We can't supply a thread group yet; it will be fixed later. Since we are the main
   // thread, we do not get a java peer.
   Thread* self = Thread::Attach("main", false, NULL, false);
-  CHECK_EQ(self->thin_lock_thread_id_, ThreadList::kMainThreadId);
+  CHECK_EQ(self->GetThreadId(), ThreadList::kMainThreadId);
   CHECK(self != NULL);
 
   // Set us to runnable so tools using a runtime can allocate and GC by default
diff --git a/runtime/runtime_stats.h b/runtime/runtime_stats.h
index 05d3fbb..6ed7fd5 100644
--- a/runtime/runtime_stats.h
+++ b/runtime/runtime_stats.h
@@ -89,20 +89,20 @@
   }
 
   // Number of objects allocated.
-  int allocated_objects;
+  uint64_t allocated_objects;
   // Cumulative size of all objects allocated.
-  int allocated_bytes;
+  uint64_t allocated_bytes;
 
   // Number of objects freed.
-  int freed_objects;
+  uint64_t freed_objects;
   // Cumulative size of all freed objects.
-  int freed_bytes;
+  uint64_t freed_bytes;
 
   // Number of times an allocation triggered a GC.
-  int gc_for_alloc_count;
+  uint64_t gc_for_alloc_count;
 
   // Number of initialized classes.
-  int class_init_count;
+  uint64_t class_init_count;
   // Cumulative time spent in class initialization.
   uint64_t class_init_time_ns;
 
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index 66077f9..fc886d5 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -51,8 +51,8 @@
   DCHECK_NE(new_state, kRunnable);
   DCHECK_EQ(this, Thread::Current());
   union StateAndFlags old_state_and_flags;
-  old_state_and_flags.as_int = state_and_flags_.as_int;
-  state_and_flags_.as_struct.state = new_state;
+  old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
+  tls32_.state_and_flags.as_struct.state = new_state;
   return static_cast<ThreadState>(old_state_and_flags.as_struct.state);
 }
 
@@ -60,7 +60,7 @@
 #ifdef NDEBUG
   UNUSED(check_locks);  // Keep GCC happy about unused parameters.
 #else
-  CHECK_EQ(0u, no_thread_suspension_) << last_no_thread_suspension_cause_;
+  CHECK_EQ(0u, tls32_.no_thread_suspension) << tlsPtr_.last_no_thread_suspension_cause;
   if (check_locks) {
     bool bad_mutexes_held = false;
     for (int i = kLockLevelCount - 1; i >= 0; --i) {
@@ -88,7 +88,7 @@
   union StateAndFlags old_state_and_flags;
   union StateAndFlags new_state_and_flags;
   while (true) {
-    old_state_and_flags.as_int = state_and_flags_.as_int;
+    old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
     if (UNLIKELY((old_state_and_flags.as_struct.flags & kCheckpointRequest) != 0)) {
       RunCheckpointFunction();
       continue;
@@ -98,7 +98,7 @@
     new_state_and_flags.as_struct.flags = old_state_and_flags.as_struct.flags;
     new_state_and_flags.as_struct.state = new_state;
     int status = android_atomic_cas(old_state_and_flags.as_int, new_state_and_flags.as_int,
-                                       &state_and_flags_.as_int);
+                                       &tls32_.state_and_flags.as_int);
     if (LIKELY(status == 0)) {
       break;
     }
@@ -110,22 +110,22 @@
 inline ThreadState Thread::TransitionFromSuspendedToRunnable() {
   bool done = false;
   union StateAndFlags old_state_and_flags;
-  old_state_and_flags.as_int = state_and_flags_.as_int;
+  old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
   int16_t old_state = old_state_and_flags.as_struct.state;
   DCHECK_NE(static_cast<ThreadState>(old_state), kRunnable);
   do {
     Locks::mutator_lock_->AssertNotHeld(this);  // Otherwise we starve GC..
-    old_state_and_flags.as_int = state_and_flags_.as_int;
+    old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
     DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
     if (UNLIKELY((old_state_and_flags.as_struct.flags & kSuspendRequest) != 0)) {
       // Wait while our suspend count is non-zero.
       MutexLock mu(this, *Locks::thread_suspend_count_lock_);
-      old_state_and_flags.as_int = state_and_flags_.as_int;
+      old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
       DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
       while ((old_state_and_flags.as_struct.flags & kSuspendRequest) != 0) {
         // Re-check when Thread::resume_cond_ is notified.
         Thread::resume_cond_->Wait(this);
-        old_state_and_flags.as_int = state_and_flags_.as_int;
+        old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
         DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
       }
       DCHECK_EQ(GetSuspendCount(), 0);
@@ -133,7 +133,7 @@
     // Re-acquire shared mutator_lock_ access.
     Locks::mutator_lock_->SharedLock(this);
     // Atomically change from suspended to runnable if no suspend request pending.
-    old_state_and_flags.as_int = state_and_flags_.as_int;
+    old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
     DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
     if (LIKELY((old_state_and_flags.as_struct.flags & kSuspendRequest) == 0)) {
       union StateAndFlags new_state_and_flags;
@@ -141,7 +141,7 @@
       new_state_and_flags.as_struct.state = kRunnable;
       // CAS the value without a memory barrier, that occurred in the lock above.
       done = android_atomic_cas(old_state_and_flags.as_int, new_state_and_flags.as_int,
-                                &state_and_flags_.as_int) == 0;
+                                &tls32_.state_and_flags.as_int) == 0;
     }
     if (UNLIKELY(!done)) {
       // Failed to transition to Runnable. Release shared mutator_lock_ access and try again.
@@ -161,26 +161,27 @@
 }
 
 inline size_t Thread::TlabSize() const {
-  return thread_local_end_ - thread_local_pos_;
+  return tlsPtr_.thread_local_end - tlsPtr_.thread_local_pos;
 }
 
 inline mirror::Object* Thread::AllocTlab(size_t bytes) {
   DCHECK_GE(TlabSize(), bytes);
-  ++thread_local_objects_;
-  mirror::Object* ret = reinterpret_cast<mirror::Object*>(thread_local_pos_);
-  thread_local_pos_ += bytes;
+  ++tlsPtr_.thread_local_objects;
+  mirror::Object* ret = reinterpret_cast<mirror::Object*>(tlsPtr_.thread_local_pos);
+  tlsPtr_.thread_local_pos += bytes;
   return ret;
 }
 
 inline bool Thread::PushOnThreadLocalAllocationStack(mirror::Object* obj) {
-  DCHECK_LE(thread_local_alloc_stack_top_, thread_local_alloc_stack_end_);
-  if (thread_local_alloc_stack_top_ < thread_local_alloc_stack_end_) {
+  DCHECK_LE(tlsPtr_.thread_local_alloc_stack_top, tlsPtr_.thread_local_alloc_stack_end);
+  if (tlsPtr_.thread_local_alloc_stack_top < tlsPtr_.thread_local_alloc_stack_end) {
     // There's room.
-    DCHECK_LE(reinterpret_cast<byte*>(thread_local_alloc_stack_top_) + sizeof(mirror::Object*),
-              reinterpret_cast<byte*>(thread_local_alloc_stack_end_));
-    DCHECK(*thread_local_alloc_stack_top_ == nullptr);
-    *thread_local_alloc_stack_top_ = obj;
-    ++thread_local_alloc_stack_top_;
+    DCHECK_LE(reinterpret_cast<byte*>(tlsPtr_.thread_local_alloc_stack_top) +
+                  sizeof(mirror::Object*),
+              reinterpret_cast<byte*>(tlsPtr_.thread_local_alloc_stack_end));
+    DCHECK(*tlsPtr_.thread_local_alloc_stack_top == nullptr);
+    *tlsPtr_.thread_local_alloc_stack_top = obj;
+    ++tlsPtr_.thread_local_alloc_stack_top;
     return true;
   }
   return false;
@@ -193,8 +194,8 @@
   DCHECK_ALIGNED(start, sizeof(mirror::Object*));
   DCHECK_ALIGNED(end, sizeof(mirror::Object*));
   DCHECK_LT(start, end);
-  thread_local_alloc_stack_end_ = end;
-  thread_local_alloc_stack_top_ = start;
+  tlsPtr_.thread_local_alloc_stack_end = end;
+  tlsPtr_.thread_local_alloc_stack_top = start;
 }
 
 inline void Thread::RevokeThreadLocalAllocationStack() {
@@ -204,8 +205,8 @@
     DCHECK(this == self || IsSuspended() || GetState() == kWaitingPerformingGc)
         << GetState() << " thread " << this << " self " << self;
   }
-  thread_local_alloc_stack_end_ = nullptr;
-  thread_local_alloc_stack_top_ = nullptr;
+  tlsPtr_.thread_local_alloc_stack_end = nullptr;
+  tlsPtr_.thread_local_alloc_stack_top = nullptr;
 }
 
 }  // namespace art
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 3692b9f..fd5b599 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -79,57 +79,49 @@
 static const char* kThreadNameDuringStartup = "<native thread without managed peer>";
 
 void Thread::InitCardTable() {
-  card_table_ = Runtime::Current()->GetHeap()->GetCardTable()->GetBiasedBegin();
+  tlsPtr_.card_table = Runtime::Current()->GetHeap()->GetCardTable()->GetBiasedBegin();
 }
 
-#if !defined(__APPLE__)
 static void UnimplementedEntryPoint() {
   UNIMPLEMENTED(FATAL);
 }
-#endif
 
 void InitEntryPoints(InterpreterEntryPoints* ipoints, JniEntryPoints* jpoints,
                      PortableEntryPoints* ppoints, QuickEntryPoints* qpoints);
 
 void Thread::InitTlsEntryPoints() {
-#if !defined(__APPLE__)  // The Mac GCC is too old to accept this code.
   // Insert a placeholder so we can easily tell if we call an unimplemented entry point.
-  uintptr_t* begin = reinterpret_cast<uintptr_t*>(&interpreter_entrypoints_);
-  uintptr_t* end = reinterpret_cast<uintptr_t*>(reinterpret_cast<uint8_t*>(begin) + sizeof(quick_entrypoints_));
+  uintptr_t* begin = reinterpret_cast<uintptr_t*>(&tlsPtr_.interpreter_entrypoints);
+  uintptr_t* end = reinterpret_cast<uintptr_t*>(reinterpret_cast<uint8_t*>(begin) +
+                                                sizeof(tlsPtr_.quick_entrypoints));
   for (uintptr_t* it = begin; it != end; ++it) {
     *it = reinterpret_cast<uintptr_t>(UnimplementedEntryPoint);
   }
-  begin = reinterpret_cast<uintptr_t*>(&interpreter_entrypoints_);
-  end = reinterpret_cast<uintptr_t*>(reinterpret_cast<uint8_t*>(begin) + sizeof(portable_entrypoints_));
-  for (uintptr_t* it = begin; it != end; ++it) {
-    *it = reinterpret_cast<uintptr_t>(UnimplementedEntryPoint);
-  }
-#endif
-  InitEntryPoints(&interpreter_entrypoints_, &jni_entrypoints_, &portable_entrypoints_,
-                  &quick_entrypoints_);
+  InitEntryPoints(&tlsPtr_.interpreter_entrypoints, &tlsPtr_.jni_entrypoints,
+                  &tlsPtr_.portable_entrypoints, &tlsPtr_.quick_entrypoints);
 }
 
 void Thread::ResetQuickAllocEntryPointsForThread() {
-  ResetQuickAllocEntryPoints(&quick_entrypoints_);
+  ResetQuickAllocEntryPoints(&tlsPtr_.quick_entrypoints);
 }
 
 void Thread::SetDeoptimizationShadowFrame(ShadowFrame* sf) {
-  deoptimization_shadow_frame_ = sf;
+  tlsPtr_.deoptimization_shadow_frame = sf;
 }
 
 void Thread::SetDeoptimizationReturnValue(const JValue& ret_val) {
-  deoptimization_return_value_.SetJ(ret_val.GetJ());
+  tls64_.deoptimization_return_value.SetJ(ret_val.GetJ());
 }
 
 ShadowFrame* Thread::GetAndClearDeoptimizationShadowFrame(JValue* ret_val) {
-  ShadowFrame* sf = deoptimization_shadow_frame_;
-  deoptimization_shadow_frame_ = nullptr;
-  ret_val->SetJ(deoptimization_return_value_.GetJ());
+  ShadowFrame* sf = tlsPtr_.deoptimization_shadow_frame;
+  tlsPtr_.deoptimization_shadow_frame = nullptr;
+  ret_val->SetJ(tls64_.deoptimization_return_value.GetJ());
   return sf;
 }
 
 void Thread::InitTid() {
-  tid_ = ::art::GetTid();
+  tls32_.tid = ::art::GetTid();
 }
 
 void Thread::InitAfterFork() {
@@ -159,10 +151,10 @@
     ScopedObjectAccess soa(self);
 
     // Copy peer into self, deleting global reference when done.
-    CHECK(self->jpeer_ != nullptr);
-    self->opeer_ = soa.Decode<mirror::Object*>(self->jpeer_);
-    self->GetJniEnv()->DeleteGlobalRef(self->jpeer_);
-    self->jpeer_ = nullptr;
+    CHECK(self->tlsPtr_.jpeer != nullptr);
+    self->tlsPtr_.opeer = soa.Decode<mirror::Object*>(self->tlsPtr_.jpeer);
+    self->GetJniEnv()->DeleteGlobalRef(self->tlsPtr_.jpeer);
+    self->tlsPtr_.jpeer = nullptr;
 
     {
       SirtRef<mirror::String> thread_name(self, self->GetThreadName(soa));
@@ -171,7 +163,7 @@
     Dbg::PostThreadStart(self);
 
     // Invoke the 'run' method of our java.lang.Thread.
-    mirror::Object* receiver = self->opeer_;
+    mirror::Object* receiver = self->tlsPtr_.opeer;
     jmethodID mid = WellKnownClasses::java_lang_Thread_run;
     InvokeVirtualOrInterfaceWithJValues(soa, receiver, mid, nullptr);
   }
@@ -237,7 +229,7 @@
 // is the StackOverflow reserved region used when creating the StackOverflow
 // exception.
 void Thread::InstallImplicitProtection(bool is_main_stack) {
-  byte* pregion = stack_end_;
+  byte* pregion = tlsPtr_.stack_end;
 
   constexpr uint32_t kMarker = 0xdadadada;
   uintptr_t *marker = reinterpret_cast<uintptr_t*>(pregion);
@@ -288,7 +280,7 @@
 
   Thread* child_thread = new Thread(is_daemon);
   // Use global JNI ref to hold peer live while child thread starts.
-  child_thread->jpeer_ = env->NewGlobalRef(java_peer);
+  child_thread->tlsPtr_.jpeer = env->NewGlobalRef(java_peer);
   stack_size = FixStackSize(stack_size);
 
   // Thread.start is synchronized, so we know that nativePeer is 0, and know that we're not racing to
@@ -311,8 +303,8 @@
       runtime->EndThreadBirth();
     }
     // Manually delete the global reference since Thread::Init will not have been run.
-    env->DeleteGlobalRef(child_thread->jpeer_);
-    child_thread->jpeer_ = nullptr;
+    env->DeleteGlobalRef(child_thread->tlsPtr_.jpeer);
+    child_thread->tlsPtr_.jpeer = nullptr;
     delete child_thread;
     child_thread = nullptr;
     // TODO: remove from thread group?
@@ -340,15 +332,15 @@
   InitTid();
   // Set pthread_self_ ahead of pthread_setspecific, that makes Thread::Current function, this
   // avoids pthread_self_ ever being invalid when discovered from Thread::Current().
-  pthread_self_ = pthread_self();
+  tlsPtr_.pthread_self = pthread_self();
   CHECK(is_started_);
   CHECK_PTHREAD_CALL(pthread_setspecific, (Thread::pthread_key_self_, this), "attach self");
   DCHECK_EQ(Thread::Current(), this);
 
-  thin_lock_thread_id_ = thread_list->AllocThreadId(this);
+  tls32_.thin_lock_thread_id = thread_list->AllocThreadId(this);
   InitStackHwm();
 
-  jni_env_ = new JNIEnvExt(this, java_vm);
+  tlsPtr_.jni_env = new JNIEnvExt(this, java_vm);
   thread_list->Register(this);
 }
 
@@ -385,7 +377,7 @@
   } else {
     // These aren't necessary, but they improve diagnostics for unit tests & command-line tools.
     if (thread_name != nullptr) {
-      self->name_->assign(thread_name);
+      self->tlsPtr_.name->assign(thread_name);
       ::art::SetThreadName(thread_name);
     }
   }
@@ -396,7 +388,7 @@
 void Thread::CreatePeer(const char* name, bool as_daemon, jobject thread_group) {
   Runtime* runtime = Runtime::Current();
   CHECK(runtime->IsStarted());
-  JNIEnv* env = jni_env_;
+  JNIEnv* env = tlsPtr_.jni_env;
 
   if (thread_group == nullptr) {
     thread_group = runtime->GetMainThreadGroup();
@@ -412,7 +404,7 @@
   }
   {
     ScopedObjectAccess soa(this);
-    opeer_ = soa.Decode<mirror::Object*>(peer.get());
+    tlsPtr_.opeer = soa.Decode<mirror::Object*>(peer.get());
   }
   env->CallNonvirtualVoidMethod(peer.get(),
                                 WellKnownClasses::java_lang_Thread,
@@ -422,8 +414,8 @@
 
   Thread* self = this;
   DCHECK_EQ(self, Thread::Current());
-  jni_env_->SetLongField(peer.get(), WellKnownClasses::java_lang_Thread_nativePeer,
-                         reinterpret_cast<jlong>(self));
+  env->SetLongField(peer.get(), WellKnownClasses::java_lang_Thread_nativePeer,
+                    reinterpret_cast<jlong>(self));
 
   ScopedObjectAccess soa(self);
   SirtRef<mirror::String> peer_thread_name(soa.Self(), GetThreadName(soa));
@@ -449,34 +441,36 @@
 void Thread::InitPeer(ScopedObjectAccess& soa, jboolean thread_is_daemon, jobject thread_group,
                       jobject thread_name, jint thread_priority) {
   soa.DecodeField(WellKnownClasses::java_lang_Thread_daemon)->
-      SetBoolean<kTransactionActive>(opeer_, thread_is_daemon);
+      SetBoolean<kTransactionActive>(tlsPtr_.opeer, thread_is_daemon);
   soa.DecodeField(WellKnownClasses::java_lang_Thread_group)->
-      SetObject<kTransactionActive>(opeer_, soa.Decode<mirror::Object*>(thread_group));
+      SetObject<kTransactionActive>(tlsPtr_.opeer, soa.Decode<mirror::Object*>(thread_group));
   soa.DecodeField(WellKnownClasses::java_lang_Thread_name)->
-      SetObject<kTransactionActive>(opeer_, soa.Decode<mirror::Object*>(thread_name));
+      SetObject<kTransactionActive>(tlsPtr_.opeer, soa.Decode<mirror::Object*>(thread_name));
   soa.DecodeField(WellKnownClasses::java_lang_Thread_priority)->
-      SetInt<kTransactionActive>(opeer_, thread_priority);
+      SetInt<kTransactionActive>(tlsPtr_.opeer, thread_priority);
 }
 
 void Thread::SetThreadName(const char* name) {
-  name_->assign(name);
+  tlsPtr_.name->assign(name);
   ::art::SetThreadName(name);
   Dbg::DdmSendThreadNotification(this, CHUNK_TYPE("THNM"));
 }
 
 void Thread::InitStackHwm() {
-  void* stack_base;
-  size_t stack_size;
-  GetThreadStack(pthread_self_, &stack_base, &stack_size);
+  void* read_stack_base;
+  size_t read_stack_size;
+  GetThreadStack(tlsPtr_.pthread_self, &read_stack_base, &read_stack_size);
 
   // TODO: include this in the thread dumps; potentially useful in SIGQUIT output?
-  VLOG(threads) << StringPrintf("Native stack is at %p (%s)", stack_base, PrettySize(stack_size).c_str());
+  VLOG(threads) << StringPrintf("Native stack is at %p (%s)", read_stack_base,
+                                PrettySize(read_stack_size).c_str());
 
-  stack_begin_ = reinterpret_cast<byte*>(stack_base);
-  stack_size_ = stack_size;
+  tlsPtr_.stack_begin = reinterpret_cast<byte*>(read_stack_base);
+  tlsPtr_.stack_size = read_stack_size;
 
-  if (stack_size_ <= kStackOverflowReservedBytes) {
-    LOG(FATAL) << "Attempt to attach a thread with a too-small stack (" << stack_size_ << " bytes)";
+  if (read_stack_size <= kStackOverflowReservedBytes) {
+    LOG(FATAL) << "Attempt to attach a thread with a too-small stack (" << read_stack_size
+        << " bytes)";
   }
 
   // TODO: move this into the Linux GetThreadStack implementation.
@@ -500,12 +494,12 @@
       CHECK_PTHREAD_CALL(pthread_attr_destroy, (&default_attributes), "default stack size query");
 
       // ...and use that as our limit.
-      size_t old_stack_size = stack_size_;
-      stack_size_ = default_stack_size;
-      stack_begin_ += (old_stack_size - stack_size_);
+      size_t old_stack_size = read_stack_size;
+      tlsPtr_.stack_size = default_stack_size;
+      tlsPtr_.stack_begin += (old_stack_size - default_stack_size);
       VLOG(threads) << "Limiting unlimited stack (reported as " << PrettySize(old_stack_size) << ")"
-                    << " to " << PrettySize(stack_size_)
-                    << " with base " << reinterpret_cast<void*>(stack_begin_);
+                    << " to " << PrettySize(default_stack_size)
+                    << " with base " << reinterpret_cast<void*>(tlsPtr_.stack_begin);
     }
   }
 #endif
@@ -521,16 +515,16 @@
       // to install our own region so we need to move the limits
       // of the stack to make room for it.
       constexpr uint32_t kDelta = 16 * KB;
-      stack_begin_ += kDelta;
-      stack_end_ += kDelta;
-      stack_size_ -= kDelta;
+      tlsPtr_.stack_begin += kDelta;
+      tlsPtr_.stack_end += kDelta;
+      tlsPtr_.stack_size -= kDelta;
     }
     InstallImplicitProtection(is_main_thread);
   }
 
   // Sanity check.
   int stack_variable;
-  CHECK_GT(&stack_variable, reinterpret_cast<void*>(stack_end_));
+  CHECK_GT(&stack_variable, reinterpret_cast<void*>(tlsPtr_.stack_end));
 }
 
 void Thread::ShortDump(std::ostream& os) const {
@@ -542,8 +536,8 @@
   }
   os << GetState()
            << ",Thread*=" << this
-           << ",peer=" << opeer_
-           << ",\"" << *name_ << "\""
+           << ",peer=" << tlsPtr_.opeer
+           << ",\"" << *tlsPtr_.name << "\""
            << "]";
 }
 
@@ -554,17 +548,17 @@
 
 mirror::String* Thread::GetThreadName(const ScopedObjectAccessUnchecked& soa) const {
   mirror::ArtField* f = soa.DecodeField(WellKnownClasses::java_lang_Thread_name);
-  return (opeer_ != nullptr) ? reinterpret_cast<mirror::String*>(f->GetObject(opeer_)) : nullptr;
+  return (tlsPtr_.opeer != nullptr) ? reinterpret_cast<mirror::String*>(f->GetObject(tlsPtr_.opeer)) : nullptr;
 }
 
 void Thread::GetThreadName(std::string& name) const {
-  name.assign(*name_);
+  name.assign(*tlsPtr_.name);
 }
 
 uint64_t Thread::GetCpuMicroTime() const {
 #if defined(HAVE_POSIX_CLOCKS)
   clockid_t cpu_clock_id;
-  pthread_getcpuclockid(pthread_self_, &cpu_clock_id);
+  pthread_getcpuclockid(tlsPtr_.pthread_self, &cpu_clock_id);
   timespec now;
   clock_gettime(cpu_clock_id, &now);
   return static_cast<uint64_t>(now.tv_sec) * UINT64_C(1000000) + now.tv_nsec / UINT64_C(1000);
@@ -575,11 +569,11 @@
 }
 
 void Thread::AtomicSetFlag(ThreadFlag flag) {
-  android_atomic_or(flag, &state_and_flags_.as_int);
+  android_atomic_or(flag, &tls32_.state_and_flags.as_int);
 }
 
 void Thread::AtomicClearFlag(ThreadFlag flag) {
-  android_atomic_and(-1 ^ flag, &state_and_flags_.as_int);
+  android_atomic_and(-1 ^ flag, &tls32_.state_and_flags.as_int);
 }
 
 // Attempt to rectify locks so that we dump thread list with required locks before exiting.
@@ -604,24 +598,24 @@
 }
 
 void Thread::ModifySuspendCount(Thread* self, int delta, bool for_debugger) {
-  DCHECK(delta == -1 || delta == +1 || delta == -debug_suspend_count_)
-      << delta << " " << debug_suspend_count_ << " " << this;
-  DCHECK_GE(suspend_count_, debug_suspend_count_) << this;
+  DCHECK(delta == -1 || delta == +1 || delta == -tls32_.debug_suspend_count)
+      << delta << " " << tls32_.debug_suspend_count << " " << this;
+  DCHECK_GE(tls32_.suspend_count, tls32_.debug_suspend_count) << this;
   Locks::thread_suspend_count_lock_->AssertHeld(self);
   if (this != self && !IsSuspended()) {
     Locks::thread_list_lock_->AssertHeld(self);
   }
-  if (UNLIKELY(delta < 0 && suspend_count_ <= 0)) {
+  if (UNLIKELY(delta < 0 && tls32_.suspend_count <= 0)) {
     UnsafeLogFatalForSuspendCount(self, this);
     return;
   }
 
-  suspend_count_ += delta;
+  tls32_.suspend_count += delta;
   if (for_debugger) {
-    debug_suspend_count_ += delta;
+    tls32_.debug_suspend_count += delta;
   }
 
-  if (suspend_count_ == 0) {
+  if (tls32_.suspend_count == 0) {
     AtomicClearFlag(kSuspendRequest);
   } else {
     AtomicSetFlag(kSuspendRequest);
@@ -639,8 +633,8 @@
   {
     MutexLock mu(this, *Locks::thread_suspend_count_lock_);
     for (uint32_t i = 0; i < kMaxCheckpoints; ++i) {
-      checkpoints[i] = checkpoint_functions_[i];
-      checkpoint_functions_[i] = nullptr;
+      checkpoints[i] = tlsPtr_.checkpoint_functions[i];
+      tlsPtr_.checkpoint_functions[i] = nullptr;
     }
     AtomicClearFlag(kCheckpointRequest);
   }
@@ -661,14 +655,14 @@
 
 bool Thread::RequestCheckpoint(Closure* function) {
   union StateAndFlags old_state_and_flags;
-  old_state_and_flags.as_int = state_and_flags_.as_int;
+  old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
   if (old_state_and_flags.as_struct.state != kRunnable) {
     return false;  // Fail, thread is suspended and so can't run a checkpoint.
   }
 
   uint32_t available_checkpoint = kMaxCheckpoints;
   for (uint32_t i = 0 ; i < kMaxCheckpoints; ++i) {
-    if (checkpoint_functions_[i] == nullptr) {
+    if (tlsPtr_.checkpoint_functions[i] == nullptr) {
       available_checkpoint = i;
       break;
     }
@@ -677,7 +671,7 @@
     // No checkpoint functions available, we can't run a checkpoint
     return false;
   }
-  checkpoint_functions_[available_checkpoint] = function;
+  tlsPtr_.checkpoint_functions[available_checkpoint] = function;
 
   // Checkpoint function installed now install flag bit.
   // We must be runnable to request a checkpoint.
@@ -686,11 +680,11 @@
   new_state_and_flags.as_int = old_state_and_flags.as_int;
   new_state_and_flags.as_struct.flags |= kCheckpointRequest;
   int succeeded = android_atomic_acquire_cas(old_state_and_flags.as_int, new_state_and_flags.as_int,
-                                         &state_and_flags_.as_int);
+                                             &tls32_.state_and_flags.as_int);
   if (UNLIKELY(succeeded != 0)) {
     // The thread changed state before the checkpoint was installed.
-    CHECK_EQ(checkpoint_functions_[available_checkpoint], function);
-    checkpoint_functions_[available_checkpoint] = nullptr;
+    CHECK_EQ(tlsPtr_.checkpoint_functions[available_checkpoint], function);
+    tlsPtr_.checkpoint_functions[available_checkpoint] = nullptr;
   } else {
     CHECK_EQ(ReadFlag(kCheckpointRequest), true);
     TriggerSuspend();
@@ -715,13 +709,15 @@
   bool is_daemon = false;
   Thread* self = Thread::Current();
 
-  if (self != nullptr && thread != nullptr && thread->opeer_ != nullptr) {
+  if (self != nullptr && thread != nullptr && thread->tlsPtr_.opeer != nullptr) {
     ScopedObjectAccessUnchecked soa(self);
-    priority = soa.DecodeField(WellKnownClasses::java_lang_Thread_priority)->GetInt(thread->opeer_);
-    is_daemon = soa.DecodeField(WellKnownClasses::java_lang_Thread_daemon)->GetBoolean(thread->opeer_);
+    priority = soa.DecodeField(WellKnownClasses::java_lang_Thread_priority)
+        ->GetInt(thread->tlsPtr_.opeer);
+    is_daemon = soa.DecodeField(WellKnownClasses::java_lang_Thread_daemon)
+        ->GetBoolean(thread->tlsPtr_.opeer);
 
     mirror::Object* thread_group =
-        soa.DecodeField(WellKnownClasses::java_lang_Thread_group)->GetObject(thread->opeer_);
+        soa.DecodeField(WellKnownClasses::java_lang_Thread_group)->GetObject(thread->tlsPtr_.opeer);
 
     if (thread_group != nullptr) {
       mirror::ArtField* group_name_field =
@@ -740,7 +736,7 @@
   }
 
   if (thread != nullptr) {
-    os << '"' << *thread->name_ << '"';
+    os << '"' << *thread->tlsPtr_.name << '"';
     if (is_daemon) {
       os << " daemon";
     }
@@ -760,9 +756,9 @@
   if (thread != nullptr) {
     MutexLock mu(self, *Locks::thread_suspend_count_lock_);
     os << "  | group=\"" << group_name << "\""
-       << " sCount=" << thread->suspend_count_
-       << " dsCount=" << thread->debug_suspend_count_
-       << " obj=" << reinterpret_cast<void*>(thread->opeer_)
+       << " sCount=" << thread->tls32_.suspend_count
+       << " dsCount=" << thread->tls32_.debug_suspend_count
+       << " obj=" << reinterpret_cast<void*>(thread->tlsPtr_.opeer)
        << " self=" << reinterpret_cast<const void*>(thread) << "\n";
   }
 
@@ -772,9 +768,10 @@
   if (thread != nullptr) {
     int policy;
     sched_param sp;
-    CHECK_PTHREAD_CALL(pthread_getschedparam, (thread->pthread_self_, &policy, &sp), __FUNCTION__);
+    CHECK_PTHREAD_CALL(pthread_getschedparam, (thread->tlsPtr_.pthread_self, &policy, &sp),
+                       __FUNCTION__);
     os << " sched=" << policy << "/" << sp.sched_priority
-       << " handle=" << reinterpret_cast<void*>(thread->pthread_self_);
+       << " handle=" << reinterpret_cast<void*>(thread->tlsPtr_.pthread_self);
   }
   os << "\n";
 
@@ -799,8 +796,9 @@
      << " core=" << task_cpu
      << " HZ=" << sysconf(_SC_CLK_TCK) << "\n";
   if (thread != nullptr) {
-    os << "  | stack=" << reinterpret_cast<void*>(thread->stack_begin_) << "-" << reinterpret_cast<void*>(thread->stack_end_)
-       << " stackSize=" << PrettySize(thread->stack_size_) << "\n";
+    os << "  | stack=" << reinterpret_cast<void*>(thread->tlsPtr_.stack_begin) << "-"
+        << reinterpret_cast<void*>(thread->tlsPtr_.stack_end) << " stackSize="
+        << PrettySize(thread->tlsPtr_.stack_size) << "\n";
   }
 }
 
@@ -919,7 +917,8 @@
       DumpNativeStack(os, GetTid(), "  native: ", false, method_ref.get());
     }
     UniquePtr<Context> context(Context::Create());
-    StackDumpVisitor dumper(os, const_cast<Thread*>(this), context.get(), !throwing_OutOfMemoryError_);
+    StackDumpVisitor dumper(os, const_cast<Thread*>(this), context.get(),
+                            !tls32_.throwing_OutOfMemoryError);
     dumper.WalkStack();
   } else {
     os << "Not able to dump stack of thread that isn't suspended";
@@ -928,11 +927,12 @@
 
 void Thread::ThreadExitCallback(void* arg) {
   Thread* self = reinterpret_cast<Thread*>(arg);
-  if (self->thread_exit_check_count_ == 0) {
-    LOG(WARNING) << "Native thread exiting without having called DetachCurrentThread (maybe it's going to use a pthread_key_create destructor?): " << *self;
+  if (self->tls32_.thread_exit_check_count == 0) {
+    LOG(WARNING) << "Native thread exiting without having called DetachCurrentThread (maybe it's "
+        "going to use a pthread_key_create destructor?): " << *self;
     CHECK(is_started_);
     CHECK_PTHREAD_CALL(pthread_setspecific, (Thread::pthread_key_self_, self), "reattach self");
-    self->thread_exit_check_count_ = 1;
+    self->tls32_.thread_exit_check_count = 1;
   } else {
     LOG(FATAL) << "Native thread exited without calling DetachCurrentThread: " << *self;
   }
@@ -984,58 +984,21 @@
   }
 }
 
-Thread::Thread(bool daemon)
-    : suspend_count_(0),
-      card_table_(nullptr),
-      exception_(nullptr),
-      stack_end_(nullptr),
-      managed_stack_(),
-      jni_env_(nullptr),
-      self_(nullptr),
-      opeer_(nullptr),
-      jpeer_(nullptr),
-      stack_begin_(nullptr),
-      stack_size_(0),
-      thin_lock_thread_id_(0),
-      stack_trace_sample_(nullptr),
-      trace_clock_base_(0),
-      tid_(0),
-      wait_mutex_(new Mutex("a thread wait mutex")),
-      wait_cond_(new ConditionVariable("a thread wait condition variable", *wait_mutex_)),
-      wait_monitor_(nullptr),
-      interrupted_(false),
-      wait_next_(nullptr),
-      monitor_enter_object_(nullptr),
-      top_sirt_(nullptr),
-      runtime_(nullptr),
-      class_loader_override_(nullptr),
-      long_jump_context_(nullptr),
-      throwing_OutOfMemoryError_(false),
-      debug_suspend_count_(0),
-      debug_invoke_req_(new DebugInvokeReq),
-      single_step_control_(new SingleStepControl),
-      deoptimization_shadow_frame_(nullptr),
-      instrumentation_stack_(new std::deque<instrumentation::InstrumentationStackFrame>),
-      name_(new std::string(kThreadNameDuringStartup)),
-      daemon_(daemon),
-      pthread_self_(0),
-      no_thread_suspension_(0),
-      last_no_thread_suspension_cause_(nullptr),
-      suspend_trigger_(reinterpret_cast<uintptr_t*>(&suspend_trigger_)),
-      thread_exit_check_count_(0),
-      thread_local_start_(nullptr),
-      thread_local_pos_(nullptr),
-      thread_local_end_(nullptr),
-      thread_local_objects_(0),
-      thread_local_alloc_stack_top_(nullptr),
-      thread_local_alloc_stack_end_(nullptr) {
+Thread::Thread(bool daemon) : tls32_(daemon), wait_monitor_(nullptr), interrupted_(false) {
+  wait_mutex_ = new Mutex("a thread wait mutex");
+  wait_cond_ = new ConditionVariable("a thread wait condition variable", *wait_mutex_);
+  tlsPtr_.debug_invoke_req = new DebugInvokeReq;
+  tlsPtr_.single_step_control = new SingleStepControl;
+  tlsPtr_.instrumentation_stack = new std::deque<instrumentation::InstrumentationStackFrame>;
+  tlsPtr_.name = new std::string(kThreadNameDuringStartup);
+
   CHECK_EQ((sizeof(Thread) % 4), 0U) << sizeof(Thread);
-  state_and_flags_.as_struct.flags = 0;
-  state_and_flags_.as_struct.state = kNative;
-  memset(&held_mutexes_[0], 0, sizeof(held_mutexes_));
-  memset(rosalloc_runs_, 0, sizeof(rosalloc_runs_));
+  tls32_.state_and_flags.as_struct.flags = 0;
+  tls32_.state_and_flags.as_struct.state = kNative;
+  memset(&tlsPtr_.held_mutexes[0], 0, sizeof(tlsPtr_.held_mutexes));
+  memset(tlsPtr_.rosalloc_runs, 0, sizeof(tlsPtr_.rosalloc_runs));
   for (uint32_t i = 0; i < kMaxCheckpoints; ++i) {
-    checkpoint_functions_[i] = nullptr;
+    tlsPtr_.checkpoint_functions[i] = nullptr;
   }
 }
 
@@ -1046,7 +1009,8 @@
   // assigned fairly early on, and needs to be.
   // It turns out that the last thing to change is the thread name; that's a good proxy for "has
   // this thread _ever_ entered kRunnable".
-  return (jpeer_ == nullptr && opeer_ == nullptr) || (*name_ == kThreadNameDuringStartup);
+  return (tlsPtr_.jpeer == nullptr && tlsPtr_.opeer == nullptr) ||
+      (*tlsPtr_.name == kThreadNameDuringStartup);
 }
 
 void Thread::AssertNoPendingException() const {
@@ -1084,7 +1048,7 @@
   Thread* self = this;
   DCHECK_EQ(self, Thread::Current());
 
-  if (opeer_ != nullptr) {
+  if (tlsPtr_.opeer != nullptr) {
     ScopedObjectAccess soa(self);
     // We may need to call user-supplied managed code, do this before final clean-up.
     HandleUncaughtExceptions(soa);
@@ -1092,16 +1056,18 @@
 
     // this.nativePeer = 0;
     if (Runtime::Current()->IsActiveTransaction()) {
-      soa.DecodeField(WellKnownClasses::java_lang_Thread_nativePeer)->SetLong<true>(opeer_, 0);
+      soa.DecodeField(WellKnownClasses::java_lang_Thread_nativePeer)
+          ->SetLong<true>(tlsPtr_.opeer, 0);
     } else {
-      soa.DecodeField(WellKnownClasses::java_lang_Thread_nativePeer)->SetLong<false>(opeer_, 0);
+      soa.DecodeField(WellKnownClasses::java_lang_Thread_nativePeer)
+          ->SetLong<false>(tlsPtr_.opeer, 0);
     }
     Dbg::PostThreadDeath(self);
 
     // Thread.join() is implemented as an Object.wait() on the Thread.lock object. Signal anyone
     // who is waiting.
     mirror::Object* lock =
-        soa.DecodeField(WellKnownClasses::java_lang_Thread_lock)->GetObject(opeer_);
+        soa.DecodeField(WellKnownClasses::java_lang_Thread_lock)->GetObject(tlsPtr_.opeer);
     // (This conditional is only needed for tests, where Thread.lock won't have been set.)
     if (lock != nullptr) {
       SirtRef<mirror::Object> sirt_obj(self, lock);
@@ -1111,29 +1077,29 @@
   }
 
   // On thread detach, all monitors entered with JNI MonitorEnter are automatically exited.
-  if (jni_env_ != nullptr) {
-    jni_env_->monitors.VisitRoots(MonitorExitVisitor, self, 0, kRootVMInternal);
+  if (tlsPtr_.jni_env != nullptr) {
+    tlsPtr_.jni_env->monitors.VisitRoots(MonitorExitVisitor, self, 0, kRootVMInternal);
   }
 }
 
 Thread::~Thread() {
-  if (jni_env_ != nullptr && jpeer_ != nullptr) {
+  if (tlsPtr_.jni_env != nullptr && tlsPtr_.jpeer != nullptr) {
     // If pthread_create fails we don't have a jni env here.
-    jni_env_->DeleteGlobalRef(jpeer_);
-    jpeer_ = nullptr;
+    tlsPtr_.jni_env->DeleteGlobalRef(tlsPtr_.jpeer);
+    tlsPtr_.jpeer = nullptr;
   }
-  opeer_ = nullptr;
+  tlsPtr_.opeer = nullptr;
 
-  bool initialized = (jni_env_ != nullptr);  // Did Thread::Init run?
+  bool initialized = (tlsPtr_.jni_env != nullptr);  // Did Thread::Init run?
   if (initialized) {
-    delete jni_env_;
-    jni_env_ = nullptr;
+    delete tlsPtr_.jni_env;
+    tlsPtr_.jni_env = nullptr;
   }
   CHECK_NE(GetState(), kRunnable);
   CHECK_NE(ReadFlag(kCheckpointRequest), true);
-  CHECK(checkpoint_functions_[0] == nullptr);
-  CHECK(checkpoint_functions_[1] == nullptr);
-  CHECK(checkpoint_functions_[2] == nullptr);
+  CHECK(tlsPtr_.checkpoint_functions[0] == nullptr);
+  CHECK(tlsPtr_.checkpoint_functions[1] == nullptr);
+  CHECK(tlsPtr_.checkpoint_functions[2] == nullptr);
 
   // We may be deleting a still born thread.
   SetStateUnsafe(kTerminated);
@@ -1141,19 +1107,19 @@
   delete wait_cond_;
   delete wait_mutex_;
 
-  if (long_jump_context_ != nullptr) {
-    delete long_jump_context_;
+  if (tlsPtr_.long_jump_context != nullptr) {
+    delete tlsPtr_.long_jump_context;
   }
 
   if (initialized) {
     CleanupCpu();
   }
 
-  delete debug_invoke_req_;
-  delete single_step_control_;
-  delete instrumentation_stack_;
-  delete name_;
-  delete stack_trace_sample_;
+  delete tlsPtr_.debug_invoke_req;
+  delete tlsPtr_.single_step_control;
+  delete tlsPtr_.instrumentation_stack;
+  delete tlsPtr_.name;
+  delete tlsPtr_.stack_trace_sample;
 
   Runtime::Current()->GetHeap()->RevokeThreadLocalBuffers(this);
 
@@ -1164,47 +1130,50 @@
   if (!IsExceptionPending()) {
     return;
   }
-  ScopedLocalRef<jobject> peer(jni_env_, soa.AddLocalReference<jobject>(opeer_));
+  ScopedLocalRef<jobject> peer(tlsPtr_.jni_env, soa.AddLocalReference<jobject>(tlsPtr_.opeer));
   ScopedThreadStateChange tsc(this, kNative);
 
   // Get and clear the exception.
-  ScopedLocalRef<jthrowable> exception(jni_env_, jni_env_->ExceptionOccurred());
-  jni_env_->ExceptionClear();
+  ScopedLocalRef<jthrowable> exception(tlsPtr_.jni_env, tlsPtr_.jni_env->ExceptionOccurred());
+  tlsPtr_.jni_env->ExceptionClear();
 
   // If the thread has its own handler, use that.
-  ScopedLocalRef<jobject> handler(jni_env_,
-                                  jni_env_->GetObjectField(peer.get(),
-                                                           WellKnownClasses::java_lang_Thread_uncaughtHandler));
+  ScopedLocalRef<jobject> handler(tlsPtr_.jni_env,
+                                  tlsPtr_.jni_env->GetObjectField(peer.get(),
+                                      WellKnownClasses::java_lang_Thread_uncaughtHandler));
   if (handler.get() == nullptr) {
     // Otherwise use the thread group's default handler.
-    handler.reset(jni_env_->GetObjectField(peer.get(), WellKnownClasses::java_lang_Thread_group));
+    handler.reset(tlsPtr_.jni_env->GetObjectField(peer.get(),
+                                                  WellKnownClasses::java_lang_Thread_group));
   }
 
   // Call the handler.
-  jni_env_->CallVoidMethod(handler.get(),
-                           WellKnownClasses::java_lang_Thread$UncaughtExceptionHandler_uncaughtException,
-                           peer.get(), exception.get());
+  tlsPtr_.jni_env->CallVoidMethod(handler.get(),
+      WellKnownClasses::java_lang_Thread$UncaughtExceptionHandler_uncaughtException,
+      peer.get(), exception.get());
 
   // If the handler threw, clear that exception too.
-  jni_env_->ExceptionClear();
+  tlsPtr_.jni_env->ExceptionClear();
 }
 
 void Thread::RemoveFromThreadGroup(ScopedObjectAccess& soa) {
   // this.group.removeThread(this);
   // group can be null if we're in the compiler or a test.
-  mirror::Object* ogroup = soa.DecodeField(WellKnownClasses::java_lang_Thread_group)->GetObject(opeer_);
+  mirror::Object* ogroup = soa.DecodeField(WellKnownClasses::java_lang_Thread_group)
+      ->GetObject(tlsPtr_.opeer);
   if (ogroup != nullptr) {
     ScopedLocalRef<jobject> group(soa.Env(), soa.AddLocalReference<jobject>(ogroup));
-    ScopedLocalRef<jobject> peer(soa.Env(), soa.AddLocalReference<jobject>(opeer_));
+    ScopedLocalRef<jobject> peer(soa.Env(), soa.AddLocalReference<jobject>(tlsPtr_.opeer));
     ScopedThreadStateChange tsc(soa.Self(), kNative);
-    jni_env_->CallVoidMethod(group.get(), WellKnownClasses::java_lang_ThreadGroup_removeThread,
-                             peer.get());
+    tlsPtr_.jni_env->CallVoidMethod(group.get(),
+                                    WellKnownClasses::java_lang_ThreadGroup_removeThread,
+                                    peer.get());
   }
 }
 
 size_t Thread::NumSirtReferences() {
   size_t count = 0;
-  for (StackIndirectReferenceTable* cur = top_sirt_; cur; cur = cur->GetLink()) {
+  for (StackIndirectReferenceTable* cur = tlsPtr_.top_sirt; cur; cur = cur->GetLink()) {
     count += cur->NumberOfReferences();
   }
   return count;
@@ -1213,17 +1182,17 @@
 bool Thread::SirtContains(jobject obj) const {
   StackReference<mirror::Object>* sirt_entry =
       reinterpret_cast<StackReference<mirror::Object>*>(obj);
-  for (StackIndirectReferenceTable* cur = top_sirt_; cur; cur = cur->GetLink()) {
+  for (StackIndirectReferenceTable* cur = tlsPtr_.top_sirt; cur; cur = cur->GetLink()) {
     if (cur->Contains(sirt_entry)) {
       return true;
     }
   }
   // JNI code invoked from portable code uses shadow frames rather than the SIRT.
-  return managed_stack_.ShadowFramesContain(sirt_entry);
+  return tlsPtr_.managed_stack.ShadowFramesContain(sirt_entry);
 }
 
 void Thread::SirtVisitRoots(RootCallback* visitor, void* arg, uint32_t thread_id) {
-  for (StackIndirectReferenceTable* cur = top_sirt_; cur; cur = cur->GetLink()) {
+  for (StackIndirectReferenceTable* cur = tlsPtr_.top_sirt; cur; cur = cur->GetLink()) {
     size_t num_refs = cur->NumberOfReferences();
     for (size_t j = 0; j < num_refs; ++j) {
       mirror::Object* object = cur->GetReference(j);
@@ -1248,7 +1217,7 @@
   mirror::Object* result;
   // The "kinds" below are sorted by the frequency we expect to encounter them.
   if (kind == kLocal) {
-    IndirectReferenceTable& locals = jni_env_->locals;
+    IndirectReferenceTable& locals = tlsPtr_.jni_env->locals;
     result = locals.Get(ref);
   } else if (kind == kSirtOrInvalid) {
     // TODO: make stack indirect reference table lookup more efficient.
@@ -1287,19 +1256,18 @@
 // Implements java.lang.Thread.interrupted.
 bool Thread::Interrupted() {
   MutexLock mu(Thread::Current(), *wait_mutex_);
-  bool interrupted = interrupted_;
-  interrupted_ = false;
+  bool interrupted = IsInterruptedLocked();
+  SetInterruptedLocked(false);
   return interrupted;
 }
 
 // Implements java.lang.Thread.isInterrupted.
 bool Thread::IsInterrupted() {
   MutexLock mu(Thread::Current(), *wait_mutex_);
-  return interrupted_;
+  return IsInterruptedLocked();
 }
 
-void Thread::Interrupt() {
-  Thread* self = Thread::Current();
+void Thread::Interrupt(Thread* self) {
   MutexLock mu(self, *wait_mutex_);
   if (interrupted_) {
     return;
@@ -1677,12 +1645,12 @@
 
 void Thread::ThrowOutOfMemoryError(const char* msg) {
   LOG(ERROR) << StringPrintf("Throwing OutOfMemoryError \"%s\"%s",
-      msg, (throwing_OutOfMemoryError_ ? " (recursive case)" : ""));
+      msg, (tls32_.throwing_OutOfMemoryError ? " (recursive case)" : ""));
   ThrowLocation throw_location = GetCurrentLocationForThrow();
-  if (!throwing_OutOfMemoryError_) {
-    throwing_OutOfMemoryError_ = true;
+  if (!tls32_.throwing_OutOfMemoryError) {
+    tls32_.throwing_OutOfMemoryError = true;
     ThrowNewException(throw_location, "Ljava/lang/OutOfMemoryError;", msg);
-    throwing_OutOfMemoryError_ = false;
+    tls32_.throwing_OutOfMemoryError = false;
   } else {
     Dump(LOG(ERROR));  // The pre-allocated OOME has no stack, so help out and log one.
     SetException(throw_location, Runtime::Current()->GetPreAllocatedOutOfMemoryError());
@@ -1705,140 +1673,146 @@
 #endif
 }
 
-struct EntryPointInfo {
-  uint32_t offset;
-  const char* name;
-};
-#define INTERPRETER_ENTRY_POINT_INFO(x) { INTERPRETER_ENTRYPOINT_OFFSET(x).Uint32Value(), #x }
-#define JNI_ENTRY_POINT_INFO(x)         { JNI_ENTRYPOINT_OFFSET(x).Uint32Value(), #x }
-#define PORTABLE_ENTRY_POINT_INFO(x)    { PORTABLE_ENTRYPOINT_OFFSET(x).Uint32Value(), #x }
-#define QUICK_ENTRY_POINT_INFO(x)       { QUICK_ENTRYPOINT_OFFSET(x).Uint32Value(), #x }
-static const EntryPointInfo gThreadEntryPointInfo[] = {
-  INTERPRETER_ENTRY_POINT_INFO(pInterpreterToInterpreterBridge),
-  INTERPRETER_ENTRY_POINT_INFO(pInterpreterToCompiledCodeBridge),
-  JNI_ENTRY_POINT_INFO(pDlsymLookup),
-  PORTABLE_ENTRY_POINT_INFO(pPortableImtConflictTrampoline),
-  PORTABLE_ENTRY_POINT_INFO(pPortableResolutionTrampoline),
-  PORTABLE_ENTRY_POINT_INFO(pPortableToInterpreterBridge),
-  QUICK_ENTRY_POINT_INFO(pAllocArray),
-  QUICK_ENTRY_POINT_INFO(pAllocArrayResolved),
-  QUICK_ENTRY_POINT_INFO(pAllocArrayWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pAllocObject),
-  QUICK_ENTRY_POINT_INFO(pAllocObjectResolved),
-  QUICK_ENTRY_POINT_INFO(pAllocObjectInitialized),
-  QUICK_ENTRY_POINT_INFO(pAllocObjectWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pCheckAndAllocArray),
-  QUICK_ENTRY_POINT_INFO(pCheckAndAllocArrayWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pInstanceofNonTrivial),
-  QUICK_ENTRY_POINT_INFO(pCheckCast),
-  QUICK_ENTRY_POINT_INFO(pInitializeStaticStorage),
-  QUICK_ENTRY_POINT_INFO(pInitializeTypeAndVerifyAccess),
-  QUICK_ENTRY_POINT_INFO(pInitializeType),
-  QUICK_ENTRY_POINT_INFO(pResolveString),
-  QUICK_ENTRY_POINT_INFO(pSet32Instance),
-  QUICK_ENTRY_POINT_INFO(pSet32Static),
-  QUICK_ENTRY_POINT_INFO(pSet64Instance),
-  QUICK_ENTRY_POINT_INFO(pSet64Static),
-  QUICK_ENTRY_POINT_INFO(pSetObjInstance),
-  QUICK_ENTRY_POINT_INFO(pSetObjStatic),
-  QUICK_ENTRY_POINT_INFO(pGet32Instance),
-  QUICK_ENTRY_POINT_INFO(pGet32Static),
-  QUICK_ENTRY_POINT_INFO(pGet64Instance),
-  QUICK_ENTRY_POINT_INFO(pGet64Static),
-  QUICK_ENTRY_POINT_INFO(pGetObjInstance),
-  QUICK_ENTRY_POINT_INFO(pGetObjStatic),
-  QUICK_ENTRY_POINT_INFO(pAputObjectWithNullAndBoundCheck),
-  QUICK_ENTRY_POINT_INFO(pAputObjectWithBoundCheck),
-  QUICK_ENTRY_POINT_INFO(pAputObject),
-  QUICK_ENTRY_POINT_INFO(pHandleFillArrayData),
-  QUICK_ENTRY_POINT_INFO(pJniMethodStart),
-  QUICK_ENTRY_POINT_INFO(pJniMethodStartSynchronized),
-  QUICK_ENTRY_POINT_INFO(pJniMethodEnd),
-  QUICK_ENTRY_POINT_INFO(pJniMethodEndSynchronized),
-  QUICK_ENTRY_POINT_INFO(pJniMethodEndWithReference),
-  QUICK_ENTRY_POINT_INFO(pJniMethodEndWithReferenceSynchronized),
-  QUICK_ENTRY_POINT_INFO(pQuickGenericJniTrampoline),
-  QUICK_ENTRY_POINT_INFO(pLockObject),
-  QUICK_ENTRY_POINT_INFO(pUnlockObject),
-  QUICK_ENTRY_POINT_INFO(pCmpgDouble),
-  QUICK_ENTRY_POINT_INFO(pCmpgFloat),
-  QUICK_ENTRY_POINT_INFO(pCmplDouble),
-  QUICK_ENTRY_POINT_INFO(pCmplFloat),
-  QUICK_ENTRY_POINT_INFO(pFmod),
-  QUICK_ENTRY_POINT_INFO(pSqrt),
-  QUICK_ENTRY_POINT_INFO(pL2d),
-  QUICK_ENTRY_POINT_INFO(pFmodf),
-  QUICK_ENTRY_POINT_INFO(pL2f),
-  QUICK_ENTRY_POINT_INFO(pD2iz),
-  QUICK_ENTRY_POINT_INFO(pF2iz),
-  QUICK_ENTRY_POINT_INFO(pIdivmod),
-  QUICK_ENTRY_POINT_INFO(pD2l),
-  QUICK_ENTRY_POINT_INFO(pF2l),
-  QUICK_ENTRY_POINT_INFO(pLdiv),
-  QUICK_ENTRY_POINT_INFO(pLmod),
-  QUICK_ENTRY_POINT_INFO(pLmul),
-  QUICK_ENTRY_POINT_INFO(pShlLong),
-  QUICK_ENTRY_POINT_INFO(pShrLong),
-  QUICK_ENTRY_POINT_INFO(pUshrLong),
-  QUICK_ENTRY_POINT_INFO(pIndexOf),
-  QUICK_ENTRY_POINT_INFO(pMemcmp16),
-  QUICK_ENTRY_POINT_INFO(pStringCompareTo),
-  QUICK_ENTRY_POINT_INFO(pMemcpy),
-  QUICK_ENTRY_POINT_INFO(pQuickImtConflictTrampoline),
-  QUICK_ENTRY_POINT_INFO(pQuickResolutionTrampoline),
-  QUICK_ENTRY_POINT_INFO(pQuickToInterpreterBridge),
-  QUICK_ENTRY_POINT_INFO(pInvokeDirectTrampolineWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pInvokeInterfaceTrampolineWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pInvokeStaticTrampolineWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pInvokeSuperTrampolineWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pInvokeVirtualTrampolineWithAccessCheck),
-  QUICK_ENTRY_POINT_INFO(pCheckSuspend),
-  QUICK_ENTRY_POINT_INFO(pTestSuspend),
-  QUICK_ENTRY_POINT_INFO(pDeliverException),
-  QUICK_ENTRY_POINT_INFO(pThrowArrayBounds),
-  QUICK_ENTRY_POINT_INFO(pThrowDivZero),
-  QUICK_ENTRY_POINT_INFO(pThrowNoSuchMethod),
-  QUICK_ENTRY_POINT_INFO(pThrowNullPointer),
-  QUICK_ENTRY_POINT_INFO(pThrowStackOverflow),
-};
-#undef QUICK_ENTRY_POINT_INFO
+// Explicitly instantiate 32 and 64bit thread offset dumping support.
+template void Thread::DumpThreadOffset<4>(std::ostream& os, uint32_t offset);
+template void Thread::DumpThreadOffset<8>(std::ostream& os, uint32_t offset);
 
-void Thread::DumpThreadOffset(std::ostream& os, uint32_t offset, size_t size_of_pointers) {
-  CHECK_EQ(size_of_pointers, 4U);  // TODO: support 64-bit targets.
-
-#define DO_THREAD_OFFSET(x) \
-    if (offset == static_cast<uint32_t>(OFFSETOF_VOLATILE_MEMBER(Thread, x))) { \
-      os << # x; \
+template<size_t ptr_size>
+void Thread::DumpThreadOffset(std::ostream& os, uint32_t offset) {
+#define DO_THREAD_OFFSET(x, y) \
+    if (offset == x.Uint32Value()) { \
+      os << y; \
       return; \
     }
-  DO_THREAD_OFFSET(state_and_flags_);
-  DO_THREAD_OFFSET(card_table_);
-  DO_THREAD_OFFSET(exception_);
-  DO_THREAD_OFFSET(opeer_);
-  DO_THREAD_OFFSET(jni_env_);
-  DO_THREAD_OFFSET(self_);
-  DO_THREAD_OFFSET(stack_end_);
-  DO_THREAD_OFFSET(suspend_count_);
-  DO_THREAD_OFFSET(thin_lock_thread_id_);
-  // DO_THREAD_OFFSET(top_of_managed_stack_);
-  // DO_THREAD_OFFSET(top_of_managed_stack_pc_);
-  DO_THREAD_OFFSET(top_sirt_);
-  DO_THREAD_OFFSET(suspend_trigger_);
+  DO_THREAD_OFFSET(ThreadFlagsOffset<ptr_size>(), "state_and_flags")
+  DO_THREAD_OFFSET(CardTableOffset<ptr_size>(), "card_table")
+  DO_THREAD_OFFSET(ExceptionOffset<ptr_size>(), "exception")
+  DO_THREAD_OFFSET(PeerOffset<ptr_size>(), "peer")
+  DO_THREAD_OFFSET(JniEnvOffset<ptr_size>(), "jni_env")
+  DO_THREAD_OFFSET(SelfOffset<ptr_size>(), "self")
+  DO_THREAD_OFFSET(StackEndOffset<ptr_size>(), "stack_end")
+  DO_THREAD_OFFSET(ThinLockIdOffset<ptr_size>(), "thin_lock_thread_id")
+  DO_THREAD_OFFSET(TopOfManagedStackOffset<ptr_size>(), "top_quick_frame_method")
+  DO_THREAD_OFFSET(TopOfManagedStackPcOffset<ptr_size>(), "top_quick_frame_pc")
+  DO_THREAD_OFFSET(TopShadowFrameOffset<ptr_size>(), "top_shadow_frame")
+  DO_THREAD_OFFSET(TopSirtOffset<ptr_size>(), "top_sirt")
+  DO_THREAD_OFFSET(ThreadSuspendTriggerOffset<ptr_size>(), "suspend_trigger")
 #undef DO_THREAD_OFFSET
 
-  size_t entry_point_count = arraysize(gThreadEntryPointInfo);
-  CHECK_EQ(entry_point_count * size_of_pointers,
-           sizeof(InterpreterEntryPoints) + sizeof(JniEntryPoints) + sizeof(PortableEntryPoints) +
-           sizeof(QuickEntryPoints));
-  uint32_t expected_offset = OFFSETOF_MEMBER(Thread, interpreter_entrypoints_);
-  for (size_t i = 0; i < entry_point_count; ++i) {
-    CHECK_EQ(gThreadEntryPointInfo[i].offset, expected_offset) << gThreadEntryPointInfo[i].name;
-    expected_offset += size_of_pointers;
-    if (gThreadEntryPointInfo[i].offset == offset) {
-      os << gThreadEntryPointInfo[i].name;
-      return;
+#define INTERPRETER_ENTRY_POINT_INFO(x) \
+    if (INTERPRETER_ENTRYPOINT_OFFSET(ptr_size, x).Uint32Value() == offset) { \
+      os << #x; \
+      return; \
     }
-  }
+  INTERPRETER_ENTRY_POINT_INFO(pInterpreterToInterpreterBridge)
+  INTERPRETER_ENTRY_POINT_INFO(pInterpreterToCompiledCodeBridge)
+#undef INTERPRETER_ENTRY_POINT_INFO
+
+#define JNI_ENTRY_POINT_INFO(x) \
+    if (JNI_ENTRYPOINT_OFFSET(ptr_size, x).Uint32Value() == offset) { \
+      os << #x; \
+      return; \
+    }
+  JNI_ENTRY_POINT_INFO(pDlsymLookup)
+#undef JNI_ENTRY_POINT_INFO
+
+#define PORTABLE_ENTRY_POINT_INFO(x) \
+    if (PORTABLE_ENTRYPOINT_OFFSET(ptr_size, x).Uint32Value() == offset) { \
+      os << #x; \
+      return; \
+    }
+  PORTABLE_ENTRY_POINT_INFO(pPortableImtConflictTrampoline)
+  PORTABLE_ENTRY_POINT_INFO(pPortableResolutionTrampoline)
+  PORTABLE_ENTRY_POINT_INFO(pPortableToInterpreterBridge)
+#undef PORTABLE_ENTRY_POINT_INFO
+
+#define QUICK_ENTRY_POINT_INFO(x) \
+    if (QUICK_ENTRYPOINT_OFFSET(ptr_size, x).Uint32Value() == offset) { \
+      os << #x; \
+      return; \
+    }
+  QUICK_ENTRY_POINT_INFO(pAllocArray)
+  QUICK_ENTRY_POINT_INFO(pAllocArrayResolved)
+  QUICK_ENTRY_POINT_INFO(pAllocArrayWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pAllocObject)
+  QUICK_ENTRY_POINT_INFO(pAllocObjectResolved)
+  QUICK_ENTRY_POINT_INFO(pAllocObjectInitialized)
+  QUICK_ENTRY_POINT_INFO(pAllocObjectWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pCheckAndAllocArray)
+  QUICK_ENTRY_POINT_INFO(pCheckAndAllocArrayWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pInstanceofNonTrivial)
+  QUICK_ENTRY_POINT_INFO(pCheckCast)
+  QUICK_ENTRY_POINT_INFO(pInitializeStaticStorage)
+  QUICK_ENTRY_POINT_INFO(pInitializeTypeAndVerifyAccess)
+  QUICK_ENTRY_POINT_INFO(pInitializeType)
+  QUICK_ENTRY_POINT_INFO(pResolveString)
+  QUICK_ENTRY_POINT_INFO(pSet32Instance)
+  QUICK_ENTRY_POINT_INFO(pSet32Static)
+  QUICK_ENTRY_POINT_INFO(pSet64Instance)
+  QUICK_ENTRY_POINT_INFO(pSet64Static)
+  QUICK_ENTRY_POINT_INFO(pSetObjInstance)
+  QUICK_ENTRY_POINT_INFO(pSetObjStatic)
+  QUICK_ENTRY_POINT_INFO(pGet32Instance)
+  QUICK_ENTRY_POINT_INFO(pGet32Static)
+  QUICK_ENTRY_POINT_INFO(pGet64Instance)
+  QUICK_ENTRY_POINT_INFO(pGet64Static)
+  QUICK_ENTRY_POINT_INFO(pGetObjInstance)
+  QUICK_ENTRY_POINT_INFO(pGetObjStatic)
+  QUICK_ENTRY_POINT_INFO(pAputObjectWithNullAndBoundCheck)
+  QUICK_ENTRY_POINT_INFO(pAputObjectWithBoundCheck)
+  QUICK_ENTRY_POINT_INFO(pAputObject)
+  QUICK_ENTRY_POINT_INFO(pHandleFillArrayData)
+  QUICK_ENTRY_POINT_INFO(pJniMethodStart)
+  QUICK_ENTRY_POINT_INFO(pJniMethodStartSynchronized)
+  QUICK_ENTRY_POINT_INFO(pJniMethodEnd)
+  QUICK_ENTRY_POINT_INFO(pJniMethodEndSynchronized)
+  QUICK_ENTRY_POINT_INFO(pJniMethodEndWithReference)
+  QUICK_ENTRY_POINT_INFO(pJniMethodEndWithReferenceSynchronized)
+  QUICK_ENTRY_POINT_INFO(pQuickGenericJniTrampoline)
+  QUICK_ENTRY_POINT_INFO(pLockObject)
+  QUICK_ENTRY_POINT_INFO(pUnlockObject)
+  QUICK_ENTRY_POINT_INFO(pCmpgDouble)
+  QUICK_ENTRY_POINT_INFO(pCmpgFloat)
+  QUICK_ENTRY_POINT_INFO(pCmplDouble)
+  QUICK_ENTRY_POINT_INFO(pCmplFloat)
+  QUICK_ENTRY_POINT_INFO(pFmod)
+  QUICK_ENTRY_POINT_INFO(pSqrt)
+  QUICK_ENTRY_POINT_INFO(pL2d)
+  QUICK_ENTRY_POINT_INFO(pFmodf)
+  QUICK_ENTRY_POINT_INFO(pL2f)
+  QUICK_ENTRY_POINT_INFO(pD2iz)
+  QUICK_ENTRY_POINT_INFO(pF2iz)
+  QUICK_ENTRY_POINT_INFO(pIdivmod)
+  QUICK_ENTRY_POINT_INFO(pD2l)
+  QUICK_ENTRY_POINT_INFO(pF2l)
+  QUICK_ENTRY_POINT_INFO(pLdiv)
+  QUICK_ENTRY_POINT_INFO(pLmod)
+  QUICK_ENTRY_POINT_INFO(pLmul)
+  QUICK_ENTRY_POINT_INFO(pShlLong)
+  QUICK_ENTRY_POINT_INFO(pShrLong)
+  QUICK_ENTRY_POINT_INFO(pUshrLong)
+  QUICK_ENTRY_POINT_INFO(pIndexOf)
+  QUICK_ENTRY_POINT_INFO(pMemcmp16)
+  QUICK_ENTRY_POINT_INFO(pStringCompareTo)
+  QUICK_ENTRY_POINT_INFO(pMemcpy)
+  QUICK_ENTRY_POINT_INFO(pQuickImtConflictTrampoline)
+  QUICK_ENTRY_POINT_INFO(pQuickResolutionTrampoline)
+  QUICK_ENTRY_POINT_INFO(pQuickToInterpreterBridge)
+  QUICK_ENTRY_POINT_INFO(pInvokeDirectTrampolineWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pInvokeInterfaceTrampolineWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pInvokeStaticTrampolineWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pInvokeSuperTrampolineWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pInvokeVirtualTrampolineWithAccessCheck)
+  QUICK_ENTRY_POINT_INFO(pCheckSuspend)
+  QUICK_ENTRY_POINT_INFO(pTestSuspend)
+  QUICK_ENTRY_POINT_INFO(pDeliverException)
+  QUICK_ENTRY_POINT_INFO(pThrowArrayBounds)
+  QUICK_ENTRY_POINT_INFO(pThrowDivZero)
+  QUICK_ENTRY_POINT_INFO(pThrowNoSuchMethod)
+  QUICK_ENTRY_POINT_INFO(pThrowNullPointer)
+  QUICK_ENTRY_POINT_INFO(pThrowStackOverflow)
+#undef QUICK_ENTRY_POINT_INFO
+
   os << offset;
 }
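// Illustrative use of the templated dumper above (the ostringstream wrapper is
// hypothetical; the offset helper and the printed name come from this patch):
//   std::ostringstream oss;
//   Thread::DumpThreadOffset<4>(oss, Thread::CardTableOffset<4>().Uint32Value());
//   // oss.str() == "card_table", per the DO_THREAD_OFFSET table above.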
 
@@ -1869,11 +1843,11 @@
 }
 
 Context* Thread::GetLongJumpContext() {
-  Context* result = long_jump_context_;
+  Context* result = tlsPtr_.long_jump_context;
   if (result == nullptr) {
     result = Context::Create();
   } else {
-    long_jump_context_ = nullptr;  // Avoid context being shared.
+    tlsPtr_.long_jump_context = nullptr;  // Avoid context being shared.
     result->Reset();
   }
   return result;
@@ -1918,11 +1892,11 @@
   return ThrowLocation(visitor.this_object_, visitor.method_, visitor.dex_pc_);
 }
 
-bool Thread::HoldsLock(mirror::Object* object) {
+bool Thread::HoldsLock(mirror::Object* object) const {
   if (object == nullptr) {
     return false;
   }
-  return object->GetLockOwnerThreadId() == thin_lock_thread_id_;
+  return object->GetLockOwnerThreadId() == GetThreadId();
 }
 
 // RootVisitor parameters are: (const Object* obj, size_t vreg, const StackVisitor* visitor).
@@ -2061,30 +2035,30 @@
 
 void Thread::SetClassLoaderOverride(mirror::ClassLoader* class_loader_override) {
   VerifyObject(class_loader_override);
-  class_loader_override_ = class_loader_override;
+  tlsPtr_.class_loader_override = class_loader_override;
 }
 
 void Thread::VisitRoots(RootCallback* visitor, void* arg) {
   uint32_t thread_id = GetThreadId();
-  if (opeer_ != nullptr) {
-    visitor(&opeer_, arg, thread_id, kRootThreadObject);
+  if (tlsPtr_.opeer != nullptr) {
+    visitor(&tlsPtr_.opeer, arg, thread_id, kRootThreadObject);
   }
-  if (exception_ != nullptr) {
-    visitor(reinterpret_cast<mirror::Object**>(&exception_), arg, thread_id, kRootNativeStack);
+  if (tlsPtr_.exception != nullptr) {
+    visitor(reinterpret_cast<mirror::Object**>(&tlsPtr_.exception), arg, thread_id, kRootNativeStack);
   }
-  throw_location_.VisitRoots(visitor, arg);
-  if (class_loader_override_ != nullptr) {
-    visitor(reinterpret_cast<mirror::Object**>(&class_loader_override_), arg, thread_id,
+  tlsPtr_.throw_location.VisitRoots(visitor, arg);
+  if (tlsPtr_.class_loader_override != nullptr) {
+    visitor(reinterpret_cast<mirror::Object**>(&tlsPtr_.class_loader_override), arg, thread_id,
             kRootNativeStack);
   }
-  jni_env_->locals.VisitRoots(visitor, arg, thread_id, kRootJNILocal);
-  jni_env_->monitors.VisitRoots(visitor, arg, thread_id, kRootJNIMonitor);
+  tlsPtr_.jni_env->locals.VisitRoots(visitor, arg, thread_id, kRootJNILocal);
+  tlsPtr_.jni_env->monitors.VisitRoots(visitor, arg, thread_id, kRootJNIMonitor);
   SirtVisitRoots(visitor, arg, thread_id);
-  if (debug_invoke_req_ != nullptr) {
-    debug_invoke_req_->VisitRoots(visitor, arg, thread_id, kRootDebugger);
+  if (tlsPtr_.debug_invoke_req != nullptr) {
+    tlsPtr_.debug_invoke_req->VisitRoots(visitor, arg, thread_id, kRootDebugger);
   }
-  if (single_step_control_ != nullptr) {
-    single_step_control_->VisitRoots(visitor, arg, thread_id, kRootDebugger);
+  if (tlsPtr_.single_step_control != nullptr) {
+    tlsPtr_.single_step_control->VisitRoots(visitor, arg, thread_id, kRootDebugger);
   }
   // Visit roots on this thread's stack
   Context* context = GetLongJumpContext();
@@ -2116,7 +2090,7 @@
 // Set the stack end to that to be used during a stack overflow
 void Thread::SetStackEndForStackOverflow() {
   // During stack overflow we allow use of the full stack.
-  if (stack_end_ == stack_begin_) {
+  if (tlsPtr_.stack_end == tlsPtr_.stack_begin) {
     // However, we seem to have already extended to use the full stack.
     LOG(ERROR) << "Need to increase kStackOverflowReservedBytes (currently "
                << kStackOverflowReservedBytes << ")?";
@@ -2124,23 +2098,23 @@
     LOG(FATAL) << "Recursive stack overflow.";
   }
 
-  stack_end_ = stack_begin_;
+  tlsPtr_.stack_end = tlsPtr_.stack_begin;
 }
 
 void Thread::SetTlab(byte* start, byte* end) {
   DCHECK_LE(start, end);
-  thread_local_start_ = start;
-  thread_local_pos_  = thread_local_start_;
-  thread_local_end_ = end;
-  thread_local_objects_ = 0;
+  tlsPtr_.thread_local_start = start;
+  tlsPtr_.thread_local_pos  = tlsPtr_.thread_local_start;
+  tlsPtr_.thread_local_end = end;
+  tlsPtr_.thread_local_objects = 0;
 }
 
 bool Thread::HasTlab() const {
-  bool has_tlab = thread_local_pos_ != nullptr;
+  bool has_tlab = tlsPtr_.thread_local_pos != nullptr;
   if (has_tlab) {
-    DCHECK(thread_local_start_ != nullptr && thread_local_end_ != nullptr);
+    DCHECK(tlsPtr_.thread_local_start != nullptr && tlsPtr_.thread_local_end != nullptr);
   } else {
-    DCHECK(thread_local_start_ == nullptr && thread_local_end_ == nullptr);
+    DCHECK(tlsPtr_.thread_local_start == nullptr && tlsPtr_.thread_local_end == nullptr);
   }
   return has_tlab;
 }
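// Sketch of how an allocation would consume the TLAB fields initialized above
// (assumed behavior for illustration; AllocTlab is declared in thread.h but its
// definition is not part of this hunk):
//   DCHECK_LE(bytes, static_cast<size_t>(tlsPtr_.thread_local_end - tlsPtr_.thread_local_pos));
//   byte* result = tlsPtr_.thread_local_pos;
//   tlsPtr_.thread_local_pos += bytes;  // bump the cursor
//   ++tlsPtr_.thread_local_objects;
//   return reinterpret_cast<mirror::Object*>(result);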
diff --git a/runtime/thread.h b/runtime/thread.h
index 63d22c5..59fe724 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -92,7 +92,7 @@
   kCheckpointRequest = 2  // Request that the thread do some checkpoint work and then continue.
 };
 
-class PACKED(4) Thread {
+class Thread {
  public:
   // Space to throw a StackOverflowError in.
   // TODO: shrink reserved space, in particular for 64bit.
@@ -145,7 +145,8 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Translates 172 to pAllocArrayFromCode and so on.
-  static void DumpThreadOffset(std::ostream& os, uint32_t offset, size_t size_of_pointers);
+  template<size_t size_of_pointers>
+  static void DumpThreadOffset(std::ostream& os, uint32_t offset);
 
   // Dumps a one-line summary of thread state (used for operator<<).
   void ShortDump(std::ostream& os) const;
@@ -162,32 +163,24 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   ThreadState GetState() const {
-    DCHECK(state_and_flags_.as_struct.state >= kTerminated && state_and_flags_.as_struct.state <= kSuspended);
-    return static_cast<ThreadState>(state_and_flags_.as_struct.state);
-  }
-
-  // This function can be used to make sure a thread's state is valid.
-  void CheckState(int id) const {
-    if (state_and_flags_.as_struct.state >= kTerminated && state_and_flags_.as_struct.state <= kSuspended) {
-      return;
-    }
-    LOG(INFO) << "Thread " << this << " state is invalid: " << state_and_flags_.as_struct.state << " id=" << id;
-    CHECK(false);
+    DCHECK_GE(tls32_.state_and_flags.as_struct.state, kTerminated);
+    DCHECK_LE(tls32_.state_and_flags.as_struct.state, kSuspended);
+    return static_cast<ThreadState>(tls32_.state_and_flags.as_struct.state);
   }
 
   ThreadState SetState(ThreadState new_state);
 
   int GetSuspendCount() const EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_suspend_count_lock_) {
-    return suspend_count_;
+    return tls32_.suspend_count;
   }
 
   int GetDebugSuspendCount() const EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_suspend_count_lock_) {
-    return debug_suspend_count_;
+    return tls32_.debug_suspend_count;
   }
 
   bool IsSuspended() const {
     union StateAndFlags state_and_flags;
-    state_and_flags.as_int = state_and_flags_.as_int;
+    state_and_flags.as_int = tls32_.state_and_flags.as_int;
     return state_and_flags.as_struct.state != kRunnable &&
         (state_and_flags.as_struct.flags & kSuspendRequest) != 0;
   }
@@ -221,9 +214,9 @@
   const char* StartAssertNoThreadSuspension(const char* cause) {
     if (kIsDebugBuild) {
       CHECK(cause != NULL);
-      const char* previous_cause = last_no_thread_suspension_cause_;
-      no_thread_suspension_++;
-      last_no_thread_suspension_cause_ = cause;
+      const char* previous_cause = tlsPtr_.last_no_thread_suspension_cause;
+      tls32_.no_thread_suspension++;
+      tlsPtr_.last_no_thread_suspension_cause = cause;
       return previous_cause;
     } else {
       return nullptr;
@@ -233,20 +226,20 @@
   // End region where no thread suspension is expected.
   void EndAssertNoThreadSuspension(const char* old_cause) {
     if (kIsDebugBuild) {
-      CHECK(old_cause != NULL || no_thread_suspension_ == 1);
-      CHECK_GT(no_thread_suspension_, 0U);
-      no_thread_suspension_--;
-      last_no_thread_suspension_cause_ = old_cause;
+      CHECK(old_cause != nullptr || tls32_.no_thread_suspension == 1);
+      CHECK_GT(tls32_.no_thread_suspension, 0U);
+      tls32_.no_thread_suspension--;
+      tlsPtr_.last_no_thread_suspension_cause = old_cause;
     }
   }
 
   void AssertThreadSuspensionIsAllowable(bool check_locks = true) const;
 
   bool IsDaemon() const {
-    return daemon_;
+    return tls32_.daemon;
   }
 
-  bool HoldsLock(mirror::Object*) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  bool HoldsLock(mirror::Object*) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   /*
    * Changes the priority of this thread to match that of the java.lang.Thread object.
@@ -265,11 +258,11 @@
   static int GetNativePriority();
 
   uint32_t GetThreadId() const {
-    return thin_lock_thread_id_;
+    return tls32_.thin_lock_thread_id;
   }
 
   pid_t GetTid() const {
-    return tid_;
+    return tls32_.tid;
   }
 
   // Returns the java.lang.Thread's name, or NULL if this Thread* doesn't have a peer.
@@ -287,30 +280,30 @@
   uint64_t GetCpuMicroTime() const;
 
   mirror::Object* GetPeer() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    CHECK(jpeer_ == NULL);
-    return opeer_;
+    CHECK(tlsPtr_.jpeer == nullptr);
+    return tlsPtr_.opeer;
   }
 
   bool HasPeer() const {
-    return jpeer_ != NULL || opeer_ != NULL;
+    return tlsPtr_.jpeer != nullptr || tlsPtr_.opeer != nullptr;
   }
 
   RuntimeStats* GetStats() {
-    return &stats_;
+    return &tls64_.stats;
   }
 
   bool IsStillStarting() const;
 
   bool IsExceptionPending() const {
-    return exception_ != NULL;
+    return tlsPtr_.exception != nullptr;
   }
 
   mirror::Throwable* GetException(ThrowLocation* throw_location) const
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    if (throw_location != NULL) {
-      *throw_location = throw_location_;
+    if (throw_location != nullptr) {
+      *throw_location = tlsPtr_.throw_location;
     }
-    return exception_;
+    return tlsPtr_.exception;
   }
 
   void AssertNoPendingException() const;
@@ -320,13 +313,13 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     CHECK(new_exception != NULL);
     // TODO: DCHECK(!IsExceptionPending());
-    exception_ = new_exception;
-    throw_location_ = throw_location;
+    tlsPtr_.exception = new_exception;
+    tlsPtr_.throw_location = throw_location;
   }
 
   void ClearException() {
-    exception_ = NULL;
-    throw_location_.Clear();
+    tlsPtr_.exception = nullptr;
+    tlsPtr_.throw_location.Clear();
   }
 
   // Find catch block and perform long jump to appropriate exception handler
@@ -334,8 +327,8 @@
 
   Context* GetLongJumpContext();
   void ReleaseLongJumpContext(Context* context) {
-    DCHECK(long_jump_context_ == NULL);
-    long_jump_context_ = context;
+    DCHECK(tlsPtr_.long_jump_context == nullptr);
+    tlsPtr_.long_jump_context = context;
   }
 
   mirror::ArtMethod* GetCurrentMethod(uint32_t* dex_pc) const
@@ -344,16 +337,17 @@
   ThrowLocation GetCurrentLocationForThrow() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void SetTopOfStack(mirror::ArtMethod** top_method, uintptr_t pc) {
-    managed_stack_.SetTopQuickFrame(top_method);
-    managed_stack_.SetTopQuickFramePc(pc);
+    tlsPtr_.managed_stack.SetTopQuickFrame(top_method);
+    tlsPtr_.managed_stack.SetTopQuickFramePc(pc);
   }
 
   void SetTopOfShadowStack(ShadowFrame* top) {
-    managed_stack_.SetTopShadowFrame(top);
+    tlsPtr_.managed_stack.SetTopShadowFrame(top);
   }
 
   bool HasManagedStack() const {
-    return managed_stack_.GetTopQuickFrame() != NULL || managed_stack_.GetTopShadowFrame() != NULL;
+    return (tlsPtr_.managed_stack.GetTopQuickFrame() != nullptr) ||
+        (tlsPtr_.managed_stack.GetTopShadowFrame() != nullptr);
   }
 
   // If 'msg' is NULL, no detail message is set.
@@ -387,21 +381,65 @@
 
   // JNI methods
   JNIEnvExt* GetJniEnv() const {
-    return jni_env_;
+    return tlsPtr_.jni_env;
   }
 
   // Convert a jobject into a Object*
   mirror::Object* DecodeJObject(jobject obj) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  mirror::Object* GetMonitorEnterObject() const {
+    return tlsPtr_.monitor_enter_object;
+  }
+
+  void SetMonitorEnterObject(mirror::Object* obj) {
+    tlsPtr_.monitor_enter_object = obj;
+  }
+
   // Implements java.lang.Thread.interrupted.
-  bool Interrupted();
+  bool Interrupted() LOCKS_EXCLUDED(wait_mutex_);
   // Implements java.lang.Thread.isInterrupted.
-  bool IsInterrupted();
-  void Interrupt();
-  void Notify();
+  bool IsInterrupted() LOCKS_EXCLUDED(wait_mutex_);
+  bool IsInterruptedLocked() EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_) {
+    return interrupted_;
+  }
+  void Interrupt(Thread* self) LOCKS_EXCLUDED(wait_mutex_);
+  void SetInterruptedLocked(bool i) EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_) {
+    interrupted_ = i;
+  }
+  void Notify() LOCKS_EXCLUDED(wait_mutex_);
+
+ private:
+  void NotifyLocked(Thread* self) EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_);
+
+ public:
+  Mutex* GetWaitMutex() const LOCK_RETURNED(wait_mutex_) {
+    return wait_mutex_;
+  }
+
+  ConditionVariable* GetWaitConditionVariable() const EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_) {
+    return wait_cond_;
+  }
+
+  Monitor* GetWaitMonitor() const EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_) {
+    return wait_monitor_;
+  }
+
+  void SetWaitMonitor(Monitor* mon) EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_) {
+    wait_monitor_ = mon;
+  }
+
+
+  // Waiter linked-list support.
+  Thread* GetWaitNext() const {
+    return tlsPtr_.wait_next;
+  }
+
+  void SetWaitNext(Thread* next) {
+    tlsPtr_.wait_next = next;
+  }
 
   mirror::ClassLoader* GetClassLoaderOverride() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    return class_loader_override_;
+    return tlsPtr_.class_loader_override;
   }
 
   void SetClassLoaderOverride(mirror::ClassLoader* class_loader_override)
@@ -428,41 +466,99 @@
   // Offsets of various members of native Thread class, used by compiled code.
   //
 
-  static ThreadOffset SelfOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, self_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ThinLockIdOffset() {
+    return ThreadOffset<pointer_size>(
+        OFFSETOF_MEMBER(Thread, tls32_) +
+        OFFSETOF_MEMBER(tls_32bit_sized_values, thin_lock_thread_id));
   }
 
-  static ThreadOffset ExceptionOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, exception_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ThreadFlagsOffset() {
+    return ThreadOffset<pointer_size>(
+        OFFSETOF_MEMBER(Thread, tls32_) +
+        OFFSETOF_MEMBER(tls_32bit_sized_values, state_and_flags));
   }
 
-  static ThreadOffset PeerOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, opeer_));
+ private:
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ThreadOffsetFromTlsPtr(size_t tls_ptr_offset) {
+    size_t base = OFFSETOF_MEMBER(Thread, tlsPtr_);
+    size_t scale;
+    size_t shrink;
+    if (pointer_size == sizeof(void*)) {
+      scale = 1;
+      shrink = 1;
+    } else if (pointer_size > sizeof(void*)) {
+      scale = pointer_size / sizeof(void*);
+      shrink = 1;
+    } else {
+      DCHECK_GT(sizeof(void*), pointer_size);
+      scale = 1;
+      shrink = sizeof(void*) / pointer_size;
+    }
+    return ThreadOffset<pointer_size>(base + ((tls_ptr_offset * scale) / shrink));
   }
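  // Worked example of the scale/shrink arithmetic above (illustrative only): it
  // assumes every member of tls_ptr_sized_values is pointer-sized, so a byte offset
  // within it is a slot index times the pointer size. On a 32-bit host
  // (sizeof(void*) == 4) describing a 64-bit target (pointer_size == 8), scale is 2
  // and shrink is 1, so host byte offset 12 (the fourth pointer slot) maps to 24
  // bytes past tlsPtr_. On a 64-bit host describing a 32-bit target, scale is 1 and
  // shrink is 2, halving the offset.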
 
-  static ThreadOffset ThinLockIdOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, thin_lock_thread_id_));
+ public:
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> QuickEntryPointOffset(size_t quick_entrypoint_offset) {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, quick_entrypoints) + quick_entrypoint_offset);
   }
 
-  static ThreadOffset CardTableOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, card_table_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> InterpreterEntryPointOffset(size_t interp_entrypoint_offset) {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, interpreter_entrypoints) + interp_entrypoint_offset);
   }
 
-  static ThreadOffset ThreadFlagsOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, state_and_flags_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> JniEntryPointOffset(size_t jni_entrypoint_offset) {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, jni_entrypoints) + jni_entrypoint_offset);
   }
 
-  static ThreadOffset ThreadSuspendTriggerOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, suspend_trigger_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> PortableEntryPointOffset(size_t port_entrypoint_offset) {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, portable_entrypoints) + port_entrypoint_offset);
+  }
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> SelfOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, self));
+  }
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ExceptionOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, exception));
+  }
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> PeerOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, opeer));
+  }
+
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> CardTableOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, card_table));
+  }
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ThreadSuspendTriggerOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, suspend_trigger));
   }
 
   // Size of stack less any space reserved for stack overflow
   size_t GetStackSize() const {
-    return stack_size_ - (stack_end_ - stack_begin_);
+    return tlsPtr_.stack_size - (tlsPtr_.stack_end - tlsPtr_.stack_begin);
   }
 
   byte* GetStackEnd() const {
-    return stack_end_;
+    return tlsPtr_.stack_end;
   }
 
   // Set the stack end to that to be used during a stack overflow
@@ -475,9 +571,9 @@
     if (implicit_overflow_check) {
       // For implicit checks we also need to add in the protected region above the
       // overflow region.
-      stack_end_ = stack_begin_ + kStackOverflowImplicitCheckSize;
+      tlsPtr_.stack_end = tlsPtr_.stack_begin + kStackOverflowImplicitCheckSize;
     } else {
-      stack_end_ = stack_begin_ + kStackOverflowReservedBytes;
+      tlsPtr_.stack_end = tlsPtr_.stack_begin + kStackOverflowReservedBytes;
     }
   }
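  // Consequence of the assignments above, for illustration: with explicit overflow
  // checks stack_end sits kStackOverflowReservedBytes above stack_begin, so
  // GetStackSize() reports stack_size - kStackOverflowReservedBytes of usable stack;
  // with implicit checks the reserved gap is kStackOverflowImplicitCheckSize instead.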
 
@@ -485,55 +581,65 @@
   void InstallImplicitProtection(bool is_main_stack);
 
   bool IsHandlingStackOverflow() const {
-    return stack_end_ == stack_begin_;
+    return tlsPtr_.stack_end == tlsPtr_.stack_begin;
   }
 
-  static ThreadOffset StackEndOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, stack_end_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> StackEndOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, stack_end));
   }
 
-  static ThreadOffset JniEnvOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, jni_env_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> JniEnvOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, jni_env));
   }
 
-  static ThreadOffset TopOfManagedStackOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, managed_stack_) +
-                        ManagedStack::TopQuickFrameOffset());
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> TopOfManagedStackOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, managed_stack) +
+        ManagedStack::TopQuickFrameOffset());
   }
 
-  static ThreadOffset TopOfManagedStackPcOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, managed_stack_) +
-                        ManagedStack::TopQuickFramePcOffset());
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> TopOfManagedStackPcOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, managed_stack) +
+        ManagedStack::TopQuickFramePcOffset());
   }
 
   const ManagedStack* GetManagedStack() const {
-    return &managed_stack_;
+    return &tlsPtr_.managed_stack;
   }
 
   // Linked list recording fragments of managed stack.
   void PushManagedStackFragment(ManagedStack* fragment) {
-    managed_stack_.PushManagedStackFragment(fragment);
+    tlsPtr_.managed_stack.PushManagedStackFragment(fragment);
   }
   void PopManagedStackFragment(const ManagedStack& fragment) {
-    managed_stack_.PopManagedStackFragment(fragment);
+    tlsPtr_.managed_stack.PopManagedStackFragment(fragment);
   }
 
   ShadowFrame* PushShadowFrame(ShadowFrame* new_top_frame) {
-    return managed_stack_.PushShadowFrame(new_top_frame);
+    return tlsPtr_.managed_stack.PushShadowFrame(new_top_frame);
   }
 
   ShadowFrame* PopShadowFrame() {
-    return managed_stack_.PopShadowFrame();
+    return tlsPtr_.managed_stack.PopShadowFrame();
   }
 
-  static ThreadOffset TopShadowFrameOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, managed_stack_) +
-                        ManagedStack::TopShadowFrameOffset());
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> TopShadowFrameOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, managed_stack) +
+        ManagedStack::TopShadowFrameOffset());
   }
 
   // Number of references allocated in JNI ShadowFrames on this thread.
   size_t NumJniShadowFrameReferences() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    return managed_stack_.NumJniShadowFrameReferences();
+    return tlsPtr_.managed_stack.NumJniShadowFrameReferences();
   }
 
   // Number of references in SIRTs on this thread.
@@ -551,27 +657,28 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void PushSirt(StackIndirectReferenceTable* sirt) {
-    sirt->SetLink(top_sirt_);
-    top_sirt_ = sirt;
+    sirt->SetLink(tlsPtr_.top_sirt);
+    tlsPtr_.top_sirt = sirt;
   }
 
   StackIndirectReferenceTable* PopSirt() {
-    StackIndirectReferenceTable* sirt = top_sirt_;
+    StackIndirectReferenceTable* sirt = tlsPtr_.top_sirt;
     DCHECK(sirt != NULL);
-    top_sirt_ = top_sirt_->GetLink();
+    tlsPtr_.top_sirt = tlsPtr_.top_sirt->GetLink();
     return sirt;
   }
 
-  static ThreadOffset TopSirtOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, top_sirt_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> TopSirtOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, top_sirt));
   }
 
-  DebugInvokeReq* GetInvokeReq() {
-    return debug_invoke_req_;
+  DebugInvokeReq* GetInvokeReq() const {
+    return tlsPtr_.debug_invoke_req;
   }
 
   SingleStepControl* GetSingleStepControl() const {
-    return single_step_control_;
+    return tlsPtr_.single_step_control;
   }
 
   void SetDeoptimizationShadowFrame(ShadowFrame* sf);
@@ -580,41 +687,41 @@
   ShadowFrame* GetAndClearDeoptimizationShadowFrame(JValue* ret_val);
 
   std::deque<instrumentation::InstrumentationStackFrame>* GetInstrumentationStack() {
-    return instrumentation_stack_;
+    return tlsPtr_.instrumentation_stack;
   }
 
   std::vector<mirror::ArtMethod*>* GetStackTraceSample() const {
-    return stack_trace_sample_;
+    return tlsPtr_.stack_trace_sample;
   }
 
   void SetStackTraceSample(std::vector<mirror::ArtMethod*>* sample) {
-    stack_trace_sample_ = sample;
+    tlsPtr_.stack_trace_sample = sample;
   }
 
   uint64_t GetTraceClockBase() const {
-    return trace_clock_base_;
+    return tls64_.trace_clock_base;
   }
 
   void SetTraceClockBase(uint64_t clock_base) {
-    trace_clock_base_ = clock_base;
+    tls64_.trace_clock_base = clock_base;
   }
 
   BaseMutex* GetHeldMutex(LockLevel level) const {
-    return held_mutexes_[level];
+    return tlsPtr_.held_mutexes[level];
   }
 
   void SetHeldMutex(LockLevel level, BaseMutex* mutex) {
-    held_mutexes_[level] = mutex;
+    tlsPtr_.held_mutexes[level] = mutex;
   }
 
   void RunCheckpointFunction();
 
   bool ReadFlag(ThreadFlag flag) const {
-    return (state_and_flags_.as_struct.flags & flag) != 0;
+    return (tls32_.state_and_flags.as_struct.flags & flag) != 0;
   }
 
   bool TestAllFlags() const {
-    return (state_and_flags_.as_struct.flags != 0);
+    return (tls32_.state_and_flags.as_struct.flags != 0);
   }
 
   void AtomicSetFlag(ThreadFlag flag);
@@ -623,11 +730,57 @@
 
   void ResetQuickAllocEntryPointsForThread();
 
- private:
-  // We have no control over the size of 'bool', but want our boolean fields
-  // to be 4-byte quantities.
-  typedef uint32_t bool32_t;
+  // Returns the remaining space in the TLAB.
+  size_t TlabSize() const;
+  // Doesn't check that there is room.
+  mirror::Object* AllocTlab(size_t bytes);
+  void SetTlab(byte* start, byte* end);
+  bool HasTlab() const;
 
+  // Remove the suspend trigger for this thread by making the suspend_trigger_ TLS value
+  // equal to a valid pointer.
+  // TODO: does this need to be atomic? I don't think so.
+  void RemoveSuspendTrigger() {
+    tlsPtr_.suspend_trigger = reinterpret_cast<uintptr_t*>(&tlsPtr_.suspend_trigger);
+  }
+
+  // Trigger a suspend check by making the suspend_trigger_ TLS value an invalid pointer.
+  // The next time a suspend check is done, it will load from the value at this address
+  // and trigger a SIGSEGV.
+  void TriggerSuspend() {
+    tlsPtr_.suspend_trigger = nullptr;
+  }
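  // Sketch of the suspend check that the two methods above arm and disarm
  // (illustrative; the real check is emitted by the compiler and is
  // architecture-specific):
  //   *reinterpret_cast<volatile uintptr_t*>(tlsPtr_.suspend_trigger);  // plain load
  // While suspend_trigger points at itself the load is harmless; once TriggerSuspend()
  // nulls the pointer, the load faults and the resulting SIGSEGV is handled as a
  // suspend request.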
+
+
+  // Push an object onto the allocation stack.
+  bool PushOnThreadLocalAllocationStack(mirror::Object* obj);
+
+  // Set the thread local allocation pointers to the given pointers.
+  void SetThreadLocalAllocationStack(mirror::Object** start, mirror::Object** end);
+
+  // Resets the thread local allocation pointers.
+  void RevokeThreadLocalAllocationStack();
+
+  size_t GetThreadLocalBytesAllocated() const {
+    return tlsPtr_.thread_local_pos - tlsPtr_.thread_local_start;
+  }
+
+  size_t GetThreadLocalObjectsAllocated() const {
+    return tlsPtr_.thread_local_objects;
+  }
+
+  // ROS alloc TLS.
+  static constexpr size_t kRosAllocNumOfSizeBrackets = 34;
+
+  void* GetRosAllocRun(size_t index) const {
+    return tlsPtr_.rosalloc_runs[index];
+  }
+
+  void SetRosAllocRun(size_t index, void* run) {
+    tlsPtr_.rosalloc_runs[index] = run;
+  }
+
+ private:
   explicit Thread(bool daemon);
   ~Thread() LOCKS_EXCLUDED(Locks::mutator_lock_,
                            Locks::thread_suspend_count_lock_);
@@ -644,7 +797,7 @@
   // Dbg::Disconnected.
   ThreadState SetStateUnsafe(ThreadState new_state) {
     ThreadState old_state = GetState();
-    state_and_flags_.as_struct.state = new_state;
+    tls32_.state_and_flags.as_struct.state = new_state;
     return old_state;
   }
 
@@ -678,22 +831,6 @@
   void SetUpAlternateSignalStack();
   void TearDownAlternateSignalStack();
 
-  void NotifyLocked(Thread* self) EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_);
-
-  static void ThreadExitCallback(void* arg);
-
-  // Has Thread::Startup been called?
-  static bool is_started_;
-
-  // TLS key used to retrieve the Thread*.
-  static pthread_key_t pthread_key_self_;
-
-  // Used to notify threads that they should attempt to resume, they will suspend again if
-  // their suspend count is > 0.
-  static ConditionVariable* resume_cond_ GUARDED_BY(Locks::thread_suspend_count_lock_);
-
-  // --- Frequently accessed fields first for short offsets ---
-
   // 32 bits of atomically changed state and flags. Keeping as 32 bits allows an atomic CAS to
   // change from being Suspended to Runnable without a suspend request occurring.
   union PACKED(4) StateAndFlags {
@@ -715,206 +852,225 @@
     // See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=47409
     DISALLOW_COPY_AND_ASSIGN(StateAndFlags);
   };
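  // Example of how the union is used (see the checkpoint-install code in thread.cc):
  // flag bits are set by copying as_int, OR-ing e.g. kCheckpointRequest into the
  // copy's as_struct.flags, and CAS'ing as_int back, so state and flags change
  // together atomically.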
-  union StateAndFlags state_and_flags_;
-  COMPILE_ASSERT(sizeof(union StateAndFlags) == sizeof(int32_t),
-                 sizeof_state_and_flags_and_int32_are_different);
 
-  // A non-zero value is used to tell the current thread to enter a safe point
-  // at the next poll.
-  int suspend_count_ GUARDED_BY(Locks::thread_suspend_count_lock_);
-
-  // The biased card table, see CardTable for details
-  byte* card_table_;
-
-  // The pending exception or NULL.
-  mirror::Throwable* exception_;
-
-  // The end of this thread's stack. This is the lowest safely-addressable address on the stack.
-  // We leave extra space so there's room for the code that throws StackOverflowError.
-  byte* stack_end_;
-
-  // The top of the managed stack often manipulated directly by compiler generated code.
-  ManagedStack managed_stack_;
-
-  // Every thread may have an associated JNI environment
-  JNIEnvExt* jni_env_;
-
-  // Initialized to "this". On certain architectures (such as x86) reading
-  // off of Thread::Current is easy but getting the address of Thread::Current
-  // is hard. This field can be read off of Thread::Current to give the address.
-  Thread* self_;
-
-  // Our managed peer (an instance of java.lang.Thread). The jobject version is used during thread
-  // start up, until the thread is registered and the local opeer_ is used.
-  mirror::Object* opeer_;
-  jobject jpeer_;
-
-  // The "lowest addressable byte" of the stack
-  byte* stack_begin_;
-
-  // Size of the stack
-  size_t stack_size_;
-
-  // Thin lock thread id. This is a small integer used by the thin lock implementation.
-  // This is not to be confused with the native thread's tid, nor is it the value returned
-  // by java.lang.Thread.getId --- this is a distinct value, used only for locking. One
-  // important difference between this id and the ids visible to managed code is that these
-  // ones get reused (to ensure that they fit in the number of bits available).
-  uint32_t thin_lock_thread_id_;
-
-  // Pointer to previous stack trace captured by sampling profiler.
-  std::vector<mirror::ArtMethod*>* stack_trace_sample_;
-
-  // The clock base used for tracing.
-  uint64_t trace_clock_base_;
-
-  // System thread id.
-  pid_t tid_;
-
-  ThrowLocation throw_location_;
-
-  // Guards the 'interrupted_' and 'wait_monitor_' members.
-  mutable Mutex* wait_mutex_ DEFAULT_MUTEX_ACQUIRED_AFTER;
-  // Condition variable waited upon during a wait.
-  ConditionVariable* wait_cond_ GUARDED_BY(wait_mutex_);
-  // Pointer to the monitor lock we're currently waiting on or NULL if not waiting.
-  Monitor* wait_monitor_ GUARDED_BY(wait_mutex_);
-  // Thread "interrupted" status; stays raised until queried or thrown.
-  bool32_t interrupted_ GUARDED_BY(wait_mutex_);
-  // The next thread in the wait set this thread is part of or NULL if not waiting.
-  Thread* wait_next_;
-
-
-  // If we're blocked in MonitorEnter, this is the object we're trying to lock.
-  mirror::Object* monitor_enter_object_;
-
-  // Top of linked list of stack indirect reference tables or NULL for none
-  StackIndirectReferenceTable* top_sirt_;
-
-  Runtime* runtime_;
-
-  RuntimeStats stats_;
-
-  // Needed to get the right ClassLoader in JNI_OnLoad, but also
-  // useful for testing.
-  mirror::ClassLoader* class_loader_override_;
-
-  // Thread local, lazily allocated, long jump context. Used to deliver exceptions.
-  Context* long_jump_context_;
-
-  // A boolean telling us whether we're recursively throwing OOME.
-  bool32_t throwing_OutOfMemoryError_;
-
-  // How much of 'suspend_count_' is by request of the debugger, used to set things right
-  // when the debugger detaches. Must be <= suspend_count_.
-  int debug_suspend_count_ GUARDED_BY(Locks::thread_suspend_count_lock_);
-
-  // JDWP invoke-during-breakpoint support.
-  DebugInvokeReq* debug_invoke_req_;
-
-  // JDWP single-stepping support.
-  SingleStepControl* single_step_control_;
-
-  // Shadow frame that is used temporarily during the deoptimization of a method.
-  ShadowFrame* deoptimization_shadow_frame_;
-  JValue deoptimization_return_value_;
-
-  // Additional stack used by method instrumentation to store method and return pc values.
-  // Stored as a pointer since std::deque is not PACKED.
-  std::deque<instrumentation::InstrumentationStackFrame>* instrumentation_stack_;
-
-  // A cached copy of the java.lang.Thread's name.
-  std::string* name_;
-
-  // Is the thread a daemon?
-  const bool32_t daemon_;
-
-  // A cached pthread_t for the pthread underlying this Thread*.
-  pthread_t pthread_self_;
-
-  // Support for Mutex lock hierarchy bug detection.
-  BaseMutex* held_mutexes_[kLockLevelCount];
-
-  // A positive value implies we're in a region where thread suspension isn't expected.
-  uint32_t no_thread_suspension_;
-
-  // If no_thread_suspension_ is > 0, what is causing that assertion.
-  const char* last_no_thread_suspension_cause_;
+  static void ThreadExitCallback(void* arg);
 
   // Maximum number of checkpoint functions.
   static constexpr uint32_t kMaxCheckpoints = 3;
 
-  // Pending checkpoint function or NULL if non-pending. Installation guarding by
-  // Locks::thread_suspend_count_lock_.
-  Closure* checkpoint_functions_[kMaxCheckpoints];
+  // Has Thread::Startup been called?
+  static bool is_started_;
 
- public:
-  // Entrypoint function pointers
-  // TODO: move this near the top, since changing its offset requires all oats to be recompiled!
-  InterpreterEntryPoints interpreter_entrypoints_;
-  JniEntryPoints jni_entrypoints_;
-  PortableEntryPoints portable_entrypoints_;
-  QuickEntryPoints quick_entrypoints_;
+  // TLS key used to retrieve the Thread*.
+  static pthread_key_t pthread_key_self_;
 
-  // Setting this to 0 will trigger a SEGV and thus a suspend check.  It is normally
-  // set to the address of itself.
-  uintptr_t* suspend_trigger_;
+  // Used to notify threads that they should attempt to resume, they will suspend again if
+  // their suspend count is > 0.
+  static ConditionVariable* resume_cond_ GUARDED_BY(Locks::thread_suspend_count_lock_);
 
-  // How many times has our pthread key's destructor been called?
-  uint32_t thread_exit_check_count_;
+  /***********************************************************************************************/
+  // Thread local storage. Fields are grouped by size to enable 32 <-> 64 searching to account for
+  // pointer size differences. To encourage shorter encoding, more frequently used values appear
+  // first if possible.
+  /***********************************************************************************************/
 
-  // Thread-local allocation pointer.
-  byte* thread_local_start_;
-  byte* thread_local_pos_;
-  byte* thread_local_end_;
-  size_t thread_local_objects_;
-  // Returns the remaining space in the TLAB.
-  size_t TlabSize() const;
-  // Doesn't check that there is room.
-  mirror::Object* AllocTlab(size_t bytes);
-  void SetTlab(byte* start, byte* end);
-  bool HasTlab() const;
+  struct PACKED(4) tls_32bit_sized_values {
+    // We have no control over the size of 'bool', but want our boolean fields
+    // to be 4-byte quantities.
+    typedef uint32_t bool32_t;
 
-  // Remove the suspend trigger for this thread by making the suspend_trigger_ TLS value
-  // equal to a valid pointer.
-  // TODO: does this need to atomic?  I don't think so.
-  void RemoveSuspendTrigger() {
-    suspend_trigger_ = reinterpret_cast<uintptr_t*>(&suspend_trigger_);
-  }
+    explicit tls_32bit_sized_values(bool is_daemon) :
+      suspend_count(0), debug_suspend_count(0), thin_lock_thread_id(0), tid(0),
+      daemon(is_daemon), throwing_OutOfMemoryError(false), no_thread_suspension(0),
+      thread_exit_check_count(0) {
+    }
 
-  // Trigger a suspend check by making the suspend_trigger_ TLS value an invalid pointer.
-  // The next time a suspend check is done, it will load from the value at this address
-  // and trigger a SIGSEGV.
-  void TriggerSuspend() {
-    suspend_trigger_ = nullptr;
-  }
+    union StateAndFlags state_and_flags;
+    COMPILE_ASSERT(sizeof(union StateAndFlags) == sizeof(int32_t),
+                   sizeof_state_and_flags_and_int32_are_different);
 
-  // Thread-local rosalloc runs. There are 34 size brackets in rosalloc
-  // runs (RosAlloc::kNumOfSizeBrackets). We can't refer to the
-  // RosAlloc class due to a header file circular dependency issue.
-  // To compensate, we check that the two values match at RosAlloc
-  // initialization time.
-  static const size_t kRosAllocNumOfSizeBrackets = 34;
-  void* rosalloc_runs_[kRosAllocNumOfSizeBrackets];
+    // A non-zero value is used to tell the current thread to enter a safe point
+    // at the next poll.
+    int suspend_count GUARDED_BY(Locks::thread_suspend_count_lock_);
 
-  // Thread-local allocation stack data/routines.
-  mirror::Object** thread_local_alloc_stack_top_;
-  mirror::Object** thread_local_alloc_stack_end_;
+    // How much of 'suspend_count' is by request of the debugger, used to set things right
+    // when the debugger detaches. Must be <= suspend_count.
+    int debug_suspend_count GUARDED_BY(Locks::thread_suspend_count_lock_);
 
-  // Push an object onto the allocation stack.
-  bool PushOnThreadLocalAllocationStack(mirror::Object* obj);
+    // Thin lock thread id. This is a small integer used by the thin lock implementation.
+    // This is not to be confused with the native thread's tid, nor is it the value returned
+    // by java.lang.Thread.getId --- this is a distinct value, used only for locking. One
+    // important difference between this id and the ids visible to managed code is that these
+    // ones get reused (to ensure that they fit in the number of bits available).
+    uint32_t thin_lock_thread_id;
 
-  // Set the thread local allocation pointers to the given pointers.
-  void SetThreadLocalAllocationStack(mirror::Object** start, mirror::Object** end);
+    // System thread id.
+    uint32_t tid;
 
-  // Resets the thread local allocation pointers.
-  void RevokeThreadLocalAllocationStack();
+    // Is the thread a daemon?
+    const bool32_t daemon;
 
- private:
+    // A boolean telling us whether we're recursively throwing OOME.
+    bool32_t throwing_OutOfMemoryError;
+
+    // A positive value implies we're in a region where thread suspension isn't expected.
+    uint32_t no_thread_suspension;
+
+    // How many times has our pthread key's destructor been called?
+    uint32_t thread_exit_check_count;
+  } tls32_;
+
+  struct PACKED(8) tls_64bit_sized_values {
+    tls_64bit_sized_values() : trace_clock_base(0), deoptimization_return_value() {
+    }
+
+    // The clock base used for tracing.
+    uint64_t trace_clock_base;
+
+    // Return value used by deoptimization.
+    JValue deoptimization_return_value;
+
+    RuntimeStats stats;
+  } tls64_;
+
+  struct PACKED(4) tls_ptr_sized_values {
+      tls_ptr_sized_values() : card_table(nullptr), exception(nullptr), stack_end(nullptr),
+      managed_stack(), suspend_trigger(nullptr), jni_env(nullptr), self(nullptr), opeer(nullptr),
+      jpeer(nullptr), stack_begin(nullptr), stack_size(0), throw_location(),
+      stack_trace_sample(nullptr), wait_next(nullptr), monitor_enter_object(nullptr),
+      top_sirt(nullptr), class_loader_override(nullptr), long_jump_context(nullptr),
+      instrumentation_stack(nullptr), debug_invoke_req(nullptr), single_step_control(nullptr),
+      deoptimization_shadow_frame(nullptr), name(nullptr), pthread_self(0),
+      last_no_thread_suspension_cause(nullptr), thread_local_start(nullptr),
+      thread_local_pos(nullptr), thread_local_end(nullptr), thread_local_objects(0),
+      thread_local_alloc_stack_top(nullptr), thread_local_alloc_stack_end(nullptr) {
+    }
+
+    // The biased card table, see CardTable for details.
+    byte* card_table;
+
+    // The pending exception or NULL.
+    mirror::Throwable* exception;
+
+    // The end of this thread's stack. This is the lowest safely-addressable address on the stack.
+    // We leave extra space so there's room for the code that throws StackOverflowError.
+    byte* stack_end;
+
+    // The top of the managed stack often manipulated directly by compiler generated code.
+    ManagedStack managed_stack;
+
+    // In certain modes, setting this to 0 will trigger a SEGV and thus a suspend check.  It is
+    // normally set to the address of itself.
+    uintptr_t* suspend_trigger;
+
+    // Every thread may have an associated JNI environment
+    JNIEnvExt* jni_env;
+
+    // Initialized to "this". On certain architectures (such as x86) reading off of Thread::Current
+    // is easy but getting the address of Thread::Current is hard. This field can be read off of
+    // Thread::Current to give the address.
+    Thread* self;
+
+    // Our managed peer (an instance of java.lang.Thread). The jobject version is used during thread
+    // start up, until the thread is registered and the local opeer is used.
+    mirror::Object* opeer;
+    jobject jpeer;
+
+    // The "lowest addressable byte" of the stack.
+    byte* stack_begin;
+
+    // Size of the stack.
+    size_t stack_size;
+
+    // The location the current exception was thrown from.
+    ThrowLocation throw_location;
+
+    // Pointer to previous stack trace captured by sampling profiler.
+    std::vector<mirror::ArtMethod*>* stack_trace_sample;
+
+    // The next thread in the wait set this thread is part of or NULL if not waiting.
+    Thread* wait_next;
+
+    // If we're blocked in MonitorEnter, this is the object we're trying to lock.
+    mirror::Object* monitor_enter_object;
+
+    // Top of linked list of stack indirect reference tables or NULL for none.
+    StackIndirectReferenceTable* top_sirt;
+
+    // Needed to get the right ClassLoader in JNI_OnLoad, but also
+    // useful for testing.
+    mirror::ClassLoader* class_loader_override;
+
+    // Thread local, lazily allocated, long jump context. Used to deliver exceptions.
+    Context* long_jump_context;
+
+    // Additional stack used by method instrumentation to store method and return pc values.
+    // Stored as a pointer since std::deque is not PACKED.
+    std::deque<instrumentation::InstrumentationStackFrame>* instrumentation_stack;
+
+    // JDWP invoke-during-breakpoint support.
+    DebugInvokeReq* debug_invoke_req;
+
+    // JDWP single-stepping support.
+    SingleStepControl* single_step_control;
+
+    // Shadow frame stack that is used temporarily during the deoptimization of a method.
+    ShadowFrame* deoptimization_shadow_frame;
+
+    // A cached copy of the java.lang.Thread's name.
+    std::string* name;
+
+    // A cached pthread_t for the pthread underlying this Thread*.
+    pthread_t pthread_self;
+
+    // Support for Mutex lock hierarchy bug detection.
+    BaseMutex* held_mutexes[kLockLevelCount];
+
+    // If no_thread_suspension is > 0, what is causing that assertion.
+    const char* last_no_thread_suspension_cause;
+
+    // Pending checkpoint function or NULL if non-pending. Installation guarded by
+    // Locks::thread_suspend_count_lock_.
+    Closure* checkpoint_functions[kMaxCheckpoints];
+
+    // Entrypoint function pointers.
+    // TODO: move this to more of a global offset table model to avoid per-thread duplication.
+    InterpreterEntryPoints interpreter_entrypoints;
+    JniEntryPoints jni_entrypoints;
+    PortableEntryPoints portable_entrypoints;
+    QuickEntryPoints quick_entrypoints;
+
+    // Thread-local allocation pointer.
+    byte* thread_local_start;
+    byte* thread_local_pos;
+    byte* thread_local_end;
+    size_t thread_local_objects;
+
+    // Thread-local rosalloc runs. There are 34 size brackets in rosalloc
+    // runs (RosAlloc::kNumOfSizeBrackets). We can't refer to the
+    // RosAlloc class due to a header file circular dependency issue.
+    // To compensate, we check that the two values match at RosAlloc
+    // initialization time.
+    void* rosalloc_runs[kRosAllocNumOfSizeBrackets];
+
+    // Thread-local allocation stack data/routines.
+    mirror::Object** thread_local_alloc_stack_top;
+    mirror::Object** thread_local_alloc_stack_end;
+  } tlsPtr_;
+
+  // Guards the 'interrupted_' and 'wait_monitor_' members.
+  Mutex* wait_mutex_ DEFAULT_MUTEX_ACQUIRED_AFTER;
+
+  // Condition variable waited upon during a wait.
+  ConditionVariable* wait_cond_ GUARDED_BY(wait_mutex_);
+  // Pointer to the monitor lock we're currently waiting on or NULL if not waiting.
+  Monitor* wait_monitor_ GUARDED_BY(wait_mutex_);
+
+  // Thread "interrupted" status; stays raised until queried or thrown.
+  bool interrupted_ GUARDED_BY(wait_mutex_);
+
   friend class Dbg;  // For SetStateUnsafe.
   friend class gc::collector::SemiSpace;  // For getting stack traces.
-  friend class Monitor;
-  friend class MonitorInfo;
   friend class Runtime;  // For CreatePeer.
   friend class ScopedThreadStateChange;
   friend class SignalCatcher;  // For SetStateUnsafe.
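
The comment introducing the new TLS layout in thread.h notes that fields are grouped by size so that offsets can be computed for either pointer width. The following standalone sketch (illustrative only; the struct and function names are placeholders, not ART's) shows why fixed-size 32-bit and 64-bit sections make that possible: only the pointer-sized section scales with the target word width, so a host of one pointer size can predict the layout of the other.

// Illustrative sketch of size-grouped TLS sections; not the ART implementation.
#include <cstddef>
#include <cstdint>
#include <cstdio>

namespace sketch {

// Stand-ins for the size-grouped sections; field names are placeholders.
struct Tls32 {
  uint32_t state_and_flags;
  uint32_t suspend_count;
  uint32_t thin_lock_thread_id;
  uint32_t tid;
};
struct Tls64 {
  uint64_t trace_clock_base;
};

// Offset of the Nth pointer-sized slot for a target pointer width of
// target_ptr_size bytes: only the pointer-sized section scales.
constexpr size_t PtrSlotOffset(size_t slot_index, size_t target_ptr_size) {
  return sizeof(Tls32) + sizeof(Tls64) + slot_index * target_ptr_size;
}

}  // namespace sketch

int main() {
  // Slot 1 (say, the exception pointer if the card table occupies slot 0).
  std::printf("offset with 4-byte pointers: %zu\n", sketch::PtrSlotOffset(1, 4));  // 16 + 8 + 4 = 28
  std::printf("offset with 8-byte pointers: %zu\n", sketch::PtrSlotOffset(1, 8));  // 16 + 8 + 8 = 32
  return 0;
}
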
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index ec610e1..0933780 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -63,7 +63,7 @@
 
 bool ThreadList::Contains(pid_t tid) {
   for (const auto& thread : list_) {
-    if (thread->tid_ == tid) {
+    if (thread->GetTid() == tid) {
       return true;
     }
   }
@@ -77,8 +77,8 @@
 void ThreadList::DumpNativeStacks(std::ostream& os) {
   MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
   for (const auto& thread : list_) {
-    os << "DUMPING THREAD " << thread->tid_ << "\n";
-    DumpNativeStack(os, thread->tid_, "\t", true);
+    os << "DUMPING THREAD " << thread->GetTid() << "\n";
+    DumpNativeStack(os, thread->GetTid(), "\t", true);
     os << "\n";
   }
 }
@@ -607,7 +607,7 @@
     // though.
     MutexLock mu(self, *Locks::thread_suspend_count_lock_);
     self->ModifySuspendCount(self, +1, true);
-    CHECK_GT(self->suspend_count_, 0);
+    CHECK_GT(self->GetSuspendCount(), 0);
   }
 
   VLOG(threads) << *self << " self-suspending (debugger)";
@@ -631,18 +631,18 @@
 
   {
     MutexLock mu(self, *Locks::thread_suspend_count_lock_);
-    while (self->suspend_count_ != 0) {
+    while (self->GetSuspendCount() != 0) {
       Thread::resume_cond_->Wait(self);
-      if (self->suspend_count_ != 0) {
+      if (self->GetSuspendCount() != 0) {
         // The condition was signaled but we're still suspended. This
         // can happen if the debugger lets go while a SIGQUIT thread
         // dump event is pending (assuming SignalCatcher was resumed for
         // just long enough to try to grab the thread-suspend lock).
         LOG(DEBUG) << *self << " still suspended after undo "
-                   << "(suspend count=" << self->suspend_count_ << ")";
+                   << "(suspend count=" << self->GetSuspendCount() << ")";
       }
     }
-    CHECK_EQ(self->suspend_count_, 0);
+    CHECK_EQ(self->GetSuspendCount(), 0);
   }
 
   VLOG(threads) << *self << " self-reviving (debugger)";
@@ -661,10 +661,10 @@
     debug_suspend_all_count_ = 0;
     // Update running threads.
     for (const auto& thread : list_) {
-      if (thread == self || thread->debug_suspend_count_ == 0) {
+      if (thread == self || thread->GetDebugSuspendCount() == 0) {
         continue;
       }
-      thread->ModifySuspendCount(self, -thread->debug_suspend_count_, true);
+      thread->ModifySuspendCount(self, -thread->GetDebugSuspendCount(), true);
     }
   }
 
@@ -749,11 +749,15 @@
   // SuspendAll requests.
   MutexLock mu(self, *Locks::thread_list_lock_);
   MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
-  self->suspend_count_ = suspend_all_count_;
-  self->debug_suspend_count_ = debug_suspend_all_count_;
-  if (self->suspend_count_ > 0) {
-    self->AtomicSetFlag(kSuspendRequest);
-    self->TriggerSuspend();
+  CHECK_GE(suspend_all_count_, debug_suspend_all_count_);
+  if (debug_suspend_all_count_ > 0) {
+    self->ModifySuspendCount(self, debug_suspend_all_count_, true);
+  }
+  if (suspend_all_count_ > 0) {
+    int delta = suspend_all_count_ - debug_suspend_all_count_;
+    if (delta > 0) {
+      self->ModifySuspendCount(self, delta, false);
+    }
   }
   CHECK(!Contains(self));
   list_.push_back(self);
@@ -768,7 +772,7 @@
   // suspend and so on, must happen at this point, and not in ~Thread.
   self->Destroy();
 
-  uint32_t thin_lock_id = self->thin_lock_thread_id_;
+  uint32_t thin_lock_id = self->GetThreadId();
   while (self != nullptr) {
     // Remove and delete the Thread* while holding the thread_list_lock_ and
     // thread_suspend_count_lock_ so that the unregistering thread cannot be suspended.
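
In the registration hunk above, a newly attached thread no longer copies the pending suspend counters directly; it applies the debugger share via ModifySuspendCount with the debugger flag set, then the remaining non-debugger delta without it. A toy model of that split (ToyCounts and ApplyPendingSuspends are hypothetical names, assuming the debugger-flagged modification also bumps the debug count): with suspend_all = 3 and debug_suspend_all = 2, the thread ends up with suspend_count 3 and debug_suspend_count 2.

#include <cassert>

// Toy model of the suspend-count split applied when a new thread registers.
struct ToyCounts {
  int suspend_count = 0;
  int debug_suspend_count = 0;
};

// Mirrors the shape of the new registration logic: the debugger share is
// applied as a "for debugger" modification, the remainder as a plain one.
void ApplyPendingSuspends(ToyCounts* t, int suspend_all, int debug_suspend_all) {
  assert(suspend_all >= debug_suspend_all);
  if (debug_suspend_all > 0) {
    t->suspend_count += debug_suspend_all;
    t->debug_suspend_count += debug_suspend_all;  // for_debugger == true
  }
  int delta = suspend_all - debug_suspend_all;
  if (delta > 0) {
    t->suspend_count += delta;                    // for_debugger == false
  }
}

int main() {
  ToyCounts t;
  ApplyPendingSuspends(&t, /*suspend_all=*/3, /*debug_suspend_all=*/2);
  assert(t.suspend_count == 3 && t.debug_suspend_count == 2);
  return 0;
}
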
diff --git a/runtime/throw_location.h b/runtime/throw_location.h
index c171b07..b36eb67 100644
--- a/runtime/throw_location.h
+++ b/runtime/throw_location.h
@@ -41,7 +41,16 @@
                 uint32_t throw_dex_pc) :
       this_object_(throw_this_object),
       method_(throw_method),
-      dex_pc_(throw_dex_pc) {}
+      dex_pc_(throw_dex_pc)
+#ifdef __LP64__
+      , pad_(0)
+#endif
+
+  {
+#ifdef __LP64__
+    UNUSED(pad_);
+#endif
+  }
 
   mirror::Object* GetThis() const {
     return this_object_;
@@ -72,6 +81,10 @@
   mirror::ArtMethod* method_;
   // The instruction within the throwing method.
   uint32_t dex_pc_;
+  // Ensure 8-byte alignment on 64-bit.
+#ifdef __LP64__
+  uint32_t pad_;
+#endif
 };
 
 }  // namespace art
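
The throw_location.h change pads ThrowLocation so its size stays a multiple of 8 bytes under LP64, where the two object pointers widen to 8 bytes each. A minimal, hypothetical check of that layout invariant (ThrowLocationShape is a stand-in, not the real class):

#include <cstdint>

// Stand-in mirroring the padded layout; the real class holds mirror pointers,
// which is why its size matters on 64-bit targets.
struct ThrowLocationShape {
  void* this_object;  // mirror::Object* in the real class
  void* method;       // mirror::ArtMethod* in the real class
  uint32_t dex_pc;
#ifdef __LP64__
  uint32_t pad;       // keeps sizeof a multiple of 8 on LP64
#endif
};

#ifdef __LP64__
static_assert(sizeof(ThrowLocationShape) % 8 == 0,
              "padded ThrowLocationShape size should be a multiple of 8 on LP64");
#endif

int main() {
  return 0;
}
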