JIT code cache instruction pipeline flushing

Restores instruction pipeline flushing on all cores, following crashes
on ARMv7 with dual JIT code page mappings. We were inadvertently
toggling permissions on a non-executable page rather than on an
executable one.
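
A minimal sketch of the restored mechanism (names are illustrative,
not part of the patch): toggling the protection of a page that really
is mapped executable forces a TLB-shootdown IPI on every core, and
that interrupt flushes each core's instruction pipeline.

    #include <sys/mman.h>
    #include <cstddef>
    #include <cstdint>

    // Sketch only: mirrors the FlushInstructionPipelines() idea.
    static void FlushPipelinesViaPermissionToggle(uint8_t* exec_page,
                                                  size_t page_size) {
      // Make the executable page writable as well.
      mprotect(exec_page, page_size, PROT_READ | PROT_WRITE | PROT_EXEC);
      // Touch the page so it is resident and the protection change
      // actually requires a TLB update on every core.
      exec_page[0] = 0;
      // Restore read+execute protection, triggering the IPI.
      mprotect(exec_page, page_size, PROT_READ | PROT_EXEC);
    }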

Removes the data cache flush for roots data and replaces it with a
sequentially consistent barrier.
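
The barrier used in the patch is
QuasiAtomic::ThreadFenceSequentiallyConsistent(); a rough standalone
equivalent in standard C++ (illustrative names only) looks like this:

    #include <atomic>
    #include <cstdint>

    // Writer fills the root table, then fences before publishing the
    // pointer; a reader that pairs this with an acquire fence or load
    // and observes the pointer also observes the table entries, with
    // no explicit data cache flush.
    void PublishRoots(uint32_t* roots_data,
                      uint32_t root,
                      std::atomic<uint32_t*>& published) {
      roots_data[0] = root;  // fill root table
      std::atomic_thread_fence(std::memory_order_seq_cst);
      published.store(roots_data, std::memory_order_relaxed);
    }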

Fixes MemMap::RemapAtEnd() when all pages are given out. To satisfy the
invariants checked in the destructor, the base pointer must be set to
nullptr when this happens.
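
Sketched with hypothetical names (MapSketch rather than the real MemMap
fields), the fix amounts to treating a fully handed-out map as empty:

    #include <sys/mman.h>
    #include <cstddef>

    struct MapSketch {
      void* base_begin_ = nullptr;
      size_t base_size_ = 0u;

      // Called when RemapAtEnd() hands out the tail of the mapping.
      void Shrink(size_t new_base_size) {
        base_size_ = new_base_size;
        if (base_size_ == 0u) {
          // Nothing left to unmap; clear the pointer so the destructor
          // treats the map as empty instead of calling munmap(ptr, 0),
          // which fails.
          base_begin_ = nullptr;
        }
      }

      ~MapSketch() {
        if (base_begin_ != nullptr) {
          munmap(base_begin_, base_size_);
        }
      }
    };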

Bug: 63833411
Bug: 62332932
Test: art/test.py --target
Change-Id: I705cf5a3c80e78c4e912ea3d2c3c4aa89dee26bb
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index b76a0df..f4b67b2 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -1210,14 +1210,14 @@
   uint8_t* stack_map_data = nullptr;
   uint8_t* method_info_data = nullptr;
   uint8_t* roots_data = nullptr;
-  uint32_t data_size = code_cache->ReserveData(self,
-                                               stack_map_size,
-                                               method_info_size,
-                                               number_of_roots,
-                                               method,
-                                               &stack_map_data,
-                                               &method_info_data,
-                                               &roots_data);
+  code_cache->ReserveData(self,
+                          stack_map_size,
+                          method_info_size,
+                          number_of_roots,
+                          method,
+                          &stack_map_data,
+                          &method_info_data,
+                          &roots_data);
   if (stack_map_data == nullptr || roots_data == nullptr) {
     return false;
   }
@@ -1238,7 +1238,6 @@
       codegen->GetFpuSpillMask(),
       code_allocator.GetMemory().data(),
       code_allocator.GetSize(),
-      data_size,
       osr,
       roots,
       codegen->GetGraph()->HasShouldDeoptimizeFlag(),
diff --git a/runtime/jit/jit_code_cache.cc b/runtime/jit/jit_code_cache.cc
index 27501b9..653fb87 100644
--- a/runtime/jit/jit_code_cache.cc
+++ b/runtime/jit/jit_code_cache.cc
@@ -121,6 +121,10 @@
     return nullptr;
   }
 
+  // Align both capacities to page size, as that's the unit mspaces use.
+  initial_capacity = RoundDown(initial_capacity, 2 * kPageSize);
+  max_capacity = RoundDown(max_capacity, 2 * kPageSize);
+
   std::string error_str;
   // Map name specific for android_os_Debug.cpp accounting.
   // Map in low 4gb to simplify accessing root tables for x86_64.
@@ -142,22 +146,21 @@
     return nullptr;
   }
 
-  // Align both capacities to page size, as that's the unit mspaces use.
-  initial_capacity = RoundDown(initial_capacity, 2 * kPageSize);
-  max_capacity = RoundDown(max_capacity, 2 * kPageSize);
-
   // Create a region for JIT data and executable code. This will be
   // laid out as:
   //
   //          +----------------+ --------------------
-  //          :                : ^                  ^
+  //          | code_sync_map_ | ^ code_sync_size   ^
+  //          |                | v                  |
+  //          +----------------+ --                 |
+  //          :                : ^                  |
   //          :  post_code_map : | post_code_size   |
   //          :   [padding]    : v                  |
   //          +----------------+ -                  |
   //          |                | ^                  |
-  //          |   code_map     | | code_size        |
+  //          |   code_map     | | code_size        | total_mapping_size
   //          |   [JIT Code]   | v                  |
-  //          +----------------+ -                  | total_mapping_size
+  //          +----------------+ -                  |
   //          :                : ^                  |
   //          :  pre_code_map  : | pre_code_size    |
   //          :   [padding]    : v                  |
@@ -167,17 +170,23 @@
   //          |   [Jit Data]   | v                  v
   //          +----------------+ --------------------
   //
+  // The code_sync_map_ contains a page that we use to flush CPU
+  // instruction pipelines (see FlushInstructionPipelines()).
+  //
   // The padding regions - pre_code_map and post_code_map - exist to
   // put some random distance between the writable JIT code mapping
   // and the executable mapping. The padding is discarded at the end
   // of this function.
-  size_t total_mapping_size = kMaxMapSpacingPages * kPageSize;
-  size_t data_size = RoundUp((max_capacity - total_mapping_size) / 2, kPageSize);
+  //
+  size_t data_size = (max_capacity - kMaxMapSpacingPages * kPageSize) / 2;
   size_t pre_code_size =
-      GetRandomNumber(kMinMapSpacingPages, kMaxMapSpacingPages) * kPageSize;
-  size_t code_size = max_capacity - total_mapping_size - data_size;
-  size_t post_code_size = total_mapping_size - pre_code_size;
-  DCHECK_EQ(code_size + data_size + total_mapping_size, max_capacity);
+      GetRandomNumber(kMinMapSpacingPages, kMaxMapSpacingPages - 1) * kPageSize;
+  size_t code_size = max_capacity - data_size - kMaxMapSpacingPages * kPageSize;
+  size_t code_sync_size = kPageSize;
+  size_t post_code_size = kMaxMapSpacingPages * kPageSize - pre_code_size - code_sync_size;
+  DCHECK_EQ(data_size, code_size);
+  DCHECK_EQ(pre_code_size + post_code_size + code_sync_size, kMaxMapSpacingPages * kPageSize);
+  DCHECK_EQ(data_size + pre_code_size + code_size + post_code_size + code_sync_size, max_capacity);
 
   // Create pre-code padding region after data region, discarded after
   // code and data regions are set-up.
@@ -191,7 +200,7 @@
     return nullptr;
   }
   DCHECK_EQ(data_map->Size(), data_size);
-  DCHECK_EQ(pre_code_map->Size(), pre_code_size + code_size + post_code_size);
+  DCHECK_EQ(pre_code_map->Size(), pre_code_size + code_size + post_code_size + code_sync_size);
 
   // Create code region.
   unique_fd writable_code_fd;
@@ -206,7 +215,7 @@
     return nullptr;
   }
   DCHECK_EQ(pre_code_map->Size(), pre_code_size);
-  DCHECK_EQ(code_map->Size(), code_size + post_code_size);
+  DCHECK_EQ(code_map->Size(), code_size + post_code_size + code_sync_size);
 
   // Padding after code region, discarded after code and data regions
   // are set-up.
@@ -220,7 +229,19 @@
     return nullptr;
   }
   DCHECK_EQ(code_map->Size(), code_size);
+  DCHECK_EQ(post_code_map->Size(), post_code_size + code_sync_size);
+
+  std::unique_ptr<MemMap> code_sync_map(SplitMemMap(post_code_map.get(),
+                                                    "jit-code-sync",
+                                                    post_code_size,
+                                                    kProtCode,
+                                                    error_msg,
+                                                    use_ashmem));
+  if (code_sync_map == nullptr) {
+    return nullptr;
+  }
   DCHECK_EQ(post_code_map->Size(), post_code_size);
+  DCHECK_EQ(code_sync_map->Size(), code_sync_size);
 
   std::unique_ptr<MemMap> writable_code_map;
   if (use_two_mappings) {
@@ -246,6 +267,7 @@
   return new JitCodeCache(writable_code_map.release(),
                           code_map.release(),
                           data_map.release(),
+                          code_sync_map.release(),
                           code_size,
                           data_size,
                           max_capacity,
@@ -255,6 +277,7 @@
 JitCodeCache::JitCodeCache(MemMap* writable_code_map,
                            MemMap* executable_code_map,
                            MemMap* data_map,
+                           MemMap* code_sync_map,
                            size_t initial_code_capacity,
                            size_t initial_data_capacity,
                            size_t max_capacity,
@@ -265,6 +288,7 @@
       data_map_(data_map),
       executable_code_map_(executable_code_map),
       writable_code_map_(writable_code_map),
+      code_sync_map_(code_sync_map),
       max_capacity_(max_capacity),
       current_capacity_(initial_code_capacity + initial_data_capacity),
       code_end_(initial_code_capacity),
@@ -382,7 +406,7 @@
 
 class ScopedCodeCacheWrite : ScopedTrace {
  public:
-  explicit ScopedCodeCacheWrite(JitCodeCache* code_cache, bool only_for_tlb_shootdown = false)
+  explicit ScopedCodeCacheWrite(JitCodeCache* code_cache)
       : ScopedTrace("ScopedCodeCacheWrite") {
     ScopedTrace trace("mprotect all");
     int prot_to_start_writing = kProtAll;
@@ -398,7 +422,7 @@
     writable_map_ = code_cache->GetWritableMemMap();
     // If we're using ScopedCacheWrite only for TLB shootdown, we limit the scope of mprotect to
     // one page.
-    size_ = only_for_tlb_shootdown ? kPageSize : writable_map_->Size();
+    size_ = writable_map_->Size();
     CHECKED_MPROTECT(writable_map_->Begin(), size_, prot_to_start_writing);
   }
   ~ScopedCodeCacheWrite() {
@@ -424,7 +448,6 @@
                                   size_t fp_spill_mask,
                                   const uint8_t* code,
                                   size_t code_size,
-                                  size_t data_size,
                                   bool osr,
                                   Handle<mirror::ObjectArray<mirror::Object>> roots,
                                   bool has_should_deoptimize_flag,
@@ -439,7 +462,6 @@
                                        fp_spill_mask,
                                        code,
                                        code_size,
-                                       data_size,
                                        osr,
                                        roots,
                                        has_should_deoptimize_flag,
@@ -457,7 +479,6 @@
                                 fp_spill_mask,
                                 code,
                                 code_size,
-                                data_size,
                                 osr,
                                 roots,
                                 has_should_deoptimize_flag,
@@ -744,6 +765,20 @@
   method->SetCounter(std::min(jit_warmup_threshold - 1, 1));
 }
 
+static void FlushInstructionPipelines(uint8_t* sync_page) {
+  // After updating the JIT code cache we need to force all CPUs to
+  // flush their instruction pipelines. In the absence of a system
+  // call to do this explicitly, we achieve it indirectly by toggling
+  // permissions on an executable page. This sends an IPI to each
+  // core to update its TLB entry; the interrupt raised on each core
+  // causes that core's instruction pipeline to be flushed.
+  CHECKED_MPROTECT(sync_page, kPageSize, kProtAll);
+  // Touch the sync_page to ensure it is resident; otherwise the
+  // permission toggle may not require a TLB update on any core.
+  sync_page[0] = 0;
+  CHECKED_MPROTECT(sync_page, kPageSize, kProtCode);
+}
+
 #ifdef __aarch64__
 
 static void FlushJitCodeCacheRange(uint8_t* code_ptr,
@@ -835,7 +870,6 @@
                                           size_t fp_spill_mask,
                                           const uint8_t* code,
                                           size_t code_size,
-                                          size_t data_size,
                                           bool osr,
                                           Handle<mirror::ObjectArray<mirror::Object>> roots,
                                           bool has_should_deoptimize_flag,
@@ -878,6 +912,7 @@
           code_size);
 
       FlushJitCodeCacheRange(code_ptr, writable_ptr, code_size);
+      FlushInstructionPipelines(code_sync_map_->Begin());
 
       DCHECK(!Runtime::Current()->IsAotCompiler());
       if (has_should_deoptimize_flag) {
@@ -927,13 +962,10 @@
     DCHECK_EQ(FromStackMapToRoots(stack_map), roots_data);
     DCHECK_LE(roots_data, stack_map);
     FillRootTable(roots_data, roots);
-    {
-      // Flush data cache, as compiled code references literals in it.
-      // We also need a TLB shootdown to act as memory barrier across cores.
-      ScopedCodeCacheWrite ccw(this, /* only_for_tlb_shootdown */ true);
-      FlushDataCache(reinterpret_cast<char*>(roots_data),
-                     reinterpret_cast<char*>(roots_data + data_size));
-    }
+
+    // Ensure the updates to the root table are visible with a store fence.
+    QuasiAtomic::ThreadFenceSequentiallyConsistent();
+
     method_code_map_.Put(code_ptr, method);
     if (osr) {
       number_of_osr_compilations_++;
diff --git a/runtime/jit/jit_code_cache.h b/runtime/jit/jit_code_cache.h
index a062ce4..175501f 100644
--- a/runtime/jit/jit_code_cache.h
+++ b/runtime/jit/jit_code_cache.h
@@ -113,7 +113,6 @@
                       size_t fp_spill_mask,
                       const uint8_t* code,
                       size_t code_size,
-                      size_t data_size,
                       bool osr,
                       Handle<mirror::ObjectArray<mirror::Object>> roots,
                       bool has_should_deoptimize_flag,
@@ -255,6 +254,7 @@
   JitCodeCache(MemMap* code_map,
                MemMap* data_map,
                MemMap* writable_code_map,
+               MemMap* code_sync_map,
                size_t initial_code_capacity,
                size_t initial_data_capacity,
                size_t max_capacity,
@@ -272,7 +272,6 @@
                               size_t fp_spill_mask,
                               const uint8_t* code,
                               size_t code_size,
-                              size_t data_size,
                               bool osr,
                               Handle<mirror::ObjectArray<mirror::Object>> roots,
                               bool has_should_deoptimize_flag,
@@ -383,6 +382,9 @@
   std::unique_ptr<MemMap> executable_code_map_;
   // Mem map which holds a non-executable view of code for JIT.
   std::unique_ptr<MemMap> writable_code_map_;
+  // Mem map which holds one executable page that we use for flushing instruction
+  // fetch buffers. The code on this page is never executed.
+  std::unique_ptr<MemMap> code_sync_map_;
   // The opaque mspace for allocating code.
   void* code_mspace_ GUARDED_BY(lock_);
   // The opaque mspace for allocating data.
diff --git a/runtime/mem_map.cc b/runtime/mem_map.cc
index 17035dd..1ef72ba 100644
--- a/runtime/mem_map.cc
+++ b/runtime/mem_map.cc
@@ -497,7 +497,7 @@
     MEMORY_TOOL_MAKE_UNDEFINED(base_begin_, base_size_);
     int result = munmap(base_begin_, base_size_);
     if (result == -1) {
-      PLOG(FATAL) << "munmap failed";
+      PLOG(FATAL) << "munmap failed: " << BaseBegin() << "..." << BaseEnd();
     }
   }
 
@@ -561,6 +561,12 @@
   size_ = new_end - reinterpret_cast<uint8_t*>(begin_);
   base_size_ = new_base_end - reinterpret_cast<uint8_t*>(base_begin_);
   DCHECK_LE(begin_ + size_, reinterpret_cast<uint8_t*>(base_begin_) + base_size_);
+  if (base_size_ == 0u) {
+    // All pages in this MemMap have been handed out. Invalidate the
+    // base pointer to prevent the destructor from calling munmap() on
+    // a zero-length region (which cannot succeed).
+    base_begin_ = nullptr;
+  }
   size_t tail_size = old_end - new_end;
   uint8_t* tail_base_begin = new_base_end;
   size_t tail_base_size = old_base_end - new_base_end;