Use MADV_FREE to reclaim pages of freed regions

In order to release memory consumed by freed regions back to the kernel we
use madvise(MADV_DONTNEED) syscall, which synchronously takes away all
the pages of the given memory range. As a side-effect, this also
provides us clean (zeroed-out) pages the next time the region is used by
an application thread as a TLAB. The downside is overhead of
MADV_DONTNEED (it has to manipulate all the corresponding page-table
entries). Furthermore, the application thread gets page fault each
time it moves from one page to the next while consuming the TLAB.

MADV_FREE is another madvise option that also advises the kernel to take
away the pages, but only when there is memory pressure. It is a much
more light-weight operation than MADV_DONTNEED. Also, the application
thread doesn't get the page fault overhead, if the page is not already
reclaimed. However, a page which is not reclaimed by the kernel will
have stale data on it.

This change uses MADV_FREE for region-space's reclamation and ensures
that pages about to be used for new allocations are properly cleared.

Bug: 74447417
Bug: 140130889
Test: Golem, mpts/greenday tests
Change-Id: I1e4a75abed51844d5062685bf77871f609af5a65
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index d769543..6922e7f 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -3112,11 +3112,7 @@
 
 static void b13564922() {
 #if defined(__linux__) && defined(__arm__)
-  int major, minor;
-  struct utsname uts;
-  if (uname(&uts) != -1 &&
-      sscanf(uts.release, "%d.%d", &major, &minor) == 2 &&
-      ((major < 3) || ((major == 3) && (minor < 4)))) {
+  if (KernelVersionLower(3, 4)) {
     // Kernels before 3.4 don't handle the ASLR well and we can run out of address
     // space (http://b/13564922). Work around the issue by inhibiting further mmap() randomization.
     int old_personality = personality(0xffffffff);
diff --git a/libartbase/base/membarrier.cc b/libartbase/base/membarrier.cc
index 48f47df..d925049 100644
--- a/libartbase/base/membarrier.cc
+++ b/libartbase/base/membarrier.cc
@@ -21,7 +21,6 @@
 
 #if !defined(_WIN32)
 #include <sys/syscall.h>
-#include <sys/utsname.h>
 #include <unistd.h>
 #endif
 #include "macros.h"
@@ -29,6 +28,7 @@
 #if defined(__BIONIC__)
 
 #include <atomic>
+#include <base/utils.h>
 #include <linux/membarrier.h>
 
 #define CHECK_MEMBARRIER_CMD(art_value, membarrier_value) \
@@ -49,14 +49,7 @@
 
 int membarrier(MembarrierCommand command) {
   // Check kernel version supports membarrier(2).
-  static constexpr int kRequiredMajor = 4;
-  static constexpr int kRequiredMinor = 16;
-  struct utsname uts;
-  int major, minor;
-  if (uname(&uts) != 0 ||
-      strcmp(uts.sysname, "Linux") != 0 ||
-      sscanf(uts.release, "%d.%d", &major, &minor) != 2 ||
-      (major < kRequiredMajor || (major == kRequiredMajor && minor < kRequiredMinor))) {
+  if (KernelVersionLower(4, 16)) {
     errno = ENOSYS;
     return -1;
   }
diff --git a/libartbase/base/memfd.cc b/libartbase/base/memfd.cc
index 8512a3a..d031fb6 100644
--- a/libartbase/base/memfd.cc
+++ b/libartbase/base/memfd.cc
@@ -21,7 +21,6 @@
 #if !defined(_WIN32)
 #include <fcntl.h>
 #include <sys/syscall.h>
-#include <sys/utsname.h>
 #include <unistd.h>
 #endif
 #if defined(__BIONIC__)
@@ -30,6 +29,7 @@
 
 #include <android-base/logging.h>
 #include <android-base/unique_fd.h>
+#include <base/utils.h>
 
 #include "macros.h"
 
@@ -51,14 +51,7 @@
 int memfd_create(const char* name, unsigned int flags) {
   // Check kernel version supports memfd_create(). Some older kernels segfault executing
   // memfd_create() rather than returning ENOSYS (b/116769556).
-  static constexpr int kRequiredMajor = 3;
-  static constexpr int kRequiredMinor = 17;
-  struct utsname uts;
-  int major, minor;
-  if (uname(&uts) != 0 ||
-      strcmp(uts.sysname, "Linux") != 0 ||
-      sscanf(uts.release, "%d.%d", &major, &minor) != 2 ||
-      (major < kRequiredMajor || (major == kRequiredMajor && minor < kRequiredMinor))) {
+  if (KernelVersionLower(3, 17)) {
     errno = ENOSYS;
     return -1;
   }
diff --git a/libartbase/base/utils.cc b/libartbase/base/utils.cc
index 19311b3..4e8d306 100644
--- a/libartbase/base/utils.cc
+++ b/libartbase/base/utils.cc
@@ -156,6 +156,33 @@
 
 #endif
 
+// On non-linux builds assume that the kernel version is lower than required.
+#if defined(__linux__)
+std::pair<int, int> GetKernelVersion() {
+  struct utsname uts;
+  int major, minor;
+  CHECK_EQ(uname(&uts), 0);
+  CHECK_EQ(strcmp(uts.sysname, "Linux"), 0);
+  CHECK_EQ(sscanf(uts.release, "%d.%d", &major, &minor), 2);
+  return std::pair(major, minor);
+}
+
+bool KernelVersionLower(int required_major, int required_minor) {
+  // static (major, minor) pair as it never changes during runtime.
+  static std::pair<int, int> kernel_version = GetKernelVersion();
+  if (kernel_version.first < required_major
+      || (kernel_version.first == required_major && kernel_version.second < required_minor)) {
+    return true;
+  } else {
+    return false;
+  }
+}
+#else
+bool KernelVersionLower(int required_major ATTRIBUTE_UNUSED, int required_minor ATTRIBUTE_UNUSED) {
+  return true;
+}
+#endif
+
 bool CacheOperationsMaySegFault() {
 #if defined(__linux__) && defined(__aarch64__)
   // Avoid issue on older ARM64 kernels where data cache operations could be classified as writes
@@ -165,14 +192,7 @@
   //
   // This behaviour means we should avoid the dual view JIT on the device. This is just
   // an issue when running tests on devices that have an old kernel.
-  static constexpr int kRequiredMajor = 3;
-  static constexpr int kRequiredMinor = 12;
-  struct utsname uts;
-  int major, minor;
-  if (uname(&uts) != 0 ||
-      strcmp(uts.sysname, "Linux") != 0 ||
-      sscanf(uts.release, "%d.%d", &major, &minor) != 2 ||
-      (major < kRequiredMajor || (major == kRequiredMajor && minor < kRequiredMinor))) {
+  if (KernelVersionLower(3, 12)) {
     return true;
   }
 #endif
diff --git a/libartbase/base/utils.h b/libartbase/base/utils.h
index 4bcb915..1fe465c 100644
--- a/libartbase/base/utils.h
+++ b/libartbase/base/utils.h
@@ -42,6 +42,9 @@
 // Returns a human-readable size string such as "1MB".
 std::string PrettySize(int64_t size_in_bytes);
 
+// Returns true if the kernel's version (based on uname) is lower than required.
+bool KernelVersionLower(int required_major, int required_minor);
+
 // Splits a string using the given separator character into a vector of
 // strings. Empty strings will be omitted.
 void Split(const std::string& s, char separator, std::vector<std::string>* result);
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index f989360..f18fb53 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -4187,6 +4187,11 @@
       return nullptr;
     }
     *bytes_tl_bulk_allocated = expand_bytes;
+    // Zero the TLAB pages as we MADV_FREE the regions in CC, which doesn't
+    // guarantee clean pages.
+    if (allocator_type == kAllocatorTypeRegionTLAB) {
+      region_space_->ZeroAllocRange(self->GetTlabEnd(), expand_bytes);
+    }
     self->ExpandTlab(expand_bytes);
     DCHECK_LE(alloc_size, self->TlabSize());
   } else if (allocator_type == kAllocatorTypeTLAB) {
diff --git a/runtime/gc/space/region_space-inl.h b/runtime/gc/space/region_space-inl.h
index 901568e..65cd111 100644
--- a/runtime/gc/space/region_space-inl.h
+++ b/runtime/gc/space/region_space-inl.h
@@ -56,26 +56,26 @@
   mirror::Object* obj;
   if (LIKELY(num_bytes <= kRegionSize)) {
     // Non-large object.
-    obj = (kForEvac ? evac_region_ : current_region_)->Alloc(num_bytes,
-                                                             bytes_allocated,
-                                                             usable_size,
-                                                             bytes_tl_bulk_allocated);
+    obj = (kForEvac ? evac_region_ : current_region_)->Alloc<kForEvac>(num_bytes,
+                                                                       bytes_allocated,
+                                                                       usable_size,
+                                                                       bytes_tl_bulk_allocated);
     if (LIKELY(obj != nullptr)) {
       return obj;
     }
     MutexLock mu(Thread::Current(), region_lock_);
     // Retry with current region since another thread may have updated
     // current_region_ or evac_region_.  TODO: fix race.
-    obj = (kForEvac ? evac_region_ : current_region_)->Alloc(num_bytes,
-                                                             bytes_allocated,
-                                                             usable_size,
-                                                             bytes_tl_bulk_allocated);
+    obj = (kForEvac ? evac_region_ : current_region_)->Alloc<kForEvac>(num_bytes,
+                                                                       bytes_allocated,
+                                                                       usable_size,
+                                                                       bytes_tl_bulk_allocated);
     if (LIKELY(obj != nullptr)) {
       return obj;
     }
     Region* r = AllocateRegion(kForEvac);
     if (LIKELY(r != nullptr)) {
-      obj = r->Alloc(num_bytes, bytes_allocated, usable_size, bytes_tl_bulk_allocated);
+      obj = r->Alloc<kForEvac>(num_bytes, bytes_allocated, usable_size, bytes_tl_bulk_allocated);
       CHECK(obj != nullptr);
       // Do our allocation before setting the region, this makes sure no threads race ahead
       // and fill in the region before we allocate the object. b/63153464
@@ -96,6 +96,14 @@
   return nullptr;
 }
 
+inline void RegionSpace::ZeroAllocRange(uint8_t* start, size_t length) {
+  if (gPurgeAdvice == MADV_DONTNEED) {
+    return;
+  }
+  std::fill(start, start + length, 0);
+}
+
+template <bool kForEvac>
 inline mirror::Object* RegionSpace::Region::Alloc(size_t num_bytes,
                                                   /* out */ size_t* bytes_allocated,
                                                   /* out */ size_t* usable_size,
@@ -120,6 +128,10 @@
     *usable_size = num_bytes;
   }
   *bytes_tl_bulk_allocated = num_bytes;
+  // We don't need to clean allocations for evacuation as we memcpy in that case.
+  if (!kForEvac) {
+    ZeroAllocRange(old_top, num_bytes);
+  }
   return reinterpret_cast<mirror::Object*>(old_top);
 }
 
@@ -283,6 +295,14 @@
         reinterpret_cast<uintptr_t>(top),
         visitor);
   } else {
+    // When using MADV_FREE instead of MADV_DONTNEED for releasing regions' pages
+    // in ClearFromSpace(), we may have non-zero pages beyond r->Top().
+    // This can happen only with regions which are TLABs. Therefore, we can
+    // fetch the right pos from thread-local TLAB values.
+    if (r->is_a_tlab_) {
+      DCHECK(r->thread_ != nullptr);
+      top = r->thread_->GetTlabPos();
+    }
     while (pos < top) {
       mirror::Object* obj = reinterpret_cast<mirror::Object*>(pos);
       if (obj->GetClass<kDefaultVerifyFlags, kWithoutReadBarrier>() != nullptr) {
@@ -371,8 +391,13 @@
                                          usable_size,
                                          bytes_tl_bulk_allocated);
   }
-  if (kForEvac && region != nullptr) {
-    TraceHeapSize();
+  if (kForEvac) {
+    if (region != nullptr) {
+      TraceHeapSize();
+    }
+  } else {
+    // We don't need to clean allocations for evacuation as we memcpy in that case.
+    ZeroAllocRange(reinterpret_cast<uint8_t*>(region), *bytes_tl_bulk_allocated);
   }
   return region;
 }
diff --git a/runtime/gc/space/region_space.cc b/runtime/gc/space/region_space.cc
index faeeec0..7b3a8f7 100644
--- a/runtime/gc/space/region_space.cc
+++ b/runtime/gc/space/region_space.cc
@@ -24,6 +24,10 @@
 #include "mirror/object-inl.h"
 #include "thread_list.h"
 
+#if defined(__linux__)
+#include <sys/utsname.h>
+#endif
+
 namespace art {
 namespace gc {
 namespace space {
@@ -47,6 +51,12 @@
 // Whether we check a region's live bytes count against the region bitmap.
 static constexpr bool kCheckLiveBytesAgainstRegionBitmap = kIsDebugBuild;
 
+#ifndef MADV_FREE
+const int RegionSpace::gPurgeAdvice = MADV_DONTNEED;
+#else
+const int RegionSpace::gPurgeAdvice = KernelVersionLower(4, 12) ? MADV_DONTNEED : MADV_FREE;
+#endif
+
 MemMap RegionSpace::CreateMemMap(const std::string& name,
                                  size_t capacity,
                                  uint8_t* requested_begin) {
@@ -141,7 +151,7 @@
   DCHECK(!full_region_.IsFree());
   DCHECK(full_region_.IsAllocated());
   size_t ignored;
-  DCHECK(full_region_.Alloc(kAlignment, &ignored, nullptr, &ignored) == nullptr);
+  DCHECK(full_region_.Alloc</*kForEvac*/true>(kAlignment, &ignored, nullptr, &ignored) == nullptr);
   // Protect the whole region space from the start.
   Protect();
 }
@@ -497,10 +507,9 @@
       madvise_list.push_back(std::pair(clear_block_begin, clear_block_end));
     }
   }
-
   // Madvise the memory ranges.
   for (const auto &iter : madvise_list) {
-    ZeroAndProtectRegion(iter.first, iter.second);
+    PurgePages(iter.first, iter.second - iter.first);
     if (clear_bitmap) {
       GetLiveBitmap()->ClearRange(
           reinterpret_cast<mirror::Object*>(iter.first),
@@ -630,6 +639,19 @@
   num_evac_regions_ = 0;
 }
 
+void RegionSpace::PurgePages(void* address, size_t length) {
+  DCHECK(IsAligned<kPageSize>(address));
+  if (length == 0) {
+    return;
+  }
+#ifdef _WIN32
+  // PurgePages does not madvise on Windows.
+#else
+  CHECK_EQ(madvise(address, length, gPurgeAdvice), 0)
+      << "madvise failed: " << strerror(errno);
+#endif
+}
+
 void RegionSpace::CheckLiveBytesAgainstRegionBitmap(Region* r) {
   if (r->LiveBytes() == static_cast<size_t>(-1)) {
     // Live bytes count is undefined for `r`; nothing to check here.
@@ -867,6 +889,9 @@
   if (r != nullptr) {
     uint8_t* start = pos != nullptr ? pos : r->Begin();
     DCHECK_ALIGNED(start, kObjectAlignment);
+    // If we are allocating a partially utilized TLAB, then the tlab is already
+    // clean from [pos, r->Top()).
+    ZeroAllocRange(pos != nullptr ? r->Top() : r->Begin(), *bytes_tl_bulk_allocated);
     r->is_a_tlab_ = true;
     r->thread_ = self;
     r->SetTop(r->End());
diff --git a/runtime/gc/space/region_space.h b/runtime/gc/space/region_space.h
index f74abfb..d515412 100644
--- a/runtime/gc/space/region_space.h
+++ b/runtime/gc/space/region_space.h
@@ -45,6 +45,11 @@
 
 // A space that consists of equal-sized regions.
 class RegionSpace final : public ContinuousMemMapAllocSpace {
+ private:
+  // Constant used to mark the non-zero pages before madvise(MADV_FREE) is applied to them.
+  static constexpr uint8_t kMadvFreeMagic = 0xdf;
+  static const int gPurgeAdvice;  // advice to madvise for reclaiming pages.
+
  public:
   typedef void(*WalkCallback)(void *start, void *end, size_t num_bytes, void* callback_arg);
 
@@ -366,6 +371,8 @@
     }
   }
 
+  ALWAYS_INLINE static void ZeroAllocRange(uint8_t* start, size_t length);
+
   // Increment object allocation count for region containing ref.
   void RecordAlloc(mirror::Object* ref) REQUIRES(!region_lock_);
 
@@ -426,6 +433,7 @@
 
     void Clear(bool zero_and_release_pages);
 
+    template <bool kForEvac>
     ALWAYS_INLINE mirror::Object* Alloc(size_t num_bytes,
                                         /* out */ size_t* bytes_allocated,
                                         /* out */ size_t* usable_size,
@@ -645,6 +653,7 @@
     return RefToRegionLocked(ref);
   }
 
+  void PurgePages(void* address, size_t length);
   void TraceHeapSize() REQUIRES(region_lock_);
 
   Region* RefToRegionUnlocked(mirror::Object* ref) NO_THREAD_SAFETY_ANALYSIS {