Add assembler for riscv64, part 6.

Add macro instructions for loads and stores with arbitrary
32-bit offsets. Add macro instructions for adding arbitrary
32-bit and 64-bit values.

Add macro instructions for loading floating point literals.
Rename macro instructions for loading integer literals.
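
For illustration only (not part of this change), a hypothetical caller holding
a Riscv64Assembler could use the new macros roughly as follows; `asm_` and the
register choices are assumptions for this sketch:

  asm_.Loadd(A0, A1, 0x12345);           // Offset exceeds 12 bits; TMP is used to form the address.
  asm_.Storew(A2, SP, 0x1000);           // Large-offset store; the value register must not be TMP.
  asm_.AddConst64(A3, A4, 0x123456789);  // Large addend: materialized via TMP, then added.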

Test: m test-art-host-gtest
Bug: 283082089
Change-Id: I8c7f4c63b07b90b2745028ba684207d622ef8c22
diff --git a/compiler/utils/riscv64/assembler_riscv64.cc b/compiler/utils/riscv64/assembler_riscv64.cc
index b8c077d..98d2489 100644
--- a/compiler/utils/riscv64/assembler_riscv64.cc
+++ b/compiler/utils/riscv64/assembler_riscv64.cc
@@ -943,6 +943,85 @@
   Csrrci(Zero, csr, uimm5);
 }
 
+void Riscv64Assembler::Loadb(XRegister rd, XRegister rs1, int32_t offset) {
+  AdjustBaseAndOffset(rs1, offset);
+  Lb(rd, rs1, offset);
+}
+
+void Riscv64Assembler::Loadh(XRegister rd, XRegister rs1, int32_t offset) {
+  AdjustBaseAndOffset(rs1, offset);
+  Lh(rd, rs1, offset);
+}
+
+void Riscv64Assembler::Loadw(XRegister rd, XRegister rs1, int32_t offset) {
+  AdjustBaseAndOffset(rs1, offset);
+  Lw(rd, rs1, offset);
+}
+
+void Riscv64Assembler::Loadd(XRegister rd, XRegister rs1, int32_t offset) {
+  AdjustBaseAndOffset(rs1, offset);
+  Ld(rd, rs1, offset);
+}
+
+void Riscv64Assembler::Loadbu(XRegister rd, XRegister rs1, int32_t offset) {
+  AdjustBaseAndOffset(rs1, offset);
+  Lbu(rd, rs1, offset);
+}
+
+void Riscv64Assembler::Loadhu(XRegister rd, XRegister rs1, int32_t offset) {
+  AdjustBaseAndOffset(rs1, offset);
+  Lhu(rd, rs1, offset);
+}
+
+void Riscv64Assembler::Loadwu(XRegister rd, XRegister rs1, int32_t offset) {
+  AdjustBaseAndOffset(rs1, offset);
+  Lwu(rd, rs1, offset);
+}
+
+void Riscv64Assembler::Storeb(XRegister rs2, XRegister rs1, int32_t offset) {
+  CHECK_NE(rs2, TMP);
+  AdjustBaseAndOffset(rs1, offset);
+  Sb(rs2, rs1, offset);
+}
+
+void Riscv64Assembler::Storeh(XRegister rs2, XRegister rs1, int32_t offset) {
+  CHECK_NE(rs2, TMP);
+  AdjustBaseAndOffset(rs1, offset);
+  Sh(rs2, rs1, offset);
+}
+
+void Riscv64Assembler::Storew(XRegister rs2, XRegister rs1, int32_t offset) {
+  CHECK_NE(rs2, TMP);
+  AdjustBaseAndOffset(rs1, offset);
+  Sw(rs2, rs1, offset);
+}
+
+void Riscv64Assembler::Stored(XRegister rs2, XRegister rs1, int32_t offset) {
+  CHECK_NE(rs2, TMP);
+  AdjustBaseAndOffset(rs1, offset);
+  Sd(rs2, rs1, offset);
+}
+
+void Riscv64Assembler::FLoadw(FRegister rd, XRegister rs1, int32_t offset) {
+  AdjustBaseAndOffset(rs1, offset);
+  FLw(rd, rs1, offset);
+}
+
+void Riscv64Assembler::FLoadd(FRegister rd, XRegister rs1, int32_t offset) {
+  AdjustBaseAndOffset(rs1, offset);
+  FLd(rd, rs1, offset);
+}
+
+void Riscv64Assembler::FStorew(FRegister rs2, XRegister rs1, int32_t offset) {
+  AdjustBaseAndOffset(rs1, offset);
+  FSw(rs2, rs1, offset);
+}
+
+void Riscv64Assembler::FStored(FRegister rs2, XRegister rs1, int32_t offset) {
+  AdjustBaseAndOffset(rs1, offset);
+  FSd(rs2, rs1, offset);
+}
+
 void Riscv64Assembler::LoadConst32(XRegister rd, int32_t value) {
   LoadImmediate(rd, value, /*can_use_tmp=*/ false);  // No need to use TMP for 32-bit values.
 }
@@ -952,6 +1031,57 @@
   LoadImmediate(rd, value, /*can_use_tmp=*/ true);
 }
 
+template <typename ValueType, typename Addi, typename AddLarge>
+void AddConstImpl(XRegister rd,
+                  XRegister rs1,
+                  ValueType value,
+                  Addi&& addi,
+                  AddLarge&& add_large) {
+  CHECK_NE(rs1, TMP);
+  if (IsInt<12>(value)) {
+    addi(rd, rs1, value);
+    return;
+  }
+
+  constexpr int32_t kPositiveValueSimpleAdjustment = 0x7ff;
+  constexpr int32_t kHighestValueForSimpleAdjustment = 2 * kPositiveValueSimpleAdjustment;
+  constexpr int32_t kNegativeValueSimpleAdjustment = -0x800;
+  constexpr int32_t kLowestValueForSimpleAdjustment = 2 * kNegativeValueSimpleAdjustment;
+
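+  // Values just outside the 12-bit range, up to `kHighestValueForSimpleAdjustment` (0xffe)
+  // or down to `kLowestValueForSimpleAdjustment` (-0x1000), can be handled with two `addi`
+  // steps: first the largest positive (or most negative) 12-bit immediate, then the remainder.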
+  if (value >= 0 && value <= kHighestValueForSimpleAdjustment) {
+    addi(rd, rs1, kPositiveValueSimpleAdjustment);
+    addi(rd, rd, value - kPositiveValueSimpleAdjustment);
+  } else if (value < 0 && value >= kLowestValueForSimpleAdjustment) {
+    addi(rd, rs1, kNegativeValueSimpleAdjustment);
+    addi(rd, rd, value - kNegativeValueSimpleAdjustment);
+  } else {
+    add_large(rd, rs1, value);
+  }
+}
+
+void Riscv64Assembler::AddConst32(XRegister rd, XRegister rs1, int32_t value) {
+  auto addiw = [&](XRegister rd, XRegister rs1, int32_t value) { Addiw(rd, rs1, value); };
+  auto add_large = [&](XRegister rd, XRegister rs1, int32_t value) {
+    LoadConst32(TMP, value);
+    Addw(rd, rs1, TMP);
+  };
+  AddConstImpl(rd, rs1, value, addiw, add_large);
+}
+
+void Riscv64Assembler::AddConst64(XRegister rd, XRegister rs1, int64_t value) {
+  auto addi = [&](XRegister rd, XRegister rs1, int32_t value) { Addi(rd, rs1, value); };
+  auto add_large = [&](XRegister rd, XRegister rs1, int64_t value) {
+    // We cannot load `TMP` with `LoadConst64()` because it may clobber `TMP` itself, so use `Li()`.
+    // TODO(riscv64): Refactor `LoadImmediate()` so that we can reuse the code that detects
+    // when the code path using `TMP` is beneficial, and use that path with a small
+    // modification: instead of adding the two parts together, add them individually
+    // to the input `rs1`. (This works as long as `rd` is not `TMP`.)
+    Li(TMP, value);
+    Add(rd, rs1, TMP);
+  };
+  AddConstImpl(rd, rs1, value, addi, add_large);
+}
+
 void Riscv64Assembler::Beqz(XRegister rs, Riscv64Label* label, bool is_bare) {
   Beq(rs, Zero, label, is_bare);
 }
@@ -1028,21 +1158,31 @@
   Jal(RA, label, is_bare);
 }
 
-void Riscv64Assembler::Lw(XRegister rd, Literal* literal) {
+void Riscv64Assembler::Loadw(XRegister rd, Literal* literal) {
   DCHECK_EQ(literal->GetSize(), 4u);
   LoadLiteral(literal, rd, Branch::kLiteral);
 }
 
-void Riscv64Assembler::Lwu(XRegister rd, Literal* literal) {
+void Riscv64Assembler::Loadwu(XRegister rd, Literal* literal) {
   DCHECK_EQ(literal->GetSize(), 4u);
   LoadLiteral(literal, rd, Branch::kLiteralUnsigned);
 }
 
-void Riscv64Assembler::Ld(XRegister rd, Literal* literal) {
+void Riscv64Assembler::Loadd(XRegister rd, Literal* literal) {
   DCHECK_EQ(literal->GetSize(), 8u);
   LoadLiteral(literal, rd, Branch::kLiteralLong);
 }
 
+void Riscv64Assembler::FLoadw(FRegister rd, Literal* literal) {
+  DCHECK_EQ(literal->GetSize(), 4u);
+  LoadLiteral(literal, rd, Branch::kLiteralFloat);
+}
+
+void Riscv64Assembler::FLoadd(FRegister rd, Literal* literal) {
+  DCHECK_EQ(literal->GetSize(), 8u);
+  LoadLiteral(literal, rd, Branch::kLiteralDouble);
+}
+
 /////////////////////////////// RV64 MACRO Instructions END ///////////////////////////////
 
 const Riscv64Assembler::Branch::BranchInfo Riscv64Assembler::Branch::branch_info_[] = {
@@ -1070,6 +1210,8 @@
     {8, 0, Riscv64Assembler::Branch::kOffset32},  // kLiteral
     {8, 0, Riscv64Assembler::Branch::kOffset32},  // kLiteralUnsigned
     {8, 0, Riscv64Assembler::Branch::kOffset32},  // kLiteralLong
+    {8, 0, Riscv64Assembler::Branch::kOffset32},  // kLiteralFloat
+    {8, 0, Riscv64Assembler::Branch::kOffset32},  // kLiteralDouble
 };
 
 void Riscv64Assembler::Branch::InitShortOrLong(Riscv64Assembler::Branch::OffsetBits offset_size,
@@ -1123,6 +1265,8 @@
     case kLiteral:
     case kLiteralUnsigned:
     case kLiteralLong:
+    case kLiteralFloat:
+    case kLiteralDouble:
       CHECK(!IsResolved());
       type_ = initial_type;
       break;
@@ -1168,6 +1312,7 @@
       target_(target),
       lhs_reg_(rd),
       rhs_reg_(Zero),
+      freg_(kNoFRegister),
       condition_(kUncond) {
   InitializeType(
       (rd != Zero ? (is_bare ? kBareCall : kCall) : (is_bare ? kBareUncondBranch : kUncondBranch)));
@@ -1184,6 +1329,7 @@
       target_(target),
       lhs_reg_(lhs_reg),
       rhs_reg_(rhs_reg),
+      freg_(kNoFRegister),
       condition_(condition) {
   DCHECK_NE(condition, kUncond);
   DCHECK(!IsNop(condition, lhs_reg, rhs_reg));
@@ -1200,11 +1346,26 @@
       target_(target),
       lhs_reg_(rd),
       rhs_reg_(Zero),
+      freg_(kNoFRegister),
       condition_(kUncond) {
   CHECK_NE(rd , Zero);
   InitializeType(label_or_literal_type);
 }
 
+Riscv64Assembler::Branch::Branch(uint32_t location,
+                                 uint32_t target,
+                                 FRegister rd,
+                                 Type literal_type)
+    : old_location_(location),
+      location_(location),
+      target_(target),
+      lhs_reg_(Zero),
+      rhs_reg_(Zero),
+      freg_(rd),
+      condition_(kUncond) {
+  InitializeType(literal_type);
+}
+
 Riscv64Assembler::BranchCondition Riscv64Assembler::Branch::OppositeCondition(
     Riscv64Assembler::BranchCondition cond) {
   switch (cond) {
@@ -1244,6 +1405,8 @@
 
 XRegister Riscv64Assembler::Branch::GetRightRegister() const { return rhs_reg_; }
 
+FRegister Riscv64Assembler::Branch::GetFRegister() const { return freg_; }
+
 uint32_t Riscv64Assembler::Branch::GetTarget() const { return target_; }
 
 uint32_t Riscv64Assembler::Branch::GetLocation() const { return location_; }
@@ -1467,6 +1630,14 @@
     case Branch::kLiteralLong:
       emit_auipc_and_next(lhs, [&](int32_t short_offset) { Ld(lhs, lhs, short_offset); });
       break;
+    case Branch::kLiteralFloat:
+      emit_auipc_and_next(
+          TMP, [&](int32_t short_offset) { FLw(branch->GetFRegister(), TMP, short_offset); });
+      break;
+    case Branch::kLiteralDouble:
+      emit_auipc_and_next(
+          TMP, [&](int32_t short_offset) { FLd(branch->GetFRegister(), TMP, short_offset); });
+      break;
   }
   CHECK_EQ(overwrite_location_, branch->GetEndLocation());
   CHECK_LE(branch->GetLength(), static_cast<uint32_t>(Branch::kMaxBranchLength));
@@ -1527,7 +1698,10 @@
   FinalizeLabeledBranch(label);
 }
 
-void Riscv64Assembler::LoadLiteral(Literal* literal, XRegister rd, Branch::Type literal_type) {
+template <typename XRegisterOrFRegister>
+void Riscv64Assembler::LoadLiteral(Literal* literal,
+                                   XRegisterOrFRegister rd,
+                                   Branch::Type literal_type) {
   Riscv64Label* label = literal->GetLabel();
   DCHECK(!label->IsBound());
   branches_.emplace_back(buffer_.Size(), Branch::kUnresolved, rd, literal_type);
@@ -1820,6 +1994,51 @@
   }
 }
 
+// Adjust the base register and offset for a load/store when the offset does not fit
+// into a 12-bit signed integer. The adjusted address is formed in `TMP`.
+void Riscv64Assembler::AdjustBaseAndOffset(XRegister& base, int32_t& offset) {
+  CHECK_NE(base, TMP);  // `TMP` is reserved for the adjustment even if it's not needed.
+  if (IsInt<12>(offset)) {
+    return;
+  }
+
+  constexpr int32_t kPositiveOffsetMaxSimpleAdjustment = 0x7ff;
+  constexpr int32_t kHighestOffsetForSimpleAdjustment = 2 * kPositiveOffsetMaxSimpleAdjustment;
+  constexpr int32_t kPositiveOffsetSimpleAdjustmentAligned8 =
+      RoundDown(kPositiveOffsetMaxSimpleAdjustment, 8);
+  constexpr int32_t kPositiveOffsetSimpleAdjustmentAligned4 =
+      RoundDown(kPositiveOffsetMaxSimpleAdjustment, 4);
+  constexpr int32_t kNegativeOffsetSimpleAdjustment = -0x800;
+  constexpr int32_t kLowestOffsetForSimpleAdjustment = 2 * kNegativeOffsetSimpleAdjustment;
+
+  if (offset >= 0 && offset <= kHighestOffsetForSimpleAdjustment) {
+    // Make the adjustment 8-byte aligned (0x7f8) except for offsets that cannot be reached
+    // with this adjustment, then try 4-byte alignment, then just half of the offset.
+    int32_t adjustment = IsInt<12>(offset - kPositiveOffsetSimpleAdjustmentAligned8)
+        ? kPositiveOffsetSimpleAdjustmentAligned8
+        : IsInt<12>(offset - kPositiveOffsetSimpleAdjustmentAligned4)
+            ? kPositiveOffsetSimpleAdjustmentAligned4
+            : offset / 2;
+    DCHECK(IsInt<12>(adjustment));
+    Addi(TMP, base, adjustment);
+    offset -= adjustment;
+  } else if (offset < 0 && offset >= kLowestOffsetForSimpleAdjustment) {
+    Addi(TMP, base, kNegativeOffsetSimpleAdjustment);
+    offset -= kNegativeOffsetSimpleAdjustment;
+  } else if (offset >= 0x7ffff800) {
+    // Support large offsets outside the range handled by `SplitOffset()`.
+    LoadConst32(TMP, offset);
+    Add(TMP, TMP, base);
+    offset = 0;
+  } else {
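+    // Split the offset into a 20-bit part for `Lui` (rounded to compensate for the
+    // sign-extension of the low 12 bits) and a small remainder used as the new offset.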
+    auto [imm20, short_offset] = SplitOffset(offset);
+    Lui(TMP, imm20);
+    Add(TMP, TMP, base);
+    offset = short_offset;
+  }
+  base = TMP;
+}
+
 void Riscv64Assembler::LoadImmediate(XRegister rd, int64_t imm, bool can_use_tmp) {
   DCHECK_IMPLIES(can_use_tmp, rd != TMP);
 
diff --git a/compiler/utils/riscv64/assembler_riscv64.h b/compiler/utils/riscv64/assembler_riscv64.h
index 70fb5ae..09bbef0 100644
--- a/compiler/utils/riscv64/assembler_riscv64.h
+++ b/compiler/utils/riscv64/assembler_riscv64.h
@@ -506,10 +506,31 @@
   void Csrsi(uint32_t csr, uint32_t uimm5);
   void Csrci(uint32_t csr, uint32_t uimm5);
 
+  // Load/store macros for arbitrary 32-bit offsets.
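+  // These macros may clobber `TMP` to form the address, so the base register `rs1` must not
+  // be `TMP` and, for stores, neither may the source register `rs2`.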
+  void Loadb(XRegister rd, XRegister rs1, int32_t offset);
+  void Loadh(XRegister rd, XRegister rs1, int32_t offset);
+  void Loadw(XRegister rd, XRegister rs1, int32_t offset);
+  void Loadd(XRegister rd, XRegister rs1, int32_t offset);
+  void Loadbu(XRegister rd, XRegister rs1, int32_t offset);
+  void Loadhu(XRegister rd, XRegister rs1, int32_t offset);
+  void Loadwu(XRegister rd, XRegister rs1, int32_t offset);
+  void Storeb(XRegister rs2, XRegister rs1, int32_t offset);
+  void Storeh(XRegister rs2, XRegister rs1, int32_t offset);
+  void Storew(XRegister rs2, XRegister rs1, int32_t offset);
+  void Stored(XRegister rs2, XRegister rs1, int32_t offset);
+  void FLoadw(FRegister rd, XRegister rs1, int32_t offset);
+  void FLoadd(FRegister rd, XRegister rs1, int32_t offset);
+  void FStorew(FRegister rs2, XRegister rs1, int32_t offset);
+  void FStored(FRegister rs2, XRegister rs1, int32_t offset);
+
   // Macros for loading constants.
   void LoadConst32(XRegister rd, int32_t value);
   void LoadConst64(XRegister rd, int64_t value);
 
+  // Macros for adding constants.
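+  // The source register `rs1` must not be `TMP`; values that cannot be added with one or two
+  // ADDI/ADDIW instructions are materialized in `TMP` first.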
+  void AddConst32(XRegister rd, XRegister rs1, int32_t value);
+  void AddConst64(XRegister rd, XRegister rs1, int64_t value);
+
   // Jumps and branches to a label.
   void Beqz(XRegister rs, Riscv64Label* label, bool is_bare = false);
   void Bnez(XRegister rs, Riscv64Label* label, bool is_bare = false);
@@ -532,9 +553,11 @@
   void Jal(Riscv64Label* label, bool is_bare = false);
 
   // Literal load.
-  void Lw(XRegister rd, Literal* literal);
-  void Lwu(XRegister rd, Literal* literal);
-  void Ld(XRegister rd, Literal* literal);
+  void Loadw(XRegister rd, Literal* literal);
+  void Loadwu(XRegister rd, Literal* literal);
+  void Loadd(XRegister rd, Literal* literal);
+  void FLoadw(FRegister rd, Literal* literal);
+  void FLoadd(FRegister rd, Literal* literal);
 
   /////////////////////////////// RV64 MACRO Instructions END ///////////////////////////////
 
@@ -567,9 +590,6 @@
   JumpTable* CreateJumpTable(ArenaVector<Riscv64Label*>&& labels);
 
  public:
-  // Emit data (e.g. encoded instruction or immediate) to the instruction stream.
-  void Emit(uint32_t value);
-
   // Emit slow paths queued during assembly and promote short branches to long if needed.
   void FinalizeCode() override;
 
@@ -637,7 +657,8 @@
       kLiteral,
       kLiteralUnsigned,
       kLiteralLong,
-      // TODO(riscv64): Add FP literals.
+      kLiteralFloat,
+      kLiteralDouble,
     };
 
     // Bit sizes of offsets defined as enums to minimize chance of typos.
@@ -671,8 +692,9 @@
            XRegister lhs_reg,
            XRegister rhs_reg,
            bool is_bare);
-    // Label address (in literal area) or literal.
+    // Label address or literal.
     Branch(uint32_t location, uint32_t target, XRegister rd, Type label_or_literal_type);
+    Branch(uint32_t location, uint32_t target, FRegister rd, Type literal_type);
 
     // Some conditional branches with lhs = rhs are effectively NOPs, while some
     // others are effectively unconditional.
@@ -685,6 +707,7 @@
     BranchCondition GetCondition() const;
     XRegister GetLeftRegister() const;
     XRegister GetRightRegister() const;
+    FRegister GetFRegister() const;
     uint32_t GetTarget() const;
     uint32_t GetLocation() const;
     uint32_t GetOldLocation() const;
@@ -735,6 +758,7 @@
     XRegister lhs_reg_;          // Left-hand side register in conditional branches or
                                  // destination register in calls or literals.
     XRegister rhs_reg_;          // Right-hand side register in conditional branches.
+    FRegister freg_;             // Destination register in FP literals.
     BranchCondition condition_;  // Condition for conditional branches.
 
     Type type_;      // Current type of the branch.
@@ -756,7 +780,8 @@
              XRegister lhs,
              XRegister rhs);
   void Buncond(Riscv64Label* label, XRegister rd, bool is_bare);
-  void LoadLiteral(Literal* literal, XRegister rd, Branch::Type literal_type);
+  template <typename XRegisterOrFRegister>
+  void LoadLiteral(Literal* literal, XRegisterOrFRegister rd, Branch::Type literal_type);
 
   Branch* GetBranch(uint32_t branch_id);
   const Branch* GetBranch(uint32_t branch_id) const;
@@ -765,6 +790,12 @@
   void PromoteBranches();
   void PatchCFI();
 
+  // Emit data (e.g. encoded instruction or immediate) to the instruction stream.
+  void Emit(uint32_t value);
+
+  // Adjust the base register and offset, if needed, for a load/store with a large offset.
+  void AdjustBaseAndOffset(XRegister& base, int32_t& offset);
+
   // Implementation helper for `Li()`, `LoadConst32()` and `LoadConst64()`.
   void LoadImmediate(XRegister rd, int64_t imm, bool can_use_tmp);
 
diff --git a/compiler/utils/riscv64/assembler_riscv64_test.cc b/compiler/utils/riscv64/assembler_riscv64_test.cc
index 8b6e25f..b48b168 100644
--- a/compiler/utils/riscv64/assembler_riscv64_test.cc
+++ b/compiler/utils/riscv64/assembler_riscv64_test.cc
@@ -626,6 +626,176 @@
     DriverStr(expected, test_name);
   }
 
+  template <typename EmitOp>
+  void TestAddConst(const std::string& test_name,
+                    size_t bits,
+                    const std::string& suffix,
+                    EmitOp&& emit_op) {
+    int64_t kImm12s[] = {
+        0, 1, 2, 0xff, 0x100, 0x1ff, 0x200, 0x3ff, 0x400, 0x7ff,
+        -1, -2, -0x100, -0x101, -0x200, -0x201, -0x400, -0x401, -0x800,
+    };
+    int64_t kSimplePositiveValues[] = {
+        0x800, 0x801, 0xbff, 0xc00, 0xff0, 0xff7, 0xff8, 0xffb, 0xffc, 0xffd, 0xffe,
+    };
+    int64_t kSimpleNegativeValues[] = {
+        -0x801, -0x802, -0xbff, -0xc00, -0xff0, -0xff8, -0xffc, -0xffe, -0xfff, -0x1000,
+    };
+    std::vector<int64_t> large_values = CreateImmediateValuesBits(bits, /*as_uint=*/ false);
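+    // Drop 13-bit values; the small-value paths are exercised by the lists above.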
+    auto kept_end = std::remove_if(large_values.begin(),
+                                   large_values.end(),
+                                   [](int64_t value) { return IsInt<13>(value); });
+    large_values.erase(kept_end, large_values.end());
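+    // 0xfff fits in 13 bits but is not reachable with two `addi`s (0xfff - 0x7ff = 0x800 does
+    // not fit in a 12-bit immediate), so add it back to exercise the large-value path.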
+    large_values.push_back(0xfff);
+
+    std::string tmp_name = GetRegisterName(TMP);
+
+    std::string expected;
+    for (XRegister* rd : GetRegisters()) {
+      std::string rd_name = GetRegisterName(*rd);
+      std::string addi_rd = "addi" + suffix + " " + rd_name + ", ";
+      std::string add_rd = "add" + suffix + " " + rd_name + ", ";
+      for (XRegister* rs1 : GetRegisters()) {
+        // TMP can be the destination register but not the source register.
+        if (*rs1 == TMP) {
+          continue;
+        }
+        std::string rs1_name = GetRegisterName(*rs1);
+
+        for (int64_t imm : kImm12s) {
+          emit_op(*rd, *rs1, imm);
+          expected += addi_rd + rs1_name + ", " + std::to_string(imm) + "\n";
+        }
+
+        auto emit_simple_ops = [&](ArrayRef<const int64_t> imms, int64_t adjustment) {
+          for (int64_t imm : imms) {
+            emit_op(*rd, *rs1, imm);
+            expected += addi_rd + rs1_name + ", " + std::to_string(adjustment) + "\n" +
+                        addi_rd + rd_name + ", " + std::to_string(imm - adjustment) + "\n";
+          }
+        };
+        emit_simple_ops(ArrayRef<const int64_t>(kSimplePositiveValues), 0x7ff);
+        emit_simple_ops(ArrayRef<const int64_t>(kSimpleNegativeValues), -0x800);
+
+        for (int64_t imm : large_values) {
+          emit_op(*rd, *rs1, imm);
+          expected += "li " + tmp_name + ", " + std::to_string(imm) + "\n" +
+                      add_rd + rs1_name + ", " + tmp_name + "\n";
+        }
+      }
+    }
+    DriverStr(expected, test_name);
+  }
+
+  template <typename EmitOp>
+  std::string RepeatLoadStoreArbitraryOffset(const std::string& head, EmitOp&& emit_op) {
+    int64_t kImm12s[] = {
+        0, 1, 2, 0xff, 0x100, 0x1ff, 0x200, 0x3ff, 0x400, 0x7ff,
+        -1, -2, -0x100, -0x101, -0x200, -0x201, -0x400, -0x401, -0x800,
+    };
+    int64_t kSimplePositiveOffsetsAlign8[] = {
+        0x800, 0x801, 0xbff, 0xc00, 0xff0, 0xff4, 0xff6, 0xff7
+    };
+    int64_t kSimplePositiveOffsetsAlign4[] = {
+        0xff8, 0xff9, 0xffa, 0xffb
+    };
+    int64_t kSimplePositiveOffsetsAlign2[] = {
+        0xffc, 0xffd
+    };
+    int64_t kSimplePositiveOffsetsNoAlign[] = {
+        0xffe
+    };
+    int64_t kSimpleNegativeOffsets[] = {
+        -0x801, -0x802, -0xbff, -0xc00, -0xff0, -0xff8, -0xffc, -0xffe, -0xfff, -0x1000,
+    };
+    int64_t kSplitOffsets[] = {
+        0xfff, 0x1000, 0x1001, 0x17ff, 0x1800, 0x1fff, 0x2000, 0x2001, 0x27ff, 0x2800,
+        0x7fffe7ff, 0x7fffe800, 0x7fffefff, 0x7ffff000, 0x7ffff001, 0x7ffff7ff,
+        -0x1001, -0x1002, -0x17ff, -0x1800, -0x1801, -0x2000, -0x2001, -0x2800, -0x2801,
+        -0x7ffff000, -0x7ffff001, -0x7ffff800, -0x7ffff801, -0x7fffffff, -0x80000000,
+    };
+    int64_t kSpecialOffsets[] = {
+        0x7ffff800, 0x7ffff801, 0x7ffffffe, 0x7fffffff
+    };
+
+    std::string tmp_name = GetRegisterName(TMP);
+    std::string expected;
+    for (XRegister* rs1 : GetRegisters()) {
+      if (*rs1 == TMP) {
+        continue;  // TMP cannot be the address base register.
+      }
+      std::string rs1_name = GetRegisterName(*rs1);
+
+      for (int64_t imm : kImm12s) {
+        emit_op(*rs1, imm);
+        expected += head + ", " + std::to_string(imm) + "(" + rs1_name + ")" + "\n";
+      }
+
+      auto emit_simple_ops = [&](ArrayRef<const int64_t> imms, int64_t adjustment) {
+        for (int64_t imm : imms) {
+          emit_op(*rs1, imm);
+          expected +=
+              "addi " + tmp_name + ", " + rs1_name + ", " + std::to_string(adjustment) + "\n" +
+              head + ", " + std::to_string(imm - adjustment) + "(" + tmp_name + ")" + "\n";
+        }
+      };
+      emit_simple_ops(ArrayRef<const int64_t>(kSimplePositiveOffsetsAlign8), 0x7f8);
+      emit_simple_ops(ArrayRef<const int64_t>(kSimplePositiveOffsetsAlign4), 0x7fc);
+      emit_simple_ops(ArrayRef<const int64_t>(kSimplePositiveOffsetsAlign2), 0x7fe);
+      emit_simple_ops(ArrayRef<const int64_t>(kSimplePositiveOffsetsNoAlign), 0x7ff);
+      emit_simple_ops(ArrayRef<const int64_t>(kSimpleNegativeOffsets), -0x800);
+
+      for (int64_t imm : kSplitOffsets) {
+        emit_op(*rs1, imm);
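+        // Mirror `SplitOffset()`: carry the sign bit of the low 12 bits into the upper 20 bits
+        // and sign-extend the remaining low 12 bits.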
+        uint32_t imm20 = ((imm >> 12) + ((imm >> 11) & 1)) & 0xfffff;
+        int32_t small_offset = (imm & 0xfff) - ((imm & 0x800) << 1);
+        expected += "lui " + tmp_name + ", " + std::to_string(imm20) + "\n"
+                    "add " + tmp_name + ", " + tmp_name + ", " + rs1_name + "\n" +
+                    head + ", " + std::to_string(small_offset) + "(" + tmp_name + ")\n";
+      }
+
+      for (int64_t imm : kSpecialOffsets) {
+        emit_op(*rs1, imm);
+        expected +=
+            "lui " + tmp_name + ", 0x80000\n"
+            "addiw " + tmp_name + ", " + tmp_name + ", " + std::to_string(imm - 0x80000000) + "\n" +
+            "add " + tmp_name + ", " + tmp_name + ", " + rs1_name + "\n" +
+            head + ", (" + tmp_name + ")\n";
+      }
+    }
+    return expected;
+  }
+
+  void TestLoadStoreArbitraryOffset(const std::string& test_name,
+                                    const std::string& insn,
+                                    void (Riscv64Assembler::*fn)(XRegister, XRegister, int32_t),
+                                    bool is_store) {
+    std::string expected;
+    for (XRegister* rd : GetRegisters()) {
+      // TMP can be the target register for loads, but not for stores, where loading the
+      // adjusted address into TMP would clobber the value we want to store.
+      if (is_store && *rd == TMP) {
+        continue;
+      }
+      expected += RepeatLoadStoreArbitraryOffset(
+          insn + " " + GetRegisterName(*rd),
+          [&](XRegister rs1, int64_t offset) { (GetAssembler()->*fn)(*rd, rs1, offset); });
+    }
+    DriverStr(expected, test_name);
+  }
+
+  void TestFPLoadStoreArbitraryOffset(const std::string& test_name,
+                                      const std::string& insn,
+                                      void (Riscv64Assembler::*fn)(FRegister, XRegister, int32_t)) {
+    std::string expected;
+    for (FRegister* rd : GetFPRegisters()) {
+      expected += RepeatLoadStoreArbitraryOffset(
+          insn + " " + GetFPRegName(*rd),
+          [&](XRegister rs1, int64_t offset) { (GetAssembler()->*fn)(*rd, rs1, offset); });
+    }
+    DriverStr(expected, test_name);
+  }
+
   void TestLoadLiteral(const std::string& test_name, bool with_padding_for_long) {
     std::string expected;
     Literal* narrow_literal = __ NewLiteral<uint32_t>(0x12345678);
@@ -638,14 +808,27 @@
     };
     for (XRegister* reg : GetRegisters()) {
       if (*reg != Zero) {
-        __ Lw(*reg, narrow_literal);
+        __ Loadw(*reg, narrow_literal);
         print_load("lw", *reg, "2");
-        __ Lwu(*reg, narrow_literal);
+        __ Loadwu(*reg, narrow_literal);
         print_load("lwu", *reg, "2");
-        __ Ld(*reg, wide_literal);
+        __ Loadd(*reg, wide_literal);
         print_load("ld", *reg, "3");
       }
     }
+    std::string tmp = GetRegisterName(TMP);
+    auto print_fp_load = [&](const std::string& load, FRegister rd, const std::string& label) {
+      std::string rd_name = GetFPRegName(rd);
+      expected += "1:\n"
+                  "auipc " + tmp + ", %pcrel_hi(" + label + "f)\n" +
+                  load + " " + rd_name + ", %pcrel_lo(1b)(" + tmp + ")\n";
+    };
+    for (FRegister* freg : GetFPRegisters()) {
+      __ FLoadw(*freg, narrow_literal);
+      print_fp_load("flw", *freg, "2");
+      __ FLoadd(*freg, wide_literal);
+      print_fp_load("fld", *freg, "3");
+    }
     // All literal loads above emit 8 bytes of code. The narrow literal shall emit 4 bytes of code.
     // If we do not add another instruction, we shall end up with padding before the long literal.
     expected += EmitNops(with_padding_for_long ? 0u : sizeof(uint32_t));
@@ -1838,6 +2021,20 @@
                   [&](XRegister rd, int64_t value) { __ LoadConst64(rd, value); });
 }
 
+TEST_F(AssemblerRISCV64Test, AddConst32) {
+  auto emit_op = [&](XRegister rd, XRegister rs1, int64_t value) {
+    __ AddConst32(rd, rs1, dchecked_integral_cast<int32_t>(value));
+  };
+  TestAddConst("AddConst32", 32, /*suffix=*/ "w", emit_op);
+}
+
+TEST_F(AssemblerRISCV64Test, AddConst64) {
+  auto emit_op = [&](XRegister rd, XRegister rs1, int64_t value) {
+    __ AddConst64(rd, rs1, value);
+  };
+  TestAddConst("AddConst64", 64, /*suffix=*/ "", emit_op);
+}
+
 TEST_F(AssemblerRISCV64Test, BcondForward3KiB) {
   TestBcondForward("BcondForward3KiB", 3 * KB, "1", GetPrintBcond());
 }
@@ -2071,6 +2268,66 @@
                       GetPrintCall("2"));
 }
 
+TEST_F(AssemblerRISCV64Test, Loadb) {
+  TestLoadStoreArbitraryOffset("Loadb", "lb", &Riscv64Assembler::Loadb, /*is_store=*/ false);
+}
+
+TEST_F(AssemblerRISCV64Test, Loadh) {
+  TestLoadStoreArbitraryOffset("Loadh", "lh", &Riscv64Assembler::Loadh, /*is_store=*/ false);
+}
+
+TEST_F(AssemblerRISCV64Test, Loadw) {
+  TestLoadStoreArbitraryOffset("Loadw", "lw", &Riscv64Assembler::Loadw, /*is_store=*/ false);
+}
+
+TEST_F(AssemblerRISCV64Test, Loadd) {
+  TestLoadStoreArbitraryOffset("Loadd", "ld", &Riscv64Assembler::Loadd, /*is_store=*/ false);
+}
+
+TEST_F(AssemblerRISCV64Test, Loadbu) {
+  TestLoadStoreArbitraryOffset("Loadbu", "lbu", &Riscv64Assembler::Loadbu, /*is_store=*/ false);
+}
+
+TEST_F(AssemblerRISCV64Test, Loadhu) {
+  TestLoadStoreArbitraryOffset("Loadhu", "lhu", &Riscv64Assembler::Loadhu, /*is_store=*/ false);
+}
+
+TEST_F(AssemblerRISCV64Test, Loadwu) {
+  TestLoadStoreArbitraryOffset("Loadwu", "lwu", &Riscv64Assembler::Loadwu, /*is_store=*/ false);
+}
+
+TEST_F(AssemblerRISCV64Test, Storeb) {
+  TestLoadStoreArbitraryOffset("Storeb", "sb", &Riscv64Assembler::Storeb, /*is_store=*/ true);
+}
+
+TEST_F(AssemblerRISCV64Test, Storeh) {
+  TestLoadStoreArbitraryOffset("Storeh", "sh", &Riscv64Assembler::Storeh, /*is_store=*/ true);
+}
+
+TEST_F(AssemblerRISCV64Test, Storew) {
+  TestLoadStoreArbitraryOffset("Storew", "sw", &Riscv64Assembler::Storew, /*is_store=*/ true);
+}
+
+TEST_F(AssemblerRISCV64Test, Stored) {
+  TestLoadStoreArbitraryOffset("Stored", "sd", &Riscv64Assembler::Stored, /*is_store=*/ true);
+}
+
+TEST_F(AssemblerRISCV64Test, FLoadw) {
+  TestFPLoadStoreArbitraryOffset("FLoadw", "flw", &Riscv64Assembler::FLoadw);
+}
+
+TEST_F(AssemblerRISCV64Test, FLoadd) {
+  TestFPLoadStoreArbitraryOffset("FLoadd", "fld", &Riscv64Assembler::FLoadd);
+}
+
+TEST_F(AssemblerRISCV64Test, FStorew) {
+  TestFPLoadStoreArbitraryOffset("FStorew", "fsw", &Riscv64Assembler::FStorew);
+}
+
+TEST_F(AssemblerRISCV64Test, FStored) {
+  TestFPLoadStoreArbitraryOffset("FStored", "fsd", &Riscv64Assembler::FStored);
+}
+
 TEST_F(AssemblerRISCV64Test, LoadLabelAddress) {
   std::string expected;
   constexpr size_t kNumLoadsForward = 4 * KB;
@@ -2095,12 +2352,12 @@
   DriverStr(expected, "LoadLabelAddress");
 }
 
-TEST_F(AssemblerRISCV64Test, LoadLiteralWithPadingForLong) {
-  TestLoadLiteral("LoadLiteralWithPadingForLong", /*with_padding_for_long=*/ true);
+TEST_F(AssemblerRISCV64Test, LoadLiteralWithPaddingForLong) {
+  TestLoadLiteral("LoadLiteralWithPaddingForLong", /*with_padding_for_long=*/ true);
 }
 
-TEST_F(AssemblerRISCV64Test, LoadLiteralWithoutPadingForLong) {
-  TestLoadLiteral("LoadLiteralWithoutPadingForLong", /*with_padding_for_long=*/ false);
+TEST_F(AssemblerRISCV64Test, LoadLiteralWithoutPaddingForLong) {
+  TestLoadLiteral("LoadLiteralWithoutPaddingForLong", /*with_padding_for_long=*/ false);
 }
 
 TEST_F(AssemblerRISCV64Test, JumpTable) {