x86_64: Implement missing variants of XCHG. Add tests for them.

Also, clean up the handling of the special case where at least one of
the operands of XCHG is RAX/EAX/AX. Add a helper function that
deduplicates the code for different operand sizes.

This patch also extends the `EmitOptionalByteRegNormalizingRex32`
function to handle the case when an instruction has both operands in
byte registers, and so needs a REX prefix if either of them is in a
special register (an example of such an instruction is `xchg bpl, al`:
if only the source register is checked, no REX would be emitted).
Previously `EmitOptionalByteRegNormalizingRex32` handled only the case
when the source register is special (an example of such an instruction
is `movzxb rax, bpl`).

Bug: 65872996
Test: m test-art-host-gtest
Change-Id: I20c5e9375bbd15d799e5748b127d154ddcc0fc11
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index f385f22..8ef8931 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -775,10 +775,18 @@
   void fptan();
   void fprem();
 
+  void xchgb(CpuRegister dst, CpuRegister src);
+  void xchgb(CpuRegister reg, const Address& address);
+
+  void xchgw(CpuRegister dst, CpuRegister src);
+  void xchgw(CpuRegister reg, const Address& address);
+
   void xchgl(CpuRegister dst, CpuRegister src);
-  void xchgq(CpuRegister dst, CpuRegister src);
   void xchgl(CpuRegister reg, const Address& address);
 
+  void xchgq(CpuRegister dst, CpuRegister src);
+  void xchgq(CpuRegister reg, const Address& address);
+
   void cmpb(const Address& address, const Immediate& imm);
   void cmpw(const Address& address, const Immediate& imm);
 
@@ -1102,7 +1110,13 @@
   void EmitRex64(CpuRegister dst, XmmRegister src);
 
   // Emit a REX prefix to normalize byte registers plus necessary register bit encodings.
-  void EmitOptionalByteRegNormalizingRex32(CpuRegister dst, CpuRegister src);
+  // The `normalize_both` parameter controls whether the need for a REX prefix is decided by the
+  // `src` register only (as for instructions like `movzxb rax, bpl`), or by both the `src` and
+  // `dst` registers (as for instructions like `xchg bpl, al`). By default only `src` is used to
+  // decide if REX is needed.
+  void EmitOptionalByteRegNormalizingRex32(CpuRegister dst,
+                                           CpuRegister src,
+                                           bool normalize_both = false);
   void EmitOptionalByteRegNormalizingRex32(CpuRegister dst, const Operand& operand);
 
   uint8_t EmitVexPrefixByteZero(bool is_twobyte_form);
@@ -1118,6 +1132,12 @@
   uint8_t EmitVexPrefixByteTwo(bool W,
                                int SET_VEX_L,
                                int SET_VEX_PP);
+
+  // Helper function to emit a shorter variant of XCHG if at least one operand is RAX/EAX/AX.
+  bool try_xchg_rax(CpuRegister dst,
+                    CpuRegister src,
+                    void (X86_64Assembler::*prefix_fn)(CpuRegister));
+
   ConstantArea constant_area_;
   bool has_AVX_;     // x86 256bit SIMD AVX.
   bool has_AVX2_;    // x86 256bit SIMD AVX 2.0.