Switch from aeabi_lmul to custom assembly.

Spotted a sizeable performance regression on an app with a hot long
multiply. aeabi_lmul is noticeably slower than the previous custom Dalvik
assembly, so switch to using that assembly instead.

Change-Id: Id829de69193506ba7ac8fdc8b78009351631b0fc
diff --git a/src/oat/runtime/arm/oat_support_entrypoints_arm.cc b/src/oat/runtime/arm/oat_support_entrypoints_arm.cc
index 305f359..5aedff7 100644
--- a/src/oat/runtime/arm/oat_support_entrypoints_arm.cc
+++ b/src/oat/runtime/arm/oat_support_entrypoints_arm.cc
@@ -111,7 +111,7 @@
 // Long long arithmetics - REM_LONG[_2ADDR] and DIV_LONG[_2ADDR]
 extern "C" int64_t __aeabi_ldivmod(int64_t, int64_t);
-extern "C" int64_t __aeabi_lmul(int64_t, int64_t);
+extern "C" int64_t art_mul_long(int64_t, int64_t);
 extern "C" uint64_t art_shl_long(uint64_t, uint32_t);
 extern "C" uint64_t art_shr_long(uint64_t, uint32_t);
 extern "C" uint64_t art_ushr_long(uint64_t, uint32_t);
@@ -230,7 +230,7 @@
   points->pF2l = art_f2l;
   points->pLdiv = __aeabi_ldivmod;
   points->pLdivmod = __aeabi_ldivmod;  // result returned in r2:r3
-  points->pLmul = __aeabi_lmul;
+  points->pLmul = art_mul_long;
   points->pShlLong = art_shl_long;
   points->pShrLong = art_shr_long;
   points->pUshrLong = art_ushr_long;
diff --git a/src/oat/runtime/arm/runtime_support_arm.S b/src/oat/runtime/arm/runtime_support_arm.S
index 5b4cd1b..eefaed0 100644
--- a/src/oat/runtime/arm/runtime_support_arm.S
+++ b/src/oat/runtime/arm/runtime_support_arm.S
@@ -794,6 +794,38 @@
     pop   {r0, r1}       @ restore return value
     bx    lr             @ return
+    .global art_mul_long
+    /*
+     * Signed 64-bit integer multiply.
+     *
+     * C-equivalent: int64_t art_mul_long(int64_t a, int64_t b)
+     *
+     * In:       r1:r0 = first operand  (r1 = high word W, r0 = low word X)
+     *           r3:r2 = second operand (r3 = high word Y, r2 = low word Z)
+     * Out:      r1:r0 = low 64 bits of the product
+     * Clobbers: r2, ip (r9 and r10 are used as scratch but saved/restored)
+     *
+     * Consider WXxYZ (r1r0 x r3r2) with a long multiply:
+     *        WX
+     *      x YZ
+     *  --------
+     *     ZW ZX
+     *  YW YX
+     *
+     * The low word of the result holds ZX, the high word holds
+     * (ZW+YX) + (the high overflow from ZX).  YW doesn't matter because
+     * it doesn't fit in the low 64 bits.
+     *
+     * Unlike most ARM math operations, multiply instructions have
+     * restrictions on using the same register more than once (Rd and Rm
+     * cannot be the same).
+     */
+    /* mul-long vAA, vBB, vCC */
+art_mul_long:
+    push    {r9 - r10}                  @  scratch for the 64-bit partial product
+    mul     ip, r2, r1                  @  ip<- ZxW
+    umull   r9, r10, r2, r0             @  r9/r10 <- ZxX
+    mla     r2, r0, r3, ip              @  r2<- YxX + (ZxW)
+    add     r10, r2, r10                @  r10<- r10 + low(ZxW + (YxX))
+    mov     r0,r9                       @  r0<- low word of result
+    mov     r1,r10                      @  r1<- high word of result
+    pop     {r9 - r10}
+    bx      lr
     .global art_shl_long
      * Long integer shift.  This is different from the generic 32/64-bit