/* Source: runtime/arch/arm64/memcmp16_arm64.S - LeafOS-Project/android_art - Gitiles */

 /*
  * Copyright (C) 2014 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 /* Assumptions:
  *
  * ARMv8-a, AArch64
  */

 #ifndef ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_
 #define ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_

 #include "asm_support_arm64.S"

 /* Parameters and result.  */
 /* src1/src2: addresses of the two buffers; both are advanced by the
    post-indexed loads in __memcmp16.  */
 #define src1        x0
 #define src2        x1
 /* limit: length in half-words on entry; __memcmp16 converts it to bytes.  */
 #define limit       x2
 /* result shares x0 with src1.  */
 #define result      x0

 /* Internal variables.  */
 /* data1/data2: current chunk loaded from src1/src2.  The w names are the
    32-bit views of the same registers, used by the half-word loads.  */
 #define data1       x3
 #define data1w      w3
 #define data2       x4
 #define data2w      w4
 /* NOTE(review): has_nul is not referenced by the code visible in this file.  */
 #define has_nul     x5
 /* diff: data1 XOR data2, later reused as a shift amount.  */
 #define diff        x6
 /* endloop: non-zero when the aligned loop must stop.  */
 #define endloop     x7
 /* tmp1/tmp2: scratch for the alignment checks and the leading-byte mask.  */
 #define tmp1        x8
 #define tmp2        x9
 /* NOTE(review): tmp3 is not referenced by the code visible in this file.  */
 #define tmp3        x10
 /* limit_wd: number of 8-byte words left to compare in the aligned loop.  */
 #define limit_wd    x12
 /* mask: bit mask for trailing bytes, then the 0xFFFF half-word mask.  */
 #define mask        x13

 // WARNING: If you change this code to use x14 and x15, you must also change
 //          art_quick_string_compareto, which relies on these temps being unused.

 /*
  * __memcmp16 -- compare two buffers of 16-bit half-words.
  *
  * In:     src1 (x0)  = address of the first buffer
  *         src2 (x1)  = address of the second buffer
  *         limit (x2) = number of half-words to compare
  * Out:    result (x0) = 0 when limit is zero or no difference is found
  *         within limit; otherwise (half-word from src1) minus (half-word
  *         from src2) at the lowest-addressed difference, each value
  *         zero-extended before the subtraction.
  * Writes: x0-x4, x6-x9, x12, x13 and the condition flags.
  *
  * Three paths:
  *   - both sources 8-byte aligned: compare one Dword (8 bytes) per iteration;
  *   - same non-zero offset within a Dword (.Lmutual_align): round both
  *     addresses down, force the leading bytes equal, then join the Dword loop;
  *   - different offsets (.Lmisaligned8): compare one half-word per iteration.
  *
  * The Dword paths load whole aligned 8-byte words, so they can read bytes
  * before the start or after the last compared half-word; those bytes are
  * forced equal or cleared before the result is formed.  The half-word
  * extraction is written for little-endian data.
  * NOTE(review): assumes both sources are at least 2-byte aligned -- confirm
  * against callers.
  */
 ENTRY __memcmp16
   /* Nothing to compare: return 0.  */
   cbz     limit, .Lret0
   lsl     limit, limit, #1  /* Half-words to bytes.  */
   /* If the low three address bits differ, the two sources can never be
      8-byte aligned at the same time: use the half-word loop.  */
   eor     tmp1, src1, src2
   tst     tmp1, #7
   b.ne    .Lmisaligned8
   /* Same offset within a Dword; tmp1 = that offset in bytes.  */
   ands    tmp1, src1, #7
   b.ne    .Lmutual_align
   /* limit_wd = number of Dwords covering limit bytes, rounded up.  */
   add     limit_wd, limit, #7
   lsr     limit_wd, limit_wd, #3
   /* Start of performance-critical section  -- one 64B cache line.  */
 .Lloop_aligned:
   ldr     data1, [src1], #8
   ldr     data2, [src2], #8
 .Lstart_realigned:
   /* Z is set when this is the last Dword within the limit.  */
   subs    limit_wd, limit_wd, #1
   eor     diff, data1, data2  /* Non-zero if differences found.  */
   /* endloop = diff while more Dwords remain, all ones on the last Dword.  */
   csinv   endloop, diff, xzr, ne  /* Last Dword or differences.  */
   cbz     endloop, .Lloop_aligned
   /* End of performance-critical section  -- one 64B cache line.  */

   /* Not reached the limit, must have found a diff.  */
   cbnz    limit_wd, .Lnot_limit

   /* Limit % 8 == 0 => all bytes significant.  */
   ands    limit, limit, #7
   b.eq    .Lnot_limit

   /* Clear the bytes of the last Dword that lie beyond the limit.  diff is
      not recomputed: when it is zero, or its lowest-addressed set byte lies
      in the cleared region, the half-words extracted below compare equal
      and the result is 0.  */
   lsl     limit, limit, #3  /* Bytes -> bits.  */
   mov     mask, #~0
   lsl     mask, mask, limit
   bic     data1, data1, mask
   bic     data2, data2, mask

 .Lnot_limit:

   // Swap the byte order of diff. Exact reverse is not important, as we only need to detect
   // the half-word.
   rev     diff, diff
   // The most significant bit of DIFF marks the least significant bit of change between DATA1/2
   clz     diff, diff
   // Mask off 0xF to have shift amount. Why does ARM64 not have BIC with immediate?!?!
   // After this, diff is a multiple of 16: the bit offset of the half-word that holds the
   // lowest-addressed differing byte.
   bfi     diff, xzr, #0, #4
   // Create a 16b mask
   mov     mask, #0xFFFF
   // Shift to the right half-word.
   lsr     data1, data1, diff
   lsr     data2, data2, diff
   // Mask the lowest half-word.
   and     data1, data1, mask
   and     data2, data2, mask
   // Compute difference.
   sub     result, data1, data2
   ret

 .Lmutual_align:
   /* Sources are mutually aligned, but are not currently at an
      alignment boundary.  Round down the addresses and then mask off
      the bytes that precede the start point.  */
   bic     src1, src1, #7
   bic     src2, src2, #7
   add     limit, limit, tmp1  /* Adjust the limit for the extra.  */
   lsl     tmp1, tmp1, #3    /* Bytes beyond alignment -> bits.  */
   ldr     data1, [src1], #8
   neg     tmp1, tmp1    /* Bits to alignment -64.  */
   ldr     data2, [src2], #8
   mov     tmp2, #~0
   /* Little-endian.  Early bytes are at LSB.  */
   /* tmp2 ends up with ones in the low bits that cover the leading bytes.  */
   lsr     tmp2, tmp2, tmp1  /* Shift (tmp1 & 63).  */
   add     limit_wd, limit, #7
   /* Force the leading bytes to all ones in both words so they compare equal.  */
   orr     data1, data1, tmp2
   orr     data2, data2, tmp2
   lsr     limit_wd, limit_wd, #3
   b       .Lstart_realigned

 .Lret0:
   mov     result, #0
   ret

   .p2align 6
 .Lmisaligned8:
   /* Bias the byte count by one so that the subs below leaves carry set
      (no borrow) only while at least one more half-word remains.  */
   sub     limit, limit, #1
 1:
   /* Perhaps we can do better than this.  */
   ldrh    data1w, [src1], #2
   ldrh    data2w, [src2], #2
   subs    limit, limit, #2
   /* More to compare: flags come from comparing the two half-words.
      At the limit: flags are forced to 0b0000, which reads as not-equal
      and ends the loop.  */
   ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
   b.eq    1b
   /* The half-word loads zero-extend, so the X views hold the 16-bit values;
      the result is 0 when the last pair compared was equal.  */
   sub     result, data1, data2
   ret
 END __memcmp16

 #endif  // ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_
	/*
	* Copyright (C) 2014 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/* Assumptions:
	*
	* ARMv8-a, AArch64
	*/

	#ifndef ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_
	#define ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_

	#include "asm_support_arm64.S"

	/* Parameters and result. */
	/* src1/src2: addresses of the two buffers; both are advanced by the
	post-indexed loads in __memcmp16. */
	#define src1 x0
	#define src2 x1
	/* limit: length in half-words on entry; __memcmp16 converts it to bytes. */
	#define limit x2
	/* result shares x0 with src1. */
	#define result x0

	/* Internal variables. */
	/* data1/data2: current chunk loaded from src1/src2. The w names are the
	32-bit views of the same registers, used by the half-word loads. */
	#define data1 x3
	#define data1w w3
	#define data2 x4
	#define data2w w4
	/* NOTE(review): has_nul is not referenced by the code visible in this file. */
	#define has_nul x5
	/* diff: data1 XOR data2, later reused as a shift amount. */
	#define diff x6
	/* endloop: non-zero when the aligned loop must stop. */
	#define endloop x7
	/* tmp1/tmp2: scratch for the alignment checks and the leading-byte mask. */
	#define tmp1 x8
	#define tmp2 x9
	/* NOTE(review): tmp3 is not referenced by the code visible in this file. */
	#define tmp3 x10
	/* limit_wd: number of 8-byte words left to compare in the aligned loop. */
	#define limit_wd x12
	/* mask: bit mask for trailing bytes, then the 0xFFFF half-word mask. */
	#define mask x13

	// WARNING: If you change this code to use x14 and x15, you must also change
	// art_quick_string_compareto, which relies on these temps being unused.

	/*
	* __memcmp16 -- compare two buffers of 16-bit half-words.
	*
	* NOTE(review): this definition repeats one that appears earlier in the
	* file under the same include guard -- confirm whether the duplicate is
	* intended.
	*
	* In: src1 (x0) = address of the first buffer
	* src2 (x1) = address of the second buffer
	* limit (x2) = number of half-words to compare
	* Out: result (x0) = 0 when limit is zero or no difference is found
	* within limit; otherwise (half-word from src1) minus (half-word
	* from src2) at the lowest-addressed difference, each value
	* zero-extended before the subtraction.
	* Writes: x0-x4, x6-x9, x12, x13 and the condition flags.
	*
	* Three paths:
	* - both sources 8-byte aligned: compare one Dword (8 bytes) per iteration;
	* - same non-zero offset within a Dword (.Lmutual_align): round both
	* addresses down, force the leading bytes equal, then join the Dword loop;
	* - different offsets (.Lmisaligned8): compare one half-word per iteration.
	*
	* The Dword paths load whole aligned 8-byte words, so they can read bytes
	* before the start or after the last compared half-word; those bytes are
	* forced equal or cleared before the result is formed. The half-word
	* extraction is written for little-endian data.
	* NOTE(review): assumes both sources are at least 2-byte aligned -- confirm
	* against callers.
	*/
	ENTRY __memcmp16
	/* Nothing to compare: return 0. */
	cbz limit, .Lret0
	lsl limit, limit, #1 /* Half-words to bytes. */
	/* If the low three address bits differ, the two sources can never be
	8-byte aligned at the same time: use the half-word loop. */
	eor tmp1, src1, src2
	tst tmp1, #7
	b.ne .Lmisaligned8
	/* Same offset within a Dword; tmp1 = that offset in bytes. */
	ands tmp1, src1, #7
	b.ne .Lmutual_align
	/* limit_wd = number of Dwords covering limit bytes, rounded up. */
	add limit_wd, limit, #7
	lsr limit_wd, limit_wd, #3
	/* Start of performance-critical section -- one 64B cache line. */
	.Lloop_aligned:
	ldr data1, [src1], #8
	ldr data2, [src2], #8
	.Lstart_realigned:
	/* Z is set when this is the last Dword within the limit. */
	subs limit_wd, limit_wd, #1
	eor diff, data1, data2 /* Non-zero if differences found. */
	/* endloop = diff while more Dwords remain, all ones on the last Dword. */
	csinv endloop, diff, xzr, ne /* Last Dword or differences. */
	cbz endloop, .Lloop_aligned
	/* End of performance-critical section -- one 64B cache line. */

	/* Not reached the limit, must have found a diff. */
	cbnz limit_wd, .Lnot_limit

	/* Limit % 8 == 0 => all bytes significant. */
	ands limit, limit, #7
	b.eq .Lnot_limit

	/* Clear the bytes of the last Dword that lie beyond the limit. diff is
	not recomputed: when it is zero, or its lowest-addressed set byte lies
	in the cleared region, the half-words extracted below compare equal
	and the result is 0. */
	lsl limit, limit, #3 /* Bytes -> bits. */
	mov mask, #~0
	lsl mask, mask, limit
	bic data1, data1, mask
	bic data2, data2, mask

	.Lnot_limit:

	// Swap the byte order of diff. Exact reverse is not important, as we only need to detect
	// the half-word.
	rev diff, diff
	// The most significant bit of DIFF marks the least significant bit of change between DATA1/2
	clz diff, diff
	// Mask off 0xF to have shift amount. Why does ARM64 not have BIC with immediate?!?!
	// After this, diff is a multiple of 16: the bit offset of the half-word that holds the
	// lowest-addressed differing byte.
	bfi diff, xzr, #0, #4
	// Create a 16b mask
	mov mask, #0xFFFF
	// Shift to the right half-word.
	lsr data1, data1, diff
	lsr data2, data2, diff
	// Mask the lowest half-word.
	and data1, data1, mask
	and data2, data2, mask
	// Compute difference.
	sub result, data1, data2
	ret

	.Lmutual_align:
	/* Sources are mutually aligned, but are not currently at an
	alignment boundary. Round down the addresses and then mask off
	the bytes that precede the start point. */
	bic src1, src1, #7
	bic src2, src2, #7
	add limit, limit, tmp1 /* Adjust the limit for the extra. */
	lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
	ldr data1, [src1], #8
	neg tmp1, tmp1 /* Bits to alignment -64. */
	ldr data2, [src2], #8
	mov tmp2, #~0
	/* Little-endian. Early bytes are at LSB. */
	/* tmp2 ends up with ones in the low bits that cover the leading bytes. */
	lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
	add limit_wd, limit, #7
	/* Force the leading bytes to all ones in both words so they compare equal. */
	orr data1, data1, tmp2
	orr data2, data2, tmp2
	lsr limit_wd, limit_wd, #3
	b .Lstart_realigned

	.Lret0:
	mov result, #0
	ret

	.p2align 6
	.Lmisaligned8:
	/* Bias the byte count by one so that the subs below leaves carry set
	(no borrow) only while at least one more half-word remains. */
	sub limit, limit, #1
	1:
	/* Perhaps we can do better than this. */
	ldrh data1w, [src1], #2
	ldrh data2w, [src2], #2
	subs limit, limit, #2
	/* More to compare: flags come from comparing the two half-words.
	At the limit: flags are forced to 0b0000, which reads as not-equal
	and ends the loop. */
	ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
	b.eq 1b
	/* The half-word loads zero-extend, so the X views hold the 16-bit values;
	the result is 0 when the last pair compared was equal. */
	sub result, data1, data2
	ret
	END __memcmp16

	#endif // ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_