| /* |
| * Copyright (C) 2014 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /* Assumptions: |
| * |
| * ARMv8-a, AArch64 |
| */ |
| |
| #ifndef ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_ |
| #define ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_ |
| |
| #include "asm_support_arm64.S" |
| |
| /* Parameters and result. */ |
| #define src1 x0 /* Address of the first array; advanced as it is read. */ |
| #define src2 x1 /* Address of the second array; advanced as it is read. */ |
| #define limit x2 /* Length in half-words on entry; scaled to bytes by the routine. */ |
| #define result x0 /* Return value; shares x0 with src1. */ |
| |
| /* Internal variables. */ |
| #define data1 x3 /* Data most recently loaded from src1. */ |
| #define data1w w3 /* 32-bit view of data1. */ |
| #define data2 x4 /* Data most recently loaded from src2. */ |
| #define data2w w4 /* 32-bit view of data2. */ |
| #define has_nul x5 /* Not referenced in the code visible here. */ |
| #define diff x6 /* data1 EOR data2; later reused as a shift amount. */ |
| #define endloop x7 /* Non-zero when the aligned loop must stop. */ |
| #define tmp1 x8 /* Scratch: alignment tests and offsets. */ |
| #define tmp2 x9 /* Scratch: mask for bytes before the start point. */ |
| #define tmp3 x10 /* Not referenced in the code visible here. */ |
| #define limit_wd x12 /* Number of Dwords (8 bytes) still to compare. */ |
| #define mask x13 /* Scratch bit mask. */ |
| |
| // WARNING: If you change this code to use x14 and x15, you must also change |
| // art_quick_string_compareto, which relies on these temps being unused. |
| /* |
| * __memcmp16: compare two arrays of 16-bit units. |
| * |
| * In: src1 (x0), src2 (x1) = addresses of the two arrays; |
| * limit (x2) = number of half-words to compare. |
| * Out: result (x0) = 0 if limit is 0 or every compared half-word is equal, |
| * otherwise the first differing half-word of src1 minus that of src2, |
| * each zero-extended to 64 bits before the subtraction. |
| * Modifies x0-x4, x6-x9, x12, x13 and the condition flags. |
| * |
| * Three paths, chosen from the low three address bits: |
| * - both pointers 8-byte aligned: Dword loop at .Lloop_aligned; |
| * - same non-zero offset within a Dword: .Lmutual_align, then the same loop; |
| * - different offsets: half-word loop at .Lmisaligned8. |
| * |
| * NOTE(review): the Dword paths round bit positions down to multiples of 16, |
| * so they appear to assume both pointers are 2-byte aligned -- confirm with |
| * callers. |
| */ |
| ENTRY __memcmp16 |
| cbz limit, .Lret0 /* Nothing to compare: report equal. */ |
| lsl limit, limit, #1 /* Half-words to bytes. */ |
| eor tmp1, src1, src2 /* Address bits in which the pointers differ. */ |
| tst tmp1, #7 |
| b.ne .Lmisaligned8 /* Offsets within a Dword differ. */ |
| ands tmp1, src1, #7 /* tmp1 = offset of src1 within its Dword. */ |
| b.ne .Lmutual_align |
| add limit_wd, limit, #7 /* limit_wd = limit / 8, rounded up (with next line). */ |
| lsr limit_wd, limit_wd, #3 |
| /* Start of performance-critical section -- one 64B cache line. */ |
| .Lloop_aligned: |
| ldr data1, [src1], #8 /* Load a Dword, then advance by 8. */ |
| ldr data2, [src2], #8 |
| .Lstart_realigned: |
| subs limit_wd, limit_wd, #1 /* Z set when this was the last Dword. */ |
| eor diff, data1, data2 /* Non-zero if differences found. */ |
| csinv endloop, diff, xzr, ne /* Last Dword or differences: diff, or ~0 once limit_wd hit 0. */ |
| cbz endloop, .Lloop_aligned |
| /* End of performance-critical section -- one 64B cache line. */ |
| |
| /* Not reached the limit, must have found a diff. */ |
| cbnz limit_wd, .Lnot_limit |
| |
| /* Limit % 8 == 0 => all bytes significant. */ |
| ands limit, limit, #7 |
| b.eq .Lnot_limit |
| |
| lsl limit, limit, #3 /* Bytes -> bits. */ |
| mov mask, #~0 |
| lsl mask, mask, limit /* Ones over the bytes past the limit. */ |
| bic data1, data1, mask /* Zero those bytes in both operands, so a diff found */ |
| bic data2, data2, mask /* past the limit yields equal (zero) half-words. */ |
| |
| .Lnot_limit: |
| |
| // Swap the byte order of diff. Exact reverse is not important, as we only need to detect |
| // the half-word. |
| rev diff, diff |
| // The most significant set bit of DIFF now lies in the lowest-addressed differing byte. |
| clz diff, diff /* Bit offset of that byte's half-word (after rounding); 64 if diff was 0. */ |
| // Clear the low 4 bits (insert zeros from xzr) so the shift amount is a multiple of 16. |
| bfi diff, xzr, #0, #4 |
| // Create a 16b mask |
| mov mask, #0xFFFF |
| // Shift to the right half-word. |
| lsr data1, data1, diff /* Shift amount is taken mod 64, so 64 behaves as 0. */ |
| lsr data2, data2, diff |
| // Mask the lowest half-word. |
| and data1, data1, mask |
| and data2, data2, mask |
| // Compute difference. |
| sub result, data1, data2 |
| ret |
| |
| .Lmutual_align: |
| /* Sources are mutually aligned, but are not currently at an |
| alignment boundary. Round down the addresses and then mask off |
| the bytes that precede the start point. */ |
| bic src1, src1, #7 |
| bic src2, src2, #7 |
| add limit, limit, tmp1 /* Adjust the limit for the extra. */ |
| lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ |
| ldr data1, [src1], #8 |
| neg tmp1, tmp1 /* Bits to alignment -64. */ |
| ldr data2, [src2], #8 |
| mov tmp2, #~0 |
| /* Little-endian. Early bytes are at LSB. */ |
| lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ |
| add limit_wd, limit, #7 /* limit_wd = adjusted limit / 8, rounded up. */ |
| orr data1, data1, tmp2 /* Force the bytes before the start point to 0xFF */ |
| orr data2, data2, tmp2 /* in both operands so they always compare equal. */ |
| lsr limit_wd, limit_wd, #3 |
| b .Lstart_realigned |
| |
| .Lret0: |
| mov result, #0 |
| ret |
| |
| .p2align 6 /* Align the half-word loop to 64 bytes. */ |
| .Lmisaligned8: |
| sub limit, limit, #1 /* Bias so the subs below borrows on the last half-word. */ |
| 1: |
| /* Perhaps we can do better than this. */ |
| ldrh data1w, [src1], #2 /* Zero-extending half-word load, then advance by 2. */ |
| ldrh data2w, [src2], #2 |
| subs limit, limit, #2 /* C stays set while more half-words remain. */ |
| ccmp data1w, data2w, #0, cs /* Compare if more remain; else NZCV = 0b0000 (ne). */ |
| b.eq 1b /* Loop while equal and not at the limit. */ |
| sub result, data1, data2 /* Last pair loaded; 0 if it was equal. */ |
| ret |
| END __memcmp16 |
| |
| #endif // ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_ |