blob: a0f41c6134ecbd287e2dfd7afc2b3fecb45f585e [file] [log] [blame]
/*
* lz4armv8.S
* LZ4 decompression optimized with arm64 NEON instructions
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
/**
* _lz4_decompress_asm: fast LZ4 decompression routine, the LZ4 decompression
* algorithm written in arm64 assembly to support the Huawei EROFS filesystem,
* striving for maximum decompression speed.
* Entry point: _lz4_decompress_asm.
* @param:
* x0 = address of the current destination pointer (read on entry, written back on return)
* x1 = destination start position
* x2 = destination end position
* x3 = address of the current source pointer (read on entry, written back on return)
* x4 = source end position
* x5 = flag for DIP
* @ret:
* 0 on success, -1 on failure
*
* x7: match_length
* x8: literal_length
* x9: copy source ptr
* x10: copy destination ptr
*/
#define match_length x7
#define literal_length x8
#define copy_from_ptr x9 /* copy source ptr*/
#define copy_to_ptr x10 /* copy destination ptr*/
#define w_tmp w11 /* temp var */
#define tmp x11
#define w_offset w12
#define offset x12
#define permtable_addr x13
#define cplen_table_addr x14
#define save_dst x15
#define save_src x16
#define offset_src_ptr x17
#define w_tmp_match_length w6
#define tmp_match_length x6
/* x3 >= x4 src overflow */
.macro check_src_overflow
cmp x3, x4
b.hs Done
.endm
.macro check_src_overflow1
cmp x3, x4
b.hs Done1
.endm
/* x0 >= x2 dst overflow */
.macro check_dst_overflow
cmp x0, x2
b.hs Done
.endm
.macro check_dst_overflow1
cmp x0, x2
b.hs Done1
.endm
.altmacro
.macro lz4_decompress_asm_generic doprfm=1
stp x29, x30, [sp, #-16]!
mov x29, sp
stp x3, x0, [sp, #-16]! /* push src and dst in stack */
ldr x3, [x3] /* x3 = *src_ptr */
ldr x0, [x0] /* x0 = *dst_ptr */
adr permtable_addr, Permtable
adr cplen_table_addr, Copylength_table
Lz4_decompress_begin\@:
/*
* save current dst and src ,ensure when return from asm routine
* current both of "dst" and "src" save good position.
*/
mov save_dst, x0
mov save_src, x3
check_dst_overflow
check_src_overflow
.if \doprfm
add tmp, x0, #512
cmp x2, tmp
b.ls Decode_token\@
prfm pstl2strm,[x0,#512]
.endif
/* Decode Token Byte: */
Decode_token\@:
ldrb w_tmp, [x3], #1 /* read Token Byte */
lsr literal_length, tmp, #4 /* get literal_length */
and tmp_match_length, tmp, #0xf /* get match_length */
add match_length, tmp_match_length, #4 /* match_length >=4 */
/*
* literal_length <= 14 : no more literal length byte,fllowing zero
* or more bytes are liteal bytes.
*/
cmp literal_length, #14
b.ls Copy_literal_lt_15\@
/*
* literal_length == 15 : more literal length bytes after TokenByte.
* continue decoding more literal length bytes.
*/
Get_literal_length\@:
check_src_overflow
ldrb w_tmp, [x3], #1
add literal_length, literal_length, tmp
cmp tmp, #255
b.eq Get_literal_length\@
/* literal copy */
Copy_long_literal_hs_15\@:
mov copy_from_ptr, x3
mov copy_to_ptr, x0
add x3, x3, literal_length
add x0, x0, literal_length
check_dst_overflow
check_src_overflow
Copy_long_literal_loop\@:
ldr q0, [copy_from_ptr], #16
str q0, [copy_to_ptr], #16
cmp x0, copy_to_ptr
b.ls Decode_offset_matchlength\@
b Copy_long_literal_loop\@
Copy_literal_lt_15\@:
ldr q0, [x3]
str q0, [x0]
add x3, x3, literal_length
add x0, x0, literal_length
/* Decode offset and match_length */
Decode_offset_matchlength\@:
mov offset_src_ptr, x3
ldrh w_offset, [x3], #2 /* 2Byte:offset bytes */
cbz offset, Failed /* match_length == 0 is invalid */
sub copy_from_ptr, x0, offset
cmp copy_from_ptr, x1
b.lo Failed
mov copy_to_ptr, x0
/*
* set x0 to the end of "match copy";
*/
add x0, x0, match_length
cmp match_length, #19
b.lo Copy_match_begin\@
/*
* continue decoding more match length bytes.
*/
Get_long_matchlength\@:
check_src_overflow1
ldrb w_tmp, [x3], #1
add x0, x0, tmp
add match_length, match_length, tmp
cmp tmp, #255
b.eq Get_long_matchlength\@
/*
* here got the matchlength,start "match copy".
*/
Copy_match_begin\@:
check_dst_overflow1
cmp offset , match_length
b.hs Cond_offset_ge_matchlength\@
Cond_offset_lt_matchlength\@:
cmp offset , #32
b.hs Cond_offset_ge_matchlength\@
Copy_offset_lt_32\@:
ldr q1, [copy_from_ptr]
add tmp, permtable_addr, offset, lsl #5
ldp q2, q3, [tmp]
tbl v0.16b, {v1.16b}, v2.16b
tbl v1.16b, {v1.16b}, v3.16b
cmp offset , #16
b.lo Copy_match_perm\@
ldp q0, q1, [copy_from_ptr]
Copy_match_perm\@:
ldrb w_tmp, [cplen_table_addr, offset]
stp q0, q1, [copy_to_ptr]
add copy_to_ptr, copy_to_ptr, tmp
cmp x0, copy_to_ptr
b.ls Lz4_decompress_begin\@
Copy_offset_lt_32_loop\@:
stp q0, q1, [copy_to_ptr]
add copy_to_ptr, copy_to_ptr, tmp
stp q0, q1, [copy_to_ptr]
add copy_to_ptr, copy_to_ptr, tmp
cmp x0, copy_to_ptr
b.hi Copy_offset_lt_32_loop\@
b Lz4_decompress_begin\@
/* offset >= match */
Cond_offset_ge_matchlength\@:
ldr q0, [copy_from_ptr], #16
str q0, [copy_to_ptr], #16
cmp x0, copy_to_ptr
b.ls Lz4_decompress_begin\@
Copy_offset_ge_match_loop\@:
ldp q0, q1, [copy_from_ptr], #32
stp q0, q1, [copy_to_ptr], #32
cmp x0, copy_to_ptr
b.hi Copy_offset_ge_match_loop\@
b Lz4_decompress_begin\@
.endm
.text
.p2align 4
ENTRY(_lz4_decompress_asm)
lz4_decompress_asm_generic
ENDPROC(_lz4_decompress_asm)
Failed:
mov tmp, #-1
b Exit_here
Done1:
cbz x5, Done
sub save_src, offset_src_ptr, #1
strb w_tmp_match_length, [save_src]
add save_dst,save_dst,literal_length
Done:
mov tmp, #0
Exit_here:
ldp x3, x0, [sp], #16
str save_src, [x3]
str save_dst, [x0]
mov x0, tmp
ldp x29, x30, [sp], #16
ret x30
/*
* In case of offset <= 31 < matchlength ,expand the pattern and store in
* repeating pattern size(RPS),store the RPS in Copylength_table.
* case 1): 1 <= offset <= 15
* expand the pattern according to the Permtable and store their repeating pattern in q0 q1;
* RPS = 32 - (32 % offset) offset <= 31
* case 2): offset >= 16
* read the pattern and store in q0 q1.
* RPS = offset.
*/
.text
.p2align 8
Permtable:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 //offset = 0
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 //offset = 1
.byte 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 //offset = 2
.byte 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1 //offset = 3
.byte 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 //offset = 4
.byte 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1 //offset = 5
.byte 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1 //offset = 6
.byte 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3 //offset = 7
.byte 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 //offset = 8
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4 //offset = 9
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1 //offset = 10
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 //offset = 11
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7 //offset = 12
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5 //offset = 13
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3 //offset = 14
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1 //offset = 15
.p2align 8
Copylength_table:
.byte 32,32,32,30,32,30,30,28,32,27,30,22,24,26,28,30 // 0 .. 15
.byte 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 // 16 .. 31
.text
.p2align 4
ENTRY(_lz4_decompress_asm_noprfm)
lz4_decompress_asm_generic 0
ENDPROC(_lz4_decompress_asm_noprfm)