| /* |
| * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) |
| * |
| * The white paper on CRC32C calculations with PCLMULQDQ instruction can be |
| * downloaded from: |
| * http://download.intel.com/design/intarch/papers/323405.pdf |
| * |
| * Copyright (C) 2012 Intel Corporation. |
| * |
| * Authors: |
| * Wajdi Feghali <wajdi.k.feghali@intel.com> |
| * James Guilford <james.guilford@intel.com> |
| * David Cote <david.m.cote@intel.com> |
| * Tim Chen <tim.c.chen@linux.intel.com> |
| * |
| * This software is available to you under a choice of one of two |
| * licenses. You may choose to be licensed under the terms of the GNU |
| * General Public License (GPL) Version 2, available from the file |
| * COPYING in the main directory of this source tree, or the |
| * OpenIB.org BSD license below: |
| * |
| * Redistribution and use in source and binary forms, with or |
| * without modification, are permitted provided that the following |
| * conditions are met: |
| * |
| * - Redistributions of source code must retain the above |
| * copyright notice, this list of conditions and the following |
| * disclaimer. |
| * |
| * - Redistributions in binary form must reproduce the above |
| * copyright notice, this list of conditions and the following |
| * disclaimer in the documentation and/or other materials |
| * provided with the distribution. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
| * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
| * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
| * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
| * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
| * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
| * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| */ |
| |
| ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction |
| |
| ## LABEL prefix n |
| ## Defines a label whose name is \prefix immediately followed by \n; |
| ## for example "LABEL crc_ 0" emits "crc_0:". The "\()" separator ends |
| ## the \n argument name so that the trailing colon is not parsed as |
| ## part of it. |
| .macro LABEL prefix n |
| \prefix\n\(): |
| .endm |
| |
| ## JMPTBL_ENTRY i |
| ## Emits one 16-bit table entry holding the distance from the label |
| ## crc_array to the label crc_<i>. |
| .macro JMPTBL_ENTRY i |
| .word crc_\i - crc_array |
| .endm |
| |
| ## JNC_LESS_THAN j |
| ## Emits a "jump if carry clear" to the label less_than_<j>. |
| .macro JNC_LESS_THAN j |
| jnc less_than_\j |
| .endm |
| |
| # Define threshold where buffers are considered "small" and routed to more |
| # efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so |
| # SMALL_SIZE can be no larger than 255. |
| |
| #define SMALL_SIZE 200 |
| |
| ## Assemble-time guard: fail the build if SMALL_SIZE is raised past the |
| ## limit described above. |
| .if (SMALL_SIZE > 255) |
| .error "SMALL_SIZE must be < 256" |
| .endif |
| |
| ##---------------------------------------------------------------------- |
| # unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); |
| ## |
| ## In:   %rdi = buffer, %esi = len (used as an unsigned byte count), |
| ##       %edx = crc_init |
| ## Out:  %eax = crc_init updated with the crc32 instruction over len |
| ##       bytes of buffer |
| ## Uses: %rax, %rcx, %rdx, %r8-%r11, %xmm0-%xmm2 and the flags as |
| ##       scratch; %rbx, %rdi and %rsi are pushed on entry and popped |
| ##       before returning. |
| ## |
| ## len and crc_init are 32-bit arguments, so bits 63:32 of %rsi and %rdx |
| ## are not guaranteed to be zero on entry. Both values are narrowed to |
| ## 32 bits right after the register saves, before any 64-bit use. |
| ##---------------------------------------------------------------------- |
| |
| .global crc_pcl |
| crc_pcl: |
| #define bufp %rdi |
| #define bufp_dw %edi |
| #define bufp_w %di |
| #define bufp_b %dil |
| #define bufptmp %rcx |
| #define block_0 %rcx |
| #define block_1 %rdx |
| #define block_2 %r11 |
| #define len %rsi |
| #define len_dw %esi |
| #define len_w %si |
| #define len_b %sil |
| #define crc_init_arg %rdx |
| #define crc_init_arg_dw %edx |
| #define tmp %rbx |
| #define crc_init %r8 |
| #define crc_init_dw %r8d |
| #define crc1 %r9 |
| #define crc2 %r10 |
| |
| pushq %rbx |
| pushq %rdi |
| pushq %rsi |
| |
| ## len is a 32-bit argument: writing len_dw clears bits 63:32 of len, so |
| ## the 64-bit compares and subtractions below see the real byte count |
| mov len_dw, len_dw |
| |
| ## Move crc_init for Linux to a different register (crc_init_arg shares |
| ## %rdx with block_1). The 32-bit move also zero-extends the value. |
| mov crc_init_arg_dw, crc_init_dw |
| |
| ################################################################ |
| ## 1) ALIGN: |
| ################################################################ |
| |
| mov bufp, bufptmp # bufptmp = buffer pointer |
| neg bufp |
| and $7, bufp # bufp = number of bytes up to the next |
| # 8-byte boundary |
| je proc_block # Skip if aligned |
| |
| ## If len is less than 8 and we are unaligned, we need to jump |
| ## to special code to avoid reading beyond the end of the buffer |
| cmp $8, len |
| jae do_align |
| # less_than_8 expects length in upper 3 bits of len_dw |
| # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] |
| shl $32-3+1, len_dw |
| jmp less_than_8_post_shl1 |
| |
| do_align: |
| #### Calculate CRC of unaligned bytes of the buffer (if any) |
| movq (bufptmp), tmp # load a quadword from the buffer |
| add bufp, bufptmp # align buffer pointer for quadword |
| # processing |
| sub bufp, len # update buffer length |
| align_loop: |
| crc32b %bl, crc_init_dw # compute crc32 of 1-byte |
| shr $8, tmp # get next byte |
| dec bufp |
| jne align_loop |
| |
| proc_block: |
| |
| ################################################################ |
| ## 2) PROCESS BLOCKS: |
| ################################################################ |
| |
| ## compute num of bytes to be processed |
| movq len, tmp # save num bytes in tmp |
| |
| cmpq $128*24, len |
| jae full_block |
| |
| continue_block: |
| cmpq $SMALL_SIZE, len |
| jb small |
| |
| ## len < 128*24 |
| movq $2731, %rax # 2731 = ceil(2^16 / 24) |
| mul len_dw |
| shrq $16, %rax |
| |
| ## eax contains floor(bytes / 24) = num 24-byte chunks to do |
| |
| ## process rax 24-byte chunks (128 >= rax >= 0) |
| |
| ## compute end address of each block |
| ## block 0 (base addr + RAX * 8) |
| ## block 1 (base addr + RAX * 16) |
| ## block 2 (base addr + RAX * 24) |
| lea (bufptmp, %rax, 8), block_0 |
| lea (block_0, %rax, 8), block_1 |
| lea (block_1, %rax, 8), block_2 |
| |
| xor crc1, crc1 |
| xor crc2, crc2 |
| |
| ## branch into array: jump_table[rax] holds the offset of crc_<rax> |
| ## from crc_array |
| lea jump_table(%rip), bufp |
| movzxw (bufp, %rax, 2), len |
| offset=crc_array-jump_table |
| lea offset(bufp, len, 1), bufp |
| jmp *bufp |
| |
| ################################################################ |
| ## 2a) PROCESS FULL BLOCKS: |
| ################################################################ |
| full_block: |
| movq $128,%rax |
| lea 128*8*2(block_0), block_1 |
| lea 128*8*3(block_0), block_2 |
| add $128*8*1, block_0 |
| |
| xor crc1,crc1 |
| xor crc2,crc2 |
| |
| # Fall through into top of crc array (crc_128) |
| |
| ################################################################ |
| ## 3) CRC Array: |
| ################################################################ |
| |
| crc_array: |
| i=128 |
| .rept 128-1 |
| .altmacro |
| LABEL crc_ %i |
| .noaltmacro |
| crc32q -i*8(block_0), crc_init |
| crc32q -i*8(block_1), crc1 |
| crc32q -i*8(block_2), crc2 |
| i=(i-1) |
| .endr |
| |
| .altmacro |
| LABEL crc_ %i |
| .noaltmacro |
| crc32q -i*8(block_0), crc_init |
| crc32q -i*8(block_1), crc1 |
| # SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet |
| |
| mov block_2, block_0 # bufptmp now points past block 2 |
| |
| ################################################################ |
| ## 4) Combine three results: |
| ################################################################ |
| |
| lea (K_table-16)(%rip), bufp # first entry is for idx 1 |
| shlq $3, %rax # rax *= 8 |
| subq %rax, tmp # tmp -= rax*8 |
| shlq $1, %rax |
| subq %rax, tmp # tmp -= rax*16 |
| # (total tmp -= rax*24) |
| addq %rax, bufp |
| |
| movdqa (bufp), %xmm0 # 2 consts: K1:K2 |
| |
| movq crc_init, %xmm1 # CRC for block 1 |
| pclmulqdq $0x00,%xmm0,%xmm1 # Multiply by K2 |
| |
| movq crc1, %xmm2 # CRC for block 2 |
| pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1 |
| |
| pxor %xmm2,%xmm1 |
| movq %xmm1, %rax |
| xor -i*8(block_2), %rax # fold in the quadword skipped above |
| mov crc2, crc_init |
| crc32 %rax, crc_init |
| |
| ################################################################ |
| ## 5) Check for end: |
| ################################################################ |
| |
| LABEL crc_ 0 |
| mov tmp, len |
| cmp $128*24, tmp |
| jae full_block |
| cmp $24, tmp |
| jae continue_block |
| |
| less_than_24: |
| shl $32-4, len_dw # less_than_16 expects length |
| # in upper 4 bits of len_dw |
| jnc less_than_16 |
| crc32q (bufptmp), crc_init |
| crc32q 8(bufptmp), crc_init |
| jz do_return # flags still come from the shl above |
| add $16, bufptmp |
| # len is less than 8 if we got here |
| # less_than_8 expects length in upper 3 bits of len_dw |
| # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] |
| shl $2, len_dw |
| jmp less_than_8_post_shl1 |
| |
| ####################################################################### |
| ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full) |
| ####################################################################### |
| small: |
| shl $32-8, len_dw # Prepare len_dw for less_than_256 |
| j=256 |
| .rept 5 # j = {256, 128, 64, 32, 16} |
| .altmacro |
| LABEL less_than_ %j # less_than_j: Length should be in |
| # upper lg(j) bits of len_dw |
| j=(j/2) |
| shl $1, len_dw # Get next MSB |
| JNC_LESS_THAN %j |
| .noaltmacro |
| i=0 |
| .rept (j/8) |
| crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data |
| i=i+8 |
| .endr |
| jz do_return # Return if remaining length is zero |
| add $j, bufptmp # Advance buf |
| .endr |
| |
| less_than_8: # Length should be stored in |
| # upper 3 bits of len_dw |
| shl $1, len_dw |
| less_than_8_post_shl1: |
| jnc less_than_4 |
| crc32l (bufptmp), crc_init_dw # CRC of 4 bytes |
| jz do_return # return if remaining data is zero |
| add $4, bufptmp |
| less_than_4: # Length should be stored in |
| # upper 2 bits of len_dw |
| shl $1, len_dw |
| jnc less_than_2 |
| crc32w (bufptmp), crc_init_dw # CRC of 2 bytes |
| jz do_return # return if remaining data is zero |
| add $2, bufptmp |
| less_than_2: # Length should be stored in the MSB |
| # of len_dw |
| shl $1, len_dw |
| jnc less_than_1 |
| crc32b (bufptmp), crc_init_dw # CRC of 1 byte |
| less_than_1: # Length should be zero |
| do_return: |
| movq crc_init, %rax # return value |
| popq %rsi |
| popq %rdi |
| popq %rbx |
| ret |
| |
| ################################################################ |
| ## Jump table: 129 entries x 2 bytes each. |
| ## Entry i (i = 0 .. 128) is emitted by JMPTBL_ENTRY and holds the |
| ## 16-bit offset of label crc_<i> from crc_array. crc_pcl indexes the |
| ## table with the number of 24-byte chunks to process. |
| ################################################################ |
| .align 4 |
| jump_table: |
| i=0 |
| .rept 129 |
| .altmacro |
| JMPTBL_ENTRY %i |
| .noaltmacro |
| i=i+1 |
| .endr |
| ################################################################ |
| ## PCLMULQDQ tables |
| ## Table is 128 entries x 2 quad words each |
| ## |
| ## The table is only ever read (crc_pcl loads one 16-byte entry with |
| ## movdqa), so it is placed in .rodata instead of the writable .data |
| ## section. The 64-byte alignment keeps every 16-byte entry aligned |
| ## as movdqa requires. |
| ################################################################ |
| .section .rodata |
| .align 64 |
| K_table: |
| .quad 0x14cd00bd6,0x105ec76f0 |
| .quad 0x0ba4fc28e,0x14cd00bd6 |
| .quad 0x1d82c63da,0x0f20c0dfe |
| .quad 0x09e4addf8,0x0ba4fc28e |
| .quad 0x039d3b296,0x1384aa63a |
| .quad 0x102f9b8a2,0x1d82c63da |
| .quad 0x14237f5e6,0x01c291d04 |
| .quad 0x00d3b6092,0x09e4addf8 |
| .quad 0x0c96cfdc0,0x0740eef02 |
| .quad 0x18266e456,0x039d3b296 |
| .quad 0x0daece73e,0x0083a6eec |
| .quad 0x0ab7aff2a,0x102f9b8a2 |
| .quad 0x1248ea574,0x1c1733996 |
| .quad 0x083348832,0x14237f5e6 |
| .quad 0x12c743124,0x02ad91c30 |
| .quad 0x0b9e02b86,0x00d3b6092 |
| .quad 0x018b33a4e,0x06992cea2 |
| .quad 0x1b331e26a,0x0c96cfdc0 |
| .quad 0x17d35ba46,0x07e908048 |
| .quad 0x1bf2e8b8a,0x18266e456 |
| .quad 0x1a3e0968a,0x11ed1f9d8 |
| .quad 0x0ce7f39f4,0x0daece73e |
| .quad 0x061d82e56,0x0f1d0f55e |
| .quad 0x0d270f1a2,0x0ab7aff2a |
| .quad 0x1c3f5f66c,0x0a87ab8a8 |
| .quad 0x12ed0daac,0x1248ea574 |
| .quad 0x065863b64,0x08462d800 |
| .quad 0x11eef4f8e,0x083348832 |
| .quad 0x1ee54f54c,0x071d111a8 |
| .quad 0x0b3e32c28,0x12c743124 |
| .quad 0x0064f7f26,0x0ffd852c6 |
| .quad 0x0dd7e3b0c,0x0b9e02b86 |
| .quad 0x0f285651c,0x0dcb17aa4 |
| .quad 0x010746f3c,0x018b33a4e |
| .quad 0x1c24afea4,0x0f37c5aee |
| .quad 0x0271d9844,0x1b331e26a |
| .quad 0x08e766a0c,0x06051d5a2 |
| .quad 0x093a5f730,0x17d35ba46 |
| .quad 0x06cb08e5c,0x11d5ca20e |
| .quad 0x06b749fb2,0x1bf2e8b8a |
| .quad 0x1167f94f2,0x021f3d99c |
| .quad 0x0cec3662e,0x1a3e0968a |
| .quad 0x19329634a,0x08f158014 |
| .quad 0x0e6fc4e6a,0x0ce7f39f4 |
| .quad 0x08227bb8a,0x1a5e82106 |
| .quad 0x0b0cd4768,0x061d82e56 |
| .quad 0x13c2b89c4,0x188815ab2 |
| .quad 0x0d7a4825c,0x0d270f1a2 |
| .quad 0x10f5ff2ba,0x105405f3e |
| .quad 0x00167d312,0x1c3f5f66c |
| .quad 0x0f6076544,0x0e9adf796 |
| .quad 0x026f6a60a,0x12ed0daac |
| .quad 0x1a2adb74e,0x096638b34 |
| .quad 0x19d34af3a,0x065863b64 |
| .quad 0x049c3cc9c,0x1e50585a0 |
| .quad 0x068bce87a,0x11eef4f8e |
| .quad 0x1524fa6c6,0x19f1c69dc |
| .quad 0x16cba8aca,0x1ee54f54c |
| .quad 0x042d98888,0x12913343e |
| .quad 0x1329d9f7e,0x0b3e32c28 |
| .quad 0x1b1c69528,0x088f25a3a |
| .quad 0x02178513a,0x0064f7f26 |
| .quad 0x0e0ac139e,0x04e36f0b0 |
| .quad 0x0170076fa,0x0dd7e3b0c |
| .quad 0x141a1a2e2,0x0bd6f81f8 |
| .quad 0x16ad828b4,0x0f285651c |
| .quad 0x041d17b64,0x19425cbba |
| .quad 0x1fae1cc66,0x010746f3c |
| .quad 0x1a75b4b00,0x18db37e8a |
| .quad 0x0f872e54c,0x1c24afea4 |
| .quad 0x01e41e9fc,0x04c144932 |
| .quad 0x086d8e4d2,0x0271d9844 |
| .quad 0x160f7af7a,0x052148f02 |
| .quad 0x05bb8f1bc,0x08e766a0c |
| .quad 0x0a90fd27a,0x0a3c6f37a |
| .quad 0x0b3af077a,0x093a5f730 |
| .quad 0x04984d782,0x1d22c238e |
| .quad 0x0ca6ef3ac,0x06cb08e5c |
| .quad 0x0234e0b26,0x063ded06a |
| .quad 0x1d88abd4a,0x06b749fb2 |
| .quad 0x04597456a,0x04d56973c |
| .quad 0x0e9e28eb4,0x1167f94f2 |
| .quad 0x07b3ff57a,0x19385bf2e |
| .quad 0x0c9c8b782,0x0cec3662e |
| .quad 0x13a9cba9e,0x0e417f38a |
| .quad 0x093e106a4,0x19329634a |
| .quad 0x167001a9c,0x14e727980 |
| .quad 0x1ddffc5d4,0x0e6fc4e6a |
| .quad 0x00df04680,0x0d104b8fc |
| .quad 0x02342001e,0x08227bb8a |
| .quad 0x00a2a8d7e,0x05b397730 |
| .quad 0x168763fa6,0x0b0cd4768 |
| .quad 0x1ed5a407a,0x0e78eb416 |
| .quad 0x0d2c3ed1a,0x13c2b89c4 |
| .quad 0x0995a5724,0x1641378f0 |
| .quad 0x19b1afbc4,0x0d7a4825c |
| .quad 0x109ffedc0,0x08d96551c |
| .quad 0x0f2271e60,0x10f5ff2ba |
| .quad 0x00b0bf8ca,0x00bf80dd2 |
| .quad 0x123888b7a,0x00167d312 |
| .quad 0x1e888f7dc,0x18dcddd1c |
| .quad 0x002ee03b2,0x0f6076544 |
| .quad 0x183e8d8fe,0x06a45d2b2 |
| .quad 0x133d7a042,0x026f6a60a |
| .quad 0x116b0f50c,0x1dd3e10e8 |
| .quad 0x05fabe670,0x1a2adb74e |
| .quad 0x130004488,0x0de87806c |
| .quad 0x000bcf5f6,0x19d34af3a |
| .quad 0x18f0c7078,0x014338754 |
| .quad 0x017f27698,0x049c3cc9c |
| .quad 0x058ca5f00,0x15e3e77ee |
| .quad 0x1af900c24,0x068bce87a |
| .quad 0x0b5cfca28,0x0dd07448e |
| .quad 0x0ded288f8,0x1524fa6c6 |
| .quad 0x059f229bc,0x1d8048348 |
| .quad 0x06d390dec,0x16cba8aca |
| .quad 0x037170390,0x0a3e3e02c |
| .quad 0x06353c1cc,0x042d98888 |
| .quad 0x0c4584f5c,0x0d73c7bea |
| .quad 0x1f16a3418,0x1329d9f7e |
| .quad 0x0531377e2,0x185137662 |
| .quad 0x1d8d9ca7c,0x1b1c69528 |
| .quad 0x0b25b29f2,0x18a08b5bc |
| .quad 0x19fb2a8b0,0x02178513a |
| .quad 0x1a08fe6ac,0x1da758ae0 |
| .quad 0x045cddf4e,0x0e0ac139e |
| .quad 0x1a91647f2,0x169cf9eb0 |
| .quad 0x1a0f717c4,0x0170076fa |