// Source: mirror of https://github.com/superseriousbusiness/gotosocial.git
// (synced 2024-05-18 16:28:07 +00:00; commit 07207e71e9:
// "update go-structr => v0.2.0; update readme; whoops, fix the link").
// File stats from the mirror page: 587 lines, 13 KiB. Note: the mirror's
// "ArmAsm" language detection is wrong — this is Go (Plan 9 style) AMD64
// assembly using AVX2.
// Code generated by command: go run gen.go -avx -out ../accum_vector_avx_amd64.s -pkg xxh3. DO NOT EDIT.

#include "textflag.h"

// prime_avx is XXH_PRIME32_1 (0x9E3779B1) zero-extended to 64 bits and
// replicated across all four quadwords of a 256-bit register. It is the
// multiplier used by the VPMULUDQ-based accumulator scramble step.
DATA  prime_avx<>+0(SB)/8, $0x000000009e3779b1
DATA  prime_avx<>+8(SB)/8, $0x000000009e3779b1
DATA  prime_avx<>+16(SB)/8, $0x000000009e3779b1
DATA  prime_avx<>+24(SB)/8, $0x000000009e3779b1
GLOBL prime_avx<>(SB), RODATA|NOPTR, $32
// func accumAVX2(acc *[8]uint64, data *byte, key *byte, len uint64)
// Requires: AVX, AVX2, MMX+
//
// xxh3 accumulation over an arbitrary number of 64-byte stripes.
// Register roles:
//   AX = acc pointer; CX = data cursor; DX = key base (1024-byte block loop);
//   BX = key cursor (advances 8 bytes per stripe in the tail loop);
//   SI = bytes remaining.
//   Y1/Y2 = the 8 uint64 accumulator lanes; Y0 = PRIME32_1 broadcast.
// Per stripe: acc += swap64(data) ; acc += lo32(data^key) * hi32(data^key).
TEXT ·accumAVX2(SB), NOSPLIT, $0-32
	MOVQ acc+0(FP), AX
	MOVQ data+8(FP), CX
	MOVQ key+16(FP), DX
	MOVQ key+16(FP), BX
	MOVQ len+24(FP), SI

	// Load running accumulators and the scramble prime.
	VMOVDQU (AX), Y1
	VMOVDQU 32(AX), Y2
	VMOVDQU prime_avx<>+0(SB), Y0

	// Full-block loop: while more than 1024 bytes remain, consume 16
	// unrolled 64-byte stripes, then scramble the accumulators.
accum_large:
	CMPQ SI, $0x00000400
	JLE  accum

	// stripe 0: data+0, key+0
	VMOVDQU    (CX), Y3
	VMOVDQU    32(CX), Y6
	PREFETCHT0 512(CX)
	VPXOR      (DX), Y3, Y4
	VPXOR      32(DX), Y6, Y7
	VPSHUFD    $0x31, Y4, Y5
	VPSHUFD    $0x31, Y7, Y8
	VPMULUDQ   Y4, Y5, Y4
	VPMULUDQ   Y7, Y8, Y7
	VPSHUFD    $0x4e, Y3, Y3
	VPSHUFD    $0x4e, Y6, Y6
	VPADDQ     Y1, Y3, Y1
	VPADDQ     Y1, Y4, Y1
	VPADDQ     Y2, Y6, Y2
	VPADDQ     Y2, Y7, Y2

	// stripe 1: data+64, key+8
	VMOVDQU    64(CX), Y3
	VMOVDQU    96(CX), Y6
	PREFETCHT0 576(CX)
	VPXOR      8(DX), Y3, Y4
	VPXOR      40(DX), Y6, Y7
	VPSHUFD    $0x31, Y4, Y5
	VPSHUFD    $0x31, Y7, Y8
	VPMULUDQ   Y4, Y5, Y4
	VPMULUDQ   Y7, Y8, Y7
	VPSHUFD    $0x4e, Y3, Y3
	VPSHUFD    $0x4e, Y6, Y6
	VPADDQ     Y1, Y3, Y1
	VPADDQ     Y1, Y4, Y1
	VPADDQ     Y2, Y6, Y2
	VPADDQ     Y2, Y7, Y2

	// stripe 2: data+128, key+16
	VMOVDQU    128(CX), Y3
	VMOVDQU    160(CX), Y6
	PREFETCHT0 640(CX)
	VPXOR      16(DX), Y3, Y4
	VPXOR      48(DX), Y6, Y7
	VPSHUFD    $0x31, Y4, Y5
	VPSHUFD    $0x31, Y7, Y8
	VPMULUDQ   Y4, Y5, Y4
	VPMULUDQ   Y7, Y8, Y7
	VPSHUFD    $0x4e, Y3, Y3
	VPSHUFD    $0x4e, Y6, Y6
	VPADDQ     Y1, Y3, Y1
	VPADDQ     Y1, Y4, Y1
	VPADDQ     Y2, Y6, Y2
	VPADDQ     Y2, Y7, Y2

	// stripe 3: data+192, key+24
	VMOVDQU    192(CX), Y3
	VMOVDQU    224(CX), Y6
	PREFETCHT0 704(CX)
	VPXOR      24(DX), Y3, Y4
	VPXOR      56(DX), Y6, Y7
	VPSHUFD    $0x31, Y4, Y5
	VPSHUFD    $0x31, Y7, Y8
	VPMULUDQ   Y4, Y5, Y4
	VPMULUDQ   Y7, Y8, Y7
	VPSHUFD    $0x4e, Y3, Y3
	VPSHUFD    $0x4e, Y6, Y6
	VPADDQ     Y1, Y3, Y1
	VPADDQ     Y1, Y4, Y1
	VPADDQ     Y2, Y6, Y2
	VPADDQ     Y2, Y7, Y2

	// stripe 4: data+256, key+32
	VMOVDQU    256(CX), Y3
	VMOVDQU    288(CX), Y6
	PREFETCHT0 768(CX)
	VPXOR      32(DX), Y3, Y4
	VPXOR      64(DX), Y6, Y7
	VPSHUFD    $0x31, Y4, Y5
	VPSHUFD    $0x31, Y7, Y8
	VPMULUDQ   Y4, Y5, Y4
	VPMULUDQ   Y7, Y8, Y7
	VPSHUFD    $0x4e, Y3, Y3
	VPSHUFD    $0x4e, Y6, Y6
	VPADDQ     Y1, Y3, Y1
	VPADDQ     Y1, Y4, Y1
	VPADDQ     Y2, Y6, Y2
	VPADDQ     Y2, Y7, Y2

	// stripe 5: data+320, key+40
	VMOVDQU    320(CX), Y3
	VMOVDQU    352(CX), Y6
	PREFETCHT0 832(CX)
	VPXOR      40(DX), Y3, Y4
	VPXOR      72(DX), Y6, Y7
	VPSHUFD    $0x31, Y4, Y5
	VPSHUFD    $0x31, Y7, Y8
	VPMULUDQ   Y4, Y5, Y4
	VPMULUDQ   Y7, Y8, Y7
	VPSHUFD    $0x4e, Y3, Y3
	VPSHUFD    $0x4e, Y6, Y6
	VPADDQ     Y1, Y3, Y1
	VPADDQ     Y1, Y4, Y1
	VPADDQ     Y2, Y6, Y2
	VPADDQ     Y2, Y7, Y2

	// stripe 6: data+384, key+48
	VMOVDQU    384(CX), Y3
	VMOVDQU    416(CX), Y6
	PREFETCHT0 896(CX)
	VPXOR      48(DX), Y3, Y4
	VPXOR      80(DX), Y6, Y7
	VPSHUFD    $0x31, Y4, Y5
	VPSHUFD    $0x31, Y7, Y8
	VPMULUDQ   Y4, Y5, Y4
	VPMULUDQ   Y7, Y8, Y7
	VPSHUFD    $0x4e, Y3, Y3
	VPSHUFD    $0x4e, Y6, Y6
	VPADDQ     Y1, Y3, Y1
	VPADDQ     Y1, Y4, Y1
	VPADDQ     Y2, Y6, Y2
	VPADDQ     Y2, Y7, Y2

	// stripe 7: data+448, key+56
	VMOVDQU    448(CX), Y3
	VMOVDQU    480(CX), Y6
	PREFETCHT0 960(CX)
	VPXOR      56(DX), Y3, Y4
	VPXOR      88(DX), Y6, Y7
	VPSHUFD    $0x31, Y4, Y5
	VPSHUFD    $0x31, Y7, Y8
	VPMULUDQ   Y4, Y5, Y4
	VPMULUDQ   Y7, Y8, Y7
	VPSHUFD    $0x4e, Y3, Y3
	VPSHUFD    $0x4e, Y6, Y6
	VPADDQ     Y1, Y3, Y1
	VPADDQ     Y1, Y4, Y1
	VPADDQ     Y2, Y6, Y2
	VPADDQ     Y2, Y7, Y2

	// stripe 8: data+512, key+64
	VMOVDQU    512(CX), Y3
	VMOVDQU    544(CX), Y6
	PREFETCHT0 1024(CX)
	VPXOR      64(DX), Y3, Y4
	VPXOR      96(DX), Y6, Y7
	VPSHUFD    $0x31, Y4, Y5
	VPSHUFD    $0x31, Y7, Y8
	VPMULUDQ   Y4, Y5, Y4
	VPMULUDQ   Y7, Y8, Y7
	VPSHUFD    $0x4e, Y3, Y3
	VPSHUFD    $0x4e, Y6, Y6
	VPADDQ     Y1, Y3, Y1
	VPADDQ     Y1, Y4, Y1
	VPADDQ     Y2, Y6, Y2
	VPADDQ     Y2, Y7, Y2

	// stripe 9: data+576, key+72
	VMOVDQU    576(CX), Y3
	VMOVDQU    608(CX), Y6
	PREFETCHT0 1088(CX)
	VPXOR      72(DX), Y3, Y4
	VPXOR      104(DX), Y6, Y7
	VPSHUFD    $0x31, Y4, Y5
	VPSHUFD    $0x31, Y7, Y8
	VPMULUDQ   Y4, Y5, Y4
	VPMULUDQ   Y7, Y8, Y7
	VPSHUFD    $0x4e, Y3, Y3
	VPSHUFD    $0x4e, Y6, Y6
	VPADDQ     Y1, Y3, Y1
	VPADDQ     Y1, Y4, Y1
	VPADDQ     Y2, Y6, Y2
	VPADDQ     Y2, Y7, Y2

	// stripe 10: data+640, key+80
	VMOVDQU    640(CX), Y3
	VMOVDQU    672(CX), Y6
	PREFETCHT0 1152(CX)
	VPXOR      80(DX), Y3, Y4
	VPXOR      112(DX), Y6, Y7
	VPSHUFD    $0x31, Y4, Y5
	VPSHUFD    $0x31, Y7, Y8
	VPMULUDQ   Y4, Y5, Y4
	VPMULUDQ   Y7, Y8, Y7
	VPSHUFD    $0x4e, Y3, Y3
	VPSHUFD    $0x4e, Y6, Y6
	VPADDQ     Y1, Y3, Y1
	VPADDQ     Y1, Y4, Y1
	VPADDQ     Y2, Y6, Y2
	VPADDQ     Y2, Y7, Y2

	// stripe 11: data+704, key+88
	VMOVDQU    704(CX), Y3
	VMOVDQU    736(CX), Y6
	PREFETCHT0 1216(CX)
	VPXOR      88(DX), Y3, Y4
	VPXOR      120(DX), Y6, Y7
	VPSHUFD    $0x31, Y4, Y5
	VPSHUFD    $0x31, Y7, Y8
	VPMULUDQ   Y4, Y5, Y4
	VPMULUDQ   Y7, Y8, Y7
	VPSHUFD    $0x4e, Y3, Y3
	VPSHUFD    $0x4e, Y6, Y6
	VPADDQ     Y1, Y3, Y1
	VPADDQ     Y1, Y4, Y1
	VPADDQ     Y2, Y6, Y2
	VPADDQ     Y2, Y7, Y2

	// stripe 12: data+768, key+96
	VMOVDQU    768(CX), Y3
	VMOVDQU    800(CX), Y6
	PREFETCHT0 1280(CX)
	VPXOR      96(DX), Y3, Y4
	VPXOR      128(DX), Y6, Y7
	VPSHUFD    $0x31, Y4, Y5
	VPSHUFD    $0x31, Y7, Y8
	VPMULUDQ   Y4, Y5, Y4
	VPMULUDQ   Y7, Y8, Y7
	VPSHUFD    $0x4e, Y3, Y3
	VPSHUFD    $0x4e, Y6, Y6
	VPADDQ     Y1, Y3, Y1
	VPADDQ     Y1, Y4, Y1
	VPADDQ     Y2, Y6, Y2
	VPADDQ     Y2, Y7, Y2

	// stripe 13: data+832, key+104
	VMOVDQU    832(CX), Y3
	VMOVDQU    864(CX), Y6
	PREFETCHT0 1344(CX)
	VPXOR      104(DX), Y3, Y4
	VPXOR      136(DX), Y6, Y7
	VPSHUFD    $0x31, Y4, Y5
	VPSHUFD    $0x31, Y7, Y8
	VPMULUDQ   Y4, Y5, Y4
	VPMULUDQ   Y7, Y8, Y7
	VPSHUFD    $0x4e, Y3, Y3
	VPSHUFD    $0x4e, Y6, Y6
	VPADDQ     Y1, Y3, Y1
	VPADDQ     Y1, Y4, Y1
	VPADDQ     Y2, Y6, Y2
	VPADDQ     Y2, Y7, Y2

	// stripe 14: data+896, key+112
	VMOVDQU    896(CX), Y3
	VMOVDQU    928(CX), Y6
	PREFETCHT0 1408(CX)
	VPXOR      112(DX), Y3, Y4
	VPXOR      144(DX), Y6, Y7
	VPSHUFD    $0x31, Y4, Y5
	VPSHUFD    $0x31, Y7, Y8
	VPMULUDQ   Y4, Y5, Y4
	VPMULUDQ   Y7, Y8, Y7
	VPSHUFD    $0x4e, Y3, Y3
	VPSHUFD    $0x4e, Y6, Y6
	VPADDQ     Y1, Y3, Y1
	VPADDQ     Y1, Y4, Y1
	VPADDQ     Y2, Y6, Y2
	VPADDQ     Y2, Y7, Y2

	// stripe 15: data+960, key+120
	VMOVDQU    960(CX), Y3
	VMOVDQU    992(CX), Y6
	PREFETCHT0 1472(CX)
	VPXOR      120(DX), Y3, Y4
	VPXOR      152(DX), Y6, Y7
	VPSHUFD    $0x31, Y4, Y5
	VPSHUFD    $0x31, Y7, Y8
	VPMULUDQ   Y4, Y5, Y4
	VPMULUDQ   Y7, Y8, Y7
	VPSHUFD    $0x4e, Y3, Y3
	VPSHUFD    $0x4e, Y6, Y6
	VPADDQ     Y1, Y3, Y1
	VPADDQ     Y1, Y4, Y1
	VPADDQ     Y2, Y6, Y2
	VPADDQ     Y2, Y7, Y2

	// Advance past the 1024-byte block.
	ADDQ $0x00000400, CX
	SUBQ $0x00000400, SI

	// Scramble: acc = ((acc ^ (acc >> 47)) ^ key[128..]) * PRIME32_1,
	// with the 64x64 multiply synthesized from 32-bit VPMULUDQ halves.
	VPSRLQ   $0x2f, Y1, Y3
	VPXOR    Y1, Y3, Y3
	VPXOR    128(DX), Y3, Y3
	VPMULUDQ Y0, Y3, Y1
	VPSHUFD  $0xf5, Y3, Y3
	VPMULUDQ Y0, Y3, Y3
	VPSLLQ   $0x20, Y3, Y3
	VPADDQ   Y1, Y3, Y1
	VPSRLQ   $0x2f, Y2, Y3
	VPXOR    Y2, Y3, Y3
	VPXOR    160(DX), Y3, Y3
	VPMULUDQ Y0, Y3, Y2
	VPSHUFD  $0xf5, Y3, Y3
	VPMULUDQ Y0, Y3, Y3
	VPSLLQ   $0x20, Y3, Y3
	VPADDQ   Y2, Y3, Y2
	JMP      accum_large

	// Tail loop: one 64-byte stripe at a time while more than 64 bytes
	// remain; BX walks the key 8 bytes per stripe.
accum:
	CMPQ    SI, $0x40
	JLE     finalize
	VMOVDQU (CX), Y0
	VMOVDQU 32(CX), Y5
	VPXOR   (BX), Y0, Y3
	VPXOR   32(BX), Y5, Y6
	VPSHUFD $0x31, Y3, Y4
	VPSHUFD $0x31, Y6, Y7
	VPMULUDQ Y3, Y4, Y3
	VPMULUDQ Y6, Y7, Y6
	VPSHUFD $0x4e, Y0, Y0
	VPSHUFD $0x4e, Y5, Y5
	VPADDQ  Y1, Y0, Y1
	VPADDQ  Y1, Y3, Y1
	VPADDQ  Y2, Y5, Y2
	VPADDQ  Y2, Y6, Y2
	ADDQ    $0x00000040, CX
	SUBQ    $0x00000040, SI
	ADDQ    $0x00000008, BX
	JMP     accum

	// Final partial stripe: re-read the LAST 64 bytes of the input
	// (overlapping the previous stripe) against key offsets 121/153.
finalize:
	CMPQ    SI, $0x00
	JE      return
	SUBQ    $0x40, CX
	ADDQ    SI, CX
	VMOVDQU (CX), Y0
	VMOVDQU 32(CX), Y5
	VPXOR   121(DX), Y0, Y3
	VPXOR   153(DX), Y5, Y6
	VPSHUFD $0x31, Y3, Y4
	VPSHUFD $0x31, Y6, Y7
	VPMULUDQ Y3, Y4, Y3
	VPMULUDQ Y6, Y7, Y6
	VPSHUFD $0x4e, Y0, Y0
	VPSHUFD $0x4e, Y5, Y5
	VPADDQ  Y1, Y0, Y1
	VPADDQ  Y1, Y3, Y1
	VPADDQ  Y2, Y5, Y2
	VPADDQ  Y2, Y6, Y2

	// Write back accumulators; VZEROUPPER before returning to non-AVX code.
return:
	VMOVDQU Y1, (AX)
	VMOVDQU Y2, 32(AX)
	VZEROUPPER
	RET
// func accumBlockAVX2(acc *[8]uint64, data *byte, key *byte)
// Requires: AVX, AVX2
//
// xxh3 accumulation of exactly one 1024-byte block (16 fully unrolled
// 64-byte stripes) followed by the accumulator scramble.
// Register roles:
//   AX = acc pointer; CX = data pointer; DX = key pointer.
//   Y1/Y2 = the 8 uint64 accumulator lanes; Y0 = PRIME32_1 broadcast.
// Per stripe: acc += swap64(data) ; acc += lo32(data^key) * hi32(data^key).
TEXT ·accumBlockAVX2(SB), NOSPLIT, $0-24
	MOVQ acc+0(FP), AX
	MOVQ data+8(FP), CX
	MOVQ key+16(FP), DX

	// Load running accumulators and the scramble prime.
	VMOVDQU (AX), Y1
	VMOVDQU 32(AX), Y2
	VMOVDQU prime_avx<>+0(SB), Y0

	// stripe 0: data+0, key+0
	VMOVDQU  (CX), Y3
	VMOVDQU  32(CX), Y6
	VPXOR    (DX), Y3, Y4
	VPXOR    32(DX), Y6, Y7
	VPSHUFD  $0x31, Y4, Y5
	VPSHUFD  $0x31, Y7, Y8
	VPMULUDQ Y4, Y5, Y4
	VPMULUDQ Y7, Y8, Y7
	VPSHUFD  $0x4e, Y3, Y3
	VPSHUFD  $0x4e, Y6, Y6
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y4, Y1
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y7, Y2

	// stripe 1: data+64, key+8
	VMOVDQU  64(CX), Y3
	VMOVDQU  96(CX), Y6
	VPXOR    8(DX), Y3, Y4
	VPXOR    40(DX), Y6, Y7
	VPSHUFD  $0x31, Y4, Y5
	VPSHUFD  $0x31, Y7, Y8
	VPMULUDQ Y4, Y5, Y4
	VPMULUDQ Y7, Y8, Y7
	VPSHUFD  $0x4e, Y3, Y3
	VPSHUFD  $0x4e, Y6, Y6
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y4, Y1
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y7, Y2

	// stripe 2: data+128, key+16
	VMOVDQU  128(CX), Y3
	VMOVDQU  160(CX), Y6
	VPXOR    16(DX), Y3, Y4
	VPXOR    48(DX), Y6, Y7
	VPSHUFD  $0x31, Y4, Y5
	VPSHUFD  $0x31, Y7, Y8
	VPMULUDQ Y4, Y5, Y4
	VPMULUDQ Y7, Y8, Y7
	VPSHUFD  $0x4e, Y3, Y3
	VPSHUFD  $0x4e, Y6, Y6
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y4, Y1
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y7, Y2

	// stripe 3: data+192, key+24
	VMOVDQU  192(CX), Y3
	VMOVDQU  224(CX), Y6
	VPXOR    24(DX), Y3, Y4
	VPXOR    56(DX), Y6, Y7
	VPSHUFD  $0x31, Y4, Y5
	VPSHUFD  $0x31, Y7, Y8
	VPMULUDQ Y4, Y5, Y4
	VPMULUDQ Y7, Y8, Y7
	VPSHUFD  $0x4e, Y3, Y3
	VPSHUFD  $0x4e, Y6, Y6
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y4, Y1
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y7, Y2

	// stripe 4: data+256, key+32
	VMOVDQU  256(CX), Y3
	VMOVDQU  288(CX), Y6
	VPXOR    32(DX), Y3, Y4
	VPXOR    64(DX), Y6, Y7
	VPSHUFD  $0x31, Y4, Y5
	VPSHUFD  $0x31, Y7, Y8
	VPMULUDQ Y4, Y5, Y4
	VPMULUDQ Y7, Y8, Y7
	VPSHUFD  $0x4e, Y3, Y3
	VPSHUFD  $0x4e, Y6, Y6
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y4, Y1
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y7, Y2

	// stripe 5: data+320, key+40
	VMOVDQU  320(CX), Y3
	VMOVDQU  352(CX), Y6
	VPXOR    40(DX), Y3, Y4
	VPXOR    72(DX), Y6, Y7
	VPSHUFD  $0x31, Y4, Y5
	VPSHUFD  $0x31, Y7, Y8
	VPMULUDQ Y4, Y5, Y4
	VPMULUDQ Y7, Y8, Y7
	VPSHUFD  $0x4e, Y3, Y3
	VPSHUFD  $0x4e, Y6, Y6
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y4, Y1
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y7, Y2

	// stripe 6: data+384, key+48
	VMOVDQU  384(CX), Y3
	VMOVDQU  416(CX), Y6
	VPXOR    48(DX), Y3, Y4
	VPXOR    80(DX), Y6, Y7
	VPSHUFD  $0x31, Y4, Y5
	VPSHUFD  $0x31, Y7, Y8
	VPMULUDQ Y4, Y5, Y4
	VPMULUDQ Y7, Y8, Y7
	VPSHUFD  $0x4e, Y3, Y3
	VPSHUFD  $0x4e, Y6, Y6
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y4, Y1
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y7, Y2

	// stripe 7: data+448, key+56
	VMOVDQU  448(CX), Y3
	VMOVDQU  480(CX), Y6
	VPXOR    56(DX), Y3, Y4
	VPXOR    88(DX), Y6, Y7
	VPSHUFD  $0x31, Y4, Y5
	VPSHUFD  $0x31, Y7, Y8
	VPMULUDQ Y4, Y5, Y4
	VPMULUDQ Y7, Y8, Y7
	VPSHUFD  $0x4e, Y3, Y3
	VPSHUFD  $0x4e, Y6, Y6
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y4, Y1
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y7, Y2

	// stripe 8: data+512, key+64
	VMOVDQU  512(CX), Y3
	VMOVDQU  544(CX), Y6
	VPXOR    64(DX), Y3, Y4
	VPXOR    96(DX), Y6, Y7
	VPSHUFD  $0x31, Y4, Y5
	VPSHUFD  $0x31, Y7, Y8
	VPMULUDQ Y4, Y5, Y4
	VPMULUDQ Y7, Y8, Y7
	VPSHUFD  $0x4e, Y3, Y3
	VPSHUFD  $0x4e, Y6, Y6
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y4, Y1
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y7, Y2

	// stripe 9: data+576, key+72
	VMOVDQU  576(CX), Y3
	VMOVDQU  608(CX), Y6
	VPXOR    72(DX), Y3, Y4
	VPXOR    104(DX), Y6, Y7
	VPSHUFD  $0x31, Y4, Y5
	VPSHUFD  $0x31, Y7, Y8
	VPMULUDQ Y4, Y5, Y4
	VPMULUDQ Y7, Y8, Y7
	VPSHUFD  $0x4e, Y3, Y3
	VPSHUFD  $0x4e, Y6, Y6
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y4, Y1
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y7, Y2

	// stripe 10: data+640, key+80
	VMOVDQU  640(CX), Y3
	VMOVDQU  672(CX), Y6
	VPXOR    80(DX), Y3, Y4
	VPXOR    112(DX), Y6, Y7
	VPSHUFD  $0x31, Y4, Y5
	VPSHUFD  $0x31, Y7, Y8
	VPMULUDQ Y4, Y5, Y4
	VPMULUDQ Y7, Y8, Y7
	VPSHUFD  $0x4e, Y3, Y3
	VPSHUFD  $0x4e, Y6, Y6
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y4, Y1
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y7, Y2

	// stripe 11: data+704, key+88
	VMOVDQU  704(CX), Y3
	VMOVDQU  736(CX), Y6
	VPXOR    88(DX), Y3, Y4
	VPXOR    120(DX), Y6, Y7
	VPSHUFD  $0x31, Y4, Y5
	VPSHUFD  $0x31, Y7, Y8
	VPMULUDQ Y4, Y5, Y4
	VPMULUDQ Y7, Y8, Y7
	VPSHUFD  $0x4e, Y3, Y3
	VPSHUFD  $0x4e, Y6, Y6
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y4, Y1
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y7, Y2

	// stripe 12: data+768, key+96
	VMOVDQU  768(CX), Y3
	VMOVDQU  800(CX), Y6
	VPXOR    96(DX), Y3, Y4
	VPXOR    128(DX), Y6, Y7
	VPSHUFD  $0x31, Y4, Y5
	VPSHUFD  $0x31, Y7, Y8
	VPMULUDQ Y4, Y5, Y4
	VPMULUDQ Y7, Y8, Y7
	VPSHUFD  $0x4e, Y3, Y3
	VPSHUFD  $0x4e, Y6, Y6
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y4, Y1
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y7, Y2

	// stripe 13: data+832, key+104
	VMOVDQU  832(CX), Y3
	VMOVDQU  864(CX), Y6
	VPXOR    104(DX), Y3, Y4
	VPXOR    136(DX), Y6, Y7
	VPSHUFD  $0x31, Y4, Y5
	VPSHUFD  $0x31, Y7, Y8
	VPMULUDQ Y4, Y5, Y4
	VPMULUDQ Y7, Y8, Y7
	VPSHUFD  $0x4e, Y3, Y3
	VPSHUFD  $0x4e, Y6, Y6
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y4, Y1
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y7, Y2

	// stripe 14: data+896, key+112
	VMOVDQU  896(CX), Y3
	VMOVDQU  928(CX), Y6
	VPXOR    112(DX), Y3, Y4
	VPXOR    144(DX), Y6, Y7
	VPSHUFD  $0x31, Y4, Y5
	VPSHUFD  $0x31, Y7, Y8
	VPMULUDQ Y4, Y5, Y4
	VPMULUDQ Y7, Y8, Y7
	VPSHUFD  $0x4e, Y3, Y3
	VPSHUFD  $0x4e, Y6, Y6
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y4, Y1
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y7, Y2

	// stripe 15: data+960, key+120
	VMOVDQU  960(CX), Y3
	VMOVDQU  992(CX), Y6
	VPXOR    120(DX), Y3, Y4
	VPXOR    152(DX), Y6, Y7
	VPSHUFD  $0x31, Y4, Y5
	VPSHUFD  $0x31, Y7, Y8
	VPMULUDQ Y4, Y5, Y4
	VPMULUDQ Y7, Y8, Y7
	VPSHUFD  $0x4e, Y3, Y3
	VPSHUFD  $0x4e, Y6, Y6
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y4, Y1
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y7, Y2

	// Scramble: acc = ((acc ^ (acc >> 47)) ^ key[128..]) * PRIME32_1,
	// with the 64x64 multiply synthesized from 32-bit VPMULUDQ halves.
	VPSRLQ   $0x2f, Y1, Y3
	VPXOR    Y1, Y3, Y3
	VPXOR    128(DX), Y3, Y3
	VPMULUDQ Y0, Y3, Y1
	VPSHUFD  $0xf5, Y3, Y3
	VPMULUDQ Y0, Y3, Y3
	VPSLLQ   $0x20, Y3, Y3
	VPADDQ   Y1, Y3, Y1
	VPSRLQ   $0x2f, Y2, Y3
	VPXOR    Y2, Y3, Y3
	VPXOR    160(DX), Y3, Y3
	VPMULUDQ Y0, Y3, Y2
	VPSHUFD  $0xf5, Y3, Y3
	VPMULUDQ Y0, Y3, Y3
	VPSLLQ   $0x20, Y3, Y3
	VPADDQ   Y2, Y3, Y2

	// Write back accumulators; VZEROUPPER before returning to non-AVX code.
	VMOVDQU Y1, (AX)
	VMOVDQU Y2, 32(AX)
	VZEROUPPER
	RET