diff --git a/vector/acc_amd64.go b/vector/acc_amd64.go index e2149de..40d2448 100644 --- a/vector/acc_amd64.go +++ b/vector/acc_amd64.go @@ -14,8 +14,14 @@ var haveFixedAccumulateSIMD = haveSSE4_1() const haveFloatingAccumulateSIMD = true +//go:noescape +func fixedAccumulateOpOverSIMD(dst []uint8, src []uint32) + //go:noescape func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32) +//go:noescape +func floatingAccumulateOpOverSIMD(dst []uint8, src []float32) + //go:noescape func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32) diff --git a/vector/acc_amd64.s b/vector/acc_amd64.s index c74b2a9..f8e03e3 100644 --- a/vector/acc_amd64.s +++ b/vector/acc_amd64.s @@ -10,20 +10,50 @@ DATA flAlmost256<>+0x00(SB)/8, $0x437fffff437fffff DATA flAlmost256<>+0x08(SB)/8, $0x437fffff437fffff +DATA flAlmost65536<>+0x00(SB)/8, $0x477fffff477fffff +DATA flAlmost65536<>+0x08(SB)/8, $0x477fffff477fffff DATA flOne<>+0x00(SB)/8, $0x3f8000003f800000 DATA flOne<>+0x08(SB)/8, $0x3f8000003f800000 DATA flSignMask<>+0x00(SB)/8, $0x7fffffff7fffffff DATA flSignMask<>+0x08(SB)/8, $0x7fffffff7fffffff -DATA shuffleMask<>+0x00(SB)/8, $0x0c0804000c080400 -DATA shuffleMask<>+0x08(SB)/8, $0x0c0804000c080400 + +// scatterAndMulBy0x101 is a PSHUFB mask that brings the low four bytes of an +// XMM register to the low byte of that register's four uint32 values. It +// duplicates those bytes, effectively multiplying each uint32 by 0x101. +// +// It transforms a little-endian 16-byte XMM value from +// ijkl???????????? +// to +// ii00jj00kk00ll00 +DATA scatterAndMulBy0x101<>+0x00(SB)/8, $0x8080010180800000 +DATA scatterAndMulBy0x101<>+0x08(SB)/8, $0x8080030380800202 + +// gather is a PSHUFB mask that brings the low byte of the XMM register's four +// uint32 values to the low four bytes of that register. +// +// It transforms a little-endian 16-byte XMM value from +// i???j???k???l??? +// to +// ijkl000000000000 +DATA gather<>+0x00(SB)/8, $0x808080800c080400 +DATA gather<>+0x08(SB)/8, $0x8080808080808080 + DATA fxAlmost256<>+0x00(SB)/8, $0x000000ff000000ff DATA fxAlmost256<>+0x08(SB)/8, $0x000000ff000000ff +DATA fxAlmost65536<>+0x00(SB)/8, $0x0000ffff0000ffff +DATA fxAlmost65536<>+0x08(SB)/8, $0x0000ffff0000ffff +DATA inverseFFFF<>+0x00(SB)/8, $0x8000800180008001 +DATA inverseFFFF<>+0x08(SB)/8, $0x8000800180008001 GLOBL flAlmost256<>(SB), (NOPTR+RODATA), $16 +GLOBL flAlmost65536<>(SB), (NOPTR+RODATA), $16 GLOBL flOne<>(SB), (NOPTR+RODATA), $16 GLOBL flSignMask<>(SB), (NOPTR+RODATA), $16 -GLOBL shuffleMask<>(SB), (NOPTR+RODATA), $16 +GLOBL scatterAndMulBy0x101<>(SB), (NOPTR+RODATA), $16 +GLOBL gather<>(SB), (NOPTR+RODATA), $16 GLOBL fxAlmost256<>(SB), (NOPTR+RODATA), $16 +GLOBL fxAlmost65536<>(SB), (NOPTR+RODATA), $16 +GLOBL inverseFFFF<>(SB), (NOPTR+RODATA), $16 // func haveSSE4_1() bool TEXT ·haveSSE4_1(SB), NOSPLIT, $0 @@ -36,6 +66,224 @@ TEXT ·haveSSE4_1(SB), NOSPLIT, $0 // ---------------------------------------------------------------------------- +// func fixedAccumulateOpOverSIMD(dst []uint8, src []uint32) +// +// XMM registers. Variable names are per +// https://github.com/google/font-rs/blob/master/src/accumulate.c +// +// xmm0 scratch +// xmm1 x +// xmm2 y, z +// xmm3 - +// xmm4 - +// xmm5 fxAlmost65536 +// xmm6 gather +// xmm7 offset +// xmm8 scatterAndMulBy0x101 +// xmm9 fxAlmost65536 +// xmm10 inverseFFFF +TEXT ·fixedAccumulateOpOverSIMD(SB), NOSPLIT, $0-48 + MOVQ dst_base+0(FP), DI + MOVQ dst_len+8(FP), BX + MOVQ src_base+24(FP), SI + MOVQ src_len+32(FP), R10 + + // Sanity check that len(dst) >= len(src). 
+	CMPQ BX, R10
+	JLT fxAccOpOverEnd
+
+	// R10 = len(src) &^ 3
+	// R11 = len(src)
+	MOVQ R10, R11
+	ANDQ $-4, R10
+
+	// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16.
+	MOVOU fxAlmost65536<>(SB), X5
+
+	// scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask.
+	// fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff.
+	// inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
+	MOVOU scatterAndMulBy0x101<>(SB), X8
+	MOVOU fxAlmost65536<>(SB), X9
+	MOVOU inverseFFFF<>(SB), X10
+
+	// gather := XMM(see above) // PSHUFB shuffle mask.
+	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
+	MOVOU gather<>(SB), X6
+	XORPS X7, X7
+
+	// i := 0
+	MOVQ $0, R9
+
+fxAccOpOverLoop4:
+	// for i < (len(src) &^ 3)
+	CMPQ R9, R10
+	JAE fxAccOpOverLoop1
+
+	// x = XMM(s0, s1, s2, s3)
+	//
+	// Where s0 is src[i+0], s1 is src[i+1], etc.
+	MOVOU (SI), X1
+
+	// scratch = XMM(0, s0, s1, s2)
+	// x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
+	MOVOU X1, X0
+	PSLLO $4, X0
+	PADDD X0, X1
+
+	// scratch = XMM(0, 0, 0, 0)
+	// scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
+	// x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
+	XORPS X0, X0
+	SHUFPS $0x40, X1, X0
+	PADDD X0, X1
+
+	// x += offset
+	PADDD X7, X1
+
+	// y = abs(x)
+	// y >>= 4 // Shift by 2*ϕ - 16.
+	// y = min(y, fxAlmost65536)
+	//
+	// pabsd %xmm1,%xmm2
+	// psrld $0x4,%xmm2
+	// pminud %xmm5,%xmm2
+	//
+	// Hopefully we'll get these opcode mnemonics into the assembler for Go
+	// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
+	// it's similar.
+	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
+	BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x04
+	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+
+	// z = convertToInt32(y)
+	// No-op.
+
+	// Blend over the dst's prior value.
+	// SIMD for i in 0..3:
+
+	// dstA := uint32(dst[i]) * 0x101
+	// maskA := z@i
+	// outA := dstA*(0xffff-maskA)/0xffff + maskA
+	// dst[i] = uint8(outA >> 8)
+	//
+	// First, set X0 to dstA*(0xffff-maskA).
+	MOVL (DI), X0
+	PSHUFB X8, X0
+	MOVOU X9, X11
+	PSUBL X2, X11
+	PMULLD X11, X0
+
+	// We implement uint32 division by 0xffff as multiplication by a magic
+	// constant (0x80008001) and then a shift by a magic constant (47).
+	// See TestDivideByFFFF for a justification.
+	//
+	// That multiplication widens from uint32 to uint64, so we have to
+	// duplicate and shift our four uint32s from one XMM register (X0) to
+	// two XMM registers (X0 and X11).
+	//
+	// Move the second and fourth uint32s in X0 to be the first and third
+	// uint32s in X11.
+	MOVOU X0, X11
+	PSRLQ $32, X11
+
+	// Multiply by magic, shift by magic.
+	//
+	// pmuludq %xmm10,%xmm0
+	// pmuludq %xmm10,%xmm11
+	BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xf4; BYTE $0xc2
+	BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0xf4; BYTE $0xda
+	PSRLQ $47, X0
+	PSRLQ $47, X11
+
+	// Merge the two registers back to one, X11.
+	PSLLQ $32, X11
+	XORPS X0, X11
+
+	// Add maskA, shift from 16 bit color to 8 bit color.
+	PADDD X11, X2
+	PSRLQ $8, X2
+
+	// As per opSrcStore4, shuffle and copy the low 4 bytes.
+ PSHUFB X6, X2 + MOVL X2, (DI) + + // offset = XMM(x@3, x@3, x@3, x@3) + MOVOU X1, X7 + SHUFPS $0xff, X1, X7 + + // i += 4 + // dst = dst[4:] + // src = src[4:] + ADDQ $4, R9 + ADDQ $4, DI + ADDQ $16, SI + JMP fxAccOpOverLoop4 + +fxAccOpOverLoop1: + // for i < len(src) + CMPQ R9, R11 + JAE fxAccOpOverCleanup + + // x = src[i] + offset + MOVL (SI), X1 + PADDD X7, X1 + + // y = abs(x) + // y >>= 4 // Shift by 2*ϕ - 16. + // y = min(y, fxAlmost65536) + // + // pabsd %xmm1,%xmm2 + // psrld $0x4,%xmm2 + // pminud %xmm5,%xmm2 + // + // Hopefully we'll get these opcode mnemonics into the assembler for Go + // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but + // it's similar. + BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 + BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x04 + BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 + + // z = convertToInt32(y) + // No-op. + + // Blend over the dst's prior value. + // + // dstA := uint32(dst[0]) * 0x101 + // maskA := z + // outA := dstA*(0xffff-maskA)/0xffff + maskA + // dst[0] = uint8(outA >> 8) + MOVBLZX (DI), R12 + IMULL $0x101, R12 + MOVL X2, R13 + MOVL $0xffff, AX + SUBL R13, AX + MULL R12 // MULL's implicit arg is AX, and the result is stored in DX:AX. + MOVL $0x80008001, BX // Divide by 0xffff is to first multiply by a magic constant... + MULL BX // MULL's implicit arg is AX, and the result is stored in DX:AX. + SHRL $15, DX // ...and then shift by another magic constant (47 - 32 = 15). + ADDL DX, R13 + SHRL $8, R13 + MOVB R13, (DI) + + // offset = x + MOVOU X1, X7 + + // i += 1 + // dst = dst[1:] + // src = src[1:] + ADDQ $1, R9 + ADDQ $1, DI + ADDQ $4, SI + JMP fxAccOpOverLoop1 + +fxAccOpOverCleanup: + // No-op. + +fxAccOpOverEnd: + RET + +// ---------------------------------------------------------------------------- + // func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32) // // XMM registers. Variable names are per @@ -47,37 +295,40 @@ TEXT ·haveSSE4_1(SB), NOSPLIT, $0 // xmm3 - // xmm4 - // xmm5 fxAlmost256 -// xmm6 shuffleMask +// xmm6 gather // xmm7 offset +// xmm8 - +// xmm9 - +// xmm10 - TEXT ·fixedAccumulateOpSrcSIMD(SB), NOSPLIT, $0-48 MOVQ dst_base+0(FP), DI MOVQ dst_len+8(FP), BX MOVQ src_base+24(FP), SI - MOVQ src_len+32(FP), CX + MOVQ src_len+32(FP), R10 // Sanity check that len(dst) >= len(src). - CMPQ BX, CX + CMPQ BX, R10 JLT fxAccOpSrcEnd - // CX = len(src) &^ 3 - // DX = len(src) - MOVQ CX, DX - ANDQ $-4, CX + // R10 = len(src) &^ 3 + // R11 = len(src) + MOVQ R10, R11 + ANDQ $-4, R10 // fxAlmost256 := XMM(0x000000ff repeated four times) // Maximum of an uint8. MOVOU fxAlmost256<>(SB), X5 - // shuffleMask := XMM(0x0c080400 repeated four times) // PSHUFB shuffle mask. - // offset := XMM(0x00000000 repeated four times) // Cumulative sum. - MOVOU shuffleMask<>(SB), X6 + // gather := XMM(see above) // PSHUFB shuffle mask. + // offset := XMM(0x00000000 repeated four times) // Cumulative sum. 
+ MOVOU gather<>(SB), X6 XORPS X7, X7 // i := 0 - MOVQ $0, AX + MOVQ $0, R9 fxAccOpSrcLoop4: // for i < (len(src) &^ 3) - CMPQ AX, CX + CMPQ R9, R10 JAE fxAccOpSrcLoop1 // x = XMM(s0, s1, s2, s3) @@ -131,14 +382,14 @@ fxAccOpSrcLoop4: // i += 4 // dst = dst[4:] // src = src[4:] - ADDQ $4, AX + ADDQ $4, R9 ADDQ $4, DI ADDQ $16, SI JMP fxAccOpSrcLoop4 fxAccOpSrcLoop1: // for i < len(src) - CMPQ AX, DX + CMPQ R9, R11 JAE fxAccOpSrcCleanup // x = src[i] + offset @@ -173,7 +424,7 @@ fxAccOpSrcLoop1: // i += 1 // dst = dst[1:] // src = src[1:] - ADDQ $1, AX + ADDQ $1, R9 ADDQ $1, DI ADDQ $4, SI JMP fxAccOpSrcLoop1 @@ -186,6 +437,221 @@ fxAccOpSrcEnd: // ---------------------------------------------------------------------------- +// func floatingAccumulateOpOverSIMD(dst []uint8, src []float32) +// +// XMM registers. Variable names are per +// https://github.com/google/font-rs/blob/master/src/accumulate.c +// +// xmm0 scratch +// xmm1 x +// xmm2 y, z +// xmm3 flAlmost65536 +// xmm4 flOne +// xmm5 flSignMask +// xmm6 gather +// xmm7 offset +// xmm8 scatterAndMulBy0x101 +// xmm9 fxAlmost65536 +// xmm10 inverseFFFF +TEXT ·floatingAccumulateOpOverSIMD(SB), NOSPLIT, $8-48 + MOVQ dst_base+0(FP), DI + MOVQ dst_len+8(FP), BX + MOVQ src_base+24(FP), SI + MOVQ src_len+32(FP), R10 + + // Sanity check that len(dst) >= len(src). + CMPQ BX, R10 + JLT flAccOpOverEnd + + // R10 = len(src) &^ 3 + // R11 = len(src) + MOVQ R10, R11 + ANDQ $-4, R10 + + // Set MXCSR bits 13 and 14, so that the CVTPS2PL below is "Round To Zero". + STMXCSR mxcsrOrig-8(SP) + MOVL mxcsrOrig-8(SP), AX + ORL $0x6000, AX + MOVL AX, mxcsrNew-4(SP) + LDMXCSR mxcsrNew-4(SP) + + // flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32. + // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32. + // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32. + MOVOU flAlmost65536<>(SB), X3 + MOVOU flOne<>(SB), X4 + MOVOU flSignMask<>(SB), X5 + + // scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask. + // fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff. + // inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff. + MOVOU scatterAndMulBy0x101<>(SB), X8 + MOVOU fxAlmost65536<>(SB), X9 + MOVOU inverseFFFF<>(SB), X10 + + // gather := XMM(see above) // PSHUFB shuffle mask. + // offset := XMM(0x00000000 repeated four times) // Cumulative sum. + MOVOU gather<>(SB), X6 + XORPS X7, X7 + + // i := 0 + MOVQ $0, R9 + +flAccOpOverLoop4: + // for i < (len(src) &^ 3) + CMPQ R9, R10 + JAE flAccOpOverLoop1 + + // x = XMM(s0, s1, s2, s3) + // + // Where s0 is src[i+0], s1 is src[i+1], etc. + MOVOU (SI), X1 + + // scratch = XMM(0, s0, s1, s2) + // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3) + MOVOU X1, X0 + PSLLO $4, X0 + ADDPS X0, X1 + + // scratch = XMM(0, 0, 0, 0) + // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1) + // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3) + XORPS X0, X0 + SHUFPS $0x40, X1, X0 + ADDPS X0, X1 + + // x += offset + ADDPS X7, X1 + + // y = x & flSignMask + // y = min(y, flOne) + // y = mul(y, flAlmost65536) + MOVOU X5, X2 + ANDPS X1, X2 + MINPS X4, X2 + MULPS X3, X2 + + // z = convertToInt32(y) + CVTPS2PL X2, X2 + + // Blend over the dst's prior value. 
+	// SIMD for i in 0..3:
+	//
+	// dstA := uint32(dst[i]) * 0x101
+	// maskA := z@i
+	// outA := dstA*(0xffff-maskA)/0xffff + maskA
+	// dst[i] = uint8(outA >> 8)
+	//
+	// First, set X0 to dstA*(0xffff-maskA).
+	MOVL (DI), X0
+	PSHUFB X8, X0
+	MOVOU X9, X11
+	PSUBL X2, X11
+	PMULLD X11, X0
+
+	// We implement uint32 division by 0xffff as multiplication by a magic
+	// constant (0x80008001) and then a shift by a magic constant (47).
+	// See TestDivideByFFFF for a justification.
+	//
+	// That multiplication widens from uint32 to uint64, so we have to
+	// duplicate and shift our four uint32s from one XMM register (X0) to
+	// two XMM registers (X0 and X11).
+	//
+	// Move the second and fourth uint32s in X0 to be the first and third
+	// uint32s in X11.
+	MOVOU X0, X11
+	PSRLQ $32, X11
+
+	// Multiply by magic, shift by magic.
+	//
+	// pmuludq %xmm10,%xmm0
+	// pmuludq %xmm10,%xmm11
+	BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xf4; BYTE $0xc2
+	BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0xf4; BYTE $0xda
+	PSRLQ $47, X0
+	PSRLQ $47, X11
+
+	// Merge the two registers back to one, X11.
+	PSLLQ $32, X11
+	XORPS X0, X11
+
+	// Add maskA, shift from 16 bit color to 8 bit color.
+	PADDD X11, X2
+	PSRLQ $8, X2
+
+	// As per opSrcStore4, shuffle and copy the low 4 bytes.
+	PSHUFB X6, X2
+	MOVL X2, (DI)
+
+	// offset = XMM(x@3, x@3, x@3, x@3)
+	MOVOU X1, X7
+	SHUFPS $0xff, X1, X7
+
+	// i += 4
+	// dst = dst[4:]
+	// src = src[4:]
+	ADDQ $4, R9
+	ADDQ $4, DI
+	ADDQ $16, SI
+	JMP flAccOpOverLoop4
+
+flAccOpOverLoop1:
+	// for i < len(src)
+	CMPQ R9, R11
+	JAE flAccOpOverCleanup
+
+	// x = src[i] + offset
+	MOVL (SI), X1
+	ADDPS X7, X1
+
+	// y = x & flSignMask
+	// y = min(y, flOne)
+	// y = mul(y, flAlmost65536)
+	MOVOU X5, X2
+	ANDPS X1, X2
+	MINPS X4, X2
+	MULPS X3, X2
+
+	// z = convertToInt32(y)
+	CVTPS2PL X2, X2
+
+	// Blend over the dst's prior value.
+	//
+	// dstA := uint32(dst[0]) * 0x101
+	// maskA := z
+	// outA := dstA*(0xffff-maskA)/0xffff + maskA
+	// dst[0] = uint8(outA >> 8)
+	MOVBLZX (DI), R12
+	IMULL $0x101, R12
+	MOVL X2, R13
+	MOVL $0xffff, AX
+	SUBL R13, AX
+	MULL R12 // MULL's implicit arg is AX, and the result is stored in DX:AX.
+	MOVL $0x80008001, BX // Divide by 0xffff is to first multiply by a magic constant...
+	MULL BX // MULL's implicit arg is AX, and the result is stored in DX:AX.
+	SHRL $15, DX // ...and then shift by another magic constant (47 - 32 = 15).
+	ADDL DX, R13
+	SHRL $8, R13
+	MOVB R13, (DI)
+
+	// offset = x
+	MOVOU X1, X7
+
+	// i += 1
+	// dst = dst[1:]
+	// src = src[1:]
+	ADDQ $1, R9
+	ADDQ $1, DI
+	ADDQ $4, SI
+	JMP flAccOpOverLoop1
+
+flAccOpOverCleanup:
+	LDMXCSR mxcsrOrig-8(SP)
+
+flAccOpOverEnd:
+	RET
+
+// ----------------------------------------------------------------------------
+
 // func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32)
 //
 // XMM registers. Variable names are per
@@ -197,22 +663,25 @@ fxAccOpSrcEnd:
 // xmm3 flAlmost256
 // xmm4 flOne
 // xmm5 flSignMask
-// xmm6 shuffleMask
+// xmm6 gather
 // xmm7 offset
+// xmm8 -
+// xmm9 -
+// xmm10 -
 TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48
 	MOVQ dst_base+0(FP), DI
 	MOVQ dst_len+8(FP), BX
 	MOVQ src_base+24(FP), SI
-	MOVQ src_len+32(FP), CX
+	MOVQ src_len+32(FP), R10
 
 	// Sanity check that len(dst) >= len(src).
-	CMPQ BX, CX
+	CMPQ BX, R10
 	JLT flAccOpSrcEnd
 
-	// CX = len(src) &^ 3
-	// DX = len(src)
-	MOVQ CX, DX
-	ANDQ $-4, CX
+	// R10 = len(src) &^ 3
+	// R11 = len(src)
+	MOVQ R10, R11
+	ANDQ $-4, R10
 
 	// Set MXCSR bits 13 and 14, so that the CVTPS2PL below is "Round To Zero".
STMXCSR mxcsrOrig-8(SP) @@ -228,17 +697,17 @@ TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48 MOVOU flOne<>(SB), X4 MOVOU flSignMask<>(SB), X5 - // shuffleMask := XMM(0x0c080400 repeated four times) // PSHUFB shuffle mask. - // offset := XMM(0x00000000 repeated four times) // Cumulative sum. - MOVOU shuffleMask<>(SB), X6 + // gather := XMM(see above) // PSHUFB shuffle mask. + // offset := XMM(0x00000000 repeated four times) // Cumulative sum. + MOVOU gather<>(SB), X6 XORPS X7, X7 // i := 0 - MOVQ $0, AX + MOVQ $0, R9 flAccOpSrcLoop4: // for i < (len(src) &^ 3) - CMPQ AX, CX + CMPQ R9, R10 JAE flAccOpSrcLoop1 // x = XMM(s0, s1, s2, s3) @@ -285,14 +754,14 @@ flAccOpSrcLoop4: // i += 4 // dst = dst[4:] // src = src[4:] - ADDQ $4, AX + ADDQ $4, R9 ADDQ $4, DI ADDQ $16, SI JMP flAccOpSrcLoop4 flAccOpSrcLoop1: // for i < len(src) - CMPQ AX, DX + CMPQ R9, R11 JAE flAccOpSrcCleanup // x = src[i] + offset @@ -320,7 +789,7 @@ flAccOpSrcLoop1: // i += 1 // dst = dst[1:] // src = src[1:] - ADDQ $1, AX + ADDQ $1, R9 ADDQ $1, DI ADDQ $4, SI JMP flAccOpSrcLoop1 diff --git a/vector/acc_other.go b/vector/acc_other.go index c0b78f8..c12adf8 100644 --- a/vector/acc_other.go +++ b/vector/acc_other.go @@ -9,5 +9,7 @@ package vector const haveFixedAccumulateSIMD = false const haveFloatingAccumulateSIMD = false -func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32) {} -func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32) {} +func fixedAccumulateOpOverSIMD(dst []uint8, src []uint32) {} +func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32) {} +func floatingAccumulateOpOverSIMD(dst []uint8, src []float32) {} +func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32) {} diff --git a/vector/acc_test.go b/vector/acc_test.go index 1c59c2b..fe0fb21 100644 --- a/vector/acc_test.go +++ b/vector/acc_test.go @@ -7,9 +7,34 @@ package vector import ( "bytes" "fmt" + "math/rand" "testing" ) +// TestDivideByFFFF tests that dividing by 0xffff is equivalent to multiplying +// and then shifting by magic constants. The Go compiler itself issues this +// multiply-and-shift for a division by the constant value 0xffff. This trick +// is used in the asm code as the GOARCH=amd64 SIMD instructions have parallel +// multiply but not parallel divide. +// +// There's undoubtedly a justification somewhere in Hacker's Delight chapter 10 +// "Integer Division by Constants", but I don't have a more specific link. +// +// http://www.hackersdelight.org/divcMore.pdf and +// http://www.hackersdelight.org/magic.htm +func TestDivideByFFFF(t *testing.T) { + const mul, shift = 0x80008001, 47 + rng := rand.New(rand.NewSource(1)) + for i := 0; i < 20000; i++ { + u := rng.Uint32() + got := uint32((uint64(u) * mul) >> shift) + want := u / 0xffff + if got != want { + t.Fatalf("i=%d, u=%#08x: got %#08x, want %#08x", i, u, got, want) + } + } +} + // TestXxxSIMDUnaligned tests that unaligned SIMD loads/stores don't crash. 
 func TestFixedAccumulateSIMDUnaligned(t *testing.T) {
@@ -163,7 +188,11 @@ func testAcc(t *testing.T, in interface{}, mask []uint32, op string) {
 	case []uint32:
 		switch op {
 		case "over":
-			fixedAccumulateOpOver(got8, in[:n])
+			if simd {
+				fixedAccumulateOpOverSIMD(got8, in[:n])
+			} else {
+				fixedAccumulateOpOver(got8, in[:n])
+			}
 		case "src":
 			if simd {
 				fixedAccumulateOpSrcSIMD(got8, in[:n])
@@ -177,7 +206,11 @@ func testAcc(t *testing.T, in interface{}, mask []uint32, op string) {
 	case []float32:
 		switch op {
 		case "over":
-			floatingAccumulateOpOver(got8, in[:n])
+			if simd {
+				floatingAccumulateOpOverSIMD(got8, in[:n])
+			} else {
+				floatingAccumulateOpOver(got8, in[:n])
+			}
 		case "src":
 			if simd {
 				floatingAccumulateOpSrcSIMD(got8, in[:n])
@@ -226,23 +259,27 @@ func float32sEqual(xs, ys []float32) bool {
 	return true
 }
 
-func BenchmarkFixedAccumulateOpOver16(b *testing.B)       { benchAcc(b, fxIn16, "over", false) }
-func BenchmarkFixedAccumulateOpSrc16(b *testing.B)        { benchAcc(b, fxIn16, "src", false) }
-func BenchmarkFixedAccumulateOpSrcSIMD16(b *testing.B)    { benchAcc(b, fxIn16, "src", true) }
-func BenchmarkFixedAccumulateMask16(b *testing.B)         { benchAcc(b, fxIn16, "mask", false) }
-func BenchmarkFloatingAccumulateOpOver16(b *testing.B)    { benchAcc(b, flIn16, "over", false) }
-func BenchmarkFloatingAccumulateOpSrc16(b *testing.B)     { benchAcc(b, flIn16, "src", false) }
-func BenchmarkFloatingAccumulateOpSrcSIMD16(b *testing.B) { benchAcc(b, flIn16, "src", true) }
-func BenchmarkFloatingAccumulateMask16(b *testing.B)      { benchAcc(b, flIn16, "mask", false) }
+func BenchmarkFixedAccumulateOpOver16(b *testing.B)        { benchAcc(b, fxIn16, "over", false) }
+func BenchmarkFixedAccumulateOpOverSIMD16(b *testing.B)    { benchAcc(b, fxIn16, "over", true) }
+func BenchmarkFixedAccumulateOpSrc16(b *testing.B)         { benchAcc(b, fxIn16, "src", false) }
+func BenchmarkFixedAccumulateOpSrcSIMD16(b *testing.B)     { benchAcc(b, fxIn16, "src", true) }
+func BenchmarkFixedAccumulateMask16(b *testing.B)          { benchAcc(b, fxIn16, "mask", false) }
+func BenchmarkFloatingAccumulateOpOver16(b *testing.B)     { benchAcc(b, flIn16, "over", false) }
+func BenchmarkFloatingAccumulateOpOverSIMD16(b *testing.B) { benchAcc(b, flIn16, "over", true) }
+func BenchmarkFloatingAccumulateOpSrc16(b *testing.B)      { benchAcc(b, flIn16, "src", false) }
+func BenchmarkFloatingAccumulateOpSrcSIMD16(b *testing.B)  { benchAcc(b, flIn16, "src", true) }
+func BenchmarkFloatingAccumulateMask16(b *testing.B)       { benchAcc(b, flIn16, "mask", false) }
 
-func BenchmarkFixedAccumulateOpOver64(b *testing.B)       { benchAcc(b, fxIn64, "over", false) }
-func BenchmarkFixedAccumulateOpSrc64(b *testing.B)        { benchAcc(b, fxIn64, "src", false) }
-func BenchmarkFixedAccumulateOpSrcSIMD64(b *testing.B)    { benchAcc(b, fxIn64, "src", true) }
-func BenchmarkFixedAccumulateMask64(b *testing.B)         { benchAcc(b, fxIn64, "mask", false) }
-func BenchmarkFloatingAccumulateOpOver64(b *testing.B)    { benchAcc(b, flIn64, "over", false) }
-func BenchmarkFloatingAccumulateOpSrc64(b *testing.B)     { benchAcc(b, flIn64, "src", false) }
-func BenchmarkFloatingAccumulateOpSrcSIMD64(b *testing.B) { benchAcc(b, flIn64, "src", true) }
-func BenchmarkFloatingAccumulateMask64(b *testing.B)      { benchAcc(b, flIn64, "mask", false) }
+func BenchmarkFixedAccumulateOpOver64(b *testing.B)        { benchAcc(b, fxIn64, "over", false) }
+func BenchmarkFixedAccumulateOpOverSIMD64(b *testing.B)    { benchAcc(b, fxIn64, "over", true) }
+func BenchmarkFixedAccumulateOpSrc64(b *testing.B)         { benchAcc(b, fxIn64, "src", false) }
+func BenchmarkFixedAccumulateOpSrcSIMD64(b *testing.B)     { benchAcc(b, fxIn64, "src", true) }
+func BenchmarkFixedAccumulateMask64(b *testing.B)          { benchAcc(b, fxIn64, "mask", false) }
+func BenchmarkFloatingAccumulateOpOver64(b *testing.B)     { benchAcc(b, flIn64, "over", false) }
+func BenchmarkFloatingAccumulateOpOverSIMD64(b *testing.B) { benchAcc(b, flIn64, "over", true) }
+func BenchmarkFloatingAccumulateOpSrc64(b *testing.B)      { benchAcc(b, flIn64, "src", false) }
+func BenchmarkFloatingAccumulateOpSrcSIMD64(b *testing.B)  { benchAcc(b, flIn64, "src", true) }
+func BenchmarkFloatingAccumulateMask64(b *testing.B)       { benchAcc(b, flIn64, "mask", false) }
 
 func benchAcc(b *testing.B, in interface{}, op string, simd bool) {
 	var f func()
@@ -256,7 +293,11 @@ func benchAcc(b *testing.B, in interface{}, op string, simd bool) {
 		switch op {
 		case "over":
 			dst := make([]uint8, len(in))
-			f = func() { fixedAccumulateOpOver(dst, in) }
+			if simd {
+				f = func() { fixedAccumulateOpOverSIMD(dst, in) }
+			} else {
+				f = func() { fixedAccumulateOpOver(dst, in) }
+			}
 		case "src":
 			dst := make([]uint8, len(in))
 			if simd {
@@ -278,7 +319,11 @@ func benchAcc(b *testing.B, in interface{}, op string, simd bool) {
 		switch op {
 		case "over":
 			dst := make([]uint8, len(in))
-			f = func() { floatingAccumulateOpOver(dst, in) }
+			if simd {
+				f = func() { floatingAccumulateOpOverSIMD(dst, in) }
+			} else {
+				f = func() { floatingAccumulateOpOver(dst, in) }
+			}
 		case "src":
 			dst := make([]uint8, len(in))
 			if simd {
diff --git a/vector/gen.go b/vector/gen.go
index 482e8f6..22cf39d 100644
--- a/vector/gen.go
+++ b/vector/gen.go
@@ -72,37 +72,93 @@ var instances = []struct {
 	XMM3           string
 	XMM4           string
 	XMM5           string
+	XMM8           string
+	XMM9           string
+	XMM10          string
 	Setup          string
 	Cleanup        string
+	LoadXMMRegs    string
 	Add            string
 	ClampAndScale  string
 	ConvertToInt32 string
+	Store4         string
+	Store1         string
 }{{
+	LongName:       "fixedAccumulateOpOver",
+	ShortName:      "fxAccOpOver",
+	FrameSize:      fxFrameSize,
+	SrcType:        fxSrcType,
+	XMM3:           fxXMM3,
+	XMM4:           fxXMM4,
+	XMM5:           fxXMM5_65536,
+	XMM8:           opOverXMM8,
+	XMM9:           opOverXMM9,
+	XMM10:          opOverXMM10,
+	Setup:          fxSetup,
+	LoadXMMRegs:    fxLoadXMMRegs65536 + "\n" + opOverLoadXMMRegs,
+	Cleanup:        fxCleanup,
+	Add:            fxAdd,
+	ClampAndScale:  fxClampAndScale65536,
+	ConvertToInt32: fxConvertToInt32,
+	Store4:         opOverStore4,
+	Store1:         opOverStore1,
+}, {
 	LongName:       "fixedAccumulateOpSrc",
 	ShortName:      "fxAccOpSrc",
 	FrameSize:      fxFrameSize,
 	SrcType:        fxSrcType,
 	XMM3:           fxXMM3,
 	XMM4:           fxXMM4,
-	XMM5:           fxXMM5,
+	XMM5:           fxXMM5_256,
+	XMM8:           opSrcXMM8,
+	XMM9:           opSrcXMM9,
+	XMM10:          opSrcXMM10,
 	Setup:          fxSetup,
+	LoadXMMRegs:    fxLoadXMMRegs256 + "\n" + opSrcLoadXMMRegs,
 	Cleanup:        fxCleanup,
 	Add:            fxAdd,
-	ClampAndScale:  fxClampAndScale,
+	ClampAndScale:  fxClampAndScale256,
 	ConvertToInt32: fxConvertToInt32,
+	Store4:         opSrcStore4,
+	Store1:         opSrcStore1,
+}, {
+	LongName:       "floatingAccumulateOpOver",
+	ShortName:      "flAccOpOver",
+	FrameSize:      flFrameSize,
+	SrcType:        flSrcType,
+	XMM3:           flXMM3_65536,
+	XMM4:           flXMM4,
+	XMM5:           flXMM5,
+	XMM8:           opOverXMM8,
+	XMM9:           opOverXMM9,
+	XMM10:          opOverXMM10,
+	Setup:          flSetup,
+	LoadXMMRegs:    flLoadXMMRegs65536 + "\n" + opOverLoadXMMRegs,
+	Cleanup:        flCleanup,
+	Add:            flAdd,
+	ClampAndScale:  flClampAndScale65536,
+	ConvertToInt32: flConvertToInt32,
+	Store4:         opOverStore4,
+	Store1:         opOverStore1,
}, {
 	LongName:       "floatingAccumulateOpSrc",
 	ShortName:      "flAccOpSrc",
 	FrameSize:      flFrameSize,
 	SrcType:        flSrcType,
-	XMM3:           flXMM3,
+	XMM3:           flXMM3_256,
 	XMM4:           flXMM4,
 	XMM5:           flXMM5,
+	XMM8:           opSrcXMM8,
+	XMM9:           opSrcXMM9,
+	XMM10:          opSrcXMM10,
 	Setup:          flSetup,
+	LoadXMMRegs:    flLoadXMMRegs256 + "\n" + opSrcLoadXMMRegs,
 	Cleanup:        flCleanup,
 	Add:            flAdd,
-	ClampAndScale:  flClampAndScale,
+	ClampAndScale:  flClampAndScale256,
 	ConvertToInt32: flConvertToInt32,
+	Store4:         opSrcStore4,
+	Store1:         opSrcStore1,
 }}
 
 const (
@@ -112,19 +168,18 @@ const (
 	fxSrcType = `[]uint32`
 	flSrcType = `[]float32`
 
-	fxXMM3 = `-`
-	flXMM3 = `flAlmost256`
+	fxXMM3       = `-`
+	flXMM3_256   = `flAlmost256`
+	flXMM3_65536 = `flAlmost65536`
 
 	fxXMM4 = `-`
 	flXMM4 = `flOne`
 
-	fxXMM5 = `fxAlmost256`
-	flXMM5 = `flSignMask`
+	fxXMM5_256   = `fxAlmost256`
+	fxXMM5_65536 = `fxAlmost65536`
+	flXMM5       = `flSignMask`
 
-	fxSetup = `
-	// fxAlmost256 := XMM(0x000000ff repeated four times) // Maximum of an uint8.
-	MOVOU fxAlmost256<>(SB), X5
-	`
+	fxSetup = ``
 	flSetup = `
 	// Set MXCSR bits 13 and 14, so that the CVTPS2PL below is "Round To Zero".
 	STMXCSR mxcsrOrig-8(SP)
@@ -132,7 +187,20 @@ const (
 	ORL $0x6000, AX
 	MOVL AX, mxcsrNew-4(SP)
 	LDMXCSR mxcsrNew-4(SP)
+	`
+	fxCleanup = `// No-op.`
+	flCleanup = `LDMXCSR mxcsrOrig-8(SP)`
+
+	fxLoadXMMRegs256 = `
+	// fxAlmost256 := XMM(0x000000ff repeated four times) // Maximum of an uint8.
+	MOVOU fxAlmost256<>(SB), X5
+	`
+	fxLoadXMMRegs65536 = `
+	// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16.
+	MOVOU fxAlmost65536<>(SB), X5
+	`
+	flLoadXMMRegs256 = `
 	// flAlmost256 := XMM(0x437fffff repeated four times) // 255.99998 as a float32.
 	// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
 	// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
@@ -140,14 +208,19 @@ const (
 	MOVOU flOne<>(SB), X4
 	MOVOU flSignMask<>(SB), X5
 	`
-
-	fxCleanup = `// No-op.`
-	flCleanup = `LDMXCSR mxcsrOrig-8(SP)`
+	flLoadXMMRegs65536 = `
+	// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
+	// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
+	// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
+	MOVOU flAlmost65536<>(SB), X3
+	MOVOU flOne<>(SB), X4
+	MOVOU flSignMask<>(SB), X5
+	`
 
 	fxAdd = `PADDD`
 	flAdd = `ADDPS`
 
-	fxClampAndScale = `
+	fxClampAndScale256 = `
 	// y = abs(x)
 	// y >>= 12 // Shift by 2*ϕ - 8.
 	// y = min(y, fxAlmost256)
@@ -163,7 +236,23 @@ const (
 	BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0c
 	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
 	`
-	flClampAndScale = `
+	fxClampAndScale65536 = `
+	// y = abs(x)
+	// y >>= 4 // Shift by 2*ϕ - 16.
+	// y = min(y, fxAlmost65536)
+	//
+	// pabsd %xmm1,%xmm2
+	// psrld $0x4,%xmm2
+	// pminud %xmm5,%xmm2
+	//
+	// Hopefully we'll get these opcode mnemonics into the assembler for Go
+	// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
+	// it's similar.
+	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
+	BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x04
+	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+	`
+	flClampAndScale256 = `
 	// y = x & flSignMask
 	// y = min(y, flOne)
 	// y = mul(y, flAlmost256)
@@ -172,7 +261,112 @@ const (
 	MINPS X4, X2
 	MULPS X3, X2
 	`
+	flClampAndScale65536 = `
+	// y = x & flSignMask
+	// y = min(y, flOne)
+	// y = mul(y, flAlmost65536)
+	MOVOU X5, X2
+	ANDPS X1, X2
+	MINPS X4, X2
+	MULPS X3, X2
+	`
 
 	fxConvertToInt32 = `// No-op.`
 	flConvertToInt32 = `CVTPS2PL X2, X2`
+
+	opOverStore4 = `
+	// Blend over the dst's prior value.
+	// SIMD for i in 0..3:
+	//
+	// dstA := uint32(dst[i]) * 0x101
+	// maskA := z@i
+	// outA := dstA*(0xffff-maskA)/0xffff + maskA
+	// dst[i] = uint8(outA >> 8)
+	//
+	// First, set X0 to dstA*(0xffff-maskA).
+	MOVL (DI), X0
+	PSHUFB X8, X0
+	MOVOU X9, X11
+	PSUBL X2, X11
+	PMULLD X11, X0
+	// We implement uint32 division by 0xffff as multiplication by a magic
+	// constant (0x80008001) and then a shift by a magic constant (47).
+	// See TestDivideByFFFF for a justification.
+	//
+	// That multiplication widens from uint32 to uint64, so we have to
+	// duplicate and shift our four uint32s from one XMM register (X0) to
+	// two XMM registers (X0 and X11).
+	//
+	// Move the second and fourth uint32s in X0 to be the first and third
+	// uint32s in X11.
+	MOVOU X0, X11
+	PSRLQ $32, X11
+	// Multiply by magic, shift by magic.
+	//
+	// pmuludq %xmm10,%xmm0
+	// pmuludq %xmm10,%xmm11
+	BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xf4; BYTE $0xc2
+	BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0xf4; BYTE $0xda
+	PSRLQ $47, X0
+	PSRLQ $47, X11
+	// Merge the two registers back to one, X11.
+	PSLLQ $32, X11
+	XORPS X0, X11
+	// Add maskA, shift from 16 bit color to 8 bit color.
+	PADDD X11, X2
+	PSRLQ $8, X2
+	// As per opSrcStore4, shuffle and copy the low 4 bytes.
+	PSHUFB X6, X2
+	MOVL X2, (DI)
+	`
+	opSrcStore4 = `
+	// z = shuffleTheLowBytesOfEach4ByteElement(z)
+	// copy(dst[:4], low4BytesOf(z))
+	PSHUFB X6, X2
+	MOVL X2, (DI)
+	`
+
+	opOverStore1 = `
+	// Blend over the dst's prior value.
+	//
+	// dstA := uint32(dst[0]) * 0x101
+	// maskA := z
+	// outA := dstA*(0xffff-maskA)/0xffff + maskA
+	// dst[0] = uint8(outA >> 8)
+	MOVBLZX (DI), R12
+	IMULL $0x101, R12
+	MOVL X2, R13
+	MOVL $0xffff, AX
+	SUBL R13, AX
+	MULL R12 // MULL's implicit arg is AX, and the result is stored in DX:AX.
+	MOVL $0x80008001, BX // Divide by 0xffff is to first multiply by a magic constant...
+	MULL BX // MULL's implicit arg is AX, and the result is stored in DX:AX.
+	SHRL $15, DX // ...and then shift by another magic constant (47 - 32 = 15).
+	ADDL DX, R13
+	SHRL $8, R13
+	MOVB R13, (DI)
+	`
+	opSrcStore1 = `
+	// dst[0] = uint8(z)
+	MOVL X2, BX
+	MOVB BX, (DI)
+	`
+
+	opOverXMM8 = `scatterAndMulBy0x101`
+	opSrcXMM8  = `-`
+
+	opOverXMM9 = `fxAlmost65536`
+	opSrcXMM9  = `-`
+
+	opOverXMM10 = `inverseFFFF`
+	opSrcXMM10  = `-`
+
+	opOverLoadXMMRegs = `
+	// scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask.
+	// fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff.
+	// inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
+ MOVOU scatterAndMulBy0x101<>(SB), X8 + MOVOU fxAlmost65536<>(SB), X9 + MOVOU inverseFFFF<>(SB), X10 + ` + opSrcLoadXMMRegs = `` ) diff --git a/vector/gen_acc_amd64.s.tmpl b/vector/gen_acc_amd64.s.tmpl index 854abe5..71e4769 100644 --- a/vector/gen_acc_amd64.s.tmpl +++ b/vector/gen_acc_amd64.s.tmpl @@ -12,20 +12,50 @@ DATA flAlmost256<>+0x00(SB)/8, $0x437fffff437fffff DATA flAlmost256<>+0x08(SB)/8, $0x437fffff437fffff +DATA flAlmost65536<>+0x00(SB)/8, $0x477fffff477fffff +DATA flAlmost65536<>+0x08(SB)/8, $0x477fffff477fffff DATA flOne<>+0x00(SB)/8, $0x3f8000003f800000 DATA flOne<>+0x08(SB)/8, $0x3f8000003f800000 DATA flSignMask<>+0x00(SB)/8, $0x7fffffff7fffffff DATA flSignMask<>+0x08(SB)/8, $0x7fffffff7fffffff -DATA shuffleMask<>+0x00(SB)/8, $0x0c0804000c080400 -DATA shuffleMask<>+0x08(SB)/8, $0x0c0804000c080400 + +// scatterAndMulBy0x101 is a PSHUFB mask that brings the low four bytes of an +// XMM register to the low byte of that register's four uint32 values. It +// duplicates those bytes, effectively multiplying each uint32 by 0x101. +// +// It transforms a little-endian 16-byte XMM value from +// ijkl???????????? +// to +// ii00jj00kk00ll00 +DATA scatterAndMulBy0x101<>+0x00(SB)/8, $0x8080010180800000 +DATA scatterAndMulBy0x101<>+0x08(SB)/8, $0x8080030380800202 + +// gather is a PSHUFB mask that brings the low byte of the XMM register's four +// uint32 values to the low four bytes of that register. +// +// It transforms a little-endian 16-byte XMM value from +// i???j???k???l??? +// to +// ijkl000000000000 +DATA gather<>+0x00(SB)/8, $0x808080800c080400 +DATA gather<>+0x08(SB)/8, $0x8080808080808080 + DATA fxAlmost256<>+0x00(SB)/8, $0x000000ff000000ff DATA fxAlmost256<>+0x08(SB)/8, $0x000000ff000000ff +DATA fxAlmost65536<>+0x00(SB)/8, $0x0000ffff0000ffff +DATA fxAlmost65536<>+0x08(SB)/8, $0x0000ffff0000ffff +DATA inverseFFFF<>+0x00(SB)/8, $0x8000800180008001 +DATA inverseFFFF<>+0x08(SB)/8, $0x8000800180008001 GLOBL flAlmost256<>(SB), (NOPTR+RODATA), $16 +GLOBL flAlmost65536<>(SB), (NOPTR+RODATA), $16 GLOBL flOne<>(SB), (NOPTR+RODATA), $16 GLOBL flSignMask<>(SB), (NOPTR+RODATA), $16 -GLOBL shuffleMask<>(SB), (NOPTR+RODATA), $16 +GLOBL scatterAndMulBy0x101<>(SB), (NOPTR+RODATA), $16 +GLOBL gather<>(SB), (NOPTR+RODATA), $16 GLOBL fxAlmost256<>(SB), (NOPTR+RODATA), $16 +GLOBL fxAlmost65536<>(SB), (NOPTR+RODATA), $16 +GLOBL inverseFFFF<>(SB), (NOPTR+RODATA), $16 // func haveSSE4_1() bool TEXT ·haveSSE4_1(SB), NOSPLIT, $0 @@ -49,36 +79,41 @@ TEXT ·haveSSE4_1(SB), NOSPLIT, $0 // xmm3 {{.XMM3}} // xmm4 {{.XMM4}} // xmm5 {{.XMM5}} -// xmm6 shuffleMask +// xmm6 gather // xmm7 offset +// xmm8 {{.XMM8}} +// xmm9 {{.XMM9}} +// xmm10 {{.XMM10}} TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48 MOVQ dst_base+0(FP), DI MOVQ dst_len+8(FP), BX MOVQ src_base+24(FP), SI - MOVQ src_len+32(FP), CX + MOVQ src_len+32(FP), R10 // Sanity check that len(dst) >= len(src). - CMPQ BX, CX + CMPQ BX, R10 JLT {{.ShortName}}End - // CX = len(src) &^ 3 - // DX = len(src) - MOVQ CX, DX - ANDQ $-4, CX + // R10 = len(src) &^ 3 + // R11 = len(src) + MOVQ R10, R11 + ANDQ $-4, R10 {{.Setup}} - // shuffleMask := XMM(0x0c080400 repeated four times) // PSHUFB shuffle mask. - // offset := XMM(0x00000000 repeated four times) // Cumulative sum. - MOVOU shuffleMask<>(SB), X6 + {{.LoadXMMRegs}} + + // gather := XMM(see above) // PSHUFB shuffle mask. + // offset := XMM(0x00000000 repeated four times) // Cumulative sum. 
+ MOVOU gather<>(SB), X6 XORPS X7, X7 // i := 0 - MOVQ $0, AX + MOVQ $0, R9 {{.ShortName}}Loop4: // for i < (len(src) &^ 3) - CMPQ AX, CX + CMPQ R9, R10 JAE {{.ShortName}}Loop1 // x = XMM(s0, s1, s2, s3) @@ -107,10 +142,7 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48 // z = convertToInt32(y) {{.ConvertToInt32}} - // z = shuffleTheLowBytesOfEach4ByteElement(z) - // copy(dst[:4], low4BytesOf(z)) - PSHUFB X6, X2 - MOVL X2, (DI) + {{.Store4}} // offset = XMM(x@3, x@3, x@3, x@3) MOVOU X1, X7 @@ -119,14 +151,14 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48 // i += 4 // dst = dst[4:] // src = src[4:] - ADDQ $4, AX + ADDQ $4, R9 ADDQ $4, DI ADDQ $16, SI JMP {{.ShortName}}Loop4 {{.ShortName}}Loop1: // for i < len(src) - CMPQ AX, DX + CMPQ R9, R11 JAE {{.ShortName}}Cleanup // x = src[i] + offset @@ -138,9 +170,7 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48 // z = convertToInt32(y) {{.ConvertToInt32}} - // dst[0] = uint8(z) - MOVL X2, BX - MOVB BX, (DI) + {{.Store1}} // offset = x MOVOU X1, X7 @@ -148,7 +178,7 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48 // i += 1 // dst = dst[1:] // src = src[1:] - ADDQ $1, AX + ADDQ $1, R9 ADDQ $1, DI ADDQ $4, SI JMP {{.ShortName}}Loop1 diff --git a/vector/raster_fixed.go b/vector/raster_fixed.go index 40c17aa..21e10d6 100644 --- a/vector/raster_fixed.go +++ b/vector/raster_fixed.go @@ -200,6 +200,11 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) { } func fixedAccumulateOpOver(dst []uint8, src []uint32) { + // Sanity check that len(dst) >= len(src). + if len(dst) < len(src) { + return + } + acc := int2ϕ(0) for i, v := range src { acc += int2ϕ(v) diff --git a/vector/raster_floating.go b/vector/raster_floating.go index 50621e1..2ed426a 100644 --- a/vector/raster_floating.go +++ b/vector/raster_floating.go @@ -146,6 +146,11 @@ const ( ) func floatingAccumulateOpOver(dst []uint8, src []float32) { + // Sanity check that len(dst) >= len(src). + if len(dst) < len(src) { + return + } + acc := float32(0) for i, v := range src { acc += v diff --git a/vector/vector.go b/vector/vector.go index aed62d2..ee7300d 100644 --- a/vector/vector.go +++ b/vector/vector.go @@ -322,9 +322,17 @@ func (z *Rasterizer) rasterizeDstAlphaSrcOpaqueOpOver(dst *image.Alpha, r image. // We bypass the z.accumulateMask step and convert straight from // z.bufF32 or z.bufU32 to dst.Pix. if z.useFloatingPointMath { - floatingAccumulateOpOver(dst.Pix, z.bufF32) + if haveFloatingAccumulateSIMD { + floatingAccumulateOpOverSIMD(dst.Pix, z.bufF32) + } else { + floatingAccumulateOpOver(dst.Pix, z.bufF32) + } } else { - fixedAccumulateOpOver(dst.Pix, z.bufU32) + if haveFixedAccumulateSIMD { + fixedAccumulateOpOverSIMD(dst.Pix, z.bufU32) + } else { + fixedAccumulateOpOver(dst.Pix, z.bufU32) + } } return }
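The scatterAndMulBy0x101 and gather comments above are easier to check with PSHUFB's semantics spelled out: each destination byte is the source byte selected by the low four bits of the corresponding mask byte, or zero when the mask byte's high bit is set. A minimal, standalone sketch (not part of the patch; the pshufb helper is a hypothetical model of the instruction, written only to exercise the two masks):

```go
package main

import "fmt"

// pshufb models the SSSE3 PSHUFB instruction on a 16-byte vector.
func pshufb(src, mask [16]byte) [16]byte {
	var dst [16]byte
	for i, m := range mask {
		if m&0x80 == 0 { // High bit clear: copy the selected source byte.
			dst[i] = src[m&0x0f]
		} // High bit set: leave dst[i] as zero.
	}
	return dst
}

func main() {
	// The DATA quadwords are little-endian, so the 16 mask bytes are each
	// 8-byte constant written out low byte first.
	scatterAndMulBy0x101 := [16]byte{
		0x00, 0x00, 0x80, 0x80, 0x01, 0x01, 0x80, 0x80, // $0x8080010180800000
		0x02, 0x02, 0x80, 0x80, 0x03, 0x03, 0x80, 0x80, // $0x8080030380800202
	}
	gather := [16]byte{
		0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80, // $0x808080800c080400
		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // $0x8080808080808080
	}

	// ijkl???????????? -> ii00jj00kk00ll00
	src := [16]byte{'i', 'j', 'k', 'l', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?'}
	fmt.Printf("% x\n", pshufb(src, scatterAndMulBy0x101))

	// i???j???k???l??? -> ijkl000000000000
	src = [16]byte{'i', '?', '?', '?', 'j', '?', '?', '?', 'k', '?', '?', '?', 'l', '?', '?', '?'}
	fmt.Printf("% x\n", pshufb(src, gather))
}
```

The scatter mask writes each source byte into the low two bytes of its destination uint32, which is exactly the dstA := uint32(dst[i]) * 0x101 step of the blend; the gather mask undoes it after the arithmetic.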
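The opOver store paths are the heart of the patch: outA := dstA*(0xffff-maskA)/0xffff + maskA, with the division replaced by the multiply-and-shift trick that TestDivideByFFFF verifies. A scalar sketch of the same arithmetic, under the patch's own constants (blendOverAlpha is a hypothetical name, not a function in this package):

```go
package main

import "fmt"

// blendOverAlpha blends a 16-bit coverage value maskA over an 8-bit prior
// alpha, following the formula in the opOverStore1 comments:
//
//	dstA := uint32(dst) * 0x101
//	outA := dstA*(0xffff-maskA)/0xffff + maskA
//	result = uint8(outA >> 8)
//
// The division by 0xffff uses the magic constants from the assembly:
// (u * 0x80008001) >> 47 equals u / 0xffff for every uint32 u.
func blendOverAlpha(dst uint8, maskA uint32) uint8 {
	dstA := uint32(dst) * 0x101              // Scale 8-bit alpha up to 16-bit.
	u := uint64(dstA) * uint64(0xffff-maskA) // At most 0xffff * 0xffff; fits a uint32.
	q := uint32((u * 0x80008001) >> 47)      // u / 0xffff via multiply-and-shift.
	return uint8((q + maskA) >> 8)           // 16-bit color back down to 8-bit.
}

func main() {
	for _, c := range []struct {
		dst  uint8
		mask uint32
	}{
		{0x00, 0x0000}, {0x40, 0x8000}, {0xff, 0x1234}, {0x7f, 0xffff},
	} {
		// Compare against the exact division.
		dstA := uint32(c.dst) * 0x101
		want := uint8((dstA*(0xffff-c.mask)/0xffff + c.mask) >> 8)
		got := blendOverAlpha(c.dst, c.mask)
		fmt.Printf("dst=%#02x mask=%#04x: got %#02x, want %#02x\n", c.dst, c.mask, got, want)
	}
}
```

In the four-lane SIMD version the same division has to go through PMULUDQ, which widens only two uint32s at a time to uint64, hence the duplicate-and-shift dance across X0 and X11 in opOverStore4.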