From e78c45720c450ce81f17c669d3ea2e0e8e9c369e Mon Sep 17 00:00:00 2001
From: Nigel Tao
Date: Thu, 13 Oct 2016 14:50:52 +1100
Subject: [PATCH] vector: add SIMD versions of xxxAccumulateMask.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

name                      old time/op  new time/op  delta
GlyphAlphaLoose16Over-8   3.96µs ± 0%  3.64µs ± 1%   -8.08%  (p=0.000 n=8+10)
GlyphAlphaLoose16Src-8    3.64µs ± 0%  3.35µs ± 0%   -7.88%  (p=0.000 n=8+9)
GlyphAlphaLoose32Over-8   8.45µs ± 0%  6.74µs ± 0%  -20.22%  (p=0.000 n=8+9)
GlyphAlphaLoose32Src-8    7.24µs ± 0%  5.54µs ± 1%  -23.48%  (p=0.000 n=8+10)
GlyphAlphaLoose64Over-8   22.2µs ± 0%  17.4µs ± 0%  -21.67%  (p=0.000 n=9+9)
GlyphAlphaLoose64Src-8    17.6µs ± 1%  12.2µs ± 1%  -30.32%  (p=0.000 n=10+10)
GlyphAlphaLoose128Over-8  67.9µs ± 0%  53.3µs ± 1%  -21.53%  (p=0.000 n=10+10)
GlyphAlphaLoose128Src-8   48.2µs ± 0%  32.6µs ± 2%  -32.41%  (p=0.000 n=9+10)
GlyphAlphaLoose256Over-8   242µs ± 1%   187µs ± 1%  -22.96%  (p=0.000 n=9+9)
GlyphAlphaLoose256Src-8    163µs ± 0%   105µs ± 1%  -35.83%  (p=0.000 n=9+9)
GlyphRGBA16Over-8         5.25µs ± 1%  4.95µs ± 0%   -5.78%  (p=0.000 n=9+9)
GlyphRGBA16Src-8          4.72µs ± 0%  4.43µs ± 1%   -6.22%  (p=0.000 n=9+10)
GlyphRGBA32Over-8         13.5µs ± 0%  11.9µs ± 1%  -12.19%  (p=0.000 n=9+10)
GlyphRGBA32Src-8          11.5µs ± 1%   9.8µs ± 0%  -14.72%  (p=0.000 n=9+9)
GlyphRGBA64Over-8         42.0µs ± 2%  36.9µs ± 1%  -12.19%  (p=0.000 n=10+10)
GlyphRGBA64Src-8          34.1µs ± 1%  28.5µs ± 0%  -16.25%  (p=0.000 n=9+7)
GlyphRGBA128Over-8         149µs ± 2%   133µs ± 1%  -10.24%  (p=0.000 n=10+9)
GlyphRGBA128Src-8          115µs ± 1%    99µs ± 1%  -13.57%  (p=0.000 n=9+10)
GlyphRGBA256Over-8         566µs ± 0%   511µs ± 1%   -9.85%  (p=0.000 n=9+10)
GlyphRGBA256Src-8          435µs ± 0%   372µs ± 0%  -14.64%  (p=0.000 n=9+8)
GlyphNRGBA16Over-8        26.9µs ± 3%  26.0µs ± 3%   -3.55%  (p=0.000 n=10+9)
GlyphNRGBA16Src-8         18.8µs ± 2%  18.4µs ± 2%   -2.21%  (p=0.000 n=9+10)
GlyphNRGBA32Over-8        99.1µs ± 2%  95.9µs ± 3%   -3.23%  (p=0.000 n=10+10)
GlyphNRGBA32Src-8         65.6µs ± 3%  62.8µs ± 2%   -4.36%  (p=0.000 n=10+10)
GlyphNRGBA64Over-8         376µs ± 4%   370µs ± 2%     ~     (p=0.063 n=10+10)
GlyphNRGBA64Src-8          238µs ± 3%   233µs ± 1%   -2.21%  (p=0.000 n=9+10)
GlyphNRGBA128Over-8       1.52ms ± 2%  1.48ms ± 0%   -2.11%  (p=0.000 n=10+8)
GlyphNRGBA128Src-8         951µs ± 3%   935µs ± 1%   -1.69%  (p=0.013 n=10+9)
GlyphNRGBA256Over-8       6.00ms ± 1%  5.87ms ± 3%   -2.12%  (p=0.002 n=10+10)
GlyphNRGBA256Src-8        3.94ms ± 2%  3.80ms ± 2%   -3.64%  (p=0.000 n=10+10)

A comparison of the non-SIMD and SIMD versions:

name                            time/op
FixedAccumulateMask16-8          237ns ± 0%
FixedAccumulateMaskSIMD16-8     80.0ns ± 1%
FloatingAccumulateMask16-8       413ns ± 2%
FloatingAccumulateMaskSIMD16-8   166ns ± 0%
FixedAccumulateMask64-8         3.42µs ± 0%
FixedAccumulateMaskSIMD64-8     1.09µs ± 0%
FloatingAccumulateMask64-8      6.92µs ± 0%
FloatingAccumulateMaskSIMD64-8  2.47µs ± 1%

Change-Id: Ib6980e5975ed2842ff2a372f76dd5f2e95c5526c
Reviewed-on: https://go-review.googlesource.com/30898
Reviewed-by: David Crawshaw
---
 vector/acc_amd64.go         |   6 +
 vector/acc_amd64.s          | 345 +++++++++++++++++++++++++++++++++---
 vector/acc_other.go         |   2 +
 vector/acc_test.go          |  28 ++-
 vector/gen.go               | 155 +++++++++++++---
 vector/gen_acc_amd64.s.tmpl |  28 +--
 vector/raster_floating.go   |   5 +
 vector/vector.go            |  12 +-
 8 files changed, 507 insertions(+), 74 deletions(-)
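Background for the diffs below: a mask accumulator turns the rasterizer's
per-pixel signed coverage deltas into a cumulative, absolute, clamped
coverage buffer. The following is a minimal scalar sketch of the fixed-point
variant that the new fixedAccumulateMaskSIMD mirrors; the real
fixedAccumulateMask lives in raster_fixed.go (not shown in this patch), and
the shift of 4 comes from the assembly comments (2*ϕ - 16 with ϕ = 10).

	// Sketch only; illustrative, not the upstream implementation.
	package main

	import "fmt"

	func accumulateMaskScalar(buf []uint32) {
		acc := int32(0)
		for i, v := range buf {
			acc += int32(v) // buf holds signed deltas in two's complement.
			a := acc
			if a < 0 {
				a = -a // y = abs(x)
			}
			u := uint32(a) >> 4 // y >>= 2*ϕ - 16
			if u > 0xffff {
				u = 0xffff // y = min(y, fxAlmost65536)
			}
			buf[i] = u // rewritten in place as 16-bit coverage.
		}
	}

	func main() {
		buf := []uint32{0x400, 0x400, uint32(int32(-0x800))}
		accumulateMaskScalar(buf)
		fmt.Println(buf) // [64 128 0]
	}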
diff --git a/vector/acc_amd64.go b/vector/acc_amd64.go
index 40d2448..cf535fc 100644
--- a/vector/acc_amd64.go
+++ b/vector/acc_amd64.go
@@ -20,8 +20,14 @@ func fixedAccumulateOpOverSIMD(dst []uint8, src []uint32)
 //go:noescape
 func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32)
 
+//go:noescape
+func fixedAccumulateMaskSIMD(buf []uint32)
+
 //go:noescape
 func floatingAccumulateOpOverSIMD(dst []uint8, src []float32)
 
 //go:noescape
 func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32)
+
+//go:noescape
+func floatingAccumulateMaskSIMD(dst []uint32, src []float32)
diff --git a/vector/acc_amd64.s b/vector/acc_amd64.s
index f8e03e3..31b9c6e 100644
--- a/vector/acc_amd64.s
+++ b/vector/acc_amd64.s
@@ -83,6 +83,7 @@ TEXT ·haveSSE4_1(SB), NOSPLIT, $0
 // xmm9	fxAlmost65536
 // xmm10	inverseFFFF
 TEXT ·fixedAccumulateOpOverSIMD(SB), NOSPLIT, $0-48
+
 	MOVQ dst_base+0(FP), DI
 	MOVQ dst_len+8(FP), BX
 	MOVQ src_base+24(FP), SI
@@ -100,16 +101,16 @@ TEXT ·fixedAccumulateOpOverSIMD(SB), NOSPLIT, $0-48
 	// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16.
 	MOVOU fxAlmost65536<>(SB), X5
 
+	// gather               := XMM(see above)                      // PSHUFB shuffle mask.
 	// scatterAndMulBy0x101 := XMM(see above)                      // PSHUFB shuffle mask.
 	// fxAlmost65536        := XMM(0x0000ffff repeated four times) // 0xffff.
 	// inverseFFFF          := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
+	MOVOU gather<>(SB), X6
 	MOVOU scatterAndMulBy0x101<>(SB), X8
 	MOVOU fxAlmost65536<>(SB), X9
 	MOVOU inverseFFFF<>(SB), X10
 
-	// gather := XMM(see above) // PSHUFB shuffle mask.
 	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
-	MOVOU gather<>(SB), X6
 	XORPS X7, X7
 
 	// i := 0
@@ -222,7 +223,7 @@ fxAccOpOverLoop4:
 fxAccOpOverLoop1:
 	// for i < len(src)
 	CMPQ R9, R11
-	JAE  fxAccOpOverCleanup
+	JAE  fxAccOpOverEnd
 
 	// x = src[i] + offset
 	MOVL (SI), X1
@@ -276,9 +277,6 @@ fxAccOpOverLoop1:
 	ADDQ $4, SI
 	JMP  fxAccOpOverLoop1
 
-fxAccOpOverCleanup:
-	// No-op.
-
 fxAccOpOverEnd:
 	RET
 
@@ -301,6 +299,7 @@ fxAccOpOverEnd:
 // xmm9	-
 // xmm10	-
TEXT ·fixedAccumulateOpSrcSIMD(SB), NOSPLIT, $0-48
+
 	MOVQ dst_base+0(FP), DI
 	MOVQ dst_len+8(FP), BX
 	MOVQ src_base+24(FP), SI
@@ -318,9 +317,10 @@ TEXT ·fixedAccumulateOpSrcSIMD(SB), NOSPLIT, $0-48
 	// fxAlmost256 := XMM(0x000000ff repeated four times) // Maximum of an uint8.
 	MOVOU fxAlmost256<>(SB), X5
 
-	// gather := XMM(see above) // PSHUFB shuffle mask.
-	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
+	// gather := XMM(see above) // PSHUFB shuffle mask.
 	MOVOU gather<>(SB), X6
+
+	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
 	XORPS X7, X7
 
 	// i := 0
@@ -390,7 +390,7 @@ fxAccOpSrcLoop4:
 fxAccOpSrcLoop1:
 	// for i < len(src)
 	CMPQ R9, R11
-	JAE  fxAccOpSrcCleanup
+	JAE  fxAccOpSrcEnd
 
 	// x = src[i] + offset
 	MOVL (SI), X1
@@ -429,10 +429,149 @@ fxAccOpSrcLoop1:
 	ADDQ $4, SI
 	JMP  fxAccOpSrcLoop1
 
-fxAccOpSrcCleanup:
+fxAccOpSrcEnd:
+	RET
+
+// ----------------------------------------------------------------------------
+
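The new mask function added next reuses the 4-lane inclusive prefix sum that
the op variants already perform: PSLLO $4 shifts the register left by one
32-bit lane, SHUFPS $0x40 builds XMM(0, 0, x@0, x@1), and two adds complete
the scan. A sketch of just that lane arithmetic, carried across blocks by
the broadcast "offset":

	// Sketch of the PSLLO $4 / SHUFPS $0x40 shift-and-add steps.
	package main

	import "fmt"

	func prefixSum4(x [4]uint32, offset uint32) (sums [4]uint32, nextOffset uint32) {
		// Step 1: x += (x shifted left by one lane), i.e. PSLLO $4 then PADDD.
		x[3] += x[2]
		x[2] += x[1]
		x[1] += x[0]
		// Step 2: x += XMM(0, 0, x@0, x@1), i.e. SHUFPS $0x40 then PADDD.
		x[3] += x[1]
		x[2] += x[0]
		// Carry in the running total from the previous block of four.
		for i := range x {
			x[i] += offset
		}
		// SHUFPS $0xff broadcasts the last lane as the next block's offset.
		return x, x[3]
	}

	func main() {
		sums, next := prefixSum4([4]uint32{1, 2, 3, 4}, 0)
		fmt.Println(sums, next) // [1 3 6 10] 10
	}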
+// func fixedAccumulateMaskSIMD(buf []uint32)
+//
+// XMM registers. Variable names are per
+// https://github.com/google/font-rs/blob/master/src/accumulate.c
+//
+// xmm0	scratch
+// xmm1	x
+// xmm2	y, z
+// xmm3	-
+// xmm4	-
+// xmm5	fxAlmost65536
+// xmm6	-
+// xmm7	offset
+// xmm8	-
+// xmm9	-
+// xmm10	-
+TEXT ·fixedAccumulateMaskSIMD(SB), NOSPLIT, $0-24
+
+	MOVQ buf_base+0(FP), DI
+	MOVQ buf_len+8(FP), BX
+	MOVQ buf_base+0(FP), SI
+	MOVQ buf_len+8(FP), R10
+
+	// R10 = len(src) &^ 3
+	// R11 = len(src)
+	MOVQ R10, R11
+	ANDQ $-4, R10
+
+	// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16.
+	MOVOU fxAlmost65536<>(SB), X5
+
+	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
+	XORPS X7, X7
+
+	// i := 0
+	MOVQ $0, R9
+
+fxAccMaskLoop4:
+	// for i < (len(src) &^ 3)
+	CMPQ R9, R10
+	JAE  fxAccMaskLoop1
+
+	// x = XMM(s0, s1, s2, s3)
+	//
+	// Where s0 is src[i+0], s1 is src[i+1], etc.
+	MOVOU (SI), X1
+
+	// scratch = XMM(0, s0, s1, s2)
+	// x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
+	MOVOU X1, X0
+	PSLLO $4, X0
+	PADDD X0, X1
+
+	// scratch = XMM(0, 0, 0, 0)
+	// scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
+	// x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
+	XORPS X0, X0
+	SHUFPS $0x40, X1, X0
+	PADDD X0, X1
+
+	// x += offset
+	PADDD X7, X1
+
+	// y = abs(x)
+	// y >>= 4 // Shift by 2*ϕ - 16.
+	// y = min(y, fxAlmost65536)
+	//
+	// pabsd  %xmm1,%xmm2
+	// psrld  $0x4,%xmm2
+	// pminud %xmm5,%xmm2
+	//
+	// Hopefully we'll get these opcode mnemonics into the assembler for Go
+	// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
+	// it's similar.
+	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
+	BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x04
+	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+
+	// z = convertToInt32(y)
 	// No-op.
 
-fxAccOpSrcEnd:
+	// copy(dst[:4], z)
+	MOVOU X2, (DI)
+
+	// offset = XMM(x@3, x@3, x@3, x@3)
+	MOVOU X1, X7
+	SHUFPS $0xff, X1, X7
+
+	// i += 4
+	// dst = dst[4:]
+	// src = src[4:]
+	ADDQ $4, R9
+	ADDQ $16, DI
+	ADDQ $16, SI
+	JMP  fxAccMaskLoop4
+
+fxAccMaskLoop1:
+	// for i < len(src)
+	CMPQ R9, R11
+	JAE  fxAccMaskEnd
+
+	// x = src[i] + offset
+	MOVL (SI), X1
+	PADDD X7, X1
+
+	// y = abs(x)
+	// y >>= 4 // Shift by 2*ϕ - 16.
+	// y = min(y, fxAlmost65536)
+	//
+	// pabsd  %xmm1,%xmm2
+	// psrld  $0x4,%xmm2
+	// pminud %xmm5,%xmm2
+	//
+	// Hopefully we'll get these opcode mnemonics into the assembler for Go
+	// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
+	// it's similar.
+	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
+	BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x04
+	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+
+	// z = convertToInt32(y)
+	// No-op.
+
+	// dst[0] = uint32(z)
+	MOVL X2, (DI)
+
+	// offset = x
+	MOVOU X1, X7
+
+	// i += 1
+	// dst = dst[1:]
+	// src = src[1:]
+	ADDQ $1, R9
+	ADDQ $4, DI
+	ADDQ $4, SI
+	JMP  fxAccMaskLoop1
+
+fxAccMaskEnd:
 	RET
 
 // ----------------------------------------------------------------------------
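The three BYTE-encoded instructions above are hand-assembled because, at the
time of this CL, the Go assembler had no mnemonics for these SSE4.1/SSSE3
ops (hence the reference to golang.org/issue/16007). Spelled out, the byte
sequences in the diff decode as follows:

	// The encodings used above, mapped back to their mnemonics.
	package main

	import "fmt"

	func main() {
		for _, op := range []struct {
			asm string
			enc []byte
		}{
			{"pabsd  %xmm1,%xmm2", []byte{0x66, 0x0f, 0x38, 0x1e, 0xd1}},
			{"psrld  $0x4,%xmm2", []byte{0x66, 0x0f, 0x72, 0xd2, 0x04}},
			{"pminud %xmm5,%xmm2", []byte{0x66, 0x0f, 0x38, 0x3b, 0xd5}},
		} {
			fmt.Printf("%-20s % x\n", op.asm, op.enc)
		}
	}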
@@ -454,6 +593,7 @@ fxAccOpSrcEnd:
 // xmm9	fxAlmost65536
 // xmm10	inverseFFFF
 TEXT ·floatingAccumulateOpOverSIMD(SB), NOSPLIT, $8-48
+
 	MOVQ dst_base+0(FP), DI
 	MOVQ dst_len+8(FP), BX
 	MOVQ src_base+24(FP), SI
@@ -468,12 +608,12 @@ TEXT ·floatingAccumulateOpOverSIMD(SB), NOSPLIT, $8-48
 	MOVQ R10, R11
 	ANDQ $-4, R10
 
-	// Set MXCSR bits 13 and 14, so that the CVTPS2PL below is "Round To Zero".
+	// Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
+	// "Round To Zero".
 	STMXCSR mxcsrOrig-8(SP)
 	MOVL mxcsrOrig-8(SP), AX
 	ORL $0x6000, AX
 	MOVL AX, mxcsrNew-4(SP)
-	LDMXCSR mxcsrNew-4(SP)
 
 	// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
 	// flOne         := XMM(0x3f800000 repeated four times) // 1 as a float32.
@@ -482,16 +622,16 @@ TEXT ·floatingAccumulateOpOverSIMD(SB), NOSPLIT, $8-48
 	MOVOU flOne<>(SB), X4
 	MOVOU flSignMask<>(SB), X5
 
+	// gather               := XMM(see above)                      // PSHUFB shuffle mask.
 	// scatterAndMulBy0x101 := XMM(see above)                      // PSHUFB shuffle mask.
 	// fxAlmost65536        := XMM(0x0000ffff repeated four times) // 0xffff.
 	// inverseFFFF          := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
+	MOVOU gather<>(SB), X6
 	MOVOU scatterAndMulBy0x101<>(SB), X8
 	MOVOU fxAlmost65536<>(SB), X9
 	MOVOU inverseFFFF<>(SB), X10
 
-	// gather := XMM(see above) // PSHUFB shuffle mask.
 	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
-	MOVOU gather<>(SB), X6
 	XORPS X7, X7
 
 	// i := 0
@@ -532,7 +672,9 @@ flAccOpOverLoop4:
 	MULPS X3, X2
 
 	// z = convertToInt32(y)
+	LDMXCSR mxcsrNew-4(SP)
 	CVTPS2PL X2, X2
+	LDMXCSR mxcsrOrig-8(SP)
 
 	// Blend over the dst's prior value. SIMD for i in 0..3:
 	//
@@ -597,7 +739,7 @@ flAccOpOverLoop4:
 flAccOpOverLoop1:
 	// for i < len(src)
 	CMPQ R9, R11
-	JAE  flAccOpOverCleanup
+	JAE  flAccOpOverEnd
 
 	// x = src[i] + offset
 	MOVL (SI), X1
@@ -612,7 +754,9 @@ flAccOpOverLoop1:
 	MULPS X3, X2
 
 	// z = convertToInt32(y)
+	LDMXCSR mxcsrNew-4(SP)
 	CVTPS2PL X2, X2
+	LDMXCSR mxcsrOrig-8(SP)
 
 	// Blend over the dst's prior value.
 	//
@@ -644,9 +788,6 @@ flAccOpOverLoop1:
 	ADDQ $4, SI
 	JMP  flAccOpOverLoop1
 
-flAccOpOverCleanup:
-	LDMXCSR mxcsrOrig-8(SP)
-
 flAccOpOverEnd:
 	RET
 
@@ -669,6 +810,7 @@ flAccOpOverEnd:
 // xmm9	-
 // xmm10	-
 TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48
+
 	MOVQ dst_base+0(FP), DI
 	MOVQ dst_len+8(FP), BX
 	MOVQ src_base+24(FP), SI
@@ -683,12 +825,12 @@ TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48
 	MOVQ R10, R11
 	ANDQ $-4, R10
 
-	// Set MXCSR bits 13 and 14, so that the CVTPS2PL below is "Round To Zero".
+	// Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
+	// "Round To Zero".
 	STMXCSR mxcsrOrig-8(SP)
 	MOVL mxcsrOrig-8(SP), AX
 	ORL $0x6000, AX
 	MOVL AX, mxcsrNew-4(SP)
-	LDMXCSR mxcsrNew-4(SP)
 
 	// flAlmost256 := XMM(0x437fffff repeated four times) // 255.99998 as a float32.
 	// flOne       := XMM(0x3f800000 repeated four times) // 1 as a float32.
@@ -697,9 +839,10 @@ TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48
 	MOVOU flOne<>(SB), X4
 	MOVOU flSignMask<>(SB), X5
 
-	// gather := XMM(see above) // PSHUFB shuffle mask.
-	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
+	// gather := XMM(see above) // PSHUFB shuffle mask.
 	MOVOU gather<>(SB), X6
+
+	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
 	XORPS X7, X7
 
 	// i := 0
@@ -740,7 +883,9 @@ flAccOpSrcLoop4:
 	MULPS X3, X2
 
 	// z = convertToInt32(y)
+	LDMXCSR mxcsrNew-4(SP)
 	CVTPS2PL X2, X2
+	LDMXCSR mxcsrOrig-8(SP)
 
 	// z = shuffleTheLowBytesOfEach4ByteElement(z)
 	// copy(dst[:4], low4BytesOf(z))
@@ -762,7 +907,7 @@ flAccOpSrcLoop4:
 flAccOpSrcLoop1:
 	// for i < len(src)
 	CMPQ R9, R11
-	JAE  flAccOpSrcCleanup
+	JAE  flAccOpSrcEnd
 
 	// x = src[i] + offset
 	MOVL (SI), X1
@@ -777,7 +922,9 @@ flAccOpSrcLoop1:
 	MULPS X3, X2
 
 	// z = convertToInt32(y)
+	LDMXCSR mxcsrNew-4(SP)
 	CVTPS2PL X2, X2
+	LDMXCSR mxcsrOrig-8(SP)
 
 	// dst[0] = uint8(z)
 	MOVL X2, BX
@@ -794,8 +941,152 @@ flAccOpSrcLoop1:
 	ADDQ $4, SI
 	JMP  flAccOpSrcLoop1
 
-flAccOpSrcCleanup:
-	LDMXCSR mxcsrOrig-8(SP)
-
 flAccOpSrcEnd:
 	RET
+
+// ----------------------------------------------------------------------------
+
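Note the MXCSR restructuring in the hunks above: instead of switching the
rounding mode once at entry and restoring it in a Cleanup block, each
CVTPS2PL is now bracketed by LDMXCSR mxcsrNew / LDMXCSR mxcsrOrig, so the
non-default "Round To Zero" mode is live only for the conversion itself.
Round-to-zero truncation is the same rule Go itself uses for float-to-int
conversion, which this tiny sketch illustrates:

	// Truncation vs rounding: CVTPS2PL under MXCSR bits 13-14 = 0b11
	// behaves like Go's float32-to-int32 conversion.
	package main

	import "fmt"

	func main() {
		y := float32(65535.94)
		fmt.Println(int32(y)) // 65535: truncated toward zero, not rounded to 65536.
	}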
+// func floatingAccumulateMaskSIMD(dst []uint32, src []float32)
+//
+// XMM registers. Variable names are per
+// https://github.com/google/font-rs/blob/master/src/accumulate.c
+//
+// xmm0	scratch
+// xmm1	x
+// xmm2	y, z
+// xmm3	flAlmost65536
+// xmm4	flOne
+// xmm5	flSignMask
+// xmm6	-
+// xmm7	offset
+// xmm8	-
+// xmm9	-
+// xmm10	-
+TEXT ·floatingAccumulateMaskSIMD(SB), NOSPLIT, $8-48
+
+	MOVQ dst_base+0(FP), DI
+	MOVQ dst_len+8(FP), BX
+	MOVQ src_base+24(FP), SI
+	MOVQ src_len+32(FP), R10
+
+	// Sanity check that len(dst) >= len(src).
+	CMPQ BX, R10
+	JLT  flAccMaskEnd
+
+	// R10 = len(src) &^ 3
+	// R11 = len(src)
+	MOVQ R10, R11
+	ANDQ $-4, R10
+
+	// Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
+	// "Round To Zero".
+	STMXCSR mxcsrOrig-8(SP)
+	MOVL mxcsrOrig-8(SP), AX
+	ORL $0x6000, AX
+	MOVL AX, mxcsrNew-4(SP)
+
+	// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
+	// flOne         := XMM(0x3f800000 repeated four times) // 1 as a float32.
+	// flSignMask    := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
+	MOVOU flAlmost65536<>(SB), X3
+	MOVOU flOne<>(SB), X4
+	MOVOU flSignMask<>(SB), X5
+
+	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
+	XORPS X7, X7
+
+	// i := 0
+	MOVQ $0, R9
+
+flAccMaskLoop4:
+	// for i < (len(src) &^ 3)
+	CMPQ R9, R10
+	JAE  flAccMaskLoop1
+
+	// x = XMM(s0, s1, s2, s3)
+	//
+	// Where s0 is src[i+0], s1 is src[i+1], etc.
+	MOVOU (SI), X1
+
+	// scratch = XMM(0, s0, s1, s2)
+	// x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
+	MOVOU X1, X0
+	PSLLO $4, X0
+	ADDPS X0, X1
+
+	// scratch = XMM(0, 0, 0, 0)
+	// scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
+	// x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
+	XORPS X0, X0
+	SHUFPS $0x40, X1, X0
+	ADDPS X0, X1
+
+	// x += offset
+	ADDPS X7, X1
+
+	// y = x & flSignMask
+	// y = min(y, flOne)
+	// y = mul(y, flAlmost65536)
+	MOVOU X5, X2
+	ANDPS X1, X2
+	MINPS X4, X2
+	MULPS X3, X2
+
+	// z = convertToInt32(y)
+	LDMXCSR mxcsrNew-4(SP)
+	CVTPS2PL X2, X2
+	LDMXCSR mxcsrOrig-8(SP)
+
+	// copy(dst[:4], z)
+	MOVOU X2, (DI)
+
+	// offset = XMM(x@3, x@3, x@3, x@3)
+	MOVOU X1, X7
+	SHUFPS $0xff, X1, X7
+
+	// i += 4
+	// dst = dst[4:]
+	// src = src[4:]
+	ADDQ $4, R9
+	ADDQ $16, DI
+	ADDQ $16, SI
+	JMP  flAccMaskLoop4
+
+flAccMaskLoop1:
+	// for i < len(src)
+	CMPQ R9, R11
+	JAE  flAccMaskEnd
+
+	// x = src[i] + offset
+	MOVL (SI), X1
+	ADDPS X7, X1
+
+	// y = x & flSignMask
+	// y = min(y, flOne)
+	// y = mul(y, flAlmost65536)
+	MOVOU X5, X2
+	ANDPS X1, X2
+	MINPS X4, X2
+	MULPS X3, X2
+
+	// z = convertToInt32(y)
+	LDMXCSR mxcsrNew-4(SP)
+	CVTPS2PL X2, X2
+	LDMXCSR mxcsrOrig-8(SP)
+
+	// dst[0] = uint32(z)
+	MOVL X2, (DI)
+
+	// offset = x
+	MOVOU X1, X7
+
+	// i += 1
+	// dst = dst[1:]
+	// src = src[1:]
+	ADDQ $1, R9
+	ADDQ $4, DI
+	ADDQ $4, SI
+	JMP  flAccMaskLoop1
+
+flAccMaskEnd:
+	RET
diff --git a/vector/acc_other.go b/vector/acc_other.go
index c12adf8..c98d20b 100644
--- a/vector/acc_other.go
+++ b/vector/acc_other.go
@@ -11,5 +11,7 @@ const haveFloatingAccumulateSIMD = false
 
 func fixedAccumulateOpOverSIMD(dst []uint8, src []uint32)     {}
 func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32)      {}
+func fixedAccumulateMaskSIMD(buf []uint32)                    {}
 func floatingAccumulateOpOverSIMD(dst []uint8, src []float32) {}
 func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32)  {}
+func floatingAccumulateMaskSIMD(dst []uint32, src []float32)  {}
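For reference, the scalar arithmetic that floatingAccumulateMaskSIMD
vectorizes looks roughly like the sketch below; the loop body matches the
floatingAccumulateMask shown in the raster_floating.go hunk further down,
and the constant mirrors flAlmost65536 (255.99998 * 256), which maps clamped
coverage in [0, 1] onto [0, 0xffff] without ever reaching 65536.

	// Sketch only; the upstream scalar code lives in raster_floating.go.
	package main

	import "fmt"

	const almost65536 = 255.99998 * 256

	func accumulateMaskScalar(dst []uint32, src []float32) {
		if len(dst) < len(src) { // mirrors the new JLT sanity check
			return
		}
		acc := float32(0)
		for i, v := range src {
			acc += v
			a := acc
			if a < 0 {
				a = -a // y = x & flSignMask
			}
			if a > 1 {
				a = 1 // y = min(y, flOne)
			}
			dst[i] = uint32(almost65536 * a) // truncating convert, as CVTPS2PL does here.
		}
	}

	func main() {
		dst := make([]uint32, 3)
		accumulateMaskScalar(dst, []float32{0.25, 0.5, -1.0})
		fmt.Println(dst) // [16383 49151 16383]
	}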
diff --git a/vector/acc_test.go b/vector/acc_test.go
index fe0fb21..db238f7 100644
--- a/vector/acc_test.go
+++ b/vector/acc_test.go
@@ -201,7 +201,11 @@ func testAcc(t *testing.T, in interface{}, mask []uint32, op string) {
 			}
 		case "mask":
 			copy(got32, in[:n])
-			fixedAccumulateMask(got32)
+			if simd {
+				fixedAccumulateMaskSIMD(got32)
+			} else {
+				fixedAccumulateMask(got32)
+			}
 		}
 	case []float32:
 		switch op {
@@ -218,7 +222,11 @@ func testAcc(t *testing.T, in interface{}, mask []uint32, op string) {
 				floatingAccumulateOpSrc(got8, in[:n])
 			}
 		case "mask":
-			floatingAccumulateMask(got32, in[:n])
+			if simd {
+				floatingAccumulateMaskSIMD(got32, in[:n])
+			} else {
+				floatingAccumulateMask(got32, in[:n])
+			}
 		}
 	}
 
@@ -264,22 +272,26 @@ func BenchmarkFixedAccumulateOpOverSIMD16(b *testing.B) { benchAcc(b, fxIn16,
 func BenchmarkFixedAccumulateOpSrc16(b *testing.B)         { benchAcc(b, fxIn16, "src", false) }
 func BenchmarkFixedAccumulateOpSrcSIMD16(b *testing.B)     { benchAcc(b, fxIn16, "src", true) }
 func BenchmarkFixedAccumulateMask16(b *testing.B)          { benchAcc(b, fxIn16, "mask", false) }
+func BenchmarkFixedAccumulateMaskSIMD16(b *testing.B)      { benchAcc(b, fxIn16, "mask", true) }
 func BenchmarkFloatingAccumulateOpOver16(b *testing.B)     { benchAcc(b, flIn16, "over", false) }
 func BenchmarkFloatingAccumulateOpOverSIMD16(b *testing.B) { benchAcc(b, flIn16, "over", true) }
 func BenchmarkFloatingAccumulateOpSrc16(b *testing.B)      { benchAcc(b, flIn16, "src", false) }
 func BenchmarkFloatingAccumulateOpSrcSIMD16(b *testing.B)  { benchAcc(b, flIn16, "src", true) }
 func BenchmarkFloatingAccumulateMask16(b *testing.B)       { benchAcc(b, flIn16, "mask", false) }
+func BenchmarkFloatingAccumulateMaskSIMD16(b *testing.B)   { benchAcc(b, flIn16, "mask", true) }
 
 func BenchmarkFixedAccumulateOpOver64(b *testing.B)        { benchAcc(b, fxIn64, "over", false) }
 func BenchmarkFixedAccumulateOpOverSIMD64(b *testing.B)    { benchAcc(b, fxIn64, "over", true) }
 func BenchmarkFixedAccumulateOpSrc64(b *testing.B)         { benchAcc(b, fxIn64, "src", false) }
 func BenchmarkFixedAccumulateOpSrcSIMD64(b *testing.B)     { benchAcc(b, fxIn64, "src", true) }
 func BenchmarkFixedAccumulateMask64(b *testing.B)          { benchAcc(b, fxIn64, "mask", false) }
+func BenchmarkFixedAccumulateMaskSIMD64(b *testing.B)      { benchAcc(b, fxIn64, "mask", true) }
 func BenchmarkFloatingAccumulateOpOver64(b *testing.B)     { benchAcc(b, flIn64, "over", false) }
 func BenchmarkFloatingAccumulateOpOverSIMD64(b *testing.B) { benchAcc(b, flIn64, "over", true) }
 func BenchmarkFloatingAccumulateOpSrc64(b *testing.B)      { benchAcc(b, flIn64, "src", false) }
 func BenchmarkFloatingAccumulateOpSrcSIMD64(b *testing.B)  { benchAcc(b, flIn64, "src", true) }
 func BenchmarkFloatingAccumulateMask64(b *testing.B)       { benchAcc(b, flIn64, "mask", false) }
+func BenchmarkFloatingAccumulateMaskSIMD64(b *testing.B)   { benchAcc(b, flIn64, "mask", true) }
 
 func benchAcc(b *testing.B, in interface{}, op string, simd bool) {
 	var f func()
@@ -308,7 +320,11 @@ func benchAcc(b *testing.B, in interface{}, op string, simd bool) {
 		case "mask":
 			buf := make([]uint32, len(in))
 			copy(buf, in)
-			f = func() { fixedAccumulateMask(buf) }
+			if simd {
+				f = func() { fixedAccumulateMaskSIMD(buf) }
+			} else {
+				f = func() { fixedAccumulateMask(buf) }
+			}
 		}
 
 	case []float32:
@@ -333,7 +349,11 @@ func benchAcc(b *testing.B, in interface{}, op string, simd bool) {
 			}
 		case "mask":
 			dst := make([]uint32, len(in))
-			f = func() { floatingAccumulateMask(dst, in) }
+			if simd {
+				f = func() { floatingAccumulateMaskSIMD(dst, in) }
+			} else {
+				f = func() { floatingAccumulateMask(dst, in) }
+			}
 		}
 	}
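The simd flag threaded through testAcc above is what cross-checks the new
assembly against the portable code. A standalone sketch of that same idea,
with a hypothetical test name, assuming in-package access to the
accumulators and the haveFixedAccumulateSIMD flag:

	// Sketch: require the SIMD and scalar mask accumulators to agree.
	package vector

	import (
		"math/rand"
		"reflect"
		"testing"
	)

	func TestFixedAccumulateMaskSIMDMatchesScalar(t *testing.T) {
		if !haveFixedAccumulateSIMD {
			t.Skip("no SIMD implementation on this platform")
		}
		rng := rand.New(rand.NewSource(1))
		// 53 is deliberately not a multiple of 4, to exercise the Loop1 tail.
		orig := make([]uint32, 53)
		for i := range orig {
			orig[i] = rng.Uint32() >> 16
		}
		want := append([]uint32(nil), orig...)
		fixedAccumulateMask(want)
		got := append([]uint32(nil), orig...)
		fixedAccumulateMaskSIMD(got)
		if !reflect.DeepEqual(got, want) {
			t.Errorf("SIMD and scalar mask accumulation differ:\ngot  %v\nwant %v", got, want)
		}
	}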
diff --git a/vector/gen.go b/vector/gen.go
index 22cf39d..355226a 100644
--- a/vector/gen.go
+++ b/vector/gen.go
@@ -10,6 +10,7 @@ import (
 	"bytes"
 	"io/ioutil"
 	"log"
+	"strings"
 	"text/template"
 )
 
@@ -54,6 +55,9 @@ func main() {
 		if i != 0 {
 			out.WriteString("\n")
 		}
+		if strings.Contains(v.LoadArgs, "{{.ShortName}}") {
+			v.LoadArgs = strings.Replace(v.LoadArgs, "{{.ShortName}}", v.ShortName, -1)
+		}
 		if err := t.Execute(out, v); err != nil {
 			log.Fatalf("Execute(%q): %v", v.ShortName, err)
 		}
@@ -68,15 +72,19 @@ var instances = []struct {
 	LongName      string
 	ShortName     string
 	FrameSize     string
-	SrcType       string
+	ArgsSize      string
+	Args          string
+	DstElemSize1  int
+	DstElemSize4  int
 	XMM3          string
 	XMM4          string
 	XMM5          string
+	XMM6          string
 	XMM8          string
 	XMM9          string
 	XMM10         string
+	LoadArgs      string
 	Setup         string
-	Cleanup       string
 	LoadXMMRegs   string
 	Add           string
 	ClampAndScale string
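The strings.Replace change above is the subtle part of the generator: a
field value (LoadArgs) may itself contain a {{.ShortName}} placeholder, so
main now performs a two-pass expansion — substitute the embedded
placeholder first, then execute the outer template. A minimal sketch of
that pattern, with illustrative names:

	// Sketch of gen.go's two-pass template expansion.
	package main

	import (
		"os"
		"strings"
		"text/template"
	)

	type instance struct {
		ShortName string
		LoadArgs  string
	}

	func main() {
		v := instance{
			ShortName: "fxAccMask",
			LoadArgs:  "JLT {{.ShortName}}End",
		}
		// First pass: expand the placeholder embedded in the field value.
		v.LoadArgs = strings.Replace(v.LoadArgs, "{{.ShortName}}", v.ShortName, -1)
		// Second pass: ordinary template execution over the whole struct.
		t := template.Must(template.New("asm").Parse("TEXT {{.ShortName}}: {{.LoadArgs}}\n"))
		if err := t.Execute(os.Stdout, v); err != nil {
			panic(err) // prints: TEXT fxAccMask: JLT fxAccMaskEnd
		}
	}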
@@ -87,16 +95,20 @@ var instances = []struct {
 	LongName:       "fixedAccumulateOpOver",
 	ShortName:      "fxAccOpOver",
 	FrameSize:      fxFrameSize,
-	SrcType:        fxSrcType,
+	ArgsSize:       twoArgArgsSize,
+	Args:           "dst []uint8, src []uint32",
+	DstElemSize1:   1 * sizeOfUint8,
+	DstElemSize4:   4 * sizeOfUint8,
 	XMM3:           fxXMM3,
 	XMM4:           fxXMM4,
 	XMM5:           fxXMM5_65536,
+	XMM6:           opOverXMM6,
 	XMM8:           opOverXMM8,
 	XMM9:           opOverXMM9,
 	XMM10:          opOverXMM10,
+	LoadArgs:       twoArgLoadArgs,
 	Setup:          fxSetup,
 	LoadXMMRegs:    fxLoadXMMRegs65536 + "\n" + opOverLoadXMMRegs,
-	Cleanup:        fxCleanup,
 	Add:            fxAdd,
 	ClampAndScale:  fxClampAndScale65536,
 	ConvertToInt32: fxConvertToInt32,
@@ -106,35 +118,66 @@ var instances = []struct {
 	LongName:       "fixedAccumulateOpSrc",
 	ShortName:      "fxAccOpSrc",
 	FrameSize:      fxFrameSize,
-	SrcType:        fxSrcType,
+	ArgsSize:       twoArgArgsSize,
+	Args:           "dst []uint8, src []uint32",
+	DstElemSize1:   1 * sizeOfUint8,
+	DstElemSize4:   4 * sizeOfUint8,
 	XMM3:           fxXMM3,
 	XMM4:           fxXMM4,
 	XMM5:           fxXMM5_256,
+	XMM6:           opSrcXMM6,
 	XMM8:           opSrcXMM8,
 	XMM9:           opSrcXMM9,
 	XMM10:          opSrcXMM10,
+	LoadArgs:       twoArgLoadArgs,
 	Setup:          fxSetup,
 	LoadXMMRegs:    fxLoadXMMRegs256 + "\n" + opSrcLoadXMMRegs,
-	Cleanup:        fxCleanup,
 	Add:            fxAdd,
 	ClampAndScale:  fxClampAndScale256,
 	ConvertToInt32: fxConvertToInt32,
 	Store4:         opSrcStore4,
 	Store1:         opSrcStore1,
+}, {
+	LongName:       "fixedAccumulateMask",
+	ShortName:      "fxAccMask",
+	FrameSize:      fxFrameSize,
+	ArgsSize:       oneArgArgsSize,
+	Args:           "buf []uint32",
+	DstElemSize1:   1 * sizeOfUint32,
+	DstElemSize4:   4 * sizeOfUint32,
+	XMM3:           fxXMM3,
+	XMM4:           fxXMM4,
+	XMM5:           fxXMM5_65536,
+	XMM6:           maskXMM6,
+	XMM8:           maskXMM8,
+	XMM9:           maskXMM9,
+	XMM10:          maskXMM10,
+	LoadArgs:       oneArgLoadArgs,
+	Setup:          fxSetup,
+	LoadXMMRegs:    fxLoadXMMRegs65536 + "\n" + maskLoadXMMRegs,
+	Add:            fxAdd,
+	ClampAndScale:  fxClampAndScale65536,
+	ConvertToInt32: fxConvertToInt32,
+	Store4:         maskStore4,
+	Store1:         maskStore1,
 }, {
 	LongName:       "floatingAccumulateOpOver",
 	ShortName:      "flAccOpOver",
 	FrameSize:      flFrameSize,
-	SrcType:        flSrcType,
+	ArgsSize:       twoArgArgsSize,
+	Args:           "dst []uint8, src []float32",
+	DstElemSize1:   1 * sizeOfUint8,
+	DstElemSize4:   4 * sizeOfUint8,
 	XMM3:           flXMM3_65536,
 	XMM4:           flXMM4,
 	XMM5:           flXMM5,
+	XMM6:           opOverXMM6,
 	XMM8:           opOverXMM8,
 	XMM9:           opOverXMM9,
 	XMM10:          opOverXMM10,
+	LoadArgs:       twoArgLoadArgs,
 	Setup:          flSetup,
 	LoadXMMRegs:    flLoadXMMRegs65536 + "\n" + opOverLoadXMMRegs,
-	Cleanup:        flCleanup,
 	Add:            flAdd,
 	ClampAndScale:  flClampAndScale65536,
 	ConvertToInt32: flConvertToInt32,
@@ -144,29 +187,59 @@ var instances = []struct {
 	LongName:       "floatingAccumulateOpSrc",
 	ShortName:      "flAccOpSrc",
 	FrameSize:      flFrameSize,
-	SrcType:        flSrcType,
+	ArgsSize:       twoArgArgsSize,
+	Args:           "dst []uint8, src []float32",
+	DstElemSize1:   1 * sizeOfUint8,
+	DstElemSize4:   4 * sizeOfUint8,
 	XMM3:           flXMM3_256,
 	XMM4:           flXMM4,
 	XMM5:           flXMM5,
+	XMM6:           opSrcXMM6,
 	XMM8:           opSrcXMM8,
 	XMM9:           opSrcXMM9,
 	XMM10:          opSrcXMM10,
+	LoadArgs:       twoArgLoadArgs,
 	Setup:          flSetup,
 	LoadXMMRegs:    flLoadXMMRegs256 + "\n" + opSrcLoadXMMRegs,
-	Cleanup:        flCleanup,
 	Add:            flAdd,
 	ClampAndScale:  flClampAndScale256,
 	ConvertToInt32: flConvertToInt32,
 	Store4:         opSrcStore4,
 	Store1:         opSrcStore1,
+}, {
+	LongName:       "floatingAccumulateMask",
+	ShortName:      "flAccMask",
+	FrameSize:      flFrameSize,
+	ArgsSize:       twoArgArgsSize,
+	Args:           "dst []uint32, src []float32",
+	DstElemSize1:   1 * sizeOfUint32,
+	DstElemSize4:   4 * sizeOfUint32,
+	XMM3:           flXMM3_65536,
+	XMM4:           flXMM4,
+	XMM5:           flXMM5,
+	XMM6:           maskXMM6,
+	XMM8:           maskXMM8,
+	XMM9:           maskXMM9,
+	XMM10:          maskXMM10,
+	LoadArgs:       twoArgLoadArgs,
+	Setup:          flSetup,
+	LoadXMMRegs:    flLoadXMMRegs65536 + "\n" + maskLoadXMMRegs,
+	Add:            flAdd,
+	ClampAndScale:  flClampAndScale65536,
+	ConvertToInt32: flConvertToInt32,
+	Store4:         maskStore4,
+	Store1:         maskStore1,
 }}
 
 const (
 	fxFrameSize = `0`
 	flFrameSize = `8`
 
-	fxSrcType = `[]uint32`
-	flSrcType = `[]float32`
+	oneArgArgsSize = `24`
+	twoArgArgsSize = `48`
+
+	sizeOfUint8  = 1
+	sizeOfUint32 = 4
 
 	fxXMM3     = `-`
 	flXMM3_256 = `flAlmost256`
@@ -179,19 +252,32 @@ const (
 	fxXMM5_65536 = `fxAlmost65536`
 	flXMM5       = `flSignMask`
 
+	oneArgLoadArgs = `
+	MOVQ buf_base+0(FP), DI
+	MOVQ buf_len+8(FP), BX
+	MOVQ buf_base+0(FP), SI
+	MOVQ buf_len+8(FP), R10
+	`
+	twoArgLoadArgs = `
+	MOVQ dst_base+0(FP), DI
+	MOVQ dst_len+8(FP), BX
+	MOVQ src_base+24(FP), SI
+	MOVQ src_len+32(FP), R10
+	// Sanity check that len(dst) >= len(src).
+	CMPQ BX, R10
+	JLT  {{.ShortName}}End
+	`
+
 	fxSetup = ``
 	flSetup = `
-	// Set MXCSR bits 13 and 14, so that the CVTPS2PL below is "Round To Zero".
+	// Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
+	// "Round To Zero".
 	STMXCSR mxcsrOrig-8(SP)
 	MOVL mxcsrOrig-8(SP), AX
 	ORL $0x6000, AX
 	MOVL AX, mxcsrNew-4(SP)
-	LDMXCSR mxcsrNew-4(SP)
 	`
 
-	fxCleanup = `// No-op.`
-	flCleanup = `LDMXCSR mxcsrOrig-8(SP)`
-
 	fxLoadXMMRegs256 = `
 	// fxAlmost256 := XMM(0x000000ff repeated four times) // Maximum of an uint8.
 	MOVOU fxAlmost256<>(SB), X5
@@ -271,8 +357,16 @@ const (
 	MULPS X3, X2
 	`
 
-	fxConvertToInt32 = `// No-op.`
-	flConvertToInt32 = `CVTPS2PL X2, X2`
+	fxConvertToInt32 = `
+	// z = convertToInt32(y)
+	// No-op.
+	`
+	flConvertToInt32 = `
+	// z = convertToInt32(y)
+	LDMXCSR mxcsrNew-4(SP)
+	CVTPS2PL X2, X2
+	LDMXCSR mxcsrOrig-8(SP)
+	`
 
 	opOverStore4 = `
 	// Blend over the dst's prior value. SIMD for i in 0..3:
@@ -324,6 +418,10 @@ const (
 	PSHUFB X6, X2
 	MOVL X2, (DI)
 	`
+	maskStore4 = `
+	// copy(dst[:4], z)
+	MOVOU X2, (DI)
+	`
 
 	opOverStore1 = `
 	// Blend over the dst's prior value.
@@ -350,23 +448,40 @@ const (
 	MOVL X2, BX
 	MOVB BX, (DI)
 	`
+	maskStore1 = `
+	// dst[0] = uint32(z)
+	MOVL X2, (DI)
+	`
+
+	opOverXMM6 = `gather`
+	opSrcXMM6  = `gather`
+	maskXMM6   = `-`
 
 	opOverXMM8 = `scatterAndMulBy0x101`
 	opSrcXMM8  = `-`
+	maskXMM8   = `-`
 
 	opOverXMM9 = `fxAlmost65536`
 	opSrcXMM9  = `-`
+	maskXMM9   = `-`
 
 	opOverXMM10 = `inverseFFFF`
 	opSrcXMM10  = `-`
+	maskXMM10   = `-`
 
 	opOverLoadXMMRegs = `
+	// gather               := XMM(see above)                      // PSHUFB shuffle mask.
 	// scatterAndMulBy0x101 := XMM(see above)                      // PSHUFB shuffle mask.
 	// fxAlmost65536        := XMM(0x0000ffff repeated four times) // 0xffff.
 	// inverseFFFF          := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
+	MOVOU gather<>(SB), X6
 	MOVOU scatterAndMulBy0x101<>(SB), X8
 	MOVOU fxAlmost65536<>(SB), X9
 	MOVOU inverseFFFF<>(SB), X10
 	`
-	opSrcLoadXMMRegs = ``
+	opSrcLoadXMMRegs = `
+	// gather := XMM(see above) // PSHUFB shuffle mask.
+	MOVOU gather<>(SB), X6
+	`
+	maskLoadXMMRegs = ``
 )
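The new maskStore4/maskStore1 snippets above also explain why the mask
instances leave xmm6 (gather) unused: the op variants must pack the low byte
of each 32-bit lane into contiguous bytes of dst []uint8, whereas the mask
variants store every lane whole into dst []uint32. A scalar sketch of that
difference:

	// opSrcStore4 (PSHUFB gather + MOVL) vs maskStore4 (plain MOVOU).
	package main

	import "fmt"

	func main() {
		z := [4]uint32{0x12, 0x34, 0x56, 0x78} // four int32 lanes after conversion

		// opSrcStore4: keep only the low byte of each lane, 4 bytes total.
		var dst8 [4]uint8
		for i, lane := range z {
			dst8[i] = uint8(lane)
		}

		// maskStore4: store all four 32-bit lanes unchanged, 16 bytes total.
		var dst32 [4]uint32
		copy(dst32[:], z[:])

		fmt.Println(dst8, dst32) // [18 52 86 120] [18 52 86 120]
	}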
diff --git a/vector/gen_acc_amd64.s.tmpl b/vector/gen_acc_amd64.s.tmpl
index 71e4769..615d7a0 100644
--- a/vector/gen_acc_amd64.s.tmpl
+++ b/vector/gen_acc_amd64.s.tmpl
@@ -68,7 +68,7 @@ TEXT ·haveSSE4_1(SB), NOSPLIT, $0
 
 // ----------------------------------------------------------------------------
 
-// func {{.LongName}}SIMD(dst []uint8, src {{.SrcType}})
+// func {{.LongName}}SIMD({{.Args}})
 //
 // XMM registers. Variable names are per
 // https://github.com/google/font-rs/blob/master/src/accumulate.c
@@ -79,20 +79,13 @@ TEXT ·haveSSE4_1(SB), NOSPLIT, $0
 // xmm3	{{.XMM3}}
 // xmm4	{{.XMM4}}
 // xmm5	{{.XMM5}}
-// xmm6	gather
+// xmm6	{{.XMM6}}
 // xmm7	offset
 // xmm8	{{.XMM8}}
 // xmm9	{{.XMM9}}
 // xmm10	{{.XMM10}}
-TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48
-	MOVQ dst_base+0(FP), DI
-	MOVQ dst_len+8(FP), BX
-	MOVQ src_base+24(FP), SI
-	MOVQ src_len+32(FP), R10
-
-	// Sanity check that len(dst) >= len(src).
-	CMPQ BX, R10
-	JLT  {{.ShortName}}End
+TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-{{.ArgsSize}}
+	{{.LoadArgs}}
 
 	// R10 = len(src) &^ 3
 	// R11 = len(src)
@@ -103,9 +96,7 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48
 
 	{{.LoadXMMRegs}}
 
-	// gather := XMM(see above) // PSHUFB shuffle mask.
 	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
-	MOVOU gather<>(SB), X6
 	XORPS X7, X7
 
 	// i := 0
@@ -139,7 +130,6 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48
 
 	{{.ClampAndScale}}
 
-	// z = convertToInt32(y)
 	{{.ConvertToInt32}}
 
 	{{.Store4}}
@@ -152,14 +142,14 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48
 	// dst = dst[4:]
 	// src = src[4:]
 	ADDQ $4, R9
-	ADDQ $4, DI
+	ADDQ ${{.DstElemSize4}}, DI
 	ADDQ $16, SI
 	JMP  {{.ShortName}}Loop4
 
 {{.ShortName}}Loop1:
 	// for i < len(src)
 	CMPQ R9, R11
-	JAE  {{.ShortName}}Cleanup
+	JAE  {{.ShortName}}End
 
 	// x = src[i] + offset
 	MOVL (SI), X1
@@ -167,7 +157,6 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48
 
 	{{.ClampAndScale}}
 
-	// z = convertToInt32(y)
 	{{.ConvertToInt32}}
 
 	{{.Store1}}
@@ -179,12 +168,9 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48
 	// dst = dst[1:]
 	// src = src[1:]
 	ADDQ $1, R9
-	ADDQ $1, DI
+	ADDQ ${{.DstElemSize1}}, DI
 	ADDQ $4, SI
 	JMP  {{.ShortName}}Loop1
 
-{{.ShortName}}Cleanup:
-	{{.Cleanup}}
-
 {{.ShortName}}End:
 	RET
diff --git a/vector/raster_floating.go b/vector/raster_floating.go
index 2ed426a..119845a 100644
--- a/vector/raster_floating.go
+++ b/vector/raster_floating.go
@@ -190,6 +190,11 @@ func floatingAccumulateOpSrc(dst []uint8, src []float32) {
 }
 
 func floatingAccumulateMask(dst []uint32, src []float32) {
+	// Sanity check that len(dst) >= len(src).
+	if len(dst) < len(src) {
+		return
+	}
+
 	acc := float32(0)
 	for i, v := range src {
 		acc += v
diff --git a/vector/vector.go b/vector/vector.go
index ee7300d..9caccf8 100644
--- a/vector/vector.go
+++ b/vector/vector.go
@@ -310,9 +310,17 @@ func (z *Rasterizer) accumulateMask() {
 		} else {
 			z.bufU32 = z.bufU32[:n]
 		}
-		floatingAccumulateMask(z.bufU32, z.bufF32)
+		if haveFloatingAccumulateSIMD {
+			floatingAccumulateMaskSIMD(z.bufU32, z.bufF32)
+		} else {
+			floatingAccumulateMask(z.bufU32, z.bufF32)
+		}
 	} else {
-		fixedAccumulateMask(z.bufU32)
+		if haveFixedAccumulateSIMD {
+			fixedAccumulateMaskSIMD(z.bufU32)
+		} else {
+			fixedAccumulateMask(z.bufU32)
+		}
 	}
 }
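Putting the pieces together: accumulateMask above gates each call on a
have* flag, acc_other.go pins those flags to false on non-amd64 platforms,
and the amd64 build derives them from the haveSSE4_1 check visible in the
assembly context. The wiring itself is not part of this diff, so the sketch
below stubs the detector; it shows only the dispatch shape, not the
upstream declarations.

	// Sketch of the capability-gated dispatch, with a stubbed detector.
	package main

	import "fmt"

	// Stub for the real CPUID-based check in acc_amd64.s (assumption: the
	// have* flags are initialized from it at package init on amd64).
	func haveSSE4_1() bool { return true }

	var haveFixedAccumulateSIMD = haveSSE4_1()

	func fixedAccumulateMask(buf []uint32)     { /* portable scalar path */ }
	func fixedAccumulateMaskSIMD(buf []uint32) { /* SSE4.1 path */ }

	func accumulateMask(buf []uint32) {
		if haveFixedAccumulateSIMD {
			fixedAccumulateMaskSIMD(buf) // roughly 3x faster per the tables above
		} else {
			fixedAccumulateMask(buf)
		}
	}

	func main() {
		buf := make([]uint32, 8)
		accumulateMask(buf)
		fmt.Println("dispatched, SIMD =", haveFixedAccumulateSIMD)
	}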