vector: add SIMD versions of xxxAccumulateMask.

name                              old time/op  new time/op  delta
GlyphAlphaLoose16Over-8           3.96µs ± 0%  3.64µs ± 1%   -8.08%   (p=0.000 n=8+10)
GlyphAlphaLoose16Src-8            3.64µs ± 0%  3.35µs ± 0%   -7.88%    (p=0.000 n=8+9)
GlyphAlphaLoose32Over-8           8.45µs ± 0%  6.74µs ± 0%  -20.22%    (p=0.000 n=8+9)
GlyphAlphaLoose32Src-8            7.24µs ± 0%  5.54µs ± 1%  -23.48%   (p=0.000 n=8+10)
GlyphAlphaLoose64Over-8           22.2µs ± 0%  17.4µs ± 0%  -21.67%    (p=0.000 n=9+9)
GlyphAlphaLoose64Src-8            17.6µs ± 1%  12.2µs ± 1%  -30.32%  (p=0.000 n=10+10)
GlyphAlphaLoose128Over-8          67.9µs ± 0%  53.3µs ± 1%  -21.53%  (p=0.000 n=10+10)
GlyphAlphaLoose128Src-8           48.2µs ± 0%  32.6µs ± 2%  -32.41%   (p=0.000 n=9+10)
GlyphAlphaLoose256Over-8           242µs ± 1%   187µs ± 1%  -22.96%    (p=0.000 n=9+9)
GlyphAlphaLoose256Src-8            163µs ± 0%   105µs ± 1%  -35.83%    (p=0.000 n=9+9)
GlyphRGBA16Over-8                 5.25µs ± 1%  4.95µs ± 0%   -5.78%    (p=0.000 n=9+9)
GlyphRGBA16Src-8                  4.72µs ± 0%  4.43µs ± 1%   -6.22%   (p=0.000 n=9+10)
GlyphRGBA32Over-8                 13.5µs ± 0%  11.9µs ± 1%  -12.19%   (p=0.000 n=9+10)
GlyphRGBA32Src-8                  11.5µs ± 1%   9.8µs ± 0%  -14.72%    (p=0.000 n=9+9)
GlyphRGBA64Over-8                 42.0µs ± 2%  36.9µs ± 1%  -12.19%  (p=0.000 n=10+10)
GlyphRGBA64Src-8                  34.1µs ± 1%  28.5µs ± 0%  -16.25%    (p=0.000 n=9+7)
GlyphRGBA128Over-8                 149µs ± 2%   133µs ± 1%  -10.24%   (p=0.000 n=10+9)
GlyphRGBA128Src-8                  115µs ± 1%    99µs ± 1%  -13.57%   (p=0.000 n=9+10)
GlyphRGBA256Over-8                 566µs ± 0%   511µs ± 1%   -9.85%   (p=0.000 n=9+10)
GlyphRGBA256Src-8                  435µs ± 0%   372µs ± 0%  -14.64%    (p=0.000 n=9+8)
GlyphNRGBA16Over-8                26.9µs ± 3%  26.0µs ± 3%   -3.55%   (p=0.000 n=10+9)
GlyphNRGBA16Src-8                 18.8µs ± 2%  18.4µs ± 2%   -2.21%   (p=0.000 n=9+10)
GlyphNRGBA32Over-8                99.1µs ± 2%  95.9µs ± 3%   -3.23%  (p=0.000 n=10+10)
GlyphNRGBA32Src-8                 65.6µs ± 3%  62.8µs ± 2%   -4.36%  (p=0.000 n=10+10)
GlyphNRGBA64Over-8                 376µs ± 4%   370µs ± 2%     ~     (p=0.063 n=10+10)
GlyphNRGBA64Src-8                  238µs ± 3%   233µs ± 1%   -2.21%   (p=0.000 n=9+10)
GlyphNRGBA128Over-8               1.52ms ± 2%  1.48ms ± 0%   -2.11%   (p=0.000 n=10+8)
GlyphNRGBA128Src-8                 951µs ± 3%   935µs ± 1%   -1.69%   (p=0.013 n=10+9)
GlyphNRGBA256Over-8               6.00ms ± 1%  5.87ms ± 3%   -2.12%  (p=0.002 n=10+10)
GlyphNRGBA256Src-8                3.94ms ± 2%  3.80ms ± 2%   -3.64%  (p=0.000 n=10+10)

A direct comparison of the non-SIMD and SIMD versions of the new accumulate-mask routines:

name                              time/op
FixedAccumulateMask16-8            237ns ± 0%
FixedAccumulateMaskSIMD16-8       80.0ns ± 1%
FloatingAccumulateMask16-8         413ns ± 2%
FloatingAccumulateMaskSIMD16-8     166ns ± 0%
FixedAccumulateMask64-8           3.42µs ± 0%
FixedAccumulateMaskSIMD64-8       1.09µs ± 0%
FloatingAccumulateMask64-8        6.92µs ± 0%
FloatingAccumulateMaskSIMD64-8    2.47µs ± 1%
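
For reference, the mask accumulators turn the rasterizer's per-pixel deltas into a
cumulative 16-bit alpha mask, in place for the fixed-point path and into a separate
dst for the floating-point path. Below is a rough scalar sketch of what they compute,
reconstructed from the comments in the assembly in this change (ϕ = 10, so the
fixed-point shift is 2*ϕ - 16 = 4); it is an illustration, not a copy of the
package's non-SIMD code:

func fixedAccumulateMaskSketch(buf []uint32) {
	acc := int32(0)
	for i, v := range buf {
		acc += int32(v) // running sum of the two's-complement deltas
		a := acc
		if a < 0 {
			a = -a
		}
		a >>= 4 // shift by 2*ϕ - 16, with ϕ = 10
		if a > 0xffff {
			a = 0xffff
		}
		buf[i] = uint32(a)
	}
}

func floatingAccumulateMaskSketch(dst []uint32, src []float32) {
	const almost65536 = 255.99998 * 256
	acc := float32(0)
	for i, v := range src {
		acc += v
		a := acc
		if a < 0 {
			a = -a
		}
		if a > 1 {
			a = 1
		}
		dst[i] = uint32(almost65536 * a) // truncates, like the round-to-zero CVTPS2PL
	}
}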

Change-Id: Ib6980e5975ed2842ff2a372f76dd5f2e95c5526c
Reviewed-on: https://go-review.googlesource.com/30898
Reviewed-by: David Crawshaw <crawshaw@golang.org>
Nigel Tao 2016-10-13 14:50:52 +11:00
parent beb9675609
commit e78c45720c
8 changed files with 507 additions and 74 deletions

View File

@ -20,8 +20,14 @@ func fixedAccumulateOpOverSIMD(dst []uint8, src []uint32)
//go:noescape
func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32)
//go:noescape
func fixedAccumulateMaskSIMD(buf []uint32)
//go:noescape
func floatingAccumulateOpOverSIMD(dst []uint8, src []float32)
//go:noescape
func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32)
//go:noescape
func floatingAccumulateMaskSIMD(dst []uint32, src []float32)

View File

@ -83,6 +83,7 @@ TEXT ·haveSSE4_1(SB), NOSPLIT, $0
// xmm9 fxAlmost65536
// xmm10 inverseFFFF
TEXT ·fixedAccumulateOpOverSIMD(SB), NOSPLIT, $0-48
MOVQ dst_base+0(FP), DI
MOVQ dst_len+8(FP), BX
MOVQ src_base+24(FP), SI
@ -100,16 +101,16 @@ TEXT ·fixedAccumulateOpOverSIMD(SB), NOSPLIT, $0-48
// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16.
MOVOU fxAlmost65536<>(SB), X5
// gather := XMM(see above) // PSHUFB shuffle mask.
// scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask.
// fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff.
// inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
MOVOU gather<>(SB), X6
MOVOU scatterAndMulBy0x101<>(SB), X8
MOVOU fxAlmost65536<>(SB), X9
MOVOU inverseFFFF<>(SB), X10
// gather := XMM(see above) // PSHUFB shuffle mask.
// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
MOVOU gather<>(SB), X6
XORPS X7, X7
// i := 0
@ -222,7 +223,7 @@ fxAccOpOverLoop4:
fxAccOpOverLoop1:
// for i < len(src)
CMPQ R9, R11
JAE fxAccOpOverCleanup
JAE fxAccOpOverEnd
// x = src[i] + offset
MOVL (SI), X1
@ -276,9 +277,6 @@ fxAccOpOverLoop1:
ADDQ $4, SI
JMP fxAccOpOverLoop1
fxAccOpOverCleanup:
// No-op.
fxAccOpOverEnd:
RET
@ -301,6 +299,7 @@ fxAccOpOverEnd:
// xmm9 -
// xmm10 -
TEXT ·fixedAccumulateOpSrcSIMD(SB), NOSPLIT, $0-48
MOVQ dst_base+0(FP), DI
MOVQ dst_len+8(FP), BX
MOVQ src_base+24(FP), SI
@ -318,9 +317,10 @@ TEXT ·fixedAccumulateOpSrcSIMD(SB), NOSPLIT, $0-48
// fxAlmost256 := XMM(0x000000ff repeated four times) // Maximum of an uint8.
MOVOU fxAlmost256<>(SB), X5
// gather := XMM(see above) // PSHUFB shuffle mask.
// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
// gather := XMM(see above) // PSHUFB shuffle mask.
MOVOU gather<>(SB), X6
// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
XORPS X7, X7
// i := 0
@ -390,7 +390,7 @@ fxAccOpSrcLoop4:
fxAccOpSrcLoop1:
// for i < len(src)
CMPQ R9, R11
JAE fxAccOpSrcCleanup
JAE fxAccOpSrcEnd
// x = src[i] + offset
MOVL (SI), X1
@ -429,10 +429,149 @@ fxAccOpSrcLoop1:
ADDQ $4, SI
JMP fxAccOpSrcLoop1
fxAccOpSrcCleanup:
fxAccOpSrcEnd:
RET
// ----------------------------------------------------------------------------
// func fixedAccumulateMaskSIMD(buf []uint32)
//
// XMM registers. Variable names are per
// https://github.com/google/font-rs/blob/master/src/accumulate.c
//
// xmm0 scratch
// xmm1 x
// xmm2 y, z
// xmm3 -
// xmm4 -
// xmm5 fxAlmost65536
// xmm6 -
// xmm7 offset
// xmm8 -
// xmm9 -
// xmm10 -
TEXT ·fixedAccumulateMaskSIMD(SB), NOSPLIT, $0-24
MOVQ buf_base+0(FP), DI
MOVQ buf_len+8(FP), BX
MOVQ buf_base+0(FP), SI
MOVQ buf_len+8(FP), R10
// R10 = len(src) &^ 3
// R11 = len(src)
MOVQ R10, R11
ANDQ $-4, R10
// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16.
MOVOU fxAlmost65536<>(SB), X5
// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
XORPS X7, X7
// i := 0
MOVQ $0, R9
fxAccMaskLoop4:
// for i < (len(src) &^ 3)
CMPQ R9, R10
JAE fxAccMaskLoop1
// x = XMM(s0, s1, s2, s3)
//
// Where s0 is src[i+0], s1 is src[i+1], etc.
MOVOU (SI), X1
// scratch = XMM(0, s0, s1, s2)
// x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
MOVOU X1, X0
PSLLO $4, X0
PADDD X0, X1
// scratch = XMM(0, 0, 0, 0)
// scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
// x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
XORPS X0, X0
SHUFPS $0x40, X1, X0
PADDD X0, X1
// x += offset
PADDD X7, X1
// y = abs(x)
// y >>= 4 // Shift by 2*ϕ - 16.
// y = min(y, fxAlmost65536)
//
// pabsd %xmm1,%xmm2
// psrld $0x4,%xmm2
// pminud %xmm5,%xmm2
//
// Hopefully we'll get these opcode mnemonics into the assembler for Go
// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
// it's similar.
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x04
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
// z = convertToInt32(y)
// No-op.
// copy(dst[:4], z)
MOVOU X2, (DI)
// offset = XMM(x@3, x@3, x@3, x@3)
MOVOU X1, X7
SHUFPS $0xff, X1, X7
// i += 4
// dst = dst[4:]
// src = src[4:]
ADDQ $4, R9
ADDQ $16, DI
ADDQ $16, SI
JMP fxAccMaskLoop4
fxAccMaskLoop1:
// for i < len(src)
CMPQ R9, R11
JAE fxAccMaskEnd
// x = src[i] + offset
MOVL (SI), X1
PADDD X7, X1
// y = abs(x)
// y >>= 4 // Shift by 2*ϕ - 16.
// y = min(y, fxAlmost65536)
//
// pabsd %xmm1,%xmm2
// psrld $0x4,%xmm2
// pminud %xmm5,%xmm2
//
// Hopefully we'll get these opcode mnemonics into the assembler for Go
// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
// it's similar.
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x04
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
// z = convertToInt32(y)
// No-op.
// dst[0] = uint32(z)
MOVL X2, (DI)
// offset = x
MOVOU X1, X7
// i += 1
// dst = dst[1:]
// src = src[1:]
ADDQ $1, R9
ADDQ $4, DI
ADDQ $4, SI
JMP fxAccMaskLoop1
fxAccMaskEnd:
RET
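The PSLLO+PADDD and SHUFPS+PADDD pair in the loop above is a four-lane inclusive
prefix sum done entirely inside one XMM register, with the running total carried
between iterations in the offset register. A purely illustrative Go rendering of
the same data movement on a [4]int32, following the lane comments above:

// prefixSum4 mirrors the SIMD steps: x += (x shifted up one lane), then
// x += (0, 0, s0, s0+s1), then x += offset; the last lane becomes the
// offset for the next group of four.
func prefixSum4(s [4]int32, offset int32) (sums [4]int32, nextOffset int32) {
	x := s
	// scratch = (0, s0, s1, s2); x += scratch
	x = [4]int32{x[0], x[0] + x[1], x[1] + x[2], x[2] + x[3]}
	// scratch = (0, 0, s0, s0+s1); x += scratch
	x = [4]int32{x[0], x[1], x[2] + s[0], x[3] + s[0] + s[1]}
	// x += offset
	for i := range x {
		x[i] += offset
	}
	return x, x[3]
}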
// ----------------------------------------------------------------------------
@ -454,6 +593,7 @@ fxAccOpSrcEnd:
// xmm9 fxAlmost65536
// xmm10 inverseFFFF
TEXT ·floatingAccumulateOpOverSIMD(SB), NOSPLIT, $8-48
MOVQ dst_base+0(FP), DI
MOVQ dst_len+8(FP), BX
MOVQ src_base+24(FP), SI
@ -468,12 +608,12 @@ TEXT ·floatingAccumulateOpOverSIMD(SB), NOSPLIT, $8-48
MOVQ R10, R11
ANDQ $-4, R10
// Set MXCSR bits 13 and 14, so that the CVTPS2PL below is "Round To Zero".
// Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
// "Round To Zero".
STMXCSR mxcsrOrig-8(SP)
MOVL mxcsrOrig-8(SP), AX
ORL $0x6000, AX
MOVL AX, mxcsrNew-4(SP)
LDMXCSR mxcsrNew-4(SP)
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
@ -482,16 +622,16 @@ TEXT ·floatingAccumulateOpOverSIMD(SB), NOSPLIT, $8-48
MOVOU flOne<>(SB), X4
MOVOU flSignMask<>(SB), X5
// gather := XMM(see above) // PSHUFB shuffle mask.
// scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask.
// fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff.
// inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
MOVOU gather<>(SB), X6
MOVOU scatterAndMulBy0x101<>(SB), X8
MOVOU fxAlmost65536<>(SB), X9
MOVOU inverseFFFF<>(SB), X10
// gather := XMM(see above) // PSHUFB shuffle mask.
// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
MOVOU gather<>(SB), X6
XORPS X7, X7
// i := 0
@ -532,7 +672,9 @@ flAccOpOverLoop4:
MULPS X3, X2
// z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP)
CVTPS2PL X2, X2
LDMXCSR mxcsrOrig-8(SP)
// Blend over the dst's prior value. SIMD for i in 0..3:
//
@ -597,7 +739,7 @@ flAccOpOverLoop4:
flAccOpOverLoop1:
// for i < len(src)
CMPQ R9, R11
JAE flAccOpOverCleanup
JAE flAccOpOverEnd
// x = src[i] + offset
MOVL (SI), X1
@ -612,7 +754,9 @@ flAccOpOverLoop1:
MULPS X3, X2
// z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP)
CVTPS2PL X2, X2
LDMXCSR mxcsrOrig-8(SP)
// Blend over the dst's prior value.
//
@ -644,9 +788,6 @@ flAccOpOverLoop1:
ADDQ $4, SI
JMP flAccOpOverLoop1
flAccOpOverCleanup:
LDMXCSR mxcsrOrig-8(SP)
flAccOpOverEnd:
RET
@ -669,6 +810,7 @@ flAccOpOverEnd:
// xmm9 -
// xmm10 -
TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48
MOVQ dst_base+0(FP), DI
MOVQ dst_len+8(FP), BX
MOVQ src_base+24(FP), SI
@ -683,12 +825,12 @@ TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48
MOVQ R10, R11
ANDQ $-4, R10
// Set MXCSR bits 13 and 14, so that the CVTPS2PL below is "Round To Zero".
// Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
// "Round To Zero".
STMXCSR mxcsrOrig-8(SP)
MOVL mxcsrOrig-8(SP), AX
ORL $0x6000, AX
MOVL AX, mxcsrNew-4(SP)
LDMXCSR mxcsrNew-4(SP)
// flAlmost256 := XMM(0x437fffff repeated four times) // 255.99998 as a float32.
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
@ -697,9 +839,10 @@ TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48
MOVOU flOne<>(SB), X4
MOVOU flSignMask<>(SB), X5
// gather := XMM(see above) // PSHUFB shuffle mask.
// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
// gather := XMM(see above) // PSHUFB shuffle mask.
MOVOU gather<>(SB), X6
// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
XORPS X7, X7
// i := 0
@ -740,7 +883,9 @@ flAccOpSrcLoop4:
MULPS X3, X2
// z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP)
CVTPS2PL X2, X2
LDMXCSR mxcsrOrig-8(SP)
// z = shuffleTheLowBytesOfEach4ByteElement(z)
// copy(dst[:4], low4BytesOf(z))
@ -762,7 +907,7 @@ flAccOpSrcLoop4:
flAccOpSrcLoop1:
// for i < len(src)
CMPQ R9, R11
JAE flAccOpSrcCleanup
JAE flAccOpSrcEnd
// x = src[i] + offset
MOVL (SI), X1
@ -777,7 +922,9 @@ flAccOpSrcLoop1:
MULPS X3, X2
// z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP)
CVTPS2PL X2, X2
LDMXCSR mxcsrOrig-8(SP)
// dst[0] = uint8(z)
MOVL X2, BX
@ -794,8 +941,152 @@ flAccOpSrcLoop1:
ADDQ $4, SI
JMP flAccOpSrcLoop1
flAccOpSrcCleanup:
LDMXCSR mxcsrOrig-8(SP)
flAccOpSrcEnd:
RET
// ----------------------------------------------------------------------------
// func floatingAccumulateMaskSIMD(dst []uint32, src []float32)
//
// XMM registers. Variable names are per
// https://github.com/google/font-rs/blob/master/src/accumulate.c
//
// xmm0 scratch
// xmm1 x
// xmm2 y, z
// xmm3 flAlmost65536
// xmm4 flOne
// xmm5 flSignMask
// xmm6 -
// xmm7 offset
// xmm8 -
// xmm9 -
// xmm10 -
TEXT ·floatingAccumulateMaskSIMD(SB), NOSPLIT, $8-48
MOVQ dst_base+0(FP), DI
MOVQ dst_len+8(FP), BX
MOVQ src_base+24(FP), SI
MOVQ src_len+32(FP), R10
// Sanity check that len(dst) >= len(src).
CMPQ BX, R10
JLT flAccMaskEnd
// R10 = len(src) &^ 3
// R11 = len(src)
MOVQ R10, R11
ANDQ $-4, R10
// Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
// "Round To Zero".
STMXCSR mxcsrOrig-8(SP)
MOVL mxcsrOrig-8(SP), AX
ORL $0x6000, AX
MOVL AX, mxcsrNew-4(SP)
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
MOVOU flAlmost65536<>(SB), X3
MOVOU flOne<>(SB), X4
MOVOU flSignMask<>(SB), X5
// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
XORPS X7, X7
// i := 0
MOVQ $0, R9
flAccMaskLoop4:
// for i < (len(src) &^ 3)
CMPQ R9, R10
JAE flAccMaskLoop1
// x = XMM(s0, s1, s2, s3)
//
// Where s0 is src[i+0], s1 is src[i+1], etc.
MOVOU (SI), X1
// scratch = XMM(0, s0, s1, s2)
// x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
MOVOU X1, X0
PSLLO $4, X0
ADDPS X0, X1
// scratch = XMM(0, 0, 0, 0)
// scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
// x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
XORPS X0, X0
SHUFPS $0x40, X1, X0
ADDPS X0, X1
// x += offset
ADDPS X7, X1
// y = x & flSignMask
// y = min(y, flOne)
// y = mul(y, flAlmost65536)
MOVOU X5, X2
ANDPS X1, X2
MINPS X4, X2
MULPS X3, X2
// z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP)
CVTPS2PL X2, X2
LDMXCSR mxcsrOrig-8(SP)
// copy(dst[:4], z)
MOVOU X2, (DI)
// offset = XMM(x@3, x@3, x@3, x@3)
MOVOU X1, X7
SHUFPS $0xff, X1, X7
// i += 4
// dst = dst[4:]
// src = src[4:]
ADDQ $4, R9
ADDQ $16, DI
ADDQ $16, SI
JMP flAccMaskLoop4
flAccMaskLoop1:
// for i < len(src)
CMPQ R9, R11
JAE flAccMaskEnd
// x = src[i] + offset
MOVL (SI), X1
ADDPS X7, X1
// y = x & flSignMask
// y = min(y, flOne)
// y = mul(y, flAlmost65536)
MOVOU X5, X2
ANDPS X1, X2
MINPS X4, X2
MULPS X3, X2
// z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP)
CVTPS2PL X2, X2
LDMXCSR mxcsrOrig-8(SP)
// dst[0] = uint32(z)
MOVL X2, (DI)
// offset = x
MOVOU X1, X7
// i += 1
// dst = dst[1:]
// src = src[1:]
ADDQ $1, R9
ADDQ $4, DI
ADDQ $4, SI
JMP flAccMaskLoop1
flAccMaskEnd:
RET
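One detail worth calling out in the floating-point routines above: CVTPS2PL converts
using the current SSE rounding mode, which defaults to round-to-nearest-even, while
Go's float-to-integer conversion truncates toward zero. Loading the modified MXCSR
(bits 13 and 14 set, "Round To Zero") just around each conversion, and restoring the
original afterwards, lines the SIMD result up with scalar conversions like this
minimal sketch:

// truncLikeGo shows the behaviour the round-to-zero CVTPS2PL is matching:
// Go conversions discard the fractional part, so 65535.9 becomes 65535.
func truncLikeGo(y float32) uint32 {
	return uint32(y)
}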

View File

@ -11,5 +11,7 @@ const haveFloatingAccumulateSIMD = false
func fixedAccumulateOpOverSIMD(dst []uint8, src []uint32) {}
func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32) {}
func fixedAccumulateMaskSIMD(buf []uint32) {}
func floatingAccumulateOpOverSIMD(dst []uint8, src []float32) {}
func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32) {}
func floatingAccumulateMaskSIMD(dst []uint32, src []float32) {}

View File

@ -201,7 +201,11 @@ func testAcc(t *testing.T, in interface{}, mask []uint32, op string) {
}
case "mask":
copy(got32, in[:n])
fixedAccumulateMask(got32)
if simd {
fixedAccumulateMaskSIMD(got32)
} else {
fixedAccumulateMask(got32)
}
}
case []float32:
switch op {
@ -218,7 +222,11 @@ func testAcc(t *testing.T, in interface{}, mask []uint32, op string) {
floatingAccumulateOpSrc(got8, in[:n])
}
case "mask":
floatingAccumulateMask(got32, in[:n])
if simd {
floatingAccumulateMaskSIMD(got32, in[:n])
} else {
floatingAccumulateMask(got32, in[:n])
}
}
}
@ -264,22 +272,26 @@ func BenchmarkFixedAccumulateOpOverSIMD16(b *testing.B) { benchAcc(b, fxIn16,
func BenchmarkFixedAccumulateOpSrc16(b *testing.B) { benchAcc(b, fxIn16, "src", false) }
func BenchmarkFixedAccumulateOpSrcSIMD16(b *testing.B) { benchAcc(b, fxIn16, "src", true) }
func BenchmarkFixedAccumulateMask16(b *testing.B) { benchAcc(b, fxIn16, "mask", false) }
func BenchmarkFixedAccumulateMaskSIMD16(b *testing.B) { benchAcc(b, fxIn16, "mask", true) }
func BenchmarkFloatingAccumulateOpOver16(b *testing.B) { benchAcc(b, flIn16, "over", false) }
func BenchmarkFloatingAccumulateOpOverSIMD16(b *testing.B) { benchAcc(b, flIn16, "over", true) }
func BenchmarkFloatingAccumulateOpSrc16(b *testing.B) { benchAcc(b, flIn16, "src", false) }
func BenchmarkFloatingAccumulateOpSrcSIMD16(b *testing.B) { benchAcc(b, flIn16, "src", true) }
func BenchmarkFloatingAccumulateMask16(b *testing.B) { benchAcc(b, flIn16, "mask", false) }
func BenchmarkFloatingAccumulateMaskSIMD16(b *testing.B) { benchAcc(b, flIn16, "mask", true) }
func BenchmarkFixedAccumulateOpOver64(b *testing.B) { benchAcc(b, fxIn64, "over", false) }
func BenchmarkFixedAccumulateOpOverSIMD64(b *testing.B) { benchAcc(b, fxIn64, "over", true) }
func BenchmarkFixedAccumulateOpSrc64(b *testing.B) { benchAcc(b, fxIn64, "src", false) }
func BenchmarkFixedAccumulateOpSrcSIMD64(b *testing.B) { benchAcc(b, fxIn64, "src", true) }
func BenchmarkFixedAccumulateMask64(b *testing.B) { benchAcc(b, fxIn64, "mask", false) }
func BenchmarkFixedAccumulateMaskSIMD64(b *testing.B) { benchAcc(b, fxIn64, "mask", true) }
func BenchmarkFloatingAccumulateOpOver64(b *testing.B) { benchAcc(b, flIn64, "over", false) }
func BenchmarkFloatingAccumulateOpOverSIMD64(b *testing.B) { benchAcc(b, flIn64, "over", true) }
func BenchmarkFloatingAccumulateOpSrc64(b *testing.B) { benchAcc(b, flIn64, "src", false) }
func BenchmarkFloatingAccumulateOpSrcSIMD64(b *testing.B) { benchAcc(b, flIn64, "src", true) }
func BenchmarkFloatingAccumulateMask64(b *testing.B) { benchAcc(b, flIn64, "mask", false) }
func BenchmarkFloatingAccumulateMaskSIMD64(b *testing.B) { benchAcc(b, flIn64, "mask", true) }
func benchAcc(b *testing.B, in interface{}, op string, simd bool) {
var f func()
@ -308,7 +320,11 @@ func benchAcc(b *testing.B, in interface{}, op string, simd bool) {
case "mask":
buf := make([]uint32, len(in))
copy(buf, in)
f = func() { fixedAccumulateMask(buf) }
if simd {
f = func() { fixedAccumulateMaskSIMD(buf) }
} else {
f = func() { fixedAccumulateMask(buf) }
}
}
case []float32:
@ -333,7 +349,11 @@ func benchAcc(b *testing.B, in interface{}, op string, simd bool) {
}
case "mask":
dst := make([]uint32, len(in))
f = func() { floatingAccumulateMask(dst, in) }
if simd {
f = func() { floatingAccumulateMaskSIMD(dst, in) }
} else {
f = func() { floatingAccumulateMask(dst, in) }
}
}
}

View File

@ -10,6 +10,7 @@ import (
"bytes"
"io/ioutil"
"log"
"strings"
"text/template"
)
@ -54,6 +55,9 @@ func main() {
if i != 0 {
out.WriteString("\n")
}
if strings.Contains(v.LoadArgs, "{{.ShortName}}") {
v.LoadArgs = strings.Replace(v.LoadArgs, "{{.ShortName}}", v.ShortName, -1)
}
if err := t.Execute(out, v); err != nil {
log.Fatalf("Execute(%q): %v", v.ShortName, err)
}
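
The strings.Replace above is needed because text/template substitutes a field's value
verbatim; it does not re-scan that value for further {{...}} actions, and
twoArgLoadArgs (later in this file) embeds {{.ShortName}} in its JLT line. A
self-contained sketch of that behaviour:

package main

import (
	"os"
	"text/template"
)

func main() {
	t := template.Must(template.New("asm").Parse("{{.LoadArgs}}\n"))
	v := struct {
		ShortName string
		LoadArgs  string
	}{
		ShortName: "fxAccMask",
		LoadArgs:  "JLT {{.ShortName}}End",
	}
	// Prints the literal "JLT {{.ShortName}}End": the action inside the
	// data value is not evaluated, hence the manual strings.Replace.
	if err := t.Execute(os.Stdout, v); err != nil {
		panic(err)
	}
}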
@ -68,15 +72,19 @@ var instances = []struct {
LongName string
ShortName string
FrameSize string
SrcType string
ArgsSize string
Args string
DstElemSize1 int
DstElemSize4 int
XMM3 string
XMM4 string
XMM5 string
XMM6 string
XMM8 string
XMM9 string
XMM10 string
LoadArgs string
Setup string
Cleanup string
LoadXMMRegs string
Add string
ClampAndScale string
@ -87,16 +95,20 @@ var instances = []struct {
LongName: "fixedAccumulateOpOver",
ShortName: "fxAccOpOver",
FrameSize: fxFrameSize,
SrcType: fxSrcType,
ArgsSize: twoArgArgsSize,
Args: "dst []uint8, src []uint32",
DstElemSize1: 1 * sizeOfUint8,
DstElemSize4: 4 * sizeOfUint8,
XMM3: fxXMM3,
XMM4: fxXMM4,
XMM5: fxXMM5_65536,
XMM6: opOverXMM6,
XMM8: opOverXMM8,
XMM9: opOverXMM9,
XMM10: opOverXMM10,
LoadArgs: twoArgLoadArgs,
Setup: fxSetup,
LoadXMMRegs: fxLoadXMMRegs65536 + "\n" + opOverLoadXMMRegs,
Cleanup: fxCleanup,
Add: fxAdd,
ClampAndScale: fxClampAndScale65536,
ConvertToInt32: fxConvertToInt32,
@ -106,35 +118,66 @@ var instances = []struct {
LongName: "fixedAccumulateOpSrc",
ShortName: "fxAccOpSrc",
FrameSize: fxFrameSize,
SrcType: fxSrcType,
ArgsSize: twoArgArgsSize,
Args: "dst []uint8, src []uint32",
DstElemSize1: 1 * sizeOfUint8,
DstElemSize4: 4 * sizeOfUint8,
XMM3: fxXMM3,
XMM4: fxXMM4,
XMM5: fxXMM5_256,
XMM6: opSrcXMM6,
XMM8: opSrcXMM8,
XMM9: opSrcXMM9,
XMM10: opSrcXMM10,
LoadArgs: twoArgLoadArgs,
Setup: fxSetup,
LoadXMMRegs: fxLoadXMMRegs256 + "\n" + opSrcLoadXMMRegs,
Cleanup: fxCleanup,
Add: fxAdd,
ClampAndScale: fxClampAndScale256,
ConvertToInt32: fxConvertToInt32,
Store4: opSrcStore4,
Store1: opSrcStore1,
}, {
LongName: "fixedAccumulateMask",
ShortName: "fxAccMask",
FrameSize: fxFrameSize,
ArgsSize: oneArgArgsSize,
Args: "buf []uint32",
DstElemSize1: 1 * sizeOfUint32,
DstElemSize4: 4 * sizeOfUint32,
XMM3: fxXMM3,
XMM4: fxXMM4,
XMM5: fxXMM5_65536,
XMM6: maskXMM6,
XMM8: maskXMM8,
XMM9: maskXMM9,
XMM10: maskXMM10,
LoadArgs: oneArgLoadArgs,
Setup: fxSetup,
LoadXMMRegs: fxLoadXMMRegs65536 + "\n" + maskLoadXMMRegs,
Add: fxAdd,
ClampAndScale: fxClampAndScale65536,
ConvertToInt32: fxConvertToInt32,
Store4: maskStore4,
Store1: maskStore1,
}, {
LongName: "floatingAccumulateOpOver",
ShortName: "flAccOpOver",
FrameSize: flFrameSize,
SrcType: flSrcType,
ArgsSize: twoArgArgsSize,
Args: "dst []uint8, src []float32",
DstElemSize1: 1 * sizeOfUint8,
DstElemSize4: 4 * sizeOfUint8,
XMM3: flXMM3_65536,
XMM4: flXMM4,
XMM5: flXMM5,
XMM6: opOverXMM6,
XMM8: opOverXMM8,
XMM9: opOverXMM9,
XMM10: opOverXMM10,
LoadArgs: twoArgLoadArgs,
Setup: flSetup,
LoadXMMRegs: flLoadXMMRegs65536 + "\n" + opOverLoadXMMRegs,
Cleanup: flCleanup,
Add: flAdd,
ClampAndScale: flClampAndScale65536,
ConvertToInt32: flConvertToInt32,
@ -144,29 +187,59 @@ var instances = []struct {
LongName: "floatingAccumulateOpSrc",
ShortName: "flAccOpSrc",
FrameSize: flFrameSize,
SrcType: flSrcType,
ArgsSize: twoArgArgsSize,
Args: "dst []uint8, src []float32",
DstElemSize1: 1 * sizeOfUint8,
DstElemSize4: 4 * sizeOfUint8,
XMM3: flXMM3_256,
XMM4: flXMM4,
XMM5: flXMM5,
XMM6: opSrcXMM6,
XMM8: opSrcXMM8,
XMM9: opSrcXMM9,
XMM10: opSrcXMM10,
LoadArgs: twoArgLoadArgs,
Setup: flSetup,
LoadXMMRegs: flLoadXMMRegs256 + "\n" + opSrcLoadXMMRegs,
Cleanup: flCleanup,
Add: flAdd,
ClampAndScale: flClampAndScale256,
ConvertToInt32: flConvertToInt32,
Store4: opSrcStore4,
Store1: opSrcStore1,
}, {
LongName: "floatingAccumulateMask",
ShortName: "flAccMask",
FrameSize: flFrameSize,
ArgsSize: twoArgArgsSize,
Args: "dst []uint32, src []float32",
DstElemSize1: 1 * sizeOfUint32,
DstElemSize4: 4 * sizeOfUint32,
XMM3: flXMM3_65536,
XMM4: flXMM4,
XMM5: flXMM5,
XMM6: maskXMM6,
XMM8: maskXMM8,
XMM9: maskXMM9,
XMM10: maskXMM10,
LoadArgs: twoArgLoadArgs,
Setup: flSetup,
LoadXMMRegs: flLoadXMMRegs65536 + "\n" + maskLoadXMMRegs,
Add: flAdd,
ClampAndScale: flClampAndScale65536,
ConvertToInt32: flConvertToInt32,
Store4: maskStore4,
Store1: maskStore1,
}}
const (
fxFrameSize = `0`
flFrameSize = `8`
fxSrcType = `[]uint32`
flSrcType = `[]float32`
oneArgArgsSize = `24`
twoArgArgsSize = `48`
sizeOfUint8 = 1
sizeOfUint32 = 4
fxXMM3 = `-`
flXMM3_256 = `flAlmost256`
@ -179,19 +252,32 @@ const (
fxXMM5_65536 = `fxAlmost65536`
flXMM5 = `flSignMask`
oneArgLoadArgs = `
MOVQ buf_base+0(FP), DI
MOVQ buf_len+8(FP), BX
MOVQ buf_base+0(FP), SI
MOVQ buf_len+8(FP), R10
`
twoArgLoadArgs = `
MOVQ dst_base+0(FP), DI
MOVQ dst_len+8(FP), BX
MOVQ src_base+24(FP), SI
MOVQ src_len+32(FP), R10
// Sanity check that len(dst) >= len(src).
CMPQ BX, R10
JLT {{.ShortName}}End
`
fxSetup = ``
flSetup = `
// Set MXCSR bits 13 and 14, so that the CVTPS2PL below is "Round To Zero".
// Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
// "Round To Zero".
STMXCSR mxcsrOrig-8(SP)
MOVL mxcsrOrig-8(SP), AX
ORL $0x6000, AX
MOVL AX, mxcsrNew-4(SP)
LDMXCSR mxcsrNew-4(SP)
`
fxCleanup = `// No-op.`
flCleanup = `LDMXCSR mxcsrOrig-8(SP)`
fxLoadXMMRegs256 = `
// fxAlmost256 := XMM(0x000000ff repeated four times) // Maximum of an uint8.
MOVOU fxAlmost256<>(SB), X5
@ -271,8 +357,16 @@ const (
MULPS X3, X2
`
fxConvertToInt32 = `// No-op.`
flConvertToInt32 = `CVTPS2PL X2, X2`
fxConvertToInt32 = `
// z = convertToInt32(y)
// No-op.
`
flConvertToInt32 = `
// z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP)
CVTPS2PL X2, X2
LDMXCSR mxcsrOrig-8(SP)
`
opOverStore4 = `
// Blend over the dst's prior value. SIMD for i in 0..3:
@ -324,6 +418,10 @@ const (
PSHUFB X6, X2
MOVL X2, (DI)
`
maskStore4 = `
// copy(dst[:4], z)
MOVOU X2, (DI)
`
opOverStore1 = `
// Blend over the dst's prior value.
@ -350,23 +448,40 @@ const (
MOVL X2, BX
MOVB BX, (DI)
`
maskStore1 = `
// dst[0] = uint32(z)
MOVL X2, (DI)
`
opOverXMM6 = `gather`
opSrcXMM6 = `gather`
maskXMM6 = `-`
opOverXMM8 = `scatterAndMulBy0x101`
opSrcXMM8 = `-`
maskXMM8 = `-`
opOverXMM9 = `fxAlmost65536`
opSrcXMM9 = `-`
maskXMM9 = `-`
opOverXMM10 = `inverseFFFF`
opSrcXMM10 = `-`
maskXMM10 = `-`
opOverLoadXMMRegs = `
// gather := XMM(see above) // PSHUFB shuffle mask.
// scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask.
// fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff.
// inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
MOVOU gather<>(SB), X6
MOVOU scatterAndMulBy0x101<>(SB), X8
MOVOU fxAlmost65536<>(SB), X9
MOVOU inverseFFFF<>(SB), X10
`
opSrcLoadXMMRegs = ``
opSrcLoadXMMRegs = `
// gather := XMM(see above) // PSHUFB shuffle mask.
MOVOU gather<>(SB), X6
`
maskLoadXMMRegs = ``
)

View File

@ -68,7 +68,7 @@ TEXT ·haveSSE4_1(SB), NOSPLIT, $0
// ----------------------------------------------------------------------------
// func {{.LongName}}SIMD(dst []uint8, src {{.SrcType}})
// func {{.LongName}}SIMD({{.Args}})
//
// XMM registers. Variable names are per
// https://github.com/google/font-rs/blob/master/src/accumulate.c
@ -79,20 +79,13 @@ TEXT ·haveSSE4_1(SB), NOSPLIT, $0
// xmm3 {{.XMM3}}
// xmm4 {{.XMM4}}
// xmm5 {{.XMM5}}
// xmm6 gather
// xmm6 {{.XMM6}}
// xmm7 offset
// xmm8 {{.XMM8}}
// xmm9 {{.XMM9}}
// xmm10 {{.XMM10}}
TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48
MOVQ dst_base+0(FP), DI
MOVQ dst_len+8(FP), BX
MOVQ src_base+24(FP), SI
MOVQ src_len+32(FP), R10
// Sanity check that len(dst) >= len(src).
CMPQ BX, R10
JLT {{.ShortName}}End
TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-{{.ArgsSize}}
{{.LoadArgs}}
// R10 = len(src) &^ 3
// R11 = len(src)
@ -103,9 +96,7 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48
{{.LoadXMMRegs}}
// gather := XMM(see above) // PSHUFB shuffle mask.
// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
MOVOU gather<>(SB), X6
XORPS X7, X7
// i := 0
@ -139,7 +130,6 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48
{{.ClampAndScale}}
// z = convertToInt32(y)
{{.ConvertToInt32}}
{{.Store4}}
@ -152,14 +142,14 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48
// dst = dst[4:]
// src = src[4:]
ADDQ $4, R9
ADDQ $4, DI
ADDQ ${{.DstElemSize4}}, DI
ADDQ $16, SI
JMP {{.ShortName}}Loop4
{{.ShortName}}Loop1:
// for i < len(src)
CMPQ R9, R11
JAE {{.ShortName}}Cleanup
JAE {{.ShortName}}End
// x = src[i] + offset
MOVL (SI), X1
@ -167,7 +157,6 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48
{{.ClampAndScale}}
// z = convertToInt32(y)
{{.ConvertToInt32}}
{{.Store1}}
@ -179,12 +168,9 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48
// dst = dst[1:]
// src = src[1:]
ADDQ $1, R9
ADDQ $1, DI
ADDQ ${{.DstElemSize1}}, DI
ADDQ $4, SI
JMP {{.ShortName}}Loop1
{{.ShortName}}Cleanup:
{{.Cleanup}}
{{.ShortName}}End:
RET
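
The {{.DstElemSize4}} and {{.DstElemSize1}} placeholders above (set per instance in
gen.go earlier) are what let one template body drive both the op loops, which advance
dst by one byte per uint8 element, and the mask loops, which advance it by four bytes
per uint32 element. An illustrative pairing, not a verbatim excerpt of gen.go:

const (
	sizeOfUint8  = 1 // as in gen.go's const block
	sizeOfUint32 = 4
)

// The op variants write uint8 dst elements, the mask variants uint32 ones,
// so the generated pointer advances differ:
//
//	op:   4*sizeOfUint8  = 4,  1*sizeOfUint8  = 1  -> ADDQ $4, DI  / ADDQ $1, DI
//	mask: 4*sizeOfUint32 = 16, 1*sizeOfUint32 = 4  -> ADDQ $16, DI / ADDQ $4, DI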

View File

@ -190,6 +190,11 @@ func floatingAccumulateOpSrc(dst []uint8, src []float32) {
}
func floatingAccumulateMask(dst []uint32, src []float32) {
// Sanity check that len(dst) >= len(src).
if len(dst) < len(src) {
return
}
acc := float32(0)
for i, v := range src {
acc += v

View File

@ -310,9 +310,17 @@ func (z *Rasterizer) accumulateMask() {
} else {
z.bufU32 = z.bufU32[:n]
}
floatingAccumulateMask(z.bufU32, z.bufF32)
if haveFloatingAccumulateSIMD {
floatingAccumulateMaskSIMD(z.bufU32, z.bufF32)
} else {
floatingAccumulateMask(z.bufU32, z.bufF32)
}
} else {
fixedAccumulateMask(z.bufU32)
if haveFixedAccumulateSIMD {
fixedAccumulateMaskSIMD(z.bufU32)
} else {
fixedAccumulateMask(z.bufU32)
}
}
}
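
For context, a hedged sketch of the kind of caller that ends up in accumulateMask
above: drawing into a destination other than an exactly-sized *image.Alpha (for
example an *image.RGBA, as in the GlyphRGBA benchmarks) goes through the mask path.
The signatures below are assumed from golang.org/x/image/vector's public API and
should be checked against the package:

package main

import (
	"image"

	"golang.org/x/image/vector"
)

func main() {
	dst := image.NewRGBA(image.Rect(0, 0, 64, 64))
	z := vector.NewRasterizer(64, 64)
	z.MoveTo(16, 16)
	z.LineTo(48, 16)
	z.LineTo(32, 48)
	z.ClosePath()
	// An RGBA destination exercises accumulateMask, which now dispatches to
	// the SIMD routines when haveFixedAccumulateSIMD or
	// haveFloatingAccumulateSIMD is true.
	z.Draw(dst, dst.Bounds(), image.Opaque, image.Point{})
}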