vector: add SIMD versions of xxxAccumulateMask.

name                              old time/op  new time/op  delta
GlyphAlphaLoose16Over-8           3.96µs ± 0%  3.64µs ± 1%   -8.08%   (p=0.000 n=8+10)
GlyphAlphaLoose16Src-8            3.64µs ± 0%  3.35µs ± 0%   -7.88%    (p=0.000 n=8+9)
GlyphAlphaLoose32Over-8           8.45µs ± 0%  6.74µs ± 0%  -20.22%    (p=0.000 n=8+9)
GlyphAlphaLoose32Src-8            7.24µs ± 0%  5.54µs ± 1%  -23.48%   (p=0.000 n=8+10)
GlyphAlphaLoose64Over-8           22.2µs ± 0%  17.4µs ± 0%  -21.67%    (p=0.000 n=9+9)
GlyphAlphaLoose64Src-8            17.6µs ± 1%  12.2µs ± 1%  -30.32%  (p=0.000 n=10+10)
GlyphAlphaLoose128Over-8          67.9µs ± 0%  53.3µs ± 1%  -21.53%  (p=0.000 n=10+10)
GlyphAlphaLoose128Src-8           48.2µs ± 0%  32.6µs ± 2%  -32.41%   (p=0.000 n=9+10)
GlyphAlphaLoose256Over-8           242µs ± 1%   187µs ± 1%  -22.96%    (p=0.000 n=9+9)
GlyphAlphaLoose256Src-8            163µs ± 0%   105µs ± 1%  -35.83%    (p=0.000 n=9+9)
GlyphRGBA16Over-8                 5.25µs ± 1%  4.95µs ± 0%   -5.78%    (p=0.000 n=9+9)
GlyphRGBA16Src-8                  4.72µs ± 0%  4.43µs ± 1%   -6.22%   (p=0.000 n=9+10)
GlyphRGBA32Over-8                 13.5µs ± 0%  11.9µs ± 1%  -12.19%   (p=0.000 n=9+10)
GlyphRGBA32Src-8                  11.5µs ± 1%   9.8µs ± 0%  -14.72%    (p=0.000 n=9+9)
GlyphRGBA64Over-8                 42.0µs ± 2%  36.9µs ± 1%  -12.19%  (p=0.000 n=10+10)
GlyphRGBA64Src-8                  34.1µs ± 1%  28.5µs ± 0%  -16.25%    (p=0.000 n=9+7)
GlyphRGBA128Over-8                 149µs ± 2%   133µs ± 1%  -10.24%   (p=0.000 n=10+9)
GlyphRGBA128Src-8                  115µs ± 1%    99µs ± 1%  -13.57%   (p=0.000 n=9+10)
GlyphRGBA256Over-8                 566µs ± 0%   511µs ± 1%   -9.85%   (p=0.000 n=9+10)
GlyphRGBA256Src-8                  435µs ± 0%   372µs ± 0%  -14.64%    (p=0.000 n=9+8)
GlyphNRGBA16Over-8                26.9µs ± 3%  26.0µs ± 3%   -3.55%   (p=0.000 n=10+9)
GlyphNRGBA16Src-8                 18.8µs ± 2%  18.4µs ± 2%   -2.21%   (p=0.000 n=9+10)
GlyphNRGBA32Over-8                99.1µs ± 2%  95.9µs ± 3%   -3.23%  (p=0.000 n=10+10)
GlyphNRGBA32Src-8                 65.6µs ± 3%  62.8µs ± 2%   -4.36%  (p=0.000 n=10+10)
GlyphNRGBA64Over-8                 376µs ± 4%   370µs ± 2%     ~     (p=0.063 n=10+10)
GlyphNRGBA64Src-8                  238µs ± 3%   233µs ± 1%   -2.21%   (p=0.000 n=9+10)
GlyphNRGBA128Over-8               1.52ms ± 2%  1.48ms ± 0%   -2.11%   (p=0.000 n=10+8)
GlyphNRGBA128Src-8                 951µs ± 3%   935µs ± 1%   -1.69%   (p=0.013 n=10+9)
GlyphNRGBA256Over-8               6.00ms ± 1%  5.87ms ± 3%   -2.12%  (p=0.002 n=10+10)
GlyphNRGBA256Src-8                3.94ms ± 2%  3.80ms ± 2%   -3.64%  (p=0.000 n=10+10)

A comparison of the non-SIMD and SIMD versions:

name                              time/op
FixedAccumulateMask16-8            237ns ± 0%
FixedAccumulateMaskSIMD16-8       80.0ns ± 1%
FloatingAccumulateMask16-8         413ns ± 2%
FloatingAccumulateMaskSIMD16-8     166ns ± 0%
FixedAccumulateMask64-8           3.42µs ± 0%
FixedAccumulateMaskSIMD64-8       1.09µs ± 0%
FloatingAccumulateMask64-8        6.92µs ± 0%
FloatingAccumulateMaskSIMD64-8    2.47µs ± 1%

Change-Id: Ib6980e5975ed2842ff2a372f76dd5f2e95c5526c
Reviewed-on: https://go-review.googlesource.com/30898
Reviewed-by: David Crawshaw <crawshaw@golang.org>
This commit is contained in:
Nigel Tao 2016-10-13 14:50:52 +11:00
parent beb9675609
commit e78c45720c
8 changed files with 507 additions and 74 deletions

View File

@ -20,8 +20,14 @@ func fixedAccumulateOpOverSIMD(dst []uint8, src []uint32)
//go:noescape //go:noescape
func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32) func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32)
//go:noescape
func fixedAccumulateMaskSIMD(buf []uint32)
//go:noescape //go:noescape
func floatingAccumulateOpOverSIMD(dst []uint8, src []float32) func floatingAccumulateOpOverSIMD(dst []uint8, src []float32)
//go:noescape //go:noescape
func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32) func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32)
//go:noescape
func floatingAccumulateMaskSIMD(dst []uint32, src []float32)

View File

@ -83,6 +83,7 @@ TEXT ·haveSSE4_1(SB), NOSPLIT, $0
// xmm9 fxAlmost65536 // xmm9 fxAlmost65536
// xmm10 inverseFFFF // xmm10 inverseFFFF
TEXT ·fixedAccumulateOpOverSIMD(SB), NOSPLIT, $0-48 TEXT ·fixedAccumulateOpOverSIMD(SB), NOSPLIT, $0-48
MOVQ dst_base+0(FP), DI MOVQ dst_base+0(FP), DI
MOVQ dst_len+8(FP), BX MOVQ dst_len+8(FP), BX
MOVQ src_base+24(FP), SI MOVQ src_base+24(FP), SI
@ -100,16 +101,16 @@ TEXT ·fixedAccumulateOpOverSIMD(SB), NOSPLIT, $0-48
// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16. // fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16.
MOVOU fxAlmost65536<>(SB), X5 MOVOU fxAlmost65536<>(SB), X5
// gather := XMM(see above) // PSHUFB shuffle mask.
// scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask. // scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask.
// fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff. // fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff.
// inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff. // inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
MOVOU gather<>(SB), X6
MOVOU scatterAndMulBy0x101<>(SB), X8 MOVOU scatterAndMulBy0x101<>(SB), X8
MOVOU fxAlmost65536<>(SB), X9 MOVOU fxAlmost65536<>(SB), X9
MOVOU inverseFFFF<>(SB), X10 MOVOU inverseFFFF<>(SB), X10
// gather := XMM(see above) // PSHUFB shuffle mask.
// offset := XMM(0x00000000 repeated four times) // Cumulative sum. // offset := XMM(0x00000000 repeated four times) // Cumulative sum.
MOVOU gather<>(SB), X6
XORPS X7, X7 XORPS X7, X7
// i := 0 // i := 0
@ -222,7 +223,7 @@ fxAccOpOverLoop4:
fxAccOpOverLoop1: fxAccOpOverLoop1:
// for i < len(src) // for i < len(src)
CMPQ R9, R11 CMPQ R9, R11
JAE fxAccOpOverCleanup JAE fxAccOpOverEnd
// x = src[i] + offset // x = src[i] + offset
MOVL (SI), X1 MOVL (SI), X1
@ -276,9 +277,6 @@ fxAccOpOverLoop1:
ADDQ $4, SI ADDQ $4, SI
JMP fxAccOpOverLoop1 JMP fxAccOpOverLoop1
fxAccOpOverCleanup:
// No-op.
fxAccOpOverEnd: fxAccOpOverEnd:
RET RET
@ -301,6 +299,7 @@ fxAccOpOverEnd:
// xmm9 - // xmm9 -
// xmm10 - // xmm10 -
TEXT ·fixedAccumulateOpSrcSIMD(SB), NOSPLIT, $0-48 TEXT ·fixedAccumulateOpSrcSIMD(SB), NOSPLIT, $0-48
MOVQ dst_base+0(FP), DI MOVQ dst_base+0(FP), DI
MOVQ dst_len+8(FP), BX MOVQ dst_len+8(FP), BX
MOVQ src_base+24(FP), SI MOVQ src_base+24(FP), SI
@ -319,8 +318,9 @@ TEXT ·fixedAccumulateOpSrcSIMD(SB), NOSPLIT, $0-48
MOVOU fxAlmost256<>(SB), X5 MOVOU fxAlmost256<>(SB), X5
// gather := XMM(see above) // PSHUFB shuffle mask. // gather := XMM(see above) // PSHUFB shuffle mask.
// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
MOVOU gather<>(SB), X6 MOVOU gather<>(SB), X6
// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
XORPS X7, X7 XORPS X7, X7
// i := 0 // i := 0
@ -390,7 +390,7 @@ fxAccOpSrcLoop4:
fxAccOpSrcLoop1: fxAccOpSrcLoop1:
// for i < len(src) // for i < len(src)
CMPQ R9, R11 CMPQ R9, R11
JAE fxAccOpSrcCleanup JAE fxAccOpSrcEnd
// x = src[i] + offset // x = src[i] + offset
MOVL (SI), X1 MOVL (SI), X1
@ -429,10 +429,149 @@ fxAccOpSrcLoop1:
ADDQ $4, SI ADDQ $4, SI
JMP fxAccOpSrcLoop1 JMP fxAccOpSrcLoop1
fxAccOpSrcCleanup: fxAccOpSrcEnd:
RET
// ----------------------------------------------------------------------------
// func fixedAccumulateMaskSIMD(buf []uint32)
//
// Overwrites buf, in place, with the running (cumulative) sum of its 32-bit
// elements. Each sum is passed through abs, shifted right by 4 bits and
// clamped to at most fxAlmost65536 before being stored. Elements are handled
// four at a time, then one at a time for the remaining len(buf) & 3 elements.
//
// In the comments below, "src" and "dst" both refer to buf: SI (the src
// pointer) and DI (the dst pointer) are loaded from the same buf_base.
//
// General purpose registers:
//
// DI  dst pointer (advances as elements are written)
// SI  src pointer (advances as elements are read)
// R9  i, the number of elements processed so far
// R10 len(buf) &^ 3, the bound for the four-at-a-time loop
// R11 len(buf), the bound for the one-at-a-time loop
//
// XMM registers. Variable names are per
// https://github.com/google/font-rs/blob/master/src/accumulate.c
//
// xmm0 scratch
// xmm1 x
// xmm2 y, z
// xmm3 -
// xmm4 -
// xmm5 fxAlmost65536
// xmm6 -
// xmm7 offset
// xmm8 -
// xmm9 -
// xmm10 -
TEXT ·fixedAccumulateMaskSIMD(SB), NOSPLIT, $0-24
// Load the single slice argument as both dst (DI, BX) and src (SI, R10).
// BX is not read again below.
MOVQ buf_base+0(FP), DI
MOVQ buf_len+8(FP), BX
MOVQ buf_base+0(FP), SI
MOVQ buf_len+8(FP), R10
// R10 = len(src) &^ 3
// R11 = len(src)
MOVQ R10, R11
ANDQ $-4, R10
// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of a uint16.
MOVOU fxAlmost65536<>(SB), X5
// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
XORPS X7, X7
// i := 0
MOVQ $0, R9
fxAccMaskLoop4:
// for i < (len(src) &^ 3)
CMPQ R9, R10
JAE fxAccMaskLoop1
// x = XMM(s0, s1, s2, s3)
//
// Where s0 is src[i+0], s1 is src[i+1], etc.
MOVOU (SI), X1
// scratch = XMM(0, s0, s1, s2)
// x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
//
// The 4-byte whole-register shift moves every 32-bit element up one lane.
MOVOU X1, X0
PSLLO $4, X0
PADDD X0, X1
// scratch = XMM(0, 0, 0, 0)
// scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
// x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
XORPS X0, X0
SHUFPS $0x40, X1, X0
PADDD X0, X1
// x += offset
//
// offset carries the running total from all previously processed elements.
PADDD X7, X1
// y = abs(x)
// y >>= 4 // Shift by 2*ϕ - 16.
// y = min(y, fxAlmost65536)
//
// pabsd %xmm1,%xmm2
// psrld $0x4,%xmm2
// pminud %xmm5,%xmm2
//
// Hopefully we'll get these opcode mnemonics into the assembler for Go
// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
// it's similar.
//
// Until then, the three instructions above are emitted as raw bytes, one
// instruction per line, in the same order.
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x04
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
// z = convertToInt32(y)
// No-op. // No-op.
fxAccOpSrcEnd: // copy(dst[:4], z)
MOVOU X2, (DI)
// offset = XMM(x@3, x@3, x@3, x@3)
//
// x@3 is the unclamped running total through src[i+3]; it is broadcast to
// all four lanes so that it can be added to the next group of four.
MOVOU X1, X7
SHUFPS $0xff, X1, X7
// i += 4
// dst = dst[4:]
// src = src[4:]
ADDQ $4, R9
ADDQ $16, DI
ADDQ $16, SI
JMP fxAccMaskLoop4
fxAccMaskLoop1:
// for i < len(src)
CMPQ R9, R11
JAE fxAccMaskEnd
// x = src[i] + offset
MOVL (SI), X1
PADDD X7, X1
// y = abs(x)
// y >>= 4 // Shift by 2*ϕ - 16.
// y = min(y, fxAlmost65536)
//
// pabsd %xmm1,%xmm2
// psrld $0x4,%xmm2
// pminud %xmm5,%xmm2
//
// Hopefully we'll get these opcode mnemonics into the assembler for Go
// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
// it's similar.
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x04
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
// z = convertToInt32(y)
// No-op.
// dst[0] = uint32(z)
//
// Only the low 32 bits of z are written.
MOVL X2, (DI)
// offset = x
MOVOU X1, X7
// i += 1
// dst = dst[1:]
// src = src[1:]
ADDQ $1, R9
ADDQ $4, DI
ADDQ $4, SI
JMP fxAccMaskLoop1
fxAccMaskEnd:
RET RET
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
@ -454,6 +593,7 @@ fxAccOpSrcEnd:
// xmm9 fxAlmost65536 // xmm9 fxAlmost65536
// xmm10 inverseFFFF // xmm10 inverseFFFF
TEXT ·floatingAccumulateOpOverSIMD(SB), NOSPLIT, $8-48 TEXT ·floatingAccumulateOpOverSIMD(SB), NOSPLIT, $8-48
MOVQ dst_base+0(FP), DI MOVQ dst_base+0(FP), DI
MOVQ dst_len+8(FP), BX MOVQ dst_len+8(FP), BX
MOVQ src_base+24(FP), SI MOVQ src_base+24(FP), SI
@ -468,12 +608,12 @@ TEXT ·floatingAccumulateOpOverSIMD(SB), NOSPLIT, $8-48
MOVQ R10, R11 MOVQ R10, R11
ANDQ $-4, R10 ANDQ $-4, R10
// Set MXCSR bits 13 and 14, so that the CVTPS2PL below is "Round To Zero". // Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
// "Round To Zero".
STMXCSR mxcsrOrig-8(SP) STMXCSR mxcsrOrig-8(SP)
MOVL mxcsrOrig-8(SP), AX MOVL mxcsrOrig-8(SP), AX
ORL $0x6000, AX ORL $0x6000, AX
MOVL AX, mxcsrNew-4(SP) MOVL AX, mxcsrNew-4(SP)
LDMXCSR mxcsrNew-4(SP)
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32. // flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32. // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
@ -482,16 +622,16 @@ TEXT ·floatingAccumulateOpOverSIMD(SB), NOSPLIT, $8-48
MOVOU flOne<>(SB), X4 MOVOU flOne<>(SB), X4
MOVOU flSignMask<>(SB), X5 MOVOU flSignMask<>(SB), X5
// gather := XMM(see above) // PSHUFB shuffle mask.
// scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask. // scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask.
// fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff. // fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff.
// inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff. // inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
MOVOU gather<>(SB), X6
MOVOU scatterAndMulBy0x101<>(SB), X8 MOVOU scatterAndMulBy0x101<>(SB), X8
MOVOU fxAlmost65536<>(SB), X9 MOVOU fxAlmost65536<>(SB), X9
MOVOU inverseFFFF<>(SB), X10 MOVOU inverseFFFF<>(SB), X10
// gather := XMM(see above) // PSHUFB shuffle mask.
// offset := XMM(0x00000000 repeated four times) // Cumulative sum. // offset := XMM(0x00000000 repeated four times) // Cumulative sum.
MOVOU gather<>(SB), X6
XORPS X7, X7 XORPS X7, X7
// i := 0 // i := 0
@ -532,7 +672,9 @@ flAccOpOverLoop4:
MULPS X3, X2 MULPS X3, X2
// z = convertToInt32(y) // z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP)
CVTPS2PL X2, X2 CVTPS2PL X2, X2
LDMXCSR mxcsrOrig-8(SP)
// Blend over the dst's prior value. SIMD for i in 0..3: // Blend over the dst's prior value. SIMD for i in 0..3:
// //
@ -597,7 +739,7 @@ flAccOpOverLoop4:
flAccOpOverLoop1: flAccOpOverLoop1:
// for i < len(src) // for i < len(src)
CMPQ R9, R11 CMPQ R9, R11
JAE flAccOpOverCleanup JAE flAccOpOverEnd
// x = src[i] + offset // x = src[i] + offset
MOVL (SI), X1 MOVL (SI), X1
@ -612,7 +754,9 @@ flAccOpOverLoop1:
MULPS X3, X2 MULPS X3, X2
// z = convertToInt32(y) // z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP)
CVTPS2PL X2, X2 CVTPS2PL X2, X2
LDMXCSR mxcsrOrig-8(SP)
// Blend over the dst's prior value. // Blend over the dst's prior value.
// //
@ -644,9 +788,6 @@ flAccOpOverLoop1:
ADDQ $4, SI ADDQ $4, SI
JMP flAccOpOverLoop1 JMP flAccOpOverLoop1
flAccOpOverCleanup:
LDMXCSR mxcsrOrig-8(SP)
flAccOpOverEnd: flAccOpOverEnd:
RET RET
@ -669,6 +810,7 @@ flAccOpOverEnd:
// xmm9 - // xmm9 -
// xmm10 - // xmm10 -
TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48 TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48
MOVQ dst_base+0(FP), DI MOVQ dst_base+0(FP), DI
MOVQ dst_len+8(FP), BX MOVQ dst_len+8(FP), BX
MOVQ src_base+24(FP), SI MOVQ src_base+24(FP), SI
@ -683,12 +825,12 @@ TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48
MOVQ R10, R11 MOVQ R10, R11
ANDQ $-4, R10 ANDQ $-4, R10
// Set MXCSR bits 13 and 14, so that the CVTPS2PL below is "Round To Zero". // Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
// "Round To Zero".
STMXCSR mxcsrOrig-8(SP) STMXCSR mxcsrOrig-8(SP)
MOVL mxcsrOrig-8(SP), AX MOVL mxcsrOrig-8(SP), AX
ORL $0x6000, AX ORL $0x6000, AX
MOVL AX, mxcsrNew-4(SP) MOVL AX, mxcsrNew-4(SP)
LDMXCSR mxcsrNew-4(SP)
// flAlmost256 := XMM(0x437fffff repeated four times) // 255.99998 as a float32. // flAlmost256 := XMM(0x437fffff repeated four times) // 255.99998 as a float32.
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32. // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
@ -698,8 +840,9 @@ TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48
MOVOU flSignMask<>(SB), X5 MOVOU flSignMask<>(SB), X5
// gather := XMM(see above) // PSHUFB shuffle mask. // gather := XMM(see above) // PSHUFB shuffle mask.
// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
MOVOU gather<>(SB), X6 MOVOU gather<>(SB), X6
// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
XORPS X7, X7 XORPS X7, X7
// i := 0 // i := 0
@ -740,7 +883,9 @@ flAccOpSrcLoop4:
MULPS X3, X2 MULPS X3, X2
// z = convertToInt32(y) // z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP)
CVTPS2PL X2, X2 CVTPS2PL X2, X2
LDMXCSR mxcsrOrig-8(SP)
// z = shuffleTheLowBytesOfEach4ByteElement(z) // z = shuffleTheLowBytesOfEach4ByteElement(z)
// copy(dst[:4], low4BytesOf(z)) // copy(dst[:4], low4BytesOf(z))
@ -762,7 +907,7 @@ flAccOpSrcLoop4:
flAccOpSrcLoop1: flAccOpSrcLoop1:
// for i < len(src) // for i < len(src)
CMPQ R9, R11 CMPQ R9, R11
JAE flAccOpSrcCleanup JAE flAccOpSrcEnd
// x = src[i] + offset // x = src[i] + offset
MOVL (SI), X1 MOVL (SI), X1
@ -777,7 +922,9 @@ flAccOpSrcLoop1:
MULPS X3, X2 MULPS X3, X2
// z = convertToInt32(y) // z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP)
CVTPS2PL X2, X2 CVTPS2PL X2, X2
LDMXCSR mxcsrOrig-8(SP)
// dst[0] = uint8(z) // dst[0] = uint8(z)
MOVL X2, BX MOVL X2, BX
@ -794,8 +941,152 @@ flAccOpSrcLoop1:
ADDQ $4, SI ADDQ $4, SI
JMP flAccOpSrcLoop1 JMP flAccOpSrcLoop1
flAccOpSrcCleanup:
LDMXCSR mxcsrOrig-8(SP)
flAccOpSrcEnd: flAccOpSrcEnd:
RET RET
// ----------------------------------------------------------------------------
// func floatingAccumulateMaskSIMD(dst []uint32, src []float32)
//
// Writes to dst the running (cumulative) sum of the float32 elements of src.
// Each sum has its sign bit cleared, is clamped to at most flOne, multiplied
// by flAlmost65536 and converted to a 32-bit integer with "Round To Zero"
// before being stored. Elements are handled four at a time, then one at a
// time for the remaining len(src) & 3 elements.
//
// If len(dst) < len(src), the function returns without writing anything.
//
// The 8-byte frame holds two 4-byte MXCSR images: mxcsrOrig (the value on
// entry) and mxcsrNew (the same value with bits 13 and 14 set).
//
// General purpose registers:
//
// DI  dst pointer (advances as elements are written)
// BX  len(dst), used only by the sanity check
// SI  src pointer (advances as elements are read)
// R9  i, the number of elements processed so far
// R10 len(src) &^ 3, the bound for the four-at-a-time loop
// R11 len(src), the bound for the one-at-a-time loop
// AX  temporary used to build mxcsrNew
//
// XMM registers. Variable names are per
// https://github.com/google/font-rs/blob/master/src/accumulate.c
//
// xmm0 scratch
// xmm1 x
// xmm2 y, z
// xmm3 flAlmost65536
// xmm4 flOne
// xmm5 flSignMask
// xmm6 -
// xmm7 offset
// xmm8 -
// xmm9 -
// xmm10 -
TEXT ·floatingAccumulateMaskSIMD(SB), NOSPLIT, $8-48
MOVQ dst_base+0(FP), DI
MOVQ dst_len+8(FP), BX
MOVQ src_base+24(FP), SI
MOVQ src_len+32(FP), R10
// Sanity check that len(dst) >= len(src).
//
// This early exit happens before MXCSR is touched, so there is nothing to
// restore on this path.
CMPQ BX, R10
JLT flAccMaskEnd
// R10 = len(src) &^ 3
// R11 = len(src)
MOVQ R10, R11
ANDQ $-4, R10
// Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
// "Round To Zero".
//
// Only the two MXCSR images are prepared here. The new value is loaded just
// before each CVTPS2PL and the original is reloaded right after it.
STMXCSR mxcsrOrig-8(SP)
MOVL mxcsrOrig-8(SP), AX
ORL $0x6000, AX
MOVL AX, mxcsrNew-4(SP)
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
MOVOU flAlmost65536<>(SB), X3
MOVOU flOne<>(SB), X4
MOVOU flSignMask<>(SB), X5
// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
XORPS X7, X7
// i := 0
MOVQ $0, R9
flAccMaskLoop4:
// for i < (len(src) &^ 3)
CMPQ R9, R10
JAE flAccMaskLoop1
// x = XMM(s0, s1, s2, s3)
//
// Where s0 is src[i+0], s1 is src[i+1], etc.
MOVOU (SI), X1
// scratch = XMM(0, s0, s1, s2)
// x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
//
// The 4-byte whole-register shift moves every 32-bit element up one lane.
MOVOU X1, X0
PSLLO $4, X0
ADDPS X0, X1
// scratch = XMM(0, 0, 0, 0)
// scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
// x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
XORPS X0, X0
SHUFPS $0x40, X1, X0
ADDPS X0, X1
// x += offset
//
// offset carries the running total from all previously processed elements.
ADDPS X7, X1
// y = x & flSignMask
// y = min(y, flOne)
// y = mul(y, flAlmost65536)
MOVOU X5, X2
ANDPS X1, X2
MINPS X4, X2
MULPS X3, X2
// z = convertToInt32(y)
//
// Switch to the "Round To Zero" MXCSR for the conversion only, then restore
// the value that was in MXCSR on entry.
LDMXCSR mxcsrNew-4(SP)
CVTPS2PL X2, X2
LDMXCSR mxcsrOrig-8(SP)
// copy(dst[:4], z)
MOVOU X2, (DI)
// offset = XMM(x@3, x@3, x@3, x@3)
//
// x@3 is the unclamped running total through src[i+3]; it is broadcast to
// all four lanes so that it can be added to the next group of four.
MOVOU X1, X7
SHUFPS $0xff, X1, X7
// i += 4
// dst = dst[4:]
// src = src[4:]
ADDQ $4, R9
ADDQ $16, DI
ADDQ $16, SI
JMP flAccMaskLoop4
flAccMaskLoop1:
// for i < len(src)
CMPQ R9, R11
JAE flAccMaskEnd
// x = src[i] + offset
MOVL (SI), X1
ADDPS X7, X1
// y = x & flSignMask
// y = min(y, flOne)
// y = mul(y, flAlmost65536)
MOVOU X5, X2
ANDPS X1, X2
MINPS X4, X2
MULPS X3, X2
// z = convertToInt32(y)
//
// As in the loop above, the modified MXCSR is in effect only for the
// conversion itself.
LDMXCSR mxcsrNew-4(SP)
CVTPS2PL X2, X2
LDMXCSR mxcsrOrig-8(SP)
// dst[0] = uint32(z)
//
// Only the low 32 bits of z are written.
MOVL X2, (DI)
// offset = x
MOVOU X1, X7
// i += 1
// dst = dst[1:]
// src = src[1:]
ADDQ $1, R9
ADDQ $4, DI
ADDQ $4, SI
JMP flAccMaskLoop1
flAccMaskEnd:
RET

View File

@ -11,5 +11,7 @@ const haveFloatingAccumulateSIMD = false
func fixedAccumulateOpOverSIMD(dst []uint8, src []uint32) {} func fixedAccumulateOpOverSIMD(dst []uint8, src []uint32) {}
func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32) {} func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32) {}
func fixedAccumulateMaskSIMD(buf []uint32) {}
func floatingAccumulateOpOverSIMD(dst []uint8, src []float32) {} func floatingAccumulateOpOverSIMD(dst []uint8, src []float32) {}
func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32) {} func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32) {}
func floatingAccumulateMaskSIMD(dst []uint32, src []float32) {}

View File

@ -201,8 +201,12 @@ func testAcc(t *testing.T, in interface{}, mask []uint32, op string) {
} }
case "mask": case "mask":
copy(got32, in[:n]) copy(got32, in[:n])
if simd {
fixedAccumulateMaskSIMD(got32)
} else {
fixedAccumulateMask(got32) fixedAccumulateMask(got32)
} }
}
case []float32: case []float32:
switch op { switch op {
case "over": case "over":
@ -218,9 +222,13 @@ func testAcc(t *testing.T, in interface{}, mask []uint32, op string) {
floatingAccumulateOpSrc(got8, in[:n]) floatingAccumulateOpSrc(got8, in[:n])
} }
case "mask": case "mask":
if simd {
floatingAccumulateMaskSIMD(got32, in[:n])
} else {
floatingAccumulateMask(got32, in[:n]) floatingAccumulateMask(got32, in[:n])
} }
} }
}
if op != "mask" { if op != "mask" {
if !bytes.Equal(got8, want8) { if !bytes.Equal(got8, want8) {
@ -264,22 +272,26 @@ func BenchmarkFixedAccumulateOpOverSIMD16(b *testing.B) { benchAcc(b, fxIn16,
func BenchmarkFixedAccumulateOpSrc16(b *testing.B) { benchAcc(b, fxIn16, "src", false) } func BenchmarkFixedAccumulateOpSrc16(b *testing.B) { benchAcc(b, fxIn16, "src", false) }
func BenchmarkFixedAccumulateOpSrcSIMD16(b *testing.B) { benchAcc(b, fxIn16, "src", true) } func BenchmarkFixedAccumulateOpSrcSIMD16(b *testing.B) { benchAcc(b, fxIn16, "src", true) }
func BenchmarkFixedAccumulateMask16(b *testing.B) { benchAcc(b, fxIn16, "mask", false) } func BenchmarkFixedAccumulateMask16(b *testing.B) { benchAcc(b, fxIn16, "mask", false) }
func BenchmarkFixedAccumulateMaskSIMD16(b *testing.B) { benchAcc(b, fxIn16, "mask", true) }
func BenchmarkFloatingAccumulateOpOver16(b *testing.B) { benchAcc(b, flIn16, "over", false) } func BenchmarkFloatingAccumulateOpOver16(b *testing.B) { benchAcc(b, flIn16, "over", false) }
func BenchmarkFloatingAccumulateOpOverSIMD16(b *testing.B) { benchAcc(b, flIn16, "over", true) } func BenchmarkFloatingAccumulateOpOverSIMD16(b *testing.B) { benchAcc(b, flIn16, "over", true) }
func BenchmarkFloatingAccumulateOpSrc16(b *testing.B) { benchAcc(b, flIn16, "src", false) } func BenchmarkFloatingAccumulateOpSrc16(b *testing.B) { benchAcc(b, flIn16, "src", false) }
func BenchmarkFloatingAccumulateOpSrcSIMD16(b *testing.B) { benchAcc(b, flIn16, "src", true) } func BenchmarkFloatingAccumulateOpSrcSIMD16(b *testing.B) { benchAcc(b, flIn16, "src", true) }
func BenchmarkFloatingAccumulateMask16(b *testing.B) { benchAcc(b, flIn16, "mask", false) } func BenchmarkFloatingAccumulateMask16(b *testing.B) { benchAcc(b, flIn16, "mask", false) }
func BenchmarkFloatingAccumulateMaskSIMD16(b *testing.B) { benchAcc(b, flIn16, "mask", true) }
func BenchmarkFixedAccumulateOpOver64(b *testing.B) { benchAcc(b, fxIn64, "over", false) } func BenchmarkFixedAccumulateOpOver64(b *testing.B) { benchAcc(b, fxIn64, "over", false) }
func BenchmarkFixedAccumulateOpOverSIMD64(b *testing.B) { benchAcc(b, fxIn64, "over", true) } func BenchmarkFixedAccumulateOpOverSIMD64(b *testing.B) { benchAcc(b, fxIn64, "over", true) }
func BenchmarkFixedAccumulateOpSrc64(b *testing.B) { benchAcc(b, fxIn64, "src", false) } func BenchmarkFixedAccumulateOpSrc64(b *testing.B) { benchAcc(b, fxIn64, "src", false) }
func BenchmarkFixedAccumulateOpSrcSIMD64(b *testing.B) { benchAcc(b, fxIn64, "src", true) } func BenchmarkFixedAccumulateOpSrcSIMD64(b *testing.B) { benchAcc(b, fxIn64, "src", true) }
func BenchmarkFixedAccumulateMask64(b *testing.B) { benchAcc(b, fxIn64, "mask", false) } func BenchmarkFixedAccumulateMask64(b *testing.B) { benchAcc(b, fxIn64, "mask", false) }
func BenchmarkFixedAccumulateMaskSIMD64(b *testing.B) { benchAcc(b, fxIn64, "mask", true) }
func BenchmarkFloatingAccumulateOpOver64(b *testing.B) { benchAcc(b, flIn64, "over", false) } func BenchmarkFloatingAccumulateOpOver64(b *testing.B) { benchAcc(b, flIn64, "over", false) }
func BenchmarkFloatingAccumulateOpOverSIMD64(b *testing.B) { benchAcc(b, flIn64, "over", true) } func BenchmarkFloatingAccumulateOpOverSIMD64(b *testing.B) { benchAcc(b, flIn64, "over", true) }
func BenchmarkFloatingAccumulateOpSrc64(b *testing.B) { benchAcc(b, flIn64, "src", false) } func BenchmarkFloatingAccumulateOpSrc64(b *testing.B) { benchAcc(b, flIn64, "src", false) }
func BenchmarkFloatingAccumulateOpSrcSIMD64(b *testing.B) { benchAcc(b, flIn64, "src", true) } func BenchmarkFloatingAccumulateOpSrcSIMD64(b *testing.B) { benchAcc(b, flIn64, "src", true) }
func BenchmarkFloatingAccumulateMask64(b *testing.B) { benchAcc(b, flIn64, "mask", false) } func BenchmarkFloatingAccumulateMask64(b *testing.B) { benchAcc(b, flIn64, "mask", false) }
func BenchmarkFloatingAccumulateMaskSIMD64(b *testing.B) { benchAcc(b, flIn64, "mask", true) }
func benchAcc(b *testing.B, in interface{}, op string, simd bool) { func benchAcc(b *testing.B, in interface{}, op string, simd bool) {
var f func() var f func()
@ -308,8 +320,12 @@ func benchAcc(b *testing.B, in interface{}, op string, simd bool) {
case "mask": case "mask":
buf := make([]uint32, len(in)) buf := make([]uint32, len(in))
copy(buf, in) copy(buf, in)
if simd {
f = func() { fixedAccumulateMaskSIMD(buf) }
} else {
f = func() { fixedAccumulateMask(buf) } f = func() { fixedAccumulateMask(buf) }
} }
}
case []float32: case []float32:
if simd && !haveFloatingAccumulateSIMD { if simd && !haveFloatingAccumulateSIMD {
@ -333,9 +349,13 @@ func benchAcc(b *testing.B, in interface{}, op string, simd bool) {
} }
case "mask": case "mask":
dst := make([]uint32, len(in)) dst := make([]uint32, len(in))
if simd {
f = func() { floatingAccumulateMaskSIMD(dst, in) }
} else {
f = func() { floatingAccumulateMask(dst, in) } f = func() { floatingAccumulateMask(dst, in) }
} }
} }
}
b.ResetTimer() b.ResetTimer()
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {

View File

@ -10,6 +10,7 @@ import (
"bytes" "bytes"
"io/ioutil" "io/ioutil"
"log" "log"
"strings"
"text/template" "text/template"
) )
@ -54,6 +55,9 @@ func main() {
if i != 0 { if i != 0 {
out.WriteString("\n") out.WriteString("\n")
} }
if strings.Contains(v.LoadArgs, "{{.ShortName}}") {
v.LoadArgs = strings.Replace(v.LoadArgs, "{{.ShortName}}", v.ShortName, -1)
}
if err := t.Execute(out, v); err != nil { if err := t.Execute(out, v); err != nil {
log.Fatalf("Execute(%q): %v", v.ShortName, err) log.Fatalf("Execute(%q): %v", v.ShortName, err)
} }
@ -68,15 +72,19 @@ var instances = []struct {
LongName string LongName string
ShortName string ShortName string
FrameSize string FrameSize string
SrcType string ArgsSize string
Args string
DstElemSize1 int
DstElemSize4 int
XMM3 string XMM3 string
XMM4 string XMM4 string
XMM5 string XMM5 string
XMM6 string
XMM8 string XMM8 string
XMM9 string XMM9 string
XMM10 string XMM10 string
LoadArgs string
Setup string Setup string
Cleanup string
LoadXMMRegs string LoadXMMRegs string
Add string Add string
ClampAndScale string ClampAndScale string
@ -87,16 +95,20 @@ var instances = []struct {
LongName: "fixedAccumulateOpOver", LongName: "fixedAccumulateOpOver",
ShortName: "fxAccOpOver", ShortName: "fxAccOpOver",
FrameSize: fxFrameSize, FrameSize: fxFrameSize,
SrcType: fxSrcType, ArgsSize: twoArgArgsSize,
Args: "dst []uint8, src []uint32",
DstElemSize1: 1 * sizeOfUint8,
DstElemSize4: 4 * sizeOfUint8,
XMM3: fxXMM3, XMM3: fxXMM3,
XMM4: fxXMM4, XMM4: fxXMM4,
XMM5: fxXMM5_65536, XMM5: fxXMM5_65536,
XMM6: opOverXMM6,
XMM8: opOverXMM8, XMM8: opOverXMM8,
XMM9: opOverXMM9, XMM9: opOverXMM9,
XMM10: opOverXMM10, XMM10: opOverXMM10,
LoadArgs: twoArgLoadArgs,
Setup: fxSetup, Setup: fxSetup,
LoadXMMRegs: fxLoadXMMRegs65536 + "\n" + opOverLoadXMMRegs, LoadXMMRegs: fxLoadXMMRegs65536 + "\n" + opOverLoadXMMRegs,
Cleanup: fxCleanup,
Add: fxAdd, Add: fxAdd,
ClampAndScale: fxClampAndScale65536, ClampAndScale: fxClampAndScale65536,
ConvertToInt32: fxConvertToInt32, ConvertToInt32: fxConvertToInt32,
@ -106,35 +118,66 @@ var instances = []struct {
LongName: "fixedAccumulateOpSrc", LongName: "fixedAccumulateOpSrc",
ShortName: "fxAccOpSrc", ShortName: "fxAccOpSrc",
FrameSize: fxFrameSize, FrameSize: fxFrameSize,
SrcType: fxSrcType, ArgsSize: twoArgArgsSize,
Args: "dst []uint8, src []uint32",
DstElemSize1: 1 * sizeOfUint8,
DstElemSize4: 4 * sizeOfUint8,
XMM3: fxXMM3, XMM3: fxXMM3,
XMM4: fxXMM4, XMM4: fxXMM4,
XMM5: fxXMM5_256, XMM5: fxXMM5_256,
XMM6: opSrcXMM6,
XMM8: opSrcXMM8, XMM8: opSrcXMM8,
XMM9: opSrcXMM9, XMM9: opSrcXMM9,
XMM10: opSrcXMM10, XMM10: opSrcXMM10,
LoadArgs: twoArgLoadArgs,
Setup: fxSetup, Setup: fxSetup,
LoadXMMRegs: fxLoadXMMRegs256 + "\n" + opSrcLoadXMMRegs, LoadXMMRegs: fxLoadXMMRegs256 + "\n" + opSrcLoadXMMRegs,
Cleanup: fxCleanup,
Add: fxAdd, Add: fxAdd,
ClampAndScale: fxClampAndScale256, ClampAndScale: fxClampAndScale256,
ConvertToInt32: fxConvertToInt32, ConvertToInt32: fxConvertToInt32,
Store4: opSrcStore4, Store4: opSrcStore4,
Store1: opSrcStore1, Store1: opSrcStore1,
}, {
LongName: "fixedAccumulateMask",
ShortName: "fxAccMask",
FrameSize: fxFrameSize,
ArgsSize: oneArgArgsSize,
Args: "buf []uint32",
DstElemSize1: 1 * sizeOfUint32,
DstElemSize4: 4 * sizeOfUint32,
XMM3: fxXMM3,
XMM4: fxXMM4,
XMM5: fxXMM5_65536,
XMM6: maskXMM6,
XMM8: maskXMM8,
XMM9: maskXMM9,
XMM10: maskXMM10,
LoadArgs: oneArgLoadArgs,
Setup: fxSetup,
LoadXMMRegs: fxLoadXMMRegs65536 + "\n" + maskLoadXMMRegs,
Add: fxAdd,
ClampAndScale: fxClampAndScale65536,
ConvertToInt32: fxConvertToInt32,
Store4: maskStore4,
Store1: maskStore1,
}, { }, {
LongName: "floatingAccumulateOpOver", LongName: "floatingAccumulateOpOver",
ShortName: "flAccOpOver", ShortName: "flAccOpOver",
FrameSize: flFrameSize, FrameSize: flFrameSize,
SrcType: flSrcType, ArgsSize: twoArgArgsSize,
Args: "dst []uint8, src []float32",
DstElemSize1: 1 * sizeOfUint8,
DstElemSize4: 4 * sizeOfUint8,
XMM3: flXMM3_65536, XMM3: flXMM3_65536,
XMM4: flXMM4, XMM4: flXMM4,
XMM5: flXMM5, XMM5: flXMM5,
XMM6: opOverXMM6,
XMM8: opOverXMM8, XMM8: opOverXMM8,
XMM9: opOverXMM9, XMM9: opOverXMM9,
XMM10: opOverXMM10, XMM10: opOverXMM10,
LoadArgs: twoArgLoadArgs,
Setup: flSetup, Setup: flSetup,
LoadXMMRegs: flLoadXMMRegs65536 + "\n" + opOverLoadXMMRegs, LoadXMMRegs: flLoadXMMRegs65536 + "\n" + opOverLoadXMMRegs,
Cleanup: flCleanup,
Add: flAdd, Add: flAdd,
ClampAndScale: flClampAndScale65536, ClampAndScale: flClampAndScale65536,
ConvertToInt32: flConvertToInt32, ConvertToInt32: flConvertToInt32,
@ -144,29 +187,59 @@ var instances = []struct {
LongName: "floatingAccumulateOpSrc", LongName: "floatingAccumulateOpSrc",
ShortName: "flAccOpSrc", ShortName: "flAccOpSrc",
FrameSize: flFrameSize, FrameSize: flFrameSize,
SrcType: flSrcType, ArgsSize: twoArgArgsSize,
Args: "dst []uint8, src []float32",
DstElemSize1: 1 * sizeOfUint8,
DstElemSize4: 4 * sizeOfUint8,
XMM3: flXMM3_256, XMM3: flXMM3_256,
XMM4: flXMM4, XMM4: flXMM4,
XMM5: flXMM5, XMM5: flXMM5,
XMM6: opSrcXMM6,
XMM8: opSrcXMM8, XMM8: opSrcXMM8,
XMM9: opSrcXMM9, XMM9: opSrcXMM9,
XMM10: opSrcXMM10, XMM10: opSrcXMM10,
LoadArgs: twoArgLoadArgs,
Setup: flSetup, Setup: flSetup,
LoadXMMRegs: flLoadXMMRegs256 + "\n" + opSrcLoadXMMRegs, LoadXMMRegs: flLoadXMMRegs256 + "\n" + opSrcLoadXMMRegs,
Cleanup: flCleanup,
Add: flAdd, Add: flAdd,
ClampAndScale: flClampAndScale256, ClampAndScale: flClampAndScale256,
ConvertToInt32: flConvertToInt32, ConvertToInt32: flConvertToInt32,
Store4: opSrcStore4, Store4: opSrcStore4,
Store1: opSrcStore1, Store1: opSrcStore1,
}, {
LongName: "floatingAccumulateMask",
ShortName: "flAccMask",
FrameSize: flFrameSize,
ArgsSize: twoArgArgsSize,
Args: "dst []uint32, src []float32",
DstElemSize1: 1 * sizeOfUint32,
DstElemSize4: 4 * sizeOfUint32,
XMM3: flXMM3_65536,
XMM4: flXMM4,
XMM5: flXMM5,
XMM6: maskXMM6,
XMM8: maskXMM8,
XMM9: maskXMM9,
XMM10: maskXMM10,
LoadArgs: twoArgLoadArgs,
Setup: flSetup,
LoadXMMRegs: flLoadXMMRegs65536 + "\n" + maskLoadXMMRegs,
Add: flAdd,
ClampAndScale: flClampAndScale65536,
ConvertToInt32: flConvertToInt32,
Store4: maskStore4,
Store1: maskStore1,
}} }}
const ( const (
fxFrameSize = `0` fxFrameSize = `0`
flFrameSize = `8` flFrameSize = `8`
fxSrcType = `[]uint32` oneArgArgsSize = `24`
flSrcType = `[]float32` twoArgArgsSize = `48`
sizeOfUint8 = 1
sizeOfUint32 = 4
fxXMM3 = `-` fxXMM3 = `-`
flXMM3_256 = `flAlmost256` flXMM3_256 = `flAlmost256`
@@ -179,19 +252,32 @@ const (
fxXMM5_65536 = `fxAlmost65536` fxXMM5_65536 = `fxAlmost65536`
flXMM5 = `flSignMask` flXMM5 = `flSignMask`
oneArgLoadArgs = `
MOVQ buf_base+0(FP), DI
MOVQ buf_len+8(FP), BX
MOVQ buf_base+0(FP), SI
MOVQ buf_len+8(FP), R10
`
twoArgLoadArgs = `
MOVQ dst_base+0(FP), DI
MOVQ dst_len+8(FP), BX
MOVQ src_base+24(FP), SI
MOVQ src_len+32(FP), R10
// Sanity check that len(dst) >= len(src).
CMPQ BX, R10
JLT {{.ShortName}}End
`
fxSetup = `` fxSetup = ``
flSetup = ` flSetup = `
// Set MXCSR bits 13 and 14, so that the CVTPS2PL below is "Round To Zero". // Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
// "Round To Zero".
STMXCSR mxcsrOrig-8(SP) STMXCSR mxcsrOrig-8(SP)
MOVL mxcsrOrig-8(SP), AX MOVL mxcsrOrig-8(SP), AX
ORL $0x6000, AX ORL $0x6000, AX
MOVL AX, mxcsrNew-4(SP) MOVL AX, mxcsrNew-4(SP)
LDMXCSR mxcsrNew-4(SP)
` `
fxCleanup = `// No-op.`
flCleanup = `LDMXCSR mxcsrOrig-8(SP)`
fxLoadXMMRegs256 = ` fxLoadXMMRegs256 = `
// fxAlmost256 := XMM(0x000000ff repeated four times) // Maximum of a uint8. // fxAlmost256 := XMM(0x000000ff repeated four times) // Maximum of a uint8.
MOVOU fxAlmost256<>(SB), X5 MOVOU fxAlmost256<>(SB), X5
@@ -271,8 +357,16 @@ const (
MULPS X3, X2 MULPS X3, X2
` `
fxConvertToInt32 = `// No-op.` fxConvertToInt32 = `
flConvertToInt32 = `CVTPS2PL X2, X2` // z = convertToInt32(y)
// No-op.
`
flConvertToInt32 = `
// z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP)
CVTPS2PL X2, X2
LDMXCSR mxcsrOrig-8(SP)
`
opOverStore4 = ` opOverStore4 = `
// Blend over the dst's prior value. SIMD for i in 0..3: // Blend over the dst's prior value. SIMD for i in 0..3:
@@ -324,6 +418,10 @@ const (
PSHUFB X6, X2 PSHUFB X6, X2
MOVL X2, (DI) MOVL X2, (DI)
` `
maskStore4 = `
// copy(dst[:4], z)
MOVOU X2, (DI)
`
opOverStore1 = ` opOverStore1 = `
// Blend over the dst's prior value. // Blend over the dst's prior value.
@@ -350,23 +448,40 @@ const (
MOVL X2, BX MOVL X2, BX
MOVB BX, (DI) MOVB BX, (DI)
` `
maskStore1 = `
// dst[0] = uint32(z)
MOVL X2, (DI)
`
opOverXMM6 = `gather`
opSrcXMM6 = `gather`
maskXMM6 = `-`
opOverXMM8 = `scatterAndMulBy0x101` opOverXMM8 = `scatterAndMulBy0x101`
opSrcXMM8 = `-` opSrcXMM8 = `-`
maskXMM8 = `-`
opOverXMM9 = `fxAlmost65536` opOverXMM9 = `fxAlmost65536`
opSrcXMM9 = `-` opSrcXMM9 = `-`
maskXMM9 = `-`
opOverXMM10 = `inverseFFFF` opOverXMM10 = `inverseFFFF`
opSrcXMM10 = `-` opSrcXMM10 = `-`
maskXMM10 = `-`
opOverLoadXMMRegs = ` opOverLoadXMMRegs = `
// gather := XMM(see above) // PSHUFB shuffle mask.
// scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask. // scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask.
// fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff. // fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff.
// inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff. // inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
MOVOU gather<>(SB), X6
MOVOU scatterAndMulBy0x101<>(SB), X8 MOVOU scatterAndMulBy0x101<>(SB), X8
MOVOU fxAlmost65536<>(SB), X9 MOVOU fxAlmost65536<>(SB), X9
MOVOU inverseFFFF<>(SB), X10 MOVOU inverseFFFF<>(SB), X10
` `
opSrcLoadXMMRegs = `` opSrcLoadXMMRegs = `
// gather := XMM(see above) // PSHUFB shuffle mask.
MOVOU gather<>(SB), X6
`
maskLoadXMMRegs = ``
) )

View File

@@ -68,7 +68,7 @@ TEXT ·haveSSE4_1(SB), NOSPLIT, $0
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// func {{.LongName}}SIMD(dst []uint8, src {{.SrcType}}) // func {{.LongName}}SIMD({{.Args}})
// //
// XMM registers. Variable names are per // XMM registers. Variable names are per
// https://github.com/google/font-rs/blob/master/src/accumulate.c // https://github.com/google/font-rs/blob/master/src/accumulate.c
@@ -79,20 +79,13 @@ TEXT ·haveSSE4_1(SB), NOSPLIT, $0
// xmm3 {{.XMM3}} // xmm3 {{.XMM3}}
// xmm4 {{.XMM4}} // xmm4 {{.XMM4}}
// xmm5 {{.XMM5}} // xmm5 {{.XMM5}}
// xmm6 gather // xmm6 {{.XMM6}}
// xmm7 offset // xmm7 offset
// xmm8 {{.XMM8}} // xmm8 {{.XMM8}}
// xmm9 {{.XMM9}} // xmm9 {{.XMM9}}
// xmm10 {{.XMM10}} // xmm10 {{.XMM10}}
TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48 TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-{{.ArgsSize}}
MOVQ dst_base+0(FP), DI {{.LoadArgs}}
MOVQ dst_len+8(FP), BX
MOVQ src_base+24(FP), SI
MOVQ src_len+32(FP), R10
// Sanity check that len(dst) >= len(src).
CMPQ BX, R10
JLT {{.ShortName}}End
// R10 = len(src) &^ 3 // R10 = len(src) &^ 3
// R11 = len(src) // R11 = len(src)
@@ -103,9 +96,7 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48
{{.LoadXMMRegs}} {{.LoadXMMRegs}}
// gather := XMM(see above) // PSHUFB shuffle mask.
// offset := XMM(0x00000000 repeated four times) // Cumulative sum. // offset := XMM(0x00000000 repeated four times) // Cumulative sum.
MOVOU gather<>(SB), X6
XORPS X7, X7 XORPS X7, X7
// i := 0 // i := 0
@@ -139,7 +130,6 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48
{{.ClampAndScale}} {{.ClampAndScale}}
// z = convertToInt32(y)
{{.ConvertToInt32}} {{.ConvertToInt32}}
{{.Store4}} {{.Store4}}
@@ -152,14 +142,14 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48
// dst = dst[4:] // dst = dst[4:]
// src = src[4:] // src = src[4:]
ADDQ $4, R9 ADDQ $4, R9
ADDQ $4, DI ADDQ ${{.DstElemSize4}}, DI
ADDQ $16, SI ADDQ $16, SI
JMP {{.ShortName}}Loop4 JMP {{.ShortName}}Loop4
{{.ShortName}}Loop1: {{.ShortName}}Loop1:
// for i < len(src) // for i < len(src)
CMPQ R9, R11 CMPQ R9, R11
JAE {{.ShortName}}Cleanup JAE {{.ShortName}}End
// x = src[i] + offset // x = src[i] + offset
MOVL (SI), X1 MOVL (SI), X1
@@ -167,7 +157,6 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48
{{.ClampAndScale}} {{.ClampAndScale}}
// z = convertToInt32(y)
{{.ConvertToInt32}} {{.ConvertToInt32}}
{{.Store1}} {{.Store1}}
@@ -179,12 +168,9 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48
// dst = dst[1:] // dst = dst[1:]
// src = src[1:] // src = src[1:]
ADDQ $1, R9 ADDQ $1, R9
ADDQ $1, DI ADDQ ${{.DstElemSize1}}, DI
ADDQ $4, SI ADDQ $4, SI
JMP {{.ShortName}}Loop1 JMP {{.ShortName}}Loop1
{{.ShortName}}Cleanup:
{{.Cleanup}}
{{.ShortName}}End: {{.ShortName}}End:
RET RET

View File

@@ -190,6 +190,11 @@ func floatingAccumulateOpSrc(dst []uint8, src []float32) {
} }
func floatingAccumulateMask(dst []uint32, src []float32) { func floatingAccumulateMask(dst []uint32, src []float32) {
// Sanity check that len(dst) >= len(src).
if len(dst) < len(src) {
return
}
acc := float32(0) acc := float32(0)
for i, v := range src { for i, v := range src {
acc += v acc += v

View File

@@ -310,11 +310,19 @@ func (z *Rasterizer) accumulateMask() {
} else { } else {
z.bufU32 = z.bufU32[:n] z.bufU32 = z.bufU32[:n]
} }
if haveFloatingAccumulateSIMD {
floatingAccumulateMaskSIMD(z.bufU32, z.bufF32)
} else {
floatingAccumulateMask(z.bufU32, z.bufF32) floatingAccumulateMask(z.bufU32, z.bufF32)
}
} else {
if haveFixedAccumulateSIMD {
fixedAccumulateMaskSIMD(z.bufU32)
} else { } else {
fixedAccumulateMask(z.bufU32) fixedAccumulateMask(z.bufU32)
} }
} }
}
func (z *Rasterizer) rasterizeDstAlphaSrcOpaqueOpOver(dst *image.Alpha, r image.Rectangle) { func (z *Rasterizer) rasterizeDstAlphaSrcOpaqueOpOver(dst *image.Alpha, r image.Rectangle) {
// TODO: non-zero vs even-odd winding? // TODO: non-zero vs even-odd winding?