vector: add SIMD versions of xxxAccumulateMask.

name                      old time/op  new time/op  delta
GlyphAlphaLoose16Over-8   3.96µs ± 0%  3.64µs ± 1%   -8.08%  (p=0.000 n=8+10)
GlyphAlphaLoose16Src-8    3.64µs ± 0%  3.35µs ± 0%   -7.88%  (p=0.000 n=8+9)
GlyphAlphaLoose32Over-8   8.45µs ± 0%  6.74µs ± 0%  -20.22%  (p=0.000 n=8+9)
GlyphAlphaLoose32Src-8    7.24µs ± 0%  5.54µs ± 1%  -23.48%  (p=0.000 n=8+10)
GlyphAlphaLoose64Over-8   22.2µs ± 0%  17.4µs ± 0%  -21.67%  (p=0.000 n=9+9)
GlyphAlphaLoose64Src-8    17.6µs ± 1%  12.2µs ± 1%  -30.32%  (p=0.000 n=10+10)
GlyphAlphaLoose128Over-8  67.9µs ± 0%  53.3µs ± 1%  -21.53%  (p=0.000 n=10+10)
GlyphAlphaLoose128Src-8   48.2µs ± 0%  32.6µs ± 2%  -32.41%  (p=0.000 n=9+10)
GlyphAlphaLoose256Over-8   242µs ± 1%   187µs ± 1%  -22.96%  (p=0.000 n=9+9)
GlyphAlphaLoose256Src-8    163µs ± 0%   105µs ± 1%  -35.83%  (p=0.000 n=9+9)
GlyphRGBA16Over-8         5.25µs ± 1%  4.95µs ± 0%   -5.78%  (p=0.000 n=9+9)
GlyphRGBA16Src-8          4.72µs ± 0%  4.43µs ± 1%   -6.22%  (p=0.000 n=9+10)
GlyphRGBA32Over-8         13.5µs ± 0%  11.9µs ± 1%  -12.19%  (p=0.000 n=9+10)
GlyphRGBA32Src-8          11.5µs ± 1%   9.8µs ± 0%  -14.72%  (p=0.000 n=9+9)
GlyphRGBA64Over-8         42.0µs ± 2%  36.9µs ± 1%  -12.19%  (p=0.000 n=10+10)
GlyphRGBA64Src-8          34.1µs ± 1%  28.5µs ± 0%  -16.25%  (p=0.000 n=9+7)
GlyphRGBA128Over-8         149µs ± 2%   133µs ± 1%  -10.24%  (p=0.000 n=10+9)
GlyphRGBA128Src-8          115µs ± 1%    99µs ± 1%  -13.57%  (p=0.000 n=9+10)
GlyphRGBA256Over-8         566µs ± 0%   511µs ± 1%   -9.85%  (p=0.000 n=9+10)
GlyphRGBA256Src-8          435µs ± 0%   372µs ± 0%  -14.64%  (p=0.000 n=9+8)
GlyphNRGBA16Over-8        26.9µs ± 3%  26.0µs ± 3%   -3.55%  (p=0.000 n=10+9)
GlyphNRGBA16Src-8         18.8µs ± 2%  18.4µs ± 2%   -2.21%  (p=0.000 n=9+10)
GlyphNRGBA32Over-8        99.1µs ± 2%  95.9µs ± 3%   -3.23%  (p=0.000 n=10+10)
GlyphNRGBA32Src-8         65.6µs ± 3%  62.8µs ± 2%   -4.36%  (p=0.000 n=10+10)
GlyphNRGBA64Over-8         376µs ± 4%   370µs ± 2%     ~     (p=0.063 n=10+10)
GlyphNRGBA64Src-8          238µs ± 3%   233µs ± 1%   -2.21%  (p=0.000 n=9+10)
GlyphNRGBA128Over-8       1.52ms ± 2%  1.48ms ± 0%   -2.11%  (p=0.000 n=10+8)
GlyphNRGBA128Src-8         951µs ± 3%   935µs ± 1%   -1.69%  (p=0.013 n=10+9)
GlyphNRGBA256Over-8       6.00ms ± 1%  5.87ms ± 3%   -2.12%  (p=0.002 n=10+10)
GlyphNRGBA256Src-8        3.94ms ± 2%  3.80ms ± 2%   -3.64%  (p=0.000 n=10+10)

A comparison of the non-SIMD and SIMD versions:

name                            time/op
FixedAccumulateMask16-8          237ns ± 0%
FixedAccumulateMaskSIMD16-8     80.0ns ± 1%
FloatingAccumulateMask16-8       413ns ± 2%
FloatingAccumulateMaskSIMD16-8   166ns ± 0%
FixedAccumulateMask64-8         3.42µs ± 0%
FixedAccumulateMaskSIMD64-8     1.09µs ± 0%
FloatingAccumulateMask64-8      6.92µs ± 0%
FloatingAccumulateMaskSIMD64-8  2.47µs ± 1%

Change-Id: Ib6980e5975ed2842ff2a372f76dd5f2e95c5526c
Reviewed-on: https://go-review.googlesource.com/30898
Reviewed-by: David Crawshaw <crawshaw@golang.org>
2016-10-13 14:50:52 +11:00 · 2016-10-13 14:50:52 +11:00 · e78c45720c
commit e78c45720c
parent beb9675609
8 changed files with 507 additions and 74 deletions
--- a/vector/acc_amd64.go
+++ b/vector/acc_amd64.go
@ -20,8 +20,14 @@ func fixedAccumulateOpOverSIMD(dst []uint8, src []uint32)
 // fixedAccumulateOpSrcSIMD has no Go body; its implementation is the
 // ·fixedAccumulateOpSrcSIMD TEXT symbol in acc_amd64.s.
 //
 //go:noescape
 func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32)
 // fixedAccumulateMaskSIMD has no Go body; its implementation is the
 // ·fixedAccumulateMaskSIMD TEXT symbol in acc_amd64.s. It takes a single
 // slice, which the assembly uses as both its input and its output.
 //
 //go:noescape
 func fixedAccumulateMaskSIMD(buf []uint32)
 // floatingAccumulateOpOverSIMD has no Go body; its implementation is the
 // ·floatingAccumulateOpOverSIMD TEXT symbol in acc_amd64.s.
 //
 //go:noescape
 func floatingAccumulateOpOverSIMD(dst []uint8, src []float32)
 // floatingAccumulateOpSrcSIMD has no Go body; its implementation is the
 // ·floatingAccumulateOpSrcSIMD TEXT symbol in acc_amd64.s.
 //
 //go:noescape
 func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32)
 // floatingAccumulateMaskSIMD has no Go body; its implementation is the
 // ·floatingAccumulateMaskSIMD TEXT symbol in acc_amd64.s. The assembly
 // returns without writing anything when len(dst) < len(src).
 //
 //go:noescape
 func floatingAccumulateMaskSIMD(dst []uint32, src []float32)
--- a/vector/acc_amd64.s
+++ b/vector/acc_amd64.s
@ -83,6 +83,7 @@ TEXT ·haveSSE4_1(SB), NOSPLIT, $0
 //	xmm9	fxAlmost65536
 //	xmm10	inverseFFFF
 TEXT ·fixedAccumulateOpOverSIMD(SB), NOSPLIT, $0-48
 	MOVQ dst_base+0(FP), DI
 	MOVQ dst_len+8(FP), BX
 	MOVQ src_base+24(FP), SI
@ -100,16 +101,16 @@ TEXT ·fixedAccumulateOpOverSIMD(SB), NOSPLIT, $0-48
 	// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16.
 	MOVOU fxAlmost65536<>(SB), X5
 	// gather               := XMM(see above)                      // PSHUFB shuffle mask.
 	// scatterAndMulBy0x101 := XMM(see above)                      // PSHUFB shuffle mask.
 	// fxAlmost65536        := XMM(0x0000ffff repeated four times) // 0xffff.
 	// inverseFFFF          := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
 	MOVOU gather<>(SB), X6
 	MOVOU scatterAndMulBy0x101<>(SB), X8
 	MOVOU fxAlmost65536<>(SB), X9
 	MOVOU inverseFFFF<>(SB), X10
 	// gather := XMM(see above)                      // PSHUFB shuffle mask.
 	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
 	MOVOU gather<>(SB), X6
 	XORPS X7, X7
 	// i := 0
@ -222,7 +223,7 @@ fxAccOpOverLoop4:
 fxAccOpOverLoop1:
 	// for i < len(src)
 	CMPQ R9, R11
-	JAE  fxAccOpOverCleanup
+	JAE  fxAccOpOverEnd
 	// x = src[i] + offset
 	MOVL  (SI), X1
@ -276,9 +277,6 @@ fxAccOpOverLoop1:
 	ADDQ $4, SI
 	JMP  fxAccOpOverLoop1
 fxAccOpOverCleanup:
 	// No-op.
 fxAccOpOverEnd:
 	RET
@ -301,6 +299,7 @@ fxAccOpOverEnd:
 //	xmm9	-
 //	xmm10	-
 TEXT ·fixedAccumulateOpSrcSIMD(SB), NOSPLIT, $0-48
 	MOVQ dst_base+0(FP), DI
 	MOVQ dst_len+8(FP), BX
 	MOVQ src_base+24(FP), SI
@ -319,8 +318,9 @@ TEXT ·fixedAccumulateOpSrcSIMD(SB), NOSPLIT, $0-48
 	MOVOU fxAlmost256<>(SB), X5
 	// gather := XMM(see above) // PSHUFB shuffle mask.
 	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
 	MOVOU gather<>(SB), X6
 	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
 	XORPS X7, X7
 	// i := 0
@ -390,7 +390,7 @@ fxAccOpSrcLoop4:
 fxAccOpSrcLoop1:
 	// for i < len(src)
 	CMPQ R9, R11
-	JAE  fxAccOpSrcCleanup
+	JAE  fxAccOpSrcEnd
 	// x = src[i] + offset
 	MOVL  (SI), X1
@ -429,10 +429,149 @@ fxAccOpSrcLoop1:
 	ADDQ $4, SI
 	JMP  fxAccOpSrcLoop1
-fxAccOpSrcCleanup:
+fxAccOpSrcEnd:
 	RET
 // ----------------------------------------------------------------------------
 // func fixedAccumulateMaskSIMD(buf []uint32)
 //
 // Rewrites buf in place: the read pointer (SI) and the write pointer (DI)
 // both start at buf's base address. Element i is replaced by
 // min(abs(sum of buf[0..i]) >> 4, 0xffff), where the sum is taken over the
 // values buf held on entry (each element is read before it is overwritten).
 //
 // General purpose registers, as used below:
 //
 //	DI	write pointer
 //	SI	read pointer
 //	BX	len(buf); loaded but not otherwise referenced in this routine
 //	R9	i
 //	R10	len(buf) &^ 3
 //	R11	len(buf)
 //
 // XMM registers. Variable names are per
 // https://github.com/google/font-rs/blob/master/src/accumulate.c
 //
 //	xmm0	scratch
 //	xmm1	x
 //	xmm2	y, z
 //	xmm3	-
 //	xmm4	-
 //	xmm5	fxAlmost65536
 //	xmm6	-
 //	xmm7	offset
 //	xmm8	-
 //	xmm9	-
 //	xmm10	-
 TEXT ·fixedAccumulateMaskSIMD(SB), NOSPLIT, $0-24
 	// The single buf argument plays the role of both dst and src, so its
 	// base and length are each loaded twice.
 	MOVQ buf_base+0(FP), DI
 	MOVQ buf_len+8(FP), BX
 	MOVQ buf_base+0(FP), SI
 	MOVQ buf_len+8(FP), R10
 	// R10 = len(src) &^ 3
 	// R11 = len(src)
 	MOVQ R10, R11
 	ANDQ $-4, R10
 	// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16.
 	MOVOU fxAlmost65536<>(SB), X5
 	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
 	XORPS X7, X7
 	// i := 0
 	MOVQ $0, R9
 // Main loop: four uint32 elements per iteration.
 fxAccMaskLoop4:
 	// for i < (len(src) &^ 3)
 	CMPQ R9, R10
 	JAE  fxAccMaskLoop1
 	// x = XMM(s0, s1, s2, s3)
 	//
 	// Where s0 is src[i+0], s1 is src[i+1], etc.
 	MOVOU (SI), X1
 	// scratch = XMM(0, s0, s1, s2)
 	// x += scratch                                  // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
 	MOVOU X1, X0
 	PSLLO $4, X0
 	PADDD X0, X1
 	// scratch = XMM(0, 0, 0, 0)
 	// scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
 	// x += scratch                                  // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
 	XORPS  X0, X0
 	SHUFPS $0x40, X1, X0
 	PADDD  X0, X1
 	// x += offset
 	PADDD X7, X1
 	// y = abs(x)
 	// y >>= 4 // Shift by 2*ϕ - 16.
 	// y = min(y, fxAlmost65536)
 	//
 	// pabsd  %xmm1,%xmm2
 	// psrld  $0x4,%xmm2
 	// pminud %xmm5,%xmm2
 	//
 	// Hopefully we'll get these opcode mnemonics into the assembler for Go
 	// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
 	// it's similar.
 	//
 	// The three BYTE rows below are the hand-encoded forms of the three
 	// instructions listed above, in the same order.
 	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
 	BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x04
 	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
 	// z = convertToInt32(y)
 	// No-op.
 	// NOTE(review): the next two lines start with raw '-' and '+' diff
 	// markers left over from the hunk that renamed the previous routine's
 	// final label; they look like rendering residue rather than assembly —
 	// confirm against the real acc_amd64.s.
-fxAccOpSrcEnd:
+	// copy(dst[:4], z)
 	MOVOU X2, (DI)
 	// offset = XMM(x@3, x@3, x@3, x@3)
 	MOVOU  X1, X7
 	SHUFPS $0xff, X1, X7
 	// i += 4
 	// dst = dst[4:]
 	// src = src[4:]
 	ADDQ $4, R9
 	ADDQ $16, DI
 	ADDQ $16, SI
 	JMP  fxAccMaskLoop4
 // Tail loop: the remaining len(src) & 3 elements, one per iteration.
 fxAccMaskLoop1:
 	// for i < len(src)
 	CMPQ R9, R11
 	JAE  fxAccMaskEnd
 	// x = src[i] + offset
 	MOVL  (SI), X1
 	PADDD X7, X1
 	// y = abs(x)
 	// y >>= 4 // Shift by 2*ϕ - 16.
 	// y = min(y, fxAlmost65536)
 	//
 	// pabsd  %xmm1,%xmm2
 	// psrld  $0x4,%xmm2
 	// pminud %xmm5,%xmm2
 	//
 	// Hopefully we'll get these opcode mnemonics into the assembler for Go
 	// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
 	// it's similar.
 	//
 	// Same hand-encoded instruction bytes as in the four-wide loop.
 	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
 	BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x04
 	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
 	// z = convertToInt32(y)
 	// No-op.
 	// dst[0] = uint32(z)
 	//
 	// Only the low 32 bits of X2 are written, so only lane 0 of x and
 	// offset matters in this loop.
 	MOVL X2, (DI)
 	// offset = x
 	MOVOU X1, X7
 	// i += 1
 	// dst = dst[1:]
 	// src = src[1:]
 	ADDQ $1, R9
 	ADDQ $4, DI
 	ADDQ $4, SI
 	JMP  fxAccMaskLoop1
 fxAccMaskEnd:
 	RET
 // ----------------------------------------------------------------------------
@ -454,6 +593,7 @@ fxAccOpSrcEnd:
 //	xmm9	fxAlmost65536
 //	xmm10	inverseFFFF
 TEXT ·floatingAccumulateOpOverSIMD(SB), NOSPLIT, $8-48
 	MOVQ dst_base+0(FP), DI
 	MOVQ dst_len+8(FP), BX
 	MOVQ src_base+24(FP), SI
@ -468,12 +608,12 @@ TEXT ·floatingAccumulateOpOverSIMD(SB), NOSPLIT, $8-48
 	MOVQ R10, R11
 	ANDQ $-4, R10
-	// Set MXCSR bits 13 and 14, so that the CVTPS2PL below is "Round To Zero".
+	// Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
 	// "Round To Zero".
 	STMXCSR mxcsrOrig-8(SP)
 	MOVL    mxcsrOrig-8(SP), AX
 	ORL     $0x6000, AX
 	MOVL    AX, mxcsrNew-4(SP)
 	LDMXCSR mxcsrNew-4(SP)
 	// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
 	// flOne         := XMM(0x3f800000 repeated four times) // 1 as a float32.
@ -482,16 +622,16 @@ TEXT ·floatingAccumulateOpOverSIMD(SB), NOSPLIT, $8-48
 	MOVOU flOne<>(SB), X4
 	MOVOU flSignMask<>(SB), X5
 	// gather               := XMM(see above)                      // PSHUFB shuffle mask.
 	// scatterAndMulBy0x101 := XMM(see above)                      // PSHUFB shuffle mask.
 	// fxAlmost65536        := XMM(0x0000ffff repeated four times) // 0xffff.
 	// inverseFFFF          := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
 	MOVOU gather<>(SB), X6
 	MOVOU scatterAndMulBy0x101<>(SB), X8
 	MOVOU fxAlmost65536<>(SB), X9
 	MOVOU inverseFFFF<>(SB), X10
 	// gather := XMM(see above)                      // PSHUFB shuffle mask.
 	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
 	MOVOU gather<>(SB), X6
 	XORPS X7, X7
 	// i := 0
@ -532,7 +672,9 @@ flAccOpOverLoop4:
 	MULPS X3, X2
 	// z = convertToInt32(y)
 	LDMXCSR  mxcsrNew-4(SP)
 	CVTPS2PL X2, X2
 	LDMXCSR  mxcsrOrig-8(SP)
 	// Blend over the dst's prior value. SIMD for i in 0..3:
 	//
@ -597,7 +739,7 @@ flAccOpOverLoop4:
 flAccOpOverLoop1:
 	// for i < len(src)
 	CMPQ R9, R11
-	JAE  flAccOpOverCleanup
+	JAE  flAccOpOverEnd
 	// x = src[i] + offset
 	MOVL  (SI), X1
@ -612,7 +754,9 @@ flAccOpOverLoop1:
 	MULPS X3, X2
 	// z = convertToInt32(y)
 	LDMXCSR  mxcsrNew-4(SP)
 	CVTPS2PL X2, X2
 	LDMXCSR  mxcsrOrig-8(SP)
 	// Blend over the dst's prior value.
 	//
@ -644,9 +788,6 @@ flAccOpOverLoop1:
 	ADDQ $4, SI
 	JMP  flAccOpOverLoop1
 flAccOpOverCleanup:
 	LDMXCSR mxcsrOrig-8(SP)
 flAccOpOverEnd:
 	RET
@ -669,6 +810,7 @@ flAccOpOverEnd:
 //	xmm9	-
 //	xmm10	-
 TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48
 	MOVQ dst_base+0(FP), DI
 	MOVQ dst_len+8(FP), BX
 	MOVQ src_base+24(FP), SI
@ -683,12 +825,12 @@ TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48
 	MOVQ R10, R11
 	ANDQ $-4, R10
-	// Set MXCSR bits 13 and 14, so that the CVTPS2PL below is "Round To Zero".
+	// Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
 	// "Round To Zero".
 	STMXCSR mxcsrOrig-8(SP)
 	MOVL    mxcsrOrig-8(SP), AX
 	ORL     $0x6000, AX
 	MOVL    AX, mxcsrNew-4(SP)
 	LDMXCSR mxcsrNew-4(SP)
 	// flAlmost256 := XMM(0x437fffff repeated four times) // 255.99998 as a float32.
 	// flOne       := XMM(0x3f800000 repeated four times) // 1 as a float32.
@ -698,8 +840,9 @@ TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48
 	MOVOU flSignMask<>(SB), X5
 	// gather := XMM(see above) // PSHUFB shuffle mask.
 	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
 	MOVOU gather<>(SB), X6
 	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
 	XORPS X7, X7
 	// i := 0
@ -740,7 +883,9 @@ flAccOpSrcLoop4:
 	MULPS X3, X2
 	// z = convertToInt32(y)
 	LDMXCSR  mxcsrNew-4(SP)
 	CVTPS2PL X2, X2
 	LDMXCSR  mxcsrOrig-8(SP)
 	// z = shuffleTheLowBytesOfEach4ByteElement(z)
 	// copy(dst[:4], low4BytesOf(z))
@ -762,7 +907,7 @@ flAccOpSrcLoop4:
 flAccOpSrcLoop1:
 	// for i < len(src)
 	CMPQ R9, R11
-	JAE  flAccOpSrcCleanup
+	JAE  flAccOpSrcEnd
 	// x = src[i] + offset
 	MOVL  (SI), X1
@ -777,7 +922,9 @@ flAccOpSrcLoop1:
 	MULPS X3, X2
 	// z = convertToInt32(y)
 	LDMXCSR  mxcsrNew-4(SP)
 	CVTPS2PL X2, X2
 	LDMXCSR  mxcsrOrig-8(SP)
 	// dst[0] = uint8(z)
 	MOVL X2, BX
@ -794,8 +941,152 @@ flAccOpSrcLoop1:
 	ADDQ $4, SI
 	JMP  flAccOpSrcLoop1
 flAccOpSrcCleanup:
 	LDMXCSR mxcsrOrig-8(SP)
 flAccOpSrcEnd:
 	RET
 // ----------------------------------------------------------------------------
 // func floatingAccumulateMaskSIMD(dst []uint32, src []float32)
 //
 // For each i < len(src), stores into dst[i] the value
 // min(abs(sum of src[0..i]), 1) * flAlmost65536, converted to a 32-bit
 // integer with the MXCSR rounding mode temporarily set to "Round To Zero".
 // Returns without writing anything when len(dst) < len(src).
 //
 // General purpose registers, as used below:
 //
 //	DI	dst pointer
 //	SI	src pointer
 //	BX	len(dst)
 //	AX	scratch for building the modified MXCSR value
 //	R9	i
 //	R10	len(src) &^ 3
 //	R11	len(src)
 //
 // The 8-byte frame holds two 4-byte slots: mxcsrOrig-8(SP) for the caller's
 // MXCSR and mxcsrNew-4(SP) for the copy with bits 13 and 14 set.
 //
 // XMM registers. Variable names are per
 // https://github.com/google/font-rs/blob/master/src/accumulate.c
 //
 //	xmm0	scratch
 //	xmm1	x
 //	xmm2	y, z
 //	xmm3	flAlmost65536
 //	xmm4	flOne
 //	xmm5	flSignMask
 //	xmm6	-
 //	xmm7	offset
 //	xmm8	-
 //	xmm9	-
 //	xmm10	-
 TEXT ·floatingAccumulateMaskSIMD(SB), NOSPLIT, $8-48
 	MOVQ dst_base+0(FP), DI
 	MOVQ dst_len+8(FP), BX
 	MOVQ src_base+24(FP), SI
 	MOVQ src_len+32(FP), R10
 	// Sanity check that len(dst) >= len(src).
 	CMPQ BX, R10
 	JLT  flAccMaskEnd
 	// R10 = len(src) &^ 3
 	// R11 = len(src)
 	MOVQ R10, R11
 	ANDQ $-4, R10
 	// Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
 	// "Round To Zero".
 	//
 	// This only saves the original MXCSR and computes the modified copy; the
 	// modified value is loaded right before each CVTPS2PL and the original
 	// is restored right after it.
 	STMXCSR mxcsrOrig-8(SP)
 	MOVL    mxcsrOrig-8(SP), AX
 	ORL     $0x6000, AX
 	MOVL    AX, mxcsrNew-4(SP)
 	// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
 	// flOne         := XMM(0x3f800000 repeated four times) // 1 as a float32.
 	// flSignMask    := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
 	MOVOU flAlmost65536<>(SB), X3
 	MOVOU flOne<>(SB), X4
 	MOVOU flSignMask<>(SB), X5
 	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
 	XORPS X7, X7
 	// i := 0
 	MOVQ $0, R9
 // Main loop: four float32 inputs per iteration.
 flAccMaskLoop4:
 	// for i < (len(src) &^ 3)
 	CMPQ R9, R10
 	JAE  flAccMaskLoop1
 	// x = XMM(s0, s1, s2, s3)
 	//
 	// Where s0 is src[i+0], s1 is src[i+1], etc.
 	MOVOU (SI), X1
 	// scratch = XMM(0, s0, s1, s2)
 	// x += scratch                                  // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
 	MOVOU X1, X0
 	PSLLO $4, X0
 	ADDPS X0, X1
 	// scratch = XMM(0, 0, 0, 0)
 	// scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
 	// x += scratch                                  // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
 	XORPS  X0, X0
 	SHUFPS $0x40, X1, X0
 	ADDPS  X0, X1
 	// x += offset
 	ADDPS X7, X1
 	// y = x & flSignMask
 	// y = min(y, flOne)
 	// y = mul(y, flAlmost65536)
 	MOVOU X5, X2
 	ANDPS X1, X2
 	MINPS X4, X2
 	MULPS X3, X2
 	// z = convertToInt32(y)
 	//
 	// The rounding mode is switched only for the duration of the conversion.
 	LDMXCSR  mxcsrNew-4(SP)
 	CVTPS2PL X2, X2
 	LDMXCSR  mxcsrOrig-8(SP)
 	// copy(dst[:4], z)
 	MOVOU X2, (DI)
 	// offset = XMM(x@3, x@3, x@3, x@3)
 	MOVOU  X1, X7
 	SHUFPS $0xff, X1, X7
 	// i += 4
 	// dst = dst[4:]
 	// src = src[4:]
 	ADDQ $4, R9
 	ADDQ $16, DI
 	ADDQ $16, SI
 	JMP  flAccMaskLoop4
 // Tail loop: the remaining len(src) & 3 elements, one per iteration.
 flAccMaskLoop1:
 	// for i < len(src)
 	CMPQ R9, R11
 	JAE  flAccMaskEnd
 	// x = src[i] + offset
 	MOVL  (SI), X1
 	ADDPS X7, X1
 	// y = x & flSignMask
 	// y = min(y, flOne)
 	// y = mul(y, flAlmost65536)
 	MOVOU X5, X2
 	ANDPS X1, X2
 	MINPS X4, X2
 	MULPS X3, X2
 	// z = convertToInt32(y)
 	//
 	// Same load/convert/restore bracket as in the four-wide loop.
 	LDMXCSR  mxcsrNew-4(SP)
 	CVTPS2PL X2, X2
 	LDMXCSR  mxcsrOrig-8(SP)
 	// dst[0] = uint32(z)
 	//
 	// Only the low 32 bits of X2 are written, so only lane 0 of x and
 	// offset matters in this loop.
 	MOVL X2, (DI)
 	// offset = x
 	MOVOU X1, X7
 	// i += 1
 	// dst = dst[1:]
 	// src = src[1:]
 	ADDQ $1, R9
 	ADDQ $4, DI
 	ADDQ $4, SI
 	JMP  flAccMaskLoop1
 // MXCSR was already restored after each conversion above, so there is
 // nothing to undo before returning.
 flAccMaskEnd:
 	RET
--- a/vector/acc_other.go
+++ b/vector/acc_other.go
@ -11,5 +11,7 @@ const haveFloatingAccumulateSIMD = false
 // The functions below are empty stand-ins that share the signatures of the
 // SIMD accumulate routines. This file declares
 // haveFloatingAccumulateSIMD = false; presumably the fixed-point flag is
 // false here as well (not visible in this hunk), so callers that check the
 // have*SIMD flags first never reach them. They do nothing when called.
 func fixedAccumulateOpOverSIMD(dst []uint8, src []uint32)     {}
 func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32)      {}
 func fixedAccumulateMaskSIMD(buf []uint32)                    {}
 func floatingAccumulateOpOverSIMD(dst []uint8, src []float32) {}
 func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32)  {}
 func floatingAccumulateMaskSIMD(dst []uint32, src []float32)  {}
--- a/vector/acc_test.go
+++ b/vector/acc_test.go
@ -201,8 +201,12 @@ func testAcc(t *testing.T, in interface{}, mask []uint32, op string) {
 					}
 				case "mask":
 					copy(got32, in[:n])
 					if simd {
 						fixedAccumulateMaskSIMD(got32)
 					} else {
 						fixedAccumulateMask(got32)
 					}
 				}
 			case []float32:
 				switch op {
 				case "over":
@ -218,9 +222,13 @@ func testAcc(t *testing.T, in interface{}, mask []uint32, op string) {
 						floatingAccumulateOpSrc(got8, in[:n])
 					}
 				case "mask":
 					if simd {
 						floatingAccumulateMaskSIMD(got32, in[:n])
 					} else {
 						floatingAccumulateMask(got32, in[:n])
 					}
 				}
 			}
 			if op != "mask" {
 				if !bytes.Equal(got8, want8) {
@ -264,22 +272,26 @@ func BenchmarkFixedAccumulateOpOverSIMD16(b *testing.B)    { benchAcc(b, fxIn16,
 // Each benchmark below forwards to benchAcc(b, in, op, simd) with:
 //   - in:   an input fixture (fxIn16/fxIn64 for the Fixed* benchmarks,
 //           flIn16/flIn64 for the Floating* ones; the fixtures are defined
 //           outside this hunk — the 16/64 suffix presumably reflects the
 //           input size, TODO confirm),
 //   - op:   "over", "src" or "mask", selecting the accumulate variant,
 //   - simd: true for the *SIMD* benchmarks, false for the pure Go ones.
 func BenchmarkFixedAccumulateOpSrc16(b *testing.B)         { benchAcc(b, fxIn16, "src", false) }
 func BenchmarkFixedAccumulateOpSrcSIMD16(b *testing.B)     { benchAcc(b, fxIn16, "src", true) }
 func BenchmarkFixedAccumulateMask16(b *testing.B)          { benchAcc(b, fxIn16, "mask", false) }
 func BenchmarkFixedAccumulateMaskSIMD16(b *testing.B)      { benchAcc(b, fxIn16, "mask", true) }
 func BenchmarkFloatingAccumulateOpOver16(b *testing.B)     { benchAcc(b, flIn16, "over", false) }
 func BenchmarkFloatingAccumulateOpOverSIMD16(b *testing.B) { benchAcc(b, flIn16, "over", true) }
 func BenchmarkFloatingAccumulateOpSrc16(b *testing.B)      { benchAcc(b, flIn16, "src", false) }
 func BenchmarkFloatingAccumulateOpSrcSIMD16(b *testing.B)  { benchAcc(b, flIn16, "src", true) }
 func BenchmarkFloatingAccumulateMask16(b *testing.B)       { benchAcc(b, flIn16, "mask", false) }
 func BenchmarkFloatingAccumulateMaskSIMD16(b *testing.B)   { benchAcc(b, flIn16, "mask", true) }
 func BenchmarkFixedAccumulateOpOver64(b *testing.B)        { benchAcc(b, fxIn64, "over", false) }
 func BenchmarkFixedAccumulateOpOverSIMD64(b *testing.B)    { benchAcc(b, fxIn64, "over", true) }
 func BenchmarkFixedAccumulateOpSrc64(b *testing.B)         { benchAcc(b, fxIn64, "src", false) }
 func BenchmarkFixedAccumulateOpSrcSIMD64(b *testing.B)     { benchAcc(b, fxIn64, "src", true) }
 func BenchmarkFixedAccumulateMask64(b *testing.B)          { benchAcc(b, fxIn64, "mask", false) }
 func BenchmarkFixedAccumulateMaskSIMD64(b *testing.B)      { benchAcc(b, fxIn64, "mask", true) }
 func BenchmarkFloatingAccumulateOpOver64(b *testing.B)     { benchAcc(b, flIn64, "over", false) }
 func BenchmarkFloatingAccumulateOpOverSIMD64(b *testing.B) { benchAcc(b, flIn64, "over", true) }
 func BenchmarkFloatingAccumulateOpSrc64(b *testing.B)      { benchAcc(b, flIn64, "src", false) }
 func BenchmarkFloatingAccumulateOpSrcSIMD64(b *testing.B)  { benchAcc(b, flIn64, "src", true) }
 func BenchmarkFloatingAccumulateMask64(b *testing.B)       { benchAcc(b, flIn64, "mask", false) }
 func BenchmarkFloatingAccumulateMaskSIMD64(b *testing.B)   { benchAcc(b, flIn64, "mask", true) }
 func benchAcc(b *testing.B, in interface{}, op string, simd bool) {
 	var f func()
@ -308,8 +320,12 @@ func benchAcc(b *testing.B, in interface{}, op string, simd bool) {
 		case "mask":
 			buf := make([]uint32, len(in))
 			copy(buf, in)
 			if simd {
 				f = func() { fixedAccumulateMaskSIMD(buf) }
 			} else {
 				f = func() { fixedAccumulateMask(buf) }
 			}
 		}
 	case []float32:
 		if simd && !haveFloatingAccumulateSIMD {
@ -333,9 +349,13 @@ func benchAcc(b *testing.B, in interface{}, op string, simd bool) {
 			}
 		case "mask":
 			dst := make([]uint32, len(in))
 			if simd {
 				f = func() { floatingAccumulateMaskSIMD(dst, in) }
 			} else {
 				f = func() { floatingAccumulateMask(dst, in) }
 			}
 		}
 	}
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
--- a/vector/gen.go
+++ b/vector/gen.go
@ -10,6 +10,7 @@ import (
 	"bytes"
 	"io/ioutil"
 	"log"
 	"strings"
 	"text/template"
 )
@ -54,6 +55,9 @@ func main() {
 		if i != 0 {
 			out.WriteString("\n")
 		}
 		if strings.Contains(v.LoadArgs, "{{.ShortName}}") {
 			v.LoadArgs = strings.Replace(v.LoadArgs, "{{.ShortName}}", v.ShortName, -1)
 		}
 		if err := t.Execute(out, v); err != nil {
 			log.Fatalf("Execute(%q): %v", v.ShortName, err)
 		}
@ -68,15 +72,19 @@ var instances = []struct {
 	LongName       string
 	ShortName      string
 	FrameSize      string
-	SrcType        string
+	ArgsSize       string
 	Args           string
 	DstElemSize1   int
 	DstElemSize4   int
 	XMM3           string
 	XMM4           string
 	XMM5           string
 	XMM6           string
 	XMM8           string
 	XMM9           string
 	XMM10          string
 	LoadArgs       string
 	Setup          string
 	Cleanup        string
 	LoadXMMRegs    string
 	Add            string
 	ClampAndScale  string
@ -87,16 +95,20 @@ var instances = []struct {
 	LongName:       "fixedAccumulateOpOver",
 	ShortName:      "fxAccOpOver",
 	FrameSize:      fxFrameSize,
-	SrcType:        fxSrcType,
+	ArgsSize:       twoArgArgsSize,
 	Args:           "dst []uint8, src []uint32",
 	DstElemSize1:   1 * sizeOfUint8,
 	DstElemSize4:   4 * sizeOfUint8,
 	XMM3:           fxXMM3,
 	XMM4:           fxXMM4,
 	XMM5:           fxXMM5_65536,
 	XMM6:           opOverXMM6,
 	XMM8:           opOverXMM8,
 	XMM9:           opOverXMM9,
 	XMM10:          opOverXMM10,
 	LoadArgs:       twoArgLoadArgs,
 	Setup:          fxSetup,
 	LoadXMMRegs:    fxLoadXMMRegs65536 + "\n" + opOverLoadXMMRegs,
 	Cleanup:        fxCleanup,
 	Add:            fxAdd,
 	ClampAndScale:  fxClampAndScale65536,
 	ConvertToInt32: fxConvertToInt32,
@ -106,35 +118,66 @@ var instances = []struct {
 	LongName:       "fixedAccumulateOpSrc",
 	ShortName:      "fxAccOpSrc",
 	FrameSize:      fxFrameSize,
-	SrcType:        fxSrcType,
+	ArgsSize:       twoArgArgsSize,
 	Args:           "dst []uint8, src []uint32",
 	DstElemSize1:   1 * sizeOfUint8,
 	DstElemSize4:   4 * sizeOfUint8,
 	XMM3:           fxXMM3,
 	XMM4:           fxXMM4,
 	XMM5:           fxXMM5_256,
 	XMM6:           opSrcXMM6,
 	XMM8:           opSrcXMM8,
 	XMM9:           opSrcXMM9,
 	XMM10:          opSrcXMM10,
 	LoadArgs:       twoArgLoadArgs,
 	Setup:          fxSetup,
 	LoadXMMRegs:    fxLoadXMMRegs256 + "\n" + opSrcLoadXMMRegs,
 	Cleanup:        fxCleanup,
 	Add:            fxAdd,
 	ClampAndScale:  fxClampAndScale256,
 	ConvertToInt32: fxConvertToInt32,
 	Store4:         opSrcStore4,
 	Store1:         opSrcStore1,
 }, {
 	LongName:       "fixedAccumulateMask",
 	ShortName:      "fxAccMask",
 	FrameSize:      fxFrameSize,
 	ArgsSize:       oneArgArgsSize,
 	Args:           "buf []uint32",
 	DstElemSize1:   1 * sizeOfUint32,
 	DstElemSize4:   4 * sizeOfUint32,
 	XMM3:           fxXMM3,
 	XMM4:           fxXMM4,
 	XMM5:           fxXMM5_65536,
 	XMM6:           maskXMM6,
 	XMM8:           maskXMM8,
 	XMM9:           maskXMM9,
 	XMM10:          maskXMM10,
 	LoadArgs:       oneArgLoadArgs,
 	Setup:          fxSetup,
 	LoadXMMRegs:    fxLoadXMMRegs65536 + "\n" + maskLoadXMMRegs,
 	Add:            fxAdd,
 	ClampAndScale:  fxClampAndScale65536,
 	ConvertToInt32: fxConvertToInt32,
 	Store4:         maskStore4,
 	Store1:         maskStore1,
 }, {
 	LongName:       "floatingAccumulateOpOver",
 	ShortName:      "flAccOpOver",
 	FrameSize:      flFrameSize,
-	SrcType:        flSrcType,
+	ArgsSize:       twoArgArgsSize,
 	Args:           "dst []uint8, src []float32",
 	DstElemSize1:   1 * sizeOfUint8,
 	DstElemSize4:   4 * sizeOfUint8,
 	XMM3:           flXMM3_65536,
 	XMM4:           flXMM4,
 	XMM5:           flXMM5,
 	XMM6:           opOverXMM6,
 	XMM8:           opOverXMM8,
 	XMM9:           opOverXMM9,
 	XMM10:          opOverXMM10,
 	LoadArgs:       twoArgLoadArgs,
 	Setup:          flSetup,
 	LoadXMMRegs:    flLoadXMMRegs65536 + "\n" + opOverLoadXMMRegs,
 	Cleanup:        flCleanup,
 	Add:            flAdd,
 	ClampAndScale:  flClampAndScale65536,
 	ConvertToInt32: flConvertToInt32,
@ -144,29 +187,59 @@ var instances = []struct {
 	LongName:       "floatingAccumulateOpSrc",
 	ShortName:      "flAccOpSrc",
 	FrameSize:      flFrameSize,
-	SrcType:        flSrcType,
+	ArgsSize:       twoArgArgsSize,
 	Args:           "dst []uint8, src []float32",
 	DstElemSize1:   1 * sizeOfUint8,
 	DstElemSize4:   4 * sizeOfUint8,
 	XMM3:           flXMM3_256,
 	XMM4:           flXMM4,
 	XMM5:           flXMM5,
 	XMM6:           opSrcXMM6,
 	XMM8:           opSrcXMM8,
 	XMM9:           opSrcXMM9,
 	XMM10:          opSrcXMM10,
 	LoadArgs:       twoArgLoadArgs,
 	Setup:          flSetup,
 	LoadXMMRegs:    flLoadXMMRegs256 + "\n" + opSrcLoadXMMRegs,
 	Cleanup:        flCleanup,
 	Add:            flAdd,
 	ClampAndScale:  flClampAndScale256,
 	ConvertToInt32: flConvertToInt32,
 	Store4:         opSrcStore4,
 	Store1:         opSrcStore1,
 }, {
 	LongName:       "floatingAccumulateMask",
 	ShortName:      "flAccMask",
 	FrameSize:      flFrameSize,
 	ArgsSize:       twoArgArgsSize,
 	Args:           "dst []uint32, src []float32",
 	DstElemSize1:   1 * sizeOfUint32,
 	DstElemSize4:   4 * sizeOfUint32,
 	XMM3:           flXMM3_65536,
 	XMM4:           flXMM4,
 	XMM5:           flXMM5,
 	XMM6:           maskXMM6,
 	XMM8:           maskXMM8,
 	XMM9:           maskXMM9,
 	XMM10:          maskXMM10,
 	LoadArgs:       twoArgLoadArgs,
 	Setup:          flSetup,
 	LoadXMMRegs:    flLoadXMMRegs65536 + "\n" + maskLoadXMMRegs,
 	Add:            flAdd,
 	ClampAndScale:  flClampAndScale65536,
 	ConvertToInt32: flConvertToInt32,
 	Store4:         maskStore4,
 	Store1:         maskStore1,
 }}
 const (
 	fxFrameSize = `0`
 	flFrameSize = `8`
-	fxSrcType = `[]uint32`
+	oneArgArgsSize = `24`
-	flSrcType = `[]float32`
+	twoArgArgsSize = `48`
 	sizeOfUint8  = 1
 	sizeOfUint32 = 4
 	fxXMM3       = `-`
 	flXMM3_256   = `flAlmost256`
@ -179,19 +252,32 @@ const (
 	fxXMM5_65536 = `fxAlmost65536`
 	flXMM5       = `flSignMask`
 	oneArgLoadArgs = `
 		MOVQ buf_base+0(FP), DI
 		MOVQ buf_len+8(FP), BX
 		MOVQ buf_base+0(FP), SI
 		MOVQ buf_len+8(FP), R10
 		`
 	twoArgLoadArgs = `
 		MOVQ dst_base+0(FP), DI
 		MOVQ dst_len+8(FP), BX
 		MOVQ src_base+24(FP), SI
 		MOVQ src_len+32(FP), R10
 		// Sanity check that len(dst) >= len(src).
 		CMPQ BX, R10
 		JLT  {{.ShortName}}End
 		`
 	fxSetup = ``
 	flSetup = `
-		// Set MXCSR bits 13 and 14, so that the CVTPS2PL below is "Round To Zero".
+		// Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
 		// "Round To Zero".
 		STMXCSR mxcsrOrig-8(SP)
 		MOVL    mxcsrOrig-8(SP), AX
 		ORL     $0x6000, AX
 		MOVL    AX, mxcsrNew-4(SP)
 		LDMXCSR mxcsrNew-4(SP)
 		`
 	fxCleanup = `// No-op.`
 	flCleanup = `LDMXCSR mxcsrOrig-8(SP)`
 	fxLoadXMMRegs256 = `
 		// fxAlmost256 := XMM(0x000000ff repeated four times) // Maximum of an uint8.
 		MOVOU fxAlmost256<>(SB), X5
@ -271,8 +357,16 @@ const (
 		MULPS X3, X2
 		`
-	fxConvertToInt32 = `// No-op.`
+	fxConvertToInt32 = `
-	flConvertToInt32 = `CVTPS2PL X2, X2`
+		// z = convertToInt32(y)
 		// No-op.
 		`
 	flConvertToInt32 = `
 		// z = convertToInt32(y)
 		LDMXCSR  mxcsrNew-4(SP)
 		CVTPS2PL X2, X2
 		LDMXCSR  mxcsrOrig-8(SP)
 		`
 	opOverStore4 = `
 		// Blend over the dst's prior value. SIMD for i in 0..3:
@ -324,6 +418,10 @@ const (
 		PSHUFB X6, X2
 		MOVL   X2, (DI)
 		`
 	maskStore4 = `
 		// copy(dst[:4], z)
 		MOVOU X2, (DI)
 		`
 	opOverStore1 = `
 		// Blend over the dst's prior value.
@ -350,23 +448,40 @@ const (
 		MOVL X2, BX
 		MOVB BX, (DI)
 		`
 	maskStore1 = `
 		// dst[0] = uint32(z)
 		MOVL X2, (DI)
 		`
 	opOverXMM6 = `gather`
 	opSrcXMM6  = `gather`
 	maskXMM6   = `-`
 	opOverXMM8 = `scatterAndMulBy0x101`
 	opSrcXMM8  = `-`
 	maskXMM8   = `-`
 	opOverXMM9 = `fxAlmost65536`
 	opSrcXMM9  = `-`
 	maskXMM9   = `-`
 	opOverXMM10 = `inverseFFFF`
 	opSrcXMM10  = `-`
 	maskXMM10   = `-`
 	opOverLoadXMMRegs = `
 		// gather               := XMM(see above)                      // PSHUFB shuffle mask.
 		// scatterAndMulBy0x101 := XMM(see above)                      // PSHUFB shuffle mask.
 		// fxAlmost65536        := XMM(0x0000ffff repeated four times) // 0xffff.
 		// inverseFFFF          := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
 		MOVOU gather<>(SB), X6
 		MOVOU scatterAndMulBy0x101<>(SB), X8
 		MOVOU fxAlmost65536<>(SB), X9
 		MOVOU inverseFFFF<>(SB), X10
 		`
-	opSrcLoadXMMRegs = ``
+	opSrcLoadXMMRegs = `
 		// gather := XMM(see above) // PSHUFB shuffle mask.
 		MOVOU gather<>(SB), X6
 		`
 	maskLoadXMMRegs = ``
 )
--- a/vector/gen_acc_amd64.s.tmpl
+++ b/vector/gen_acc_amd64.s.tmpl
@ -68,7 +68,7 @@ TEXT ·haveSSE4_1(SB), NOSPLIT, $0
 // ----------------------------------------------------------------------------
-// func {{.LongName}}SIMD(dst []uint8, src {{.SrcType}})
+// func {{.LongName}}SIMD({{.Args}})
 //
 // XMM registers. Variable names are per
 // https://github.com/google/font-rs/blob/master/src/accumulate.c
@ -79,20 +79,13 @@ TEXT ·haveSSE4_1(SB), NOSPLIT, $0
 //	xmm3	{{.XMM3}}
 //	xmm4	{{.XMM4}}
 //	xmm5	{{.XMM5}}
-//	xmm6	gather
+//	xmm6	{{.XMM6}}
 //	xmm7	offset
 //	xmm8	{{.XMM8}}
 //	xmm9	{{.XMM9}}
 //	xmm10	{{.XMM10}}
-TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48
+TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-{{.ArgsSize}}
-	MOVQ dst_base+0(FP), DI
+	{{.LoadArgs}}
 	MOVQ dst_len+8(FP), BX
 	MOVQ src_base+24(FP), SI
 	MOVQ src_len+32(FP), R10
 	// Sanity check that len(dst) >= len(src).
 	CMPQ BX, R10
 	JLT  {{.ShortName}}End
 	// R10 = len(src) &^ 3
 	// R11 = len(src)
@ -103,9 +96,7 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48
 	{{.LoadXMMRegs}}
 	// gather := XMM(see above)                      // PSHUFB shuffle mask.
 	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
 	MOVOU gather<>(SB), X6
 	XORPS X7, X7
 	// i := 0
@ -139,7 +130,6 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48
 	{{.ClampAndScale}}
 	// z = convertToInt32(y)
 	{{.ConvertToInt32}}
 	{{.Store4}}
@ -152,14 +142,14 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48
 	// dst = dst[4:]
 	// src = src[4:]
 	ADDQ $4, R9
-	ADDQ $4, DI
+	ADDQ ${{.DstElemSize4}}, DI
 	ADDQ $16, SI
 	JMP  {{.ShortName}}Loop4
 {{.ShortName}}Loop1:
 	// for i < len(src)
 	CMPQ R9, R11
-	JAE  {{.ShortName}}Cleanup
+	JAE  {{.ShortName}}End
 	// x = src[i] + offset
 	MOVL     (SI), X1
@ -167,7 +157,6 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48
 	{{.ClampAndScale}}
 	// z = convertToInt32(y)
 	{{.ConvertToInt32}}
 	{{.Store1}}
@ -179,12 +168,9 @@ TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48
 	// dst = dst[1:]
 	// src = src[1:]
 	ADDQ $1, R9
-	ADDQ $1, DI
+	ADDQ ${{.DstElemSize1}}, DI
 	ADDQ $4, SI
 	JMP  {{.ShortName}}Loop1
 {{.ShortName}}Cleanup:
 	{{.Cleanup}}
 {{.ShortName}}End:
 	RET
--- a/vector/raster_floating.go
+++ b/vector/raster_floating.go
@ -190,6 +190,11 @@ func floatingAccumulateOpSrc(dst []uint8, src []float32) {
 }
 func floatingAccumulateMask(dst []uint32, src []float32) {
 	// Sanity check that len(dst) >= len(src).
 	if len(dst) < len(src) {
 		return
 	}
 	acc := float32(0)
 	for i, v := range src {
 		acc += v
--- a/vector/vector.go
+++ b/vector/vector.go
@ -310,11 +310,19 @@ func (z *Rasterizer) accumulateMask() {
 		} else {
 			z.bufU32 = z.bufU32[:n]
 		}
 		if haveFloatingAccumulateSIMD {
 			floatingAccumulateMaskSIMD(z.bufU32, z.bufF32)
 		} else {
 			floatingAccumulateMask(z.bufU32, z.bufF32)
 		}
 	} else {
 		if haveFixedAccumulateSIMD {
 			fixedAccumulateMaskSIMD(z.bufU32)
 		} else {
 			fixedAccumulateMask(z.bufU32)
 		}
 	}
 }
 func (z *Rasterizer) rasterizeDstAlphaSrcOpaqueOpOver(dst *image.Alpha, r image.Rectangle) {
 	// TODO: non-zero vs even-odd winding?