vector: add SIMD versions of xxxAccumulateOpSrc.

name                old time/op  new time/op  delta
GlyphAlpha16Src-8   3.37µs ± 0%  3.07µs ± 1%   -8.86%  (p=0.000 n=9+9)
GlyphAlpha32Src-8   6.01µs ± 1%  4.55µs ± 0%  -24.28%  (p=0.000 n=10+9)
GlyphAlpha64Src-8   13.2µs ± 0%   8.1µs ± 0%  -38.69%  (p=0.000 n=10+9)
GlyphAlpha128Src-8  32.9µs ± 0%  16.9µs ± 0%  -48.85%  (p=0.000 n=10+9)
GlyphAlpha256Src-8  98.0µs ± 0%  43.6µs ± 1%  -55.50%  (p=0.000 n=10+10)

A comparison of the non-SIMD and SIMD versions:

name                             time/op
FixedAccumulateOpSrc16-8          368ns ± 0%
FixedAccumulateOpSrcSIMD16-8     86.8ns ± 1%
FloatingAccumulateOpSrc16-8       434ns ± 0%
FloatingAccumulateOpSrcSIMD16-8   119ns ± 0%
FixedAccumulateOpSrc64-8         6.12µs ± 0%
FixedAccumulateOpSrcSIMD64-8     1.17µs ± 0%
FloatingAccumulateOpSrc64-8      7.15µs ± 0%
FloatingAccumulateOpSrcSIMD64-8  1.68µs ± 1%

Change-Id: I58e5c7a3ecd12e536aab8e765e94275453d0eac8
Reviewed-on: https://go-review.googlesource.com/30431
Reviewed-by: David Crawshaw <crawshaw@golang.org>

parent dc590effac
commit 746988e7a2

vector/acc_amd64.go (new file, 21 lines)
@@ -0,0 +1,21 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+package vector
+
+func haveSSE4_1() bool
+
+var haveFixedAccumulateSIMD = haveSSE4_1()
+
+const haveFloatingAccumulateSIMD = true
+
+//go:noescape
+func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32)
+
+//go:noescape
+func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32)
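
For orientation before the assembly below: the two declared routines compute the same result as the package's existing non-SIMD accumulators. A rough pure-Go sketch of the fixed-point semantics, following the comments in acc_amd64.s (cumulative sum, abs, shift by 2*ϕ - 8, clamp); the function and constant names here are illustrative, not part of the package:

    package main

    import "fmt"

    // phi mirrors the package's ϕ constant: fractional bits of the fixed-point format.
    const phi = 10

    // fixedAccumulateOpSrcRef is an illustrative scalar version of what
    // fixedAccumulateOpSrcSIMD computes: a running sum of signed area deltas,
    // folded to 8-bit alpha via abs, >> (2*phi - 8) and a clamp to 255.
    func fixedAccumulateOpSrcRef(dst []uint8, src []uint32) {
        acc := int32(0)
        for i, v := range src {
            acc += int32(v) // cumulative sum; the "offset" register in the assembly
            y := acc
            if y < 0 {
                y = -y // y = abs(x)
            }
            y >>= 2*phi - 8 // shift by 2*ϕ - 8, i.e. >> 12 for ϕ = 10
            if y > 0xff {
                y = 0xff // min(y, fxAlmost256)
            }
            dst[i] = uint8(y)
        }
    }

    func main() {
        quarter := uint32(1) << (2*phi - 2) // a quarter-covered pixel, illustrative value
        src := []uint32{quarter, quarter, quarter, quarter}
        dst := make([]uint8, len(src))
        fixedAccumulateOpSrcRef(dst, src)
        fmt.Println(dst) // [64 128 192 255]: running coverage, clamped at 255
    }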
vector/acc_amd64.s (new file, 321 lines)
@@ -0,0 +1,321 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// fl is short for floating point math. fx is short for fixed point math.
+
+DATA flAlmost256<>+0x00(SB)/8, $0x437fffff437fffff
+DATA flAlmost256<>+0x08(SB)/8, $0x437fffff437fffff
+DATA flOnes<>+0x00(SB)/8, $0x3f8000003f800000
+DATA flOnes<>+0x08(SB)/8, $0x3f8000003f800000
+DATA flSignMask<>+0x00(SB)/8, $0x7fffffff7fffffff
+DATA flSignMask<>+0x08(SB)/8, $0x7fffffff7fffffff
+DATA shuffleMask<>+0x00(SB)/8, $0x0c0804000c080400
+DATA shuffleMask<>+0x08(SB)/8, $0x0c0804000c080400
+DATA fxAlmost256<>+0x00(SB)/8, $0x000000ff000000ff
+DATA fxAlmost256<>+0x08(SB)/8, $0x000000ff000000ff
+
+GLOBL flAlmost256<>(SB), (NOPTR+RODATA), $16
+GLOBL flOnes<>(SB), (NOPTR+RODATA), $16
+GLOBL flSignMask<>(SB), (NOPTR+RODATA), $16
+GLOBL shuffleMask<>(SB), (NOPTR+RODATA), $16
+GLOBL fxAlmost256<>(SB), (NOPTR+RODATA), $16
+
+// func haveSSE4_1() bool
+TEXT ·haveSSE4_1(SB), NOSPLIT, $0
+    MOVQ $1, AX
+    CPUID
+    SHRQ $19, CX
+    ANDQ $1, CX
+    MOVB CX, ret+0(FP)
+    RET
+
+// ----------------------------------------------------------------------------
+
+// func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32)
+//
+// XMM registers. Variable names are per
+// https://github.com/google/font-rs/blob/master/src/accumulate.c
+//
+// xmm0 scratch
+// xmm1 x
+// xmm2 y, z
+// xmm3 -
+// xmm4 -
+// xmm5 fxAlmost256
+// xmm6 shuffleMask
+// xmm7 offset
+TEXT ·fixedAccumulateOpSrcSIMD(SB), NOSPLIT, $0-48
+    MOVQ dst_base+0(FP), DI
+    MOVQ dst_len+8(FP), BX
+    MOVQ src_base+24(FP), SI
+    MOVQ src_len+32(FP), CX
+
+    // Sanity check that len(dst) >= len(src).
+    CMPQ BX, CX
+    JLT  fxAccOpSrcEnd
+
+    // CX = len(src) &^ 3
+    // DX = len(src)
+    MOVQ CX, DX
+    ANDQ $-4, CX
+
+    // fxAlmost256 := XMM(0x000000ff repeated four times) // Maximum of an uint8.
+    // shuffleMask := XMM(0x0c080400 repeated four times) // PSHUFB shuffle mask.
+    // offset := XMM(0x00000000 repeated four times) // Cumulative sum.
+    MOVOU fxAlmost256<>(SB), X5
+    MOVOU shuffleMask<>(SB), X6
+    XORPS X7, X7
+
+    // i := 0
+    MOVQ $0, AX
+
+fxAccOpSrcLoop4:
+    // for i < (len(src) &^ 3)
+    CMPQ AX, CX
+    JAE  fxAccOpSrcLoop1
+
+    // x = XMM(s0, s1, s2, s3)
+    //
+    // Where s0 is src[i+0], s1 is src[i+1], etc.
+    MOVOU (SI), X1
+
+    // scratch = XMM(0, s0, s1, s2)
+    // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
+    MOVOU X1, X0
+    PSLLO $4, X0
+    PADDD X0, X1
+
+    // scratch = XMM(0, 0, 0, 0)
+    // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
+    // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
+    XORPS X0, X0
+    SHUFPS $0x40, X1, X0
+    PADDD X0, X1
+
+    // x += offset
+    PADDD X7, X1
+
+    // y = abs(x)
+    // y >>= 12 // Shift by 2*ϕ - 8.
+    // y = min(y, fxAlmost256)
+    //
+    // pabsd  %xmm1,%xmm2
+    // psrld  $0xc,%xmm2
+    // pminud %xmm5,%xmm2
+    //
+    // Hopefully we'll get these opcode mnemonics into the assembler for Go
+    // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
+    // it's similar.
+    BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
+    BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0c
+    BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+
+    // z = shuffleTheLowBytesOfEach4ByteElement(y)
+    // copy(dst[:4], low4BytesOf(z))
+    PSHUFB X6, X2
+    MOVL X2, (DI)
+
+    // offset = XMM(x@3, x@3, x@3, x@3)
+    MOVOU X1, X7
+    SHUFPS $0xff, X1, X7
+
+    // i += 4
+    // dst = dst[4:]
+    // src = src[4:]
+    ADDQ $4, AX
+    ADDQ $4, DI
+    ADDQ $16, SI
+    JMP fxAccOpSrcLoop4
+
+fxAccOpSrcLoop1:
+    // for i < len(src)
+    CMPQ AX, DX
+    JAE  fxAccOpSrcEnd
+
+    // x = src[i] + offset
+    MOVL (SI), X1
+    PADDD X7, X1
+
+    // y = abs(x)
+    // y >>= 12 // Shift by 2*ϕ - 8.
+    // y = min(y, fxAlmost256)
+    //
+    // pabsd  %xmm1,%xmm2
+    // psrld  $0xc,%xmm2
+    // pminud %xmm5,%xmm2
+    //
+    // Hopefully we'll get these opcode mnemonics into the assembler for Go
+    // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
+    // it's similar.
+    BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
+    BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0c
+    BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+
+    // dst[0] = uint8(y)
+    MOVL X2, BX
+    MOVB BX, (DI)
+
+    // offset = x
+    MOVOU X1, X7
+
+    // i += 1
+    // dst = dst[1:]
+    // src = src[1:]
+    ADDQ $1, AX
+    ADDQ $1, DI
+    ADDQ $4, SI
+    JMP fxAccOpSrcLoop1
+
+fxAccOpSrcEnd:
+    RET
+
+// ----------------------------------------------------------------------------
+
+// func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32)
+//
+// XMM registers. Variable names are per
+// https://github.com/google/font-rs/blob/master/src/accumulate.c
+//
+// xmm0 scratch
+// xmm1 x
+// xmm2 y, z
+// xmm3 flAlmost256
+// xmm4 flOnes
+// xmm5 flSignMask
+// xmm6 shuffleMask
+// xmm7 offset
+TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48
+    MOVQ dst_base+0(FP), DI
+    MOVQ dst_len+8(FP), BX
+    MOVQ src_base+24(FP), SI
+    MOVQ src_len+32(FP), CX
+
+    // Sanity check that len(dst) >= len(src).
+    CMPQ BX, CX
+    JLT  flAccOpSrcEnd
+
+    // CX = len(src) &^ 3
+    // DX = len(src)
+    MOVQ CX, DX
+    ANDQ $-4, CX
+
+    // Set MXCSR bits 13 and 14, so that the CVTPS2PL below is "Round To Zero".
+    STMXCSR mxcsrOrig-8(SP)
+    MOVL mxcsrOrig-8(SP), AX
+    ORL $0x6000, AX
+    MOVL AX, mxcsrNew-4(SP)
+    LDMXCSR mxcsrNew-4(SP)
+
+    // flAlmost256 := XMM(0x437fffff repeated four times) // 255.99998 as a float32.
+    // flOnes := XMM(0x3f800000 repeated four times) // 1 as a float32.
+    // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
+    // shuffleMask := XMM(0x0c080400 repeated four times) // PSHUFB shuffle mask.
+    // offset := XMM(0x00000000 repeated four times) // Cumulative sum.
+    MOVOU flAlmost256<>(SB), X3
+    MOVOU flOnes<>(SB), X4
+    MOVOU flSignMask<>(SB), X5
+    MOVOU shuffleMask<>(SB), X6
+    XORPS X7, X7
+
+    // i := 0
+    MOVQ $0, AX
+
+flAccOpSrcLoop4:
+    // for i < (len(src) &^ 3)
+    CMPQ AX, CX
+    JAE  flAccOpSrcLoop1
+
+    // x = XMM(s0, s1, s2, s3)
+    //
+    // Where s0 is src[i+0], s1 is src[i+1], etc.
+    MOVOU (SI), X1
+
+    // scratch = XMM(0, s0, s1, s2)
+    // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
+    MOVOU X1, X0
+    PSLLO $4, X0
+    ADDPS X0, X1
+
+    // scratch = XMM(0, 0, 0, 0)
+    // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
+    // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
+    XORPS X0, X0
+    SHUFPS $0x40, X1, X0
+    ADDPS X0, X1
+
+    // x += offset
+    ADDPS X7, X1
+
+    // y = x & flSignMask
+    // y = min(y, flOnes)
+    // y = mul(y, flAlmost256)
+    MOVOU X5, X2
+    ANDPS X1, X2
+    MINPS X4, X2
+    MULPS X3, X2
+
+    // z = float32ToInt32(y)
+    // z = shuffleTheLowBytesOfEach4ByteElement(z)
+    // copy(dst[:4], low4BytesOf(z))
+    CVTPS2PL X2, X2
+    PSHUFB X6, X2
+    MOVL X2, (DI)
+
+    // offset = XMM(x@3, x@3, x@3, x@3)
+    MOVOU X1, X7
+    SHUFPS $0xff, X1, X7
+
+    // i += 4
+    // dst = dst[4:]
+    // src = src[4:]
+    ADDQ $4, AX
+    ADDQ $4, DI
+    ADDQ $16, SI
+    JMP flAccOpSrcLoop4
+
+flAccOpSrcLoop1:
+    // for i < len(src)
+    CMPQ AX, DX
+    JAE  flAccOpSrcRestoreMXCSR
+
+    // x = src[i] + offset
+    MOVL (SI), X1
+    ADDPS X7, X1
+
+    // y = x & flSignMask
+    // y = min(y, flOnes)
+    // y = mul(y, flAlmost256)
+    MOVOU X5, X2
+    ANDPS X1, X2
+    MINPS X4, X2
+    MULPS X3, X2
+
+    // z = float32ToInt32(y)
+    // dst[0] = uint8(z)
+    CVTPS2PL X2, X2
+    MOVL X2, BX
+    MOVB BX, (DI)
+
+    // offset = x
+    MOVOU X1, X7
+
+    // i += 1
+    // dst = dst[1:]
+    // src = src[1:]
+    ADDQ $1, AX
+    ADDQ $1, DI
+    ADDQ $4, SI
+    JMP flAccOpSrcLoop1
+
+flAccOpSrcRestoreMXCSR:
+    LDMXCSR mxcsrOrig-8(SP)
+
+flAccOpSrcEnd:
+    RET
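
The core of both loops above is a 4-wide inclusive prefix sum built from two shift-and-add steps (PSLLO then SHUFPS, each followed by PADDD or ADDPS), with the previous chunk's total carried in xmm7. A scalar Go sketch of just that step, with illustrative names and int32 lanes as in the fixed-point case:

    package main

    import "fmt"

    // prefixSum4 models the two shift-and-add steps in the SIMD loops: it turns
    // one chunk (s0, s1, s2, s3) into running sums and carries lane 3 forward
    // as the next chunk's offset.
    func prefixSum4(s [4]int32, offset int32) (sums [4]int32, nextOffset int32) {
        x := s
        // x += (0, s0, s1, s2): yields (s0, s0+s1, s1+s2, s2+s3).
        x = [4]int32{x[0], x[0] + x[1], x[1] + x[2], x[2] + x[3]}
        // x += (0, 0, x@0, x@1): yields (s0, s0+s1, s0+s1+s2, s0+s1+s2+s3).
        x = [4]int32{x[0], x[1], x[0] + x[2], x[1] + x[3]}
        // x += offset, lane-wise; lane 3 becomes the carried offset.
        for i := range x {
            x[i] += offset
        }
        return x, x[3]
    }

    func main() {
        sums, off := prefixSum4([4]int32{1, 2, 3, 4}, 10)
        fmt.Println(sums, off) // [11 13 16 20] 20
    }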
vector/acc_other.go (new file, 13 lines)
@@ -0,0 +1,13 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64 appengine !gc noasm
+
+package vector
+
+const haveFixedAccumulateSIMD = false
+const haveFloatingAccumulateSIMD = false
+
+func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32) {}
+func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32) {}
@@ -10,6 +10,87 @@ import (
 	"testing"
 )
 
+// TestXxxSIMDUnaligned tests that unaligned SIMD loads/stores don't crash.
+
+func TestFixedAccumulateSIMDUnaligned(t *testing.T) {
+    if !haveFixedAccumulateSIMD {
+        t.Skip("No SIMD implemention")
+    }
+
+    dst := make([]uint8, 64)
+    src := make([]uint32, 64)
+    for d := 0; d < 16; d++ {
+        for s := 0; s < 16; s++ {
+            fixedAccumulateOpSrcSIMD(dst[d:d+32], src[s:s+32])
+        }
+    }
+}
+
+func TestFloatingAccumulateSIMDUnaligned(t *testing.T) {
+    if !haveFloatingAccumulateSIMD {
+        t.Skip("No SIMD implemention")
+    }
+
+    dst := make([]uint8, 64)
+    src := make([]float32, 64)
+    for d := 0; d < 16; d++ {
+        for s := 0; s < 16; s++ {
+            floatingAccumulateOpSrcSIMD(dst[d:d+32], src[s:s+32])
+        }
+    }
+}
+
+// TestXxxSIMDShortDst tests that the SIMD implementations don't write past the
+// end of the dst buffer.
+
+func TestFixedAccumulateSIMDShortDst(t *testing.T) {
+    if !haveFixedAccumulateSIMD {
+        t.Skip("No SIMD implemention")
+    }
+
+    const oneQuarter = uint32(int2ϕ(fxOne*fxOne)) / 4
+    src := []uint32{oneQuarter, oneQuarter, oneQuarter, oneQuarter}
+    for i := 0; i < 4; i++ {
+        dst := make([]uint8, 4)
+        fixedAccumulateOpSrcSIMD(dst[:i], src[:i])
+        for j := range dst {
+            if j < i {
+                if got := dst[j]; got == 0 {
+                    t.Errorf("i=%d, j=%d: got %#02x, want non-zero", i, j, got)
+                }
+            } else {
+                if got := dst[j]; got != 0 {
+                    t.Errorf("i=%d, j=%d: got %#02x, want zero", i, j, got)
+                }
+            }
+        }
+    }
+}
+
+func TestFloatingAccumulateSIMDShortDst(t *testing.T) {
+    if !haveFloatingAccumulateSIMD {
+        t.Skip("No SIMD implemention")
+    }
+
+    const oneQuarter = 0.25
+    src := []float32{oneQuarter, oneQuarter, oneQuarter, oneQuarter}
+    for i := 0; i < 4; i++ {
+        dst := make([]uint8, 4)
+        floatingAccumulateOpSrcSIMD(dst[:i], src[:i])
+        for j := range dst {
+            if j < i {
+                if got := dst[j]; got == 0 {
+                    t.Errorf("i=%d, j=%d: got %#02x, want non-zero", i, j, got)
+                }
+            } else {
+                if got := dst[j]; got != 0 {
+                    t.Errorf("i=%d, j=%d: got %#02x, want zero", i, j, got)
+                }
+            }
+        }
+    }
+}
+
 func TestFixedAccumulateOpOverShort(t *testing.T) { testAcc(t, fxInShort, fxMaskShort, "over") }
 func TestFixedAccumulateOpSrcShort(t *testing.T) { testAcc(t, fxInShort, fxMaskShort, "src") }
 func TestFixedAccumulateMaskShort(t *testing.T) { testAcc(t, fxInShort, fxMaskShort, "mask") }
@@ -25,81 +106,97 @@ func TestFloatingAccumulateOpSrc16(t *testing.T) { testAcc(t, flIn16, flMask16,
 func TestFloatingAccumulateMask16(t *testing.T) { testAcc(t, flIn16, flMask16, "mask") }
 
 func testAcc(t *testing.T, in interface{}, mask []uint32, op string) {
-    maxN := 0
-    switch in := in.(type) {
-    case []uint32:
-        maxN = len(in)
-    case []float32:
-        maxN = len(in)
-    }
-
-    for _, n := range []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
-        33, 55, 79, 96, 120, 165, 256, maxN} {
-
-        if n > maxN {
-            continue
-        }
-
-        var (
-            got8, want8 []uint8
-            got32, want32 []uint32
-        )
-        switch op {
-        case "over":
-            const background = 0x40
-            got8 = make([]uint8, n)
-            for i := range got8 {
-                got8[i] = background
-            }
-            want8 = make([]uint8, n)
-            for i := range want8 {
-                dstA := uint32(background * 0x101)
-                maskA := mask[i]
-                outA := dstA*(0xffff-maskA)/0xffff + maskA
-                want8[i] = uint8(outA >> 8)
-            }
-
-        case "src":
-            got8 = make([]uint8, n)
-            want8 = make([]uint8, n)
-            for i := range want8 {
-                want8[i] = uint8(mask[i] >> 8)
-            }
-
-        case "mask":
-            got32 = make([]uint32, n)
-            want32 = mask[:n]
-        }
-
-        switch in := in.(type) {
-        case []uint32:
-            switch op {
-            case "over":
-                fixedAccumulateOpOver(got8, in[:n])
-            case "src":
-                fixedAccumulateOpSrc(got8, in[:n])
-            case "mask":
-                copy(got32, in[:n])
-                fixedAccumulateMask(got32)
-            }
-        case []float32:
-            switch op {
-            case "over":
-                floatingAccumulateOpOver(got8, in[:n])
-            case "src":
-                floatingAccumulateOpSrc(got8, in[:n])
-            case "mask":
-                floatingAccumulateMask(got32, in[:n])
-            }
-        }
-
-        if op != "mask" {
-            if !bytes.Equal(got8, want8) {
-                t.Errorf("n=%d:\ngot: % x\nwant: % x", n, got8, want8)
-            }
-        } else {
-            if !uint32sEqual(got32, want32) {
-                t.Errorf("n=%d:\ngot: % x\nwant: % x", n, got32, want32)
+    for _, simd := range []bool{false, true} {
+        maxN := 0
+        switch in := in.(type) {
+        case []uint32:
+            if simd && !haveFixedAccumulateSIMD {
+                continue
+            }
+            maxN = len(in)
+        case []float32:
+            if simd && !haveFloatingAccumulateSIMD {
+                continue
+            }
+            maxN = len(in)
+        }
+
+        for _, n := range []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+            33, 55, 79, 96, 120, 165, 256, maxN} {
+
+            if n > maxN {
+                continue
+            }
+
+            var (
+                got8, want8 []uint8
+                got32, want32 []uint32
+            )
+            switch op {
+            case "over":
+                const background = 0x40
+                got8 = make([]uint8, n)
+                for i := range got8 {
+                    got8[i] = background
+                }
+                want8 = make([]uint8, n)
+                for i := range want8 {
+                    dstA := uint32(background * 0x101)
+                    maskA := mask[i]
+                    outA := dstA*(0xffff-maskA)/0xffff + maskA
+                    want8[i] = uint8(outA >> 8)
+                }
+
+            case "src":
+                got8 = make([]uint8, n)
+                want8 = make([]uint8, n)
+                for i := range want8 {
+                    want8[i] = uint8(mask[i] >> 8)
+                }
+
+            case "mask":
+                got32 = make([]uint32, n)
+                want32 = mask[:n]
+            }
+
+            switch in := in.(type) {
+            case []uint32:
+                switch op {
+                case "over":
+                    fixedAccumulateOpOver(got8, in[:n])
+                case "src":
+                    if simd {
+                        fixedAccumulateOpSrcSIMD(got8, in[:n])
+                    } else {
+                        fixedAccumulateOpSrc(got8, in[:n])
+                    }
+                case "mask":
+                    copy(got32, in[:n])
+                    fixedAccumulateMask(got32)
+                }
+            case []float32:
+                switch op {
+                case "over":
+                    floatingAccumulateOpOver(got8, in[:n])
+                case "src":
+                    if simd {
+                        floatingAccumulateOpSrcSIMD(got8, in[:n])
+                    } else {
+                        floatingAccumulateOpSrc(got8, in[:n])
+                    }
+                case "mask":
+                    floatingAccumulateMask(got32, in[:n])
+                }
+            }
+
+            if op != "mask" {
+                if !bytes.Equal(got8, want8) {
+                    t.Errorf("simd=%t, n=%d:\ngot: % x\nwant: % x", simd, n, got8, want8)
+                }
+            } else {
+                if !uint32sEqual(got32, want32) {
+                    t.Errorf("simd=%t, n=%d:\ngot: % x\nwant: % x", simd, n, got32, want32)
+                }
             }
         }
     }
@@ -129,45 +226,66 @@ func float32sEqual(xs, ys []float32) bool {
 	return true
 }
 
-func BenchmarkFixedAccumulateOpOver16(b *testing.B) { benchAcc(b, fxIn16, "over") }
-func BenchmarkFixedAccumulateOpSrc16(b *testing.B) { benchAcc(b, fxIn16, "src") }
-func BenchmarkFixedAccumulateMask16(b *testing.B) { benchAcc(b, fxIn16, "mask") }
-func BenchmarkFloatingAccumulateOpOver16(b *testing.B) { benchAcc(b, flIn16, "over") }
-func BenchmarkFloatingAccumulateOpSrc16(b *testing.B) { benchAcc(b, flIn16, "src") }
-func BenchmarkFloatingAccumulateMask16(b *testing.B) { benchAcc(b, flIn16, "mask") }
+func BenchmarkFixedAccumulateOpOver16(b *testing.B) { benchAcc(b, fxIn16, "over", false) }
+func BenchmarkFixedAccumulateOpSrc16(b *testing.B) { benchAcc(b, fxIn16, "src", false) }
+func BenchmarkFixedAccumulateOpSrcSIMD16(b *testing.B) { benchAcc(b, fxIn16, "src", true) }
+func BenchmarkFixedAccumulateMask16(b *testing.B) { benchAcc(b, fxIn16, "mask", false) }
+func BenchmarkFloatingAccumulateOpOver16(b *testing.B) { benchAcc(b, flIn16, "over", false) }
+func BenchmarkFloatingAccumulateOpSrc16(b *testing.B) { benchAcc(b, flIn16, "src", false) }
+func BenchmarkFloatingAccumulateOpSrcSIMD16(b *testing.B) { benchAcc(b, flIn16, "src", true) }
+func BenchmarkFloatingAccumulateMask16(b *testing.B) { benchAcc(b, flIn16, "mask", false) }
 
-func BenchmarkFixedAccumulateOpOver64(b *testing.B) { benchAcc(b, fxIn64, "over") }
-func BenchmarkFixedAccumulateOpSrc64(b *testing.B) { benchAcc(b, fxIn64, "src") }
-func BenchmarkFixedAccumulateMask64(b *testing.B) { benchAcc(b, fxIn64, "mask") }
-func BenchmarkFloatingAccumulateOpOver64(b *testing.B) { benchAcc(b, flIn64, "over") }
-func BenchmarkFloatingAccumulateOpSrc64(b *testing.B) { benchAcc(b, flIn64, "src") }
-func BenchmarkFloatingAccumulateMask64(b *testing.B) { benchAcc(b, flIn64, "mask") }
+func BenchmarkFixedAccumulateOpOver64(b *testing.B) { benchAcc(b, fxIn64, "over", false) }
+func BenchmarkFixedAccumulateOpSrc64(b *testing.B) { benchAcc(b, fxIn64, "src", false) }
+func BenchmarkFixedAccumulateOpSrcSIMD64(b *testing.B) { benchAcc(b, fxIn64, "src", true) }
+func BenchmarkFixedAccumulateMask64(b *testing.B) { benchAcc(b, fxIn64, "mask", false) }
+func BenchmarkFloatingAccumulateOpOver64(b *testing.B) { benchAcc(b, flIn64, "over", false) }
+func BenchmarkFloatingAccumulateOpSrc64(b *testing.B) { benchAcc(b, flIn64, "src", false) }
+func BenchmarkFloatingAccumulateOpSrcSIMD64(b *testing.B) { benchAcc(b, flIn64, "src", true) }
+func BenchmarkFloatingAccumulateMask64(b *testing.B) { benchAcc(b, flIn64, "mask", false) }
 
-func benchAcc(b *testing.B, in interface{}, op string) {
+func benchAcc(b *testing.B, in interface{}, op string, simd bool) {
 	var f func()
 
 	switch in := in.(type) {
 	case []uint32:
+        if simd && !haveFixedAccumulateSIMD {
+            b.Skip("No SIMD implemention")
+        }
+
 		switch op {
 		case "over":
 			dst := make([]uint8, len(in))
 			f = func() { fixedAccumulateOpOver(dst, in) }
 		case "src":
 			dst := make([]uint8, len(in))
-			f = func() { fixedAccumulateOpSrc(dst, in) }
+            if simd {
+                f = func() { fixedAccumulateOpSrcSIMD(dst, in) }
+            } else {
+                f = func() { fixedAccumulateOpSrc(dst, in) }
+            }
		case "mask":
			buf := make([]uint32, len(in))
			copy(buf, in)
			f = func() { fixedAccumulateMask(buf) }
		}
 
	case []float32:
+        if simd && !haveFloatingAccumulateSIMD {
+            b.Skip("No SIMD implemention")
+        }
+
		switch op {
		case "over":
			dst := make([]uint8, len(in))
			f = func() { floatingAccumulateOpOver(dst, in) }
		case "src":
			dst := make([]uint8, len(in))
-			f = func() { floatingAccumulateOpSrc(dst, in) }
+            if simd {
+                f = func() { floatingAccumulateOpSrcSIMD(dst, in) }
+            } else {
+                f = func() { floatingAccumulateOpSrc(dst, in) }
+            }
		case "mask":
			dst := make([]uint32, len(in))
			f = func() { floatingAccumulateMask(dst, in) }
@@ -21,9 +21,9 @@ const (
 	// in the .s files).
 	ϕ = 10
 
-	one int1ϕ = 1 << ϕ
-	oneAndAHalf int1ϕ = 1<<ϕ + 1<<(ϕ-1)
-	oneMinusIota int1ϕ = 1<<ϕ - 1 // Used for rounding up.
+	fxOne int1ϕ = 1 << ϕ
+	fxOneAndAHalf int1ϕ = 1<<ϕ + 1<<(ϕ-1)
+	fxOneMinusIota int1ϕ = 1<<ϕ - 1 // Used for rounding up.
 )
 
 // int1ϕ is a signed fixed-point number with 1*ϕ binary digits after the fixed
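
As a quick check on these constants: with ϕ = 10, the value the tests treat as full coverage is fxOne*fxOne = 1<<(2*ϕ), and the accumulators shift by 2*ϕ - 8 = 12 (the ">> 12" in acc_amd64.s) before clamping to 255. An illustrative, self-contained snippet (not part of the package):

    package main

    import "fmt"

    func main() {
        const phi = 10
        full := uint32(1) << (2 * phi) // 0x100000, full coverage in 2ϕ-bit fixed point
        y := full >> (2*phi - 8)       // 256, one past the uint8 maximum
        if y > 0xff {
            y = 0xff // hence the min(y, fxAlmost256) step in the assembly
        }
        fmt.Println(y) // 255
    }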
@@ -56,7 +56,7 @@ func fixedMin(x, y int1ϕ) int1ϕ {
 }
 
 func fixedFloor(x int1ϕ) int32 { return int32(x >> ϕ) }
-func fixedCeil(x int1ϕ) int32 { return int32((x + oneMinusIota) >> ϕ) }
+func fixedCeil(x int1ϕ) int32 { return int32((x + fxOneMinusIota) >> ϕ) }
 
 func (z *Rasterizer) fixedLineTo(b f32.Vec2) {
 	a := z.pen
@@ -74,10 +74,10 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) {
 	}
 	dxdy := (b[0] - a[0]) / (b[1] - a[1])
 
-	ay := int1ϕ(a[1] * float32(one))
-	by := int1ϕ(b[1] * float32(one))
+	ay := int1ϕ(a[1] * float32(fxOne))
+	by := int1ϕ(b[1] * float32(fxOne))
 
-	x := int1ϕ(a[0] * float32(one))
+	x := int1ϕ(a[0] * float32(fxOne))
 	y := fixedFloor(ay)
 	yMax := fixedCeil(by)
 	if yMax > int32(z.size.Y) {
@@ -106,7 +106,7 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) {
 		if x1i <= x0i+1 {
 			xmf := (x+xNext)>>1 - x0Floor
 			if i := clamp(x0i+0, width); i < uint(len(buf)) {
-				buf[i] += uint32(d * (one - xmf))
+				buf[i] += uint32(d * (fxOne - xmf))
 			}
 			if i := clamp(x0i+1, width); i < uint(len(buf)) {
 				buf[i] += uint32(d * xmf)
@@ -115,9 +115,9 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) {
 			oneOverS := x1 - x0
 			twoOverS := 2 * oneOverS
 			x0f := x0 - x0Floor
-			oneMinusX0f := one - x0f
+			oneMinusX0f := fxOne - x0f
 			oneMinusX0fSquared := oneMinusX0f * oneMinusX0f
-			x1f := x1 - x1Ceil + one
+			x1f := x1 - x1Ceil + fxOne
 			x1fSquared := x1f * x1f
 
 			// These next two variables are unused, as rounding errors are
@@ -139,7 +139,7 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) {
 
 			if x1i == x0i+2 {
 				if i := clamp(x0i+1, width); i < uint(len(buf)) {
-					// In ideal math: buf[i] += uint32(d * (one - a0 - am))
+					// In ideal math: buf[i] += uint32(d * (fxOne - a0 - am))
 					D := twoOverS<<ϕ - oneMinusX0fSquared - x1fSquared
 					D *= d
 					D /= twoOverS
@@ -148,14 +148,14 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) {
 			} else {
 				// This is commented out for the same reason as a0 and am.
 				//
-				// a1 := ((oneAndAHalf - x0f) << ϕ) / oneOverS
+				// a1 := ((fxOneAndAHalf - x0f) << ϕ) / oneOverS
 
 				if i := clamp(x0i+1, width); i < uint(len(buf)) {
 					// In ideal math: buf[i] += uint32(d * (a1 - a0))
 					//
 					// Convert to int64 to avoid overflow. Without that,
 					// TestRasterizePolygon fails.
-					D := int64((oneAndAHalf-x0f)<<(ϕ+1) - oneMinusX0fSquared)
+					D := int64((fxOneAndAHalf-x0f)<<(ϕ+1) - oneMinusX0fSquared)
 					D *= int64(d)
 					D /= int64(twoOverS)
 					buf[i] += uint32(D)
@@ -172,12 +172,12 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) {
 				// a2 := a1 + (int1ϕ(x1i-x0i-3)<<(2*ϕ))/oneOverS
 
 				if i := clamp(x1i-1, width); i < uint(len(buf)) {
-					// In ideal math: buf[i] += uint32(d * (one - a2 - am))
+					// In ideal math: buf[i] += uint32(d * (fxOne - a2 - am))
 					//
 					// Convert to int64 to avoid overflow. Without that,
 					// TestRasterizePolygon fails.
 					D := int64(twoOverS << ϕ)
-					D -= int64((oneAndAHalf - x0f) << (ϕ + 1))
+					D -= int64((fxOneAndAHalf - x0f) << (ϕ + 1))
 					D -= int64((x1i - x0i - 3) << (2*ϕ + 1))
 					D -= int64(x1fSquared)
 					D *= int64(d)
@@ -220,6 +220,11 @@ func fixedAccumulateOpOver(dst []uint8, src []uint32) {
 }
 
 func fixedAccumulateOpSrc(dst []uint8, src []uint32) {
+	// Sanity check that len(dst) >= len(src).
+	if len(dst) < len(src) {
+		return
+	}
+
 	acc := int2ϕ(0)
 	for i, v := range src {
 		acc += int2ϕ(v)
@@ -165,6 +165,11 @@ func floatingAccumulateOpOver(dst []uint8, src []float32) {
 }
 
 func floatingAccumulateOpSrc(dst []uint8, src []float32) {
+	// Sanity check that len(dst) >= len(src).
+	if len(dst) < len(src) {
+		return
+	}
+
 	acc := float32(0)
 	for i, v := range src {
 		acc += v
@@ -312,7 +312,6 @@ func (z *Rasterizer) accumulateMask() {
 }
 
 func (z *Rasterizer) rasterizeDstAlphaSrcOpaqueOpOver(dst *image.Alpha, r image.Rectangle) {
-	// TODO: add SIMD implementations.
 	// TODO: non-zero vs even-odd winding?
 	if r == dst.Bounds() && r == z.Bounds() {
 		// We bypass the z.accumulateMask step and convert straight from
@@ -341,15 +340,22 @@ func (z *Rasterizer) rasterizeDstAlphaSrcOpaqueOpOver(dst *image.Alpha, r image.
 }
 
 func (z *Rasterizer) rasterizeDstAlphaSrcOpaqueOpSrc(dst *image.Alpha, r image.Rectangle) {
-	// TODO: add SIMD implementations.
 	// TODO: non-zero vs even-odd winding?
 	if r == dst.Bounds() && r == z.Bounds() {
 		// We bypass the z.accumulateMask step and convert straight from
 		// z.bufF32 or z.bufU32 to dst.Pix.
 		if z.useFloatingPointMath {
-			floatingAccumulateOpSrc(dst.Pix, z.bufF32)
+			if haveFloatingAccumulateSIMD {
+				floatingAccumulateOpSrcSIMD(dst.Pix, z.bufF32)
+			} else {
+				floatingAccumulateOpSrc(dst.Pix, z.bufF32)
+			}
 		} else {
-			fixedAccumulateOpSrc(dst.Pix, z.bufU32)
+			if haveFixedAccumulateSIMD {
+				fixedAccumulateOpSrcSIMD(dst.Pix, z.bufU32)
+			} else {
+				fixedAccumulateOpSrc(dst.Pix, z.bufU32)
+			}
 		}
 		return
 	}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user