vector: change ϕ from 10 to 9.

This slight loss in quality allows us to use int32 math exclusively
throughout raster_fixed.go, instead of occasionally dropping into int64
math. The change in ϕ doesn't affect the benchmarks noticeably, but
staying in int32 does. The net effect:

name                              old time/op  new time/op  delta
GlyphAlpha16Over-8                3.36µs ± 0%  2.99µs ± 0%  -10.89%         (p=0.000 n=10+9)
GlyphAlpha16Src-8                 3.26µs ± 0%  2.89µs ± 1%  -11.34%         (p=0.000 n=9+10)
GlyphAlpha32Over-8                5.20µs ± 0%  4.53µs ± 0%  -12.76%         (p=0.000 n=8+10)
GlyphAlpha32Src-8                 4.81µs ± 1%  4.14µs ± 0%  -13.91%          (p=0.000 n=9+9)
GlyphAlpha64Over-8                10.2µs ± 0%   9.0µs ± 1%  -11.99%         (p=0.000 n=9+10)
GlyphAlpha64Src-8                 8.62µs ± 0%  7.42µs ± 1%  -13.89%         (p=0.000 n=9+10)
GlyphAlpha128Over-8               24.1µs ± 0%  21.8µs ± 0%   -9.32%          (p=0.000 n=9+9)
GlyphAlpha128Src-8                17.9µs ± 0%  15.6µs ± 0%  -12.68%         (p=0.000 n=9+10)
GlyphAlpha256Over-8               70.1µs ± 0%  66.3µs ± 1%   -5.44%        (p=0.000 n=10+10)
GlyphAlpha256Src-8                45.2µs ± 1%  41.2µs ± 1%   -8.92%        (p=0.000 n=10+10)
GlyphRGBA16Over-8                 5.12µs ± 0%  4.75µs ± 0%   -7.15%         (p=0.000 n=10+9)
GlyphRGBA16Src-8                  4.57µs ± 1%  4.20µs ± 0%   -8.18%          (p=0.000 n=9+8)
GlyphRGBA32Over-8                 12.1µs ± 0%  11.4µs ± 0%   -5.50%         (p=0.000 n=10+9)
GlyphRGBA32Src-8                  10.0µs ± 0%   9.3µs ± 1%   -6.80%         (p=0.000 n=10+9)
GlyphRGBA64Over-8                 37.2µs ± 0%  36.0µs ± 0%   -3.17%          (p=0.000 n=9+8)
GlyphRGBA64Src-8                  29.0µs ± 1%  27.9µs ± 1%   -4.05%         (p=0.000 n=9+10)
GlyphRGBA128Over-8                 134µs ± 1%   131µs ± 0%   -1.85%          (p=0.000 n=9+9)
GlyphRGBA128Src-8                  100µs ± 1%    98µs ± 0%   -2.27%         (p=0.000 n=10+9)
GlyphRGBA256Over-8                 506µs ± 0%   503µs ± 0%   -0.56%         (p=0.000 n=10+8)
GlyphRGBA256Src-8                  373µs ± 0%   370µs ± 0%   -1.01%         (p=0.000 n=10+9)

Change-Id: Ie02afac6fd6fa95f090bf3fe0a5c744799ea9dc5
Reviewed-on: https://go-review.googlesource.com/31532
Reviewed-by: David Crawshaw <crawshaw@golang.org>
This commit is contained in:
Nigel Tao 2016-10-20 10:34:23 +11:00
parent fa54d6fa1c
commit 8874bef159
6 changed files with 95 additions and 108 deletions

View File

@ -143,18 +143,18 @@ fxAccOpOverLoop4:
PADDD X7, X1
// y = abs(x)
// y >>= 4 // Shift by 2*ϕ - 16.
// y >>= 2 // Shift by 2*ϕ - 16.
// y = min(y, fxAlmost65536)
//
// pabsd %xmm1,%xmm2
// psrld $0x4,%xmm2
// psrld $0x2,%xmm2
// pminud %xmm5,%xmm2
//
// Hopefully we'll get these opcode mnemonics into the assembler for Go
// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
// it's similar.
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x04
BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
// z = convertToInt32(y)
@ -230,18 +230,18 @@ fxAccOpOverLoop1:
PADDD X7, X1
// y = abs(x)
// y >>= 4 // Shift by 2*ϕ - 16.
// y >>= 2 // Shift by 2*ϕ - 16.
// y = min(y, fxAlmost65536)
//
// pabsd %xmm1,%xmm2
// psrld $0x4,%xmm2
// psrld $0x2,%xmm2
// pminud %xmm5,%xmm2
//
// Hopefully we'll get these opcode mnemonics into the assembler for Go
// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
// it's similar.
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x04
BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
// z = convertToInt32(y)
@ -353,18 +353,18 @@ fxAccOpSrcLoop4:
PADDD X7, X1
// y = abs(x)
// y >>= 12 // Shift by 2*ϕ - 8.
// y >>= 10 // Shift by 2*ϕ - 8.
// y = min(y, fxAlmost256)
//
// pabsd %xmm1,%xmm2
// psrld $0xc,%xmm2
// psrld $0xa,%xmm2
// pminud %xmm5,%xmm2
//
// Hopefully we'll get these opcode mnemonics into the assembler for Go
// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
// it's similar.
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0c
BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0a
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
// z = convertToInt32(y)
@ -397,18 +397,18 @@ fxAccOpSrcLoop1:
PADDD X7, X1
// y = abs(x)
// y >>= 12 // Shift by 2*ϕ - 8.
// y >>= 10 // Shift by 2*ϕ - 8.
// y = min(y, fxAlmost256)
//
// pabsd %xmm1,%xmm2
// psrld $0xc,%xmm2
// psrld $0xa,%xmm2
// pminud %xmm5,%xmm2
//
// Hopefully we'll get these opcode mnemonics into the assembler for Go
// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
// it's similar.
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0c
BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0a
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
// z = convertToInt32(y)
@ -498,18 +498,18 @@ fxAccMaskLoop4:
PADDD X7, X1
// y = abs(x)
// y >>= 4 // Shift by 2*ϕ - 16.
// y >>= 2 // Shift by 2*ϕ - 16.
// y = min(y, fxAlmost65536)
//
// pabsd %xmm1,%xmm2
// psrld $0x4,%xmm2
// psrld $0x2,%xmm2
// pminud %xmm5,%xmm2
//
// Hopefully we'll get these opcode mnemonics into the assembler for Go
// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
// it's similar.
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x04
BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
// z = convertToInt32(y)
@ -540,18 +540,18 @@ fxAccMaskLoop1:
PADDD X7, X1
// y = abs(x)
// y >>= 4 // Shift by 2*ϕ - 16.
// y >>= 2 // Shift by 2*ϕ - 16.
// y = min(y, fxAlmost65536)
//
// pabsd %xmm1,%xmm2
// psrld $0x4,%xmm2
// psrld $0x2,%xmm2
// pminud %xmm5,%xmm2
//
// Hopefully we'll get these opcode mnemonics into the assembler for Go
// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
// it's similar.
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x04
BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
// z = convertToInt32(y)

View File

@ -370,18 +370,18 @@ func itou(i int2ϕ) uint32 {
}
var fxInShort = []uint32{
itou(+0x020000), // +0.125, // Running sum: +0.125
itou(-0x080000), // -0.500, // Running sum: -0.375
itou(+0x040000), // +0.250, // Running sum: -0.125
itou(+0x060000), // +0.375, // Running sum: +0.250
itou(+0x020000), // +0.125, // Running sum: +0.375
itou(+0x000000), // +0.000, // Running sum: +0.375
itou(-0x100000), // -1.000, // Running sum: -0.625
itou(-0x080000), // -0.500, // Running sum: -1.125
itou(+0x040000), // +0.250, // Running sum: -0.875
itou(+0x0e0000), // +0.875, // Running sum: +0.000
itou(+0x040000), // +0.250, // Running sum: +0.250
itou(+0x0c0000), // +0.750, // Running sum: +1.000
itou(+0x08000), // +0.125, // Running sum: +0.125
itou(-0x20000), // -0.500, // Running sum: -0.375
itou(+0x10000), // +0.250, // Running sum: -0.125
itou(+0x18000), // +0.375, // Running sum: +0.250
itou(+0x08000), // +0.125, // Running sum: +0.375
itou(+0x00000), // +0.000, // Running sum: +0.375
itou(-0x40000), // -1.000, // Running sum: -0.625
itou(-0x20000), // -0.500, // Running sum: -1.125
itou(+0x10000), // +0.250, // Running sum: -0.875
itou(+0x38000), // +0.875, // Running sum: +0.000
itou(+0x10000), // +0.250, // Running sum: +0.250
itou(+0x30000), // +0.750, // Running sum: +1.000
}
var flInShort = []float32{
@ -518,32 +518,32 @@ var (
)
var hardCodedFxIn16 = []uint32{
0x00000000, 0x00000000, 0xffffa3ee, 0xfff9f0c9, 0xfffaaafc, 0xfffd38ec, 0xffff073f, 0x0001dddf,
0x0002589a, 0x0006a22c, 0x0004a6df, 0x000000a0, 0x00000000, 0x00000000, 0xfffdb883, 0xfff4c620,
0xfffd815f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00052ec6,
0x000ab1ba, 0x00001f7f, 0xffff29b7, 0xfff2ad44, 0xfffe2906, 0x00006c84, 0x0006ce82, 0x00050d7b,
0x00010db4, 0xfffd8c05, 0xfff85159, 0xfffccc6d, 0x00000000, 0x00088d28, 0x000772d8, 0xfff8a36a,
0xfff75c96, 0x00000000, 0x000a2b80, 0x0005d480, 0x00000000, 0x00000000, 0x00000000, 0xffff4bbf,
0xfff2b937, 0xfffdfb0b, 0x0001cc00, 0x000e3400, 0xfffa4980, 0xfffcb680, 0x000008e8, 0x0008966f,
0x000060a8, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfff72000, 0xfff8e000, 0x00000165,
0x000e9134, 0x00016d65, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0xfff8d3be, 0xfff72c42, 0x00000000, 0x000cec0f, 0x000313f1, 0x00000000,
0x00000000, 0xfffe84f1, 0xfffbbb8f, 0xfffe3008, 0xfffe311b, 0xffff1e60, 0x00000000, 0xfffd6f10,
0xfffcd0f0, 0x00000000, 0x000cec00, 0x00031400, 0xfffe6d8a, 0xfff7d307, 0xfffa38bf, 0xffff86b3,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000cec00,
0x00024dc4, 0xfff3cc79, 0xfffcf9c4, 0x00003ed0, 0x000467df, 0x0004c32f, 0x0001a038, 0x00012964,
0x00002883, 0xfffa7bf1, 0xfff9280f, 0x00000000, 0x000cec00, 0xfffa2901, 0xfff8eaff, 0x00004138,
0x000aebd5, 0x0004d2f2, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfff8dc00, 0xfff72400,
0x00000000, 0x000cec00, 0xfff64800, 0xfffccc00, 0x00039400, 0x000c6c00, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0xfff8dc00, 0xfff72400, 0x00000000, 0x000cec00, 0xfff3ea8a,
0xffff2976, 0x00047cad, 0x000b8353, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0xfff6cb2e, 0xfff934d2, 0x00000000, 0x000cec00, 0xfff68000, 0xfffc9400, 0x0000babf, 0x000cfbcc,
0x00024974, 0x00000000, 0x00000000, 0x00000000, 0xfffa12a1, 0xfff61e13, 0xffffcf4d, 0x00000000,
0x000c79a0, 0xfffcac8c, 0xfff6d9d4, 0x00000000, 0x00015024, 0x0006d297, 0x000288dc, 0xfffe8e52,
0xfffaba3a, 0xfffc0cbd, 0xffffff20, 0x00000000, 0x00000000, 0x000b5c00, 0x000496d7, 0xfff5a25f,
0xfffa6acc, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0002abf1, 0x0005195f,
0xfff83aae, 0x00000000, 0x00089fb6, 0x0007604a, 0xfffffe47, 0xfffb0173, 0xfff94d6b, 0xfffd7586,
0xffff5219, 0x000319cc, 0x0003eed3, 0x0007529f, 0xfffedc08, 0xfff647f6, 0x00000000, 0x000392ce,
0x00000000, 0x00000000, 0xffffe91d, 0xfffe7c4a, 0xfffeaa9f, 0xffff4e33, 0xffffc1c5, 0x00007782,
0x00009619, 0x0001a857, 0x000129e9, 0x00000028, 0x00000000, 0x00000000, 0xffff6e70, 0xfffd3199,
0xffff5ff8, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00014b29,
0x0002acf3, 0x000007e2, 0xffffca5a, 0xfffcab73, 0xffff8a34, 0x00001b55, 0x0001b334, 0x0001449e,
0x0000434d, 0xffff62ec, 0xfffe1443, 0xffff325d, 0x00000000, 0x0002234a, 0x0001dcb6, 0xfffe2948,
0xfffdd6b8, 0x00000000, 0x00028cc0, 0x00017340, 0x00000000, 0x00000000, 0x00000000, 0xffffd2d6,
0xfffcadd0, 0xffff7f5c, 0x00007400, 0x00038c00, 0xfffe9260, 0xffff2da0, 0x0000023a, 0x0002259b,
0x0000182a, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfffdc600, 0xfffe3a00, 0x00000059,
0x0003a44d, 0x00005b59, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0xfffe33f3, 0xfffdcc0d, 0x00000000, 0x00033c02, 0x0000c3fe, 0x00000000,
0x00000000, 0xffffa13d, 0xfffeeec8, 0xffff8c02, 0xffff8c48, 0xffffc7b5, 0x00000000, 0xffff5b68,
0xffff3498, 0x00000000, 0x00033c00, 0x0000c400, 0xffff9bc4, 0xfffdf4a3, 0xfffe8df3, 0xffffe1a8,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00033c00,
0x000092c7, 0xfffcf373, 0xffff3dc7, 0x00000fcc, 0x00011ae7, 0x000130c3, 0x0000680d, 0x00004a59,
0x00000a20, 0xfffe9dc4, 0xfffe4a3c, 0x00000000, 0x00033c00, 0xfffe87ef, 0xfffe3c11, 0x0000105e,
0x0002b9c4, 0x000135dc, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfffe3600, 0xfffdca00,
0x00000000, 0x00033c00, 0xfffd9000, 0xffff3400, 0x0000e400, 0x00031c00, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0xfffe3600, 0xfffdca00, 0x00000000, 0x00033c00, 0xfffcf9a5,
0xffffca5b, 0x000120e6, 0x0002df1a, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0xfffdb195, 0xfffe4e6b, 0x00000000, 0x00033c00, 0xfffd9e00, 0xffff2600, 0x00002f0e, 0x00033ea3,
0x0000924d, 0x00000000, 0x00000000, 0x00000000, 0xfffe83b3, 0xfffd881d, 0xfffff431, 0x00000000,
0x00031f60, 0xffff297a, 0xfffdb726, 0x00000000, 0x000053a7, 0x0001b506, 0x0000a24b, 0xffffa32d,
0xfffead9b, 0xffff0479, 0xffffffc9, 0x00000000, 0x00000000, 0x0002d800, 0x0001249d, 0xfffd67bb,
0xfffe9baa, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0000ac03, 0x0001448b,
0xfffe0f70, 0x00000000, 0x000229ea, 0x0001d616, 0xffffff8c, 0xfffebf76, 0xfffe54d9, 0xffff5d9e,
0xffffd3eb, 0x0000c65e, 0x0000fc15, 0x0001d491, 0xffffb566, 0xfffd9433, 0x00000000, 0x0000e4ec,
}
var hardCodedFlIn16 = []float32{
@ -576,22 +576,22 @@ var hardCodedFlIn16 = []float32{
}
var fxMask16 = []uint32{
0x0000, 0x0000, 0x05c1, 0x66b4, 0xbc04, 0xe876, 0xf802, 0xda24, 0xb49a, 0x4a77, 0x0009, 0x0000, 0x0000,
0x0000, 0x2477, 0xd815, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xad13, 0x01f7, 0x0000,
0x0d64, 0xe290, 0xffff, 0xf937, 0x8c4f, 0x3b77, 0x2a9c, 0x51dc, 0xccc6, 0xffff, 0xffff, 0x772d, 0x0000,
0x75c9, 0xffff, 0xffff, 0x5d47, 0x0000, 0x0000, 0x0000, 0x0000, 0x0b43, 0xdfb0, 0xffff, 0xe33f, 0x0000,
0x5b67, 0x8fff, 0x8f71, 0x060a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x8dff, 0xffff, 0xffe9, 0x16d6,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x72c4, 0xffff, 0xffff, 0x313e,
0x0000, 0x0000, 0x0000, 0x17b0, 0x5bf7, 0x78f7, 0x95e5, 0xa3ff, 0xa3ff, 0xcd0e, 0xffff, 0xffff, 0x313f,
0x0000, 0x1927, 0x9bf6, 0xf86a, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x313f,
0x0c63, 0xcf9b, 0xffff, 0xfc12, 0xb594, 0x6961, 0x4f5e, 0x3cc7, 0x3a3f, 0x9280, 0xffff, 0xffff, 0x313f,
0x8eaf, 0xffff, 0xfbec, 0x4d2e, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x723f, 0xffff, 0xffff, 0x313f,
0xccbf, 0xffff, 0xc6bf, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x723f, 0xffff, 0xffff, 0x313f,
0xf297, 0xffff, 0xb834, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x934c, 0xffff, 0xffff, 0x313f,
0xc93f, 0xffff, 0xf453, 0x2497, 0x0000, 0x0000, 0x0000, 0x0000, 0x5ed5, 0xfcf4, 0xffff, 0xffff, 0x3865,
0x6d9c, 0xffff, 0xffff, 0xeafd, 0x7dd4, 0x5546, 0x6c61, 0xc0bd, 0xfff1, 0xffff, 0xffff, 0xffff, 0x4a3f,
0x00d2, 0xa6ac, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xd540, 0x83aa, 0xffff, 0xffff, 0x7604,
0x0000, 0x001b, 0x5004, 0xbb2d, 0xe3d5, 0xeeb3, 0xbd16, 0x7e29, 0x08ff, 0x1b3f, 0xb6bf, 0xb6bf, 0x7d92,
0x0000, 0x0000, 0x05b8, 0x66a6, 0xbbfe, 0xe871, 0xf800, 0xda20, 0xb499, 0x4a84, 0x0009, 0x0000, 0x0000,
0x0000, 0x2463, 0xd7fd, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xad35, 0x01f8, 0x0000,
0x0d69, 0xe28c, 0xffff, 0xf92a, 0x8c5d, 0x3b36, 0x2a62, 0x51a7, 0xcc97, 0xffff, 0xffff, 0x772d, 0x0000,
0x75ad, 0xffff, 0xffff, 0x5ccf, 0x0000, 0x0000, 0x0000, 0x0000, 0x0b4a, 0xdfd6, 0xffff, 0xe2ff, 0x0000,
0x5b67, 0x8fff, 0x8f70, 0x060a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x8e7f, 0xffff, 0xffe9, 0x16d6,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x7303, 0xffff, 0xffff, 0x30ff,
0x0000, 0x0000, 0x0000, 0x17b0, 0x5bfe, 0x78fe, 0x95ec, 0xa3fe, 0xa3fe, 0xcd24, 0xfffe, 0xfffe, 0x30fe,
0x0001, 0x190d, 0x9be5, 0xf868, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0x30fe,
0x0c4c, 0xcf6f, 0xfffe, 0xfc0b, 0xb551, 0x6920, 0x4f1d, 0x3c87, 0x39ff, 0x928e, 0xffff, 0xffff, 0x30ff,
0x8f03, 0xffff, 0xfbe7, 0x4d76, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x727f, 0xffff, 0xffff, 0x30ff,
0xccff, 0xffff, 0xc6ff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x727f, 0xffff, 0xffff, 0x30ff,
0xf296, 0xffff, 0xb7c6, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x939a, 0xffff, 0xffff, 0x30ff,
0xc97f, 0xffff, 0xf43c, 0x2493, 0x0000, 0x0000, 0x0000, 0x0000, 0x5f13, 0xfd0c, 0xffff, 0xffff, 0x3827,
0x6dc9, 0xffff, 0xffff, 0xeb16, 0x7dd4, 0x5541, 0x6c76, 0xc10f, 0xfff1, 0xffff, 0xffff, 0xffff, 0x49ff,
0x00d8, 0xa6e9, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xd4fe, 0x83db, 0xffff, 0xffff, 0x7584,
0x0000, 0x001c, 0x503e, 0xbb08, 0xe3a1, 0xeea6, 0xbd0e, 0x7e09, 0x08e5, 0x1b8b, 0xb67f, 0xb67f, 0x7d44,
}
var flMask16 = []uint32{

View File

@ -308,34 +308,34 @@ const (
fxClampAndScale256 = `
// y = abs(x)
// y >>= 12 // Shift by 2*ϕ - 8.
// y >>= 10 // Shift by 2*ϕ - 8.
// y = min(y, fxAlmost256)
//
// pabsd %xmm1,%xmm2
// psrld $0xc,%xmm2
// psrld $0xa,%xmm2
// pminud %xmm5,%xmm2
//
// Hopefully we'll get these opcode mnemonics into the assembler for Go
// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
// it's similar.
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0c
BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0a
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
`
fxClampAndScale65536 = `
// y = abs(x)
// y >>= 4 // Shift by 2*ϕ - 16.
// y >>= 2 // Shift by 2*ϕ - 16.
// y = min(y, fxAlmost65536)
//
// pabsd %xmm1,%xmm2
// psrld $0x4,%xmm2
// psrld $0x2,%xmm2
// pminud %xmm5,%xmm2
//
// Hopefully we'll get these opcode mnemonics into the assembler for Go
// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
// it's similar.
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x04
BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
`
flClampAndScale256 = `

View File

@ -19,11 +19,7 @@ const (
//
// When changing this number, also change the assembly code (search for ϕ
// in the .s files).
//
// TODO: drop ϕ from 10 to 9, so that ±1<<(3*ϕ+3) doesn't overflow an int32
// and we can therefore use int32 math instead of the slower int64 math in
// Rasterizer.fixedLineTo below.
ϕ = 10
ϕ = 9
fxOne int1ϕ = 1 << ϕ
fxOneAndAHalf int1ϕ = 1<<ϕ + 1<<(ϕ-1)
@ -146,13 +142,10 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) {
// In ideal math: buf[i] += uint32(d * (fxOne - a0 - am))
//
// (x1i == x0i+2) and (twoOverS == 2 * (x1 - x0)) implies
// that int64(twoOverS) ranges up to +1<<(1*ϕ+2).
//
// Convert to int64 to avoid overflow. Without that,
// TestRasterize30Degrees fails.
D := int64(twoOverS<<ϕ - oneMinusX0fSquared - x1fSquared) // D ranges up to ±1<<(2*ϕ+2).
D *= int64(d) // D ranges up to ±1<<(3*ϕ+2).
D /= int64(twoOverS)
// that twoOverS ranges up to +1<<(1*ϕ+2).
D := twoOverS<<ϕ - oneMinusX0fSquared - x1fSquared // D ranges up to ±1<<(2*ϕ+2).
D *= d // D ranges up to ±1<<(3*ϕ+2).
D /= twoOverS
buf[i] += uint32(D)
}
} else {
@ -194,12 +187,9 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) {
// Thus, A ranges up to ±1<<(2*ϕ+2). It is possible to
// derive a tighter bound, but this bound is sufficient to
// reason about overflow.
//
// Convert to int64 to avoid overflow. Without that,
// TestRasterizePolygon fails.
D := int64((fxOneAndAHalf-x0f)<<(ϕ+1) - oneMinusX0fSquared) // D ranges up to ±1<<(2*ϕ+2).
D *= int64(d) // D ranges up to ±1<<(3*ϕ+2).
D /= int64(twoOverS)
D := (fxOneAndAHalf-x0f)<<(ϕ+1) - oneMinusX0fSquared // D ranges up to ±1<<(2*ϕ+2).
D *= d // D ranges up to ±1<<(3*ϕ+2).
D /= twoOverS
buf[i] += uint32(D)
}
dTimesS := uint32((d << (2 * ϕ)) / oneOverS)
@ -255,15 +245,12 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) {
// greater than -fxOne<<2, or -1<<(ϕ+2). Thus, B ranges up
// to ±1<<(ϕ+2). One final simplification:
// B = x1f<<1 + (1<<(ϕ+2) - fxOneAndAHalf<<1)
//
// Convert to int64 to avoid overflow. Without that,
// TestRasterizePolygon fails.
const C = 1<<(ϕ+2) - fxOneAndAHalf<<1
D := int64(x1f<<1 + C) // D ranges up to ±1<<(1*ϕ+2).
D <<= ϕ // D ranges up to ±1<<(2*ϕ+2).
D -= int64(x1fSquared) // D ranges up to ±1<<(2*ϕ+3).
D *= int64(d) // D ranges up to ±1<<(3*ϕ+3).
D /= int64(twoOverS)
D := x1f<<1 + C // D ranges up to ±1<<(1*ϕ+2).
D <<= ϕ // D ranges up to ±1<<(2*ϕ+2).
D -= x1fSquared // D ranges up to ±1<<(2*ϕ+3).
D *= d // D ranges up to ±1<<(3*ϕ+3).
D /= twoOverS
buf[i] += uint32(D)
}
}

View File

@ -46,9 +46,9 @@ import (
//
// The rationale for this particular value is that TestRasterizePolygon in
// vector_test.go checks the rendering quality of polygon edges at various
// angles, inscribed in a circle of diameter 2048. It may be that a higher
// value would still produce acceptable quality, but 2048 seems to work.
const floatingPointMathThreshold = 2048
// angles, inscribed in a circle of diameter 512. It may be that a higher value
// would still produce acceptable quality, but 512 seems to work.
const floatingPointMathThreshold = 512
func midPoint(p, q f32.Vec2) f32.Vec2 {
return f32.Vec2{

View File

@ -88,7 +88,7 @@ func TestRasterizeOutOfBounds(t *testing.T) {
func TestRasterizePolygon(t *testing.T) {
var z Rasterizer
for radius := 4; radius <= 1024; radius *= 2 {
for radius := 4; radius <= 256; radius *= 2 {
for n := 3; n <= 19; n += 4 {
z.Reset(2*radius, 2*radius)
z.MoveTo(f32.Vec2{