From 8874bef159af77ab9e746fc3c80b2219b3652045 Mon Sep 17 00:00:00 2001 From: Nigel Tao Date: Thu, 20 Oct 2016 10:34:23 +1100 Subject: [PATCH] =?UTF-8?q?vector:=20change=20=CF=95=20from=2010=20to=209.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This slight loss in quality allows us to use int32 math exclusively throughout raster_fixed.go, instead of occasionally dropping into int64 math. The change in ϕ doesn't affect the benchmarks noticeably, but staying in int32 does. The net effect: name old time/op new time/op delta GlyphAlpha16Over-8 3.36µs ± 0% 2.99µs ± 0% -10.89% (p=0.000 n=10+9) GlyphAlpha16Src-8 3.26µs ± 0% 2.89µs ± 1% -11.34% (p=0.000 n=9+10) GlyphAlpha32Over-8 5.20µs ± 0% 4.53µs ± 0% -12.76% (p=0.000 n=8+10) GlyphAlpha32Src-8 4.81µs ± 1% 4.14µs ± 0% -13.91% (p=0.000 n=9+9) GlyphAlpha64Over-8 10.2µs ± 0% 9.0µs ± 1% -11.99% (p=0.000 n=9+10) GlyphAlpha64Src-8 8.62µs ± 0% 7.42µs ± 1% -13.89% (p=0.000 n=9+10) GlyphAlpha128Over-8 24.1µs ± 0% 21.8µs ± 0% -9.32% (p=0.000 n=9+9) GlyphAlpha128Src-8 17.9µs ± 0% 15.6µs ± 0% -12.68% (p=0.000 n=9+10) GlyphAlpha256Over-8 70.1µs ± 0% 66.3µs ± 1% -5.44% (p=0.000 n=10+10) GlyphAlpha256Src-8 45.2µs ± 1% 41.2µs ± 1% -8.92% (p=0.000 n=10+10) GlyphRGBA16Over-8 5.12µs ± 0% 4.75µs ± 0% -7.15% (p=0.000 n=10+9) GlyphRGBA16Src-8 4.57µs ± 1% 4.20µs ± 0% -8.18% (p=0.000 n=9+8) GlyphRGBA32Over-8 12.1µs ± 0% 11.4µs ± 0% -5.50% (p=0.000 n=10+9) GlyphRGBA32Src-8 10.0µs ± 0% 9.3µs ± 1% -6.80% (p=0.000 n=10+9) GlyphRGBA64Over-8 37.2µs ± 0% 36.0µs ± 0% -3.17% (p=0.000 n=9+8) GlyphRGBA64Src-8 29.0µs ± 1% 27.9µs ± 1% -4.05% (p=0.000 n=9+10) GlyphRGBA128Over-8 134µs ± 1% 131µs ± 0% -1.85% (p=0.000 n=9+9) GlyphRGBA128Src-8 100µs ± 1% 98µs ± 0% -2.27% (p=0.000 n=10+9) GlyphRGBA256Over-8 506µs ± 0% 503µs ± 0% -0.56% (p=0.000 n=10+8) GlyphRGBA256Src-8 373µs ± 0% 370µs ± 0% -1.01% (p=0.000 n=10+9) Change-Id: Ie02afac6fd6fa95f090bf3fe0a5c744799ea9dc5 Reviewed-on: 
https://go-review.googlesource.com/31532 Reviewed-by: David Crawshaw --- vector/acc_amd64.s | 36 +++++++------- vector/acc_test.go | 108 ++++++++++++++++++++--------------------- vector/gen.go | 12 ++--- vector/raster_fixed.go | 39 +++++---------- vector/vector.go | 6 +-- vector/vector_test.go | 2 +- 6 files changed, 95 insertions(+), 108 deletions(-) diff --git a/vector/acc_amd64.s b/vector/acc_amd64.s index 31b9c6e..8d9719c 100644 --- a/vector/acc_amd64.s +++ b/vector/acc_amd64.s @@ -143,18 +143,18 @@ fxAccOpOverLoop4: PADDD X7, X1 // y = abs(x) - // y >>= 4 // Shift by 2*ϕ - 16. + // y >>= 2 // Shift by 2*ϕ - 16. // y = min(y, fxAlmost65536) // // pabsd %xmm1,%xmm2 - // psrld $0x4,%xmm2 + // psrld $0x2,%xmm2 // pminud %xmm5,%xmm2 // // Hopefully we'll get these opcode mnemonics into the assembler for Go // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but // it's similar. BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 - BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x04 + BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 // z = convertToInt32(y) @@ -230,18 +230,18 @@ fxAccOpOverLoop1: PADDD X7, X1 // y = abs(x) - // y >>= 4 // Shift by 2*ϕ - 16. + // y >>= 2 // Shift by 2*ϕ - 16. // y = min(y, fxAlmost65536) // // pabsd %xmm1,%xmm2 - // psrld $0x4,%xmm2 + // psrld $0x2,%xmm2 // pminud %xmm5,%xmm2 // // Hopefully we'll get these opcode mnemonics into the assembler for Go // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but // it's similar. BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 - BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x04 + BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 // z = convertToInt32(y) @@ -353,18 +353,18 @@ fxAccOpSrcLoop4: PADDD X7, X1 // y = abs(x) - // y >>= 12 // Shift by 2*ϕ - 8. + // y >>= 10 // Shift by 2*ϕ - 8. 
// y = min(y, fxAlmost256) // // pabsd %xmm1,%xmm2 - // psrld $0xc,%xmm2 + // psrld $0xa,%xmm2 // pminud %xmm5,%xmm2 // // Hopefully we'll get these opcode mnemonics into the assembler for Go // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but // it's similar. BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 - BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0c + BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0a BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 // z = convertToInt32(y) @@ -397,18 +397,18 @@ fxAccOpSrcLoop1: PADDD X7, X1 // y = abs(x) - // y >>= 12 // Shift by 2*ϕ - 8. + // y >>= 10 // Shift by 2*ϕ - 8. // y = min(y, fxAlmost256) // // pabsd %xmm1,%xmm2 - // psrld $0xc,%xmm2 + // psrld $0xa,%xmm2 // pminud %xmm5,%xmm2 // // Hopefully we'll get these opcode mnemonics into the assembler for Go // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but // it's similar. BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 - BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0c + BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0a BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 // z = convertToInt32(y) @@ -498,18 +498,18 @@ fxAccMaskLoop4: PADDD X7, X1 // y = abs(x) - // y >>= 4 // Shift by 2*ϕ - 16. + // y >>= 2 // Shift by 2*ϕ - 16. // y = min(y, fxAlmost65536) // // pabsd %xmm1,%xmm2 - // psrld $0x4,%xmm2 + // psrld $0x2,%xmm2 // pminud %xmm5,%xmm2 // // Hopefully we'll get these opcode mnemonics into the assembler for Go // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but // it's similar. 
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 - BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x04 + BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 // z = convertToInt32(y) @@ -540,18 +540,18 @@ fxAccMaskLoop1: PADDD X7, X1 // y = abs(x) - // y >>= 4 // Shift by 2*ϕ - 16. + // y >>= 2 // Shift by 2*ϕ - 16. // y = min(y, fxAlmost65536) // // pabsd %xmm1,%xmm2 - // psrld $0x4,%xmm2 + // psrld $0x2,%xmm2 // pminud %xmm5,%xmm2 // // Hopefully we'll get these opcode mnemonics into the assembler for Go // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but // it's similar. BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 - BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x04 + BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 // z = convertToInt32(y) diff --git a/vector/acc_test.go b/vector/acc_test.go index db238f7..1dce439 100644 --- a/vector/acc_test.go +++ b/vector/acc_test.go @@ -370,18 +370,18 @@ func itou(i int2ϕ) uint32 { } var fxInShort = []uint32{ - itou(+0x020000), // +0.125, // Running sum: +0.125 - itou(-0x080000), // -0.500, // Running sum: -0.375 - itou(+0x040000), // +0.250, // Running sum: -0.125 - itou(+0x060000), // +0.375, // Running sum: +0.250 - itou(+0x020000), // +0.125, // Running sum: +0.375 - itou(+0x000000), // +0.000, // Running sum: +0.375 - itou(-0x100000), // -1.000, // Running sum: -0.625 - itou(-0x080000), // -0.500, // Running sum: -1.125 - itou(+0x040000), // +0.250, // Running sum: -0.875 - itou(+0x0e0000), // +0.875, // Running sum: +0.000 - itou(+0x040000), // +0.250, // Running sum: +0.250 - itou(+0x0c0000), // +0.750, // Running sum: +1.000 + itou(+0x08000), // +0.125, // Running sum: +0.125 + itou(-0x20000), // -0.500, // Running sum: -0.375 + itou(+0x10000), // +0.250, // Running sum: -0.125 + itou(+0x18000), // +0.375, // Running sum: 
+0.250 + itou(+0x08000), // +0.125, // Running sum: +0.375 + itou(+0x00000), // +0.000, // Running sum: +0.375 + itou(-0x40000), // -1.000, // Running sum: -0.625 + itou(-0x20000), // -0.500, // Running sum: -1.125 + itou(+0x10000), // +0.250, // Running sum: -0.875 + itou(+0x38000), // +0.875, // Running sum: +0.000 + itou(+0x10000), // +0.250, // Running sum: +0.250 + itou(+0x30000), // +0.750, // Running sum: +1.000 } var flInShort = []float32{ @@ -518,32 +518,32 @@ var ( ) var hardCodedFxIn16 = []uint32{ - 0x00000000, 0x00000000, 0xffffa3ee, 0xfff9f0c9, 0xfffaaafc, 0xfffd38ec, 0xffff073f, 0x0001dddf, - 0x0002589a, 0x0006a22c, 0x0004a6df, 0x000000a0, 0x00000000, 0x00000000, 0xfffdb883, 0xfff4c620, - 0xfffd815f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00052ec6, - 0x000ab1ba, 0x00001f7f, 0xffff29b7, 0xfff2ad44, 0xfffe2906, 0x00006c84, 0x0006ce82, 0x00050d7b, - 0x00010db4, 0xfffd8c05, 0xfff85159, 0xfffccc6d, 0x00000000, 0x00088d28, 0x000772d8, 0xfff8a36a, - 0xfff75c96, 0x00000000, 0x000a2b80, 0x0005d480, 0x00000000, 0x00000000, 0x00000000, 0xffff4bbf, - 0xfff2b937, 0xfffdfb0b, 0x0001cc00, 0x000e3400, 0xfffa4980, 0xfffcb680, 0x000008e8, 0x0008966f, - 0x000060a8, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfff72000, 0xfff8e000, 0x00000165, - 0x000e9134, 0x00016d65, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0xfff8d3be, 0xfff72c42, 0x00000000, 0x000cec0f, 0x000313f1, 0x00000000, - 0x00000000, 0xfffe84f1, 0xfffbbb8f, 0xfffe3008, 0xfffe311b, 0xffff1e60, 0x00000000, 0xfffd6f10, - 0xfffcd0f0, 0x00000000, 0x000cec00, 0x00031400, 0xfffe6d8a, 0xfff7d307, 0xfffa38bf, 0xffff86b3, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000cec00, - 0x00024dc4, 0xfff3cc79, 0xfffcf9c4, 0x00003ed0, 0x000467df, 0x0004c32f, 0x0001a038, 0x00012964, - 0x00002883, 0xfffa7bf1, 0xfff9280f, 0x00000000, 0x000cec00, 0xfffa2901, 0xfff8eaff, 0x00004138, - 0x000aebd5, 
0x0004d2f2, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfff8dc00, 0xfff72400, - 0x00000000, 0x000cec00, 0xfff64800, 0xfffccc00, 0x00039400, 0x000c6c00, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0xfff8dc00, 0xfff72400, 0x00000000, 0x000cec00, 0xfff3ea8a, - 0xffff2976, 0x00047cad, 0x000b8353, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0xfff6cb2e, 0xfff934d2, 0x00000000, 0x000cec00, 0xfff68000, 0xfffc9400, 0x0000babf, 0x000cfbcc, - 0x00024974, 0x00000000, 0x00000000, 0x00000000, 0xfffa12a1, 0xfff61e13, 0xffffcf4d, 0x00000000, - 0x000c79a0, 0xfffcac8c, 0xfff6d9d4, 0x00000000, 0x00015024, 0x0006d297, 0x000288dc, 0xfffe8e52, - 0xfffaba3a, 0xfffc0cbd, 0xffffff20, 0x00000000, 0x00000000, 0x000b5c00, 0x000496d7, 0xfff5a25f, - 0xfffa6acc, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0002abf1, 0x0005195f, - 0xfff83aae, 0x00000000, 0x00089fb6, 0x0007604a, 0xfffffe47, 0xfffb0173, 0xfff94d6b, 0xfffd7586, - 0xffff5219, 0x000319cc, 0x0003eed3, 0x0007529f, 0xfffedc08, 0xfff647f6, 0x00000000, 0x000392ce, + 0x00000000, 0x00000000, 0xffffe91d, 0xfffe7c4a, 0xfffeaa9f, 0xffff4e33, 0xffffc1c5, 0x00007782, + 0x00009619, 0x0001a857, 0x000129e9, 0x00000028, 0x00000000, 0x00000000, 0xffff6e70, 0xfffd3199, + 0xffff5ff8, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00014b29, + 0x0002acf3, 0x000007e2, 0xffffca5a, 0xfffcab73, 0xffff8a34, 0x00001b55, 0x0001b334, 0x0001449e, + 0x0000434d, 0xffff62ec, 0xfffe1443, 0xffff325d, 0x00000000, 0x0002234a, 0x0001dcb6, 0xfffe2948, + 0xfffdd6b8, 0x00000000, 0x00028cc0, 0x00017340, 0x00000000, 0x00000000, 0x00000000, 0xffffd2d6, + 0xfffcadd0, 0xffff7f5c, 0x00007400, 0x00038c00, 0xfffe9260, 0xffff2da0, 0x0000023a, 0x0002259b, + 0x0000182a, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfffdc600, 0xfffe3a00, 0x00000059, + 0x0003a44d, 0x00005b59, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0xfffe33f3, 0xfffdcc0d, 
0x00000000, 0x00033c02, 0x0000c3fe, 0x00000000, + 0x00000000, 0xffffa13d, 0xfffeeec8, 0xffff8c02, 0xffff8c48, 0xffffc7b5, 0x00000000, 0xffff5b68, + 0xffff3498, 0x00000000, 0x00033c00, 0x0000c400, 0xffff9bc4, 0xfffdf4a3, 0xfffe8df3, 0xffffe1a8, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00033c00, + 0x000092c7, 0xfffcf373, 0xffff3dc7, 0x00000fcc, 0x00011ae7, 0x000130c3, 0x0000680d, 0x00004a59, + 0x00000a20, 0xfffe9dc4, 0xfffe4a3c, 0x00000000, 0x00033c00, 0xfffe87ef, 0xfffe3c11, 0x0000105e, + 0x0002b9c4, 0x000135dc, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfffe3600, 0xfffdca00, + 0x00000000, 0x00033c00, 0xfffd9000, 0xffff3400, 0x0000e400, 0x00031c00, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0xfffe3600, 0xfffdca00, 0x00000000, 0x00033c00, 0xfffcf9a5, + 0xffffca5b, 0x000120e6, 0x0002df1a, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xfffdb195, 0xfffe4e6b, 0x00000000, 0x00033c00, 0xfffd9e00, 0xffff2600, 0x00002f0e, 0x00033ea3, + 0x0000924d, 0x00000000, 0x00000000, 0x00000000, 0xfffe83b3, 0xfffd881d, 0xfffff431, 0x00000000, + 0x00031f60, 0xffff297a, 0xfffdb726, 0x00000000, 0x000053a7, 0x0001b506, 0x0000a24b, 0xffffa32d, + 0xfffead9b, 0xffff0479, 0xffffffc9, 0x00000000, 0x00000000, 0x0002d800, 0x0001249d, 0xfffd67bb, + 0xfffe9baa, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0000ac03, 0x0001448b, + 0xfffe0f70, 0x00000000, 0x000229ea, 0x0001d616, 0xffffff8c, 0xfffebf76, 0xfffe54d9, 0xffff5d9e, + 0xffffd3eb, 0x0000c65e, 0x0000fc15, 0x0001d491, 0xffffb566, 0xfffd9433, 0x00000000, 0x0000e4ec, } var hardCodedFlIn16 = []float32{ @@ -576,22 +576,22 @@ var hardCodedFlIn16 = []float32{ } var fxMask16 = []uint32{ - 0x0000, 0x0000, 0x05c1, 0x66b4, 0xbc04, 0xe876, 0xf802, 0xda24, 0xb49a, 0x4a77, 0x0009, 0x0000, 0x0000, - 0x0000, 0x2477, 0xd815, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xad13, 0x01f7, 0x0000, - 0x0d64, 0xe290, 0xffff, 0xf937, 0x8c4f, 0x3b77, 
0x2a9c, 0x51dc, 0xccc6, 0xffff, 0xffff, 0x772d, 0x0000, - 0x75c9, 0xffff, 0xffff, 0x5d47, 0x0000, 0x0000, 0x0000, 0x0000, 0x0b43, 0xdfb0, 0xffff, 0xe33f, 0x0000, - 0x5b67, 0x8fff, 0x8f71, 0x060a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x8dff, 0xffff, 0xffe9, 0x16d6, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x72c4, 0xffff, 0xffff, 0x313e, - 0x0000, 0x0000, 0x0000, 0x17b0, 0x5bf7, 0x78f7, 0x95e5, 0xa3ff, 0xa3ff, 0xcd0e, 0xffff, 0xffff, 0x313f, - 0x0000, 0x1927, 0x9bf6, 0xf86a, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x313f, - 0x0c63, 0xcf9b, 0xffff, 0xfc12, 0xb594, 0x6961, 0x4f5e, 0x3cc7, 0x3a3f, 0x9280, 0xffff, 0xffff, 0x313f, - 0x8eaf, 0xffff, 0xfbec, 0x4d2e, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x723f, 0xffff, 0xffff, 0x313f, - 0xccbf, 0xffff, 0xc6bf, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x723f, 0xffff, 0xffff, 0x313f, - 0xf297, 0xffff, 0xb834, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x934c, 0xffff, 0xffff, 0x313f, - 0xc93f, 0xffff, 0xf453, 0x2497, 0x0000, 0x0000, 0x0000, 0x0000, 0x5ed5, 0xfcf4, 0xffff, 0xffff, 0x3865, - 0x6d9c, 0xffff, 0xffff, 0xeafd, 0x7dd4, 0x5546, 0x6c61, 0xc0bd, 0xfff1, 0xffff, 0xffff, 0xffff, 0x4a3f, - 0x00d2, 0xa6ac, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xd540, 0x83aa, 0xffff, 0xffff, 0x7604, - 0x0000, 0x001b, 0x5004, 0xbb2d, 0xe3d5, 0xeeb3, 0xbd16, 0x7e29, 0x08ff, 0x1b3f, 0xb6bf, 0xb6bf, 0x7d92, + 0x0000, 0x0000, 0x05b8, 0x66a6, 0xbbfe, 0xe871, 0xf800, 0xda20, 0xb499, 0x4a84, 0x0009, 0x0000, 0x0000, + 0x0000, 0x2463, 0xd7fd, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xad35, 0x01f8, 0x0000, + 0x0d69, 0xe28c, 0xffff, 0xf92a, 0x8c5d, 0x3b36, 0x2a62, 0x51a7, 0xcc97, 0xffff, 0xffff, 0x772d, 0x0000, + 0x75ad, 0xffff, 0xffff, 0x5ccf, 0x0000, 0x0000, 0x0000, 0x0000, 0x0b4a, 0xdfd6, 0xffff, 0xe2ff, 0x0000, + 0x5b67, 0x8fff, 0x8f70, 0x060a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x8e7f, 0xffff, 0xffe9, 0x16d6, + 0x0000, 0x0000, 0x0000, 0x0000, 
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x7303, 0xffff, 0xffff, 0x30ff, + 0x0000, 0x0000, 0x0000, 0x17b0, 0x5bfe, 0x78fe, 0x95ec, 0xa3fe, 0xa3fe, 0xcd24, 0xfffe, 0xfffe, 0x30fe, + 0x0001, 0x190d, 0x9be5, 0xf868, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0x30fe, + 0x0c4c, 0xcf6f, 0xfffe, 0xfc0b, 0xb551, 0x6920, 0x4f1d, 0x3c87, 0x39ff, 0x928e, 0xffff, 0xffff, 0x30ff, + 0x8f03, 0xffff, 0xfbe7, 0x4d76, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x727f, 0xffff, 0xffff, 0x30ff, + 0xccff, 0xffff, 0xc6ff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x727f, 0xffff, 0xffff, 0x30ff, + 0xf296, 0xffff, 0xb7c6, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x939a, 0xffff, 0xffff, 0x30ff, + 0xc97f, 0xffff, 0xf43c, 0x2493, 0x0000, 0x0000, 0x0000, 0x0000, 0x5f13, 0xfd0c, 0xffff, 0xffff, 0x3827, + 0x6dc9, 0xffff, 0xffff, 0xeb16, 0x7dd4, 0x5541, 0x6c76, 0xc10f, 0xfff1, 0xffff, 0xffff, 0xffff, 0x49ff, + 0x00d8, 0xa6e9, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xd4fe, 0x83db, 0xffff, 0xffff, 0x7584, + 0x0000, 0x001c, 0x503e, 0xbb08, 0xe3a1, 0xeea6, 0xbd0e, 0x7e09, 0x08e5, 0x1b8b, 0xb67f, 0xb67f, 0x7d44, } var flMask16 = []uint32{ diff --git a/vector/gen.go b/vector/gen.go index 355226a..92be417 100644 --- a/vector/gen.go +++ b/vector/gen.go @@ -308,34 +308,34 @@ const ( fxClampAndScale256 = ` // y = abs(x) - // y >>= 12 // Shift by 2*ϕ - 8. + // y >>= 10 // Shift by 2*ϕ - 8. // y = min(y, fxAlmost256) // // pabsd %xmm1,%xmm2 - // psrld $0xc,%xmm2 + // psrld $0xa,%xmm2 // pminud %xmm5,%xmm2 // // Hopefully we'll get these opcode mnemonics into the assembler for Go // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but // it's similar. 
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 - BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0c + BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0a BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 ` fxClampAndScale65536 = ` // y = abs(x) - // y >>= 4 // Shift by 2*ϕ - 16. + // y >>= 2 // Shift by 2*ϕ - 16. // y = min(y, fxAlmost65536) // // pabsd %xmm1,%xmm2 - // psrld $0x4,%xmm2 + // psrld $0x2,%xmm2 // pminud %xmm5,%xmm2 // // Hopefully we'll get these opcode mnemonics into the assembler for Go // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but // it's similar. BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 - BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x04 + BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 ` flClampAndScale256 = ` diff --git a/vector/raster_fixed.go b/vector/raster_fixed.go index fe5f75a..086b9fc 100644 --- a/vector/raster_fixed.go +++ b/vector/raster_fixed.go @@ -19,11 +19,7 @@ const ( // // When changing this number, also change the assembly code (search for ϕ // in the .s files). - // - // TODO: drop ϕ from 10 to 9, so that ±1<<(3*ϕ+3) doesn't overflow an int32 - // and we can therefore use int32 math instead of the slower int64 math in - // Rasterizer.fixedLineTo below. - ϕ = 10 + ϕ = 9 fxOne int1ϕ = 1 << ϕ fxOneAndAHalf int1ϕ = 1<<ϕ + 1<<(ϕ-1) @@ -146,13 +142,10 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) { // In ideal math: buf[i] += uint32(d * (fxOne - a0 - am)) // // (x1i == x0i+2) and (twoOverS == 2 * (x1 - x0)) implies - // that int64(twoOverS) ranges up to +1<<(1*ϕ+2). - // - // Convert to int64 to avoid overflow. Without that, - // TestRasterize30Degrees fails. - D := int64(twoOverS<<ϕ - oneMinusX0fSquared - x1fSquared) // D ranges up to ±1<<(2*ϕ+2). - D *= int64(d) // D ranges up to ±1<<(3*ϕ+2). - D /= int64(twoOverS) + // that twoOverS ranges up to +1<<(1*ϕ+2). 
+ D := twoOverS<<ϕ - oneMinusX0fSquared - x1fSquared // D ranges up to ±1<<(2*ϕ+2). + D *= d // D ranges up to ±1<<(3*ϕ+2). + D /= twoOverS buf[i] += uint32(D) } } else { @@ -194,12 +187,9 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) { // Thus, A ranges up to ±1<<(2*ϕ+2). It is possible to // derive a tighter bound, but this bound is sufficient to // reason about overflow. - // - // Convert to int64 to avoid overflow. Without that, - // TestRasterizePolygon fails. - D := int64((fxOneAndAHalf-x0f)<<(ϕ+1) - oneMinusX0fSquared) // D ranges up to ±1<<(2*ϕ+2). - D *= int64(d) // D ranges up to ±1<<(3*ϕ+2). - D /= int64(twoOverS) + D := (fxOneAndAHalf-x0f)<<(ϕ+1) - oneMinusX0fSquared // D ranges up to ±1<<(2*ϕ+2). + D *= d // D ranges up to ±1<<(3*ϕ+2). + D /= twoOverS buf[i] += uint32(D) } dTimesS := uint32((d << (2 * ϕ)) / oneOverS) @@ -255,15 +245,12 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) { // greater than -fxOne<<2, or -1<<(ϕ+2). Thus, B ranges up // to ±1<<(ϕ+2). One final simplification: // B = x1f<<1 + (1<<(ϕ+2) - fxOneAndAHalf<<1) - // - // Convert to int64 to avoid overflow. Without that, - // TestRasterizePolygon fails. const C = 1<<(ϕ+2) - fxOneAndAHalf<<1 - D := int64(x1f<<1 + C) // D ranges up to ±1<<(1*ϕ+2). - D <<= ϕ // D ranges up to ±1<<(2*ϕ+2). - D -= int64(x1fSquared) // D ranges up to ±1<<(2*ϕ+3). - D *= int64(d) // D ranges up to ±1<<(3*ϕ+3). - D /= int64(twoOverS) + D := x1f<<1 + C // D ranges up to ±1<<(1*ϕ+2). + D <<= ϕ // D ranges up to ±1<<(2*ϕ+2). + D -= x1fSquared // D ranges up to ±1<<(2*ϕ+3). + D *= d // D ranges up to ±1<<(3*ϕ+3). + D /= twoOverS buf[i] += uint32(D) } } diff --git a/vector/vector.go b/vector/vector.go index 8538bb9..418a956 100644 --- a/vector/vector.go +++ b/vector/vector.go @@ -46,9 +46,9 @@ import ( // // The rationale for this particular value is that TestRasterizePolygon in // vector_test.go checks the rendering quality of polygon edges at various -// angles, inscribed in a circle of diameter 2048. 
It may be that a higher -// value would still produce acceptable quality, but 2048 seems to work. -const floatingPointMathThreshold = 2048 +// angles, inscribed in a circle of diameter 512. It may be that a higher value +// would still produce acceptable quality, but 512 seems to work. +const floatingPointMathThreshold = 512 func midPoint(p, q f32.Vec2) f32.Vec2 { return f32.Vec2{ diff --git a/vector/vector_test.go b/vector/vector_test.go index e2cbbdb..f84d040 100644 --- a/vector/vector_test.go +++ b/vector/vector_test.go @@ -88,7 +88,7 @@ func TestRasterizeOutOfBounds(t *testing.T) { func TestRasterizePolygon(t *testing.T) { var z Rasterizer - for radius := 4; radius <= 1024; radius *= 2 { + for radius := 4; radius <= 256; radius *= 2 { for n := 3; n <= 19; n += 4 { z.Reset(2*radius, 2*radius) z.MoveTo(f32.Vec2{