vector: simplify fixedLineTo computation.

name                              old time/op  new time/op   delta
GlyphAlpha16Over-8                3.38µs ± 0%   3.36µs ± 0%   -0.54%        (p=0.000 n=10+10)
GlyphAlpha16Src-8                 3.28µs ± 0%   3.26µs ± 0%   -0.69%         (p=0.000 n=10+9)
GlyphAlpha32Over-8                5.23µs ± 1%   5.20µs ± 0%   -0.58%         (p=0.000 n=10+8)
GlyphAlpha32Src-8                 4.83µs ± 0%   4.81µs ± 1%   -0.46%         (p=0.001 n=10+9)
GlyphAlpha64Over-8                10.2µs ± 0%   10.2µs ± 0%   -0.21%         (p=0.003 n=10+9)
GlyphAlpha64Src-8                 8.68µs ± 1%   8.62µs ± 0%   -0.70%         (p=0.000 n=10+9)
GlyphAlpha128Over-8               24.2µs ± 1%   24.1µs ± 0%   -0.58%         (p=0.001 n=10+9)
GlyphAlpha128Src-8                18.0µs ± 1%   17.9µs ± 0%   -0.61%         (p=0.001 n=10+9)
GlyphAlpha256Over-8               70.3µs ± 0%   70.1µs ± 0%   -0.37%        (p=0.019 n=10+10)
GlyphAlpha256Src-8                45.4µs ± 0%   45.2µs ± 1%   -0.38%         (p=0.041 n=8+10)
GlyphRGBA16Over-8                 5.14µs ± 0%   5.12µs ± 0%   -0.43%         (p=0.000 n=9+10)
GlyphRGBA16Src-8                  4.59µs ± 0%   4.57µs ± 1%   -0.43%          (p=0.005 n=9+9)
GlyphRGBA32Over-8                 12.2µs ± 1%   12.1µs ± 0%   -0.70%        (p=0.000 n=10+10)
GlyphRGBA32Src-8                  10.0µs ± 0%   10.0µs ± 0%     ~            (p=0.092 n=7+10)
GlyphRGBA64Over-8                 37.5µs ± 1%   37.2µs ± 0%   -0.75%         (p=0.000 n=10+9)
GlyphRGBA64Src-8                  29.1µs ± 0%   29.0µs ± 1%     ~            (p=0.243 n=10+9)
GlyphRGBA128Over-8                 135µs ± 0%    134µs ± 1%   -0.72%          (p=0.000 n=9+9)
GlyphRGBA128Src-8                  101µs ± 1%    100µs ± 1%     ~           (p=0.197 n=10+10)
GlyphRGBA256Over-8                 511µs ± 0%    506µs ± 0%   -0.97%        (p=0.000 n=10+10)
GlyphRGBA256Src-8                  374µs ± 0%    373µs ± 0%   -0.29%        (p=0.002 n=10+10)

Change-Id: Ic05a900935cb59e55711374db1e62b055d75c8e3
Reviewed-on: https://go-review.googlesource.com/31116
Reviewed-by: David Crawshaw <crawshaw@golang.org>
This commit is contained in:
Nigel Tao 2016-10-15 13:23:35 +11:00
parent f72412cfe3
commit fa54d6fa1c

View File

@ -20,7 +20,7 @@ const (
// When changing this number, also change the assembly code (search for ϕ
// in the .s files).
//
// TODO: drop ϕ from 10 to 9, so that ±1<<(3*ϕ+2) doesn't overflow an int32
// TODO: drop ϕ from 10 to 9, so that ±1<<(3*ϕ+3) doesn't overflow an int32
// and we can therefore use int32 math instead of the slower int64 math in
// Rasterizer.fixedLineTo below.
ϕ = 10
@ -228,26 +228,41 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) {
// = twoOverS<<ϕ - a1*twoOverS - (int1ϕ(x1i-x0i-3)<<(2*ϕ))*2 - x1f*x1f
// = twoOverS<<ϕ - a1*twoOverS - int1ϕ(x1i-x0i-3)<<(2*ϕ+1) - x1f*x1f
// Substituting for a1, given above, yields:
// A = twoOverS<<ϕ - ((fxOneAndAHalf - x0f)<<ϕ)*2 - int1ϕ(x1i-x0i-3)<<(2*ϕ+1) - x1f*x1f
// = twoOverS<<ϕ - (fxOneAndAHalf - x0f)<<(ϕ+1) - int1ϕ(x1i-x0i-3)<<(2*ϕ+1) - x1f*x1f
//
// TODO: re-factor that equation some more: twoOverS equals
// 2*(x1-x0), so a substantial part of twoOverS<<ϕ and
// int1ϕ(x1i-x0i-3)<<(2*ϕ+1) should cancel each other out.
// Doing subtract-then-shift instead of shift-then-subtract
// could mean that we can use the faster int32 math,
// instead of int64, but still avoid overflow:
// A = B<<ϕ - x1f*x1f
// A = twoOverS<<ϕ - ((fxOneAndAHalf-x0f)<<ϕ)*2 - int1ϕ(x1i-x0i-3)<<(2*ϕ+1) - x1f*x1f
// = twoOverS<<ϕ - (fxOneAndAHalf-x0f)<<(ϕ+1) - int1ϕ(x1i-x0i-3)<<(2*ϕ+1) - x1f*x1f
// = B<<ϕ - x1f*x1f
// where
// B = twoOverS - (fxOneAndAHalf - x0f)<<1 - int1ϕ(x1i-x0i-3)<<(ϕ+1)
// B = twoOverS - (fxOneAndAHalf-x0f)<<1 - int1ϕ(x1i-x0i-3)<<(ϕ+1)
// = (x1-x0)<<1 - (fxOneAndAHalf-x0f)<<1 - int1ϕ(x1i-x0i-3)<<(ϕ+1)
//
// Re-arranging the definitions given above:
// x0Floor := int1ϕ(x0i) << ϕ
// x0f := x0 - x0Floor
// x1Ceil := int1ϕ(x1i) << ϕ
// x1f := x1 - x1Ceil + fxOne
// combined with fxOne = 1<<ϕ yields:
// x0 = x0f + int1ϕ(x0i)<<ϕ
// x1 = x1f + int1ϕ(x1i-1)<<ϕ
// so that expanding (x1-x0) yields:
// B = (x1f-x0f + int1ϕ(x1i-x0i-1)<<ϕ)<<1 - (fxOneAndAHalf-x0f)<<1 - int1ϕ(x1i-x0i-3)<<(ϕ+1)
// = (x1f-x0f)<<1 + int1ϕ(x1i-x0i-1)<<(ϕ+1) - (fxOneAndAHalf-x0f)<<1 - int1ϕ(x1i-x0i-3)<<(ϕ+1)
// A large part of the second and fourth terms cancel:
// B = (x1f-x0f)<<1 - (fxOneAndAHalf-x0f)<<1 - int1ϕ(-2)<<(ϕ+1)
// = (x1f-x0f)<<1 - (fxOneAndAHalf-x0f)<<1 + 1<<(ϕ+2)
// = (x1f - fxOneAndAHalf)<<1 + 1<<(ϕ+2)
// The first term, (x1f - fxOneAndAHalf)<<1, is a negative
// number, bounded below by -fxOneAndAHalf<<1, which is
// greater than -fxOne<<2, or -1<<(ϕ+2). Thus, B ranges up
// to ±1<<(ϕ+2). One final simplification:
// B = x1f<<1 + (1<<(ϕ+2) - fxOneAndAHalf<<1)
//
// Convert to int64 to avoid overflow. Without that,
// TestRasterizePolygon fails.
D := int64(twoOverS) << ϕ
D -= int64((fxOneAndAHalf - x0f)) << (ϕ + 1)
D -= int64((x1i - x0i - 3)) << (2*ϕ + 1)
D -= int64(x1fSquared)
D *= int64(d)
const C = 1<<(ϕ+2) - fxOneAndAHalf<<1
D := int64(x1f<<1 + C) // D ranges up to ±1<<(1*ϕ+2).
D <<= ϕ // D ranges up to ±1<<(2*ϕ+2).
D -= int64(x1fSquared) // D ranges up to ±1<<(2*ϕ+3).
D *= int64(d) // D ranges up to ±1<<(3*ϕ+3).
D /= int64(twoOverS)
buf[i] += uint32(D)
}