draw: use a sync.Pool for kernel scaling's temporary buffers.

benchmark                      old ns/op     new ns/op     delta
BenchmarkScaleBLLargeDown      257715146     260286012     +1.00%
BenchmarkScaleCRLargeDown      426797448     430078734     +0.77%
BenchmarkScaleBLDown           4449939       4222542       -5.11%
BenchmarkScaleCRDown           8160446       8010056       -1.84%
BenchmarkScaleBLUp             22290312      21044122      -5.59%
BenchmarkScaleCRUp             33010722      32021468      -3.00%
BenchmarkScaleCRSrcGray        13307961      13020192      -2.16%
BenchmarkScaleCRSrcNRGBA       40567431      40801939      +0.58%
BenchmarkScaleCRSrcRGBA        39892971      40240558      +0.87%
BenchmarkScaleCRSrcYCbCr       59020222      59686699      +1.13%

benchmark                      old allocs     new allocs     delta
BenchmarkScaleBLLargeDown      1              1              +0.00%
BenchmarkScaleCRLargeDown      1              2              +100.00%
BenchmarkScaleBLDown           1              0              -100.00%
BenchmarkScaleCRDown           1              0              -100.00%
BenchmarkScaleBLUp             1              0              -100.00%
BenchmarkScaleCRUp             1              0              -100.00%
BenchmarkScaleCRSrcGray        1              0              -100.00%
BenchmarkScaleCRSrcNRGBA       1              0              -100.00%
BenchmarkScaleCRSrcRGBA        1              0              -100.00%
BenchmarkScaleCRSrcYCbCr       1              0              -100.00%

benchmark                      old bytes     new bytes     delta
BenchmarkScaleBLLargeDown      14745600      2949200       -80.00%
BenchmarkScaleCRLargeDown      14745600      4915333       -66.67%
BenchmarkScaleBLDown           1523712       5079          -99.67%
BenchmarkScaleCRDown           1523712       7619          -99.50%
BenchmarkScaleBLUp             10117120      101175        -99.00%
BenchmarkScaleCRUp             10117120      202350        -98.00%
BenchmarkScaleCRSrcGray        4915200       49156         -99.00%
BenchmarkScaleCRSrcNRGBA       4915200       163853        -96.67%
BenchmarkScaleCRSrcRGBA        4915200       163853        -96.67%
BenchmarkScaleCRSrcYCbCr       4915200       245780        -95.00%

The increase in BenchmarkScale??LargeDown number of allocs I think is an
accounting error due to the low number of iterations: a low denominator.
I suspect that there are one or two extra allocs up front for using the
sync.Pool, but one fewer alloc per iteration. The number of iterations
is only 5 for BL and 3 for CR, for the default timeout. If I increase
the -test.benchtime value to 5s, then the reported average (allocs/op)
drop from 2 to 0, so the delta should actually be -100% instead of +0 or
+100%.

Change-Id: I21d9bb0086bdb25517b6a430e8a21bdf3db026f6
Reviewed-on: https://go-review.googlesource.com/8150
Reviewed-by: Rob Pike <r@golang.org>
This commit is contained in:
Nigel Tao 2015-03-27 11:21:16 +11:00
parent d5c5125b57
commit c53fa16781
4 changed files with 37 additions and 6 deletions

View File

@ -936,8 +936,14 @@ const (
// Create a temporary buffer: // Create a temporary buffer:
// scaleX distributes the source image's columns over the temporary image. // scaleX distributes the source image's columns over the temporary image.
// scaleY distributes the temporary image's rows over the destination image. // scaleY distributes the temporary image's rows over the destination image.
// TODO: is it worth having a sync.Pool for this temporary buffer? var tmp [][4]float64
tmp := make([][4]float64, z.dw*z.sh) if z.pool.New != nil {
tmpp := z.pool.Get().(*[][4]float64)
defer z.pool.Put(tmpp)
tmp = *tmpp
} else {
tmp = z.makeTmpBuf()
}
// sr is the source pixels. If it extends beyond the src bounds, // sr is the source pixels. If it extends beyond the src bounds,
// we cannot use the type-specific fast paths, as they access // we cannot use the type-specific fast paths, as they access

View File

@ -3036,8 +3036,14 @@ func (z *kernelScaler) Scale(dst Image, dr image.Rectangle, src image.Image, sr
// Create a temporary buffer: // Create a temporary buffer:
// scaleX distributes the source image's columns over the temporary image. // scaleX distributes the source image's columns over the temporary image.
// scaleY distributes the temporary image's rows over the destination image. // scaleY distributes the temporary image's rows over the destination image.
// TODO: is it worth having a sync.Pool for this temporary buffer? var tmp [][4]float64
tmp := make([][4]float64, z.dw*z.sh) if z.pool.New != nil {
tmpp := z.pool.Get().(*[][4]float64)
defer z.pool.Put(tmpp)
tmp = *tmpp
} else {
tmp = z.makeTmpBuf()
}
// sr is the source pixels. If it extends beyond the src bounds, // sr is the source pixels. If it extends beyond the src bounds,
// we cannot use the type-specific fast paths, as they access // we cannot use the type-specific fast paths, as they access

View File

@ -10,6 +10,7 @@ import (
"image" "image"
"image/color" "image/color"
"math" "math"
"sync"
"golang.org/x/image/math/f64" "golang.org/x/image/math/f64"
) )
@ -88,13 +89,17 @@ type Kernel struct {
// Scale implements the Scaler interface. // Scale implements the Scaler interface.
func (q *Kernel) Scale(dst Image, dr image.Rectangle, src image.Image, sr image.Rectangle, opts *Options) { func (q *Kernel) Scale(dst Image, dr image.Rectangle, src image.Image, sr image.Rectangle, opts *Options) {
q.NewScaler(dr.Dx(), dr.Dy(), sr.Dx(), sr.Dy()).Scale(dst, dr, src, sr, opts) q.newScaler(dr.Dx(), dr.Dy(), sr.Dx(), sr.Dy(), false).Scale(dst, dr, src, sr, opts)
} }
// NewScaler returns a Scaler that is optimized for scaling multiple times with // NewScaler returns a Scaler that is optimized for scaling multiple times with
// the same fixed destination and source width and height. // the same fixed destination and source width and height.
func (q *Kernel) NewScaler(dw, dh, sw, sh int) Scaler { func (q *Kernel) NewScaler(dw, dh, sw, sh int) Scaler {
return &kernelScaler{ return q.newScaler(dw, dh, sw, sh, true)
}
func (q *Kernel) newScaler(dw, dh, sw, sh int, usePool bool) Scaler {
z := &kernelScaler{
kernel: q, kernel: q,
dw: int32(dw), dw: int32(dw),
dh: int32(dh), dh: int32(dh),
@ -103,6 +108,13 @@ func (q *Kernel) NewScaler(dw, dh, sw, sh int) Scaler {
horizontal: newDistrib(q, int32(dw), int32(sw)), horizontal: newDistrib(q, int32(dw), int32(sw)),
vertical: newDistrib(q, int32(dh), int32(sh)), vertical: newDistrib(q, int32(dh), int32(sh)),
} }
if usePool {
z.pool.New = func() interface{} {
tmp := z.makeTmpBuf()
return &tmp
}
}
return z
} }
var ( var (
@ -152,6 +164,11 @@ type kernelScaler struct {
kernel *Kernel kernel *Kernel
dw, dh, sw, sh int32 dw, dh, sw, sh int32
horizontal, vertical distrib horizontal, vertical distrib
pool sync.Pool
}
func (z *kernelScaler) makeTmpBuf() [][4]float64 {
return make([][4]float64, z.dw*z.sh)
} }
// source is a range of contribs, their inverse total weight, and that ITW // source is a range of contribs, their inverse total weight, and that ITW

View File

@ -393,6 +393,7 @@ func benchScale(b *testing.B, srcf func(image.Rectangle) (image.Image, error), w
scaler = n.NewScaler(dr.Dx(), dr.Dy(), sr.Dx(), sr.Dy()) scaler = n.NewScaler(dr.Dx(), dr.Dy(), sr.Dx(), sr.Dy())
} }
b.ReportAllocs()
b.ResetTimer() b.ResetTimer()
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
scaler.Scale(dst, dr, src, sr, nil) scaler.Scale(dst, dr, src, sr, nil)
@ -408,6 +409,7 @@ func benchTform(b *testing.B, srcf func(image.Rectangle) (image.Image, error), w
sr := src.Bounds() sr := src.Bounds()
m := transformMatrix(40, 10) m := transformMatrix(40, 10)
b.ReportAllocs()
b.ResetTimer() b.ResetTimer()
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
q.Transform(dst, m, src, sr, nil) q.Transform(dst, m, src, sr, nil)