From 427b8d133e710b7781794fbf781ade2c1a71dd2e Mon Sep 17 00:00:00 2001
From: Charlie Vieth <charlie.vieth@gmail.com>
Date: Tue, 29 Jul 2014 18:32:58 -0400
Subject: [PATCH] Optimized Nearest-Neighbor function - 2x faster

---
 filters.go      |  29 +++++-
 nearest.go      | 228 +++++++++++++++++++++++++++++++++++++++++++
 nearest_test.go |  41 ++++++++
 resize.go       | 253 ++++++++++++++++++++++++++++++++++++++++++------
 4 files changed, 520 insertions(+), 31 deletions(-)
 create mode 100644 nearest.go
 create mode 100644 nearest_test.go

diff --git a/filters.go b/filters.go
index a5f6e79..0cc738c 100644
--- a/filters.go
+++ b/filters.go
@@ -89,8 +89,9 @@ func createWeights8(dy, minx, filterLength int, blur, scale float64, kernel func
 	for y := 0; y < dy; y++ {
 		interpX := scale*(float64(y)+0.5) + float64(minx)
 		start[y] = int(interpX) - filterLength/2 + 1
+		interpX -= float64(start[y])
 		for i := 0; i < filterLength; i++ {
-			in := (interpX - float64(start[y]) - float64(i)) * filterFactor
+			in := (interpX - float64(i)) * filterFactor
 			coeffs[y*filterLength+i] = int16(kernel(in) * 256)
 		}
 	}
@@ -108,11 +109,35 @@ func createWeights16(dy, minx, filterLength int, blur, scale float64, kernel fun
 	for y := 0; y < dy; y++ {
 		interpX := scale*(float64(y)+0.5) + float64(minx)
 		start[y] = int(interpX) - filterLength/2 + 1
+		interpX -= float64(start[y])
 		for i := 0; i < filterLength; i++ {
-			in := (interpX - float64(start[y]) - float64(i)) * filterFactor
+			in := (interpX - float64(i)) * filterFactor
 			coeffs[y*filterLength+i] = int32(kernel(in) * 65536)
 		}
 	}
 
 	return coeffs, start, filterLength
 }
+
+// createWeightsNearest builds the nearest-neighbor tap tables for dy outputs:
+// start[y] is the first source index of output y's window, and coeffs holds
+// filterLength flags per output, true where the scaled tap distance is in
+// [-0.5, 0.5). The widened filterLength is returned as the third result.
+func createWeightsNearest(dy, minx, filterLength int, blur, scale float64) ([]bool, []int, int) {
+	filterLength *= int(math.Max(math.Ceil(blur*scale), 1))
+	filterFactor := math.Min(1./(blur*scale), 1)
+
+	coeffs := make([]bool, dy*filterLength)
+	start := make([]int, dy)
+	for y := range start {
+		center := scale*(float64(y)+0.5) + float64(minx)
+		start[y] = int(center) - filterLength/2 + 1
+		center -= float64(start[y])
+		taps := coeffs[y*filterLength : (y+1)*filterLength]
+		for i := range taps {
+			dist := (center - float64(i)) * filterFactor
+			taps[i] = dist >= -0.5 && dist < 0.5
+		}
+	}
+	return coeffs, start, filterLength
+}
diff --git a/nearest.go b/nearest.go
new file mode 100644
index 0000000..5708fd0
--- /dev/null
+++ b/nearest.go
@@ -0,0 +1,228 @@
+package resize
+
+import "image"
+
+func floatToUint8(x float32) uint8 {
+	// Saturating truncation. Nearest-neighbor values are always positive,
+	// so only the upper bound is checked; anything above 0xfe becomes 0xff.
+	if x > 0xfe {
+		return 0xff
+	}
+	return uint8(x)
+}
+
+func floatToUint16(x float32) uint16 {
+	if x > 0xfffe { // saturate: anything above 0xfffe becomes 0xffff
+		return 0xffff
+	}
+	return uint16(x) // otherwise truncate; no lower-bound check is made
+}
+
+// nearestGeneric is the image.Image variant of one nearest-neighbor pass.
+// Output pixel (x, y) is the average of the coeffs-enabled taps of source row
+// x, starting at absolute column offset[y]; out is thus transposed. coeffs and
+// offset are indexed by the absolute y. Taps outside in are clamped to its edge.
+func nearestGeneric(in image.Image, out *image.RGBA64, scale float64, coeffs []bool, offset []int, filterLength int) {
+	oldBounds := in.Bounds()
+	newBounds := out.Bounds()
+	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
+		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
+			var rgba [4]float32
+			var sum float32
+			start := offset[y]
+			ci := y * filterLength
+			for i := 0; i < filterLength; i++ {
+				if coeffs[ci+i] {
+					xi := start + i
+					if xi < oldBounds.Min.X {
+						xi = oldBounds.Min.X
+					} else if xi >= oldBounds.Max.X {
+						xi = oldBounds.Max.X - 1
+					}
+					r, g, b, a := in.At(xi, x+oldBounds.Min.Y).RGBA()
+					rgba[0] += float32(r)
+					rgba[1] += float32(g)
+					rgba[2] += float32(b)
+					rgba[3] += float32(a)
+					sum++
+				}
+			}
+
+			xo := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*8
+			value := floatToUint16(rgba[0] / sum)
+			out.Pix[xo+0] = uint8(value >> 8)
+			out.Pix[xo+1] = uint8(value)
+			value = floatToUint16(rgba[1] / sum)
+			out.Pix[xo+2] = uint8(value >> 8)
+			out.Pix[xo+3] = uint8(value)
+			value = floatToUint16(rgba[2] / sum)
+			out.Pix[xo+4] = uint8(value >> 8)
+			out.Pix[xo+5] = uint8(value)
+			value = floatToUint16(rgba[3] / sum)
+			out.Pix[xo+6] = uint8(value >> 8)
+			out.Pix[xo+7] = uint8(value)
+		}
+	}
+}
+
+// nearestRGBA is the *image.RGBA variant of one nearest-neighbor pass. Output
+// pixel (x, y) is the average of the coeffs-enabled taps of source row x, so
+// out is transposed. coeffs and offset are indexed by the absolute y; offset
+// holds absolute source columns, clamped to in's first and last column.
+func nearestRGBA(in *image.RGBA, out *image.RGBA, scale float64, coeffs []bool, offset []int, filterLength int) {
+	oldBounds := in.Bounds()
+	newBounds := out.Bounds()
+	last := oldBounds.Dx() - 1
+
+	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
+		row := in.Pix[x*in.Stride:]
+		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
+			var rgba [4]float32
+			var sum float32
+			start := offset[y] - oldBounds.Min.X
+			ci := y * filterLength
+			for i := 0; i < filterLength; i++ {
+				if coeffs[ci+i] {
+					xi := start + i
+					if xi < 0 {
+						xi = 0
+					} else if xi > last {
+						xi = last
+					}
+					rgba[0] += float32(row[4*xi+0])
+					rgba[1] += float32(row[4*xi+1])
+					rgba[2] += float32(row[4*xi+2])
+					rgba[3] += float32(row[4*xi+3])
+					sum++
+				}
+			}
+
+			xo := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*4
+			out.Pix[xo+0] = floatToUint8(rgba[0] / sum)
+			out.Pix[xo+1] = floatToUint8(rgba[1] / sum)
+			out.Pix[xo+2] = floatToUint8(rgba[2] / sum)
+			out.Pix[xo+3] = floatToUint8(rgba[3] / sum)
+		}
+	}
+}
+
+// nearestRGBA64 is the *image.RGBA64 variant of one nearest-neighbor pass.
+// Output pixel (x, y) is the average of the coeffs-enabled taps of source row
+// x, so out is transposed. coeffs and offset are indexed by the absolute y;
+// offset holds absolute source columns, clamped to in's first and last column.
+func nearestRGBA64(in *image.RGBA64, out *image.RGBA64, scale float64, coeffs []bool, offset []int, filterLength int) {
+	oldBounds := in.Bounds()
+	newBounds := out.Bounds()
+	last := oldBounds.Dx() - 1
+
+	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
+		row := in.Pix[x*in.Stride:]
+		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
+			var rgba [4]float32
+			var sum float32
+			start := offset[y] - oldBounds.Min.X
+			ci := y * filterLength
+			for i := 0; i < filterLength; i++ {
+				if coeffs[ci+i] {
+					xi := start + i
+					if xi < 0 {
+						xi = 0
+					} else if xi > last {
+						xi = last
+					}
+					rgba[0] += float32(uint16(row[8*xi+0])<<8 | uint16(row[8*xi+1]))
+					rgba[1] += float32(uint16(row[8*xi+2])<<8 | uint16(row[8*xi+3]))
+					rgba[2] += float32(uint16(row[8*xi+4])<<8 | uint16(row[8*xi+5]))
+					rgba[3] += float32(uint16(row[8*xi+6])<<8 | uint16(row[8*xi+7]))
+					sum++
+				}
+			}
+
+			xo := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*8
+			value := floatToUint16(rgba[0] / sum)
+			out.Pix[xo+0] = uint8(value >> 8)
+			out.Pix[xo+1] = uint8(value)
+			value = floatToUint16(rgba[1] / sum)
+			out.Pix[xo+2] = uint8(value >> 8)
+			out.Pix[xo+3] = uint8(value)
+			value = floatToUint16(rgba[2] / sum)
+			out.Pix[xo+4] = uint8(value >> 8)
+			out.Pix[xo+5] = uint8(value)
+			value = floatToUint16(rgba[3] / sum)
+			out.Pix[xo+6] = uint8(value >> 8)
+			out.Pix[xo+7] = uint8(value)
+		}
+	}
+}
+
+// nearestGray is the *image.Gray variant of one nearest-neighbor pass. Output
+// pixel (x, y) is the average of the coeffs-enabled taps of source row x, so
+// out is transposed. coeffs and offset are indexed by the absolute y; offset
+// holds absolute source columns, clamped to in's first and last column.
+func nearestGray(in *image.Gray, out *image.Gray, scale float64, coeffs []bool, offset []int, filterLength int) {
+	oldBounds := in.Bounds()
+	newBounds := out.Bounds()
+	last := oldBounds.Dx() - 1
+
+	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
+		row := in.Pix[x*in.Stride:]
+		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
+			var gray float32
+			var sum float32
+			start := offset[y] - oldBounds.Min.X
+			ci := y * filterLength
+			for i := 0; i < filterLength; i++ {
+				if coeffs[ci+i] {
+					xi := start + i
+					if xi < 0 {
+						xi = 0
+					} else if xi > last {
+						xi = last
+					}
+					gray += float32(row[xi])
+					sum++
+				}
+			}
+
+			xo := (y-newBounds.Min.Y)*out.Stride + (x - newBounds.Min.X)
+			out.Pix[xo] = floatToUint8(gray / sum)
+		}
+	}
+}
+
+// nearestGray16 is the *image.Gray16 variant of one nearest-neighbor pass.
+// Output pixel (x, y) is the average of the coeffs-enabled taps of source row
+// x, so out is transposed. coeffs and offset are indexed by the absolute y;
+// offset holds absolute source columns, clamped to in's first and last column.
+func nearestGray16(in *image.Gray16, out *image.Gray16, scale float64, coeffs []bool, offset []int, filterLength int) {
+	oldBounds := in.Bounds()
+	newBounds := out.Bounds()
+	last := oldBounds.Dx() - 1
+
+	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
+		row := in.Pix[x*in.Stride:]
+		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
+			var gray float32
+			var sum float32
+			start := offset[y] - oldBounds.Min.X
+			ci := y * filterLength
+			for i := 0; i < filterLength; i++ {
+				if coeffs[ci+i] {
+					xi := start + i
+					if xi < 0 {
+						xi = 0
+					} else if xi > last {
+						xi = last
+					}
+					gray += float32(uint16(row[2*xi+0])<<8 | uint16(row[2*xi+1]))
+					sum++
+				}
+			}
+
+			xo := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*2
+			value := floatToUint16(gray / sum)
+			out.Pix[xo+0] = uint8(value >> 8)
+			out.Pix[xo+1] = uint8(value)
+		}
+	}
+}
diff --git a/nearest_test.go b/nearest_test.go
new file mode 100644
index 0000000..08adbda
--- /dev/null
+++ b/nearest_test.go
@@ -0,0 +1,41 @@
+package resize
+
+import "testing"
+
+// Test_FloatToUint8 checks truncation and saturation at the uint8 upper bound.
+func Test_FloatToUint8(t *testing.T) {
+	var testData = []struct {
+		in       float32
+		expected uint8
+	}{
+		{0, 0},
+		{255, 255},
+		{128, 128},
+		{1, 1},
+		{256, 255},
+	}
+	for _, test := range testData {
+		if actual := floatToUint8(test.in); actual != test.expected {
+			t.Errorf("floatToUint8(%v) = %d, want %d", test.in, actual, test.expected)
+		}
+	}
+}
+
+// Test_FloatToUint16 checks truncation and saturation at the uint16 upper bound.
+func Test_FloatToUint16(t *testing.T) {
+	var testData = []struct {
+		in       float32
+		expected uint16
+	}{
+		{0, 0},
+		{65535, 65535},
+		{128, 128},
+		{1, 1},
+		{65536, 65535},
+	}
+	for _, test := range testData {
+		if actual := floatToUint16(test.in); actual != test.expected {
+			t.Errorf("floatToUint16(%v) = %d, want %d", test.in, actual, test.expected)
+		}
+	}
+}
diff --git a/resize.go b/resize.go
index b9eb775..fad39b4 100644
--- a/resize.go
+++ b/resize.go
@@ -33,36 +33,41 @@ import (
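-// An InterpolationFunction provides the parameters that describe an
-// interpolation kernel. It returns the number of samples to take
-// and the kernel function to use for sampling.
-type InterpolationFunction func() (int, func(float64) float64)
+// An InterpolationFunction identifies the interpolation kernel that
+// Resize uses. The number of samples to take and the kernel function
+// to use for sampling are looked up from this value.
+type InterpolationFunction int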
 
-// Nearest-neighbor interpolation
-func NearestNeighbor() (int, func(float64) float64) {
-	return 2, nearest
-}
+// Interpolation kernels that can be passed to Resize.
+const (
+	// NearestNeighbor selects nearest-neighbor interpolation.
+	NearestNeighbor InterpolationFunction = iota
+	// Bilinear selects bilinear interpolation.
+	Bilinear
+	// Bicubic selects bicubic interpolation (with cubic hermite spline).
+	Bicubic
+	// MitchellNetravali selects Mitchell-Netravali interpolation.
+	MitchellNetravali
+	// Lanczos2 selects Lanczos interpolation (a=2).
+	Lanczos2
+	// Lanczos3 selects Lanczos interpolation (a=3).
+	Lanczos3
+)
 
-// Bilinear interpolation
-func Bilinear() (int, func(float64) float64) {
-	return 2, linear
-}
-
-// Bicubic interpolation (with cubic hermite spline)
-func Bicubic() (int, func(float64) float64) {
-	return 4, cubic
-}
-
-// Mitchell-Netravali interpolation
-func MitchellNetravali() (int, func(float64) float64) {
-	return 4, mitchellnetravali
-}
-
-// Lanczos interpolation (a=2)
-func Lanczos2() (int, func(float64) float64) {
-	return 4, lanczos2
-}
-
-// Lanczos interpolation (a=3)
-func Lanczos3() (int, func(float64) float64) {
-	return 6, lanczos3
+// kernel returns the number of taps and the kernel function selected by i.
+func (i InterpolationFunction) kernel() (int, func(float64) float64) {
+	switch i {
+	case Bilinear:
+		return 2, linear
+	case Bicubic:
+		return 4, cubic
+	case MitchellNetravali:
+		return 4, mitchellnetravali
+	case Lanczos2:
+		return 4, lanczos2
+	case Lanczos3:
+		return 6, lanczos3
+	default:
+		// NearestNeighbor and any unrecognized value use the nearest kernel.
+		return 2, nearest
+	}
 }
 
 // values <1 will sharpen the image
@@ -81,8 +86,11 @@ func Resize(width, height uint, img image.Image, interp InterpolationFunction) i
 	if height == 0 {
 		height = uint(0.7 + float64(img.Bounds().Dy())/scaleY)
 	}
+	if interp == NearestNeighbor {
+		return resizeNearest(width, height, scaleX, scaleY, img, interp)
+	}
 
-	taps, kernel := interp()
+	taps, kernel := interp.kernel()
 	cpus := runtime.NumCPU()
 	wg := sync.WaitGroup{}
 
@@ -269,6 +277,193 @@ func Resize(width, height uint, img image.Image, interp InterpolationFunction) i
 	}
 }
 
+// resizeNearest is the nearest-neighbor path of Resize. It filters along X into
+// a transposed temp image and then again into result, one goroutine per CPU.
+func resizeNearest(width, height uint, scaleX, scaleY float64, img image.Image, interp InterpolationFunction) image.Image {
+	taps, _ := interp.kernel()
+	cpus := runtime.NumCPU()
+	var wg sync.WaitGroup
+
+	switch input := img.(type) {
+	case *image.RGBA:
+		// 8-bit precision
+		temp := image.NewRGBA(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
+		result := image.NewRGBA(image.Rect(0, 0, int(width), int(height)))
+
+		// horizontal filter, results in transposed temporary image
+		coeffs, offset, filterLength := createWeightsNearest(temp.Bounds().Dy(), input.Bounds().Min.X, taps, blur, scaleX)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(temp, i, cpus).(*image.RGBA)
+			go func() {
+				defer wg.Done()
+				nearestRGBA(input, slice, scaleX, coeffs, offset, filterLength)
+			}()
+		}
+		wg.Wait()
+
+		// horizontal filter on transposed image, result is not transposed
+		coeffs, offset, filterLength = createWeightsNearest(result.Bounds().Dy(), temp.Bounds().Min.X, taps, blur, scaleY)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(result, i, cpus).(*image.RGBA)
+			go func() {
+				defer wg.Done()
+				nearestRGBA(temp, slice, scaleY, coeffs, offset, filterLength)
+			}()
+		}
+		wg.Wait()
+		return result
+	case *image.YCbCr:
+		// 8-bit precision; YCbCr is converted to RGBA before filtering because
+		// accessing the YCbCr arrays in a tight loop is slow.
+		inputAsRGBA := convertYCbCrToRGBA(input)
+		temp := image.NewRGBA(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
+		result := image.NewRGBA(image.Rect(0, 0, int(width), int(height)))
+
+		// horizontal filter, results in transposed temporary image
+		coeffs, offset, filterLength := createWeightsNearest(temp.Bounds().Dy(), input.Bounds().Min.X, taps, blur, scaleX)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(temp, i, cpus).(*image.RGBA)
+			go func() {
+				defer wg.Done()
+				nearestRGBA(inputAsRGBA, slice, scaleX, coeffs, offset, filterLength)
+			}()
+		}
+		wg.Wait()
+
+		// horizontal filter on transposed image, result is not transposed
+		coeffs, offset, filterLength = createWeightsNearest(result.Bounds().Dy(), temp.Bounds().Min.X, taps, blur, scaleY)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(result, i, cpus).(*image.RGBA)
+			go func() {
+				defer wg.Done()
+				nearestRGBA(temp, slice, scaleY, coeffs, offset, filterLength)
+			}()
+		}
+		wg.Wait()
+		return result
+	case *image.RGBA64:
+		// 16-bit precision
+		temp := image.NewRGBA64(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
+		result := image.NewRGBA64(image.Rect(0, 0, int(width), int(height)))
+
+		// horizontal filter, results in transposed temporary image
+		coeffs, offset, filterLength := createWeightsNearest(temp.Bounds().Dy(), input.Bounds().Min.X, taps, blur, scaleX)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(temp, i, cpus).(*image.RGBA64)
+			go func() {
+				defer wg.Done()
+				nearestRGBA64(input, slice, scaleX, coeffs, offset, filterLength)
+			}()
+		}
+		wg.Wait()
+
+		// horizontal filter on transposed image, result is not transposed
+		coeffs, offset, filterLength = createWeightsNearest(result.Bounds().Dy(), temp.Bounds().Min.X, taps, blur, scaleY)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(result, i, cpus).(*image.RGBA64)
+			go func() {
+				defer wg.Done()
+				nearestRGBA64(temp, slice, scaleY, coeffs, offset, filterLength)
+			}()
+		}
+		wg.Wait()
+		return result
+	case *image.Gray:
+		// 8-bit precision
+		temp := image.NewGray(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
+		result := image.NewGray(image.Rect(0, 0, int(width), int(height)))
+
+		// horizontal filter, results in transposed temporary image
+		coeffs, offset, filterLength := createWeightsNearest(temp.Bounds().Dy(), input.Bounds().Min.X, taps, blur, scaleX)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(temp, i, cpus).(*image.Gray)
+			go func() {
+				defer wg.Done()
+				nearestGray(input, slice, scaleX, coeffs, offset, filterLength)
+			}()
+		}
+		wg.Wait()
+
+		// horizontal filter on transposed image, result is not transposed
+		coeffs, offset, filterLength = createWeightsNearest(result.Bounds().Dy(), temp.Bounds().Min.X, taps, blur, scaleY)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(result, i, cpus).(*image.Gray)
+			go func() {
+				defer wg.Done()
+				nearestGray(temp, slice, scaleY, coeffs, offset, filterLength)
+			}()
+		}
+		wg.Wait()
+		return result
+	case *image.Gray16:
+		// 16-bit precision
+		temp := image.NewGray16(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
+		result := image.NewGray16(image.Rect(0, 0, int(width), int(height)))
+
+		// horizontal filter, results in transposed temporary image
+		coeffs, offset, filterLength := createWeightsNearest(temp.Bounds().Dy(), input.Bounds().Min.X, taps, blur, scaleX)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(temp, i, cpus).(*image.Gray16)
+			go func() {
+				defer wg.Done()
+				nearestGray16(input, slice, scaleX, coeffs, offset, filterLength)
+			}()
+		}
+		wg.Wait()
+
+		// horizontal filter on transposed image, result is not transposed
+		coeffs, offset, filterLength = createWeightsNearest(result.Bounds().Dy(), temp.Bounds().Min.X, taps, blur, scaleY)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(result, i, cpus).(*image.Gray16)
+			go func() {
+				defer wg.Done()
+				nearestGray16(temp, slice, scaleY, coeffs, offset, filterLength)
+			}()
+		}
+		wg.Wait()
+		return result
+	default:
+		// 16-bit precision
+		temp := image.NewRGBA64(image.Rect(0, 0, img.Bounds().Dy(), int(width)))
+		result := image.NewRGBA64(image.Rect(0, 0, int(width), int(height)))
+
+		// horizontal filter, results in transposed temporary image
+		coeffs, offset, filterLength := createWeightsNearest(temp.Bounds().Dy(), img.Bounds().Min.X, taps, blur, scaleX)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(temp, i, cpus).(*image.RGBA64)
+			go func() {
+				defer wg.Done()
+				nearestGeneric(img, slice, scaleX, coeffs, offset, filterLength)
+			}()
+		}
+		wg.Wait()
+
+		// horizontal filter on transposed image, result is not transposed
+		coeffs, offset, filterLength = createWeightsNearest(result.Bounds().Dy(), temp.Bounds().Min.X, taps, blur, scaleY)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(result, i, cpus).(*image.RGBA64)
+			go func() {
+				defer wg.Done()
+				nearestRGBA64(temp, slice, scaleY, coeffs, offset, filterLength)
+			}()
+		}
+		wg.Wait()
+		return result
+	}
+}
+
 // Calculates scaling factors using old and new image dimensions.
 func calcFactors(width, height uint, oldWidth, oldHeight float64) (scaleX, scaleY float64) {
 	if width == 0 {