diff --git a/converter.go b/converter.go
index c4a1bdc..8c6ac84 100644
--- a/converter.go
+++ b/converter.go
@@ -21,110 +21,236 @@ import (
 	"image/color"
 )
 
-type colorArray [4]float32
-
-func replicateBorder1d(x, min, max int) int {
-	if x < min {
-		x = min
-	} else if x >= max {
-		x = max - 1
+// clampUint8 saturates in to the [0,255] range and returns it as a uint8.
+func clampUint8(in int32) uint8 {
+	if in < 0 {
+		return 0
 	}
-
-	return x
+	if in > 255 {
+		return 255
+	}
+	return uint8(in)
 }
 
-func replicateBorder(x, y int, rect image.Rectangle) (xx, yy int) {
-	xx = replicateBorder1d(x, rect.Min.X, rect.Max.X)
-	yy = replicateBorder1d(y, rect.Min.Y, rect.Max.Y)
-	return
+// clampUint16 saturates in to the [0,65535] range and returns it as a uint16.
+func clampUint16(in int64) uint16 {
+	if in < 0 {
+		return 0
+	}
+	if in > 65535 {
+		return 65535
+	}
+	return uint16(in)
 }
 
-// converter allows to retrieve a colorArray for points of an image.
-// the idea is to speed up computation by providing optimized implementations
-// for different image types instead of relying on image.Image.At().
-type converter interface {
-	at(x, y int, color *colorArray)
+// resizeGeneric filters each row of in with the integer weights in coeffs and
+// writes it transposed: out pixel (x, y) comes from row x-newBounds.Min.X of in.
+// coeffs is indexed by the absolute out row y so SubImage slices share one table.
+func resizeGeneric(in image.Image, out *image.RGBA64, scale float64, coeffs []int32, filterLength int) {
+	oldBounds := in.Bounds()
+	newBounds := out.Bounds()
+	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
+		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
+			interpX := scale*(float64(y)+0.5) + float64(oldBounds.Min.X)
+			start := int(interpX) - filterLength/2 + 1
+			// Accumulate the taps; dividing by sum normalizes the integer weights.
+			var rgba [4]int64
+			var sum int64
+			for i := 0; i < filterLength; i++ {
+				xx := start + i
+				if xx < oldBounds.Min.X {
+					xx = oldBounds.Min.X
+				} else if xx >= oldBounds.Max.X {
+					xx = oldBounds.Max.X - 1
+				}
+				coeff := coeffs[y*filterLength+i]
+				r, g, b, a := in.At(xx, x-newBounds.Min.X+oldBounds.Min.Y).RGBA()
+				rgba[0] += int64(coeff) * int64(r)
+				rgba[1] += int64(coeff) * int64(g)
+				rgba[2] += int64(coeff) * int64(b)
+				rgba[3] += int64(coeff) * int64(a)
+				sum += int64(coeff)
+			}
+			offset := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*8
+			value := clampUint16(rgba[0] / sum)
+			out.Pix[offset+0] = uint8(value >> 8)
+			out.Pix[offset+1] = uint8(value)
+			value = clampUint16(rgba[1] / sum)
+			out.Pix[offset+2] = uint8(value >> 8)
+			out.Pix[offset+3] = uint8(value)
+			value = clampUint16(rgba[2] / sum)
+			out.Pix[offset+4] = uint8(value >> 8)
+			out.Pix[offset+5] = uint8(value)
+			value = clampUint16(rgba[3] / sum)
+			out.Pix[offset+6] = uint8(value >> 8)
+			out.Pix[offset+7] = uint8(value)
+		}
+	}
 }
 
-type genericConverter struct {
-	src image.Image
+// resizeRGBA filters each row of in with the integer weights in coeffs and
+// writes it transposed: out pixel (x, y) comes from row x-newBounds.Min.X of in.
+// coeffs is indexed by the absolute out row y so SubImage slices share one table.
+func resizeRGBA(in *image.RGBA, out *image.RGBA, scale float64, coeffs []int16, filterLength int) {
+	oldBounds := in.Bounds()
+	newBounds := out.Bounds()
+	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
+		row := in.Pix[(x-newBounds.Min.X)*in.Stride:]
+		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
+			interpX := scale*(float64(y)+0.5) + float64(oldBounds.Min.X)
+			start := int(interpX) - filterLength/2 + 1
+			// Accumulate the taps; dividing by sum normalizes the integer weights.
+			var rgba [4]int32
+			var sum int32
+			for i := 0; i < filterLength; i++ {
+				xx := start + i
+				if xx < oldBounds.Min.X {
+					xx = oldBounds.Min.X
+				} else if xx >= oldBounds.Max.X {
+					xx = oldBounds.Max.X - 1
+				}
+				coeff := coeffs[y*filterLength+i]
+				offset := (xx - oldBounds.Min.X) * 4
+				rgba[0] += int32(coeff) * int32(row[offset+0])
+				rgba[1] += int32(coeff) * int32(row[offset+1])
+				rgba[2] += int32(coeff) * int32(row[offset+2])
+				rgba[3] += int32(coeff) * int32(row[offset+3])
+				sum += int32(coeff)
+			}
+			offset := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*4
+			out.Pix[offset+0] = clampUint8(rgba[0] / sum)
+			out.Pix[offset+1] = clampUint8(rgba[1] / sum)
+			out.Pix[offset+2] = clampUint8(rgba[2] / sum)
+			out.Pix[offset+3] = clampUint8(rgba[3] / sum)
+		}
+	}
 }
 
-func (c *genericConverter) at(x, y int, result *colorArray) {
-	r, g, b, a := c.src.At(replicateBorder(x, y, c.src.Bounds())).RGBA()
-	result[0] = float32(r)
-	result[1] = float32(g)
-	result[2] = float32(b)
-	result[3] = float32(a)
-	return
+// resizeRGBA64 filters each row of in with the integer weights in coeffs and
+// writes it transposed: out pixel (x, y) comes from row x-newBounds.Min.X of in.
+// coeffs is indexed by the absolute out row y so SubImage slices share one table.
+func resizeRGBA64(in *image.RGBA64, out *image.RGBA64, scale float64, coeffs []int32, filterLength int) {
+	oldBounds := in.Bounds()
+	newBounds := out.Bounds()
+	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
+		row := in.Pix[(x-newBounds.Min.X)*in.Stride:]
+		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
+			interpX := scale*(float64(y)+0.5) + float64(oldBounds.Min.X)
+			start := int(interpX) - filterLength/2 + 1
+			// Accumulate the taps; dividing by sum normalizes the integer weights.
+			var rgba [4]int64
+			var sum int64
+			for i := 0; i < filterLength; i++ {
+				xx := start + i
+				if xx < oldBounds.Min.X {
+					xx = oldBounds.Min.X
+				} else if xx >= oldBounds.Max.X {
+					xx = oldBounds.Max.X - 1
+				}
+				coeff := coeffs[y*filterLength+i]
+				offset := (xx - oldBounds.Min.X) * 8
+				rgba[0] += int64(coeff) * int64(uint16(row[offset+0])<<8|uint16(row[offset+1]))
+				rgba[1] += int64(coeff) * int64(uint16(row[offset+2])<<8|uint16(row[offset+3]))
+				rgba[2] += int64(coeff) * int64(uint16(row[offset+4])<<8|uint16(row[offset+5]))
+				rgba[3] += int64(coeff) * int64(uint16(row[offset+6])<<8|uint16(row[offset+7]))
+				sum += int64(coeff)
+			}
+			offset := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*8
+			value := clampUint16(rgba[0] / sum)
+			out.Pix[offset+0] = uint8(value >> 8)
+			out.Pix[offset+1] = uint8(value)
+			value = clampUint16(rgba[1] / sum)
+			out.Pix[offset+2] = uint8(value >> 8)
+			out.Pix[offset+3] = uint8(value)
+			value = clampUint16(rgba[2] / sum)
+			out.Pix[offset+4] = uint8(value >> 8)
+			out.Pix[offset+5] = uint8(value)
+			value = clampUint16(rgba[3] / sum)
+			out.Pix[offset+6] = uint8(value >> 8)
+			out.Pix[offset+7] = uint8(value)
+		}
+	}
 }
 
-type rgbaConverter struct {
-	src *image.RGBA
+// resizeGray filters each row of in with the integer weights in coeffs and
+// writes it transposed: out pixel (x, y) comes from row x-newBounds.Min.X of in.
+// coeffs is indexed by the absolute out row y so SubImage slices share one table.
+func resizeGray(in *image.Gray, out *image.Gray, scale float64, coeffs []int16, filterLength int) {
+	oldBounds := in.Bounds()
+	newBounds := out.Bounds()
+	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
+		row := in.Pix[(x-newBounds.Min.X)*in.Stride:]
+		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
+			interpX := scale*(float64(y)+0.5) + float64(oldBounds.Min.X)
+			start := int(interpX) - filterLength/2 + 1
+			// Accumulate the taps; dividing by sum normalizes the integer weights.
+			var gray int32
+			var sum int32
+			for i := 0; i < filterLength; i++ {
+				xx := start + i
+				if xx < oldBounds.Min.X {
+					xx = oldBounds.Min.X
+				} else if xx >= oldBounds.Max.X {
+					xx = oldBounds.Max.X - 1
+				}
+				coeff := coeffs[y*filterLength+i]
+				offset := (xx - oldBounds.Min.X)
+				gray += int32(coeff) * int32(row[offset])
+				sum += int32(coeff)
+			}
+			offset := (y-newBounds.Min.Y)*out.Stride + (x - newBounds.Min.X)
+			out.Pix[offset] = clampUint8(gray / sum)
+		}
+	}
 }
 
-func (c *rgbaConverter) at(x, y int, result *colorArray) {
-	i := c.src.PixOffset(replicateBorder(x, y, c.src.Rect))
-	result[0] = float32(uint16(c.src.Pix[i+0])<<8 | uint16(c.src.Pix[i+0]))
-	result[1] = float32(uint16(c.src.Pix[i+1])<<8 | uint16(c.src.Pix[i+1]))
-	result[2] = float32(uint16(c.src.Pix[i+2])<<8 | uint16(c.src.Pix[i+2]))
-	result[3] = float32(uint16(c.src.Pix[i+3])<<8 | uint16(c.src.Pix[i+3]))
-	return
+// resizeGray16 filters each row of in with the integer weights in coeffs and
+// writes it transposed: out pixel (x, y) comes from row x-newBounds.Min.X of in.
+// coeffs is indexed by the absolute out row y so SubImage slices share one table.
+func resizeGray16(in *image.Gray16, out *image.Gray16, scale float64, coeffs []int32, filterLength int) {
+	oldBounds := in.Bounds()
+	newBounds := out.Bounds()
+	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
+		row := in.Pix[(x-newBounds.Min.X)*in.Stride:]
+		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
+			interpX := scale*(float64(y)+0.5) + float64(oldBounds.Min.X)
+			start := int(interpX) - filterLength/2 + 1
+			// Accumulate the taps; dividing by sum normalizes the integer weights.
+			var gray int64
+			var sum int64
+			for i := 0; i < filterLength; i++ {
+				xx := start + i
+				if xx < oldBounds.Min.X {
+					xx = oldBounds.Min.X
+				} else if xx >= oldBounds.Max.X {
+					xx = oldBounds.Max.X - 1
+				}
+				coeff := coeffs[y*filterLength+i]
+				offset := (xx - oldBounds.Min.X) * 2
+				gray += int64(coeff) * int64(uint16(row[offset+0])<<8|uint16(row[offset+1]))
+				sum += int64(coeff)
+			}
+			offset := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*2
+			value := clampUint16(gray / sum)
+			out.Pix[offset+0] = uint8(value >> 8)
+			out.Pix[offset+1] = uint8(value)
+		}
+	}
 }
 
-type rgba64Converter struct {
-	src *image.RGBA64
-}
-
-func (c *rgba64Converter) at(x, y int, result *colorArray) {
-	i := c.src.PixOffset(replicateBorder(x, y, c.src.Rect))
-	result[0] = float32(uint16(c.src.Pix[i+0])<<8 | uint16(c.src.Pix[i+1]))
-	result[1] = float32(uint16(c.src.Pix[i+2])<<8 | uint16(c.src.Pix[i+3]))
-	result[2] = float32(uint16(c.src.Pix[i+4])<<8 | uint16(c.src.Pix[i+5]))
-	result[3] = float32(uint16(c.src.Pix[i+6])<<8 | uint16(c.src.Pix[i+7]))
-	return
-}
-
-type grayConverter struct {
-	src *image.Gray
-}
-
-func (c *grayConverter) at(x, y int, result *colorArray) {
-	i := c.src.PixOffset(replicateBorder(x, y, c.src.Rect))
-	g := float32(uint16(c.src.Pix[i])<<8 | uint16(c.src.Pix[i]))
-	result[0] = g
-	result[1] = g
-	result[2] = g
-	result[3] = float32(0xffff)
-	return
-}
-
-type gray16Converter struct {
-	src *image.Gray16
-}
-
-func (c *gray16Converter) at(x, y int, result *colorArray) {
-	i := c.src.PixOffset(replicateBorder(x, y, c.src.Rect))
-	g := float32(uint16(c.src.Pix[i+0])<<8 | uint16(c.src.Pix[i+1]))
-	result[0] = g
-	result[1] = g
-	result[2] = g
-	result[3] = float32(0xffff)
-	return
-}
-
-type ycbcrConverter struct {
-	src *image.YCbCr
-}
-
-func (c *ycbcrConverter) at(x, y int, result *colorArray) {
-	xx, yy := replicateBorder(x, y, c.src.Rect)
-	yi := c.src.YOffset(xx, yy)
-	ci := c.src.COffset(xx, yy)
-	r, g, b := color.YCbCrToRGB(c.src.Y[yi], c.src.Cb[ci], c.src.Cr[ci])
-	result[0] = float32(uint16(r) * 0x101)
-	result[1] = float32(uint16(g) * 0x101)
-	result[2] = float32(uint16(b) * 0x101)
-	result[3] = float32(0xffff)
-	return
+// convertYCbCrToRGBA returns an opaque RGBA copy of in with the same bounds.
+// It walks absolute coordinates because YOffset, COffset and PixOffset all
+// subtract the image origin themselves; zero-based counters break on sub-images.
+func convertYCbCrToRGBA(in *image.YCbCr) *image.RGBA {
+	out := image.NewRGBA(in.Bounds())
+	for y := in.Rect.Min.Y; y < in.Rect.Max.Y; y++ {
+		for x := in.Rect.Min.X; x < in.Rect.Max.X; x++ {
+			p := out.Pix[out.PixOffset(x, y):]
+			yi := in.YOffset(x, y)
+			ci := in.COffset(x, y)
+			r, g, b := color.YCbCrToRGB(in.Y[yi], in.Cb[ci], in.Cr[ci])
+			p[0], p[1], p[2], p[3] = r, g, b, 0xff
+		}
+	}
+	return out
 }
diff --git a/filters.go b/filters.go
index deb66a7..a6bd4e8 100644
--- a/filters.go
+++ b/filters.go
@@ -17,222 +17,100 @@ THIS SOFTWARE.
 package resize
 
 import (
-	"image"
-	"image/color"
 	"math"
 )
 
-// restrict an input float32 to the range of uint16 values
-func clampToUint16(x float32) (y uint16) {
-	y = uint16(x)
-	if x < 0 {
-		y = 0
-	} else if x > float32(0xfffe) {
-		// "else if x > float32(0xffff)" will cause overflows!
-		y = 0xffff
+func nearest(in float64) float64 {
+	if in >= -0.5 && in < 0.5 {
+		return 1
 	}
-
-	return
+	return 0
 }
 
-// describe a resampling filter
-type filterModel struct {
-	// resampling is done by convolution with a (scaled) kernel
-	kernel func(float32) float32
-
-	// instead of blurring an image before downscaling to avoid aliasing,
-	// the filter is scaled by a factor which leads to a similar effect
-	factorInv float32
-
-	// for optimized access to image points
-	converter
-
-	// temporary used by Interpolate
-	tempRow []colorArray
-
-	kernelWeight []float32
-	weightSum    float32
-}
-
-func (f *filterModel) SetKernelWeights(u float32) {
-	uf := int(u) - len(f.tempRow)/2 + 1
-	u -= float32(uf)
-	f.weightSum = 0
-
-	for j := range f.tempRow {
-		f.kernelWeight[j] = f.kernel((u - float32(j)) * f.factorInv)
-		f.weightSum += f.kernelWeight[j]
+func linear(in float64) float64 {
+	in = math.Abs(in)
+	if in <= 1 {
+		return 1 - in
 	}
+	return 0
 }
 
-func (f *filterModel) convolution1d() (c colorArray) {
-	for j := range f.tempRow {
-		for i := range c {
-			c[i] += f.tempRow[j][i] * f.kernelWeight[j]
+func cubic(in float64) float64 {
+	in = math.Abs(in)
+	if in <= 1 {
+		return in*in*(1.5*in-2.5) + 1.0
+	}
+	if in <= 2 {
+		return in*(in*(2.5-0.5*in)-4.0) + 2.0
+	}
+	return 0
+}
+
+func mitchellnetravali(in float64) float64 {
+	in = math.Abs(in)
+	if in <= 1 {
+		return (7.0*in*in*in - 12.0*in*in + 5.33333333333) * 0.16666666666
+	}
+	if in <= 2 {
+		return (-2.33333333333*in*in*in + 12.0*in*in - 20.0*in + 10.6666666667) * 0.16666666666
+	}
+	return 0
+}
+
+func sinc(x float64) float64 {
+	x = math.Abs(x) * math.Pi
+	if x >= 1.220703e-4 {
+		return math.Sin(x) / x
+	}
+	return 1
+}
+
+func lanczos2(in float64) float64 {
+	if in > -2 && in < 2 {
+		return sinc(in) * sinc(in*0.5)
+	}
+	return 0
+}
+
+func lanczos3(in float64) float64 {
+	if in > -3 && in < 3 {
+		return sinc(in) * sinc(in*0.3333333333333333)
+	}
+	return 0
+}
+
+// createWeights8 returns dy rows of kernel weights scaled to the range [-256,256] and the effective filter length.
+func createWeights8(dy, minx, filterLength int, blur, scale float64, kernel func(float64) float64) ([]int16, int) {
+	filterLength = filterLength * int(math.Max(math.Ceil(blur*scale), 1))
+	filterFactor := math.Min(1./(blur*scale), 1)
+
+	coeffs := make([]int16, dy*filterLength)
+	for y := 0; y < dy; y++ {
+		interpX := scale*(float64(y)+0.5) + float64(minx)
+		start := int(interpX) - filterLength/2 + 1
+		for i := 0; i < filterLength; i++ {
+			in := (interpX - float64(start) - float64(i)) * filterFactor
+			coeffs[y*filterLength+i] = int16(kernel(in) * 256)
 		}
 	}
 
-	// normalize values
-	for i := range c {
-		c[i] = c[i] / f.weightSum
-	}
-
-	return
+	return coeffs, filterLength
 }
 
-func (f *filterModel) Interpolate(u float32, y int) color.RGBA64 {
-	uf := int(u) - len(f.tempRow)/2 + 1
-	u -= float32(uf)
+// createWeights16 returns dy rows of kernel weights scaled to the range [-65536,65536] and the effective filter length.
+func createWeights16(dy, minx, filterLength int, blur, scale float64, kernel func(float64) float64) ([]int32, int) {
+	filterLength = filterLength * int(math.Max(math.Ceil(blur*scale), 1))
+	filterFactor := math.Min(1./(blur*scale), 1)
 
-	for i := range f.tempRow {
-		f.at(uf+i, y, &f.tempRow[i])
-	}
-
-	c := f.convolution1d()
-	return color.RGBA64{
-		clampToUint16(c[0]),
-		clampToUint16(c[1]),
-		clampToUint16(c[2]),
-		clampToUint16(c[3]),
-	}
-}
-
-// createFilter tries to find an optimized converter for the given input image
-// and initializes all filterModel members to their defaults
-func createFilter(img image.Image, factor float32, size int, kernel func(float32) float32) (f Filter) {
-	sizeX := size * (int(math.Ceil(float64(factor))))
-
-	switch img.(type) {
-	default:
-		f = &filterModel{
-			kernel, 1. / factor,
-			&genericConverter{img},
-			make([]colorArray, sizeX),
-			make([]float32, sizeX),
-			0,
-		}
-	case *image.RGBA:
-		f = &filterModel{
-			kernel, 1. / factor,
-			&rgbaConverter{img.(*image.RGBA)},
-			make([]colorArray, sizeX),
-			make([]float32, sizeX),
-			0,
-		}
-	case *image.RGBA64:
-		f = &filterModel{
-			kernel, 1. / factor,
-			&rgba64Converter{img.(*image.RGBA64)},
-			make([]colorArray, sizeX),
-			make([]float32, sizeX),
-			0,
-		}
-	case *image.Gray:
-		f = &filterModel{
-			kernel, 1. / factor,
-			&grayConverter{img.(*image.Gray)},
-			make([]colorArray, sizeX),
-			make([]float32, sizeX),
-			0,
-		}
-	case *image.Gray16:
-		f = &filterModel{
-			kernel, 1. / factor,
-			&gray16Converter{img.(*image.Gray16)},
-			make([]colorArray, sizeX),
-			make([]float32, sizeX),
-			0,
-		}
-	case *image.YCbCr:
-		f = &filterModel{
-			kernel, 1. / factor,
-			&ycbcrConverter{img.(*image.YCbCr)},
-			make([]colorArray, sizeX),
-			make([]float32, sizeX),
-			0,
+	coeffs := make([]int32, dy*filterLength)
+	for y := 0; y < dy; y++ {
+		interpX := scale*(float64(y)+0.5) + float64(minx)
+		start := int(interpX) - filterLength/2 + 1
+		for i := 0; i < filterLength; i++ {
+			in := (interpX - float64(start) - float64(i)) * filterFactor
+			coeffs[y*filterLength+i] = int32(kernel(in) * 65536)
 		}
 	}
 
-	return
-}
-
-// Nearest-neighbor interpolation
-func NearestNeighbor(img image.Image, factor float32) Filter {
-	return createFilter(img, factor, 2, func(x float32) (y float32) {
-		if x >= -0.5 && x < 0.5 {
-			y = 1
-		} else {
-			y = 0
-		}
-
-		return
-	})
-}
-
-// Bilinear interpolation
-func Bilinear(img image.Image, factor float32) Filter {
-	return createFilter(img, factor, 2, func(x float32) (y float32) {
-		absX := float32(math.Abs(float64(x)))
-		if absX <= 1 {
-			y = 1 - absX
-		} else {
-			y = 0
-		}
-
-		return
-	})
-}
-
-// Bicubic interpolation (with cubic hermite spline)
-func Bicubic(img image.Image, factor float32) Filter {
-	return createFilter(img, factor, 4, splineKernel(0, 0.5))
-}
-
-// Mitchell-Netravali interpolation
-func MitchellNetravali(img image.Image, factor float32) Filter {
-	return createFilter(img, factor, 4, splineKernel(1.0/3.0, 1.0/3.0))
-}
-
-func splineKernel(B, C float32) func(float32) float32 {
-	factorA := 2.0 - 1.5*B - C
-	factorB := -3.0 + 2.0*B + C
-	factorC := 1.0 - 1.0/3.0*B
-	factorD := -B/6.0 - C
-	factorE := B + 5.0*C
-	factorF := -2.0*B - 8.0*C
-	factorG := 4.0/3.0*B + 4.0*C
-	return func(x float32) (y float32) {
-		absX := float32(math.Abs(float64(x)))
-		if absX <= 1 {
-			y = absX*absX*(factorA*absX+factorB) + factorC
-		} else if absX <= 2 {
-			y = absX*(absX*(absX*factorD+factorE)+factorF) + factorG
-		} else {
-			y = 0
-		}
-
-		return
-	}
-}
-
-func lanczosKernel(a uint) func(float32) float32 {
-	return func(x float32) (y float32) {
-		if x > -float32(a) && x < float32(a) {
-			y = float32(Sinc(float64(x))) * float32(Sinc(float64(x/float32(a))))
-		} else {
-			y = 0
-		}
-
-		return
-	}
-}
-
-// Lanczos interpolation (a=2)
-func Lanczos2(img image.Image, factor float32) Filter {
-	return createFilter(img, factor, 4, lanczosKernel(2))
-}
-
-// Lanczos interpolation (a=3)
-func Lanczos3(img image.Image, factor float32) Filter {
-	return createFilter(img, factor, 6, lanczosKernel(3))
+	return coeffs, filterLength
 }
diff --git a/resize.go b/resize.go
index 89ae6ba..0c3ec9a 100644
--- a/resize.go
+++ b/resize.go
@@ -26,124 +26,275 @@ package resize
 
 import (
 	"image"
-	"image/color"
 	"runtime"
+	"sync"
 )
 
-// Filter can interpolate at points (x,y)
-type Filter interface {
-	SetKernelWeights(u float32)
-	Interpolate(u float32, y int) color.RGBA64
+// An InterpolationFunction provides the parameters that describe an
+// interpolation kernel. It returns the number of samples to take
+// and the kernel function to use for sampling.
+type InterpolationFunction func() (int, func(float64) float64)
+
+// Nearest-neighbor interpolation
+func NearestNeighbor() (int, func(float64) float64) {
+	return 2, nearest
 }
 
-// InterpolationFunction return a Filter implementation
-// that operates on an image. Two factors
-// allow to scale the filter kernels in x- and y-direction
-// to prevent moire patterns.
-type InterpolationFunction func(image.Image, float32) Filter
+// Bilinear interpolation
+func Bilinear() (int, func(float64) float64) {
+	return 2, linear
+}
 
-// Resize an image to new width and height using the interpolation function interp.
+// Bicubic interpolation (with cubic hermite spline)
+func Bicubic() (int, func(float64) float64) {
+	return 4, cubic
+}
+
+// Mitchell-Netravali interpolation
+func MitchellNetravali() (int, func(float64) float64) {
+	return 4, mitchellnetravali
+}
+
+// Lanczos interpolation (a=2)
+func Lanczos2() (int, func(float64) float64) {
+	return 4, lanczos2
+}
+
+// Lanczos interpolation (a=3)
+func Lanczos3() (int, func(float64) float64) {
+	return 6, lanczos3
+}
+
+// blur widens the filter kernel relative to the scale factor; values <1 will sharpen the image.
+var blur = 1.0
+
+// Resize scales an image to new width and height using the interpolation function interp.
 // A new image with the given dimensions will be returned.
 // If one of the parameters width or height is set to 0, its size will be calculated so that
 // the aspect ratio is that of the originating image.
-// The resizing algorithm uses channels for parallel computation.
+// Both filter passes are split across runtime.NumCPU() goroutines.
 func Resize(width, height uint, img image.Image, interp InterpolationFunction) image.Image {
-	oldBounds := img.Bounds()
-	oldWidth := float32(oldBounds.Dx())
-	oldHeight := float32(oldBounds.Dy())
-	scaleX, scaleY := calcFactors(width, height, oldWidth, oldHeight)
-
-	tempImg := image.NewRGBA64(image.Rect(0, 0, oldBounds.Dy(), int(0.7+oldWidth/scaleX)))
-	b := tempImg.Bounds()
-	adjust := 0.5 * ((oldWidth-1.0)/scaleX - float32(b.Dy()-1))
-
-	n := numJobs(b.Dy())
-	c := make(chan int, n)
-	for i := 0; i < n; i++ {
-		slice := image.Rect(b.Min.X, b.Min.Y+i*(b.Dy())/n, b.Max.X, b.Min.Y+(i+1)*(b.Dy())/n)
-		go resizeSlice(img, tempImg, interp, scaleX, adjust, float32(oldBounds.Min.X), oldBounds.Min.Y, slice, c)
+	scaleX, scaleY := calcFactors(width, height, float64(img.Bounds().Dx()), float64(img.Bounds().Dy()))
+	if width == 0 {
+		width = uint(0.7 + float64(img.Bounds().Dx())/scaleX)
 	}
-	for i := 0; i < n; i++ {
-		<-c
+	if height == 0 {
+		height = uint(0.7 + float64(img.Bounds().Dy())/scaleY)
 	}
 
-	resultImg := image.NewRGBA64(image.Rect(0, 0, int(0.7+oldWidth/scaleX), int(0.7+oldHeight/scaleY)))
-	b = resultImg.Bounds()
-	adjust = 0.5 * ((oldHeight-1.0)/scaleY - float32(b.Dy()-1))
+	taps, kernel := interp()
+	cpus := runtime.NumCPU()
+	wg := sync.WaitGroup{}
 
-	for i := 0; i < n; i++ {
-		slice := image.Rect(b.Min.X, b.Min.Y+i*(b.Dy())/n, b.Max.X, b.Min.Y+(i+1)*(b.Dy())/n)
-		go resizeSlice(tempImg, resultImg, interp, scaleY, adjust, 0, 0, slice, c)
-	}
-	for i := 0; i < n; i++ {
-		<-c
-	}
+	// Generic access to image.Image is slow in tight loops.
+	// The optimal access has to be determined from the concrete image type.
+	switch input := img.(type) {
+	case *image.RGBA:
+		// 8-bit precision
+		temp := image.NewRGBA(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
+		result := image.NewRGBA(image.Rect(0, 0, int(width), int(height)))
 
-	return resultImg
-}
-
-// Resize a rectangle image slice
-func resizeSlice(input image.Image, output *image.RGBA64, interp InterpolationFunction, scale, adjust, xoffset float32, yoffset int, slice image.Rectangle, c chan int) {
-	filter := interp(input, float32(clampFactor(scale)))
-	var u float32
-	var color color.RGBA64
-	for y := slice.Min.Y; y < slice.Max.Y; y++ {
-		u = scale*(float32(y)+adjust) + xoffset
-		filter.SetKernelWeights(u)
-		for x := slice.Min.X; x < slice.Max.X; x++ {
-			color = filter.Interpolate(u, x+yoffset)
-			i := output.PixOffset(x, y)
-			output.Pix[i+0] = uint8(color.R >> 8)
-			output.Pix[i+1] = uint8(color.R)
-			output.Pix[i+2] = uint8(color.G >> 8)
-			output.Pix[i+3] = uint8(color.G)
-			output.Pix[i+4] = uint8(color.B >> 8)
-			output.Pix[i+5] = uint8(color.B)
-			output.Pix[i+6] = uint8(color.A >> 8)
-			output.Pix[i+7] = uint8(color.A)
+		// horizontal filter, results in transposed temporary image
+		coeffs, filterLength := createWeights8(temp.Bounds().Dy(), input.Bounds().Min.X, taps, blur, scaleX, kernel)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(temp, i, cpus).(*image.RGBA)
+			go func() {
+				defer wg.Done()
+				resizeRGBA(input, slice, scaleX, coeffs, filterLength)
+			}()
 		}
-	}
+		wg.Wait()
 
-	c <- 1
+		// horizontal filter on transposed image, result is not transposed
+		coeffs, filterLength = createWeights8(result.Bounds().Dy(), temp.Bounds().Min.X, taps, blur, scaleY, kernel)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(result, i, cpus).(*image.RGBA)
+			go func() {
+				defer wg.Done()
+				resizeRGBA(temp, slice, scaleY, coeffs, filterLength)
+			}()
+		}
+		wg.Wait()
+		return result
+	case *image.YCbCr:
+		// 8-bit precision
+		// accessing the YCbCr arrays in a tight loop is slow.
+		// converting the image before filtering will improve performance.
+		inputAsRGBA := convertYCbCrToRGBA(input)
+		temp := image.NewRGBA(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
+		result := image.NewRGBA(image.Rect(0, 0, int(width), int(height)))
+
+		// horizontal filter, results in transposed temporary image
+		coeffs, filterLength := createWeights8(temp.Bounds().Dy(), input.Bounds().Min.X, taps, blur, scaleX, kernel)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(temp, i, cpus).(*image.RGBA)
+			go func() {
+				defer wg.Done()
+				resizeRGBA(inputAsRGBA, slice, scaleX, coeffs, filterLength)
+			}()
+		}
+		wg.Wait()
+
+		// horizontal filter on transposed image, result is not transposed
+		coeffs, filterLength = createWeights8(result.Bounds().Dy(), temp.Bounds().Min.X, taps, blur, scaleY, kernel)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(result, i, cpus).(*image.RGBA)
+			go func() {
+				defer wg.Done()
+				resizeRGBA(temp, slice, scaleY, coeffs, filterLength)
+			}()
+		}
+		wg.Wait()
+		return result
+	case *image.RGBA64:
+		// 16-bit precision
+		temp := image.NewRGBA64(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
+		result := image.NewRGBA64(image.Rect(0, 0, int(width), int(height)))
+
+		// horizontal filter, results in transposed temporary image
+		coeffs, filterLength := createWeights16(temp.Bounds().Dy(), input.Bounds().Min.X, taps, blur, scaleX, kernel)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(temp, i, cpus).(*image.RGBA64)
+			go func() {
+				defer wg.Done()
+				resizeRGBA64(input, slice, scaleX, coeffs, filterLength)
+			}()
+		}
+		wg.Wait()
+
+		// horizontal filter on transposed image, result is not transposed
+		coeffs, filterLength = createWeights16(result.Bounds().Dy(), temp.Bounds().Min.X, taps, blur, scaleY, kernel)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(result, i, cpus).(*image.RGBA64)
+			go func() {
+				defer wg.Done()
+				resizeRGBA64(temp, slice, scaleY, coeffs, filterLength)
+			}()
+		}
+		wg.Wait()
+		return result
+	case *image.Gray:
+		// 8-bit precision
+		temp := image.NewGray(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
+		result := image.NewGray(image.Rect(0, 0, int(width), int(height)))
+
+		// horizontal filter, results in transposed temporary image
+		coeffs, filterLength := createWeights8(temp.Bounds().Dy(), input.Bounds().Min.X, taps, blur, scaleX, kernel)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(temp, i, cpus).(*image.Gray)
+			go func() {
+				defer wg.Done()
+				resizeGray(input, slice, scaleX, coeffs, filterLength)
+			}()
+		}
+		wg.Wait()
+
+		// horizontal filter on transposed image, result is not transposed
+		coeffs, filterLength = createWeights8(result.Bounds().Dy(), temp.Bounds().Min.X, taps, blur, scaleY, kernel)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(result, i, cpus).(*image.Gray)
+			go func() {
+				defer wg.Done()
+				resizeGray(temp, slice, scaleY, coeffs, filterLength)
+			}()
+		}
+		wg.Wait()
+		return result
+	case *image.Gray16:
+		// 16-bit precision
+		temp := image.NewGray16(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
+		result := image.NewGray16(image.Rect(0, 0, int(width), int(height)))
+
+		// horizontal filter, results in transposed temporary image
+		coeffs, filterLength := createWeights16(temp.Bounds().Dy(), input.Bounds().Min.X, taps, blur, scaleX, kernel)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(temp, i, cpus).(*image.Gray16)
+			go func() {
+				defer wg.Done()
+				resizeGray16(input, slice, scaleX, coeffs, filterLength)
+			}()
+		}
+		wg.Wait()
+
+		// horizontal filter on transposed image, result is not transposed
+		coeffs, filterLength = createWeights16(result.Bounds().Dy(), temp.Bounds().Min.X, taps, blur, scaleY, kernel)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(result, i, cpus).(*image.Gray16)
+			go func() {
+				defer wg.Done()
+				resizeGray16(temp, slice, scaleY, coeffs, filterLength)
+			}()
+		}
+		wg.Wait()
+		return result
+	default:
+		// 16-bit precision
+		temp := image.NewRGBA64(image.Rect(0, 0, img.Bounds().Dy(), int(width)))
+		result := image.NewRGBA64(image.Rect(0, 0, int(width), int(height)))
+
+		// horizontal filter, results in transposed temporary image
+		coeffs, filterLength := createWeights16(temp.Bounds().Dy(), img.Bounds().Min.X, taps, blur, scaleX, kernel)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(temp, i, cpus).(*image.RGBA64)
+			go func() {
+				defer wg.Done()
+				resizeGeneric(img, slice, scaleX, coeffs, filterLength)
+			}()
+		}
+		wg.Wait()
+
+		// horizontal filter on transposed image, result is not transposed
+		coeffs, filterLength = createWeights16(result.Bounds().Dy(), temp.Bounds().Min.X, taps, blur, scaleY, kernel)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(result, i, cpus).(*image.RGBA64)
+			go func() {
+				defer wg.Done()
+				resizeRGBA64(temp, slice, scaleY, coeffs, filterLength)
+			}()
+		}
+		wg.Wait()
+		return result
+	}
 }
 
-// Calculate scaling factors using old and new image dimensions.
-func calcFactors(width, height uint, oldWidth, oldHeight float32) (scaleX, scaleY float32) {
+// Calculates scaling factors using old and new image dimensions.
+func calcFactors(width, height uint, oldWidth, oldHeight float64) (scaleX, scaleY float64) {
 	if width == 0 {
 		if height == 0 {
 			scaleX = 1.0
 			scaleY = 1.0
 		} else {
-			scaleY = oldHeight / float32(height)
+			scaleY = oldHeight / float64(height)
 			scaleX = scaleY
 		}
 	} else {
-		scaleX = oldWidth / float32(width)
+		scaleX = oldWidth / float64(width)
 		if height == 0 {
 			scaleY = scaleX
 		} else {
-			scaleY = oldHeight / float32(height)
+			scaleY = oldHeight / float64(height)
 		}
 	}
 	return
 }
 
-// Set filter scaling factor to avoid moire patterns.
-// This is only useful in case of downscaling (factor>1).
-func clampFactor(factor float32) float32 {
-	if factor < 1 {
-		factor = 1
-	}
-	return factor
+type imageWithSubImage interface {
+	image.Image
+	SubImage(image.Rectangle) image.Image
 }
 
-// Set number of parallel jobs
-// but prevent resize from doing too much work
-// if #CPUs > width
-func numJobs(d int) (n int) {
-	n = runtime.NumCPU()
-	if n > d {
-		n = d
-	}
-	return
+func makeSlice(img imageWithSubImage, i, n int) image.Image {
+	return img.SubImage(image.Rect(img.Bounds().Min.X, img.Bounds().Min.Y+i*img.Bounds().Dy()/n, img.Bounds().Max.X, img.Bounds().Min.Y+(i+1)*img.Bounds().Dy()/n))
 }
diff --git a/resize_test.go b/resize_test.go
index 17cd51c..283b89c 100644
--- a/resize_test.go
+++ b/resize_test.go
@@ -14,13 +14,6 @@ func init() {
 	img.Set(1, 1, color.White)
 }
 
-func Test_Nearest(t *testing.T) {
-	m := Resize(6, 0, img, NearestNeighbor)
-	if m.At(1, 1) == m.At(2, 2) {
-		t.Fail()
-	}
-}
-
 func Test_Param1(t *testing.T) {
 	m := Resize(0, 0, img, NearestNeighbor)
 	if m.Bounds() != img.Bounds() {
@@ -53,6 +46,24 @@ func Test_CorrectResize(t *testing.T) {
 	}
 }
 
+// Test_SameColor checks that resizing a uniform image keeps every output pixel at that color.
+func Test_SameColor(t *testing.T) {
+	img := image.NewRGBA(image.Rect(0, 0, 20, 20))
+	for y := img.Bounds().Min.Y; y < img.Bounds().Max.Y; y++ {
+		for x := img.Bounds().Min.X; x < img.Bounds().Max.X; x++ {
+			img.SetRGBA(x, y, color.RGBA{0x80, 0x80, 0x80, 0xFF})
+		}
+	}
+	out := Resize(10, 10, img, Lanczos3)
+	for y := out.Bounds().Min.Y; y < out.Bounds().Max.Y; y++ {
+		for x := out.Bounds().Min.X; x < out.Bounds().Max.X; x++ {
+			if c := out.At(x, y).(color.RGBA); c != (color.RGBA{0x80, 0x80, 0x80, 0xFF}) {
+				t.Errorf("pixel (%d,%d) = %v, want {128 128 128 255}", x, y, c)
+			}
+		}
+	}
+}
+
 func Benchmark_BigResizeLanczos3(b *testing.B) {
 	var m image.Image
 	for i := 0; i < b.N; i++ {
diff --git a/sinc.go b/sinc.go
deleted file mode 100644
index 723e6af..0000000
--- a/sinc.go
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
-Copyright (c) 2012, Jan Schlicht <jan.schlicht@gmail.com>
-
-Permission to use, copy, modify, and/or distribute this software for any purpose
-with or without fee is hereby granted, provided that the above copyright notice
-and this permission notice appear in all copies.
-
-THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
-REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
-FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
-INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
-THIS SOFTWARE.
-*/
-
-package resize
-
-import (
-	"math"
-)
-
-var (
-	epsilon      = math.Nextafter(1.0, 2.0) - 1.0 // machine epsilon
-	taylor2bound = math.Sqrt(epsilon)
-	taylorNbound = math.Sqrt(taylor2bound)
-)
-
-// unnormalized sinc function
-func Sinc1(x float64) (y float64) {
-	if math.Abs(x) >= taylorNbound {
-		y = math.Sin(x) / x
-	} else {
-		y = 1.0
-		if math.Abs(x) >= epsilon {
-			x2 := x * x
-			y -= x2 / 6.0
-			if math.Abs(x) >= taylor2bound {
-				y += (x2 * x2) / 120.0
-			}
-		}
-	}
-	return
-}
-
-// normalized sinc function
-func Sinc(x float64) float64 {
-	return Sinc1(x * math.Pi)
-}
diff --git a/sinc_test.go b/sinc_test.go
deleted file mode 100644
index 9372853..0000000
--- a/sinc_test.go
+++ /dev/null
@@ -1,38 +0,0 @@
-package resize
-
-import (
-	"fmt"
-	"math"
-	"testing"
-)
-
-const limit = 1e-12
-
-func Test_SincOne(t *testing.T) {
-	zero := Sinc(1)
-	if zero >= limit {
-		t.Error("Sinc(1) != 0")
-	}
-}
-
-func Test_SincZero(t *testing.T) {
-	one := Sinc(0)
-	if math.Abs(one-1) >= limit {
-		t.Error("Sinc(0) != 1")
-	}
-}
-
-func Test_SincDotOne(t *testing.T) {
-	res := Sinc(0.1)
-	if math.Abs(res-0.983631643083466) >= limit {
-		t.Error("Sinc(0.1) wrong")
-	}
-}
-
-func Test_SincNearZero(t *testing.T) {
-	res := Sinc(0.000001)
-	if math.Abs(res-0.9999999999983551) >= limit {
-		fmt.Println(res)
-		t.Error("Sinc near zero not stable")
-	}
-}