From 69afd001f792d732a78bd7225793315a8deb09ea Mon Sep 17 00:00:00 2001
From: Nigel Tao <nigeltao@golang.org>
Date: Sat, 21 Jan 2017 16:32:42 +1100
Subject: [PATCH] font/sfnt: parse the cmap table.

Change-Id: I757d42c9caf419f549696543f0f156cfe3dbfe1a
Reviewed-on: https://go-review.googlesource.com/35512
Reviewed-by: David Crawshaw <crawshaw@golang.org>
---
 font/sfnt/cmap.go      | 198 +++++++++++++++++++++++++++++++++++++++++
 font/sfnt/sfnt.go      | 196 ++++++++++++++++++++++++++++++----------
 font/sfnt/sfnt_test.go |  70 ++++++++++++++-
 3 files changed, 414 insertions(+), 50 deletions(-)
 create mode 100644 font/sfnt/cmap.go

diff --git a/font/sfnt/cmap.go b/font/sfnt/cmap.go
new file mode 100644
index 0000000..19b0a9d
--- /dev/null
+++ b/font/sfnt/cmap.go
@@ -0,0 +1,198 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sfnt
+
+import (
+	"unicode/utf8"
+
+	"golang.org/x/text/encoding/charmap"
+)
+
+// Platform IDs and Platform Specific IDs as per
+// https://www.microsoft.com/typography/otspec/name.htm
+const (
+	pidUnicode   = 0
+	pidMacintosh = 1
+	pidWindows   = 3
+
+	psidUnicode2BMPOnly        = 3
+	psidUnicode2FullRepertoire = 4
+
+	psidMacintoshRoman = 0
+
+	psidWindowsUCS2 = 1
+	psidWindowsUCS4 = 10
+)
+
+// platformEncodingWidth returns the number of bytes per character assumed by
+// the given Platform ID and Platform Specific ID.
+//
+// Very old fonts, from before Unicode was widely adopted, assume only 1 byte
+// per character: a character map.
+//
+// Old fonts, from when Unicode meant the Basic Multilingual Plane (BMP),
+// assume that 2 bytes per character is sufficient.
+//
+// Recent fonts naturally support the full range of Unicode code points, which
+// can take up to 4 bytes per character. Such fonts might still choose one of
+// the legacy encodings if e.g. their repertoire is limited to the BMP, for
+// greater compatibility with older software, or because the resultant file
+// size can be smaller.
+func platformEncodingWidth(pid, psid uint16) int {
+	// Every (Platform ID, Platform Specific ID) pair that this package knows
+	// about is listed below, grouped by platform. Any other pair is reported
+	// as having width 0.
+	switch {
+	// Unicode platform.
+	case pid == pidUnicode && psid == psidUnicode2BMPOnly:
+		return 2
+	case pid == pidUnicode && psid == psidUnicode2FullRepertoire:
+		return 4
+
+	// Macintosh platform.
+	case pid == pidMacintosh && psid == psidMacintoshRoman:
+		return 1
+
+	// Windows platform.
+	case pid == pidWindows && psid == psidWindowsUCS2:
+		return 2
+	case pid == pidWindows && psid == psidWindowsUCS4:
+		return 4
+
+	// Unrecognized platform encoding.
+	default:
+		return 0
+	}
+}
+
+// The various cmap formats are described at
+// https://www.microsoft.com/typography/otspec/cmap.htm
+
+var supportedCmapFormat = func(format, pid, psid uint16) bool {
+	// Format 0 is only accepted together with the Macintosh Roman encoding.
+	if format == 0 {
+		return pid == pidMacintosh && psid == psidMacintoshRoman
+	}
+	// Format 4 is accepted for every platform encoding.
+	if format == 4 {
+		return true
+	}
+	return false // TODO: implement format 12.
+}
+
+func (f *Font) makeCachedGlyphIndex(buf []byte, offset, length uint32, format uint16) ([]byte, error) {
+	// parseCmap only passes formats that supportedCmapFormat accepted.
+	if format == 0 {
+		return f.makeCachedGlyphIndexFormat0(buf, offset, length)
+	}
+	if format == 4 {
+		return f.makeCachedGlyphIndexFormat4(buf, offset, length)
+	}
+	// TODO: implement format 12, including a cmapEntry32 type (32, not 16).
+	panic("unreachable")
+}
+
+// makeCachedGlyphIndexFormat0 checks a format 0 subtable (6 header bytes plus
+// 256 one-byte glyph indexes) and installs f.cached.glyphIndex to look it up.
+func (f *Font) makeCachedGlyphIndexFormat0(buf []byte, offset, length uint32) ([]byte, error) {
+	if length != 6+256 || offset+length > f.cmap.length {
+		return nil, errInvalidCmapTable
+	}
+	buf, err := f.src.view(buf, int(f.cmap.offset+offset), int(length))
+	if err != nil {
+		return nil, err
+	}
+	var table [256]byte
+	copy(table[:], buf[6:])
+	f.cached.glyphIndex = func(f *Font, b *Buffer, r rune) (GlyphIndex, error) {
+		// Declare err with := so that it is local to each call. Assigning to the
+		// enclosing function's err would be a data race between goroutines.
+		//
+		// TODO: improve the golang.org/x/text/encoding/charmap API so that
+		// encoding one rune as one byte doesn't allocate an Encoder per call.
+		var dst, src [utf8.UTFMax]byte
+		n := utf8.EncodeRune(src[:], r)
+		_, _, err := charmap.Macintosh.NewEncoder().Transform(dst[:], src[:n], true)
+		if err != nil {
+			// The source rune r is not representable in the Macintosh-Roman encoding.
+			return 0, nil
+		}
+		return GlyphIndex(table[dst[0]]), nil
+	}
+	return buf, nil
+}
+
+// makeCachedGlyphIndexFormat4 parses the segments of a format 4 subtable and
+// installs an f.cached.glyphIndex closure that binary-searches them.
+func (f *Font) makeCachedGlyphIndexFormat4(buf []byte, offset, length uint32) ([]byte, error) {
+	const headerSize = 14
+	if offset+headerSize > f.cmap.length {
+		return nil, errInvalidCmapTable
+	}
+	buf, err := f.src.view(buf, int(f.cmap.offset+offset), headerSize)
+	if err != nil {
+		return nil, err
+	}
+	offset += headerSize
+
+	nSegs := u16(buf[6:])
+	if nSegs&1 != 0 {
+		return nil, errInvalidCmapTable
+	}
+	nSegs /= 2
+	if nSegs > maxCmapSegments {
+		return nil, errUnsupportedNumberOfCmapSegments
+	}
+
+	// Four uint16 arrays of nSegs elements, plus 2 bytes after the first one.
+	arraysLen := 8*uint32(nSegs) + 2
+	if offset+arraysLen > f.cmap.length {
+		return nil, errInvalidCmapTable
+	}
+	buf, err = f.src.view(buf, int(f.cmap.offset+offset), int(arraysLen))
+	if err != nil {
+		return nil, err
+	}
+	offset += arraysLen
+
+	segs := make([]cmapEntry16, nSegs)
+	for i := range segs {
+		segs[i].end = u16(buf[0*len(segs)+0+2*i:])
+		segs[i].start = u16(buf[2*len(segs)+2+2*i:])
+		segs[i].delta = u16(buf[4*len(segs)+2+2*i:])
+		segs[i].offset = u16(buf[6*len(segs)+2+2*i:])
+	}
+
+	f.cached.glyphIndex = func(f *Font, b *Buffer, r rune) (GlyphIndex, error) {
+		// Format 4 character codes are 16 bits wide.
+		if uint32(r) > 0xffff {
+			return 0, nil
+		}
+		code := uint16(r)
+		lo, hi := 0, len(segs)
+		for lo < hi {
+			mid := lo + (hi-lo)/2
+			switch seg := &segs[mid]; {
+			case code < seg.start:
+				hi = mid
+			case seg.end < code:
+				lo = mid + 1
+			case seg.offset == 0:
+				return GlyphIndex(code + seg.delta), nil
+			default:
+				// TODO: support the glyphIdArray as per
+				// https://www.microsoft.com/typography/OTSPEC/cmap.htm
+				// This will probably use the *Font and *Buffer arguments.
+				return 0, errUnsupportedCmapFormat
+			}
+		}
+		return 0, nil
+	}
+	return buf, nil
+}
+
+type cmapEntry16 struct {
+	end, start, delta, offset uint16 // One segment of a format 4 cmap subtable.
+}
diff --git a/font/sfnt/sfnt.go b/font/sfnt/sfnt.go
index d4929a8..83065b1 100644
--- a/font/sfnt/sfnt.go
+++ b/font/sfnt/sfnt.go
@@ -13,6 +13,9 @@ package sfnt // import "golang.org/x/image/font/sfnt"
 //
 // The pyftinspect tool from https://github.com/fonttools/fonttools is useful
 // for inspecting SFNT fonts.
+//
+// The ttfdump tool is also useful. For example:
+//	ttfdump -t cmap ../testdata/CFFTest.otf dump.txt
 
 import (
 	"errors"
@@ -25,6 +28,7 @@ import (
 // These constants are not part of the specifications, but are limitations used
 // by this implementation.
 const (
+	maxCmapSegments     = 1024
 	maxGlyphDataLength  = 64 * 1024
 	maxHintBits         = 256
 	maxNumTables        = 256
@@ -41,6 +45,7 @@ var (
 
 	errInvalidBounds        = errors.New("sfnt: invalid bounds")
 	errInvalidCFFTable      = errors.New("sfnt: invalid CFF table")
+	errInvalidCmapTable     = errors.New("sfnt: invalid cmap table")
 	errInvalidGlyphData     = errors.New("sfnt: invalid glyph data")
 	errInvalidHeadTable     = errors.New("sfnt: invalid head table")
 	errInvalidLocaTable     = errors.New("sfnt: invalid loca table")
@@ -53,15 +58,18 @@ var (
 	errInvalidUCS2String    = errors.New("sfnt: invalid UCS-2 string")
 	errInvalidVersion       = errors.New("sfnt: invalid version")
 
-	errUnsupportedCFFVersion         = errors.New("sfnt: unsupported CFF version")
-	errUnsupportedCompoundGlyph      = errors.New("sfnt: unsupported compound glyph")
-	errUnsupportedGlyphDataLength    = errors.New("sfnt: unsupported glyph data length")
-	errUnsupportedRealNumberEncoding = errors.New("sfnt: unsupported real number encoding")
-	errUnsupportedNumberOfHints      = errors.New("sfnt: unsupported number of hints")
-	errUnsupportedNumberOfTables     = errors.New("sfnt: unsupported number of tables")
-	errUnsupportedPlatformEncoding   = errors.New("sfnt: unsupported platform encoding")
-	errUnsupportedTableOffsetLength  = errors.New("sfnt: unsupported table offset or length")
-	errUnsupportedType2Charstring    = errors.New("sfnt: unsupported Type 2 Charstring")
+	errUnsupportedCFFVersion           = errors.New("sfnt: unsupported CFF version")
+	errUnsupportedCmapEncodings        = errors.New("sfnt: unsupported cmap encodings")
+	errUnsupportedCmapFormat           = errors.New("sfnt: unsupported cmap format")
+	errUnsupportedCompoundGlyph        = errors.New("sfnt: unsupported compound glyph")
+	errUnsupportedGlyphDataLength      = errors.New("sfnt: unsupported glyph data length")
+	errUnsupportedRealNumberEncoding   = errors.New("sfnt: unsupported real number encoding")
+	errUnsupportedNumberOfCmapSegments = errors.New("sfnt: unsupported number of cmap segments")
+	errUnsupportedNumberOfHints        = errors.New("sfnt: unsupported number of hints")
+	errUnsupportedNumberOfTables       = errors.New("sfnt: unsupported number of tables")
+	errUnsupportedPlatformEncoding     = errors.New("sfnt: unsupported platform encoding")
+	errUnsupportedTableOffsetLength    = errors.New("sfnt: unsupported table offset or length")
+	errUnsupportedType2Charstring      = errors.New("sfnt: unsupported Type 2 Charstring")
 )
 
 // GlyphIndex is a glyph index in a Font.
@@ -107,17 +115,6 @@ const (
 // display resolution (DPI) and font size (e.g. a 12 point font).
 type Units int32
 
-// Platform IDs and Platform Specific IDs as per
-// https://www.microsoft.com/typography/otspec/name.htm
-const (
-	pidMacintosh = 1
-	pidWindows   = 3
-
-	psidMacintoshRoman = 0
-
-	psidWindowsUCS2 = 1
-)
-
 func u16(b []byte) uint16 {
 	_ = b[1] // Bounds check hint to compiler.
 	return uint16(b[0])<<8 | uint16(b[1])<<0
@@ -286,6 +283,7 @@ type Font struct {
 	// TODO: hdmx, kern, vmtx? Others?
 
 	cached struct {
+		glyphIndex       func(f *Font, b *Buffer, r rune) (GlyphIndex, error)
 		indexToLocFormat bool // false means short, true means long.
 		isPostScript     bool
 		unitsPerEm       Units
@@ -306,18 +304,36 @@ func (f *Font) initialize() error {
 	if !f.src.valid() {
 		return errInvalidSourceData
 	}
-	var buf []byte
+	buf, err := f.initializeTables(nil)
+	if err != nil {
+		return err
+	}
+	buf, err = f.parseHead(buf)
+	if err != nil {
+		return err
+	}
+	buf, err = f.parseMaxp(buf)
+	if err != nil {
+		return err
+	}
+	buf, err = f.parseCmap(buf)
+	if err != nil {
+		return err
+	}
+	return nil
+}
 
+func (f *Font) initializeTables(buf []byte) ([]byte, error) {
 	// https://www.microsoft.com/typography/otspec/otff.htm "Organization of an
 	// OpenType Font" says that "The OpenType font starts with the Offset
 	// Table", which is 12 bytes.
 	buf, err := f.src.view(buf, 0, 12)
 	if err != nil {
-		return err
+		return nil, err
 	}
 	switch u32(buf) {
 	default:
-		return errInvalidVersion
+		return nil, errInvalidVersion
 	case 0x00010000:
 		// No-op.
 	case 0x4f54544f: // "OTTO".
@@ -325,32 +341,32 @@ func (f *Font) initialize() error {
 	}
 	numTables := int(u16(buf[4:]))
 	if numTables > maxNumTables {
-		return errUnsupportedNumberOfTables
+		return nil, errUnsupportedNumberOfTables
 	}
 
 	// "The Offset Table is followed immediately by the Table Record entries...
 	// sorted in ascending order by tag", 16 bytes each.
 	buf, err = f.src.view(buf, 12, 16*numTables)
 	if err != nil {
-		return err
+		return nil, err
 	}
 	for b, first, prevTag := buf, true, uint32(0); len(b) > 0; b = b[16:] {
 		tag := u32(b)
 		if first {
 			first = false
 		} else if tag <= prevTag {
-			return errInvalidTableTagOrder
+			return nil, errInvalidTableTagOrder
 		}
 		prevTag = tag
 
 		o, n := u32(b[8:12]), u32(b[12:16])
 		if o > maxTableOffset || n > maxTableLength {
-			return errUnsupportedTableOffsetLength
+			return nil, errUnsupportedTableOffsetLength
 		}
 		// We ignore the checksums, but "all tables must begin on four byte
 		// boundries [sic]".
 		if o&3 != 0 {
-			return errInvalidTableOffset
+			return nil, errInvalidTableOffset
 		}
 
 		// Match the 4-byte tag as a uint32. For example, "OS/2" is 0x4f532f32.
@@ -379,40 +395,109 @@ func (f *Font) initialize() error {
 			f.post = table{o, n}
 		}
 	}
+	return buf, nil
+}
 
-	var u uint16
+func (f *Font) parseCmap(buf []byte) ([]byte, error) {
+	// https://www.microsoft.com/typography/OTSPEC/cmap.htm
 
-	// https://www.microsoft.com/typography/otspec/head.htm
-	if f.head.length != 54 {
-		return errInvalidHeadTable
+	const headerSize, entrySize = 4, 8
+	if f.cmap.length < headerSize {
+		return nil, errInvalidCmapTable
 	}
-	u, err = f.src.u16(buf, f.head, 18)
+	u, err := f.src.u16(buf, f.cmap, 2)
 	if err != nil {
-		return err
+		return nil, err
+	}
+	numSubtables := int(u)
+	if f.cmap.length < headerSize+entrySize*uint32(numSubtables) {
+		return nil, errInvalidCmapTable
+	}
+	// The best candidate so far. A bestWidth of 0 means that none was found.
+	var (
+		bestWidth  int
+		bestOffset uint32
+		bestLength uint32
+		bestFormat uint16
+	)
+
+	// Scan all of the subtables, picking the widest supported one. See the
+	// platformEncodingWidth comment for more discussion of width.
+	for i := 0; i < numSubtables; i++ {
+		buf, err = f.src.view(buf, int(f.cmap.offset)+headerSize+entrySize*i, entrySize)
+		if err != nil {
+			return nil, err
+		}
+		pid := u16(buf)
+		psid := u16(buf[2:])
+		width := platformEncodingWidth(pid, psid)
+		if width <= bestWidth {
+			continue
+		}
+		offset := u32(buf[4:])
+		// Read the subtable's first 4 bytes: a uint16 format and a uint16 length.
+		if offset > f.cmap.length-4 {
+			return nil, errInvalidCmapTable
+		}
+		buf, err = f.src.view(buf, int(f.cmap.offset+offset), 4)
+		if err != nil {
+			return nil, err
+		}
+		format := u16(buf)
+		if !supportedCmapFormat(format, pid, psid) {
+			continue
+		}
+		length := uint32(u16(buf[2:]))
+		// This subtable is supported and wider than any candidate seen before.
+		bestWidth = width
+		bestOffset = offset
+		bestLength = length
+		bestFormat = format
+	}
+
+	if bestWidth == 0 {
+		return nil, errUnsupportedCmapEncodings
+	}
+	return f.makeCachedGlyphIndex(buf, bestOffset, bestLength, bestFormat)
+}
+
+func (f *Font) parseHead(buf []byte) ([]byte, error) {
+	// https://www.microsoft.com/typography/otspec/head.htm
+	// Caches unitsPerEm (offset 18, must be non-zero) and indexToLocFormat (50).
+	if f.head.length != 54 {
+		return nil, errInvalidHeadTable
+	}
+	u, err := f.src.u16(buf, f.head, 18)
+	if err != nil {
+		return nil, err
 	}
 	if u == 0 {
-		return errInvalidHeadTable
+		return nil, errInvalidHeadTable
 	}
 	f.cached.unitsPerEm = Units(u)
 	u, err = f.src.u16(buf, f.head, 50)
 	if err != nil {
-		return err
+		return nil, err
 	}
 	f.cached.indexToLocFormat = u != 0
+	return buf, nil
+}
 
+func (f *Font) parseMaxp(buf []byte) ([]byte, error) {
 	// https://www.microsoft.com/typography/otspec/maxp.htm
+
 	if f.cached.isPostScript {
 		if f.maxp.length != 6 {
-			return errInvalidMaxpTable
+			return nil, errInvalidMaxpTable
 		}
 	} else {
 		if f.maxp.length != 32 {
-			return errInvalidMaxpTable
+			return nil, errInvalidMaxpTable
 		}
 	}
-	u, err = f.src.u16(buf, f.maxp, 4)
+	u, err := f.src.u16(buf, f.maxp, 4)
 	if err != nil {
-		return err
+		return nil, err
 	}
 	numGlyphs := int(u)
 
@@ -425,23 +510,36 @@ func (f *Font) initialize() error {
 		}
 		f.cached.locations, err = p.parse()
 		if err != nil {
-			return err
+			return nil, err
 		}
 	} else {
 		f.cached.locations, err = parseLoca(
 			&f.src, f.loca, f.glyf.offset, f.cached.indexToLocFormat, numGlyphs)
 		if err != nil {
-			return err
+			return nil, err
 		}
 	}
 	if len(f.cached.locations) != numGlyphs+1 {
-		return errInvalidLocationData
+		return nil, errInvalidLocationData
 	}
-	return nil
+
+	return buf, nil
 }
 
-// TODO: func (f *Font) GlyphIndex(r rune) (x GlyphIndex, ok bool)
-// This will require parsing the cmap table.
+// TODO: API for looking up glyph variants?? For example, some fonts may
+// provide both slashed and dotted zero glyphs ('0'), or regular and 'old
+// style' numerals, and users can direct software to choose a variant.
+
+// GlyphIndex returns the glyph index for the given rune, or (0, nil) if there
+// is no glyph for r. https://www.microsoft.com/typography/OTSPEC/cmap.htm
+// says that "Character codes that do not correspond to any glyph in the font
+// should be mapped to glyph index 0. The glyph at this location must be a
+// special glyph representing a missing character, commonly known as .notdef."
+// It returns a non-nil error if the lookup needs unsupported cmap data.
+func (f *Font) GlyphIndex(b *Buffer, r rune) (GlyphIndex, error) {
+	lookup := f.cached.glyphIndex
+	return lookup(f, b, r)
+}
 
 func (f *Font) viewGlyphData(b *Buffer, x GlyphIndex) ([]byte, error) {
 	xx := int(x)
@@ -512,14 +610,14 @@ func (f *Font) Name(b *Buffer, id NameID) (string, error) {
 	if err != nil {
 		return "", err
 	}
-	nSubtables := u16(buf[2:])
-	if f.name.length < headerSize+entrySize*uint32(nSubtables) {
+	numSubtables := u16(buf[2:])
+	if f.name.length < headerSize+entrySize*uint32(numSubtables) {
 		return "", errInvalidNameTable
 	}
 	stringOffset := u16(buf[4:])
 
 	seen := false
-	for i, n := 0, int(nSubtables); i < n; i++ {
+	for i, n := 0, int(numSubtables); i < n; i++ {
 		buf, err := b.view(&f.src, int(f.name.offset)+headerSize+entrySize*i, entrySize)
 		if err != nil {
 			return "", err
diff --git a/font/sfnt/sfnt_test.go b/font/sfnt/sfnt_test.go
index 4ffd72f..4b72221 100644
--- a/font/sfnt/sfnt_test.go
+++ b/font/sfnt/sfnt_test.go
@@ -88,6 +88,74 @@ func testTrueType(t *testing.T, f *Font) {
 	}
 }
 
+func TestGlyphIndex(t *testing.T) {
+	src, readErr := ioutil.ReadFile(filepath.FromSlash("../testdata/CFFTest.otf"))
+	if readErr != nil {
+		t.Fatal(readErr)
+	}
+	// A negative format leaves supportedCmapFormat untouched.
+	for _, cmapFormat := range [...]int{-1, 0, 4} {
+		testGlyphIndex(t, src, cmapFormat)
+	}
+}
+
+func testGlyphIndex(t *testing.T, data []byte, cmapFormat int) {
+	// A non-negative cmapFormat narrows supportedCmapFormat to that one format
+	// for the duration of this call. The previous value is restored on return.
+	if cmapFormat >= 0 {
+		saved := supportedCmapFormat
+		defer func() { supportedCmapFormat = saved }()
+		supportedCmapFormat = func(format, pid, psid uint16) bool {
+			return int(format) == cmapFormat && saved(format, pid, psid)
+		}
+	}
+
+	f, err := Parse(data)
+	if err != nil {
+		t.Errorf("cmapFormat=%d: %v", cmapFormat, err)
+		return
+	}
+
+	testCases := []struct {
+		r    rune
+		want GlyphIndex
+	}{
+		{'0', 1},
+		{'1', 2},
+		{'Q', 3},
+		// TODO: add the U+00E0 non-ASCII Latin-1 Supplement rune to
+		// CFFTest.otf and change 0 to something non-zero.
+		{'\u00e0', 0},
+		{'\u4e2d', 4},
+		// TODO: add a rune >= U+00010000 to CFFTest.otf?
+
+		// Glyphs that aren't present in CFFTest.otf.
+		{'?', 0},
+		{'\ufffd', 0},
+		{'\U0001f4a9', 0},
+	}
+
+	// The same Buffer is passed to every GlyphIndex call.
+	var b Buffer
+	for _, tc := range testCases {
+		// cmap format 0, with the Macintosh Roman encoding, can't represent
+		// U+4E2D.
+		want := tc.want
+		if cmapFormat == 0 && tc.r == '\u4e2d' {
+			want = 0
+		}
+
+		got, err := f.GlyphIndex(&b, tc.r)
+		// Report at most one failure per test case.
+		switch {
+		case err != nil:
+			t.Errorf("cmapFormat=%d, r=%q: %v", cmapFormat, tc.r, err)
+		case got != want:
+			t.Errorf("cmapFormat=%d, r=%q: got %d, want %d", cmapFormat, tc.r, got, want)
+		}
+	}
+}
+
 func TestPostScriptSegments(t *testing.T) {
 	// wants' vectors correspond 1-to-1 to what's in the CFFTest.sfd file,
 	// although OpenType/CFF and FontForge's SFD have reversed orders.
@@ -226,7 +294,7 @@ func TestTrueTypeSegments(t *testing.T) {
 }
 
 func testSegments(t *testing.T, filename string, wants [][]Segment) {
-	data, err := ioutil.ReadFile(filepath.Join("..", "testdata", filename))
+	data, err := ioutil.ReadFile(filepath.FromSlash("../testdata/" + filename))
 	if err != nil {
 		t.Fatal(err)
 	}