From 69afd001f792d732a78bd7225793315a8deb09ea Mon Sep 17 00:00:00 2001 From: Nigel Tao Date: Sat, 21 Jan 2017 16:32:42 +1100 Subject: [PATCH] font/sfnt: parse the cmap table. Change-Id: I757d42c9caf419f549696543f0f156cfe3dbfe1a Reviewed-on: https://go-review.googlesource.com/35512 Reviewed-by: David Crawshaw --- font/sfnt/cmap.go | 198 +++++++++++++++++++++++++++++++++++++++++ font/sfnt/sfnt.go | 196 ++++++++++++++++++++++++++++++---------- font/sfnt/sfnt_test.go | 70 ++++++++++++++- 3 files changed, 414 insertions(+), 50 deletions(-) create mode 100644 font/sfnt/cmap.go diff --git a/font/sfnt/cmap.go b/font/sfnt/cmap.go new file mode 100644 index 0000000..19b0a9d --- /dev/null +++ b/font/sfnt/cmap.go @@ -0,0 +1,198 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package sfnt + +import ( + "unicode/utf8" + + "golang.org/x/text/encoding/charmap" +) + +// Platform IDs and Platform Specific IDs as per +// https://www.microsoft.com/typography/otspec/name.htm +const ( + pidUnicode = 0 + pidMacintosh = 1 + pidWindows = 3 + + psidUnicode2BMPOnly = 3 + psidUnicode2FullRepertoire = 4 + + psidMacintoshRoman = 0 + + psidWindowsUCS2 = 1 + psidWindowsUCS4 = 10 +) + +// platformEncodingWidth returns the number of bytes per character assumed by +// the given Platform ID and Platform Specific ID. +// +// Very old fonts, from before Unicode was widely adopted, assume only 1 byte +// per character: a character map. +// +// Old fonts, from when Unicode meant the Basic Multilingual Plane (BMP), +// assume that 2 bytes per character is sufficient. +// +// Recent fonts naturally support the full range of Unicode code points, which +// can take up to 4 bytes per character. Such fonts might still choose one of +// the legacy encodings if e.g. their repertoire is limited to the BMP, for +// greater compatibility with older software, or because the resultant file +// size can be smaller. +func platformEncodingWidth(pid, psid uint16) int { + switch pid { + case pidUnicode: + switch psid { + case psidUnicode2BMPOnly: + return 2 + case psidUnicode2FullRepertoire: + return 4 + } + + case pidMacintosh: + switch psid { + case psidMacintoshRoman: + return 1 + } + + case pidWindows: + switch psid { + case psidWindowsUCS2: + return 2 + case psidWindowsUCS4: + return 4 + } + } + return 0 +} + +// The various cmap formats are described at +// https://www.microsoft.com/typography/otspec/cmap.htm + +var supportedCmapFormat = func(format, pid, psid uint16) bool { + switch format { + case 0: + return pid == pidMacintosh && psid == psidMacintoshRoman + case 4: + return true + case 12: + // TODO: implement. + } + return false +} + +func (f *Font) makeCachedGlyphIndex(buf []byte, offset, length uint32, format uint16) ([]byte, error) { + switch format { + case 0: + return f.makeCachedGlyphIndexFormat0(buf, offset, length) + case 4: + return f.makeCachedGlyphIndexFormat4(buf, offset, length) + case 12: + // TODO: implement, including a cmapEntry32 type (32, not 16). + } + panic("unreachable") +} + +func (f *Font) makeCachedGlyphIndexFormat0(buf []byte, offset, length uint32) ([]byte, error) { + if length != 6+256 || offset+length > f.cmap.length { + return nil, errInvalidCmapTable + } + var err error + buf, err = f.src.view(buf, int(f.cmap.offset+offset), int(length)) + if err != nil { + return nil, err + } + var table [256]byte + copy(table[:], buf[6:]) + f.cached.glyphIndex = func(f *Font, b *Buffer, r rune) (GlyphIndex, error) { + // TODO: for this closure to be goroutine-safe, the + // golang.org/x/text/encoding/charmap API needs to allocate a new + // Encoder and new []byte buffers, for every call to this closure, even + // though all we want to do is to encode one rune as one byte. We could + // possibly add some fields in the Buffer struct to re-use these + // allocations, but a better solution is to improve the charmap API. + var dst, src [utf8.UTFMax]byte + n := utf8.EncodeRune(src[:], r) + _, _, err = charmap.Macintosh.NewEncoder().Transform(dst[:], src[:n], true) + if err != nil { + // The source rune r is not representable in the Macintosh-Roman encoding. + return 0, nil + } + return GlyphIndex(table[dst[0]]), nil + } + return buf, nil +} + +func (f *Font) makeCachedGlyphIndexFormat4(buf []byte, offset, length uint32) ([]byte, error) { + const headerSize = 14 + if offset+headerSize > f.cmap.length { + return nil, errInvalidCmapTable + } + var err error + buf, err = f.src.view(buf, int(f.cmap.offset+offset), headerSize) + if err != nil { + return nil, err + } + offset += headerSize + + segCount := u16(buf[6:]) + if segCount&1 != 0 { + return nil, errInvalidCmapTable + } + segCount /= 2 + if segCount > maxCmapSegments { + return nil, errUnsupportedNumberOfCmapSegments + } + + eLength := 8*uint32(segCount) + 2 + if offset+eLength > f.cmap.length { + return nil, errInvalidCmapTable + } + buf, err = f.src.view(buf, int(f.cmap.offset+offset), int(eLength)) + if err != nil { + return nil, err + } + offset += eLength + + entries := make([]cmapEntry16, segCount) + for i := range entries { + entries[i] = cmapEntry16{ + end: u16(buf[0*len(entries)+0+2*i:]), + start: u16(buf[2*len(entries)+2+2*i:]), + delta: u16(buf[4*len(entries)+2+2*i:]), + offset: u16(buf[6*len(entries)+2+2*i:]), + } + } + + f.cached.glyphIndex = func(f *Font, b *Buffer, r rune) (GlyphIndex, error) { + if uint32(r) > 0xffff { + return 0, nil + } + + c := uint16(r) + for i, j := 0, len(entries); i < j; { + h := i + (j-i)/2 + entry := &entries[h] + if c < entry.start { + j = h + } else if entry.end < c { + i = h + 1 + } else if entry.offset == 0 { + return GlyphIndex(c + entry.delta), nil + } else { + // TODO: support the glyphIdArray as per + // https://www.microsoft.com/typography/OTSPEC/cmap.htm + // + // This will probably use the *Font and *Buffer arguments. + return 0, errUnsupportedCmapFormat + } + } + return 0, nil + } + return buf, nil +} + +type cmapEntry16 struct { + end, start, delta, offset uint16 +} diff --git a/font/sfnt/sfnt.go b/font/sfnt/sfnt.go index d4929a8..83065b1 100644 --- a/font/sfnt/sfnt.go +++ b/font/sfnt/sfnt.go @@ -13,6 +13,9 @@ package sfnt // import "golang.org/x/image/font/sfnt" // // The pyftinspect tool from https://github.com/fonttools/fonttools is useful // for inspecting SFNT fonts. +// +// The ttfdump tool is also useful. For example: +// ttfdump -t cmap ../testdata/CFFTest.otf dump.txt import ( "errors" @@ -25,6 +28,7 @@ import ( // These constants are not part of the specifications, but are limitations used // by this implementation. const ( + maxCmapSegments = 1024 maxGlyphDataLength = 64 * 1024 maxHintBits = 256 maxNumTables = 256 @@ -41,6 +45,7 @@ var ( errInvalidBounds = errors.New("sfnt: invalid bounds") errInvalidCFFTable = errors.New("sfnt: invalid CFF table") + errInvalidCmapTable = errors.New("sfnt: invalid cmap table") errInvalidGlyphData = errors.New("sfnt: invalid glyph data") errInvalidHeadTable = errors.New("sfnt: invalid head table") errInvalidLocaTable = errors.New("sfnt: invalid loca table") @@ -53,15 +58,18 @@ var ( errInvalidUCS2String = errors.New("sfnt: invalid UCS-2 string") errInvalidVersion = errors.New("sfnt: invalid version") - errUnsupportedCFFVersion = errors.New("sfnt: unsupported CFF version") - errUnsupportedCompoundGlyph = errors.New("sfnt: unsupported compound glyph") - errUnsupportedGlyphDataLength = errors.New("sfnt: unsupported glyph data length") - errUnsupportedRealNumberEncoding = errors.New("sfnt: unsupported real number encoding") - errUnsupportedNumberOfHints = errors.New("sfnt: unsupported number of hints") - errUnsupportedNumberOfTables = errors.New("sfnt: unsupported number of tables") - errUnsupportedPlatformEncoding = errors.New("sfnt: unsupported platform encoding") - errUnsupportedTableOffsetLength = errors.New("sfnt: unsupported table offset or length") - errUnsupportedType2Charstring = errors.New("sfnt: unsupported Type 2 Charstring") + errUnsupportedCFFVersion = errors.New("sfnt: unsupported CFF version") + errUnsupportedCmapEncodings = errors.New("sfnt: unsupported cmap encodings") + errUnsupportedCmapFormat = errors.New("sfnt: unsupported cmap format") + errUnsupportedCompoundGlyph = errors.New("sfnt: unsupported compound glyph") + errUnsupportedGlyphDataLength = errors.New("sfnt: unsupported glyph data length") + errUnsupportedRealNumberEncoding = errors.New("sfnt: unsupported real number encoding") + errUnsupportedNumberOfCmapSegments = errors.New("sfnt: unsupported number of cmap segments") + errUnsupportedNumberOfHints = errors.New("sfnt: unsupported number of hints") + errUnsupportedNumberOfTables = errors.New("sfnt: unsupported number of tables") + errUnsupportedPlatformEncoding = errors.New("sfnt: unsupported platform encoding") + errUnsupportedTableOffsetLength = errors.New("sfnt: unsupported table offset or length") + errUnsupportedType2Charstring = errors.New("sfnt: unsupported Type 2 Charstring") ) // GlyphIndex is a glyph index in a Font. @@ -107,17 +115,6 @@ const ( // display resolution (DPI) and font size (e.g. a 12 point font). type Units int32 -// Platform IDs and Platform Specific IDs as per -// https://www.microsoft.com/typography/otspec/name.htm -const ( - pidMacintosh = 1 - pidWindows = 3 - - psidMacintoshRoman = 0 - - psidWindowsUCS2 = 1 -) - func u16(b []byte) uint16 { _ = b[1] // Bounds check hint to compiler. return uint16(b[0])<<8 | uint16(b[1])<<0 @@ -286,6 +283,7 @@ type Font struct { // TODO: hdmx, kern, vmtx? Others? cached struct { + glyphIndex func(f *Font, b *Buffer, r rune) (GlyphIndex, error) indexToLocFormat bool // false means short, true means long. isPostScript bool unitsPerEm Units @@ -306,18 +304,36 @@ func (f *Font) initialize() error { if !f.src.valid() { return errInvalidSourceData } - var buf []byte + buf, err := f.initializeTables(nil) + if err != nil { + return err + } + buf, err = f.parseHead(buf) + if err != nil { + return err + } + buf, err = f.parseMaxp(buf) + if err != nil { + return err + } + buf, err = f.parseCmap(buf) + if err != nil { + return err + } + return nil +} +func (f *Font) initializeTables(buf []byte) ([]byte, error) { // https://www.microsoft.com/typography/otspec/otff.htm "Organization of an // OpenType Font" says that "The OpenType font starts with the Offset // Table", which is 12 bytes. buf, err := f.src.view(buf, 0, 12) if err != nil { - return err + return nil, err } switch u32(buf) { default: - return errInvalidVersion + return nil, errInvalidVersion case 0x00010000: // No-op. case 0x4f54544f: // "OTTO". @@ -325,32 +341,32 @@ func (f *Font) initialize() error { } numTables := int(u16(buf[4:])) if numTables > maxNumTables { - return errUnsupportedNumberOfTables + return nil, errUnsupportedNumberOfTables } // "The Offset Table is followed immediately by the Table Record entries... // sorted in ascending order by tag", 16 bytes each. buf, err = f.src.view(buf, 12, 16*numTables) if err != nil { - return err + return nil, err } for b, first, prevTag := buf, true, uint32(0); len(b) > 0; b = b[16:] { tag := u32(b) if first { first = false } else if tag <= prevTag { - return errInvalidTableTagOrder + return nil, errInvalidTableTagOrder } prevTag = tag o, n := u32(b[8:12]), u32(b[12:16]) if o > maxTableOffset || n > maxTableLength { - return errUnsupportedTableOffsetLength + return nil, errUnsupportedTableOffsetLength } // We ignore the checksums, but "all tables must begin on four byte // boundries [sic]". if o&3 != 0 { - return errInvalidTableOffset + return nil, errInvalidTableOffset } // Match the 4-byte tag as a uint32. For example, "OS/2" is 0x4f532f32. @@ -379,40 +395,109 @@ func (f *Font) initialize() error { f.post = table{o, n} } } + return buf, nil +} - var u uint16 +func (f *Font) parseCmap(buf []byte) ([]byte, error) { + // https://www.microsoft.com/typography/OTSPEC/cmap.htm - // https://www.microsoft.com/typography/otspec/head.htm - if f.head.length != 54 { - return errInvalidHeadTable + const headerSize, entrySize = 4, 8 + if f.cmap.length < headerSize { + return nil, errInvalidCmapTable } - u, err = f.src.u16(buf, f.head, 18) + u, err := f.src.u16(buf, f.cmap, 2) if err != nil { - return err + return nil, err + } + numSubtables := int(u) + if f.cmap.length < headerSize+entrySize*uint32(numSubtables) { + return nil, errInvalidCmapTable + } + + var ( + bestWidth int + bestOffset uint32 + bestLength uint32 + bestFormat uint16 + ) + + // Scan all of the subtables, picking the widest supported one. See the + // platformEncodingWidth comment for more discussion of width. + for i := 0; i < numSubtables; i++ { + buf, err = f.src.view(buf, int(f.cmap.offset)+headerSize+entrySize*i, entrySize) + if err != nil { + return nil, err + } + pid := u16(buf) + psid := u16(buf[2:]) + width := platformEncodingWidth(pid, psid) + if width <= bestWidth { + continue + } + offset := u32(buf[4:]) + + if offset > f.cmap.length-4 { + return nil, errInvalidCmapTable + } + buf, err = f.src.view(buf, int(f.cmap.offset+offset), 4) + if err != nil { + return nil, err + } + format := u16(buf) + if !supportedCmapFormat(format, pid, psid) { + continue + } + length := uint32(u16(buf[2:])) + + bestWidth = width + bestOffset = offset + bestLength = length + bestFormat = format + } + + if bestWidth == 0 { + return nil, errUnsupportedCmapEncodings + } + return f.makeCachedGlyphIndex(buf, bestOffset, bestLength, bestFormat) +} + +func (f *Font) parseHead(buf []byte) ([]byte, error) { + // https://www.microsoft.com/typography/otspec/head.htm + + if f.head.length != 54 { + return nil, errInvalidHeadTable + } + u, err := f.src.u16(buf, f.head, 18) + if err != nil { + return nil, err } if u == 0 { - return errInvalidHeadTable + return nil, errInvalidHeadTable } f.cached.unitsPerEm = Units(u) u, err = f.src.u16(buf, f.head, 50) if err != nil { - return err + return nil, err } f.cached.indexToLocFormat = u != 0 + return buf, nil +} +func (f *Font) parseMaxp(buf []byte) ([]byte, error) { // https://www.microsoft.com/typography/otspec/maxp.htm + if f.cached.isPostScript { if f.maxp.length != 6 { - return errInvalidMaxpTable + return nil, errInvalidMaxpTable } } else { if f.maxp.length != 32 { - return errInvalidMaxpTable + return nil, errInvalidMaxpTable } } - u, err = f.src.u16(buf, f.maxp, 4) + u, err := f.src.u16(buf, f.maxp, 4) if err != nil { - return err + return nil, err } numGlyphs := int(u) @@ -425,23 +510,36 @@ func (f *Font) initialize() error { } f.cached.locations, err = p.parse() if err != nil { - return err + return nil, err } } else { f.cached.locations, err = parseLoca( &f.src, f.loca, f.glyf.offset, f.cached.indexToLocFormat, numGlyphs) if err != nil { - return err + return nil, err } } if len(f.cached.locations) != numGlyphs+1 { - return errInvalidLocationData + return nil, errInvalidLocationData } - return nil + + return buf, nil } -// TODO: func (f *Font) GlyphIndex(r rune) (x GlyphIndex, ok bool) -// This will require parsing the cmap table. +// TODO: API for looking up glyph variants?? For example, some fonts may +// provide both slashed and dotted zero glyphs ('0'), or regular and 'old +// style' numerals, and users can direct software to choose a variant. + +// GlyphIndex returns the glyph index for the given rune. +// +// It returns (0, nil) if there is no glyph for r. +// https://www.microsoft.com/typography/OTSPEC/cmap.htm says that "Character +// codes that do not correspond to any glyph in the font should be mapped to +// glyph index 0. The glyph at this location must be a special glyph +// representing a missing character, commonly known as .notdef." +func (f *Font) GlyphIndex(b *Buffer, r rune) (GlyphIndex, error) { + return f.cached.glyphIndex(f, b, r) +} func (f *Font) viewGlyphData(b *Buffer, x GlyphIndex) ([]byte, error) { xx := int(x) @@ -512,14 +610,14 @@ func (f *Font) Name(b *Buffer, id NameID) (string, error) { if err != nil { return "", err } - nSubtables := u16(buf[2:]) - if f.name.length < headerSize+entrySize*uint32(nSubtables) { + numSubtables := u16(buf[2:]) + if f.name.length < headerSize+entrySize*uint32(numSubtables) { return "", errInvalidNameTable } stringOffset := u16(buf[4:]) seen := false - for i, n := 0, int(nSubtables); i < n; i++ { + for i, n := 0, int(numSubtables); i < n; i++ { buf, err := b.view(&f.src, int(f.name.offset)+headerSize+entrySize*i, entrySize) if err != nil { return "", err diff --git a/font/sfnt/sfnt_test.go b/font/sfnt/sfnt_test.go index 4ffd72f..4b72221 100644 --- a/font/sfnt/sfnt_test.go +++ b/font/sfnt/sfnt_test.go @@ -88,6 +88,74 @@ func testTrueType(t *testing.T, f *Font) { } } +func TestGlyphIndex(t *testing.T) { + data, err := ioutil.ReadFile(filepath.FromSlash("../testdata/CFFTest.otf")) + if err != nil { + t.Fatal(err) + } + + for _, format := range []int{-1, 0, 4} { + testGlyphIndex(t, data, format) + } +} + +func testGlyphIndex(t *testing.T, data []byte, cmapFormat int) { + if cmapFormat >= 0 { + originalSupportedCmapFormat := supportedCmapFormat + defer func() { + supportedCmapFormat = originalSupportedCmapFormat + }() + supportedCmapFormat = func(format, pid, psid uint16) bool { + return int(format) == cmapFormat && originalSupportedCmapFormat(format, pid, psid) + } + } + + f, err := Parse(data) + if err != nil { + t.Errorf("cmapFormat=%d: %v", cmapFormat, err) + return + } + + testCases := []struct { + r rune + want GlyphIndex + }{ + {'0', 1}, + {'1', 2}, + {'Q', 3}, + // TODO: add the U+00E0 non-ASCII Latin-1 Supplement rune to + // CFFTest.otf and change 0 to something non-zero. + {'\u00e0', 0}, + {'\u4e2d', 4}, + // TODO: add a rune >= U+00010000 to CFFTest.otf? + + // Glyphs that aren't present in CFFTest.otf. + {'?', 0}, + {'\ufffd', 0}, + {'\U0001f4a9', 0}, + } + + var b Buffer + for _, tc := range testCases { + want := tc.want + // cmap format 0, with the Macintosh Roman encoding, can't represent + // U+4E2D. + if cmapFormat == 0 && tc.r == '\u4e2d' { + want = 0 + } + + got, err := f.GlyphIndex(&b, tc.r) + if err != nil { + t.Errorf("cmapFormat=%d, r=%q: %v", cmapFormat, tc.r, err) + continue + } + if got != want { + t.Errorf("cmapFormat=%d, r=%q: got %d, want %d", cmapFormat, tc.r, got, want) + continue + } + } +} + func TestPostScriptSegments(t *testing.T) { // wants' vectors correspond 1-to-1 to what's in the CFFTest.sfd file, // although OpenType/CFF and FontForge's SFD have reversed orders. @@ -226,7 +294,7 @@ func TestTrueTypeSegments(t *testing.T) { } func testSegments(t *testing.T, filename string, wants [][]Segment) { - data, err := ioutil.ReadFile(filepath.Join("..", "testdata", filename)) + data, err := ioutil.ReadFile(filepath.FromSlash("../testdata/" + filename)) if err != nil { t.Fatal(err) }