font/sfnt: parse the cmap table.

Change-Id: I757d42c9caf419f549696543f0f156cfe3dbfe1a
Reviewed-on: https://go-review.googlesource.com/35512
Reviewed-by: David Crawshaw <crawshaw@golang.org>
This commit is contained in:
Nigel Tao 2017-01-21 16:32:42 +11:00
parent 83686c5479
commit 69afd001f7
3 changed files with 414 additions and 50 deletions

198
font/sfnt/cmap.go Normal file
View File

@ -0,0 +1,198 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package sfnt
import (
"unicode/utf8"
"golang.org/x/text/encoding/charmap"
)
// Platform IDs and Platform Specific IDs as per
// https://www.microsoft.com/typography/otspec/name.htm
const (
	// Platform IDs.
	pidUnicode   = 0
	pidMacintosh = 1
	pidWindows   = 3

	// Platform Specific IDs for pidUnicode.
	psidUnicode2BMPOnly        = 3
	psidUnicode2FullRepertoire = 4

	// Platform Specific IDs for pidMacintosh.
	psidMacintoshRoman = 0

	// Platform Specific IDs for pidWindows.
	psidWindowsUCS2 = 1
	psidWindowsUCS4 = 10
)

// platformEncodingWidth reports how many bytes per character the encoding
// identified by the (Platform ID, Platform Specific ID) pair assumes. It
// returns 0 for any pair that this package does not recognize.
//
// The three possible non-zero widths reflect three generations of fonts:
//
//   - 1 byte: very old fonts, from before Unicode was widely adopted, which
//     use a single-byte character map.
//   - 2 bytes: old fonts, from when Unicode meant the Basic Multilingual
//     Plane (BMP).
//   - 4 bytes: recent fonts, which can address the full range of Unicode code
//     points.
//
// A recent font might still pick one of the narrower legacy encodings, e.g.
// when its repertoire fits in the BMP, for compatibility with older software,
// or because the resulting file can be smaller.
func platformEncodingWidth(pid, psid uint16) int {
	switch {
	case pid == pidMacintosh && psid == psidMacintoshRoman:
		return 1
	case pid == pidUnicode && psid == psidUnicode2BMPOnly,
		pid == pidWindows && psid == psidWindowsUCS2:
		return 2
	case pid == pidUnicode && psid == psidUnicode2FullRepertoire,
		pid == pidWindows && psid == psidWindowsUCS4:
		return 4
	}
	return 0
}
// supportedCmapFormat reports whether this package can decode a cmap subtable
// of the given format for the given Platform ID and Platform Specific ID.
//
// It is a variable holding a function, rather than a plain function, so that
// it can be swapped out (for example, to restrict parsing to one format).
//
// The various cmap formats are described at
// https://www.microsoft.com/typography/otspec/cmap.htm
var supportedCmapFormat = func(format, pid, psid uint16) bool {
	if format == 0 {
		// Format 0 is only accepted for the Macintosh Roman encoding.
		return pid == pidMacintosh && psid == psidMacintoshRoman
	}
	// Format 4 is accepted for every platform encoding. Everything else is
	// rejected, including format 12.
	//
	// TODO: implement format 12.
	return format == 4
}
// makeCachedGlyphIndex dispatches to the format-specific builder that parses
// the cmap subtable at the given offset (relative to the start of the cmap
// table) and installs f.cached.glyphIndex. It returns the builder's results
// unchanged.
//
// The format is expected to be one for which a builder exists (0 or 4); any
// other value panics.
func (f *Font) makeCachedGlyphIndex(buf []byte, offset, length uint32, format uint16) ([]byte, error) {
	if format == 0 {
		return f.makeCachedGlyphIndexFormat0(buf, offset, length)
	}
	if format == 4 {
		return f.makeCachedGlyphIndexFormat4(buf, offset, length)
	}
	// TODO: implement format 12, including a cmapEntry32 type (32, not 16).
	panic("unreachable")
}
// makeCachedGlyphIndexFormat0 parses a format 0 cmap subtable and installs a
// rune-to-glyph lookup function in f.cached.glyphIndex.
//
// offset is relative to the start of the cmap table and length is the
// subtable's declared length, which must be exactly 6+256 bytes: the first 6
// bytes are skipped and the remaining 256 bytes are copied into a per-byte
// glyph index table.
//
// It returns the (possibly re-allocated) scratch buffer, or
// errInvalidCmapTable if the subtable has the wrong length or does not fit
// inside the cmap table.
func (f *Font) makeCachedGlyphIndexFormat0(buf []byte, offset, length uint32) ([]byte, error) {
	if length != 6+256 || offset+length > f.cmap.length {
		return nil, errInvalidCmapTable
	}
	var err error
	buf, err = f.src.view(buf, int(f.cmap.offset+offset), int(length))
	if err != nil {
		return nil, err
	}
	// Copy the table out of buf, since buf is scratch space that the caller
	// gets back and may overwrite.
	var table [256]byte
	copy(table[:], buf[6:])
	f.cached.glyphIndex = func(f *Font, b *Buffer, r rune) (GlyphIndex, error) {
		// TODO: for this closure to be goroutine-safe, the
		// golang.org/x/text/encoding/charmap API needs to allocate a new
		// Encoder and new []byte buffers, for every call to this closure, even
		// though all we want to do is to encode one rune as one byte. We could
		// possibly add some fields in the Buffer struct to re-use these
		// allocations, but a better solution is to improve the charmap API.
		var dst, src [utf8.UTFMax]byte
		n := utf8.EncodeRune(src[:], r)
		// The error must be local to this call. Assigning to the enclosing
		// function's err variable would make every lookup write to the same
		// captured variable, which is a data race when lookups run
		// concurrently.
		if _, _, err := charmap.Macintosh.NewEncoder().Transform(dst[:], src[:n], true); err != nil {
			// The source rune r is not representable in the Macintosh-Roman encoding.
			return 0, nil
		}
		return GlyphIndex(table[dst[0]]), nil
	}
	return buf, nil
}
// makeCachedGlyphIndexFormat4 parses a format 4 (segment mapping) cmap
// subtable and installs a rune-to-glyph lookup function in
// f.cached.glyphIndex.
//
// offset is relative to the start of the cmap table. The length argument is
// currently unused; bounds are checked against the cmap table's length
// instead.
//
// It returns the (possibly re-allocated) scratch buffer. It fails with
// errInvalidCmapTable if the subtable does not fit in the cmap table or has an
// odd doubled segment count, and with errUnsupportedNumberOfCmapSegments if
// there are more than maxCmapSegments segments.
func (f *Font) makeCachedGlyphIndexFormat4(buf []byte, offset, length uint32) ([]byte, error) {
	const headerSize = 14
	if offset+headerSize > f.cmap.length {
		return nil, errInvalidCmapTable
	}
	var err error
	if buf, err = f.src.view(buf, int(f.cmap.offset+offset), headerSize); err != nil {
		return nil, err
	}
	offset += headerSize

	// The header stores twice the segment count, so the value must be even.
	segCountX2 := u16(buf[6:])
	if segCountX2%2 != 0 {
		return nil, errInvalidCmapTable
	}
	n := int(segCountX2 / 2)
	if n > maxCmapSegments {
		return nil, errUnsupportedNumberOfCmapSegments
	}

	// Four parallel arrays of n uint16 values follow the header, with one
	// extra 2 byte gap after the first array: 8*n + 2 bytes in total.
	arraysLength := uint32(8*n + 2)
	if offset+arraysLength > f.cmap.length {
		return nil, errInvalidCmapTable
	}
	if buf, err = f.src.view(buf, int(f.cmap.offset+offset), int(arraysLength)); err != nil {
		return nil, err
	}
	// offset now points just past the four arrays. Nothing reads it yet;
	// presumably this is where the glyphIdArray mentioned below begins.
	offset += arraysLength

	// Transpose the parallel arrays into one record per segment.
	startBase, deltaBase, offsetBase := 2*n+2, 4*n+2, 6*n+2
	segments := make([]cmapEntry16, n)
	for i := range segments {
		k := 2 * i
		segments[i] = cmapEntry16{
			end:    u16(buf[k:]),
			start:  u16(buf[startBase+k:]),
			delta:  u16(buf[deltaBase+k:]),
			offset: u16(buf[offsetBase+k:]),
		}
	}

	f.cached.glyphIndex = func(f *Font, b *Buffer, r rune) (GlyphIndex, error) {
		// Only 16 bit character codes can be looked up. The uint32 conversion
		// also rejects negative runes.
		if uint32(r) > 0xffff {
			return 0, nil
		}
		c := uint16(r)

		// Binary search for the segment whose [start, end] range contains c.
		lo, hi := 0, len(segments)
		for lo < hi {
			mid := lo + (hi-lo)/2
			seg := &segments[mid]
			switch {
			case c < seg.start:
				hi = mid
			case seg.end < c:
				lo = mid + 1
			case seg.offset == 0:
				// The addition is in uint16, so it wraps modulo 65536.
				return GlyphIndex(c + seg.delta), nil
			default:
				// TODO: support the glyphIdArray as per
				// https://www.microsoft.com/typography/OTSPEC/cmap.htm
				//
				// This will probably use the *Font and *Buffer arguments.
				return 0, errUnsupportedCmapFormat
			}
		}
		// No segment contains c.
		return 0, nil
	}
	return buf, nil
}
// cmapEntry16 is one segment of a format 4 cmap subtable, with 16 bit fields.
type cmapEntry16 struct {
	// end and start are the inclusive upper and lower character codes that
	// the segment covers.
	end   uint16
	start uint16
	// delta is added (in uint16 arithmetic) to a character code to get its
	// glyph index when offset is zero.
	delta uint16
	// offset is the segment's range offset value. A non-zero value means the
	// mapping goes through the glyphIdArray instead of using delta.
	offset uint16
}

View File

@ -13,6 +13,9 @@ package sfnt // import "golang.org/x/image/font/sfnt"
//
// The pyftinspect tool from https://github.com/fonttools/fonttools is useful
// for inspecting SFNT fonts.
//
// The ttfdump tool is also useful. For example:
// ttfdump -t cmap ../testdata/CFFTest.otf dump.txt
import (
"errors"
@ -25,6 +28,7 @@ import (
// These constants are not part of the specifications, but are limitations used
// by this implementation.
const (
maxCmapSegments = 1024
maxGlyphDataLength = 64 * 1024
maxHintBits = 256
maxNumTables = 256
@ -41,6 +45,7 @@ var (
errInvalidBounds = errors.New("sfnt: invalid bounds")
errInvalidCFFTable = errors.New("sfnt: invalid CFF table")
errInvalidCmapTable = errors.New("sfnt: invalid cmap table")
errInvalidGlyphData = errors.New("sfnt: invalid glyph data")
errInvalidHeadTable = errors.New("sfnt: invalid head table")
errInvalidLocaTable = errors.New("sfnt: invalid loca table")
@ -54,9 +59,12 @@ var (
errInvalidVersion = errors.New("sfnt: invalid version")
errUnsupportedCFFVersion = errors.New("sfnt: unsupported CFF version")
errUnsupportedCmapEncodings = errors.New("sfnt: unsupported cmap encodings")
errUnsupportedCmapFormat = errors.New("sfnt: unsupported cmap format")
errUnsupportedCompoundGlyph = errors.New("sfnt: unsupported compound glyph")
errUnsupportedGlyphDataLength = errors.New("sfnt: unsupported glyph data length")
errUnsupportedRealNumberEncoding = errors.New("sfnt: unsupported real number encoding")
errUnsupportedNumberOfCmapSegments = errors.New("sfnt: unsupported number of cmap segments")
errUnsupportedNumberOfHints = errors.New("sfnt: unsupported number of hints")
errUnsupportedNumberOfTables = errors.New("sfnt: unsupported number of tables")
errUnsupportedPlatformEncoding = errors.New("sfnt: unsupported platform encoding")
@ -107,17 +115,6 @@ const (
// display resolution (DPI) and font size (e.g. a 12 point font).
type Units int32
// Platform IDs and Platform Specific IDs as per
// https://www.microsoft.com/typography/otspec/name.htm
const (
pidMacintosh = 1
pidWindows = 3
psidMacintoshRoman = 0
psidWindowsUCS2 = 1
)
func u16(b []byte) uint16 {
_ = b[1] // Bounds check hint to compiler.
return uint16(b[0])<<8 | uint16(b[1])<<0
@ -286,6 +283,7 @@ type Font struct {
// TODO: hdmx, kern, vmtx? Others?
cached struct {
glyphIndex func(f *Font, b *Buffer, r rune) (GlyphIndex, error)
indexToLocFormat bool // false means short, true means long.
isPostScript bool
unitsPerEm Units
@ -306,18 +304,36 @@ func (f *Font) initialize() error {
if !f.src.valid() {
return errInvalidSourceData
}
var buf []byte
buf, err := f.initializeTables(nil)
if err != nil {
return err
}
buf, err = f.parseHead(buf)
if err != nil {
return err
}
buf, err = f.parseMaxp(buf)
if err != nil {
return err
}
buf, err = f.parseCmap(buf)
if err != nil {
return err
}
return nil
}
func (f *Font) initializeTables(buf []byte) ([]byte, error) {
// https://www.microsoft.com/typography/otspec/otff.htm "Organization of an
// OpenType Font" says that "The OpenType font starts with the Offset
// Table", which is 12 bytes.
buf, err := f.src.view(buf, 0, 12)
if err != nil {
return err
return nil, err
}
switch u32(buf) {
default:
return errInvalidVersion
return nil, errInvalidVersion
case 0x00010000:
// No-op.
case 0x4f54544f: // "OTTO".
@ -325,32 +341,32 @@ func (f *Font) initialize() error {
}
numTables := int(u16(buf[4:]))
if numTables > maxNumTables {
return errUnsupportedNumberOfTables
return nil, errUnsupportedNumberOfTables
}
// "The Offset Table is followed immediately by the Table Record entries...
// sorted in ascending order by tag", 16 bytes each.
buf, err = f.src.view(buf, 12, 16*numTables)
if err != nil {
return err
return nil, err
}
for b, first, prevTag := buf, true, uint32(0); len(b) > 0; b = b[16:] {
tag := u32(b)
if first {
first = false
} else if tag <= prevTag {
return errInvalidTableTagOrder
return nil, errInvalidTableTagOrder
}
prevTag = tag
o, n := u32(b[8:12]), u32(b[12:16])
if o > maxTableOffset || n > maxTableLength {
return errUnsupportedTableOffsetLength
return nil, errUnsupportedTableOffsetLength
}
// We ignore the checksums, but "all tables must begin on four byte
// boundries [sic]".
if o&3 != 0 {
return errInvalidTableOffset
return nil, errInvalidTableOffset
}
// Match the 4-byte tag as a uint32. For example, "OS/2" is 0x4f532f32.
@ -379,40 +395,109 @@ func (f *Font) initialize() error {
f.post = table{o, n}
}
}
var u uint16
// https://www.microsoft.com/typography/otspec/head.htm
if f.head.length != 54 {
return errInvalidHeadTable
return buf, nil
}
u, err = f.src.u16(buf, f.head, 18)
// parseCmap reads the cmap table's encoding records, picks the widest
// supported subtable (see platformEncodingWidth) and builds the cached
// rune-to-glyph lookup function from it.
//
// It returns the (possibly re-allocated) scratch buffer. It fails with
// errInvalidCmapTable if the table is too short for its header, its encoding
// records or a candidate subtable's offset, and with
// errUnsupportedCmapEncodings if no subtable has both a recognized platform
// encoding and a supported format.
func (f *Font) parseCmap(buf []byte) ([]byte, error) {
	// https://www.microsoft.com/typography/OTSPEC/cmap.htm

	const headerSize, entrySize = 4, 8
	if f.cmap.length < headerSize {
		return nil, errInvalidCmapTable
	}
	// The number of encoding records is the uint16 at byte 2 of the header.
	u, err := f.src.u16(buf, f.cmap, 2)
	if err != nil {
		return nil, err
	}
	numSubtables := int(u)
	if f.cmap.length < headerSize+entrySize*uint32(numSubtables) {
		return nil, errInvalidCmapTable
	}

	var (
		bestWidth  int
		bestOffset uint32
		bestLength uint32
		bestFormat uint16
	)

	// Scan all of the subtables, picking the widest supported one. See the
	// platformEncodingWidth comment for more discussion of width.
	for i := 0; i < numSubtables; i++ {
		buf, err = f.src.view(buf, int(f.cmap.offset)+headerSize+entrySize*i, entrySize)
		if err != nil {
			return nil, err
		}
		pid := u16(buf)
		psid := u16(buf[2:])
		width := platformEncodingWidth(pid, psid)
		// Unrecognized encodings have width 0, so they never beat bestWidth.
		// On ties, the earlier subtable wins.
		if width <= bestWidth {
			continue
		}
		// The subtable must have room for at least its 4 byte prefix. The
		// subtraction cannot underflow: f.cmap.length >= headerSize == 4.
		offset := u32(buf[4:])
		if offset > f.cmap.length-4 {
			return nil, errInvalidCmapTable
		}

		buf, err = f.src.view(buf, int(f.cmap.offset+offset), 4)
		if err != nil {
			return nil, err
		}
		format := u16(buf)
		if !supportedCmapFormat(format, pid, psid) {
			continue
		}
		length := uint32(u16(buf[2:]))

		bestWidth = width
		bestOffset = offset
		bestLength = length
		bestFormat = format
	}

	if bestWidth == 0 {
		return nil, errUnsupportedCmapEncodings
	}
	return f.makeCachedGlyphIndex(buf, bestOffset, bestLength, bestFormat)
}
// parseHead validates the head table and caches the two values read from it:
// the uint16 at byte 18 as f.cached.unitsPerEm and whether the uint16 at byte
// 50 is non-zero as f.cached.indexToLocFormat.
//
// It returns the scratch buffer unchanged on success. It fails with
// errInvalidHeadTable if the table is not exactly 54 bytes long or if the
// units-per-em value is zero.
func (f *Font) parseHead(buf []byte) ([]byte, error) {
	// https://www.microsoft.com/typography/otspec/head.htm

	if f.head.length != 54 {
		return nil, errInvalidHeadTable
	}
	u, err := f.src.u16(buf, f.head, 18)
	if err != nil {
		return nil, err
	}
	if u == 0 {
		return nil, errInvalidHeadTable
	}
	f.cached.unitsPerEm = Units(u)
	u, err = f.src.u16(buf, f.head, 50)
	if err != nil {
		return nil, err
	}
	f.cached.indexToLocFormat = u != 0
	return buf, nil
}
func (f *Font) parseMaxp(buf []byte) ([]byte, error) {
// https://www.microsoft.com/typography/otspec/maxp.htm
if f.cached.isPostScript {
if f.maxp.length != 6 {
return errInvalidMaxpTable
return nil, errInvalidMaxpTable
}
} else {
if f.maxp.length != 32 {
return errInvalidMaxpTable
return nil, errInvalidMaxpTable
}
}
u, err = f.src.u16(buf, f.maxp, 4)
u, err := f.src.u16(buf, f.maxp, 4)
if err != nil {
return err
return nil, err
}
numGlyphs := int(u)
@ -425,23 +510,36 @@ func (f *Font) initialize() error {
}
f.cached.locations, err = p.parse()
if err != nil {
return err
return nil, err
}
} else {
f.cached.locations, err = parseLoca(
&f.src, f.loca, f.glyf.offset, f.cached.indexToLocFormat, numGlyphs)
if err != nil {
return err
return nil, err
}
}
if len(f.cached.locations) != numGlyphs+1 {
return errInvalidLocationData
}
return nil
return nil, errInvalidLocationData
}
// TODO: func (f *Font) GlyphIndex(r rune) (x GlyphIndex, ok bool)
// This will require parsing the cmap table.
return buf, nil
}
// TODO: API for looking up glyph variants?? For example, some fonts may
// provide both slashed and dotted zero glyphs ('0'), or regular and 'old
// style' numerals, and users can direct software to choose a variant.
// GlyphIndex returns the glyph index for the given rune, as computed by the
// lookup function cached on f.
//
// It returns (0, nil) if there is no glyph for r.
// https://www.microsoft.com/typography/OTSPEC/cmap.htm says that "Character
// codes that do not correspond to any glyph in the font should be mapped to
// glyph index 0. The glyph at this location must be a special glyph
// representing a missing character, commonly known as .notdef."
func (f *Font) GlyphIndex(b *Buffer, r rune) (GlyphIndex, error) {
	lookup := f.cached.glyphIndex
	return lookup(f, b, r)
}
func (f *Font) viewGlyphData(b *Buffer, x GlyphIndex) ([]byte, error) {
xx := int(x)
@ -512,14 +610,14 @@ func (f *Font) Name(b *Buffer, id NameID) (string, error) {
if err != nil {
return "", err
}
nSubtables := u16(buf[2:])
if f.name.length < headerSize+entrySize*uint32(nSubtables) {
numSubtables := u16(buf[2:])
if f.name.length < headerSize+entrySize*uint32(numSubtables) {
return "", errInvalidNameTable
}
stringOffset := u16(buf[4:])
seen := false
for i, n := 0, int(nSubtables); i < n; i++ {
for i, n := 0, int(numSubtables); i < n; i++ {
buf, err := b.view(&f.src, int(f.name.offset)+headerSize+entrySize*i, entrySize)
if err != nil {
return "", err

View File

@ -88,6 +88,74 @@ func testTrueType(t *testing.T, f *Font) {
}
}
// TestGlyphIndex runs the glyph index checks against CFFTest.otf once per
// cmap format selector: -1 leaves the supported formats untouched, while 0
// and 4 restrict parsing to that single format.
func TestGlyphIndex(t *testing.T) {
	path := filepath.FromSlash("../testdata/CFFTest.otf")
	data, err := ioutil.ReadFile(path)
	if err != nil {
		t.Fatal(err)
	}
	formats := [...]int{-1, 0, 4}
	for i := range formats {
		testGlyphIndex(t, data, formats[i])
	}
}
// testGlyphIndex parses data as a font and checks GlyphIndex for a fixed set
// of runes.
//
// If cmapFormat is non-negative, supportedCmapFormat is temporarily narrowed
// so that only subtables of that format are accepted; the original value is
// restored when the function returns.
func testGlyphIndex(t *testing.T, data []byte, cmapFormat int) {
	if cmapFormat >= 0 {
		saved := supportedCmapFormat
		defer func() { supportedCmapFormat = saved }()
		supportedCmapFormat = func(format, pid, psid uint16) bool {
			if int(format) != cmapFormat {
				return false
			}
			return saved(format, pid, psid)
		}
	}

	f, err := Parse(data)
	if err != nil {
		t.Errorf("cmapFormat=%d: %v", cmapFormat, err)
		return
	}

	testCases := []struct {
		r    rune
		want GlyphIndex
	}{
		// Glyphs that are present in CFFTest.otf.
		{'0', 1},
		{'1', 2},
		{'Q', 3},
		// TODO: add the U+00E0 non-ASCII Latin-1 Supplement rune to
		// CFFTest.otf and change 0 to something non-zero.
		{'\u00e0', 0},
		{'\u4e2d', 4},
		// TODO: add a rune >= U+00010000 to CFFTest.otf?

		// Glyphs that aren't present in CFFTest.otf.
		{'?', 0},
		{'\ufffd', 0},
		{'\U0001f4a9', 0},
	}

	var b Buffer
	for _, tc := range testCases {
		want := tc.want
		// cmap format 0, with the Macintosh Roman encoding, can't represent
		// U+4E2D.
		if cmapFormat == 0 && tc.r == '\u4e2d' {
			want = 0
		}

		got, err := f.GlyphIndex(&b, tc.r)
		switch {
		case err != nil:
			t.Errorf("cmapFormat=%d, r=%q: %v", cmapFormat, tc.r, err)
		case got != want:
			t.Errorf("cmapFormat=%d, r=%q: got %d, want %d", cmapFormat, tc.r, got, want)
		}
	}
}
func TestPostScriptSegments(t *testing.T) {
// wants' vectors correspond 1-to-1 to what's in the CFFTest.sfd file,
// although OpenType/CFF and FontForge's SFD have reversed orders.
@ -226,7 +294,7 @@ func TestTrueTypeSegments(t *testing.T) {
}
func testSegments(t *testing.T, filename string, wants [][]Segment) {
data, err := ioutil.ReadFile(filepath.Join("..", "testdata", filename))
data, err := ioutil.ReadFile(filepath.FromSlash("../testdata/" + filename))
if err != nil {
t.Fatal(err)
}