aboutsummaryrefslogtreecommitdiff
path: root/src/pkg/exp/norm/forminfo.go
blob: ee3edb8ea7d78d1763217f7b71f38a342a04dc0f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package norm

// This file contains Form-specific logic and wrappers for data in tables.go.

// runeInfo holds the per-rune properties produced by the lookupInfo*
// wrappers in this file. Those wrappers build it with positional
// (unkeyed) literals, so the order of the fields is significant.
type runeInfo struct {
	pos   uint8  // start position in reorderBuffer; used in composition.go
	size  uint8  // length of UTF-8 encoding of this rune
	ccc   uint8  // canonical combining class
	flags qcInfo // quick check flags
}

// Function types that are dispatched per form through formInfo.
type (
	// boundaryFunc reports whether the rune described by info forms a
	// segment boundary for form f.
	boundaryFunc func(f *formInfo, info runeInfo) bool

	// lookupFunc returns the runeInfo for the rune at the start of b.
	lookupFunc func(b []byte) runeInfo

	// lookupFuncString is the string counterpart of lookupFunc.
	lookupFuncString func(s string) runeInfo

	// decompFunc returns the decomposition of the rune at the start of b.
	decompFunc func(b []byte) []byte

	// decompFuncString is the string counterpart of decompFunc.
	decompFuncString func(s string) []byte
)

// formInfo holds Form-specific functions and tables.
// init creates one formInfo per Form and stores it in formTable.
type formInfo struct {
	// form is the Form this entry was built for.
	form Form

	// composing is set for NFC and NFKC; compatibility is set for NFKC
	// and NFKD (see init).
	composing, compatibility bool // form type

	// Decomposition and rune-info lookups. init selects the NFKC variants
	// for the compatibility forms and the NFC variants otherwise.
	decompose       decompFunc
	decomposeString decompFuncString
	info            lookupFunc
	infoString      lookupFuncString
	// Boundary tests. init installs compBoundaryBefore/compBoundaryAfter
	// for composing forms and decompBoundary for both otherwise.
	boundaryBefore  boundaryFunc
	boundaryAfter   boundaryFunc
}

// formTable holds one formInfo per Form, indexed by the numeric value of
// the Form. It is filled in by init.
var formTable []*formInfo

// init builds the formInfo entry for each of the four forms.
//
// The compatibility forms (NFKC, NFKD) use the NFKC decomposition and info
// lookups, the canonical forms (NFC, NFD) use the NFC ones. The composing
// forms (NFC, NFKC) use the composition boundary functions, the
// decomposing forms use decompBoundary on both sides.
func init() {
	formTable = make([]*formInfo, 4)

	for i := range formTable {
		form := Form(i)
		compat := form == NFKD || form == NFKC
		comp := form == NFC || form == NFKC

		fi := &formInfo{
			form:          form,
			composing:     comp,
			compatibility: compat,
		}

		if compat {
			fi.decompose, fi.decomposeString = decomposeNFKC, decomposeStringNFKC
			fi.info, fi.infoString = lookupInfoNFKC, lookupInfoStringNFKC
		} else {
			fi.decompose, fi.decomposeString = decomposeNFC, decomposeStringNFC
			fi.info, fi.infoString = lookupInfoNFC, lookupInfoStringNFC
		}

		if comp {
			fi.boundaryBefore, fi.boundaryAfter = compBoundaryBefore, compBoundaryAfter
		} else {
			fi.boundaryBefore, fi.boundaryAfter = decompBoundary, decompBoundary
		}

		formTable[i] = fi
	}
}

// decompBoundary reports whether the rune described by info is a boundary
// for the decomposing forms: its CCC is zero and its quick check flags say
// "Yes" for decomposition. init installs it as both boundaryBefore and
// boundaryAfter for the non-composing forms. f is not used.
func decompBoundary(f *formInfo, info runeInfo) bool {
	// The positive case implies isHangul(b) == true.
	//
	// In every other case we report no boundary. This relies on the
	// assumption that the CCC of the first character in a decomposition
	// is always non-zero if it differs from info.ccc, which is verified
	// by maketables.
	return info.ccc == 0 && info.flags.isYesD()
}

// compBoundaryBefore reports whether a segment boundary lies before the
// rune described by info for the composing forms: its CCC is zero and its
// quick check flags say "Yes" for composition. f is not used.
func compBoundaryBefore(f *formInfo, info runeInfo) bool {
	// In every other case we report no boundary. This relies on the
	// assumption that the CCC of the first character in a decomposition
	// is always non-zero if it differs from info.ccc, which is verified
	// by maketables.
	return info.ccc == 0 && info.flags.isYesC()
}

// compBoundaryAfter reports whether a segment boundary lies after the rune
// described by info for the composing forms. Only runes whose quick check
// flags mark them as inert qualify; the CCC is not consulted and f is not
// used.
func compBoundaryAfter(f *formInfo, info runeInfo) bool {
	// Restricting the test to inert runes misses values where the last
	// char in a decomposition is a boundary, such as Hangul with JamoT.
	// TODO(mpvl): verify this does not lead to segments that do
	// not fit in the reorderBuffer.
	inert := info.flags.isInert()
	return inert
}

// qcInfo holds packed quick check data. Each form pair uses 4 bits:
//   bit 0:     NFD_QC Yes (0) or No (1). No also means there is a decomposition.
//   bits 1..2: NFC_QC Yes (00), No (01), or Maybe (11)
//   bit 3:     Combines forward (0 == false, 1 == true)
//
// A character whose 4 bits are all zero is inert, meaning it is never
// influenced by normalization.
//
// The bits for both NFC/D and NFKC/D are packed in one byte; the methods
// below only look at the low 4 bits.
type qcInfo uint8

// isYesC reports whether the composition quick check value is Yes.
func (q qcInfo) isYesC() bool {
	return q&0x2 == 0
}

// isNoC reports whether the composition quick check value is No.
func (q qcInfo) isNoC() bool {
	return q&0x6 == 0x2
}

// isMaybe reports whether the composition quick check value is Maybe.
func (q qcInfo) isMaybe() bool {
	return q&0x4 != 0
}

// isYesD reports whether the decomposition quick check value is Yes.
func (q qcInfo) isYesD() bool {
	return q&0x1 == 0
}

// isNoD reports whether the decomposition quick check value is No.
func (q qcInfo) isNoD() bool {
	return !q.isYesD()
}

// isInert reports whether all four quick check bits are zero.
func (q qcInfo) isInert() bool {
	return q&0xf == 0
}

// combinesForward reports whether the "combines forward" bit is set.
func (q qcInfo) combinesForward() bool {
	return q&0x8 != 0
}

// combinesBackward is equivalent to isMaybe.
func (q qcInfo) combinesBackward() bool {
	return q.isMaybe()
}

// hasDecomposition is equivalent to isNoD.
func (q qcInfo) hasDecomposition() bool {
	return q.isNoD()
}

// Wrappers for tables.go

// decomposeNFC returns the decomposition that nfcDecompTrie records for
// the rune at the start of b.
//
// The 16-bit value of the decomposition tries is an index into a byte
// array of UTF-8 decomposition sequences. The first byte is the number
// of bytes in the decomposition (excluding this length byte). The actual
// sequence starts at the offset+1.
func decomposeNFC(b []byte) []byte {
	idx := nfcDecompTrie.lookupUnsafe(b)
	size := uint16(decomps[idx]) // length byte
	start := idx + 1             // first byte of the sequence
	return decomps[start : start+size]
}

// decomposeNFKC returns the decomposition that nfkcDecompTrie records for
// the rune at the start of b. The trie value is an index into decomps;
// the byte at that index is the length of the sequence that follows it.
func decomposeNFKC(b []byte) []byte {
	idx := nfkcDecompTrie.lookupUnsafe(b)
	size := uint16(decomps[idx]) // length byte
	start := idx + 1             // first byte of the sequence
	return decomps[start : start+size]
}

// decomposeStringNFC is the string counterpart of decomposeNFC: it returns
// the decomposition that nfcDecompTrie records for the rune at the start
// of s. The trie value is an index into decomps; the byte at that index is
// the length of the sequence that follows it.
func decomposeStringNFC(s string) []byte {
	idx := nfcDecompTrie.lookupStringUnsafe(s)
	size := uint16(decomps[idx]) // length byte
	start := idx + 1             // first byte of the sequence
	return decomps[start : start+size]
}

// decomposeStringNFKC is the string counterpart of decomposeNFKC: it
// returns the decomposition that nfkcDecompTrie records for the rune at
// the start of s. The trie value is an index into decomps; the byte at
// that index is the length of the sequence that follows it.
func decomposeStringNFKC(s string) []byte {
	idx := nfkcDecompTrie.lookupStringUnsafe(s)
	size := uint16(decomps[idx]) // length byte
	start := idx + 1             // first byte of the sequence
	return decomps[start : start+size]
}

// Recomposition
// We use 32-bit keys instead of 64-bit for the two codepoint keys.
// This clips off the bits of three entries, but we know this will not
// result in a collision. In the unlikely event that changes to
// UnicodeData.txt introduce collisions, the compiler will catch it.
// Note that the recomposition map for NFC and NFKC are identical.

// combine returns the combined rune or 0 if it doesn't exist.
// The recompMap key is built from the low 16 bits of a (upper half) and
// the low 16 bits of b (lower half); any higher bits are dropped.
func combine(a, b uint32) uint32 {
	hi := uint32(uint16(a)) << 16
	lo := uint32(uint16(b))
	return recompMap[hi+lo]
}

// lookupInfoNFC returns the NFC/NFD runeInfo for the rune at the start of b.
//
// The 16-bit character info has the following bit layout:
//    0..7   CCC value.
//    8..11  qcInfo for NFC/NFD
//   12..15  qcInfo for NFKC/NFKD
//
// Shifting by 8 places the NFC/NFD bits in the low four bits of flags.
func lookupInfoNFC(b []byte) runeInfo {
	v, sz := charInfoTrie.lookup(b)
	return runeInfo{
		size:  uint8(sz),
		ccc:   uint8(v),
		flags: qcInfo(v >> 8),
	}
}

// lookupInfoStringNFC is the string counterpart of lookupInfoNFC: it
// returns the NFC/NFD runeInfo for the rune at the start of s. The low
// byte of the trie value is the CCC; the value shifted right by 8 gives
// the flags.
func lookupInfoStringNFC(s string) runeInfo {
	v, sz := charInfoTrie.lookupString(s)
	return runeInfo{
		size:  uint8(sz),
		ccc:   uint8(v),
		flags: qcInfo(v >> 8),
	}
}

// lookupInfoNFKC returns the NFKC/NFKD runeInfo for the rune at the start
// of b. The low byte of the trie value is the CCC; the value shifted right
// by 12 gives the flags.
func lookupInfoNFKC(b []byte) runeInfo {
	v, sz := charInfoTrie.lookup(b)
	return runeInfo{
		size:  uint8(sz),
		ccc:   uint8(v),
		flags: qcInfo(v >> 12),
	}
}

// lookupInfoStringNFKC is the string counterpart of lookupInfoNFKC: it
// returns the NFKC/NFKD runeInfo for the rune at the start of s. The low
// byte of the trie value is the CCC; the value shifted right by 12 gives
// the flags.
func lookupInfoStringNFKC(s string) runeInfo {
	v, sz := charInfoTrie.lookupString(s)
	return runeInfo{
		size:  uint8(sz),
		ccc:   uint8(v),
		flags: qcInfo(v >> 12),
	}
}