-
Notifications
You must be signed in to change notification settings - Fork 17
/
ocr.go
164 lines (139 loc) · 4.23 KB
/
ocr.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
package lookup
import (
"image"
"os"
"path/filepath"
"sort"
"strings"
)
// OCR implements a simple OCR based on the Lookup functions. It allows multiple fontsets,
// just call LoadFont for each fontset.
//
// If you need to encode special symbols, use their percent-encoded (URL-escaped) UTF-8
// form in the file name. For example, if you need the '/' character (which is prohibited
// in paths and file names), specify %2F.png as the image symbol name.
//
// Sometimes you need to specify two different images for one symbol (if the image / font
// symbol varies too much). To do so, add the Unicode ZERO WIDTH SPACE symbol (%E2%80%8B)
// to the file name. Ex: %2F%E2%80%8B.png will produce the '/' symbol as well.
type OCR struct {
// fontFamilies holds the loaded symbols, keyed by the base name of the folder that
// was passed to LoadFont.
fontFamilies map[string][]*fontSymbol
// threshold is the value given to NewOCR; it is handed unchanged to
// findAllInParallel on every recognition.
threshold float64
// allSymbols is the concatenation of every family in fontFamilies. It is rebuilt by
// updateAllSymbols after each successful LoadFont.
allSymbols []*fontSymbol
// numThreads is the thread count handed to findAllInParallel. NewOCR defaults it to 1.
numThreads int
}
// NewOCR creates a new OCR instance that will use the given threshold. You can optionally
// parallelize the processing by specifying the number of threads to use; only the first
// value of numThreads is considered. The optimal number varies and depends on your use
// case (size of fontset x size of image).
//
// The default is a single thread. A requested count below 1 is ignored and the default
// is kept, so the returned instance always has a positive thread count.
func NewOCR(threshold float64, numThreads ...int) *OCR {
	ocr := &OCR{
		fontFamilies: make(map[string][]*fontSymbol),
		threshold:    threshold,
		numThreads:   1,
	}
	// Zero or negative thread counts cannot process anything, so keep the default.
	if len(numThreads) > 0 && numThreads[0] > 0 {
		ocr.numThreads = numThreads[0]
	}
	return ocr
}
// LoadFont loads a specific fontset from the given folder. Fonts are simple image files
// containing a PNG/JPEG of the font, and named after the "letter" represented by the image.
//
// This can be called multiple times, with different folders, to load different fontsets.
// Symbols are grouped under the base name of fontPath, so loading two folders that share
// a base name merges them into a single family.
//
// It returns the error reported by os.Stat when fontPath cannot be inspected, or the
// error from loading the symbols. Errors are returned as-is (not wrapped), so callers
// can still test them with os.IsNotExist. On error the OCR is left unchanged.
func (o *OCR) LoadFont(fontPath string) error {
	// Report every Stat failure (permissions, broken path, ...), not only "not exist".
	if _, err := os.Stat(fontPath); err != nil {
		return err
	}

	symbols, err := loadFont(fontPath)
	if err != nil {
		return err
	}

	// Appending to the nil slice of a not-yet-known family allocates a fresh slice,
	// so no explicit initialisation is needed.
	familyName := filepath.Base(fontPath)
	o.fontFamilies[familyName] = append(o.fontFamilies[familyName], symbols...)
	o.updateAllSymbols()
	return nil
}
// updateAllSymbols rebuilds o.allSymbols as the concatenation of every loaded font
// family. Families are visited in map iteration order, so the relative order of symbols
// coming from different families is unspecified; the order inside a family is kept.
func (o *OCR) updateAllSymbols() {
	total := 0
	for _, family := range o.fontFamilies {
		total += len(family)
	}

	// With nothing loaded the slice is reset to nil.
	o.allSymbols = nil
	if total == 0 {
		return
	}

	// Allocate once with the final size instead of growing family by family.
	o.allSymbols = make([]*fontSymbol, 0, total)
	for _, family := range o.fontFamilies {
		o.allSymbols = append(o.allSymbols, family...)
	}
}
// Recognize returns the text found in img using the fontsets previously loaded. The
// image is passed through ensureGrayScale and newImageBinary, and the result is searched
// over the rectangle (0,0)-(width-1,height-1). If a SubImage is received, the search
// will be limited by the boundaries of the SubImage.
func (o *OCR) Recognize(img image.Image) (string, error) {
	bin := newImageBinary(ensureGrayScale(img))
	searchArea := image.Rect(0, 0, bin.width-1, bin.height-1)
	return o.recognize(bin, searchArea)
}
// recognize looks for every loaded symbol inside rect of bi, passing o.numThreads and
// o.threshold to findAllInParallel, and assembles the matches into text with
// filterAndArrange. It returns "" with the search error if the search fails, and ""
// with a nil error when nothing matched.
func (o *OCR) recognize(bi *imageBinary, rect image.Rectangle) (string, error) {
	matches, err := findAllInParallel(o.numThreads, o.allSymbols, bi, o.threshold, rect)
	switch {
	case err != nil:
		return "", err
	case len(matches) == 0:
		return "", nil
	}
	return o.filterAndArrange(matches), nil
}
// biggerFirst builds a "less" function for sort.Slice over list. It finds the largest
// fs.image.size in list and hands half of it (integer division) to biggerThan, which
// decides whether list[i] sorts before list[j].
func biggerFirst(list []*fontSymbolLookup) func(i, j int) bool {
	largest := 0
	for _, lookup := range list {
		if size := lookup.fs.image.size; size > largest {
			largest = size
		}
	}
	halfLargest := largest / 2

	return func(a, b int) bool {
		return list[a].biggerThan(list[b], halfLargest)
	}
}
// filterAndArrange turns the raw symbol matches into the recognized text.
//
// It first sorts the matches with biggerFirst and removes every match that crosses an
// earlier, surviving one. The survivors are then re-sorted with comesAfter and written
// out in that order, inserting a space where the horizontal gap to the previous symbol
// is at least one symbol wide, and a newline where the x position drops back.
//
// The slice all is reordered and shortened in place. An empty all yields "".
func (o *OCR) filterAndArrange(all []*fontSymbolLookup) string {
	// Nothing matched: there is no first symbol to anchor the layout on.
	if len(all) == 0 {
		return ""
	}

	// big images eat small ones
	sort.Slice(all, biggerFirst(all))
	// len(all) is re-read on every pass because the slice shrinks while we filter.
	for k := 0; k < len(all); k++ {
		for j := k + 1; j < len(all); j++ {
			if all[k].cross(all[j]) {
				all = deleteSymbol(all, j)
				j-- // the next candidate has shifted into position j
			}
		}
	}

	// sort top/bottom/left/right
	sort.Slice(all, func(i, j int) bool {
		return all[i].comesAfter(all[j])
	})

	var str strings.Builder
	x := all[0].x // where the previous symbol ended (s.x + width); starts at the first symbol
	cx := 0       // width of the previous symbol
	for i, s := range all {
		maxCX := max(cx, s.fs.width)

		// if the distance between the end of the previous symbol and the beginning of
		// the current one is at least a char size, then it is a space.
		// This is not applied to the first symbol (i == 0), which must never get a
		// (useless) whitespace in front.
		if s.x-x >= maxCX && i != 0 {
			str.WriteString(" ")
		}

		// if we drop back, then we have an end of line
		if s.x < x {
			str.WriteString("\n")
		}

		x = s.x + s.fs.width
		cx = s.fs.width
		str.WriteString(s.fs.symbol)
	}

	return str.String()
}
// deleteSymbol removes the element at index i from all, keeping the order of the
// remaining elements. The removal happens in place: later elements are shifted down by
// one, the now-unused last slot of the original slice is set to nil, and the slice
// shortened by one element is returned.
func deleteSymbol(all []*fontSymbolLookup, i int) []*fontSymbolLookup {
	shortened := append(all[:i], all[i+1:]...)
	all[len(all)-1] = nil
	return shortened
}