-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpersiangenderdetection.go
101 lines (86 loc) · 2.15 KB
/
persiangenderdetection.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
package persiangenderdetection
import (
"bufio"
_ "embed"
"strings"
"sync"
"unicode"
)
// namesCSV holds the raw text of data/names.csv, embedded into the binary at
// build time by the directive below. readCSV parses it into dataset.
//
//go:embed data/names.csv
var namesCSV string
var (
// dataset maps a name (first CSV column) to its gender code (second CSV
// column). GetGender recognizes the codes "M" and "F".
dataset = make(map[string]string)
// datasetOnce guards the one-time call to readCSV made from GetGender.
datasetOnce sync.Once
)
// readCSV fills dataset from the embedded namesCSV text.
//
// Every line is expected to have the form "name,gender". Lines that do not
// contain exactly one comma are ignored. If a name occurs more than once, the
// last occurrence wins. It panics if the scanner reports an error.
func readCSV() {
	sc := bufio.NewScanner(strings.NewReader(namesCSV))
	for sc.Scan() {
		row := sc.Text()
		// Exactly one separator means exactly two fields.
		if strings.Count(row, ",") != 1 {
			continue
		}
		comma := strings.IndexByte(row, ',')
		dataset[row[:comma]] = row[comma+1:]
	}
	if err := sc.Err(); err != nil {
		panic(err)
	}
}
// isUnwantedRune reports whether r should be stripped from a name: it is true
// for punctuation, symbols, decimal digits and nonspacing marks (Unicode
// category Mn), and false for every other rune.
func isUnwantedRune(r rune) bool {
	switch {
	case unicode.IsPunct(r), unicode.IsSymbol(r), unicode.IsDigit(r):
		return true
	default:
		return unicode.Is(unicode.Mn, r)
	}
}
// clearName normalizes a raw name so it can be looked up in the dataset.
//
// In a single pass over the input it:
//   - turns the zero-width non-joiner (U+200C) into a regular space,
//   - folds alef with madda (U+0622) to plain alef (U+0627),
//   - folds Arabic yeh (U+064A) to Farsi yeh (U+06CC),
//   - folds Arabic kaf (U+0643) to keheh (U+06A9),
//   - drops the tatweel (U+0640),
//   - drops every rune for which isUnwantedRune returns true.
//
// Leading and trailing white space is trimmed from the result; interior runs
// of spaces are left as they are.
//
// The substitutions are done inline rather than through a strings.Replacer so
// that no replacer is built per call and the text is traversed only once.
// This matches filter-then-replace because each substituted rune is a single
// code point that isUnwantedRune does not reject.
func clearName(name string) string {
	var b strings.Builder
	// No substitution is longer (in bytes) than the rune it replaces, so the
	// input length is an upper bound for the output length.
	b.Grow(len(name))
	for _, r := range name {
		switch r {
		case '\u200c': // zero-width non-joiner
			b.WriteByte(' ')
		case '\u0622': // alef with madda above
			b.WriteRune('\u0627')
		case '\u064a': // Arabic yeh
			b.WriteRune('\u06cc')
		case '\u0643': // Arabic kaf
			b.WriteRune('\u06a9')
		case '\u0640': // tatweel: purely decorative, dropped
		default:
			if !isUnwantedRune(r) {
				b.WriteRune(r)
			}
		}
	}
	return strings.TrimSpace(b.String())
}
// isHonorific reports whether word is one of the titles that GetGender skips
// at the start of a name. A switch over the constant strings avoids building
// a lookup map on every call.
func isHonorific(word string) bool {
	switch word {
	case "سید",
		"سیده",
		"استاد",
		"دکتر",
		"مهندس",
		"سرکار":
		return true
	}
	return false
}

// GetGender looks up the gender associated with name in the embedded dataset
// and returns "MALE", "FEMALE" or "UNKNOWN".
//
// The dataset is loaded on the first call (guarded by datasetOnce); readCSV
// panics if the embedded data cannot be scanned.
//
// The name is normalized with clearName and split on white space. Leading
// honorifics are discarded, then the remaining words are matched against the
// dataset from the longest candidate to the shortest: first all words joined
// by single spaces, then all but the last word, and so on. The first candidate
// whose dataset value is "M" or "F" decides the result; candidates that are
// missing or carry any other value are skipped.
func GetGender(name string) string {
	datasetOnce.Do(readCSV)

	words := strings.Fields(clearName(name))

	// Strip any number of leading titles.
	for len(words) > 0 && isHonorific(words[0]) {
		words = words[1:]
	}

	// A missing key yields "", which matches neither case and therefore moves
	// on to the next shorter candidate.
	for n := len(words); n > 0; n-- {
		switch dataset[strings.Join(words[:n], " ")] {
		case "M":
			return "MALE"
		case "F":
			return "FEMALE"
		}
	}
	return "UNKNOWN"
}