# preprocessing.py
"""
For text sentiment classification, each sentence of the complete
transcript is treated as a separate observation to predict the relevant sentiment. The division in sentences is done to keep the context of the individual words for better understanding and prediction.
Hence, for real-time analysis, the model waits for the completion of a sentence to give the final prediction for the same. Each of the sentences is processed further in the following order:
1. removal of punctuation,
2. lowering of capital letters,
3. tokenization,
4. removal of stop words, and
5. lemmatization.
The resultant string with only keywords is ready for input in the text emotion recognition model."""
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download("stopwords")
nltk.download("punkt_tab")
nltk.download("wordnet")
table = str.maketrans("", "", string.punctuation)
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
def preprocess_text(transcript: str):
    """Split a transcript into sentences and clean each one for the text model."""
    sentences = sent_tokenize(transcript)
    processed_sentences = []
    for s in sentences:
        # Remove punctuation and convert to lowercase.
        s = s.translate(table).lower()
        tokens = word_tokenize(s)
        # Drop stop words and lemmatize the remaining tokens.
        tokens = [i for i in tokens if i not in stop_words]
        tokens = [lemmatizer.lemmatize(i) for i in tokens]
        ps = " ".join(tokens)
        processed_sentences.append(ps)
    return " ".join(processed_sentences)
"""The dataset for audio emotion recognition consists of many details about each of the audio files, which contribute to the input for the emotion recognition model. Out of this information, only the gender of the actor is used. Since gender is already provided, other extracted features from this dataset are used to create a Random Forest Classifier model that can effectively predict the gender of the speakers where it is not present.
Multiple studies have shown a positive impact of gender knowledge on audio emotion recognition (Thakare, Chaurasia, Rathod, Joshi, & Gudadhe, 2021); hence, a specific model for gender prediction has been created to improve prediction accuracy for unknown videos. Since the audio files in the training subset are 3 seconds long, the audio from the complete video in testing is also processed in fragments of 3 seconds.
From each of the fragments, multiple features are extracted using the Librosa library in terms of arrays and numerical values and stored in multi-dimensional arrays. The features include:
1. Absolute Short-Time Fourier Transform (STFT)
2. An average of 40 Mel-Frequency Cepstral Coefficients (MFCCs)
3. Chromogram
4. Mel-Scaled Spectrogram
5. Average Root Mean Square (RMS) value
6. Average energy contrast between the highest and lowest energy band
7. Average Tonal Centroid features
These features result in an array of 195 values, which are used for gender prediction. The predicted gender is then added as a feature for sentiment prediction."""
import librosa
import numpy as np
def extract_audio_features(audio_file, sampling_rate=22050, duration=3):
    """Split the audio into 3-second fragments and extract a 195-value feature
    array from each fragment."""
    y, _ = librosa.load(audio_file, sr=sampling_rate, mono=True)
    samples = sampling_rate * duration
    num_fragments = len(y) // samples
    features = []
    for i in range(num_fragments):
        start = i * samples
        end = start + samples
        fragment = y[start:end]
        if len(fragment) < samples:
            continue
        # Absolute Short-Time Fourier Transform - 29 values
        stft = np.abs(librosa.stft(fragment)).mean(axis=1)
        # Average Mel-Frequency Cepstral Coefficients - 40 values
        mfcc = librosa.feature.mfcc(y=fragment, sr=sampling_rate, n_mfcc=40).mean(axis=1)
        # Chromagram - 12 values
        chroma = librosa.feature.chroma_stft(y=fragment, sr=sampling_rate).mean(axis=1)
        # Mel-scaled spectrogram - 100 values
        mel_spectrogram = librosa.feature.melspectrogram(y=fragment, sr=sampling_rate).mean(axis=1)
        # Average root mean square - 1 value
        rms = librosa.feature.rms(y=fragment).mean()
        # Average energy contrast between bands - 7 values
        contrast = librosa.feature.spectral_contrast(y=fragment, sr=sampling_rate).mean(axis=1)
        # Average tonal centroid features - 6 values
        tonal_centroid = librosa.feature.tonnetz(y=fragment, sr=sampling_rate).mean(axis=1)
        # Concatenate into a single 195-value array.
        fragment_features = np.hstack([stft[:29], mfcc, chroma, mel_spectrogram[:100], [rms], contrast, tonal_centroid])
        features.append(fragment_features)
    return np.array(features)
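# The Random Forest gender model described in the docstring above is trained
# elsewhere; the function below is only a minimal sketch of that step using
# scikit-learn. The argument names and hyperparameters are illustrative
# assumptions, not the project's actual training configuration.
def train_gender_classifier(feature_matrix, gender_labels):
    """Fit a Random Forest on (n_fragments, 195) feature arrays with per-fragment
    gender labels and report held-out accuracy (sketch only)."""
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(
        feature_matrix, gender_labels, test_size=0.2, random_state=42
    )
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    print("Held-out gender accuracy:", model.score(X_test, y_test))
    return model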
"""For image emotion recognition, each of the images in the dataset is
pre-processed. The ImageDataGenerator is used to create more image
data with variations in the angle of the image, stretching, zooming,
etc. keeping the target size as an image of 48*48 size. The data is
ready for input in the 2D CNN model for sentiment prediction. For the
multimodal dataset, the frame is extracted at intervals of one second
and run through the pre-trained Deepface module for Python with
'opencv' as the backend for face detection. If the face is detected, the
frame is converted to a Grayscale and cropped to have only the face
in the frame. The image is sharpened by performing convolution of the
image with a 3 x 3 sharpening filter, =
[ 0 -1 0
-1 5 -1
0 -1 0].
The image is then resized to 48*48 size.
This resized image is then fed into the model for prediction.
The convolution operation - [𝐼 * 𝐾](𝑥, 𝑦) = Σ𝑚 Σ𝑛 𝐼(𝑥 - 𝑚, 𝑦 - 𝑛) ⋅ 𝐾(𝑚, 𝑛)
where the output is the result of convolution,
𝐼 represents image matrix,
𝐾 represents the sharpening kernel defined above,
(𝑥, 𝑦) represents the location in the output image,
and 𝑚 and 𝑛 are the coordinates within the kernel.
It involves sliding a convolution kernel over the image,
computing the element-wise product at each position, and summing
the results to produce the convolved output, which emphasizes image
features and patterns."""
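# A tiny worked example of the sharpening convolution above (values are
# illustrative only): for a 3 x 3 neighbourhood whose centre pixel is 10 and
# whose four direct neighbours are all 8, the corner weights are zero, so the
# sharpened centre value is 5*10 - (8 + 8 + 8 + 8) = 18. The centre is pushed
# away from its smoother surroundings, which is what emphasizes edges and detail.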
import cv2
import numpy as np
from deepface import DeepFace
from keras._tf_keras.keras.preprocessing.image import ImageDataGenerator
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    rescale=1.0 / 255,
)
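# Usage sketch for the augmentation described in the docstring (the directory
# path "data/train" is an illustrative assumption, not the project's layout):
# train_generator = datagen.flow_from_directory(
#     "data/train", target_size=(48, 48), color_mode="grayscale",
#     class_mode="categorical", batch_size=32,
# )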
def extract_frames(video_path, interval=1):
    """Read the video and keep one frame every `interval` seconds."""
    cap = cv2.VideoCapture(video_path)
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    frames = []
    success, frame = cap.read()
    count = 0
    while success:
        if count % (fps * interval) == 0:
            frames.append(frame)
        success, frame = cap.read()
        count += 1
    cap.release()
    return frames
def detect_and_crop_face(frame):
    """Detect faces with DeepFace and return the highest-confidence face in
    grayscale, or None when no face is found."""
    detected_face = DeepFace.extract_faces(frame, detector_backend="opencv", enforce_detection=False)
    largest_face = None
    for face in detected_face:
        if largest_face is None:
            largest_face = face
        elif face["confidence"] > largest_face["confidence"]:
            largest_face = face
    # With enforce_detection=False a zero-confidence placeholder can be returned;
    # treat that (and an empty result) as "no face detected".
    if largest_face is None or largest_face["confidence"] == 0:
        return None
    face_image = largest_face["face"]
    if face_image.dtype != np.uint8:
        face_image = (face_image * 255).astype(np.uint8)
    gray_face = cv2.cvtColor(face_image, cv2.COLOR_BGR2GRAY)
    return gray_face
def sharpen_and_resize_image(image, size=(48, 48)):
    # 3 x 3 sharpening kernel from the docstring above.
    kernel = np.array(
        [
            [0, -1, 0],
            [-1, 5, -1],
            [0, -1, 0],
        ]
    )
    sharpened_image = cv2.filter2D(image, -1, kernel)
    resized_image = cv2.resize(sharpened_image, size, interpolation=cv2.INTER_AREA)
    return resized_image
def preprocess_video(filepath):
    """Extract one frame per second, crop the detected face, sharpen, resize to
    48 x 48, and normalize for the 2D CNN."""
    frames = extract_frames(filepath)
    preprocessed_images = []
    for frame in frames:
        gray_face = detect_and_crop_face(frame)
        if gray_face is not None:
            final_image = sharpen_and_resize_image(gray_face)
            if final_image is not None:
                # Scale to [0, 1] and add the single channel dimension.
                final_image = final_image / 255.0
                final_image = final_image.reshape(48, 48, 1)
                preprocessed_images.append(final_image)
    return np.array(preprocessed_images)
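# End-to-end usage sketch: the model path "emotion_cnn.h5" and the calls below
# are illustrative assumptions; the trained 2D CNN itself is defined elsewhere.
# from keras._tf_keras.keras.models import load_model
# model = load_model("emotion_cnn.h5")
# frames = preprocess_video("interview.mp4")    # shape: (n_frames, 48, 48, 1)
# if len(frames) > 0:
#     predictions = model.predict(frames)       # one emotion distribution per frame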