-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpre_process_text.py
189 lines (146 loc) · 6.68 KB
/
pre_process_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
from utils import read_json, load_secrets, BOS, SEP, EOS, IMG_TOKEN, GIF_TOKEN, LINK_TOKEN
from datetime import timedelta
from dateutil import parser
import re
import json
# Directory where downloader.py wrote the scraped message data.
FOLDER = "messages"
# Matches the ":name" portion of a Discord custom-emote token such as "<:KEKW:123456>"
# (the closing ":" is re-added by process_msg_text).
# BUG FIX: the pattern is now a raw string — "\>" in a plain string literal is an
# invalid escape sequence (a warning in modern Python). The compiled pattern is unchanged.
emote_re = re.compile(r"(?=:).+(?=:.*\>)")
def break_into_conversations(full_json : dict, time_delta_threshold:int=10) -> list:
    """
    Breaks messages into conversations.

    Takes all messages received and chunks them into conversations.

    Input:
        - full_json: The full json of all scraped messages. Output of downloader.py
        - time_delta_threshold: A threshold for determining how much time can occur between messages
          in the same conversation (in minutes).
    Output:
        - convos: A list of convos (each a list of message dicts), representing messages
          in the conversation (ordered by time - index 0 is first message)
    """
    convos = []
    current_convo = None
    prev_message_time = None
    # Full JSON is sorted in reverse order, so iterate backwards to get chronological order.
    for mssg_idx in range(len(full_json) - 1, -1, -1):
        message = full_json[mssg_idx]
        # https://www.geeksforgeeks.org/create-python-datetime-from-string/
        curr_msg_timestamp = parser.parse(message["timestamp"])
        # Type 19 is a reply. See:
        # https://discord.com/developers/docs/resources/channel#message-object-message-types
        is_a_response = message["type"] in [19]
        if (current_convo is None):
            # First message starts the first conversation.
            current_convo = [message]
        else:
            is_awhile_after_last = (curr_msg_timestamp - prev_message_time) > timedelta(minutes=time_delta_threshold)
            # New convo conditions: not a reply and at least specified num mins after last message
            if (not is_a_response and is_awhile_after_last):
                convos.append(current_convo)
                current_convo = [message]
            else:
                current_convo.append(message)
        prev_message_time = curr_msg_timestamp
    # BUG FIX: the final (still-open) conversation was previously dropped; append it.
    if (current_convo is not None):
        convos.append(current_convo)
    return convos
def process_msg_text(message:dict) -> str:
    """
    Handles all pre-processing of a single message's text.

    Input:
        - message: A single message object.
    Output:
        - The processed message string.
    """
    text = message["content"]
    # Empty string implies an image was sent. Consequence of downloader.py's parse_message()
    if not text:
        return IMG_TOKEN
    # Tenor links are GIFs; the whole message becomes the GIF token.
    if text.startswith("https://tenor.com/"):
        return GIF_TOKEN
    # Swap raw mention ids (e.g. "<@1234>") for "@username".
    for mention in message["mentions"]:
        text = text.replace(f"<@{mention['id']}>", "@" + mention['username'])
    # Per-word processing: links become the link token, emotes are normalized to
    # ":name:" form, and everything else is lower-cased.
    processed_words = []
    for word in text.split():
        if word.startswith("https:"):
            processed_words.append(LINK_TOKEN)
            continue
        emote_match = re.search(emote_re, word)
        if emote_match is not None:
            # regexp for this is annoying. Need to add back in the closing : (e.g., :KEKW:)
            processed_words.append(emote_match.group(0) + ":")
        elif word in [GIF_TOKEN, IMG_TOKEN, LINK_TOKEN]:
            # Special tokens pass through untouched.
            processed_words.append(word)
        else:
            processed_words.append(word.lower())
    return " ".join(processed_words)
def split_into_user_context(convo:list) -> dict:
    """
    Splits a conversation into context + response pairs for model training.

    Input:
        - convo: A list of message dicts representing a conversation. First message is assumed to be index 0.
    Output:
        - author_data : Dictionary mapping author (str) to a list of jsonl lines (context/response/train
          records), where the corresponding author is the one who responded.
    """
    # This combines all contiguous user messages into one string.
    # Often times in discord, users will split their response into several individual messages. This corrects that.
    # Separate messages denoted by periods.
    combined_messages = []
    prev_author = None
    curr_message = ""
    for message in convo:
        author = message["author"]["username"]
        message_content = process_msg_text(message)
        if (author == prev_author or prev_author is None):
            curr_message += message_content + ". "
        else:
            combined_messages.append({
                "author" : prev_author,
                "message" : curr_message
            })
            curr_message = message_content + ". "
        prev_author = author
    # Can't forget the final message
    combined_messages.append({
        "author" : prev_author,
        "message" : curr_message
    })
    # Split these messages into context + response pairs.
    # CLEANUP: removed the unused `all_prev_context` accumulator (dead code that grew
    # quadratically with conversation length and was never read).
    author_data = {}
    prev_context = " "
    for message in combined_messages:
        author = message["author"]
        message_content = message["message"]
        if (author not in author_data):
            author_data[author] = []
        # Format for model training: "[BOS] {prompt} [SEP] {response} [EOS]"
        train_string = f"{BOS} " + prev_context + f" {SEP} " + message_content + f" {EOS}"
        # Format as jsonl
        author_data[author].append( json.dumps({"context" : prev_context, "response" : message_content, "train" : train_string}) + "\n" )
        prev_context = message_content
    return author_data
def main():
    """
    Reads all scraped messages, splits them into conversations and
    context/response pairs, then writes one .jsonl file per author.
    """
    secrets = load_secrets()
    # NOTE(review): currently unused after the threshold fix below; kept so a missing
    # "users" key in secrets still fails loudly here.
    user_to_id = secrets["users"]
    # File with all messages scraped using downloader.py
    base_message = FOLDER + "/all_messages.json"
    all_msgs_json = read_json(base_message)
    # BUG FIX: the second positional argument of break_into_conversations is
    # time_delta_threshold (an int, minutes); previously the user_to_id dict was
    # passed, which raised TypeError inside timedelta(minutes=...) for any
    # multi-message input. Use the default threshold instead.
    convos = break_into_conversations(all_msgs_json)
    all_author_data = {}
    for convo in convos:
        author_data = split_into_user_context(convo)
        for author, data in author_data.items():
            if (author not in all_author_data):
                all_author_data[author] = data
            else:
                all_author_data[author] += data
    # Write each author's data to their own .jsonl file.
    # NOTE: discord usernames can have periods in them. Deal with that as you need to.
    for author, all_data in all_author_data.items():
        with open(FOLDER + f"/user_messages/{author}.jsonl", 'w') as fp:
            fp.writelines(all_data)

if (__name__ == "__main__"):
    main()