-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcreate_dictionary.py
53 lines (46 loc) · 1.69 KB
/
create_dictionary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from nltk import WordNetLemmatizer
from pymongo import MongoClient
from nltk.corpus import stopwords
from gensim import corpora
import nltk
import time
import Config
# Module-level MongoDB handles shared by the functions below.
# The connection URL, database name, and collection name all come from Config.
_client = MongoClient(Config.MONGO_CONNECTION_URL)
db = _client[Config.ACADEMIC_DATASET_DB]
# Collection whose documents carry the 'cleaned_text' field read below.
cleaned_reviews = db[Config.CLEANED_REVIEWS]
def createDictionary(keep_n=10000):
    """Build, prune, and save the word-to-id dictionary for the reviews.

    The ``cleaned_text`` field of every document in ``cleaned_reviews`` is
    fed to a gensim ``Dictionary``. The dictionary maps each word to an
    integer id; it is NOT a table of appearance counts.

    Args:
        keep_n: Upper bound on the number of tokens kept by
            ``filter_extremes``. Defaults to 10000, the value that used to
            be hard-coded. No other ``filter_extremes`` thresholds are
            passed, so they stay at the library's defaults.

    Returns:
        The filtered dictionary. It is also saved to
        ``Config.DICTIONARY_LOCAL``.
    """
    t0 = time.time()
    # Single-argument print(...) behaves the same under the Python 2 print
    # statement and the Python 3 print function.
    print("Creating Dictionary")

    # Generator expression: reviews are pulled from the cursor one at a time
    # instead of being collected into a list first.
    token_lists = (review['cleaned_text'] for review in cleaned_reviews.find())
    dictionary = corpora.Dictionary(token_lists)

    dictionary.filter_extremes(keep_n=keep_n)
    # NOTE(review): filter_extremes appears to compact ids on its own in
    # current gensim releases; the explicit call is kept as a safeguard.
    dictionary.compactify()
    dictionary.save(Config.DICTIONARY_LOCAL)

    print("Done.")
    print("%s seconds" % (time.time() - t0))
    return dictionary
def createCorpus(dictionary):
    """Create the bag-of-words model for the reviews and save it.

    Every document in ``cleaned_reviews`` has its ``cleaned_text`` converted
    with ``dictionary.doc2bow``. The resulting vectors are written to
    ``Config.CORPUS_LOCAL`` through ``corpora.BleiCorpus.serialize``.

    Args:
        dictionary: Object providing ``doc2bow``, e.g. the value returned by
            ``createDictionary``.

    Returns:
        A list with one ``doc2bow`` result per review, in cursor order.
    """
    t0 = time.time()
    # Single-argument print(...) behaves the same under the Python 2 print
    # statement and the Python 3 print function.
    print("converting Documents to Bag of Words vectors...")

    # The vectors are collected into a list because the list itself is the
    # return value, so the whole corpus is held in memory.
    corpus = [
        dictionary.doc2bow(review['cleaned_text'])
        for review in cleaned_reviews.find()
    ]
    corpora.BleiCorpus.serialize(Config.CORPUS_LOCAL, corpus)

    print("Done.")
    print("%s seconds" % (time.time() - t0))
    return corpus
if __name__ == "__main__":
    # Build both artifacts from the database and write them to disk.
    dictionary = createDictionary()
    corpus = createCorpus(dictionary)

    # Read each artifact back from its saved location and time the load.
    # Single-argument print(...) behaves the same under the Python 2 print
    # statement and the Python 3 print function.
    t0 = time.time()
    print("Loading Dictionary")
    local_dictionary = corpora.Dictionary.load(Config.DICTIONARY_LOCAL)
    print("Done.")
    print("%s seconds" % (time.time() - t0))

    t0 = time.time()
    print("Loading Corpus")
    local_corpus = corpora.BleiCorpus(Config.CORPUS_LOCAL)
    print("Done.")
    print("%s seconds" % (time.time() - t0))