## Oskar, EPS
### Script 3B: Topic modelling: clustering, hyperparameters and visualisation ############################
#####################################
import pandas as pd
from bertopic import BERTopic # overview of BERTopic: https://www.pinecone.io/learn/bertopic/
# if installation trouble: install hdbscan via pipwin, then run pip install --upgrade --force-reinstall joblib==1.1.0
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
#from sentence_transformers import SentenceTransformer
# from transformers import AutoTokenizer, AutoModel # The Hugging face way
import pickle
############# DISCUSSION #########
# Is k-means better than HDBSCAN? Swapping HDBSCAN out for k-means will result in a
# significantly less accurate model: HDBSCAN offers outlier detection, a hierarchical
# structure, density-based clustering, etc. (a swap sketch follows below).
# Reducing the number/size of topics is handled further down via reduce_topics()/merge_topics().
#################
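# A minimal sketch of that swap (n_clusters=50 is an arbitrary illustration): BERTopic
# accepts any clustering model with fit/predict in place of HDBSCAN, but note that
# k-means produces no outlier (-1) topic and no cluster hierarchy.
# from sklearn.cluster import KMeans
# kmeans_topic_model = BERTopic(hdbscan_model=KMeans(n_clusters=50))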
# 'all-MiniLM-L12-v2' / 'all-mpnet-base-v2' / 'allenai-specter' / 'paraphrase-MiniLM-L12-v2'
embedmodel = 'all-MiniLM-L12-v2'
# Load sentences & embeddings from disk
with open(f'data/bertopic/embeds/{embedmodel}_cleaned2023.pkl', "rb") as fIn:
    stored_data = pickle.load(fIn)
    stored_sentences = stored_data['sentences']
    stored_embeddings = stored_data['embeddings']
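# For reference, a minimal sketch of how these pickles were presumably produced in the
# preceding embedding script (variable names here are assumptions):
# from sentence_transformers import SentenceTransformer
# sentences = ...  # cleaned titles/abstracts
# embeddings = SentenceTransformer(embedmodel).encode(sentences, show_progress_bar=True)
# with open(f'data/bertopic/embeds/{embedmodel}_cleaned2023.pkl', "wb") as fOut:
#     pickle.dump({'sentences': sentences, 'embeddings': embeddings}, fOut)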
###### HYPERPARAMETERS
# Pre-computed embeddings make it fast to iterate over different versions of the model when
# optimising for a specific use case (a sweep sketch follows below).
### Non-edited TAK performs better than edited (still cleaned for ampersands and linebreaks).
### Leaf vs EOM [Go with leaf]
# From the HDBSCAN docs: if you are more interested in small homogeneous clusters, note that
# Excess of Mass (EOM) tends to pick one or two large clusters plus a number of small extra ones.
# Rather than reclustering the single large cluster, select 'leaf' as the cluster selection method:
# it selects leaf nodes from the condensed tree, producing many small homogeneous clusters.
# Comparison at 10/10, diversity=0.2/0.1: EOM gives fewer topics, less noise and larger clusters.
### Diversity [Go with 0.2]
# Noise is the same across settings; 0.3 and 0.0 make the clusters top-heavy, and 0.0 makes category 0 the largest.
### top_n_words [Go with 10]: fewer clusters, but more coherent. Default is 10; "use 10-20".
### min_cluster_size [Go with 10]: gives a few more, but clearer, clusters. BERTopic's min_topic_size == HDBSCAN's min_cluster_size.
### min_samples [Go with 10]
### n_neighbors [Go with 10]: the UMAP default is 15.
##################################
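# A hedged sketch of how the settings above can be compared; not part of the final
# pipeline, just an illustration of iterating over the pre-computed embeddings
# (the grid values are examples, not the full set actually tried):
# for method in ('eom', 'leaf'):
#     for min_cluster in (10, 15):
#         trial_model = BERTopic(
#             umap_model=UMAP(n_neighbors=10, min_dist=0.0, random_state=42),
#             hdbscan_model=HDBSCAN(min_cluster_size=min_cluster,
#                                   cluster_selection_method=method),
#             verbose=False)
#         trial_topics, _ = trial_model.fit_transform(stored_sentences, stored_embeddings)
#         print(method, min_cluster, len(trial_model.get_topic_info()))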
vector_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
umap_model = UMAP(n_neighbors=10, min_dist=0.0, random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=10, min_samples=10, cluster_selection_method='leaf',
                        gen_min_span_tree=True, prediction_data=False)
topic_model = BERTopic(vectorizer_model=vector_model,
                       # embedding_model=sentence_model,  # not needed: embeddings are passed pre-computed
                       hdbscan_model=hdbscan_model,
                       umap_model=umap_model,
                       diversity=0.2,  # topic-word diversity via MMR (a parameter of older BERTopic versions)
                       top_n_words=10,
                       language="english", nr_topics="auto", verbose=True, calculate_probabilities=False)
topics, probs = topic_model.fit_transform(stored_sentences, stored_embeddings)
len(topic_model.get_topic_info())  # number of topics found (incl. the -1 outlier row)
topic_model.get_topic_freq()       # document count per topic
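# Quick per-topic inspection with the standard BERTopic accessors:
topic_model.get_topic(0)   # top (word, c-TF-IDF score) pairs for topic 0
topic_model.get_topic(-1)  # -1 is the HDBSCAN outlier/noise "topic"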
# Save
topic_model.save(f"data/bertopic/topicmodels/{embedmodel}_cleaned2023")
########################### load a saved model (note: this path points to an earlier run, not the model saved above)
topic_model = BERTopic.load(f"data/bertopic/topicmodels/q4_takunedit_{embedmodel}")
### Merge or reduce the number of topics
topic_model.reduce_topics(stored_sentences, nr_topics=25)
# topic_model.merge_topics(stored_sentences, [10, 12])
### Increase ngrams to 3:
# topic_model.update_topics(stored_sentences, vectorizer_model=CountVectorizer(ngram_range=(1, 3), stop_words="english"))
###########################
topicswordprobs = pd.DataFrame(topic_model.get_topics())  # one column per topic; cells are (word, score) tuples
topicswordprobs.columns = range(topicswordprobs.shape[1])
topicswords = topicswordprobs.applymap(lambda x: x[0])    # keep only the words, drop the scores
topicsno = pd.DataFrame(topic_model.get_topic_info()['Topic'])
topicswords = pd.concat([topicsno.T, topicswords], axis=0).reset_index(drop=True)
topic_model.get_topic_info()
topicfreq = pd.DataFrame(topic_model.get_topic_freq().T)
topicdata = pd.concat([topicfreq, topicswords])           # DataFrame.append was removed in pandas 2.0
topicdata.T.to_clipboard()
## save topic model data
topicdata.T.to_csv("data/3topicdata.csv", encoding='utf-8-sig')
#############################################
# visualise barchart + intertopic distance
figbar = topic_model.visualize_barchart(top_n_topics=60, n_words=10, height=300, width=300)
figdistance = topic_model.visualize_topics(top_n_topics=60, height=1000, width=1000)
figbar.show()
figdistance.show()
# hierarchy
hierarchical_topics = topic_model.hierarchical_topics(stored_sentences)
fighiera = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics, width=600)
fighiera.show()
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)  # get_topic_tree returns a plain-text tree
topic_model.visualize_term_rank().show()
topic_model.visualize_term_rank(log_scale=True).show()
# visualise map of articles (takes a while)
figdocmap = topic_model.visualize_documents(stored_sentences, embeddings=stored_embeddings, hide_document_hover=True, height=210*5, width=297*5)
figdocmap.update_layout(uniformtext_minsize=12)  # font_size=25)
# figdocmap.update_annotations(font=dict(color="white", size=10))
figdocmap.show()
# modifying visuals: https://stackoverflow.com/questions/70952260/how-to-change-the-function-parameters-visualize-topics-over-time-in-bertopic
# https://github.com/MaartenGr/BERTopic/issues/126
# get source code: https://github.com/MaartenGr/BERTopic/tree/master/bertopic/plotting
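# BERTopic figures are plain Plotly objects, so the tweaks those links describe boil down
# to standard Plotly calls; a small illustration (values are arbitrary):
# figdistance.update_layout(title_text="Intertopic distance map", font=dict(size=14))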
# Save html figures:
pathtoviz = "data/bertopic/viz/"
figbar.write_html(pathtoviz + "fig_barchart.html")
figdistance.write_html(pathtoviz + "fig_distance.html")
fighiera.write_html(pathtoviz + "fig_hierarchy.html")
figdocmap.write_html(pathtoviz + "fig_docmap.html")
# save each article and its topic:
scopus = pd.read_csv("data/1cleaned.csv").loc[:,'eid':]
################## add topic numbers and topic_words to docs
# Concatenate topic keywords
topicwordscombined = topicswords.T.iloc[:,1:].apply(lambda row: ','.join(row.values.astype(str)), axis=1)
topicwordlists = pd.concat([topicswords.T.iloc[:,0],topicwordscombined],axis=1)
topicwordlists.columns = ['topicnr', 'topicwords']
# Get the topic number for each article (note: _map_predictions is a private BERTopic method)
topics = pd.DataFrame(topic_model._map_predictions(topic_model.hdbscan_model.labels_), columns=['topicnr'])
topics = pd.merge(topics, topicwordlists, how="left", on='topicnr')
topics
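# A hedged alternative without the private call: recent BERTopic versions expose the
# per-document assignments directly on the fitted model.
# topics = pd.DataFrame(topic_model.topics_, columns=['topicnr'])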
# Finally merge topicdata onto articles
# scopus = scopus.drop(['topicnr'], axis=1)
article_topics = pd.concat([scopus, topics], axis=1)
###### Import EIDs, which were previously screened manually
article_topics = article_topics[['eid', 'doi', 'subtype', 'affilname', 'author_count', 'citedby_count',
                                 'pubyear', 'journal_count', 'author_names', 'subjabbr', 'subjarea',
                                 'pubname', 'title', 'authkeywords', 'description', 'topicnr', 'topicwords']]
mastereidscanned = pd.read_csv("data/backup_scanned_ids/4eids_manuallyscanned.csv", encoding='utf-8-sig').loc[:,'eid':]
transfdscans = pd.merge(article_topics, mastereidscanned, how='left', on='eid')
len(article_topics), len(transfdscans), len(mastereidscanned)  # quick length check: articles / merged / screened
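# A small sanity check (an addition to the original workflow): the left merge should keep
# one row per article, so the lengths must match.
assert len(transfdscans) == len(article_topics), "merge changed row count - check for duplicate EIDs"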
transfdscans.to_csv("data/3cleaned_wtopics.csv", encoding='utf-8-sig')