-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlaunch_sdec.py
95 lines (58 loc) · 2.59 KB
/
launch_sdec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os
from keras_sdec import DeepEmbeddingClustering
import yaml
import numpy as np
import pandas as pd
from processing_utils import transformDataInCategory, divideDataset
# Open the configuration file and load the different arguments
with open('config.yaml') as f:
config = yaml.safe_load(f)
# Load the DataFrame from a Parquet file
df = pd.read_parquet('tce.parquet')
directory = config['output_embeddings']
files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.npy')]
all_embeddings = []
for file_path in files: # for each batch saved ..
# Load the stored tuples of (embedding, contract_id)
embeddings = np.load(file_path)
all_embeddings.extend(embeddings)
# Convert the list of embeddings into a large NumPy array
X = np.vstack(all_embeddings)
"""
Adding unidades and elemdespesatce
"""
unidades = df['Unidade']
elemdespesatce = df['ElemDespesaTCE']
# unidades_one_hot is going to return a table with 1484918 rows and 771 columns
# there are 771 unidades, so for each row, it's going to return true for its corresponding unidade
# and false for all the others
unidades_one_hot = transformDataInCategory(unidades)
# same thing goes for the elemdespesatce column
elemdespesatce_one_hot = transformDataInCategory(elemdespesatce)
# Next steps: concatenate unidades_one_hot and elemdespesa_one_hot to 'all_embeddings'
# Labels dataset
Y = np.array(df['IdContrato'].astype(str).tolist()) # # empty IdContrato values are '0'
# note that if X.shape and Y.shape are not the same, it will error.
# make sure to have all the embeddings
X_non_zero, X_zeros, Y_non_zero, Y_zeros = divideDataset(X, Y)
n_clusters = 20 # Assuming 10 clusters
input_dim = 384 # Your embedding dimension
print(X_non_zero.shape)
print(Y_non_zero.shape)
# Set the seed for reproducibility
np.random.seed(42)
# Determine how many samples to select (half of the dataset)
sample_size = X_non_zero.shape[0] // 10
# Randomly sample indices
indices = np.random.choice(X_non_zero.shape[0], sample_size, replace=False)
# Use the sampled indices to create a new subset of the data
X_non_zero_reduced = X_non_zero[indices]
Y_non_zero_reduced = Y_non_zero[indices]
print(X_non_zero_reduced.shape)
print(Y_non_zero_reduced.shape)
# Initialize the DeepEmbeddingClustering model
c = DeepEmbeddingClustering(n_clusters=n_clusters, input_dim=input_dim)
# Initialize the model with the embeddings
c.initialize(X_non_zero_reduced, finetune_iters=100, layerwise_pretrain_iters=50) # Adjust iterations as necessary
# Perform clustering
c.cluster(X_non_zero_reduced, y=Y_non_zero_reduced, iter_max=10)