# NOTE: removed GitHub page-scrape residue (site navigation text and a
# copied line-number gutter, "1".."97") that preceded the actual script.
#!/usr/bin/env python3
##
## Authors: Adriano Marques
## Nathan Martins
## Thales Ribeiro
##
## Copyright (C) 2019 Exponential Ventures LLC
##
## This library is free software; you can redistribute it and/or
## modify it under the terms of the GNU Library General Public
## License as published by the Free Software Foundation; either
## version 2 of the License, or (at your option) any later version.
##
## This library is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## Library General Public License for more details.
##
## You should have received a copy of the GNU Library General Public
## License along with this library; if not, write to the Free Software
## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
##
# The benchmark timer is deliberately started BEFORE the heavyweight imports
# below, so import time is included in the measured pipeline duration.
# Do not regroup these imports to the top of the file.
from stripping.benchmark import Benchmark
benchmark = Benchmark('Black Friday with stripping')
benchmark.start()
import logging
from os.path import split, join
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from stripping import setup_stripping
# st registers pipeline steps via @st.step(); c is the shared context object
# that every step reads and writes.  Step state is cached in a '.stripping'
# directory next to this script.
st, c = setup_stripping(join(split(__file__)[0], '.stripping'))
logging.basicConfig(level=logging.DEBUG)
@st.step()
def load_dataset():
    """Locate and load the Black Friday CSV into the shared context."""
    base_dir = split(__file__)[0]
    c.bf_file = join(base_dir, "datasets", "black_friday.csv")
    logging.info(f"Processing file '{c.bf_file}' without using the Catalysis acceleration framework.")
    c.bf = pd.read_csv(c.bf_file)
@st.step()
def split_dataset():
    """Slice features/target out of the frame and split into train/test sets."""
    # Features are the first six columns; the target lives in column 9.
    c.X = c.bf.iloc[:, 0:6].values
    c.y = c.bf.iloc[:, 9].values
    split_result = train_test_split(c.X, c.y, test_size=0.15, random_state=0)
    c.X_train, c.X_test, c.y_train, c.y_test = split_result
@st.step()
def encode_labels():
    """Label-encode the non-numerical training-set feature columns in place.

    Columns 0, 1, 3 and 4 hold string categories the regressor cannot
    consume directly; each is mapped to integer codes.
    """
    c.x_train_encoder = LabelEncoder()
    # NOTE(review): the same encoder object is refit for every column, so
    # after this loop it only remembers the mapping for column 4; X_test is
    # never encoded here at all — confirm downstream handling.
    for col in (0, 1, 3, 4):
        c.X_train[:, col] = c.x_train_encoder.fit_transform(c.X_train[:, col])
@st.step()
def scale_values():
    """Standardize all feature columns to zero mean and unit variance.

    The scaler is fit on the training split only; the test split is
    transformed with the training statistics.
    """
    c.X_train_scaler = StandardScaler()
    # Kept for backward compatibility with any reader of c.X_test_scaler,
    # but it is no longer used to transform the data (see below).
    c.X_test_scaler = StandardScaler()
    c.X_train = c.X_train_scaler.fit_transform(c.X_train)
    # Bug fix: the original fit a SEPARATE scaler on X_test, which leaks
    # test-set statistics and scales the two splits inconsistently.  The
    # test set must be transformed with the train-fitted scaler.
    c.X_test = c.X_train_scaler.transform(c.X_test)
@st.step()
def train_model():
    """Fit a 1000-tree random forest regressor on the scaled training data."""
    forest = RandomForestRegressor(n_estimators=1000, random_state=0)
    forest.fit(c.X_train, c.y_train)
    c.regressor = forest
@st.step()
def measure_error():
    """Score the trained model on the held-out split via mean absolute error."""
    predictions = c.regressor.predict(c.X_test)
    c.y_pred = predictions
    c.error = mean_absolute_error(c.y_test, predictions)
# Run every @st.step-registered function above, in declaration order.
# NOTE(review): benchmark.start() is never paired with a stop/report call in
# this file — confirm the Benchmark object reports elsewhere.
st.execute()