Mercurial > repos > siwaa > wine_quality_train_eval
comparison toto_wine_quality_train_eval.py @ 9:31d737992c63 draft
planemo upload for repository https://forgemia.inra.fr/nathalie.rousse/use/-/tree/dnn/DNN/galaxy-tools/wine_quality_train_eval commit e7fd13c34ec074a7ebc246301b5a80069dcbcc3a-dirty
author | siwaa |
---|---|
date | Thu, 05 Dec 2024 15:40:01 +0000 |
parents | |
children | b432386c0f1c |
comparison
equal
deleted
inserted
replaced
8:7ee93e4cf2da | 9:31d737992c63 |
---|---|
1 #!/usr/bin/env python | |
2 # coding: utf-8 | |
3 | |
4 ############################################################################### | |
5 # Module : model_wine_lightning | |
6 # | |
7 # This code has been extracted from 01-DNN-Wine-Regression-lightning.ipynb | |
8 # (fidle-tp/fidle-master-3.0.11/Wine.Lightning) then modified. | |
9 # Only first part kept : | |
10 # - (Retrieve data) | |
11 # - (Preparing the data) | |
12 # - (Build a model) | |
13 # - Train and save the model | |
14 # | |
15 # Inputs : | |
16 # | |
17 # -dataset_filepath : dataset file path (.csv). | |
18 # - File containing data used to train and test the model. | |
19 # - The dataset will be split into 2 parts : | |
20 # one for training and one for validation. | |
21 # - if unavailable (not given, not found...) : default dataset_filepath used | |
22 # | |
23 # Outputs : | |
24 # | |
25 # Output files under "OUTPUTS" folder (created if missing; must be a directory) | |
26 # | |
27 # - Model file (model_ckpt_filepath) (.ckpt) | |
28 # | |
29 # - Normalization configuration file (norm_config_json_filepath) (.json) | |
30 # | |
31 # - Report file (report_json_filepath) (.json) containing: | |
32 # - Normalization configuration information | |
33 # - Evaluation score information | |
34 # example {'val_loss': 0.48292940855026245, 'val_mae': 0.524127721786499, | |
35 # 'val_mse': 0.48292940855026245} | |
36 # | |
37 # - Log files into Wine.Lightning/run/LWINE1/logs/reg_logs | |
38 # | |
39 # - Screen display containing running information : | |
40 # - training lines : | |
41 # Epoch 99: 100%|█| 64/64 [00:01<00:00, 44.89it/s, v_num=0, val_loss=0.483, val_mae=0.524, val_mse=0.483, t | |
42 # - Eval score : x_test / loss : 0.4829 | |
43 # x_test / mae : 0.5241 | |
44 # x_test / mse : 0.4829 | |
45 # - ... | |
46 # | |
47 ############################################################################### | |
48 | |
49 # <img width="800px" src="../fidle/img/header.svg"></img> | |
50 # | |
51 # # <!-- TITLE --> [LWINE1] - Wine quality prediction with a Dense Network (DNN) | |
52 # <!-- DESC --> Another example of regression, with a wine quality prediction, using PyTorch Lightning | |
53 # <!-- AUTHOR : Achille Mbogol Touye (EFFILIA-MIAI/SIMaP) --> | |
54 # | |
55 # ## Objectives : | |
56 # - Predict the **quality of wines**, based on their analysis | |
57 # - Understanding the principle and the architecture of a regression with a dense neural network with backup and restore of the trained model. | |
58 # | |
59 # The **[Wine Quality datasets](https://archive.ics.uci.edu/ml/datasets/wine+Quality)** are made up of analyses of a large number of wines, with an associated quality (between 0 and 10) | |
60 # This dataset is provided by : | |
61 # Paulo Cortez, University of Minho, Guimarães, Portugal, http://www3.dsi.uminho.pt/pcortez | |
62 # A. Cerdeira, F. Almeida, T. Matos and J. Reis, Viticulture Commission of the Vinho Verde Region(CVRVV), Porto, Portugal, @2009 | |
63 # This dataset can be retrieved from [University of California Irvine (UCI)](https://archive-beta.ics.uci.edu/ml/datasets/wine+quality) | |
64 # | |
65 # | |
66 # Due to privacy and logistic issues, only physicochemical and sensory variables are available | |
67 # There is no data about grape types, wine brand, wine selling price, etc. | |
68 # | |
69 # - fixed acidity | |
70 # - volatile acidity | |
71 # - citric acid | |
72 # - residual sugar | |
73 # - chlorides | |
74 # - free sulfur dioxide | |
75 # - total sulfur dioxide | |
76 # - density | |
77 # - pH | |
78 # - sulphates | |
79 # - alcohol | |
80 # - quality (score between 0 and 10) | |
81 # | |
82 # ## What we're going to do : | |
83 # | |
84 # - (Retrieve data) | |
85 # - (Preparing the data) | |
86 # - (Build a model) | |
87 # - Train and save the model | |
88 # - Restore saved model | |
89 # - Evaluate the model | |
90 # - Make some predictions | |
91 # | |
# Prefix put in front of the messages printed by this script.
HEAD = "[model_wine_lightning.wine_quality_train_eval]"

# ## Step 1 - Import and init
print("\n"+HEAD,"# ## Step 1 - Import and init\n")

# Import some packages
# (kept after the banner print above, in the same order as the original
# script, so that the console output order is unchanged)
import os
import lightning.pytorch as pl
import torchvision.transforms as T
from IPython.display import display, HTML
from torch.utils.data import DataLoader, random_split
from model_wine_lightning.modules.progressbar import CustomTrainProgressBar
from model_wine_lightning.modules.data_load import WineQualityDataset
from model_wine_lightning.modules.data_load import Normalize, ToTensor
from model_wine_lightning.modules.model import LitRegression
from lightning.pytorch.loggers.tensorboard import TensorBoardLogger
import fidle
import json
import shutil
import argparse
from pprint import pprint

# Folder receiving the output files. It is created below when it does not
# exist; the run is aborted when the path exists but is not a directory.
OUTPUTS_PATH = "OUTPUTS"

# Messages accumulated during the run, copied into the report at the end
# under the keys "error", "warning" and "more" (only when non-empty).
error_msg, warn_msg, more_msg = "", "", ""  # default

model_ckpt_filepath = os.path.join(OUTPUTS_PATH, "model.ckpt")
norm_config_json_filepath = os.path.join(OUTPUTS_PATH, "norm_config.json")
report_json_filepath = os.path.join(OUTPUTS_PATH, "report.json")
report = dict()  # init

# The whole processing runs inside one try block : any exception is turned
# into an "error" entry of the report instead of a traceback.
try:
    if not os.path.exists(OUTPUTS_PATH):
        os.mkdir(OUTPUTS_PATH)
        message = ("Outputs folder '" + OUTPUTS_PATH
                   + "' does not exist => created.")
        warn_msg += message + " "
        print(HEAD, "Warning :", message)

    # Covers the case where OUTPUTS_PATH exists but is not a directory
    # (for example a regular file), which the creation step above ignores.
    message = "Outputs folder '" + OUTPUTS_PATH + "' must exist."
    if not os.path.isdir(OUTPUTS_PATH):
        error_msg += message + " "
        raise Exception(message)

    # ## INPUTS
    print("\n"+HEAD, "# ## INPUTS\n")

    parser = argparse.ArgumentParser()

    help_text = "dataset file path (.csv)"
    parser.add_argument("-dataset_filepath", type=str, help=help_text)

    args = parser.parse_args()

    # dataset_filepath stays None when no usable file is given : the
    # default data file is then selected in step 2.
    dataset_filepath = None  # default (case default data file)
    path = args.dataset_filepath
    if (path is not None) and (path != 'None'):
        if os.path.isfile(path):
            dataset_filepath = path
            print(HEAD, "dataset file used :", path)
        else:
            message = ("dataset file '" + path
                       + "' not found => default data file used.")
            warn_msg += message + " "
            print(HEAD, "Warning :", message)
    else:
        message = "no dataset_filepath given => default data file used."
        warn_msg += message + " "
        print(HEAD, "Warning :", message)

    # Init Fidle environment
    print("\n"+HEAD, "# Init Fidle environment\n")
    run_id, run_dir, datasets_dir = fidle.init('LWINE1_train_eval')

    # Verbosity during training :
    # - 0 = silent
    # - 1 = progress bar
    # - 2 = one line per epoch
    fit_verbosity = 1
    dataset_name = 'winequality-red.csv'  # default data file

    # Override parameters (batch mode) - Just forget this cell
    fidle.override('fit_verbosity', 'dataset_name')

    # ## Step 2 - Retrieve data
    print("\n"+HEAD,"# ## Step 2 - Retrieve data\n")

    if dataset_filepath is None:  # default data file
        dataset_filepath = f'{datasets_dir}/WineQuality/origine/{dataset_name}'
        print(HEAD, "Dataset file used :", dataset_filepath)

    # Verify : stop here with an explicit error when the selected file is
    # missing, instead of only recording the message and carrying on.
    message = "Dataset file '" + dataset_filepath + "' not found."
    if not os.path.isfile(dataset_filepath):
        error_msg += message + " "
        raise FileNotFoundError(message)

    # Dataset without transform, used to show the data before normalization
    datasets = WineQualityDataset(dataset_filepath)
    print("datasets:")
    #display(datasets.data.head(5).style.format("{0:.2f}"))
    display(datasets.data.head(5))
    print('Missing Data : ',datasets.data.isna().sum().sum(),
          ' Shape is : ', datasets.data.shape)

    # ## Step 3 - Preparing the data
    print("\n"+HEAD,"# ## Step 3 - Preparing the data\n")

    # ### 3.1 - Data normalization
    print("\n"+HEAD,"# ### 3.1 - Data normalization\n")
    # **Note :**
    # - All input features must be normalized.
    # - To do this we will subtract the mean and divide by the standard
    #   deviation for each input features.
    # - Then we convert numpy array features and target **(quality)** to
    #   torch tensor

    N = Normalize(dataset_filepath)
    # Values saved later as the normalization configuration file and
    # included in the report.
    norm_config = {"mean_json":N.mean_json, "std_json":N.std_json,
                   "min_json":N.min_json, "max_json":N.max_json}
    transforms = T.Compose([N, ToTensor()])
    dataset = WineQualityDataset(dataset_filepath, transform=transforms)

    print("Before normalization :")
    display(datasets[:]["features"])
    print("After normalization :")
    display(dataset[:]["features"])

    # ### 3.2 - Split data
    print("\n"+HEAD,"# ### 3.2 - Split data\n")
    # We will use 80% of the data for training and 20% for validation.
    # x will be the features data of the analysis and y the target (quality)

    # ---- Split => train, test
    data_train_len = int(len(dataset)*0.8)         # get 80 %
    data_test_len = len(dataset) - data_train_len  # test = all - train

    # ---- Split => x,y with random_split
    data_train_subset, data_test_subset = random_split(
        dataset, [data_train_len, data_test_len])

    x_train = data_train_subset[:]["features"]
    y_train = data_train_subset[:]["quality" ]

    x_test = data_test_subset[:]["features"]
    y_test = data_test_subset[:]["quality" ]

    print('Original data shape was : ',dataset.data.shape)
    print('x_train : ',x_train.shape, 'y_train : ',y_train.shape)
    print('x_test : ',x_test.shape, 'y_test : ',y_test.shape)

    # ### 3.3 - For Training model use Dataloader
    print("\n"+HEAD,"# ### 3.3 - For Training model use Dataloader\n")
    # The Dataset retrieves our dataset's features and labels one sample at
    # a time. While training a model, we typically want to pass samples in
    # minibatches, reshuffle the data at every epoch to reduce model
    # overfitting. DataLoader is an iterable that abstracts this complexity
    # for us in an easy API.

    # train batch data
    train_loader = DataLoader(dataset=data_train_subset,
                              shuffle=True, batch_size=20, num_workers=2)
    # test batch data
    test_loader = DataLoader(dataset=data_test_subset,
                             shuffle=False, batch_size=20, num_workers=2)

    # ## Step 4 - Build a model
    print("\n"+HEAD,"# ## Step 4 - Build a model\n")

    # ## 5 - Train the model
    print("\n"+HEAD,"# ## 5 - Train the model\n")

    # ### 5.1 - Get it
    print("\n"+HEAD,"# ### 5.1 - Get it\n")
    print(HEAD, "Model creation")
    reg = LitRegression(in_features=11)
    print(reg)

    # ### 5.2 - Add callback
    print("\n"+HEAD,"# ### 5.2 - Add callback\n")
    os.makedirs('./run/models', exist_ok=True)
    save_dir = "./run/models/"
    filename = 'best-model-{epoch}-{val_loss:.2f}'
    # Checkpoint callback monitoring "val_loss" with save_top_k=1 ; its
    # best_model_path is copied to the OUTPUTS folder after training.
    savemodel_callback = pl.callbacks.ModelCheckpoint(
        dirpath=save_dir, filename=filename, save_top_k=1,
        verbose=False, monitor="val_loss")

    # ### 5.3 - Train it
    print("\n"+HEAD,"# ### 5.3 - Train it\n")

    # loggers data
    os.makedirs(f'{run_dir}/logs', mode=0o750, exist_ok=True)
    logger = TensorBoardLogger(save_dir=f'{run_dir}/logs', name="reg_logs")

    # train model
    trainer = pl.Trainer(accelerator='auto', max_epochs=100,
                         logger=logger, num_sanity_val_steps=0,
                         callbacks=[savemodel_callback,
                                    CustomTrainProgressBar()])
    trainer.fit(model=reg, train_dataloaders=train_loader,
                val_dataloaders=test_loader)

    # ## Step 6 - Evaluate it
    print("\n"+HEAD,"# ## Step 6 - Evaluate it\n")

    # ### 6.1 - Model evaluation
    print("\n"+HEAD,"# ### 6.1 - Model evaluation\n")
    # MAE = Mean Absolute Error (between the labels and predictions)
    # A mae equal to 0.5 represents an average prediction error of 0.5
    # quality point (quality is a score between 0 and 10).
    score = trainer.validate(model=reg, dataloaders=test_loader,
                             verbose=False)

    print('x_test / loss : {:5.4f}'.format(score[0]['val_loss']))
    print('x_test / mae : {:5.4f}'.format(score[0]['val_mae']))
    print('x_test / mse : {:5.4f}'.format(score[0]['val_mse']))

    # ### 6.2 - Training history
    print("\n"+HEAD,"# ### 6.2 - Training history\n")
    #
    # To access logs with tensorboard :
    # - Under **Docker**, from a terminal launched via the jupyterlab
    #   launcher, use the following command:<br>
    #   ```tensorboard --logdir <path-to-logs> --host 0.0.0.0```
    # - If you're **not using Docker**, from a terminal :<br>
    #   ```tensorboard --logdir <path-to-logs>```
    #
    # **Note:** One tensorboard instance can be used simultaneously.

    # ## OUTPUTS
    print("\n"+HEAD,"# ## OUTPUTS\n")

    # Model (.ckpt) copy of savemodel_callback.best_model_path (under save_dir)
    savemodel_path = savemodel_callback.best_model_path
    shutil.copyfile(src=savemodel_path, dst=model_ckpt_filepath)
    print("OUTPUT:", "Model :", model_ckpt_filepath)
    print(" (is a copy of: Best model file ", savemodel_path, ")")

    # Save norm_config as .json file
    with open(norm_config_json_filepath, "w") as outfile:
        json.dump(norm_config, outfile)
    print("OUTPUT:",
          "Normalization configuration file (containing norm_config) :",
          norm_config_json_filepath)

    # Report (json) :
    # - normalization configuration information
    # - evaluation score information
    #   example {'val_loss': 0.48292940855026245, 'val_mae': 0.524127721786499,
    #            'val_mse': 0.48292940855026245}
    report['eval_score'] = score[0]
    report['norm_config'] = norm_config
    report['best_model_file'] = savemodel_path

    fidle.end()

except Exception as e :
    # Any failure above ends up in the report instead of stopping the script.
    error_msg += type(e).__name__ + str(e.args) + ". "

if error_msg != "": report["error"] = error_msg
if more_msg != "": report["more"] = more_msg
if warn_msg != "": report["warning"] = warn_msg

print("OUTPUT:", "Report: ")
pprint(report)

# Save Report as .json file
# Best effort : a failure to write the report must not stop the script, but
# it is printed so that a missing report file can be understood.
try:
    with open(report_json_filepath, "w") as outfile:
        json.dump(report, outfile)
    print("OUTPUT:", "Report file (containing report) :", report_json_filepath)
except Exception as e:
    print(HEAD, "Warning :",
          "Report file '" + report_json_filepath + "' not saved :",
          type(e).__name__ + str(e.args))
354 # --- | |
355 # <img width="80px" src="../fidle/img/logo-paysage.svg"></img> | |
356 |