comparison toto_wine_quality_train_eval.py @ 9:31d737992c63 draft

planemo upload for repository https://forgemia.inra.fr/nathalie.rousse/use/-/tree/dnn/DNN/galaxy-tools/wine_quality_train_eval commit e7fd13c34ec074a7ebc246301b5a80069dcbcc3a-dirty
author siwaa
date Thu, 05 Dec 2024 15:40:01 +0000
parents
children b432386c0f1c
comparison
equal deleted inserted replaced
8:7ee93e4cf2da 9:31d737992c63
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 ###############################################################################
5 # Module : model_wine_lightning
6 #
7 # This code has been extracted from 01-DNN-Wine-Regression-lightning.ipynb
8 # (fidle-tp/fidle-master-3.0.11/Wine.Lightning) then modified.
9 # Only first part kept :
10 # - (Retrieve data)
11 # - (Preparing the data)
12 # - (Build a model)
13 # - Train and save the model
14 #
15 # Inputs :
16 #
17 # -dataset_filepath : dataset file path (.csv).
18 # - File containing data used to train and test the model.
19 # - The dataset will be split into 2 parts :
20 # one for training and one for validation.
21 # - if unavailable (not given, not found...) : default dataset_filepath used
22 #
23 # Outputs :
24 #
25 # Output files under "OUTPUTS" folder (created if it does not exist)
26 #
27 # - Model file (model_ckpt_filepath) (.ckpt)
28 #
29 # - Normalization configuration file (norm_config_json_filepath) (.json)
30 #
31 # - Report file (report_json_filepath) (.json) containing:
32 # - Normalization configuration information
33 # - Evaluation score information
34 # example {'val_loss': 0.48292940855026245, 'val_mae': 0.524127721786499,
35 # 'val_mse': 0.48292940855026245}
36 #
37 # - Log files into Wine.Lightning/run/LWINE1/logs/reg_logs
38 #
39 # - Screen display containing running information :
40 # - training lines :
41 # Epoch 99: 100%|█| 64/64 [00:01<00:00, 44.89it/s, v_num=0, val_loss=0.483, val_mae=0.524, val_mse=0.483, t
42 # - Eval score : x_test / loss : 0.4829
43 # x_test / mae : 0.5241
44 # x_test / mse : 0.4829
45 # - ...
46 #
47 ###############################################################################
48
49 # <img width="800px" src="../fidle/img/header.svg"></img>
50 #
51 # # <!-- TITLE --> [LWINE1] - Wine quality prediction with a Dense Network (DNN)
52 # <!-- DESC --> Another example of regression, with a wine quality prediction, using PyTorch Lightning
53 # <!-- AUTHOR : Achille Mbogol Touye (EFFILIA-MIAI/SIMaP) -->
54 #
55 # ## Objectives :
56 # - Predict the **quality of wines**, based on their analysis
57 # - Understanding the principle and the architecture of a regression with a dense neural network with backup and restore of the trained model.
58 #
59 # The **[Wine Quality datasets](https://archive.ics.uci.edu/ml/datasets/wine+Quality)** are made up of analyses of a large number of wines, with an associated quality (between 0 and 10)
60 # This dataset is provided by :
61 # Paulo Cortez, University of Minho, Guimarães, Portugal, http://www3.dsi.uminho.pt/pcortez
62 # A. Cerdeira, F. Almeida, T. Matos and J. Reis, Viticulture Commission of the Vinho Verde Region(CVRVV), Porto, Portugal, @2009
63 # This dataset can be retrieved at [University of California Irvine (UCI)](https://archive-beta.ics.uci.edu/ml/datasets/wine+quality)
64 #
65 #
66 # Due to privacy and logistic issues, only physicochemical and sensory variables are available
67 # There is no data about grape types, wine brand, wine selling price, etc.
68 #
69 # - fixed acidity
70 # - volatile acidity
71 # - citric acid
72 # - residual sugar
73 # - chlorides
74 # - free sulfur dioxide
75 # - total sulfur dioxide
76 # - density
77 # - pH
78 # - sulphates
79 # - alcohol
80 # - quality (score between 0 and 10)
81 #
82 # ## What we're going to do :
83 #
84 # - (Retrieve data)
85 # - (Preparing the data)
86 # - (Build a model)
87 # - Train and save the model
88 # - Restore saved model
89 # - Evaluate the model
90 # - Make some predictions
91 #
# Tag prepended to the step banners and warnings printed by this script.
HEAD = "[model_wine_lightning.wine_quality_train_eval]"

# ## Step 1 - Import and init
print(f"\n{HEAD}", "# ## Step 1 - Import and init\n")
96
97 # Import some packages
98 import os
99 import lightning.pytorch as pl
100 import torchvision.transforms as T
101 from IPython.display import display, HTML
102 from torch.utils.data import DataLoader, random_split
103 from model_wine_lightning.modules.progressbar import CustomTrainProgressBar
104 from model_wine_lightning.modules.data_load import WineQualityDataset
105 from model_wine_lightning.modules.data_load import Normalize, ToTensor
106 from model_wine_lightning.modules.model import LitRegression
107 from lightning.pytorch.loggers.tensorboard import TensorBoardLogger
108 import fidle
109 import json
110 import shutil
111 import argparse
112 from pprint import pprint
113
# Folder receiving every output file written by this script.
OUTPUTS_PATH = "OUTPUTS"

# Message accumulators, empty by default; the non-empty ones are copied into
# the report at the end of the run.
error_msg = ""
warn_msg = ""
more_msg = ""

# Output file paths, all located under OUTPUTS_PATH.
model_ckpt_filepath = os.path.join(OUTPUTS_PATH, "model.ckpt")
norm_config_json_filepath = os.path.join(OUTPUTS_PATH, "norm_config.json")
report_json_filepath = os.path.join(OUTPUTS_PATH, "report.json")

# Report content, filled in as the run progresses.
report = {}
122
# Main pipeline: check the outputs folder, read the command-line input, then
# load, normalize and split the dataset, train the model, evaluate it and
# write the output files.  Any exception raised on the way is caught at the
# end of this block and appended to error_msg, so that the code following
# this block can still build a report.
try:
    # The outputs folder is created when missing (with a warning).
    if not os.path.exists(OUTPUTS_PATH):  # the 'is a file' case is not handled here
        os.mkdir(OUTPUTS_PATH)
        message = ("Outputs folder '" + OUTPUTS_PATH
                   + "' does not exist => created.")
        warn_msg += message + " "
        print(HEAD, "Warning :", message)

    # A path that exists but is not a folder is an error.
    message = "Outputs folder '" + OUTPUTS_PATH + "' must exist."
    if not os.path.isdir(OUTPUTS_PATH):
        error_msg += message + " "
        raise Exception(message)

    # ## INPUTS
    print("\n"+HEAD, "# ## INPUTS\n")

    parser = argparse.ArgumentParser()

    help_text = "dataset file path (.csv)"
    parser.add_argument("-dataset_filepath", type=str, help=help_text)

    args = parser.parse_args()

    dataset_filepath = None  # default (case default data file)
    path = args.dataset_filepath
    # The literal string 'None' is treated like a missing argument.
    if (path is not None) and (path != 'None'):
        if os.path.isfile(path):
            dataset_filepath = path
            print(HEAD, "dataset file used :", path)
        else:
            message = ("dataset file '" + path
                       + "' not found => default data file used.")
            # Trailing space keeps successive warnings separated.
            warn_msg += message + " "
            print(HEAD, "Warning :", message)
    else:
        message = "no dataset_filepath given => default data file used."
        warn_msg += message + " "
        print(HEAD, "Warning :", message)

    # Init Fidle environment
    print("\n"+HEAD, "# Init Fidle environment\n")
    run_id, run_dir, datasets_dir = fidle.init('LWINE1_train_eval')

    # Verbosity during training :
    # - 0 = silent
    # - 1 = progress bar
    # - 2 = one line per epoch
    fit_verbosity = 1
    dataset_name = 'winequality-red.csv'  # default data file

    # Override parameters (batch mode) - Just forget this cell
    fidle.override('fit_verbosity', 'dataset_name')

    # ## Step 2 - Retrieve data
    print("\n"+HEAD, "# ## Step 2 - Retrieve data\n")

    if dataset_filepath is None:  # default data file
        dataset_filepath = f'{datasets_dir}/WineQuality/origine/{dataset_name}'
        print(HEAD, "Dataset file used :", dataset_filepath)

    # Verify: stop here with an explicit message (same pattern as the
    # outputs folder check) instead of going on with a missing file.
    message = "Dataset file '" + dataset_filepath + "' not found."
    if not os.path.isfile(dataset_filepath):
        error_msg += message + " "
        raise FileNotFoundError(message)

    # Raw (not normalized) dataset, used for display only.
    datasets = WineQualityDataset(dataset_filepath)
    print("datasets:")
    display(datasets.data.head(5))
    print('Missing Data : ', datasets.data.isna().sum().sum(),
          ' Shape is : ', datasets.data.shape)

    # ## Step 3 - Preparing the data
    print("\n"+HEAD, "# ## Step 3 - Preparing the data\n")

    # ### 3.1 - Data normalization
    print("\n"+HEAD, "# ### 3.1 - Data normalization\n")
    # **Note :**
    # - All input features must be normalized.
    # - To do this we will subtract the mean and divide by the standard
    #   deviation for each input features.
    # - Then we convert numpy array features and target **(quality)** to
    #   torch tensor

    N = Normalize(dataset_filepath)
    # Normalization parameters, saved later as an output file and into the
    # report.
    norm_config = {"mean_json": N.mean_json, "std_json": N.std_json,
                   "min_json": N.min_json, "max_json": N.max_json}
    transforms = T.Compose([N, ToTensor()])
    dataset = WineQualityDataset(dataset_filepath, transform=transforms)

    print("Before normalization :")
    display(datasets[:]["features"])
    print("After normalization :")
    display(dataset[:]["features"])

    # ### 3.2 - Split data
    print("\n"+HEAD, "# ### 3.2 - Split data\n")
    # We will use 80% of the data for training and 20% for validation.
    # x will be the features data of the analysis and y the target (quality)

    # ---- Split => train, test
    data_train_len = int(len(dataset)*0.8)         # get 80 %
    data_test_len = len(dataset) - data_train_len  # test = all - train

    # ---- Split => x,y with random_split
    data_train_subset, data_test_subset = random_split(
        dataset, [data_train_len, data_test_len])

    x_train = data_train_subset[:]["features"]
    y_train = data_train_subset[:]["quality"]

    x_test = data_test_subset[:]["features"]
    y_test = data_test_subset[:]["quality"]

    print('Original data shape was : ', dataset.data.shape)
    print('x_train : ', x_train.shape, 'y_train : ', y_train.shape)
    print('x_test : ', x_test.shape, 'y_test : ', y_test.shape)

    # ### 3.3 - For Training model use Dataloader
    print("\n"+HEAD, "# ### 3.3 - For Training model use Dataloader\n")
    # The Dataset retrieves our dataset's features and labels one sample at
    # a time. While training a model, we typically want to pass samples in
    # minibatches, reshuffle the data at every epoch to reduce model
    # overfitting. DataLoader is an iterable that abstracts this complexity
    # for us in an easy API.

    # train batch data
    train_loader = DataLoader(dataset=data_train_subset,
                              shuffle=True, batch_size=20, num_workers=2)
    # test batch data
    test_loader = DataLoader(dataset=data_test_subset,
                             shuffle=False, batch_size=20, num_workers=2)

    # ## Step 4 - Build a model
    print("\n"+HEAD, "# ## Step 4 - Build a model\n")

    # ## 5 - Train the model
    print("\n"+HEAD, "# ## 5 - Train the model\n")

    # ### 5.1 - Get it
    print("\n"+HEAD, "# ### 5.1 - Get it\n")
    print(HEAD, "Model creation")
    reg = LitRegression(in_features=11)
    print(reg)

    # ### 5.2 - Add callback
    print("\n"+HEAD, "# ### 5.2 - Add callback\n")
    # Folder where the checkpoint callback writes the best model.
    save_dir = "./run/models/"
    os.makedirs(save_dir, exist_ok=True)
    filename = 'best-model-{epoch}-{val_loss:.2f}'
    savemodel_callback = pl.callbacks.ModelCheckpoint(
        dirpath=save_dir, filename=filename, save_top_k=1,
        verbose=False, monitor="val_loss")

    # ### 5.3 - Train it
    print("\n"+HEAD, "# ### 5.3 - Train it\n")

    # loggers data
    os.makedirs(f'{run_dir}/logs', mode=0o750, exist_ok=True)
    logger = TensorBoardLogger(save_dir=f'{run_dir}/logs', name="reg_logs")

    # train model (the test loader is used as validation data)
    trainer = pl.Trainer(accelerator='auto', max_epochs=100,
                         logger=logger, num_sanity_val_steps=0,
                         callbacks=[savemodel_callback,
                                    CustomTrainProgressBar()])
    trainer.fit(model=reg, train_dataloaders=train_loader,
                val_dataloaders=test_loader)

    # ## Step 6 - Evaluate it
    print("\n"+HEAD, "# ## Step 6 - Evaluate it\n")

    # ### 6.1 - Model evaluation
    print("\n"+HEAD, "# ### 6.1 - Model evaluation\n")
    # MAE = Mean Absolute Error (between the labels and predictions)
    # A MAE equal to 0.5 means that the predicted quality is, on average,
    # 0.5 point away from the real quality.
    score = trainer.validate(model=reg, dataloaders=test_loader,
                             verbose=False)

    print('x_test / loss : {:5.4f}'.format(score[0]['val_loss']))
    print('x_test / mae : {:5.4f}'.format(score[0]['val_mae']))
    print('x_test / mse : {:5.4f}'.format(score[0]['val_mse']))

    # ### 6.2 - Training history
    print("\n"+HEAD, "# ### 6.2 - Training history\n")
    #
    # To access logs with tensorboard :
    # - Under **Docker**, from a terminal launched via the jupyterlab
    #   launcher, use the following command:<br>
    #   ```tensorboard --logdir <path-to-logs> --host 0.0.0.0```
    # - If you're **not using Docker**, from a terminal :<br>
    #   ```tensorboard --logdir <path-to-logs>```
    #
    # **Note:** One tensorboard instance can be used simultaneously.

    # ## OUTPUTS
    print("\n"+HEAD, "# ## OUTPUTS\n")

    # Model (.ckpt) copy of savemodel_callback.best_model_path (under save_dir)
    savemodel_path = savemodel_callback.best_model_path
    shutil.copyfile(src=savemodel_path, dst=model_ckpt_filepath)
    print("OUTPUT:", "Model :", model_ckpt_filepath)
    print(" (is a copy of: Best model file ", savemodel_path, ")")

    # Save norm_config as .json file
    with open(norm_config_json_filepath, "w") as outfile:
        json.dump(norm_config, outfile)
    print("OUTPUT:",
          "Normalization configuration file (containing norm_config) :",
          norm_config_json_filepath)

    # Report (json) :
    # - normalization configuration information
    # - evaluation score information
    #   example {'val_loss': 0.48292940855026245, 'val_mae': 0.524127721786499,
    #            'val_mse': 0.48292940855026245}
    report['eval_score'] = score[0]
    report['norm_config'] = norm_config
    report['best_model_file'] = savemodel_path

    fidle.end()

except Exception as e:
    # Top-level boundary: keep the failure as text for the report instead of
    # aborting the script.
    error_msg += type(e).__name__ + str(e.args) + ". "
338
# Copy the collected messages into the report (only the non-empty ones).
if error_msg != "":
    report["error"] = error_msg
if more_msg != "":
    report["more"] = more_msg
if warn_msg != "":
    report["warning"] = warn_msg

print("OUTPUT:", "Report: ")
pprint(report)

# Save Report as .json file.
# Writing the report stays best-effort (a failure must not crash the script),
# but the failure is displayed instead of being silently ignored.
try:
    with open(report_json_filepath, "w") as outfile:
        json.dump(report, outfile)
except Exception as e:
    print(HEAD, "Warning :",
          "Report file '" + report_json_filepath + "' not saved :",
          type(e).__name__ + str(e.args))
else:
    print("OUTPUT:", "Report file (containing report) :", report_json_filepath)
353
354 # ---
355 # <img width="80px" src="../fidle/img/logo-paysage.svg"></img>
356