Mercurial > repos > siwaa > wine_quality_train_eval
view toto_wine_quality_train_eval.py @ 10:b432386c0f1c draft
planemo upload for repository https://forgemia.inra.fr/nathalie.rousse/use/-/tree/dnn/DNN/galaxy-tools/wine_quality_train_eval commit e7fd13c34ec074a7ebc246301b5a80069dcbcc3a-dirty
author | siwaa |
---|---|
date | Thu, 05 Dec 2024 15:47:56 +0000 |
parents | 31d737992c63 |
children | b5f69f836e03 |
line wrap: on
line source
#!/usr/bin/env python
# coding: utf-8

###############################################################################
# Module : model_wine_lightning
#
# This code has been extracted from 01-DNN-Wine-Regression-lightning.ipynb
# (fidle-tp/fidle-master-3.0.11/Wine.Lightning) then modified.
# Only first part kept :
# - (Retrieve data)
# - (Preparing the data)
# - (Build a model)
# - Train and save the model
#
# Inputs :
#
# -dataset_filepath : dataset file path (.csv).
#   - File containing data used to train and test the model.
#   - The dataset will be split in 2 parts :
#     one for training and one for validation.
#   - if unavailable (not given, not found...) : default dataset_filepath used
#
# Outputs :
#
# Output files under "OUTPUTS" folder (must exist !!!)
#
# - Model file (model_ckpt_filepath) (.ckpt)
#
# - Normalization configuration file (norm_config_json_filepath) (.json)
#
# - Report file (report_json_filepath) (.json) containing:
#   - Normalization configuration information
#   - Evaluation score information
#     example {'val_loss': 0.48292940855026245, 'val_mae': 0.524127721786499,
#              'val_mse': 0.48292940855026245}
#
# - Log files into Wine.Lightning/run/LWINE1/logs/reg_logs
#
# - Screen display containing running information :
#   - training lines :
#     Epoch 99: 100%|█| 64/64 [00:01<00:00, 44.89it/s, v_num=0, val_loss=0.483, val_mae=0.524, val_mse=0.483, t
#   - Eval score : x_test / loss : 0.4829
#                  x_test / mae : 0.5241
#                  x_test / mse : 0.4829
#   - ...
#
###############################################################################

# <img width="800px" src="../fidle/img/header.svg"></img>
#
# # <!-- TITLE --> [LWINE1] - Wine quality prediction with a Dense Network (DNN)
# <!-- DESC --> Another example of regression, with a wine quality prediction, using PyTorch Lightning
# <!-- AUTHOR : Achille Mbogol Touye (EFFILIA-MIAI/SIMaP) -->
#
# ## Objectives :
#  - Predict the **quality of wines**, based on their analysis
#  - Understanding the principle and the architecture of a regression with a dense neural network with backup and restore of the trained model.
#
# The **[Wine Quality datasets](https://archive.ics.uci.edu/ml/datasets/wine+Quality)** are made up of analyses of a large number of wines, with an associated quality (between 0 and 10)
# This dataset is provided by :
# Paulo Cortez, University of Minho, Guimarães, Portugal, http://www3.dsi.uminho.pt/pcortez
# A. Cerdeira, F. Almeida, T. Matos and J. Reis, Viticulture Commission of the Vinho Verde Region(CVRVV), Porto, Portugal, @2009
# This dataset can be retrieved at [University of California Irvine (UCI)](https://archive-beta.ics.uci.edu/ml/datasets/wine+quality)
#
#
# Due to privacy and logistic issues, only physicochemical and sensory variables are available
# There is no data about grape types, wine brand, wine selling price, etc.
#
# - fixed acidity
# - volatile acidity
# - citric acid
# - residual sugar
# - chlorides
# - free sulfur dioxide
# - total sulfur dioxide
# - density
# - pH
# - sulphates
# - alcohol
# - quality (score between 0 and 10)
#
# ## What we're going to do :
#
#  - (Retrieve data)
#  - (Preparing the data)
#  - (Build a model)
#  - Train and save the model
#  - Restore saved model
#  - Evaluate the model
#  - Make some predictions
#

# Prefix used on the informational lines printed by this script.
HEAD = "[model_wine_lightning.wine_quality_train_eval]"

# ## Step 1 - Import and init
print("\n"+HEAD,"# ## Step 1 - Import and init\n")

# Import some packages
import os

import lightning.pytorch as pl
import torchvision.transforms as T
from IPython.display import display, HTML  ##toto## HTML
from torch.utils.data import DataLoader, random_split

from model_wine_lightning.modules.progressbar import CustomTrainProgressBar
from model_wine_lightning.modules.data_load import WineQualityDataset
from model_wine_lightning.modules.data_load import Normalize, ToTensor
from model_wine_lightning.modules.model import LitRegression
from lightning.pytorch.loggers.tensorboard import TensorBoardLogger

import fidle
import json
import shutil
import argparse
from pprint import pprint

OUTPUTS_PATH = "OUTPUTS"  # must exist !!! (created with a warning if missing)

# Messages accumulated during the run and copied into the report at the end.
error_msg, warn_msg, more_msg = "", "", ""  # default

# Output file paths (all under OUTPUTS_PATH)
model_ckpt_filepath = os.path.join(OUTPUTS_PATH, "model.ckpt")
norm_config_json_filepath = os.path.join(OUTPUTS_PATH, "norm_config.json")
report_json_filepath = os.path.join(OUTPUTS_PATH, "report.json")

report = dict()  # init

# The whole pipeline runs inside one try block so that any failure is turned
# into an "error" entry of the report instead of aborting before the report
# is written.
try:

    # Create the outputs folder when nothing exists at that path. The case
    # where OUTPUTS_PATH exists but is a regular file is not handled here;
    # it is rejected by the isdir check just below.
    if not os.path.exists(OUTPUTS_PATH):
        os.mkdir(OUTPUTS_PATH)
        message = "Outputs folder '"+OUTPUTS_PATH+"' does not exist => created."
        warn_msg += message + " "
        print(HEAD, "Warning :", message)

    message = "Outputs folder '" + OUTPUTS_PATH + "' must exist."
    if not os.path.isdir(OUTPUTS_PATH):
        error_msg += message + " "
        raise Exception(message)

    # ## INPUTS
    print("\n"+HEAD, "# ## INPUTS\n")

    parser = argparse.ArgumentParser()
    help_text = "dataset file path (.csv)"
    parser.add_argument("-dataset_filepath", type=str, help=help_text)
    args = parser.parse_args()

    dataset_filepath = None  # default (case default data file)
    path = args.dataset_filepath
    # The literal string 'None' is treated like a missing argument.
    if (path is not None) and (path != 'None'):
        if os.path.isfile(path):
            dataset_filepath = path
            print(HEAD, "dataset file used :", path)
        else:
            message = "'" + path + "' dataset file not found => default data file used."
            warn_msg += message + " "
            print(HEAD, "Warning :", message)
    else:
        message = "no dataset_filepath given => default data file used."
        warn_msg += message + " "
        print(HEAD, "Warning :", message)

    # Init Fidle environment
    print("\n"+HEAD, "# Init Fidle environment\n")
    run_id, run_dir, datasets_dir = fidle.init('LWINE1_train_eval')

    # Verbosity during training :
    # - 0 = silent
    # - 1 = progress bar
    # - 2 = one line per epoch
    fit_verbosity = 1
    dataset_name = 'winequality-red.csv'  # default data file

    # Override parameters (batch mode) - Just forget this cell
    fidle.override('fit_verbosity', 'dataset_name')

    # ## Step 2 - Retrieve data
    print("\n"+HEAD,"# ## Step 2 - Retrieve data\n")

    if dataset_filepath is None:  # default data file
        dataset_filepath = f'{datasets_dir}/WineQuality/origine/{dataset_name}'
    print(HEAD, "Dataset file used :", dataset_filepath)

    # Verify : stop here when the dataset file is missing (same pattern as
    # the outputs folder check above) instead of relying on a later failure.
    message = "Dataset file '" + dataset_filepath + "' not found."
    if not os.path.isfile(dataset_filepath):
        error_msg += message + " "
        raise FileNotFoundError(message)

    datasets = WineQualityDataset(dataset_filepath)
    print("datasets:")
    #display(datasets.data.head(5).style.format("{0:.2f}"))
    ##toto##display(datasets.data.head(5))
    print('Missing Data : ',datasets.data.isna().sum().sum(), ' Shape is : ', datasets.data.shape)

    # ## Step 3 - Preparing the data
    print("\n"+HEAD,"# ## Step 3 - Preparing the data\n")

    # ### 3.1 - Data normalization
    print("\n"+HEAD,"# ### 3.1 - Data normalization\n")

    # **Note :**
    # - All input features must be normalized.
    # - To do this we will subtract the mean and divide by the standard
    #   deviation for each input features.
    # - Then we convert numpy array features and target **(quality)** to
    #   torch tensor
    N = Normalize(dataset_filepath)
    # Normalization parameters, saved later so they can be reused.
    norm_config = {"mean_json":N.mean_json,
                   "std_json":N.std_json,
                   "min_json":N.min_json,
                   "max_json":N.max_json}
    transforms = T.Compose([N, ToTensor()])
    dataset = WineQualityDataset(dataset_filepath, transform=transforms)

    print("Before normalization :")
    ##toto##display(datasets[:]["features"])
    print("After normalization :")
    ##toto##display(dataset[:]["features"])

    # ### 3.2 - Split data
    print("\n"+HEAD,"# ### 3.2 - Split data\n")

    # We will use 80% of the data for training and 20% for validation.
    # x will be the features data of the analysis and y the target (quality)

    # ---- Split => train, test
    data_train_len = int(len(dataset)*0.8)           # get 80 %
    data_test_len = len(dataset) - data_train_len    # test = all - train

    # ---- Split => x,y with random_split
    data_train_subset, data_test_subset = random_split(dataset,
                                                       [data_train_len,
                                                        data_test_len])

    x_train = data_train_subset[:]["features"]
    y_train = data_train_subset[:]["quality" ]

    x_test = data_test_subset[:]["features"]
    y_test = data_test_subset[:]["quality" ]

    print('Original data shape was : ',dataset.data.shape)
    print('x_train : ',x_train.shape, 'y_train : ',y_train.shape)
    print('x_test : ',x_test.shape, 'y_test : ',y_test.shape)

    # ### 3.3 - For Training model use Dataloader
    print("\n"+HEAD,"# ### 3.3 - For Training model use Dataloader\n")

    # The Dataset retrieves our dataset's features and labels one sample at
    # a time. While training a model, we typically want to pass samples in
    # minibatches, reshuffle the data at every epoch to reduce model
    # overfitting. DataLoader is an iterable that abstracts this complexity
    # for us in an easy API.

    # train batch data
    train_loader= DataLoader(dataset=data_train_subset,
                             shuffle=True,
                             batch_size=20,
                             num_workers=2)

    # test batch data
    test_loader= DataLoader(dataset=data_test_subset,
                            shuffle=False,
                            batch_size=20,
                            num_workers=2)

    # ## Step 4 - Build a model
    print("\n"+HEAD,"# ## Step 4 - Build a model\n")

    # ## 5 - Train the model
    print("\n"+HEAD,"# ## 5 - Train the model\n")

    # ### 5.1 - Get it
    print("\n"+HEAD,"# ### 5.1 - Get it\n")

    print(HEAD, "Model creation")
    reg=LitRegression(in_features=11)
    print(reg)

    # ### 5.2 - Add callback
    print("\n"+HEAD,"# ### 5.2 - Add callback\n")

    os.makedirs('./run/models', exist_ok=True)
    save_dir = "./run/models/"
    filename ='best-model-{epoch}-{val_loss:.2f}'
    # Checkpoint callback configured with save_top_k=1 and monitor="val_loss".
    savemodel_callback = pl.callbacks.ModelCheckpoint(dirpath=save_dir,
                                                      filename=filename,
                                                      save_top_k=1,
                                                      verbose=False,
                                                      monitor="val_loss")

    # ### 5.3 - Train it
    print("\n"+HEAD,"# ### 5.3 - Train it\n")

    # loggers data
    os.makedirs(f'{run_dir}/logs', mode=0o750, exist_ok=True)
    logger= TensorBoardLogger(save_dir=f'{run_dir}/logs', name="reg_logs")

    # train model
    trainer = pl.Trainer(accelerator='auto',
                         max_epochs=100,
                         logger=logger,
                         num_sanity_val_steps=0,
                         callbacks=[savemodel_callback,
                                    CustomTrainProgressBar()])

    trainer.fit(model=reg,
                train_dataloaders=train_loader,
                val_dataloaders=test_loader)

    # ## Step 6 - Evaluate it
    print("\n"+HEAD,"# ## Step 6 - Evaluate it\n")

    # ### 6.1 - Model evaluation
    print("\n"+HEAD,"# ### 6.1 - Model evaluation\n")

    # MAE = Mean Absolute Error (between the labels and predictions)
    # A mae equal to 0.5 represents an average error in prediction of
    # 0.5 quality point.
    score = trainer.validate(model=reg, dataloaders=test_loader, verbose=False)

    print('x_test / loss : {:5.4f}'.format(score[0]['val_loss']))
    print('x_test / mae : {:5.4f}'.format(score[0]['val_mae']))
    print('x_test / mse : {:5.4f}'.format(score[0]['val_mse']))

    # ### 6.2 - Training history
    print("\n"+HEAD,"# ### 6.2 - Training history\n")

    #
    # To access logs with tensorboad :
    # - Under **Docker**, from a terminal launched via the jupyterlab
    #   launcher, use the following command:<br>
    #   ```tensorboard --logdir <path-to-logs> --host 0.0.0.0```
    # - If you're **not using Docker**, from a terminal :<br>
    #   ```tensorboard --logdir <path-to-logs>```
    #
    # **Note:** One tensorboard instance can be used simultaneously.

    # ## OUTPUTS
    print("\n"+HEAD,"# ## OUTPUTS\n")

    # Model (.ckpt) copy of savemodel_callback.best_model_path (under save_dir)
    savemodel_path = savemodel_callback.best_model_path
    shutil.copyfile(src=savemodel_path, dst=model_ckpt_filepath)
    print("OUTPUT:", "Model :", model_ckpt_filepath)
    print(" (is a copy of: Best model file ", savemodel_path, ")")

    # Save norm_config as .json file
    with open(norm_config_json_filepath, "w") as outfile:
        json.dump(norm_config, outfile)
    print("OUTPUT:",
          "Normalization configuration file (containing norm_config) :",
          norm_config_json_filepath)

    # Report (json) :
    # - normalization configuration information
    # - evaluation score information
    #   example {'val_loss': 0.48292940855026245, 'val_mae': 0.524127721786499,
    #            'val_mse': 0.48292940855026245}
    report['eval_score'] = score[0]
    report['norm_config'] = norm_config
    report['best_model_file'] = savemodel_path

    fidle.end()

except Exception as e :
    # Any failure above is recorded in the report rather than propagated.
    error_msg += type(e).__name__ + str(e.args) + ". "

if error_msg != "": report["error"] = error_msg
if more_msg != "": report["more"] = more_msg
if warn_msg != "": report["warning"] = warn_msg

print("OUTPUT:", "Report: ")
pprint(report)

# Save Report as .json file
try:
    with open(report_json_filepath, "w") as outfile:
        json.dump(report, outfile)
    print("OUTPUT:", "Report file (containing report) :", report_json_filepath)
except (OSError, TypeError, ValueError) as exc:
    # Saving the report stays best effort (the script does not crash), but
    # the failure is shown instead of being silently swallowed.
    print(HEAD, "Warning :",
          "report file '" + report_json_filepath + "' not saved :",
          type(exc).__name__ + str(exc.args))

# ---
# <img width="80px" src="../fidle/img/logo-paysage.svg"></img>