view toto_wine_quality_train_eval.py @ 12:60778af2dd78 draft

planemo upload for repository https://forgemia.inra.fr/nathalie.rousse/use/-/tree/dnn/DNN/galaxy-tools/wine_quality_train_eval commit e7fd13c34ec074a7ebc246301b5a80069dcbcc3a-dirty
author siwaa
date Thu, 05 Dec 2024 16:03:49 +0000
parents b5f69f836e03
children dd7d99707a65
line wrap: on
line source

#!/usr/bin/env python
# coding: utf-8

###############################################################################
# Module : model_wine_lightning
#
# This code has been extracted from 01-DNN-Wine-Regression-lightning.ipynb
# (fidle-tp/fidle-master-3.0.11/Wine.Lightning) then modified.
# Only first part kept :
#  - (Retrieve data)
#  - (Preparing the data)
#  - (Build a model)
#  - Train and save the model
#
# Inputs :
#
# -dataset_filepath : dataset file path (.csv).
#  - File containing data used to train and test the model.
#  - The dataset will be split into 2 parts :
#    one for training and one for validation.
#  - if unavailable (not given, not found...) : default dataset_filepath used
#
# Outputs :
#
# Output files under "OUTPUTS" folder (created with a warning if missing)
#
# - Model file (model_ckpt_filepath) (.ckpt)
#
# - Normalization configuration file (norm_config_json_filepath) (.json)
#
# - Report file (report_json_filepath) (.json) containing:
#   - Normalization configuration information
#   - Evaluation score information
#     example {'val_loss': 0.48292940855026245, 'val_mae': 0.524127721786499,
#              'val_mse': 0.48292940855026245}
#
# - Log files into Wine.Lightning/run/LWINE1/logs/reg_logs
#
# - Screen display containing running information : 
#   - training lines : 
#     Epoch 99: 100%|█| 64/64 [00:01<00:00, 44.89it/s, v_num=0, val_loss=0.483, val_mae=0.524, val_mse=0.483, t
#   - Eval score : x_test / loss      : 0.4829
#                  x_test / mae       : 0.5241
#                  x_test / mse       : 0.4829
#   - ...
#
###############################################################################

# <img width="800px" src="../fidle/img/header.svg"></img>
# 
# # <!-- TITLE --> [LWINE1] - Wine quality prediction with a Dense Network (DNN)
#   <!-- DESC -->  Another example of regression, with a wine quality prediction, using PyTorch Lightning
#   <!-- AUTHOR : Achille Mbogol Touye (EFFILIA-MIAI/SIMaP) -->
# 
# ## Objectives :
#  - Predict the **quality of wines**, based on their analysis
#  - Understanding the principle and the architecture of a regression with a dense neural network with backup and restore of the trained model. 
# 
# The **[Wine Quality datasets](https://archive.ics.uci.edu/ml/datasets/wine+Quality)** are made up of analyses of a large number of wines, with an associated quality (between 0 and 10)  
# This dataset is provided by :  
# Paulo Cortez, University of Minho, Guimarães, Portugal, http://www3.dsi.uminho.pt/pcortez  
# A. Cerdeira, F. Almeida, T. Matos and J. Reis, Viticulture Commission of the Vinho Verde Region(CVRVV), Porto, Portugal, @2009  
# This dataset can be retrieved from [University of California Irvine (UCI)](https://archive-beta.ics.uci.edu/ml/datasets/wine+quality)
# 
# 
# Due to privacy and logistic issues, only physicochemical and sensory variables are available  
# There is no data about grape types, wine brand, wine selling price, etc.
# 
# - fixed acidity
# - volatile acidity
# - citric acid
# - residual sugar
# - chlorides
# - free sulfur dioxide
# - total sulfur dioxide
# - density
# - pH
# - sulphates
# - alcohol
# - quality (score between 0 and 10)
# 
# ## What we're going to do :
# 
#  - (Retrieve data)
#  - (Preparing the data)
#  - (Build a model)
#  - Train and save the model
#  - Restore saved model
#  - Evaluate the model
#  - Make some predictions
# 
# Prefix put in front of the messages printed by this script.
HEAD = "[model_wine_lightning.wine_quality_train_eval]"

# ## Step 1 - Import and init
print(f"\n{HEAD}", "# ## Step 1 - Import and init\n")

# Import some packages
import os
import lightning.pytorch as pl
import torchvision.transforms as T
##toto## from IPython.display import display, HTML ##toto## HTML
from torch.utils.data import DataLoader, random_split
##toto## from model_wine_lightning.modules.progressbar import CustomTrainProgressBar
from model_wine_lightning.modules.data_load import WineQualityDataset
from model_wine_lightning.modules.data_load import Normalize, ToTensor
from model_wine_lightning.modules.model import LitRegression
from lightning.pytorch.loggers.tensorboard import TensorBoardLogger
import fidle
import json
import shutil
import argparse
from pprint import pprint

# Folder receiving every output file of the run.
OUTPUTS_PATH = "OUTPUTS"

# Message accumulators, empty by default.
error_msg = ""
warn_msg = ""
more_msg = ""

# Output file locations, all directly under OUTPUTS_PATH.
model_ckpt_filepath = os.path.join(OUTPUTS_PATH, "model.ckpt")
norm_config_json_filepath = os.path.join(OUTPUTS_PATH, "norm_config.json")
report_json_filepath = os.path.join(OUTPUTS_PATH, "report.json")

# Report content, filled in while the script runs.
report = {}

# The whole pipeline runs under a single handler: any exception is recorded
# into error_msg (see the except clause at the end) instead of propagating.
try:
    # Create the outputs folder when nothing exists at that path. A regular
    # file carrying that name is not handled here; it is rejected by the
    # isdir check just below.
    if not os.path.exists(OUTPUTS_PATH):
        os.mkdir(OUTPUTS_PATH)
        message = "Outputs folder '"+OUTPUTS_PATH+"' does not exist => created."
        warn_msg += message + " "
        print(HEAD, "Warning :", message)

    if not os.path.isdir(OUTPUTS_PATH):
        message = "Outputs folder '" + OUTPUTS_PATH + "' must exist."
        error_msg += message + " "
        raise Exception(message)

    # ## INPUTS
    print("\n"+HEAD, "# ## INPUTS\n")

    parser = argparse.ArgumentParser()

    help_text = "dataset file path (.csv)"
    parser.add_argument("-dataset_filepath", type=str, help=help_text)

    args = parser.parse_args()

    dataset_filepath = None # default (case default data file)
    path = args.dataset_filepath
    # The literal string 'None' is treated like a missing argument.
    if (path is not None) and (path != 'None') :
        if os.path.isfile(path) :
            dataset_filepath = path
            print(HEAD, "dataset file used :", path)
        else :
            # Not fatal: fall back on the default data file, with a warning.
            message = ("dataset file '" + path
                       + "' not found => default data file used.")
            warn_msg += message + " "
            print(HEAD, "Warning :", message)
    else:
        message = "no dataset_filepath given => default data file used."
        warn_msg += message + " "
        print(HEAD, "Warning :", message)

    # Init Fidle environment
    print("\n"+HEAD, "# Init Fidle environment\n")
    run_id, run_dir, datasets_dir = fidle.init('LWINE1_train_eval')

    # Verbosity during training : 
    # - 0 = silent
    # - 1 = progress bar
    # - 2 = one line per epoch
    fit_verbosity = 1
    dataset_name  = 'winequality-red.csv' # default data file 

    # Override parameters (batch mode) - Just forget this cell
    fidle.override('fit_verbosity', 'dataset_name') 

    # ## Step 2 - Retrieve data
    print("\n"+HEAD,"# ## Step 2 - Retrieve data\n")

    if dataset_filepath is None: # default data file
        dataset_filepath = f'{datasets_dir}/WineQuality/origine/{dataset_name}'
    print(HEAD, "Dataset file used :", dataset_filepath)

    # Verify : stop here when the dataset file is missing (same
    # record-then-raise pattern as the outputs folder check above).
    if not os.path.isfile(dataset_filepath):
        message = "Dataset file '" + dataset_filepath + "' not found."
        error_msg += message + " "
        raise FileNotFoundError(message)

    # Dataset loaded without any transform, only used for the display below.
    datasets = WineQualityDataset(dataset_filepath)
    print("datasets:")
    #display(datasets.data.head(5).style.format("{0:.2f}"))
    ##toto##display(datasets.data.head(5))
    print('Missing Data : ',datasets.data.isna().sum().sum(),
          '  Shape is : ', datasets.data.shape)

    # ## Step 3 - Preparing the data
    print("\n"+HEAD,"# ## Step 3 - Preparing the data\n")
    
    # ### 3.1 - Data normalization
    print("\n"+HEAD,"# ### 3.1 - Data normalization\n")
    # **Note :** 
    #  - All input features must be normalized.  
    #  - To do this we will subtract the mean and divide by the standard
    #    deviation for each input features. 
    #  - Then we convert numpy array features and target **(quality)** to
    #    torch tensor   
    
    N = Normalize(dataset_filepath)
    # Kept so it can be saved as a .json file and added to the report.
    norm_config = {"mean_json":N.mean_json, "std_json":N.std_json,
                   "min_json":N.min_json, "max_json":N.max_json}
    transforms = T.Compose([N, ToTensor()])
    dataset = WineQualityDataset(dataset_filepath, transform=transforms)

    print("Before normalization :")
    ##toto##display(datasets[:]["features"])
    print("After normalization :")
    ##toto##display(dataset[:]["features"])

    # ### 3.2 - Split data
    print("\n"+HEAD,"# ### 3.2 - Split data\n")
    # We will use 80% of the data for training and 20% for validation.  
    # x will be the features data of the analysis and y the target (quality)

    # ---- Split => train, test
    data_train_len = int(len(dataset)*0.8)        # get 80 %
    data_test_len  = len(dataset) -data_train_len # test = all - train

    # ---- Split => x,y with random_split
    data_train_subset, data_test_subset = random_split(dataset,
                                              [data_train_len, data_test_len])

    # x_* / y_* are only used for the shape display below; training and
    # validation go through the DataLoaders built from the subsets.
    x_train = data_train_subset[:]["features"]
    y_train = data_train_subset[:]["quality" ]

    x_test  = data_test_subset[:]["features"]
    y_test  = data_test_subset[:]["quality" ]

    print('Original data shape was : ',dataset.data.shape)
    print('x_train : ',x_train.shape, 'y_train : ',y_train.shape)
    print('x_test  : ',x_test.shape,  'y_test  : ',y_test.shape)

    # ### 3.3 -  For Training model use Dataloader
    print("\n"+HEAD,"# ### 3.3 -  For Training model use Dataloader\n")
    # The Dataset retrieves our dataset's features and labels one sample at
    # a time. While training a model, we typically want to pass samples in
    # minibatches and reshuffle the data at every epoch to reduce model
    # overfitting. DataLoader is an iterable that abstracts this complexity
    # for us in an easy API.

    # train batch data
    train_loader= DataLoader(dataset=data_train_subset, 
                             shuffle=True, batch_size=20, num_workers=2)
    # test batch data
    test_loader= DataLoader(dataset=data_test_subset, 
                            shuffle=False, batch_size=20, num_workers=2)

    # ## Step 4 - Build a model
    print("\n"+HEAD,"# ## Step 4 - Build a model\n")

    # ## 5 - Train the model
    print("\n"+HEAD,"# ## 5 - Train the model\n")

    # ### 5.1 - Get it
    print("\n"+HEAD,"# ### 5.1 - Get it\n")
    print(HEAD, "Model creation")
    reg=LitRegression(in_features=11)
    print(reg) 

    # ### 5.2 - Add callback
    print("\n"+HEAD,"# ### 5.2 - Add callback\n")
    # Checkpoint callback monitoring "val_loss", with save_top_k=1.
    save_dir = "./run/models/"
    os.makedirs(save_dir, exist_ok=True)
    filename ='best-model-{epoch}-{val_loss:.2f}'
    savemodel_callback = pl.callbacks.ModelCheckpoint(dirpath=save_dir,
                                            filename=filename, save_top_k=1,
                                            verbose=False, monitor="val_loss")
    # ### 5.3 - Train it
    print("\n"+HEAD,"# ### 5.3 - Train it\n")

    # loggers data
    os.makedirs(f'{run_dir}/logs', mode=0o750, exist_ok=True)
    logger= TensorBoardLogger(save_dir=f'{run_dir}/logs', name="reg_logs")

    # train model
    trainer = pl.Trainer(accelerator='auto', max_epochs=100,
                       logger=logger, num_sanity_val_steps=0,
                       callbacks=[savemodel_callback])
                       ##toto##callbacks=[savemodel_callback,CustomTrainProgressBar()])
    trainer.fit(model=reg, train_dataloaders=train_loader,
                val_dataloaders=test_loader)

    # ## Step 6 - Evaluate it
    print("\n"+HEAD,"# ## Step 6 - Evaluate it\n")

    # ### 6.1 - Model evaluation
    print("\n"+HEAD,"# ### 6.1 - Model evaluation\n")
    # MAE =  Mean Absolute Error (between the labels and predictions)  
    # e.g. a MAE of 0.5 would mean predictions are off by half a quality
    # point on average (assuming the target is the raw quality score).
    score = trainer.validate(model=reg, dataloaders=test_loader, verbose=False)

    print('x_test / loss : {:5.4f}'.format(score[0]['val_loss']))
    print('x_test / mae  : {:5.4f}'.format(score[0]['val_mae']))
    print('x_test / mse  : {:5.4f}'.format(score[0]['val_mse']))

    # ### 6.2 - Training history
    print("\n"+HEAD,"# ### 6.2 - Training history\n")
    # 
    # To access logs with tensorboard :
    # - Under **Docker**, from a terminal launched via the jupyterlab
    #   launcher, use the following command:<br>
    # ```tensorboard --logdir <path-to-logs> --host 0.0.0.0```
    # - If you're **not using Docker**, from a terminal :<br>
    # ```tensorboard --logdir <path-to-logs>```  
    # 
    # **Note:** One tensorboard instance can be used simultaneously.

    # ## OUTPUTS
    print("\n"+HEAD,"# ## OUTPUTS\n")

    # Model (.ckpt) copy of savemodel_callback.best_model_path (under save_dir)
    savemodel_path = savemodel_callback.best_model_path
    shutil.copyfile(src=savemodel_path, dst=model_ckpt_filepath)
    print("OUTPUT:", "Model :", model_ckpt_filepath)
    print("          (is a copy of: Best model file ", savemodel_path, ")")

    # Save norm_config as .json file
    with open(norm_config_json_filepath, "w") as outfile:
        json.dump(norm_config, outfile)
    print("OUTPUT:",
          "Normalization configuration file (containing norm_config) :",
          norm_config_json_filepath)

    # Report (json) :
    # - normalization configuration information
    # - evaluation score information
    #   example {'val_loss': 0.48292940855026245, 'val_mae': 0.524127721786499,
    #            'val_mse': 0.48292940855026245}
    report['eval_score'] = score[0]
    report['norm_config'] = norm_config
    report['best_model_file'] = savemodel_path

    fidle.end()

except Exception as e :
    # Record the failure (exception type and args) instead of propagating it.
    error_msg += type(e).__name__ + str(e.args) + ". "

# Copy each non-empty message accumulator into the report under its own key.
for report_key, text in (("error", error_msg),
                         ("more", more_msg),
                         ("warning", warn_msg)):
    if text != "":
        report[report_key] = text

# Display the report.
print("OUTPUT:", "Report: ")
pprint(report)

# Save Report as .json file.
# Best effort: a failure here must not abort the script, but it is reported
# on screen instead of being silently ignored. Only Exception is caught so
# that KeyboardInterrupt / SystemExit still propagate.
try:
    with open(report_json_filepath, "w") as outfile:
        json.dump(report, outfile)
    print("OUTPUT:", "Report file (containing report) :", report_json_filepath)
except Exception as e:
    print("Warning :", "Report file not saved :", f"{type(e).__name__}: {e}")

# ---
# <img width="80px" src="../fidle/img/logo-paysage.svg"></img>