changeset 9:31d737992c63 draft

planemo upload for repository https://forgemia.inra.fr/nathalie.rousse/use/-/tree/dnn/DNN/galaxy-tools/wine_quality_train_eval commit e7fd13c34ec074a7ebc246301b5a80069dcbcc3a-dirty
author siwaa
date Thu, 05 Dec 2024 15:40:01 +0000
parents 7ee93e4cf2da
children b432386c0f1c
files toto_wine_quality_train_eval.py wine_quality_train_eval.xml
diffstat 2 files changed, 357 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/toto_wine_quality_train_eval.py	Thu Dec 05 15:40:01 2024 +0000
@@ -0,0 +1,356 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+###############################################################################
+# Module : model_wine_lightning
+#
+# This code has been extracted from 01-DNN-Wine-Regression-lightning.ipynb
+# (fidle-tp/fidle-master-3.0.11/Wine.Lightning) then modified.
+# Only first part kept :
+#  - (Retrieve data)
+#  - (Preparing the data)
+#  - (Build a model)
+#  - Train and save the model
+#
+# Inputs :
+#
+# -dataset_filepath : dataset file path (.csv).
+#  - File containing data used to train and test the model.
+#  - The dataset will be split into 2 parts :
+#    one for training and one for validation.
+#  - if unavailable (not given, not found...) : default dataset_filepath used
+#
+# Outputs :
+#
+# Output files under "OUTPUTS" folder (must exist !!!)
+#
+# - Model file (model_ckpt_filepath) (.ckpt)
+#
+# - Normalization configuration file (norm_config_json_filepath) (.json)
+#
+# - Report file (report_json_filepath) (.json) containing:
+#   - Normalization configuration information
+#   - Evaluation score information
+#     example {'val_loss': 0.48292940855026245, 'val_mae': 0.524127721786499,
+#              'val_mse': 0.48292940855026245}
+#
+# - Log files into Wine.Lightning/run/LWINE1/logs/reg_logs
+#
+# - Screen display containing running information : 
+#   - training lines : 
+#     Epoch 99: 100%|█| 64/64 [00:01<00:00, 44.89it/s, v_num=0, val_loss=0.483, val_mae=0.524, val_mse=0.483, t
+#   - Eval score : x_test / loss      : 0.4829
+#                  x_test / mae       : 0.5241
+#                  x_test / mse       : 0.4829
+#   - ...
+#
+###############################################################################
+
+# <img width="800px" src="../fidle/img/header.svg"></img>
+# 
+# # <!-- TITLE --> [LWINE1] - Wine quality prediction with a Dense Network (DNN)
+#   <!-- DESC -->  Another example of regression, with a wine quality prediction, using PyTorch Lightning
+#   <!-- AUTHOR : Achille Mbogol Touye (EFFILIA-MIAI/SIMaP) -->
+# 
+# ## Objectives :
+#  - Predict the **quality of wines**, based on their analysis
+#  - Understanding the principle and the architecture of a regression with a dense neural network with backup and restore of the trained model. 
+# 
+# The **[Wine Quality datasets](https://archive.ics.uci.edu/ml/datasets/wine+Quality)** are made up of analyses of a large number of wines, with an associated quality (between 0 and 10)  
+# This dataset is provided by :  
+# Paulo Cortez, University of Minho, Guimarães, Portugal, http://www3.dsi.uminho.pt/pcortez  
+# A. Cerdeira, F. Almeida, T. Matos and J. Reis, Viticulture Commission of the Vinho Verde Region(CVRVV), Porto, Portugal, @2009  
+# This dataset can be retrieved at [University of California Irvine (UCI)](https://archive-beta.ics.uci.edu/ml/datasets/wine+quality)
+# 
+# 
+# Due to privacy and logistic issues, only physicochemical and sensory variables are available  
+# There is no data about grape types, wine brand, wine selling price, etc.
+# 
+# - fixed acidity
+# - volatile acidity
+# - citric acid
+# - residual sugar
+# - chlorides
+# - free sulfur dioxide
+# - total sulfur dioxide
+# - density
+# - pH
+# - sulphates
+# - alcohol
+# - quality (score between 0 and 10)
+# 
+# ## What we're going to do :
+# 
+#  - (Retrieve data)
+#  - (Preparing the data)
+#  - (Build a model)
+#  - Train and save the model
+#  - Restore saved model
+#  - Evaluate the model
+#  - Make some predictions
+# 
+HEAD = "[model_wine_lightning.wine_quality_train_eval]" # prefix of the messages printed by this script
+
+# ## Step 1 - Import and init (this banner is printed before the imports below are executed)
+print("\n"+HEAD,"# ## Step 1 - Import and init\n")
+
+# Import some packages
+import os
+import lightning.pytorch as pl
+import torchvision.transforms as T
+from IPython.display import display, HTML
+from torch.utils.data import DataLoader, random_split
+from model_wine_lightning.modules.progressbar import CustomTrainProgressBar
+from model_wine_lightning.modules.data_load import WineQualityDataset
+from model_wine_lightning.modules.data_load import Normalize, ToTensor
+from model_wine_lightning.modules.model import LitRegression
+from lightning.pytorch.loggers.tensorboard import TensorBoardLogger
+import fidle
+import json
+import shutil
+import argparse
+from pprint import pprint
+
+OUTPUTS_PATH = "OUTPUTS" # outputs folder (created below if it is missing)
+
+error_msg, warn_msg, more_msg = "", "", "" # texts accumulated during the run, copied into report at the end
+
+model_ckpt_filepath =       os.path.join(OUTPUTS_PATH, "model.ckpt")
+norm_config_json_filepath = os.path.join(OUTPUTS_PATH, "norm_config.json")
+report_json_filepath =      os.path.join(OUTPUTS_PATH, "report.json")
+report = dict() # filled during the run, then dumped into report_json_filepath
+
+try:
+    if not os.path.exists(OUTPUTS_PATH): # a plain file at this path is caught by the isdir check below
+        os.mkdir(OUTPUTS_PATH)
+        message = "Outputs folder '"+OUTPUTS_PATH+"' does not exist => created."
+        warn_msg += message + " "
+        print(HEAD, "Warning :", message)
+
+    message = "Outputs folder '" + OUTPUTS_PATH + "' must exist."
+    if not os.path.isdir(OUTPUTS_PATH): # the path exists but is not a folder
+        error_msg += message + " "
+        raise Exception(message)
+
+    # ## INPUTS
+    print("\n"+HEAD, "# ## INPUTS\n")
+
+    parser = argparse.ArgumentParser()
+
+    help_text = "dataset file path (.csv)"
+    parser.add_argument("-dataset_filepath", type=str, help=help_text)
+
+    args = parser.parse_args()
+
+    dataset_filepath = None # None => default data file (resolved at step 2)
+    path = args.dataset_filepath
+    if (path is not None) and (path != 'None') : # the text 'None' counts as "not given"
+        if os.path.isfile(path) :
+            dataset_filepath = path
+            print(HEAD, "dataset file used :", path)
+        else :
+            message = "'"+path+"' dataset file not found => default data file used."
+            warn_msg += message + " " # trailing blank separates successive warnings
+            print(HEAD, "Warning :", message)
+    else:
+        message = "no dataset_filepath given => default data file used."
+        warn_msg += message + " " # trailing blank separates successive warnings
+        print(HEAD, "Warning :", message)
+
+    # Init Fidle environment (run_dir is used below for the logs, datasets_dir for the default data file)
+    print("\n"+HEAD, "# Init Fidle environment\n")
+    run_id, run_dir, datasets_dir = fidle.init('LWINE1_train_eval')
+
+    # Verbosity during training :
+    # - 0 = silent
+    # - 1 = progress bar
+    # - 2 = one line per epoch
+    fit_verbosity = 1 # NOTE(review): not read anywhere else in this script, only named in fidle.override - confirm it is still needed
+    dataset_name  = 'winequality-red.csv' # default data file 
+
+    # Override parameters (batch mode) : fidle.override presumably replaces the two values above - TODO confirm
+    fidle.override('fit_verbosity', 'dataset_name') 
+
+    # ## Step 2 - Retrieve data
+    print("\n"+HEAD,"# ## Step 2 - Retrieve data\n")
+
+    if dataset_filepath is None: # default data file
+        dataset_filepath = f'{datasets_dir}/WineQuality/origine/{dataset_name}'
+    print(HEAD, "Dataset file used :", dataset_filepath)
+
+    # Verify : stop here when the file is missing (same pattern as the outputs folder check)
+    message = "Dataset file '" + dataset_filepath + "' not found."
+    if not os.path.isfile(dataset_filepath):
+        error_msg += message + " "
+        raise FileNotFoundError(message)
+
+    datasets = WineQualityDataset(dataset_filepath) # built without any transform
+    print("datasets:")
+    display(datasets.data.head(5))
+    print('Missing Data : ',datasets.data.isna().sum().sum(),
+          '  Shape is : ', datasets.data.shape)
+
+    # ## Step 3 - Preparing the data
+    print("\n"+HEAD,"# ## Step 3 - Preparing the data\n")
+    
+    # ### 3.1 - Data normalization
+    print("\n"+HEAD,"# ### 3.1 - Data normalization\n")
+    # **Note :**
+    #  - All input features must be normalized.
+    #  - To do this we subtract the mean and divide by the standard
+    #    deviation, for each input feature.
+    #  - Then the numpy array features and the target **(quality)** are
+    #    converted to torch tensors.
+    
+    N = Normalize(dataset_filepath) # presumably derives its statistics from the dataset file - TODO confirm
+    norm_config = {"mean_json":N.mean_json, "std_json":N.std_json,
+                   "min_json":N.min_json, "max_json":N.max_json} # saved later as norm_config.json and in the report
+    transforms = T.Compose([N, ToTensor()]) # normalize first, then convert to tensor
+    dataset = WineQualityDataset(dataset_filepath, transform=transforms) # same file, with the transforms attached
+
+    print("Before normalization :")
+    display(datasets[:]["features"]) # 'datasets' (plural) is the instance built at step 2 without transform
+    print("After normalization :")
+    display(dataset[:]["features"]) # 'dataset' (singular) is the instance built just above with the transforms
+
+    # ### 3.2 - Split data
+    print("\n"+HEAD,"# ### 3.2 - Split data\n")
+    # 80% of the data is used for training and 20% for validation.
+    # x is the features data of the analysis and y the target (quality)
+
+    # ---- Sizes => train, test
+    data_train_len = int(len(dataset)*0.8)        # 80 %, rounded down
+    data_test_len  = len(dataset) -data_train_len # test = all - train
+
+    # ---- Split => subsets with random_split (no generator is passed here; NOTE(review): the split presumably differs between runs unless seeded elsewhere)
+    data_train_subset, data_test_subset = random_split(dataset,
+                                              [data_train_len, data_test_len])
+
+    x_train = data_train_subset[:]["features"] # x_* / y_* are only used for the shape printout below
+    y_train = data_train_subset[:]["quality" ]
+
+    x_test  = data_test_subset[:]["features"]
+    y_test  = data_test_subset[:]["quality" ]
+
+    print('Original data shape was : ',dataset.data.shape)
+    print('x_train : ',x_train.shape, 'y_train : ',y_train.shape)
+    print('x_test  : ',x_test.shape,  'y_test  : ',y_test.shape)
+
+    # ### 3.3 -  For Training model use Dataloader
+    print("\n"+HEAD,"# ### 3.3 -  For Training model use Dataloader\n")
+    # The Dataset returns one sample at a time; the DataLoader groups samples into minibatches and, for training, reshuffles them at every epoch.
+
+    # training batches : 20 samples each, shuffled, loaded by 2 worker processes
+    train_loader= DataLoader(dataset=data_train_subset, 
+                             shuffle=True, batch_size=20, num_workers=2)
+    # test batches : same batch size and workers, order kept (used as validation data during the fit)
+    test_loader= DataLoader(dataset=data_test_subset, 
+                            shuffle=False, batch_size=20, num_workers=2)
+
+    # ## Step 4 - Build a model (the model class LitRegression is imported from model_wine_lightning.modules.model)
+    print("\n"+HEAD,"# ## Step 4 - Build a model\n")
+
+    # ## 5 - Train the model
+    print("\n"+HEAD,"# ## 5 - Train the model\n")
+
+    # ### 5.1 - Get it
+    print("\n"+HEAD,"# ### 5.1 - Get it\n")
+    print(HEAD, "Model creation")
+    reg=LitRegression(in_features=11) # 11 matches the input variables listed in the file header; assumes the given csv has the same columns - TODO confirm
+    print(reg) 
+
+    # ### 5.2 - Add callback (checkpoint of the best model, selected on the monitored "val_loss")
+    print("\n"+HEAD,"# ### 5.2 - Add callback\n")
+    os.makedirs('./run/models', exist_ok=True)
+    save_dir = "./run/models/"
+    filename ='best-model-{epoch}-{val_loss:.2f}' # checkpoint name template filled in by ModelCheckpoint
+    savemodel_callback = pl.callbacks.ModelCheckpoint(dirpath=save_dir,
+                                            filename=filename, save_top_k=1,
+                                            verbose=False, monitor="val_loss")
+    # ### 5.3 - Train it
+    print("\n"+HEAD,"# ### 5.3 - Train it\n")
+
+    # TensorBoard logger writing under <run_dir>/logs, in the "reg_logs" experiment
+    os.makedirs(f'{run_dir}/logs', mode=0o750, exist_ok=True)
+    logger= TensorBoardLogger(save_dir=f'{run_dir}/logs', name="reg_logs")
+
+    # train model : at most 100 epochs, test_loader used as validation data, best checkpoint kept by savemodel_callback
+    trainer = pl.Trainer(accelerator='auto', max_epochs=100,
+                       logger=logger, num_sanity_val_steps=0,
+                       callbacks=[savemodel_callback,CustomTrainProgressBar()])
+    trainer.fit(model=reg, train_dataloaders=train_loader,
+                val_dataloaders=test_loader)
+
+    # ## Step 6 - Evaluate it
+    print("\n"+HEAD,"# ## Step 6 - Evaluate it\n")
+
+    # ### 6.1 - Model evaluation
+    print("\n"+HEAD,"# ### 6.1 - Model evaluation\n")
+    # MAE = Mean Absolute Error between labels and predictions (a mae of 0.5 is an average error of 0.5 point on the quality score)
+    # NOTE(review): 'reg' is evaluated as left by the fit, while the exported model is the best checkpoint - the scores may differ, confirm
+    score = trainer.validate(model=reg, dataloaders=test_loader, verbose=False)
+
+    print('x_test / loss : {:5.4f}'.format(score[0]['val_loss']))
+    print('x_test / mae  : {:5.4f}'.format(score[0]['val_mae']))
+    print('x_test / mse  : {:5.4f}'.format(score[0]['val_mse']))
+
+    # ### 6.2 - Training history
+    print("\n"+HEAD,"# ### 6.2 - Training history\n")
+    # 
+    # To access logs with tensorboard :
+    # - Under **Docker**, from a terminal launched via the jupyterlab
+    #   launcher, use the following command:<br>
+    # ```tensorboard --logdir <path-to-logs> --host 0.0.0.0```
+    # - If you're **not using Docker**, from a terminal :<br>
+    # ```tensorboard --logdir <path-to-logs>```  
+    # 
+    # **Note:** One tensorboard instance can be used simultaneously.
+
+    # ## OUTPUTS
+    print("\n"+HEAD,"# ## OUTPUTS\n")
+
+    # Model (.ckpt) : copy of savemodel_callback.best_model_path (under save_dir) to model_ckpt_filepath
+    savemodel_path = savemodel_callback.best_model_path
+    shutil.copyfile(src=savemodel_path, dst=model_ckpt_filepath)
+    print("OUTPUT:", "Model :", model_ckpt_filepath)
+    print("          (is a copy of: Best model file ", savemodel_path, ")")
+
+    # Save norm_config as .json file (norm_config_json_filepath)
+    with open(norm_config_json_filepath, "w") as outfile:
+        json.dump(norm_config, outfile)
+    print("OUTPUT:",
+          "Normalization configuration file (containing norm_config) :",
+          norm_config_json_filepath)
+
+    # Report (json), written to file after the try block :
+    # - normalization configuration information
+    # - evaluation score information (plus the path of the best model file)
+    #   example {'val_loss': 0.48292940855026245, 'val_mae': 0.524127721786499,
+    #            'val_mse': 0.48292940855026245}
+    report['eval_score'] = score[0]
+    report['norm_config'] = norm_config
+    report['best_model_file'] = savemodel_path
+
+    fidle.end()
+
+except Exception as e :
+    error_msg += type(e).__name__ + str(e.args) + ". "
+
+if error_msg != "": report["error"] = error_msg
+if more_msg != "":  report["more"] = more_msg
+if warn_msg != "":  report["warning"] = warn_msg
+
+print("OUTPUT:", "Report: ")
+pprint(report)
+
+# Save Report as .json file (best effort : a failure is displayed, not raised)
+try:
+    with open(report_json_filepath, "w") as outfile:
+        json.dump(report, outfile)
+    print("OUTPUT:", "Report file (containing report) :", report_json_filepath)
+except Exception as e : # not a bare except, so Ctrl-C and sys.exit still stop the run
+    print(HEAD, "Warning : report file not saved :", type(e).__name__, e)
+
+# ---
+# <img width="80px" src="../fidle/img/logo-paysage.svg"></img>
+
--- a/wine_quality_train_eval.xml	Thu Dec 05 15:13:00 2024 +0000
+++ b/wine_quality_train_eval.xml	Thu Dec 05 15:40:01 2024 +0000
@@ -24,7 +24,7 @@
     mkdir mpl_dir &&
     export TRANSFORMERS_CACHE=\$(realpath -s cache_dir) &&
     export MPLCONFIGDIR=\$(realpath -s mpl_dir) &&
-    python3 /fidlemore/model_wine_lightning/wine_quality_train_eval.py -dataset_filepath ${dataset_csv} &&
+    python3 $__tool_directory__/toto_wine_quality_train_eval.py -dataset_filepath ${dataset_csv} &&
     cp OUTPUTS/model.ckpt ${model_ckpt} &&
     cp OUTPUTS/norm_config.json ${norm_config_json} &&
     cp OUTPUTS/report.json ${report_json}"