# view csopraSplitInput.py @ 19:813ea6294046 draft default tip

# "planemo upload for repository https://forgemia.inra.fr/carboseq/record-projet-carboseq/-/tree/7dbe12d7dae354eb7c2fa4cb1a1bcd6ec2afebf2/toos/CarboSeqSimulator commit 7dbe12d7dae354eb7c2fa4cb1a1bcd6ec2afebf2-dirty"
# author siwaa
# date Mon, 27 Jan 2025 14:08:33 +0000
# parents e455667fe22f
# children
# line wrap: on
# line source

import pandas as pd
import os
import shutil
import math
import sys

def _read_header_and_rows(csv_path):
    """Return ``(header_line, data_lines)`` with the raw text lines of *csv_path*."""
    # The context manager guarantees the handle is released even on error.
    with open(csv_path, 'r') as handle:
        lines = handle.readlines()
    return lines[0], lines[1:]


def _rows_for_ids(id_column, rows, wanted_ids):
    """Return the raw lines of *rows* whose matching *id_column* entry is in *wanted_ids*."""
    keep_mask = id_column.isin(wanted_ids)
    # Pair lines and mask entries by position so only matching rows are kept,
    # even when rows of other IDs are interleaved with them.
    return [row for row, keep in zip(rows, keep_mask) if keep]


def _write_rows(csv_path, header, rows):
    """Write *header* followed by *rows* to *csv_path*, replacing any existing file."""
    with open(csv_path, 'w') as handle:
        handle.write(header)
        handle.writelines(rows)


def split_csv(input_folder, output_folder, max_chunck_size):
    """Split the soil/crop CSV files of *input_folder* into chunks by ID.

    The distinct values of the ``ID`` column of ``soil.csv`` are grouped, in
    order of first appearance, into runs of at most *max_chunck_size* IDs.
    For chunk number ``i`` (starting at 1) a directory
    ``<output_folder>/<name>_<i>`` is created, where ``<name>`` is the last
    path component of *input_folder*. It receives:

    * an unchanged copy of ``units.csv``;
    * ``soil.csv`` and ``crop.csv`` holding the original header line and the
      original text lines whose ID belongs to the chunk (header only when no
      row matches).

    Each chunk directory is also zipped to ``<output_folder>/<name>_<i>.zip``;
    the directory itself is kept.

    Args:
        input_folder: Directory containing ``units.csv``, ``soil.csv`` and
            ``crop.csv``.
        output_folder: Directory to create; it must not exist yet.
        max_chunck_size: Maximum number of distinct IDs per chunk (>= 1).

    Raises:
        ValueError: If *max_chunck_size* is smaller than 1.
        FileExistsError: If *output_folder* already exists.
    """
    if max_chunck_size < 1:
        raise ValueError(
            f"max_chunck_size must be at least 1, got {max_chunck_size}"
        )

    unit_path = os.path.join(input_folder, "units.csv")
    soil_path = os.path.join(input_folder, "soil.csv")
    crop_path = os.path.join(input_folder, "crop.csv")

    os.mkdir(output_folder)

    # pandas is only used to locate the rows of each ID; the output is built
    # from the raw text lines so the original formatting is preserved.
    # NOTE(review): this assumes one CSV record per physical line and no
    # blank lines between records, so row k of the frame is data line k.
    soil_ids = pd.read_csv(soil_path)['ID'].astype('int')
    crop_ids = pd.read_csv(crop_path)['ID'].astype('int')

    soil_header, soil_rows = _read_header_and_rows(soil_path)
    crop_header, crop_rows = _read_header_and_rows(crop_path)

    unique_ids = soil_ids.unique()

    # Use only the last path component so chunks always land directly inside
    # output_folder, even when input_folder is a nested or absolute path.
    chunk_prefix = os.path.basename(os.path.normpath(input_folder))

    chunk_starts = range(0, len(unique_ids), max_chunck_size)
    for chunk_no, start in enumerate(chunk_starts, start=1):
        chunk_ids = unique_ids[start:start + max_chunck_size]

        chunk_dir = os.path.join(output_folder, f"{chunk_prefix}_{chunk_no}")
        os.mkdir(chunk_dir)
        shutil.copy(unit_path, chunk_dir)

        _write_rows(
            os.path.join(chunk_dir, "soil.csv"),
            soil_header,
            _rows_for_ids(soil_ids, soil_rows, chunk_ids),
        )
        # crop.csv may have no row for this chunk's IDs; it then holds only
        # the header line.
        _write_rows(
            os.path.join(chunk_dir, "crop.csv"),
            crop_header,
            _rows_for_ids(crop_ids, crop_rows, chunk_ids),
        )

        shutil.make_archive(chunk_dir, 'zip', chunk_dir)



# Command-line entry point.
#   argv[1]: path of the zip archive holding units.csv, soil.csv and crop.csv
#   argv[2]: maximum number of distinct IDs per chunk
input_file, max_chunck_size = sys.argv[1], int(sys.argv[2])

# Working directories, resolved relative to the current directory: the
# archive is extracted into the first, the chunks are written to the second.
input_folder, output_folder = "input", "outputs"

shutil.unpack_archive(input_file, extract_dir=input_folder, format='zip')
split_csv(
    input_folder=input_folder,
    output_folder=output_folder,
    max_chunck_size=max_chunck_size,
)