Mercurial > repos > siwaa > carboseq_s
view csopraSplitInput.py @ 19:813ea6294046 draft default tip
"planemo upload for repository https://forgemia.inra.fr/carboseq/record-projet-carboseq/-/tree/7dbe12d7dae354eb7c2fa4cb1a1bcd6ec2afebf2/toos/CarboSeqSimulator commit 7dbe12d7dae354eb7c2fa4cb1a1bcd6ec2afebf2-dirty"
author | siwaa |
---|---|
date | Mon, 27 Jan 2025 14:08:33 +0000 |
parents | e455667fe22f |
children |
line wrap: on
line source
"""Split an unpacked input folder into zipped chunks of at most N distinct IDs.

The input folder must contain ``units.csv``, ``soil.csv`` and ``crop.csv``.
``soil.csv`` and ``crop.csv`` must both have an integer ``ID`` column.
"""
import math
import os
import shutil
import sys

import pandas as pd


def _read_header_and_rows(path):
    """Return ``(header_line, data_lines)`` read verbatim from *path*.

    Blank lines are dropped because ``pd.read_csv`` skips them by default
    (``skip_blank_lines=True``); keeping them would shift the mapping between
    DataFrame row positions and raw text lines.
    NOTE(review): assumes one CSV record per physical line (no quoted
    multi-line fields) -- confirm against the real input files.
    """
    with open(path, 'r') as handle:
        lines = [line for line in handle if line.strip()]
    return lines[0], lines[1:]


def _rows_for_ids(frame, raw_rows, chunk_ids):
    """Return the raw text lines whose ``ID`` value is one of *chunk_ids*."""
    mask = frame['ID'].isin(chunk_ids)
    # Use positions rather than index labels so the lookup stays valid
    # whatever index pandas assigned to the frame.
    positions = mask.to_numpy().nonzero()[0]
    return [raw_rows[pos] for pos in positions]


def _write_csv(path, header, rows):
    """Write *header* followed by *rows* to *path*, closing the file."""
    with open(path, 'w') as handle:
        handle.writelines(header)
        handle.writelines(rows)


def split_csv(input_folder, output_folder, max_chunck_size):
    """Split ``soil.csv``/``crop.csv`` into chunks of at most N distinct IDs.

    The distinct IDs of ``soil.csv`` (in order of first appearance) are
    grouped into consecutive batches of ``max_chunck_size``. For batch ``i``
    (1-based) a folder ``<output_folder>/<name>_<i>`` is created, where
    ``<name>`` is the last path component of ``input_folder``. It receives a
    copy of ``units.csv`` plus ``soil.csv`` and ``crop.csv`` holding the
    header and exactly the original text lines whose ID is in the batch.
    The folder is then archived next to itself as ``<name>_<i>.zip``.

    Args:
        input_folder: Folder containing ``units.csv``, ``soil.csv`` and
            ``crop.csv``.
        output_folder: Folder to create for the chunks; must not exist yet.
        max_chunck_size: Maximum number of distinct IDs per chunk (>= 1).

    Raises:
        ValueError: If ``max_chunck_size`` is smaller than 1.
        FileExistsError: If ``output_folder`` already exists.
    """
    if max_chunck_size < 1:
        raise ValueError(
            f"max_chunck_size must be >= 1, got {max_chunck_size}")

    unit_path = os.path.join(input_folder, "units.csv")
    soil_path = os.path.join(input_folder, "soil.csv")
    crop_path = os.path.join(input_folder, "crop.csv")

    # pandas is only used to locate rows by ID; the raw text lines are what
    # gets written out, so the original formatting of every value is kept.
    soil_frame = pd.read_csv(soil_path)
    crop_frame = pd.read_csv(crop_path)
    soil_frame['ID'] = soil_frame['ID'].astype('int')
    crop_frame['ID'] = crop_frame['ID'].astype('int')

    soil_header, soil_rows = _read_header_and_rows(soil_path)
    crop_header, crop_rows = _read_header_and_rows(crop_path)

    os.mkdir(output_folder)

    # Only the last path component names the chunks: joining a full (possibly
    # absolute) input path onto output_folder would escape output_folder.
    base_name = os.path.basename(os.path.normpath(input_folder))

    ids = soil_frame['ID'].unique()
    nb_split = math.ceil(len(ids) / max_chunck_size)
    for i in range(1, nb_split + 1):
        chunk_ids = ids[(i - 1) * max_chunck_size:i * max_chunck_size]
        chunk_dir = os.path.join(output_folder, f"{base_name}_{i}")

        os.mkdir(chunk_dir)
        shutil.copy(unit_path, chunk_dir)
        # Select rows by exact ID membership. A min..max line range would pull
        # in rows of other IDs whenever an ID's rows are not contiguous, and
        # would fail outright when a chunk has no crop rows at all.
        _write_csv(os.path.join(chunk_dir, "soil.csv"), soil_header,
                   _rows_for_ids(soil_frame, soil_rows, chunk_ids))
        _write_csv(os.path.join(chunk_dir, "crop.csv"), crop_header,
                   _rows_for_ids(crop_frame, crop_rows, chunk_ids))

        shutil.make_archive(chunk_dir, 'zip', chunk_dir)


def main(argv=None):
    """Unpack the zip given on the command line and split it into chunks.

    Usage: ``<script> <input.zip> <max_chunk_size>``. The archive is unpacked
    into ``./input`` and the chunks are written under ``./outputs``.
    """
    args = sys.argv[1:] if argv is None else argv
    if len(args) < 2:
        script = os.path.basename(sys.argv[0])
        sys.exit(f"usage: {script} <input.zip> <max_chunk_size>")

    input_folder = "input"
    output_folder = "outputs"
    input_file = args[0]
    max_chunck_size = int(args[1])

    shutil.unpack_archive(input_file, input_folder, 'zip')
    split_csv(input_folder, output_folder, max_chunck_size)


if __name__ == "__main__":
    main()