# HG changeset patch # User siwaa # Date 1726588878 0 # Node ID 20886bc40659dbbc0b237a7034adddc16c41a2ea # Parent 2af4b1e46d9016da01f87d69c85745ffe4bc8b38 "planemo upload for repository https://forgemia.inra.fr/carboseq/record-projet-carboseq/-/tree/b1cafd3eb61b631ee0b1f8f6c5ef2f9a0e2b4ccf/toos/CarboSeqSimulator commit b1cafd3eb61b631ee0b1f8f6c5ef2f9a0e2b4ccf-dirty" diff -r 2af4b1e46d90 -r 20886bc40659 carboseqSplit.xml --- a/carboseqSplit.xml Tue Sep 17 10:41:39 2024 +0000 +++ b/carboseqSplit.xml Tue Sep 17 16:01:18 2024 +0000 @@ -1,4 +1,4 @@ - + To split input data of CarboSeq docker://registry.forgemia.inra.fr/carboseq/record-projet-carboseq:latest diff -r 2af4b1e46d90 -r 20886bc40659 csopraSplitInput.py --- a/csopraSplitInput.py Tue Sep 17 10:41:39 2024 +0000 +++ b/csopraSplitInput.py Tue Sep 17 16:01:18 2024 +0000 @@ -14,23 +14,30 @@ dfs = pd.read_csv(soil_path) dfc = pd.read_csv(crop_path) - max_ID = dfs.loc[dfs['ID'].idxmax()]['ID'] + dfs['ID'] = dfs['ID'].astype('int') + dfc['ID'] = dfc['ID'].astype('int') + + IDs = dfs['ID'].unique() + + max_ID = len(IDs) step = max_chunck_size - nb_split = math.ceil(max_ID / max_chunck_size) - - print(max_ID) + nb_split = math.ceil(max_ID / max_chunck_size) - print(nb_split) + for i in range(1, nb_split + 1): + + firstID = IDs[(i-1) * step] + lastID = IDs[min((i * step) - 1 , max_ID - 1)] - for i in range(1,nb_split + 1): + splitIDs = IDs[(i-1) * step : min((i * step) , max_ID)] + dfSplit = pd.DataFrame() - dfSplit = dfs.loc[((dfs['ID'] >= (((i-1) * step)) + 1) & (dfs['ID'] <= ((((i-1) * step)) + step )))] + dfSplit = dfs[dfs['ID'].isin(splitIDs)] dfcSplit = pd.DataFrame() - dfcSplit = dfc.loc[((dfc['ID'] >= (((i-1) * step)) + 1) & (dfc['ID'] <= ((((i-1) * step)) + step )))] + dfcSplit = dfc[dfc['ID'].isin(splitIDs)] os.mkdir(os.path.join(output_folder, f"{input_folder}_{i}")) shutil.copy(unit_path, os.path.join(output_folder, f"{input_folder}_{i}"))