diff csopraSplitInput.py @ 10:20886bc40659 draft

"planemo upload for repository https://forgemia.inra.fr/carboseq/record-projet-carboseq/-/tree/b1cafd3eb61b631ee0b1f8f6c5ef2f9a0e2b4ccf/toos/CarboSeqSimulator commit b1cafd3eb61b631ee0b1f8f6c5ef2f9a0e2b4ccf-dirty"
author siwaa
date Tue, 17 Sep 2024 16:01:18 +0000
parents 3e7978af2ba2
children e455667fe22f
line wrap: on
line diff
--- a/csopraSplitInput.py	Tue Sep 17 10:41:39 2024 +0000
+++ b/csopraSplitInput.py	Tue Sep 17 16:01:18 2024 +0000
@@ -14,23 +14,30 @@
     dfs = pd.read_csv(soil_path)
     dfc = pd.read_csv(crop_path)
 
-    max_ID = dfs.loc[dfs['ID'].idxmax()]['ID']
+    dfs['ID'] = dfs['ID'].astype('int')
+    dfc['ID'] = dfc['ID'].astype('int')
+
+    IDs = dfs['ID'].unique()
+    
+    max_ID = len(IDs)
 
     step = max_chunck_size
     
-    nb_split = math.ceil(max_ID / max_chunck_size)
-
-    print(max_ID)
+    nb_split = math.ceil(max_ID / max_chunck_size)   
 
-    print(nb_split)
+    for i in range(1, nb_split + 1):
+        
+        firstID = IDs[(i-1) * step]
+        lastID = IDs[min((i * step) - 1 , max_ID - 1)]
 
-    for i in range(1,nb_split + 1):
+        splitIDs = IDs[(i-1) * step : min((i * step) , max_ID)]
+        
 
         dfSplit = pd.DataFrame()
-        dfSplit = dfs.loc[((dfs['ID'] >= (((i-1) * step)) + 1) & (dfs['ID'] <= ((((i-1) * step)) + step )))]
+        dfSplit = dfs[dfs['ID'].isin(splitIDs)]
 
         dfcSplit = pd.DataFrame()
-        dfcSplit = dfc.loc[((dfc['ID'] >= (((i-1) * step)) + 1) & (dfc['ID'] <= ((((i-1) * step)) + step )))]
+        dfcSplit = dfc[dfc['ID'].isin(splitIDs)]
         
         os.mkdir(os.path.join(output_folder, f"{input_folder}_{i}"))
         shutil.copy(unit_path, os.path.join(output_folder, f"{input_folder}_{i}"))