annotate csopraSplitInput.py @ 5:3e7978af2ba2 draft

"planemo upload for repository https://forgemia.inra.fr/carboseq/record-projet-carboseq/-/tree/5f761ea7dab0a6f8c978cbbe4946e16edc6d032e/toos/CarboSeqSimulator commit 5f761ea7dab0a6f8c978cbbe4946e16edc6d032e-dirty"
author siwaa
date Wed, 28 Aug 2024 14:53:30 +0000
parents
children 20886bc40659
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
3e7978af2ba2 "planemo upload for repository https://forgemia.inra.fr/carboseq/record-projet-carboseq/-/tree/5f761ea7dab0a6f8c978cbbe4946e16edc6d032e/toos/CarboSeqSimulator commit 5f761ea7dab0a6f8c978cbbe4946e16edc6d032e-dirty"
siwaa
parents:
diff changeset
1 import pandas as pd
3e7978af2ba2 "planemo upload for repository https://forgemia.inra.fr/carboseq/record-projet-carboseq/-/tree/5f761ea7dab0a6f8c978cbbe4946e16edc6d032e/toos/CarboSeqSimulator commit 5f761ea7dab0a6f8c978cbbe4946e16edc6d032e-dirty"
siwaa
parents:
diff changeset
2 import os
3e7978af2ba2 "planemo upload for repository https://forgemia.inra.fr/carboseq/record-projet-carboseq/-/tree/5f761ea7dab0a6f8c978cbbe4946e16edc6d032e/toos/CarboSeqSimulator commit 5f761ea7dab0a6f8c978cbbe4946e16edc6d032e-dirty"
siwaa
parents:
diff changeset
3 import shutil
3e7978af2ba2 "planemo upload for repository https://forgemia.inra.fr/carboseq/record-projet-carboseq/-/tree/5f761ea7dab0a6f8c978cbbe4946e16edc6d032e/toos/CarboSeqSimulator commit 5f761ea7dab0a6f8c978cbbe4946e16edc6d032e-dirty"
siwaa
parents:
diff changeset
4 import math
3e7978af2ba2 "planemo upload for repository https://forgemia.inra.fr/carboseq/record-projet-carboseq/-/tree/5f761ea7dab0a6f8c978cbbe4946e16edc6d032e/toos/CarboSeqSimulator commit 5f761ea7dab0a6f8c978cbbe4946e16edc6d032e-dirty"
siwaa
parents:
diff changeset
5 import sys
3e7978af2ba2 "planemo upload for repository https://forgemia.inra.fr/carboseq/record-projet-carboseq/-/tree/5f761ea7dab0a6f8c978cbbe4946e16edc6d032e/toos/CarboSeqSimulator commit 5f761ea7dab0a6f8c978cbbe4946e16edc6d032e-dirty"
siwaa
parents:
diff changeset
6
3e7978af2ba2 "planemo upload for repository https://forgemia.inra.fr/carboseq/record-projet-carboseq/-/tree/5f761ea7dab0a6f8c978cbbe4946e16edc6d032e/toos/CarboSeqSimulator commit 5f761ea7dab0a6f8c978cbbe4946e16edc6d032e-dirty"
siwaa
parents:
diff changeset
def split_csv(input_folder, output_folder, max_chunck_size):
    """Split the soil and crop tables of a folder into ID-range chunks.

    Reads ``soil.csv`` and ``crop.csv`` from *input_folder* and partitions
    their rows on the ``ID`` column: chunk ``i`` (numbered from 1) receives
    the rows whose ID lies in ``[(i - 1) * size + 1, i * size]``. Rows with
    an ID below 1 therefore never land in any chunk. The number of chunks is
    derived from the largest ID found in ``soil.csv`` only.

    For every chunk a directory ``<output_folder>/<name>_<i>`` is created,
    where ``<name>`` is the last path component of *input_folder*. It
    receives a copy of ``units.csv`` plus the filtered ``soil.csv`` and
    ``crop.csv``, and is then archived next to itself as
    ``<output_folder>/<name>_<i>.zip``.

    Args:
        input_folder: Directory holding ``units.csv``, ``soil.csv`` and
            ``crop.csv``.
        output_folder: Directory receiving the chunk folders and archives;
            created (including parents) when missing.
        max_chunck_size: Width of the ID range covered by one chunk; must
            be a positive number.

    Raises:
        ValueError: If *max_chunck_size* is not strictly positive.
    """
    # Validate before touching the file system so a bad size leaves no trace.
    if max_chunck_size <= 0:
        raise ValueError(
            f"max_chunck_size must be a positive integer, got {max_chunck_size!r}"
        )

    unit_path = os.path.join(input_folder, "units.csv")
    soil_path = os.path.join(input_folder, "soil.csv")
    crop_path = os.path.join(input_folder, "crop.csv")

    # exist_ok lets the tool be re-run against an existing output folder.
    os.makedirs(output_folder, exist_ok=True)

    soil = pd.read_csv(soil_path)
    crop = pd.read_csv(crop_path)

    # Series.max() avoids the idxmax/.loc round trip, which raises on an
    # empty table and may upcast the ID through a mixed-dtype row.
    max_id = soil['ID'].max()
    if soil.empty or pd.isna(max_id):
        # No usable soil ID: nothing to split.
        max_id = 0

    step = max_chunck_size
    nb_split = math.ceil(max_id / step)

    print(max_id)
    print(nb_split)

    # Only the last path component names the chunks. Embedding a full path
    # would make os.path.join() discard output_folder (absolute input) or
    # point at a missing sub-directory (nested relative input).
    base_name = os.path.basename(os.path.normpath(input_folder))

    for i in range(1, nb_split + 1):
        first_id = (i - 1) * step + 1
        last_id = (i - 1) * step + step

        soil_chunk = soil.loc[(soil['ID'] >= first_id) & (soil['ID'] <= last_id)]
        crop_chunk = crop.loc[(crop['ID'] >= first_id) & (crop['ID'] <= last_id)]

        chunk_dir = os.path.join(output_folder, f"{base_name}_{i}")
        os.makedirs(chunk_dir, exist_ok=True)

        shutil.copy(unit_path, chunk_dir)
        soil_chunk.to_csv(os.path.join(chunk_dir, "soil.csv"), index=False)
        crop_chunk.to_csv(os.path.join(chunk_dir, "crop.csv"), index=False)

        # Produces <chunk_dir>.zip containing the three files at its root.
        shutil.make_archive(chunk_dir, 'zip', chunk_dir)
3e7978af2ba2 "planemo upload for repository https://forgemia.inra.fr/carboseq/record-projet-carboseq/-/tree/5f761ea7dab0a6f8c978cbbe4946e16edc6d032e/toos/CarboSeqSimulator commit 5f761ea7dab0a6f8c978cbbe4946e16edc6d032e-dirty"
siwaa
parents:
diff changeset
40
3e7978af2ba2 "planemo upload for repository https://forgemia.inra.fr/carboseq/record-projet-carboseq/-/tree/5f761ea7dab0a6f8c978cbbe4946e16edc6d032e/toos/CarboSeqSimulator commit 5f761ea7dab0a6f8c978cbbe4946e16edc6d032e-dirty"
siwaa
parents:
diff changeset
# Command-line entry point.
# Usage: <script> <input.zip> <max_chunk_size>
# The archive is unpacked into `input_folder`, then split into chunks that
# are written below `output_folder`.
input_folder = "input"
output_folder = "outputs"

# Guarded so that importing this module does not read sys.argv or touch the
# file system; running it as a script behaves as before.
if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} <input.zip> <max_chunk_size>")

    input_file = sys.argv[1]
    max_chunck_size = int(sys.argv[2])

    shutil.unpack_archive(input_file, input_folder, 'zip')

    split_csv(input_folder, output_folder, max_chunck_size)