# clean_table.py
# Filters results_scheduler.csv in the current working directory and moves
# CIF files that no longer match a kept row into error_result/.
import os
import sys
import pandas as pd


target_folder = os.getcwd()

# Validity thresholds applied to each row of results_scheduler.csv.
# NOTE(review): these look like the optimiser's step caps plus a divergence
# guard on the energy -- confirm against the code that writes the CSV.
STAGE2_STEPS_CUTOFF = 2990
STAGE1_STEPS_CUTOFF = 2999
ENERGY_ABS_CUTOFF = 1e15


def invalid_row_mask(all_crystals, cif_names):
    """Return a boolean Series that is True for every row to discard.

    A row is discarded when any of the following holds:

    * ``stage2_steps`` (truncated to int) is greater than 2990;
    * ``stage1_steps`` is at least 2999 while ``stage2_steps`` equals 0;
    * the absolute value of ``stage2_energy`` is greater than 1e15;
    * ``<file>.cif`` is not among ``cif_names``.

    Args:
        all_crystals: DataFrame with the columns ``file``, ``stage1_steps``,
            ``stage2_steps`` and ``stage2_energy``.
        cif_names: Iterable of file names present in the CIF folder.

    Returns:
        A boolean Series aligned with ``all_crystals.index``.

    Raises:
        KeyError: If one of the required columns is missing.
        ValueError: If ``stage2_steps`` or ``stage2_energy`` cannot be
            converted to int / float (for example a missing step count).
    """
    stage2_steps = all_crystals["stage2_steps"]
    # astype(str) lets purely numeric file names (parsed as integers by
    # read_csv) be matched against the directory listing instead of raising.
    expected_cifs = all_crystals["file"].astype(str) + ".cif"
    return (
        (stage2_steps.astype(int) > STAGE2_STEPS_CUTOFF)
        | (
            (all_crystals["stage1_steps"] >= STAGE1_STEPS_CUTOFF)
            & (stage2_steps == 0)
        )
        | (all_crystals["stage2_energy"].astype(float).abs() > ENERGY_ABS_CUTOFF)
        | ~expected_cifs.isin(list(cif_names))
    )


def clean_table(folder):
    """Clean ``results_scheduler.csv`` and the CIF directory inside *folder*.

    Steps, in order:

    1. Create ``<folder>/error_result`` if it does not exist.
    2. Return ``None`` (after printing a notice) when either
       ``results_scheduler.csv`` or ``cif_result_final`` is missing.
    3. Drop every row flagged by :func:`invalid_row_mask`.
    4. Sort the remaining rows by ``stage2_energy`` ascending and add a
       ``relative_energy`` column (energy minus the lowest energy).
    5. Move every entry of ``cif_result_final`` whose name, minus its last
       four characters, is not a kept ``file`` value into ``error_result``.
    6. Overwrite ``results_scheduler.csv`` with the cleaned table.

    Args:
        folder: Directory containing the CSV file and the CIF sub-folder.

    Returns:
        The cleaned DataFrame, or ``None`` when the folder was skipped.
    """
    print(f"Cleaning table in folder: {folder}")
    error_folder = os.path.join(folder, "error_result")
    os.makedirs(error_folder, exist_ok=True)
    cif_folder = os.path.join(folder, "cif_result_final")
    csv_file = os.path.join(folder, "results_scheduler.csv")

    if not os.path.exists(csv_file):
        print(f"CSV file not found in {folder}, skipping...")
        return None
    if not os.path.exists(cif_folder):
        print(f"CIF folder not found in {folder}, skipping...")
        return None

    all_crystals = pd.read_csv(csv_file)
    # Snapshot the listing once; it is used for both filtering and the sweep.
    cif_names = os.listdir(cif_folder)

    invalid = invalid_row_mask(all_crystals, cif_names)
    cleaned = (
        all_crystals.loc[~invalid]
        .sort_values(by=["stage2_energy"], ascending=True)
        .reset_index(drop=True)
    )

    if cleaned.empty:
        # Nothing survived the filter: there is no reference energy to
        # subtract, so emit an empty column rather than failing on row 0.
        print(f"No valid rows left in {csv_file}")
        cleaned["relative_energy"] = cleaned["stage2_energy"]
    else:
        # Rows are sorted ascending, so the first entry is the minimum.
        min_energy = cleaned["stage2_energy"].iloc[0]
        cleaned["relative_energy"] = cleaned["stage2_energy"] - min_energy

    valid_stems = set(cleaned["file"].astype(str))
    for name in cif_names:
        # name[:-4] strips the four-character ".cif" suffix.
        if name[:-4] not in valid_stems:
            # os.replace overwrites a leftover from an earlier run on every
            # platform, whereas os.rename fails on Windows in that case.
            os.replace(
                os.path.join(cif_folder, name),
                os.path.join(error_folder, name),
            )

    # Save the cleaned DataFrame back to CSV
    cleaned.to_csv(csv_file, index=False)
    return cleaned


if __name__ == '__main__':
    clean_table(target_folder)