duplicate_remove.py 10.2 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
import sys
import os
import glob
import pandas as pd
import warnings
from pathlib import Path
from tqdm import tqdm
import multiprocessing as mp
from ccdc.crystal import PackingSimilarity
from ccdc.io import CrystalReader
import shutil
import argparse

warnings.filterwarnings("ignore", category=DeprecationWarning)

#########################
# Global Settings
#########################

# The number of matching molecules required in a packing shell to consider two crystals similar.
molecule_shell_size = 15

# Initialize the packing similarity engine from the CCDC API.
similarity_engine = PackingSimilarity()

# Configure the similarity engine settings.
similarity_engine.settings.allow_molecular_differences = False
similarity_engine.settings.distance_tolerance = 0.2
similarity_engine.settings.angle_tolerance = 20
similarity_engine.settings.packing_shell_size = molecule_shell_size

# Settings to make the comparison less strict regarding hydrogen atoms and bond counts.
similarity_engine.settings.ignore_hydrogen_positions = True
similarity_engine.settings.ignore_bond_counts = True
similarity_engine.settings.ignore_hydrogen_counts = True

# Pre-filtering thresholds to avoid expensive comparisons for vastly different structures.
ENERGY_DIFF = 5.0
DENSITY_DIFF = 0.2


def compare_sim_pack_pair(args):
    """
    Compares two crystal structures to determine if they are identical based on packing similarity.
    Accepts a tuple (candidate_fp, ufp) containing file paths to the two structures.
    Returns True if the structures are considered the same, False otherwise.
    """
    cand_fp, ufp = args
    try:
        c1 = CrystalReader(cand_fp)[0]
        c2 = CrystalReader(ufp)[0]
        h = similarity_engine.compare(c1, c2)
        # Structures are deemed identical if the number of matched molecules meets the global shell size.
        return h.nmatched_molecules >= molecule_shell_size
    except Exception:
        # Return False if any error occurs during file reading or comparison.
        return False

def find_top_unique(struct_list, target_count, n_workers):
    """
    Identifies a target number of unique structures from a list, sorted by energy.
    It iterates through structures from lowest to highest energy and uses parallel processing
    to compare each candidate against the list of already confirmed unique structures.

    Args:
        struct_list (list): A list of tuples, where each tuple is (density, energy, filepath).
        target_count (int): The desired number of unique structures to find.
        n_workers (int): The number of worker processes for parallel comparison.

    Returns:
        tuple: A tuple containing:
            - A list of file paths for the unique structures found.
            - A dictionary mapping each unique structure's path to a list of its duplicate paths.
    """
    unique_list = []
    
    # A dictionary to store each unique structure and its corresponding duplicates.
    # Format: {unique_fp: [dup_fp1, dup_fp2, ...]}
    duplicate_map = {}
    
    # Initialize the multiprocessing pool.
    pool = mp.Pool(processes=n_workers)
    try:
        # Iterate through the main structure list, which is pre-sorted by energy.
        for dens, ene, cand_fp in struct_list:
            # Stop if the target count of unique structures has been reached.
            if len(unique_list) >= target_count:
                break

            # Create comparison tasks only for unique structures within the density and energy thresholds.
            tasks = [(cand_fp, ufp) for dens2, ene2, ufp in unique_list if abs(dens2 - dens) < DENSITY_DIFF and abs(ene2 - ene) < ENERGY_DIFF]
            is_dup = False
            
            if tasks:
                # Use tqdm for a progress bar during parallel comparison.
                results_iterator = tqdm(pool.imap(compare_sim_pack_pair, tasks),
                                        total=len(tasks),
                                        desc=f"Comparing {os.path.basename(cand_fp)}",
                                        leave=False)
                # Convert iterator to a list to process results.
                results_list = list(results_iterator)
                
                # Pair the boolean results with the original tasks to identify which comparison was successful.
                for result, task in zip(results_list, tasks):
                    if result:
                        is_dup = True
                        # The second element of the task tuple is the path of the matched unique structure.
                        matched_unique_fp = task[1]
                        
                        # Record the current candidate as a duplicate of the matched unique structure.
                        duplicate_map[matched_unique_fp].append(cand_fp)
                        
                        # Once a duplicate is found, no more comparisons are needed for this candidate.
                        break

            if not is_dup:
                # If the candidate is not a duplicate of any existing unique structure, add it to the list.
                unique_list.append((dens, ene, cand_fp))
                
                # Create a new entry in the duplicate map for this new unique structure.
                duplicate_map[cand_fp] = []
                
                # Print real-time progress.
                print(f">>> Unique count: {len(unique_list)}  (added {os.path.basename(cand_fp)})")

    finally:
        # Ensure the multiprocessing pool is properly closed.
        pool.close()
        pool.join()
        
    # Return the list of unique file paths and the map of duplicates.
    return [fp for _, _, fp in unique_list], duplicate_map


def process_folder(folder_path):
    """
    Main workflow function to process a given folder. It reads structural data,
    finds unique structures, and organizes the results into folders and a CSV file.
    """
    base_path = folder_path

    # Define output directories for unique structures and their duplicates.
    t_folder = os.path.join(base_path, f"unique_{TARGET_UNIQUE_COUNT}")
    d_folder = os.path.join(base_path, f"unique_{TARGET_UNIQUE_COUNT}_duplicates")
    
    # Load the summary data from the CSV file.
    df = pd.read_csv(os.path.join(base_path, "results_scheduler.csv"))
    cif_folder = os.path.join(base_path, "cif_result_final")
    #########################

    # Create a map from a simplified filename to its full file path for quick lookup.
    file_paths = glob.glob(os.path.join(cif_folder, "*.cif"))
    file_map = {}
    for fp in file_paths:
        base = Path(fp).stem
        # Standardize the name by removing "_opt" suffix if it exists.
        if base.endswith("_opt"):
            key = base[:-4]
        else:
            key = base
        file_map[key] = fp

    # Build the list of candidate structures, filtering by the energy threshold.
    struct_list = []
    for _, row in df.iterrows():
        name = row["file"]
        dens = row["stage2_density"]
        ene  = row["relative_energy"]
        if ene > ENERGY_THRESHOLD:
            continue
        fp = file_map.get(name)
        if fp:
            struct_list.append((dens, ene, fp))
        else:
            print(f"Warning: no CIF for {name}")

    # Sort the list of structures by energy in ascending order.
    struct_list.sort(key=lambda x: x[1])

    # Call the main function to find unique structures and the duplicate map.
    unique_paths, duplicate_map = find_top_unique(
        struct_list,
        target_count=TARGET_UNIQUE_COUNT,
        n_workers=WORKER_COUNT
    )

    # Prepare output directories, removing them first if they already exist.
    if os.path.exists(t_folder):
        shutil.rmtree(t_folder)
    os.makedirs(t_folder)

    if os.path.exists(d_folder):
        shutil.rmtree(d_folder)
    os.makedirs(d_folder)
    
    print(f"\nFound {len(unique_paths)} unique structures "
          f"(energy <= {ENERGY_THRESHOLD}, target {TARGET_UNIQUE_COUNT}).")
    
    # Copy the unique structure files to the target folder.
    for p in unique_paths:
        shutil.copy(p, t_folder)

    # Helper function to get a clean filename without the "_opt" suffix.
    get_clean_name = lambda p: Path(p).stem.replace('_opt', '')

    # Create a new DataFrame containing only the data for the unique structures.
    unique_names = [get_clean_name(p) for p in unique_paths]
    unique_df = df[df['file'].isin(unique_names)].copy()
    unique_df['duplicates'] = '' # Add a new column to store duplicate names.

    # Populate the 'duplicates' column and copy duplicate files to their folder.
    for unique_fp, duplicates_list in duplicate_map.items():
        unique_name = get_clean_name(unique_fp)
        if unique_name in unique_df['file'].values:
            du_name = [get_clean_name(p) for p in duplicates_list]
            # Add the list of duplicate names as a comma-separated string.
            unique_df.loc[unique_df['file'] == unique_name, 'duplicates'] = ', '.join(du_name)
        # Copy duplicate files to the duplicates folder.
        for p in duplicates_list:
            shutil.copy(p, d_folder)

    # Save the final DataFrame with unique structures and their duplicates to a new CSV file.
    unique_csv_path = os.path.join(base_path, 'unique_structures.csv')
    unique_df.to_csv(unique_csv_path, index=False)

if __name__ == '__main__':
    # Required for freezing the application when creating executables with multiprocessing.
    mp.freeze_support()
    
    # Set up command-line argument parsing.
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', type=str, default="./", help='Path to process')
    parser.add_argument('--energy', type=float, default=30, help='Energy threshold for filtering structures')
    parser.add_argument('--count', type=int, default=200, help='Target number of unique structures to find')
    parser.add_argument('--workers', type=int, default=80, help='Max worker number limit')
    args = parser.parse_args()
    
    # Set global variables from command-line arguments.
    target_folder = args.path
    ENERGY_THRESHOLD = args.energy
    TARGET_UNIQUE_COUNT = args.count
    WORKER_COUNT = args.workers

    # Start the processing if the specified folder exists.
    if os.path.exists(target_folder):
        print(f"Processing folder: {target_folder}")
zcxzcx1's avatar
zcxzcx1 committed
247
        process_folder(target_folder)