expand_roda_duplicates.py 2.66 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
"""
The RODA database is non-redundant, meaning that it only stores one explicit
representative alignment directory for all PDB chains in a 100% sequence
identity cluster. In order to add explicit alignments for all PDB chains, this
script will add the missing chain directories and symlink them to their
representative alignment directories.
"""

from argparse import ArgumentParser
from pathlib import Path

from tqdm import tqdm


def create_duplicate_dirs(
    duplicate_chains: list[list[str]], alignment_dir: Path
) -> None:
    """
    Create duplicate directory symlinks for all chains in the given duplicate lists.

    For each cluster, the first chain that already has an entry in
    ``alignment_dir`` is used as the representative, and every other chain of
    the cluster gets a symlink to it. Clusters without any existing entry are
    reported and skipped. Chains that already have a working entry are
    reported and left untouched; dangling symlinks are replaced.

    Args:
        duplicate_chains (list[list[str]]): A list of lists, where each inner list
            contains chains that are 100% sequence identical.
        alignment_dir (Path): Path to flattened alignment directory, with one
            subdirectory per chain.
    """
    print("Creating duplicate directory symlinks...")
    dirs_created = 0
    for chains in tqdm(duplicate_chains):
        # Find the chain that has an alignment. exists() follows symlinks, so
        # a dangling link is never picked as the representative.
        representative_chain = next(
            (chain for chain in chains if (alignment_dir / chain).exists()),
            None,
        )
        if representative_chain is None:
            print(f"No representative chain found for {chains}, skipping...")
            continue

        # A relative symlink target is interpreted relative to the directory
        # containing the link, not the current working directory. Linking to
        # the unresolved `alignment_dir / chain` would therefore produce
        # dangling links whenever `alignment_dir` is a relative path.
        representative_path = (alignment_dir / representative_chain).resolve()

        # Create symlinks for all other chains.
        for chain in chains:
            if chain == representative_chain:
                continue

            target_path = alignment_dir / chain
            if target_path.is_symlink() and not target_path.exists():
                # Dangling link (e.g. from an earlier run): exists() reports
                # False, but symlink_to() would raise FileExistsError.
                # Removing a symlink never removes alignment data.
                target_path.unlink()
            elif target_path.exists():
                print(f"Chain {chain} already exists, skipping...")
                continue

            target_path.symlink_to(representative_path)
            dirs_created += 1

    print(f"Created directories for {dirs_created} duplicate chains.")


def main(alignment_dir: Path, duplicate_chains_file: Path) -> None:
    """
    Read the duplicate-chains file and create the missing chain symlinks.

    Args:
        alignment_dir (Path): Path to flattened alignment directory, with one
            subdirectory per chain.
        duplicate_chains_file (Path): Text file with one cluster per line, each
            line being a whitespace-separated list of chain names.
    """
    # str.split() without arguments already discards leading/trailing
    # whitespace. Blank lines yield an empty list and are dropped so they are
    # not reported as clusters without a representative.
    with open(duplicate_chains_file, "r") as fp:
        duplicate_chains = [chains for line in fp if (chains := line.split())]

    create_duplicate_dirs(duplicate_chains, alignment_dir)


if __name__ == "__main__":
    # Both command-line arguments are positional paths; their help texts are
    # kept together here and registered in declaration order below.
    positional_help = {
        "alignment_dir": """Path to flattened alignment directory, with one subdirectory 
                per chain.""",
        "duplicate_chains_file": """Path to file containing duplicate chains, where each line
                contains a space-separated list of chains that are 100%%
                sequence identical.
                """,
    }

    cli = ArgumentParser(description=__doc__)
    for arg_name, help_text in positional_help.items():
        cli.add_argument(arg_name, type=Path, help=help_text)

    opts = cli.parse_args()
    main(opts.alignment_dir, opts.duplicate_chains_file)