# prep_proteinnet_msas.py
import argparse
import logging
import os
import shutil


def main(args):
    """Sort ProteinNet MSA files into per-chain directories under ``args.out_dir``.

    Every entry of ``args.msa_dir`` whose stem (file name without extension)
    splits on ``_`` into exactly three parts is read as
    ``<pdb_id>_<ignored>_<chain_id>``. If ``args.mmcif_dir`` contains a file
    whose stem equals ``pdb_id`` (compared case-insensitively), the MSA is
    copied or moved to ``<out_dir>/<PDB_ID>_<CHAIN_ID>/<original file name>``.
    All other entries are skipped.

    Args:
        args: Namespace-like object providing
            ``msa_dir``   -- directory holding the MSA files,
            ``mmcif_dir`` -- directory holding the mmCIF files,
            ``out_dir``   -- directory the per-chain folders are created in,
            ``copy``      -- truthy to copy the MSAs, falsy to move them,
            ``max_count`` -- maximum number of MSAs to transfer, or ``None``
                             for no bound.
    """
    # -1 can never equal the running count, so it stands for "no bound".
    max_count = args.max_count if args.max_count is not None else -1

    # A set lookup replaces the previous lockstep walk over two sorted
    # listings, which indexed past the end of the mmCIF list whenever an MSA
    # id sorted after the last mmCIF (or the mmCIF directory was empty).
    mmcif_ids = {
        os.path.splitext(f)[0].upper() for f in os.listdir(args.mmcif_dir)
    }

    count = 0
    # Sorted so that which files are taken under max_count is deterministic.
    for f in sorted(os.listdir(args.msa_dir)):
        if count == max_count:
            break

        name = os.path.splitext(f)[0]
        spl = name.upper().split('_')
        if len(spl) != 3:
            continue

        pdb_id, _, chain_id = spl

        # Only consider files with matching mmCIF files
        if pdb_id not in mmcif_ids:
            continue

        dirname = os.path.join(args.out_dir, '_'.join([pdb_id, chain_id]))
        os.makedirs(dirname, exist_ok=True)

        path = os.path.join(args.msa_dir, f)
        dest = os.path.join(dirname, f)
        if args.copy:
            shutil.copyfile(path, dest)
        else:
            # shutil.move renames when possible and falls back to
            # copy-and-delete when out_dir is on another filesystem.
            shutil.move(path, dest)

        count += 1
 

if __name__ == "__main__":
    def _str2bool(value):
        """Parse a command-line boolean such as ``True``/``False``.

        ``type=bool`` cannot be used for this: argparse would call
        ``bool("False")``, which is True because the string is non-empty.
        """
        lowered = value.strip().lower()
        if lowered in ("1", "true", "t", "yes", "y", "on"):
            return True
        # The empty string is kept falsy, matching what bool("") returned.
        if lowered in ("", "0", "false", "f", "no", "n", "off"):
            return False
        raise argparse.ArgumentTypeError(
            "expected a boolean value, got %r" % value
        )

    parser = argparse.ArgumentParser(description=
        "Converts raw ProteinNet MSAs into a format recognized by the parser"
    )
    parser.add_argument(
        "msa_dir", type=str, help="Directory containing ProteinNet MSAs"
    )
    parser.add_argument(
        "mmcif_dir", type=str, help="Directory containing PDB mmCIFs"
    )
    parser.add_argument(
        "out_dir", type=str,
        help="Directory to which output should be saved"
    )
    # Copying is the default; pass "--copy False" to move the files instead.
    parser.add_argument(
        "--copy", type=_str2bool, default=True,
        help="Whether to copy the MSAs to out_dir rather than moving them"
    )
    parser.add_argument(
        "--max_count", type=int, default=None,
        help="A bound on the number of MSAs to process"
    )

    args = parser.parse_args()

    main(args)