download_cameo.py 2.54 KB
Newer Older
Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
1
2
3
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
4
import json
Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
5
import os
6
import requests
Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
7

8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
from openfold.data import mmcif_parsing


# Period identifiers accepted for the ``period`` command-line argument. The
# chosen value is used verbatim as a path segment of the CAMEO targets URL.
VALID_PERIODS = [
    "1-year",
    "6-months",
    "3-months",
    "1-month",
    "1-week",
]


def generate_url(period, end_date):
    """Build the CAMEO URL that lists modeling targets for a period.

    Args:
        period: Period identifier, inserted as a path segment
            (e.g. ``"1-week"``).
        end_date: Value passed as the ``to_date`` query parameter.

    Returns:
        The request URL as a string.
    """
    # The host is written without a trailing slash so that no empty path
    # segment ("//") appears right after the domain.
    base = "https://www.cameo3d.org"
    return f"{base}/modeling/targets/{period}/ajax/?to_date={end_date}"
Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
29
30
31


def main(args):
    """Download the CAMEO targets of a period as mmCIF and FASTA files.

    The target list is fetched from the URL built by ``generate_url``. For
    every listed chain, the mmCIF file is downloaded from RCSB and parsed,
    and the chain's sequence is looked up. The sequence is written to
    ``<output_dir>/fasta_dir/<pdb_id>_<chain_id>.fasta`` and the raw mmCIF
    text to ``<output_dir>/data_dir/<pdb_id>.cif``.

    Args:
        args: Parsed command-line arguments providing ``output_dir``,
            ``period``, ``end_date`` and ``max_seqlen``.

    Raises:
        requests.exceptions.HTTPError: If a download returns an error status.
        Exception: The first error reported by ``mmcif_parsing.parse`` when
            it could not produce an mmCIF object.
    """
    data_dir_path = os.path.join(args.output_dir, "data_dir")
    fasta_dir_path = os.path.join(args.output_dir, "fasta_dir")

    os.makedirs(data_dir_path, exist_ok=True)
    os.makedirs(fasta_dir_path, exist_ok=True)

    url = generate_url(args.period, args.end_date)
    parsed_data = json.loads(_fetch_text(url))

    for chain in parsed_data["aaData"]:
        pdb_id = chain["pdbid"]
        chain_id = chain["pdbid_chain"]

        pdb_url = f"https://files.rcsb.org/view/{pdb_id.upper()}.cif"
        pdb_file = _fetch_text(pdb_url)

        parsed_cif = mmcif_parsing.parse(
            file_id=pdb_id, mmcif_string=pdb_file
        )
        mmcif_object = parsed_cif.mmcif_object
        if mmcif_object is None:
            # Surface the parser's own error instead of failing later on None.
            raise list(parsed_cif.errors.values())[0]

        seq = mmcif_object.chain_to_seqres[chain_id]

        # A non-positive max_seqlen disables the length filter. The length is
        # compared against the limit (it was previously compared to itself,
        # so nothing was ever skipped).
        if args.max_seqlen > 0 and len(seq) > args.max_seqlen:
            continue

        fasta_file = '\n'.join([
            f">{pdb_id}_{chain_id}",
            seq,
        ])

        fasta_filename = f"{pdb_id}_{chain_id}.fasta"
        with open(os.path.join(fasta_dir_path, fasta_filename), "w") as fp:
            fp.write(fasta_file)

        cif_filename = f"{pdb_id}.cif"
        with open(os.path.join(data_dir_path, cif_filename), "w") as fp:
            fp.write(pdb_file)


def _fetch_text(url):
    """Return the body of ``url`` as text, raising on an HTTP error status."""
    response = requests.get(url)
    # Fail here with a clear HTTP error rather than passing an error page on
    # to the JSON or mmCIF parser.
    response.raise_for_status()
    return response.text
Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
75
76
77

if __name__ == '__main__':
    # Command-line entry point: parse the arguments and run the download.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        # ``choices`` makes argparse reject an unknown period with a usage
        # message and exit status 2 instead of an uncaught ValueError.
        "period", type=str, choices=VALID_PERIODS, metavar="period",
        help=f"""The length of the period from which to draw CAMEO proteins. 
             Choose from {VALID_PERIODS}"""
    )
    parser.add_argument(
        "end_date", type=str,
        help="The date marking the end of the period (YYYY-MM-DD)"
    )
    parser.add_argument("output_dir")
    parser.add_argument(
        "--max_seqlen", type=int, default=700,
        help="The maximum length in residues of downloaded proteins (or -1)"
    )

    args = parser.parse_args()

    main(args)