get_pdb_assembly.py 3.53 KB
Newer Older
zhangqha's avatar
zhangqha committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os, sys

import glob
import json
from tqdm import tqdm
from multiprocessing import Pool
import requests
import time

input_dir = sys.argv[1]
output_dir = sys.argv[2]
input_files = glob.glob(input_dir + "*.cif") + glob.glob(input_dir + "*.cif.gz")
os.system("mkdir -p " + output_dir)

pdb_chain_mapper = json.load(open(sys.argv[3]))

rot_keys = [
    "matrix11",
    "matrix12",
    "matrix13",
    "matrix21",
    "matrix22",
    "matrix23",
    "matrix31",
    "matrix32",
    "matrix33",
]

trans_keys = ["vector1", "vector2", "vector3"]


def get_oper(cont):
    cont = cont["pdbx_struct_oper_list"]
    ret = {}
    for c in cont:
        id = c["id"]
        rot = []
        trans = []
        for k in rot_keys:
            rot.append(c[k])
        for k in trans_keys:
            trans.append(c[k])
        rot, trans = tuple(rot), tuple(trans)
        if rot == (1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0) and trans == (
            0.0,
            0.0,
            0.0,
        ):
            ret[id] = "I"
        else:
            ret[id] = (rot, trans)
    ret["I"] = "I"
    return ret


# refer to https://data.rcsb.org/redoc/index.html
def get_pdb_meta_info(mmcif_path):
    name = os.path.split(mmcif_path)[-1].split(".")[0]
    url = f"https://data.rcsb.org/rest/v1/core/assembly/{name}/1"
    max_try_time = 10
    for i in range(max_try_time):
        try:
            out_path = os.path.join(output_dir, name + ".json")
            load = False
            if os.path.isfile(out_path):
                cont = json.load(open(out_path, "r"))
                load = True
            else:
                r = requests.get(url)
                cont = json.loads(r.text)
                json.dump(cont, open(out_path, "w"))
                load = r.ok
            if load:
                if "rcsb_struct_symmetry" not in cont:
                    break
                cur_mapper = pdb_chain_mapper[name]["to_auth_id"]
                for tt in cont["rcsb_struct_symmetry"]:
                    if tt["kind"] == "Global Symmetry":
                        symbol = tt["symbol"]
                        stoi = tt["stoichiometry"]
                        all_opers = get_oper(cont)
                        chains = []
                        opers = []
                        for c in tt["clusters"]:
                            for m in c["members"]:
                                chain_id = cur_mapper[m["asym_id"]]
                                if "pdbx_struct_oper_list_ids" in m:
                                    for op_idx in m["pdbx_struct_oper_list_ids"]:
                                        chains.append(chain_id)
                                        opers.append(all_opers[op_idx])
                                else:
                                    chains.append(chain_id)
                                    opers.append("I")
                        return name, {
                            "symbol": symbol,
                            "stoi": stoi,
                            "chains": chains,
                            "opers": opers,
                        }
                break
            elif cont["status"] == "404":
                break
        except Exception as e:
            print(name, e)
        time.sleep(2)
    return name, None


file_cnt = len(input_files)
meta_dict = {}
# failed = []
with Pool(64) as pool:
    for ret in tqdm(pool.imap(get_pdb_meta_info, input_files), total=file_cnt):
        name, res = ret
        if res:
            meta_dict[name] = res

json.dump(meta_dict, open("pdb_assembly.json", "w"), indent=2)