Restore cache generation script

54deee17 · Gustaf Ahdritz · 9a0c9e1c · 54deee17 · 54deee17 · 54deee17
Commit 54deee17 authored Feb 04, 2022 by Gustaf Ahdritz
Showing with 141 additions and 1 deletion

README.md README.md +1 -1

scripts/generate_mmcif_cache.py scripts/generate_mmcif_cache.py +66 -0

scripts/generate_prot_data_cache.py scripts/generate_prot_data_cache.py +74 -0

No files found.
--- a/README.md
+++ b/README.md
@@ -181,7 +181,7 @@ files, we provide `scripts/data_dir_to_fasta.py`.
 Next, generate a cache of certain datapoints in the mmCIF files:

 ```bash
-python3 scripts/generate_mmcif_cache.py \
+python3 scripts/generate_prot_data_cache.py \
    mmcif_dir/ \
    mmcif_cache.json \
    --no_workers 16

--- a/scripts/generate_mmcif_cache.py
+++ b/scripts/generate_mmcif_cache.py
+import argparse
+from functools import partial
+import logging
+from multiprocessing import Pool
+import os
+import sys
+import json
+sys.path.append(".") # an innocent hack to get this to run from the top level
+
+from tqdm import tqdm
+
+from openfold.data.mmcif_parsing import parse 
+
+
+def parse_file(f, args):
+    with open(os.path.join(args.mmcif_dir, f), "r") as fp:
+        mmcif_string = fp.read()
+    file_id = os.path.splitext(f)[0]
+    mmcif = parse(file_id=file_id, mmcif_string=mmcif_string)
+    if mmcif.mmcif_object is None:
+        logging.info(f"Could not parse {f}. Skipping...")
+        return {}
+    else:
+        mmcif = mmcif.mmcif_object
+
+    local_data = {}
+    local_data["release_date"] = mmcif.header["release_date"]
+    local_data["no_chains"] = len(list(mmcif.structure.get_chains()))
+
+    return {file_id: local_data}
+
+
+def main(args):
+    files = [f for f in os.listdir(args.mmcif_dir) if ".cif" in f]
+    fn = partial(parse_file, args=args)
+    data = {}
+    with Pool(processes=args.no_workers) as p:
+        with tqdm(total=len(files)) as pbar:
+            for d in p.imap_unordered(fn, files, chunksize=args.chunksize):
+                data.update(d)
+                pbar.update()
+
+    with open(args.output_path, "w") as fp:
+        fp.write(json.dumps(data, indent=4))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "mmcif_dir", type=str, help="Directory containing mmCIF files"
+    )
+    parser.add_argument(
+        "output_path", type=str, help="Path for .json output"
+    )
+    parser.add_argument(
+        "--no_workers", type=int, default=4,
+        help="Number of workers to use for parsing"
+    )
+    parser.add_argument(
+        "--chunksize", type=int, default=10,
+        help="How many files should be distributed to each worker at a time"
+    )
+
+    args = parser.parse_args()
+
+    main(args)
--- a/scripts/generate_prot_data_cache.py
+++ b/scripts/generate_prot_data_cache.py
+import argparse
+from functools import partial
+import logging
+from multiprocessing import Pool
+import os
+import sys
+import json
+sys.path.append(".") # an innocent hack to get this to run from the top level
+
+from tqdm import tqdm
+
+from openfold.data.mmcif_parsing import parse 
+
+
+def parse_file(f, args):
+    with open(os.path.join(args.mmcif_dir, f), "r") as fp:
+        mmcif_string = fp.read()
+    file_id = os.path.splitext(f)[0]
+    mmcif = parse(file_id=file_id, mmcif_string=mmcif_string)
+    if mmcif.mmcif_object is None:
+        logging.info(f"Could not parse {f}. Skipping...")
+        return {}
+    else:
+        mmcif = mmcif.mmcif_object
+
+    local_data = {}
+    local_data["release_date"] = mmcif.header["release_date"]
+
+    chain_ids, seqs = mmcif.chain_to_seqres.items()
+    local_data["chain_ids"] = chain_ids
+    local_data["seqs"] = seqs
+    local_data["no_chains"] = len(chain_ids)
+
+    local_data["resolution"] = mmcif.header["resolution"]
+
+    if(cluser_file)
+
+    return {file_id: local_data}
+
+
+def main(args):
+    files = [f for f in os.listdir(args.mmcif_dir) if ".cif" in f]
+    fn = partial(parse_file, args=args)
+    data = {}
+    with Pool(processes=args.no_workers) as p:
+        with tqdm(total=len(files)) as pbar:
+            for d in p.imap_unordered(fn, files, chunksize=args.chunksize):
+                data.update(d)
+                pbar.update()
+
+    with open(args.output_path, "w") as fp:
+        fp.write(json.dumps(data, indent=4))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "mmcif_dir", type=str, help="Directory containing mmCIF files"
+    )
+    parser.add_argument(
+        "output_path", type=str, help="Path for .json output"
+    )
+    parser.add_argument(
+        "--no_workers", type=int, default=4,
+        help="Number of workers to use for parsing"
+    )
+    parser.add_argument(
+        "--chunksize", type=int, default=10,
+        help="How many files should be distributed to each worker at a time"
+    )
+
+    args = parser.parse_args()
+
+    main(args)