Add alignment DB scripts

728f99fc · Gustaf Ahdritz · 4e730269 · 728f99fc · 728f99fc · 728f99fc
Commit 728f99fc authored Jun 27, 2022 by Gustaf Ahdritz
4 changed files
--- a/README.md
+++ b/README.md
@@ -315,6 +315,18 @@ or even ProteinNet .core files. To emulate the AlphaFold training procedure,
 which uses a self-distillation set subject to special preprocessing steps, use
 the family of `--distillation` flags.

+In cases where it may be burdensome to create separate files for each chain's
+alignments, alignment directories can be consolidated using the scripts in 
+`scripts/alignment_db_scripts/`. First, run `create_alignment_db.py` to
+consolidate an alignment directory into a pair of database and index files.
+Once all alignment directories (or shards of a single alignment directory)
+have been compiled, unify the indices with `unify_alignment_db_indices.py`.
+The resulting index, `super.index`, can be passed to the training script flags
+containing the phrase `alignment_index`. In this scenario, the `alignment_dir`
+flags instead represent the directory containing the compiled alignment
+databases. Both the training and distillation datasets can be compiled in this
+way.
+
 ## Testing

 To run unit tests, use

--- a/scripts/alignment_db_scripts/create_alignment_db.py
+++ b/scripts/alignment_db_scripts/create_alignment_db.py
+import argparse
+import json
+import os
+
+
+def main(args):
+    db_path = os.path.join(args.output_db_path, f"{args.output_db_name}.db")
+    index_path = os.path.join(
+        args.output_db_path, f"{args.output_db_name}.index"
+    )
+    db_fp = open(db_path, "wb")
+    index = {}
+    db_offset = 0
+    for chain_alignment_dir in os.listdir(args.alignment_dir):
+        cad_path = os.path.join(args.alignment_dir, chain_alignment_dir)
+        for f in os.listdir(cad_path):
+            f_path = os.path.join(cad_path, f)
+            with open(f_path, "rb") as fp:
+                file_bytes = fp.read()
+
+            l = len(file_bytes)
+            file_list = index.setdefault(chain_alignment_dir, [])
+            file_list.append((f, db_offset, l))
+            
+            db_fp.write(file_bytes)
+            db_offset += l
+
+    db_fp.close()
+
+    with open(index_path, "w") as fp:
+        json.dump(index, fp)
+            
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "alignment_dir", type=str, 
+        help="""Path to precomputed alignment directory, with one subdirectory 
+                per chain."""
+    )
+    parser.add_argument("output_db_path", type=str)
+    parser.add_argument("output_db_name", type=str)
+
+    args = parser.parse_args()
+
+    main(args)
--- a/scripts/alignment_db_scripts/unify_alignment_db_indices.py
+++ b/scripts/alignment_db_scripts/unify_alignment_db_indices.py
+import argparse
+import json
+import os
+
+
+""" Unifies databases created with create_alignment_db.py """
+
+
+def main(args):
+    super_index = {}
+    for f in os.listdir(args.alignment_db_dir):
+        if(not os.path.splitext(f)[-1] == ".index"):
+            continue
+        
+        with open(os.path.join(args.alignment_db_dir, f), "r") as fp:
+            index = json.load(fp)
+
+        db_name = f"{os.path.splitext(f)[0]}.db"
+        
+        for k in index:
+            super_index[k] = {
+                "db": db_name,
+                "files": index[k],
+            }
+
+    with open(os.path.join(args.output_dir, "super.index"), "w") as fp:
+        json.dump(super_index, fp)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("alignment_db_dir", type=str, help="Path to directory containing alignment_dbs")
+    parser.add_argument("output_dir", type=str, help="Path in which to output super index")
+
+    args = parser.parse_args()
+
+    main(args)
--- a/train_openfold.py
+++ b/train_openfold.py
@@ -512,10 +512,12 @@ if __name__ == "__main__":
        "--_distillation_structure_index_path", type=str, default=None,
    )
    parser.add_argument(
-        "--_alignment_index_path", type=str, default=None,
+        "--alignment_index_path", type=str, default=None,
+        help="Training alignment index. See the README for instructions."
    )
    parser.add_argument(
-        "--_distillation_alignment_index_path", type=str, default=None,
+        "--distillation_alignment_index_path", type=str, default=None,
+        help="Distillation alignment index. See the README for instructions."
    )
    parser = pl.Trainer.add_argparse_args(parser)