"examples/ScanNet/vscode:/vscode.git/clone" did not exist on "a3a079efc2ef1dcd83a4ba9dfa395e52506814ba"
Commit 728f99fc authored by Gustaf Ahdritz's avatar Gustaf Ahdritz
Browse files

Add alignment DB scripts

parent 4e730269
......@@ -315,6 +315,18 @@ or even ProteinNet .core files. To emulate the AlphaFold training procedure,
which uses a self-distillation set subject to special preprocessing steps, use
the family of `--distillation` flags.
In cases where it may be burdensome to create separate files for each chain's
alignments, alignment directories can be consolidated using the scripts in
`scripts/alignment_db_scripts/`. First, run `create_alignment_db.py` to
consolidate an alignment directory into a pair of database and index files.
Once all alignment directories (or shards of a single alignment directory)
have been compiled, unify the indices with `unify_alignment_db_indices.py`. The
resulting index, `super.index`, can be passed to the training script flags
containing the phrase `alignment_index`. In this scenario, the `alignment_dir`
flags instead represent the directory containing the compiled alignment
databases. Both the training and distillation datasets can be compiled in this
way.
## Testing
To run unit tests, use
......
import argparse
import json
import os
def main(args):
    """Pack every chain's alignment files into one flat database file.

    Each subdirectory of ``args.alignment_dir`` is treated as one chain. The
    raw bytes of every regular file inside it are appended to
    ``<output_db_path>/<output_db_name>.db``, and a JSON index is written to
    ``<output_db_path>/<output_db_name>.index`` mapping each chain directory
    name to a list of ``[file_name, byte_offset, byte_length]`` entries
    (tuples are serialised as JSON arrays).

    Args:
        args: Namespace with the attributes ``alignment_dir``,
            ``output_db_path`` and ``output_db_name``.

    Raises:
        OSError: If a directory cannot be listed or a file cannot be
            read or written.
    """
    db_path = os.path.join(args.output_db_path, f"{args.output_db_name}.db")
    index_path = os.path.join(
        args.output_db_path, f"{args.output_db_name}.index"
    )

    index = {}
    db_offset = 0

    # The context manager guarantees the database handle is closed even if
    # reading one of the alignment files fails half way through.
    with open(db_path, "wb") as db_fp:
        # Sorted listings make the byte layout reproducible across runs.
        for chain_alignment_dir in sorted(os.listdir(args.alignment_dir)):
            cad_path = os.path.join(args.alignment_dir, chain_alignment_dir)
            # Stray files next to the chain directories (including the
            # output files themselves, if they are written here) are not
            # chains and cannot be listed.
            if not os.path.isdir(cad_path):
                continue

            for file_name in sorted(os.listdir(cad_path)):
                f_path = os.path.join(cad_path, file_name)
                # Only regular files can be packed; nested directories are
                # skipped rather than crashing the whole run.
                if not os.path.isfile(f_path):
                    continue

                with open(f_path, "rb") as fp:
                    file_bytes = fp.read()

                n_bytes = len(file_bytes)
                index.setdefault(chain_alignment_dir, []).append(
                    (file_name, db_offset, n_bytes)
                )
                db_fp.write(file_bytes)
                db_offset += n_bytes

    with open(index_path, "w") as fp:
        json.dump(index, fp)
if __name__ == "__main__":
    # Command-line entry point; all three arguments are required positionals.
    parser = argparse.ArgumentParser(
        description=(
            "Consolidate a directory of per-chain alignment directories "
            "into a single database file and a JSON index."
        )
    )
    parser.add_argument(
        "alignment_dir", type=str,
        help=(
            "Path to precomputed alignment directory, with one subdirectory "
            "per chain."
        ),
    )
    parser.add_argument(
        "output_db_path", type=str,
        help="Directory in which to write the database and index files",
    )
    parser.add_argument(
        "output_db_name", type=str,
        help=(
            "Base name of the output files; '.db' and '.index' are appended"
        ),
    )

    args = parser.parse_args()

    main(args)
import argparse
import json
import os
""" Unifies databases created with create_alignment_db.py """
def main(args):
    """Merge every ``*.index`` file in a directory into one super index.

    Each index file is expected to be JSON mapping a chain name to that
    chain's file entries. In the merged output every chain maps to
    ``{"db": "<index stem>.db", "files": <entries>}`` so that a reader knows
    which database file holds the chain. The result is written to
    ``<output_dir>/super.index``.

    Index files are visited in sorted order; if the same chain appears in
    more than one index, the entry from the last one visited wins.

    Args:
        args: Namespace with the attributes ``alignment_db_dir`` and
            ``output_dir``.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be
            read or written.
        json.JSONDecodeError: If an index file is not valid JSON.
    """
    output_path = os.path.join(args.output_dir, "super.index")
    output_real = os.path.realpath(output_path)

    super_index = {}
    for f in sorted(os.listdir(args.alignment_db_dir)):
        stem, ext = os.path.splitext(f)
        if ext != ".index":
            continue

        shard_index_path = os.path.join(args.alignment_db_dir, f)
        # When the output directory is also the input directory, a super
        # index left over from a previous run has a different schema and
        # must not be merged back in as if it were a shard index.
        if os.path.realpath(shard_index_path) == output_real:
            continue

        with open(shard_index_path, "r") as fp:
            index = json.load(fp)

        db_name = f"{stem}.db"
        for chain, files in index.items():
            super_index[chain] = {
                "db": db_name,
                "files": files,
            }

    with open(output_path, "w") as fp:
        json.dump(super_index, fp)
if __name__ == "__main__":
    # Command-line entry point: two required positional directory paths,
    # registered in the order they must be given on the command line.
    parser = argparse.ArgumentParser()
    for arg_name, arg_help in (
        ("alignment_db_dir", "Path to directory containing alignment_dbs"),
        ("output_dir", "Path in which to output super index"),
    ):
        parser.add_argument(arg_name, type=str, help=arg_help)

    args = parser.parse_args()

    main(args)
......@@ -512,10 +512,12 @@ if __name__ == "__main__":
"--_distillation_structure_index_path", type=str, default=None,
)
parser.add_argument(
"--_alignment_index_path", type=str, default=None,
"--alignment_index_path", type=str, default=None,
help="Training alignment index. See the README for instructions."
)
parser.add_argument(
"--_distillation_alignment_index_path", type=str, default=None,
"--distillation_alignment_index_path", type=str, default=None,
help="Distillation alignment index. See the README for instructions."
)
parser = pl.Trainer.add_argparse_args(parser)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment