"vscode:/vscode.git/clone" did not exist on "305ef8df854a9a7ed3d3f5a28526f924ad8efb5b"
Commit afd91982 authored by Christina Floristean's avatar Christina Floristean
Browse files

Scripts from Lukas to be used in improved setup process

parent 2134cc09
from argparse import ArgumentParser
from pathlib import Path
import json
def main(args):
# get the super index
with open(args.alignment_db_super_index_path, "r") as fp:
super_index = json.load(fp)
# get all chains and sequences
chains_to_seqs = {}
with open(args.all_chains_fasta, "r") as fp:
lines = fp.readlines()
# iterate through chain-sequence pairs
for chain_idx in range(0, len(lines), 2):
chain = lines[chain_idx][1:].strip()
seq = lines[chain_idx + 1].strip()
chains_to_seqs[chain] = seq
chains_w_alignments = set(super_index.keys())
chains_wo_alignments = set(chains_to_seqs.keys()) - chains_w_alignments
seq_to_chain_w_alignment = {
chains_to_seqs[chain]: chain for chain in chains_w_alignments
}
print("Unique sequences with alignments:", len(seq_to_chain_w_alignment))
# map chain without alignment to alignment entry of another chain with the
# same sequence
remaining_unaligned_chains = []
for chain in chains_wo_alignments:
seq = chains_to_seqs[chain]
try:
corresponding_alignment = super_index[seq_to_chain_w_alignment[seq]]
# no corresponding chain with alignment found
except KeyError:
remaining_unaligned_chains.append(chain)
continue
super_index[chain] = corresponding_alignment
with open(args.output_path, "w") as fp:
json.dump(super_index, fp)
print(
f"No corresponding alignment found for the following {len(remaining_unaligned_chains)} chains:",
remaining_unaligned_chains,
)
if __name__ == "__main__":
parser = ArgumentParser(
description="""
If the alignment-db index was created on unique-chain alignments only,
this will add the missing chain entries to the super-index file based on
a .fasta file that contains sequences for all chains.
Note that this only modifies the index and not the database itself, as
the duplicate sequences will just point to the same alignments.
"""
)
parser.add_argument(
"alignment_db_super_index_path",
type=Path,
help="Path to alignment-db super index file.",
)
parser.add_argument(
"output_path", type=Path, help="Write the output super index to this path."
)
parser.add_argument(
"all_chains_fasta",
type=Path,
help="Path to the fasta file containing sequences for all chains.",
)
args = parser.parse_args()
main(args)
"""
This script takes a .fasta file as input and then clusters it on a given
sequence identity threshold using mmseqs2. The mmseqs2 flags are identical to
what PDB officially uses to provide their official sequence clusters
(https://github.com/soedinglab/MMseqs2/issues/452).
"""
import shutil
import subprocess
from argparse import ArgumentParser
from collections import defaultdict
from pathlib import Path
def reformat_cluster_file(cluster_file: Path, output_file: Path):
"""
This function takes a mmseqs2 output cluster file and reformats it to a text
file where each line contains a space-separated list of {PDB_ID}_{CHAIN_ID}
belonging to the same cluster.
"""
cluster_to_chains = defaultdict(list)
# extract all chains belonging to each cluster
with open(cluster_file, "r") as f:
for line in f:
line = line.strip()
cluster_name, chain_id = line.split()
cluster_to_chains[cluster_name].append(chain_id)
# write all chains belonging to the same cluster on the same line
with open(output_file, "w") as f:
for chains in cluster_to_chains.values():
f.write(f"{' '.join(chains)}\n")
def main(args):
input_file = args.input_fasta.absolute()
output_file = args.output_file.absolute()
output_dir = args.output_file.parent
# prefix that all output files get
mmseqs_prefix = "_mmseqs_out"
# temporary directory that mmseqs2 uses
tmp_name = f"{mmseqs_prefix}_temp"
tmp_dir = output_dir / tmp_name
mmseqs_command = [
args.mmseqs_binary_path,
"easy-cluster",
input_file,
mmseqs_prefix,
tmp_name,
"--min-seq-id",
str(args.seq_id),
"-c",
"0.9",
"-s",
"8",
"--max-seqs",
"1000",
"--cluster-mode",
"1",
]
# run mmseqs with PDB settings
print("Running mmseqs2...")
subprocess.run(mmseqs_command, check=True, cwd=output_dir)
cluster_file = output_dir / "_mmseqs_out_cluster.tsv"
print("Reformatting output file...")
reformat_cluster_file(cluster_file, output_file)
print("Cleaning up mmseqs2 output...")
mmseqs_outputs = [
output_dir / f"{mmseqs_prefix}_{suffix}"
for suffix in ["cluster.tsv", "rep_seq.fasta", "all_seqs.fasta"]
]
for file in mmseqs_outputs:
file.unlink()
shutil.rmtree(tmp_dir)
print("Done!")
if __name__ == "__main__":
parser = ArgumentParser(
description="Creates a sequence cluster file from a .fasta file using mmseqs2 with PDB settings."
)
parser.add_argument(
"input_fasta",
type=Path,
help="Input .fasta file. Sequence names should be in format >{PDB_ID}_{CHAIN_ID}",
)
parser.add_argument(
"output_file",
type=Path,
help="Output file. Each line will contain a space-separated list of {PDB_ID}_{CHAIN_ID} belonging to the same cluster.",
)
parser.add_argument("mmseqs_binary_path", type=str, help="Path to mmseqs binary")
parser.add_argument("--seq-id", type=float, default=0.4, help="Sequence identity threshold for clustering.")
args = parser.parse_args()
main(args)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment