""" This script takes a .fasta file as input and then clusters it on a given sequence identity threshold using mmseqs2. The mmseqs2 flags are identical to what PDB officially uses to provide their official sequence clusters (https://github.com/soedinglab/MMseqs2/issues/452). """ import shutil import subprocess from argparse import ArgumentParser from collections import defaultdict from pathlib import Path def reformat_cluster_file(cluster_file: Path, output_file: Path): """ This function takes a mmseqs2 output cluster file and reformats it to a text file where each line contains a space-separated list of {PDB_ID}_{CHAIN_ID} belonging to the same cluster. """ cluster_to_chains = defaultdict(list) # extract all chains belonging to each cluster with open(cluster_file, "r") as f: for line in f: line = line.strip() cluster_name, chain_id = line.split() cluster_to_chains[cluster_name].append(chain_id) # write all chains belonging to the same cluster on the same line with open(output_file, "w") as f: for chains in cluster_to_chains.values(): f.write(f"{' '.join(chains)}\n") def main(args): input_file = args.input_fasta.absolute() output_file = args.output_file.absolute() output_dir = args.output_file.parent # prefix that all output files get mmseqs_prefix = "_mmseqs_out" # temporary directory that mmseqs2 uses tmp_name = f"{mmseqs_prefix}_temp" tmp_dir = output_dir / tmp_name mmseqs_command = [ args.mmseqs_binary_path, "easy-cluster", input_file, mmseqs_prefix, tmp_name, "--min-seq-id", str(args.seq_id), "-c", "0.9", "-s", "8", "--max-seqs", "1000", "--cluster-mode", "1", ] # run mmseqs with PDB settings print("Running mmseqs2...") subprocess.run(mmseqs_command, check=True, cwd=output_dir) cluster_file = output_dir / "_mmseqs_out_cluster.tsv" print("Reformatting output file...") reformat_cluster_file(cluster_file, output_file) print("Cleaning up mmseqs2 output...") mmseqs_outputs = [ output_dir / f"{mmseqs_prefix}_{suffix}" for suffix in ["cluster.tsv", "rep_seq.fasta", "all_seqs.fasta"] ] for file in mmseqs_outputs: file.unlink() shutil.rmtree(tmp_dir) print("Done!") if __name__ == "__main__": parser = ArgumentParser( description="Creates a sequence cluster file from a .fasta file using mmseqs2 with PDB settings." ) parser.add_argument( "input_fasta", type=Path, help="Input .fasta file. Sequence names should be in format >{PDB_ID}_{CHAIN_ID}", ) parser.add_argument( "output_file", type=Path, help="Output file. Each line will contain a space-separated list of {PDB_ID}_{CHAIN_ID} belonging to the same cluster.", ) parser.add_argument("mmseqs_binary_path", type=str, help="Path to mmseqs binary") parser.add_argument("--seq-id", type=float, default=0.4, help="Sequence identity threshold for clustering.") args = parser.parse_args() main(args)