Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
OpenFold
Commits
76b1ec98
Commit
76b1ec98
authored
Sep 08, 2023
by
Christina Floristean
Committed by
Jennifer
May 02, 2024
Browse files
Scripts from Lukas to be used in improved setup process
parent
2625e0b2
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
187 additions
and
0 deletions
+187
-0
scripts/alignment_db_scripts/add_non_unique_to_alignment_db.py
...ts/alignment_db_scripts/add_non_unique_to_alignment_db.py
+82
-0
scripts/fasta_to_clusterfile.py
scripts/fasta_to_clusterfile.py
+105
-0
No files found.
scripts/alignment_db_scripts/add_non_unique_to_alignment_db.py
0 → 100644
View file @
76b1ec98
from
argparse
import
ArgumentParser
from
pathlib
import
Path
import
json
def
main
(
args
):
# get the super index
with
open
(
args
.
alignment_db_super_index_path
,
"r"
)
as
fp
:
super_index
=
json
.
load
(
fp
)
# get all chains and sequences
chains_to_seqs
=
{}
with
open
(
args
.
all_chains_fasta
,
"r"
)
as
fp
:
lines
=
fp
.
readlines
()
# iterate through chain-sequence pairs
for
chain_idx
in
range
(
0
,
len
(
lines
),
2
):
chain
=
lines
[
chain_idx
][
1
:].
strip
()
seq
=
lines
[
chain_idx
+
1
].
strip
()
chains_to_seqs
[
chain
]
=
seq
chains_w_alignments
=
set
(
super_index
.
keys
())
chains_wo_alignments
=
set
(
chains_to_seqs
.
keys
())
-
chains_w_alignments
seq_to_chain_w_alignment
=
{
chains_to_seqs
[
chain
]:
chain
for
chain
in
chains_w_alignments
}
print
(
"Unique sequences with alignments:"
,
len
(
seq_to_chain_w_alignment
))
# map chain without alignment to alignment entry of another chain with the
# same sequence
remaining_unaligned_chains
=
[]
for
chain
in
chains_wo_alignments
:
seq
=
chains_to_seqs
[
chain
]
try
:
corresponding_alignment
=
super_index
[
seq_to_chain_w_alignment
[
seq
]]
# no corresponding chain with alignment found
except
KeyError
:
remaining_unaligned_chains
.
append
(
chain
)
continue
super_index
[
chain
]
=
corresponding_alignment
with
open
(
args
.
output_path
,
"w"
)
as
fp
:
json
.
dump
(
super_index
,
fp
)
print
(
f
"No corresponding alignment found for the following
{
len
(
remaining_unaligned_chains
)
}
chains:"
,
remaining_unaligned_chains
,
)
if
__name__
==
"__main__"
:
parser
=
ArgumentParser
(
description
=
"""
If the alignment-db index was created on unique-chain alignments only,
this will add the missing chain entries to the super-index file based on
a .fasta file that contains sequences for all chains.
Note that this only modifies the index and not the database itself, as
the duplicate sequences will just point to the same alignments.
"""
)
parser
.
add_argument
(
"alignment_db_super_index_path"
,
type
=
Path
,
help
=
"Path to alignment-db super index file."
,
)
parser
.
add_argument
(
"output_path"
,
type
=
Path
,
help
=
"Write the output super index to this path."
)
parser
.
add_argument
(
"all_chains_fasta"
,
type
=
Path
,
help
=
"Path to the fasta file containing sequences for all chains."
,
)
args
=
parser
.
parse_args
()
main
(
args
)
scripts/fasta_to_clusterfile.py
0 → 100644
View file @
76b1ec98
"""
This script takes a .fasta file as input and then clusters it on a given
sequence identity threshold using mmseqs2. The mmseqs2 flags are identical to
what PDB officially uses to provide their official sequence clusters
(https://github.com/soedinglab/MMseqs2/issues/452).
"""
import
shutil
import
subprocess
from
argparse
import
ArgumentParser
from
collections
import
defaultdict
from
pathlib
import
Path
def
reformat_cluster_file
(
cluster_file
:
Path
,
output_file
:
Path
):
"""
This function takes a mmseqs2 output cluster file and reformats it to a text
file where each line contains a space-separated list of {PDB_ID}_{CHAIN_ID}
belonging to the same cluster.
"""
cluster_to_chains
=
defaultdict
(
list
)
# extract all chains belonging to each cluster
with
open
(
cluster_file
,
"r"
)
as
f
:
for
line
in
f
:
line
=
line
.
strip
()
cluster_name
,
chain_id
=
line
.
split
()
cluster_to_chains
[
cluster_name
].
append
(
chain_id
)
# write all chains belonging to the same cluster on the same line
with
open
(
output_file
,
"w"
)
as
f
:
for
chains
in
cluster_to_chains
.
values
():
f
.
write
(
f
"
{
' '
.
join
(
chains
)
}
\n
"
)
def
main
(
args
):
input_file
=
args
.
input_fasta
.
absolute
()
output_file
=
args
.
output_file
.
absolute
()
output_dir
=
args
.
output_file
.
parent
# prefix that all output files get
mmseqs_prefix
=
"_mmseqs_out"
# temporary directory that mmseqs2 uses
tmp_name
=
f
"
{
mmseqs_prefix
}
_temp"
tmp_dir
=
output_dir
/
tmp_name
mmseqs_command
=
[
args
.
mmseqs_binary_path
,
"easy-cluster"
,
input_file
,
mmseqs_prefix
,
tmp_name
,
"--min-seq-id"
,
str
(
args
.
seq_id
),
"-c"
,
"0.9"
,
"-s"
,
"8"
,
"--max-seqs"
,
"1000"
,
"--cluster-mode"
,
"1"
,
]
# run mmseqs with PDB settings
print
(
"Running mmseqs2..."
)
subprocess
.
run
(
mmseqs_command
,
check
=
True
,
cwd
=
output_dir
)
cluster_file
=
output_dir
/
"_mmseqs_out_cluster.tsv"
print
(
"Reformatting output file..."
)
reformat_cluster_file
(
cluster_file
,
output_file
)
print
(
"Cleaning up mmseqs2 output..."
)
mmseqs_outputs
=
[
output_dir
/
f
"
{
mmseqs_prefix
}
_
{
suffix
}
"
for
suffix
in
[
"cluster.tsv"
,
"rep_seq.fasta"
,
"all_seqs.fasta"
]
]
for
file
in
mmseqs_outputs
:
file
.
unlink
()
shutil
.
rmtree
(
tmp_dir
)
print
(
"Done!"
)
if
__name__
==
"__main__"
:
parser
=
ArgumentParser
(
description
=
"Creates a sequence cluster file from a .fasta file using mmseqs2 with PDB settings."
)
parser
.
add_argument
(
"input_fasta"
,
type
=
Path
,
help
=
"Input .fasta file. Sequence names should be in format >{PDB_ID}_{CHAIN_ID}"
,
)
parser
.
add_argument
(
"output_file"
,
type
=
Path
,
help
=
"Output file. Each line will contain a space-separated list of {PDB_ID}_{CHAIN_ID} belonging to the same cluster."
,
)
parser
.
add_argument
(
"mmseqs_binary_path"
,
type
=
str
,
help
=
"Path to mmseqs binary"
)
parser
.
add_argument
(
"--seq-id"
,
type
=
float
,
default
=
0.4
,
help
=
"Sequence identity threshold for clustering."
)
args
=
parser
.
parse_args
()
main
(
args
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment