Commit 4b726a2e authored by Augustin Zidek, committed by Copybara-Service

Speed up Colab multimer MSA search: fetch each db chunk only once and run all queries against it.

PiperOrigin-RevId: 496636186
Change-Id: I0a428b6269f8e1bcb1a6efb33cad2fc70b0d1f35
parent b21167be
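For context, the change described in the commit message inverts the nesting of the two loops in the Colab's MSA search. A minimal sketch of the before/after pattern, assuming hypothetical fetch_chunk and run_jackhmmer stand-ins for the notebook's chunk streaming and Jackhmmer invocation (not the actual notebook code):

def fetch_chunk(i):
  # Hypothetical stand-in: stream database chunk i to local disk.
  return f'chunk_{i}'

def run_jackhmmer(sequence, chunk):
  # Hypothetical stand-in: search one sequence against one chunk.
  return (sequence, chunk)

def search_before(sequences, num_chunks):
  # Old behaviour: the per-sequence loop is outermost, so every sequence
  # re-fetches every chunk (len(sequences) * num_chunks downloads).
  results = {s: [] for s in sequences}
  for sequence in sequences:
    for i in range(num_chunks):
      results[sequence].append(run_jackhmmer(sequence, fetch_chunk(i)))
  return results

def search_after(sequences, num_chunks):
  # New behaviour: the per-chunk loop is outermost, so each chunk is
  # fetched exactly once and every query runs against it (num_chunks downloads).
  results = {s: [] for s in sequences}
  for i in range(num_chunks):
    chunk = fetch_chunk(i)
    for sequence in sequences:
      results[sequence].append(run_jackhmmer(sequence, chunk))
  return results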
@@ -426,12 +426,28 @@
"}\n", "}\n",
"\n", "\n",
"\n", "\n",
"def get_msa(fasta_path):\n", "def get_msa(sequences):\n",
" \"\"\"Searches for MSA for the given sequence using chunked Jackhmmer search.\"\"\"\n", " \"\"\"Searches for MSA for given sequences using chunked Jackhmmer search.\n",
" \n",
" Args:\n",
" sequences: A list of sequences to search against all databases.\n",
"\n",
" Returns:\n",
" A dictionary mapping unique sequences to dicionaries mapping each database\n",
" to a list of results, one for each chunk of the database.\n",
" \"\"\"\n",
" sequence_to_fasta_path = {}\n",
" # Deduplicate to not do redundant work for multiple copies of the same chain in homomers.\n",
" for sequence_index, sequence in enumerate(sorted(set(sequences)), 1):\n",
" fasta_path = f'target_{sequence_index:02d}.fasta'\n",
" with open(fasta_path, 'wt') as f:\n",
" f.write(f'\u003equery\\n{sequence}')\n",
" sequence_to_fasta_path[sequence] = fasta_path\n",
"\n", "\n",
" # Run the search against chunks of genetic databases (since the genetic\n", " # Run the search against chunks of genetic databases (since the genetic\n",
" # databases don't fit in Colab disk).\n", " # databases don't fit in Colab disk).\n",
" raw_msa_results = collections.defaultdict(list)\n", " raw_msa_results = {sequence: {} for sequence in sequence_to_fasta_path.keys()}\n",
" print('\\nGetting MSA for all sequences')\n",
" with tqdm.notebook.tqdm(total=TOTAL_JACKHMMER_CHUNKS, bar_format=TQDM_BAR_FORMAT) as pbar:\n", " with tqdm.notebook.tqdm(total=TOTAL_JACKHMMER_CHUNKS, bar_format=TQDM_BAR_FORMAT) as pbar:\n",
" def jackhmmer_chunk_callback(i):\n", " def jackhmmer_chunk_callback(i):\n",
" pbar.update(n=1)\n", " pbar.update(n=1)\n",
@@ -446,26 +462,18 @@
" num_streamed_chunks=db_config['num_streamed_chunks'],\n", " num_streamed_chunks=db_config['num_streamed_chunks'],\n",
" streaming_callback=jackhmmer_chunk_callback,\n", " streaming_callback=jackhmmer_chunk_callback,\n",
" z_value=db_config['z_value'])\n", " z_value=db_config['z_value'])\n",
" # Group the results by database name.\n", " # Query all unique sequences against each chunk of the database to prevent\n",
" raw_msa_results[db_name].extend(jackhmmer_runner.query(fasta_path))\n", " # redunantly fetching each chunk for each unique sequence.\n",
" results = jackhmmer_runner.query_multiple(list(sequence_to_fasta_path.values()))\n",
" for sequence, result_for_sequence in zip(sequence_to_fasta_path.keys(), results):\n",
" raw_msa_results[sequence][db_name] = result_for_sequence\n",
"\n", "\n",
" return raw_msa_results\n", " return raw_msa_results\n",
"\n", "\n",
"\n", "\n",
"features_for_chain = {}\n", "features_for_chain = {}\n",
"raw_msa_results_for_sequence = {}\n", "raw_msa_results_for_sequence = get_msa(sequences)\n",
"for sequence_index, sequence in enumerate(sequences, start=1):\n", "for sequence_index, sequence in enumerate(sequences, start=1):\n",
" print(f'\\nGetting MSA for sequence {sequence_index}')\n",
"\n",
" fasta_path = f'target_{sequence_index}.fasta'\n",
" with open(fasta_path, 'wt') as f:\n",
" f.write(f'\u003equery\\n{sequence}')\n",
"\n",
" # Don't do redundant work for multiple copies of the same chain in the multimer.\n",
" if sequence not in raw_msa_results_for_sequence:\n",
" raw_msa_results = get_msa(fasta_path=fasta_path)\n",
" raw_msa_results_for_sequence[sequence] = raw_msa_results\n",
" else:\n",
" raw_msa_results = copy.deepcopy(raw_msa_results_for_sequence[sequence])\n", " raw_msa_results = copy.deepcopy(raw_msa_results_for_sequence[sequence])\n",
"\n", "\n",
" # Extract the MSAs from the Stockholm files.\n", " # Extract the MSAs from the Stockholm files.\n",
......
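With this change, get_msa runs once for all chains and returns a nested dictionary keyed by unique sequence and then by database name; the per-chain loop then only deep-copies its entry. An illustrative sketch of the resulting shape, with made-up sequences, database names, and result placeholders (none of these values come from the notebook's actual configuration):

sequences = ['MKTAYIAK', 'GSHMKT', 'MKTAYIAK']  # homomer: chain 1 == chain 3

# Deduplication via sorted(set(sequences)) writes only two FASTA files:
#   'GSHMKT' -> target_01.fasta, 'MKTAYIAK' -> target_02.fasta
# and the result has one entry per unique sequence, each mapping a database
# name to one result per streamed chunk of that database:
raw_msa_results_for_sequence = {
    'GSHMKT': {
        'uniref90': ['chunk_1_result', 'chunk_2_result'],
        'mgnify': ['chunk_1_result'],
    },
    'MKTAYIAK': {
        'uniref90': ['chunk_1_result', 'chunk_2_result'],
        'mgnify': ['chunk_1_result'],
    },
}

Each per-chain iteration takes copy.deepcopy of its sequence's entry, presumably so downstream feature extraction can consume or mutate its copy without affecting duplicate chains that share the same search results.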