Merge branch 'dtk24.04.1'

15cd3506 · mashun1 · 24e633dc · 19085464 · 15cd3506 · 15cd3506
Commit 15cd3506 authored Aug 29, 2024 by mashun1
20 changed files
--- a/.gitignore
+++ b/.gitignore
+*.egg*
+tryme.ipynb
+build/
+dist/
+test/
+temp_output/
+temp_fasta/
+
+*pycache*
--- a/Dockerfile
+++ b/Dockerfile
+FROM image.sourcefind.cn:5000/dcu/admin/base/jax:0.4.23-ubuntu20.04-dtk24.04.1-py3.10
+
+# RUN apt update
+
+# WORKDIR /app
+
+# WORKDIR /app/softwares
+
+# RUN git clone https://github.com/soedinglab/hh-suite.git
+
+# RUN mkdir -p hh-suite/build && cd hh-suite/build && cmake -DCMAKE_INSTALL_PREFIX=. .. && make -j 4 && make install
+
+# ENV PATH=/app/softwares/hh-suite/build/bin:/app/softwares/hh-suite/build/scripts:$PATH
+
+# WORKDIR /app/softwares
+
+# RUN wget https://github.com/TimoLassmann/kalign/archive/refs/tags/v3.4.0.zip && unzip v3.4.0.zip && cd kalign-3.4.0 && mkdir build && cd build && cmake .. && make && make install
+
+# WORKDIR /app/softwares
+
+# RUN sudo apt install doxygen -y
+
+# RUN wget https://github.com/openmm/openmm/archive/refs/tags/8.0.0.zip && unzip 8.0.0.zip && cd openmm-8.0.0 && mkdir build && cd build && cmake .. && make && sudo make install && sudo make PythonInstall
+
+# WORKDIR /app/softwares
+
+# RUN wget https://github.com/openmm/pdbfixer/archive/refs/tags/1.9.zip && unzip 1.9.zip && cd pdbfixer-1.9 && python setup.py install 
+
+# RUN sudo apt install hmmer -y
+
+# WORKDIR /app
+
+# COPY . /app/alphafold2
+
+# RUN ls
+
+# RUN pip install --no-cache-dir -r /app/alphafold2/requirements_dcu.txt -i https://mirrors.ustc.edu.cn/pypi/web/simple
+
+# RUN pip install dm-haiku==0.0.11 flax==0.7.1 jmp==0.0.2 tabulate==0.8.9 --no-deps jax -i https://mirrors.ustc.edu.cn/pypi/web/simple
+
+# RUN pip install orbax==0.1.6 orbax-checkpoint==0.1.6 optax==0.2.2 -i https://mirrors.ustc.edu.cn/pypi/web/simple
+
+# WORKDIR /app/alphafold2
+
+# RUN python setup.py install
+
+
+
+
--- a/README.md
+++ b/README.md
-<!--
- * @Author: zhuww
- * @email: zhuww@sugon.com
- * @Date: 2023-04-06 18:04:07
- * @LastEditTime: 2023-12-26 15:54:01
-->
 # AF2
+
 ## 论文
 - [https://www.nature.com/articles/s41586-021-03819-2](https://www.nature.com/articles/s41586-021-03819-2)

@@ -19,9 +14,17 @@ AlphaFold2通过从蛋白质序列和结构数据中提取信息，使用神经
 ![img](./docs/alphafold2_1.png)


-<!-- ## 环境配置
+## 环境配置
+
+### Docker（方法一）
+
+    # 使用该方法不需要下载本仓库，镜像中已包含可运行代码，但需要挂载相应的数据文件

-### Docker
+    docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:alphafold2-dtk24.04.1-py310
+
+    docker run --shm-size 100g --network=host --name=alphafold2 --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v 本地数据地址:镜像数据地址 -v /opt/hyhal:/opt/hyhal:ro -it <your IMAGE ID> bash
+
+### Docker（方法二）
    
    docker pull image.sourcefind.cn:5000/dcu/admin/base/jax:0.4.23-ubuntu20.04-dtk24.04.1-py3.10

@@ -45,7 +48,7 @@ AlphaFold2通过从蛋白质序列和结构数据中提取信息，使用神经
    export PATH="$(pwd)/bin:$(pwd)/scripts:$PATH"

    wget https://github.com/TimoLassmann/kalign/archive/refs/tags/v3.4.0.zip
-    unzip 3.4.0.zip && cd kalign-3.4.0
+    unzip v3.4.0.zip && cd kalign-3.4.0
    mkdir build 
    cd build
    cmake .. 
@@ -65,23 +68,8 @@ AlphaFold2通过从蛋白质序列和结构数据中提取信息，使用神经

    wget https://github.com/openmm/pdbfixer/archive/refs/tags/1.9.zip

-    unzip 1.9.zip && cd pdbfixer-1.9 && python setup.py install  -->
+    unzip 1.9.zip && cd pdbfixer-1.9 && python setup.py install 

-## 环境配置
-提供[光源](https://www.sourcefind.cn/#/image/dcu/custom)拉取推理的docker镜像：
-```
-docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:alphafold2-2.3.2-dtk23.10-py38
-# <Image ID>用上面拉取docker镜像的ID替换
-# <Host Path>主机端路径
-# <Container Path>容器映射路径
-docker run -it --name alphafold --privileged --shm-size=32G  --device=/dev/kfd --device=/dev/dri/ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --ulimit memlock=-1:-1 --ipc=host --network host --group-add video -v <Host Path>:<Container Path> <Image ID> /bin/bash
-```
-
-镜像版本依赖：
-* DTK驱动：dtk23.10
-* Jax: 0.3.25
-* TensorFlow2: 2.11.0
-* python: python3.8

 ## 数据集
 推荐使用AlphaFold2中的开源数据集，包括BFD、MGnify、PDB70、Uniclust、Uniref90等,数据集大小约2.62TB。数据集格式如下：
@@ -171,12 +159,12 @@ $DOWNLOAD_DIR/
 ```

 [查看蛋白质3D结构](https://www.pdbus.org/3d-view)
-<div style="display: flex; justify-content: center; align-items: center;">
-  <img src="./docs/result_pdb.png" alt="Image">
-  <div style="position: absolute; top: 50%; left: 50%; transform: translate(-50%, -50%); background: rgba(0, 0, 0, 0.5); color: #fff; padding: 10px;">
-    红色为真实结构，蓝色为预测结构。
-  </div>
-</div>
+
+ID: 8U23
+
+蓝色的为预测结构，黄色为真实结构
+
+![alt text](image.png)

 ### 精度
 测试数据：[casp15](https://www.predictioncenter.org/casp15/targetlist.cgi)、[uniprot](https://www.uniprot.org/)，
@@ -196,6 +184,8 @@ $DOWNLOAD_DIR/
 | fp32 | 单体 | T1024 | 408 | 0.664 | 0.470 | 87.076 | 0.829 | 0.518 | 3.516 |
 | fp32 | 多体 | H1106 | 236 | 0.203 | 0.144 | 0.860 | 0.181 | 0.151 | 20.457 |

+
+
 ## 应用场景

 ### 算法类别

--- a/README_official.md
+++ b/README_official.md
--- a/alphafold/common/confidence.py
+++ b/alphafold/common/confidence.py
@@ -14,7 +14,9 @@

 """Functions for processing confidence metrics."""

+import json
 from typing import Dict, Optional, Tuple
+
 import numpy as np
 import scipy.special

@@ -36,6 +38,43 @@ def compute_plddt(logits: np.ndarray) -> np.ndarray:
  return predicted_lddt_ca * 100


+def _confidence_category(score: float) -> str:
+  """Categorizes pLDDT into: disordered (D), low (L), medium (M), high (H)."""
+  if 0 <= score < 50:
+    return 'D'
+  if 50 <= score < 70:
+    return 'L'
+  elif 70 <= score < 90:
+    return 'M'
+  elif 90 <= score <= 100:
+    return 'H'
+  else:
+    raise ValueError(f'Invalid pLDDT score {score}')
+
+
+def confidence_json(plddt: np.ndarray) -> str:
+  """Returns JSON with confidence score and category for every residue.
+
+  Args:
+    plddt: Per-residue confidence metric data.
+
+  Returns:
+    String with a formatted JSON.
+
+  Raises:
+    ValueError: If `plddt` has a rank different than 1.
+  """
+  if plddt.ndim != 1:
+    raise ValueError(f'The plddt array must be rank 1, got: {plddt.shape}.')
+
+  confidence = {
+      'residueNumber': list(range(1, len(plddt) + 1)),
+      'confidenceScore': [round(float(s), 2) for s in plddt],
+      'confidenceCategory': [_confidence_category(s) for s in plddt],
+  }
+  return json.dumps(confidence, indent=None, separators=(',', ':'))
+
+
 def _calculate_bin_centers(breaks: np.ndarray):
  """Gets the bin centers from the bin edges.

@@ -108,6 +147,32 @@ def compute_predicted_aligned_error(
  }


+def pae_json(pae: np.ndarray, max_pae: float) -> str:
+  """Returns the PAE in the same format as is used in the AFDB.
+
+  Note that the values are presented as floats to 1 decimal place, whereas AFDB
+  returns integer values.
+
+  Args:
+    pae: The n_res x n_res PAE array.
+    max_pae: The maximum possible PAE value.
+
+  Returns:
+    PAE output format as a JSON string.
+  """
+  # Check the PAE array is the correct shape.
+  if pae.ndim != 2 or pae.shape[0] != pae.shape[1]:
+    raise ValueError(f'PAE must be a square matrix, got {pae.shape}')
+
+  # Round the predicted aligned errors to 1 decimal place.
+  rounded_errors = np.round(pae.astype(np.float64), decimals=1)
+  formatted_output = [{
+      'predicted_aligned_error': rounded_errors.tolist(),
+      'max_predicted_aligned_error': max_pae,
+  }]
+  return json.dumps(formatted_output, indent=None, separators=(',', ':'))
+
+
 def predicted_tm_score(
    logits: np.ndarray,
    breaks: np.ndarray,

--- a/alphafold/common/confidence_test.py
+++ b/alphafold/common/confidence_test.py
+# Copyright 2023 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Test confidence metrics."""
+
+
+from absl.testing import absltest
+from alphafold.common import confidence
+import numpy as np
+
+
+class ConfidenceTest(absltest.TestCase):
+
+  def test_pae_json(self):
+    pae = np.array([[0.01, 13.12345], [20.0987, 0.0]])
+    pae_json = confidence.pae_json(pae=pae, max_pae=31.75)
+    self.assertEqual(
+        pae_json, '[{"predicted_aligned_error":[[0.0,13.1],[20.1,0.0]],'
+        '"max_predicted_aligned_error":31.75}]')
+
+  def test_confidence_json(self):
+    plddt = np.array([42, 42.42])
+
+    confidence_json = confidence.confidence_json(plddt=plddt)
+
+    print(confidence_json)
+
+    self.assertEqual(
+        confidence_json,
+        ('{"residueNumber":[1,2],'
+         '"confidenceScore":[42.0,42.42],'
+         '"confidenceCategory":["D","D"]}'),
+    )
+
+
+if __name__ == '__main__':
+  absltest.main()
--- a/alphafold/common/mmcif_metadata.py
+++ b/alphafold/common/mmcif_metadata.py
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""mmCIF metadata."""
+
+from typing import Mapping, Sequence
+from alphafold import version
+import numpy as np
+
+
+_DISCLAIMER = """ALPHAFOLD DATA, COPYRIGHT (2021) DEEPMIND TECHNOLOGIES LIMITED.
+THE INFORMATION PROVIDED IS THEORETICAL MODELLING ONLY AND CAUTION SHOULD BE
+EXERCISED IN ITS USE. IT IS PROVIDED "AS-IS" WITHOUT ANY WARRANTY OF ANY KIND,
+WHETHER EXPRESSED OR IMPLIED. NO WARRANTY IS GIVEN THAT USE OF THE INFORMATION
+SHALL NOT INFRINGE THE RIGHTS OF ANY THIRD PARTY. DISCLAIMER: THE INFORMATION IS
+NOT INTENDED TO BE A SUBSTITUTE FOR PROFESSIONAL MEDICAL ADVICE, DIAGNOSIS, OR
+TREATMENT, AND DOES NOT CONSTITUTE MEDICAL OR OTHER PROFESSIONAL ADVICE. IT IS
+AVAILABLE FOR ACADEMIC AND COMMERCIAL PURPOSES, UNDER CC-BY 4.0 LICENCE."""
+
+# Authors of the Nature methods paper we reference in the mmCIF.
+_MMCIF_PAPER_AUTHORS = (
+    'Jumper, John',
+    'Evans, Richard',
+    'Pritzel, Alexander',
+    'Green, Tim',
+    'Figurnov, Michael',
+    'Ronneberger, Olaf',
+    'Tunyasuvunakool, Kathryn',
+    'Bates, Russ',
+    'Zidek, Augustin',
+    'Potapenko, Anna',
+    'Bridgland, Alex',
+    'Meyer, Clemens',
+    'Kohl, Simon A. A.',
+    'Ballard, Andrew J.',
+    'Cowie, Andrew',
+    'Romera-Paredes, Bernardino',
+    'Nikolov, Stanislav',
+    'Jain, Rishub',
+    'Adler, Jonas',
+    'Back, Trevor',
+    'Petersen, Stig',
+    'Reiman, David',
+    'Clancy, Ellen',
+    'Zielinski, Michal',
+    'Steinegger, Martin',
+    'Pacholska, Michalina',
+    'Berghammer, Tamas',
+    'Silver, David',
+    'Vinyals, Oriol',
+    'Senior, Andrew W.',
+    'Kavukcuoglu, Koray',
+    'Kohli, Pushmeet',
+    'Hassabis, Demis',
+)
+
+# Authors of the mmCIF - we set them to be equal to the authors of the paper.
+_MMCIF_AUTHORS = _MMCIF_PAPER_AUTHORS
+
+
+def add_metadata_to_mmcif(
+    old_cif: Mapping[str, Sequence[str]], model_type: str
+) -> Mapping[str, Sequence[str]]:
+  """Adds AlphaFold metadata in the given mmCIF."""
+  cif = {}
+
+  # ModelCIF conformation dictionary.
+  cif['_audit_conform.dict_name'] = ['mmcif_ma.dic']
+  cif['_audit_conform.dict_version'] = ['1.3.9']
+  cif['_audit_conform.dict_location'] = [
+      'https://raw.githubusercontent.com/ihmwg/ModelCIF/master/dist/'
+      'mmcif_ma.dic'
+  ]
+
+  # License and disclaimer.
+  cif['_pdbx_data_usage.id'] = ['1', '2']
+  cif['_pdbx_data_usage.type'] = ['license', 'disclaimer']
+  cif['_pdbx_data_usage.details'] = [
+      'Data in this file is available under a CC-BY-4.0 license.',
+      _DISCLAIMER,
+  ]
+  cif['_pdbx_data_usage.url'] = [
+      'https://creativecommons.org/licenses/by/4.0/',
+      '?',
+  ]
+  cif['_pdbx_data_usage.name'] = ['CC-BY-4.0', '?']
+
+  # Structure author details.
+  cif['_audit_author.name'] = []
+  cif['_audit_author.pdbx_ordinal'] = []
+  for author_index, author_name in enumerate(_MMCIF_AUTHORS, start=1):
+    cif['_audit_author.name'].append(author_name)
+    cif['_audit_author.pdbx_ordinal'].append(str(author_index))
+
+  # Paper author details.
+  cif['_citation_author.citation_id'] = []
+  cif['_citation_author.name'] = []
+  cif['_citation_author.ordinal'] = []
+  for author_index, author_name in enumerate(_MMCIF_PAPER_AUTHORS, start=1):
+    cif['_citation_author.citation_id'].append('primary')
+    cif['_citation_author.name'].append(author_name)
+    cif['_citation_author.ordinal'].append(str(author_index))
+
+  # Paper citation details.
+  cif['_citation.id'] = ['primary']
+  cif['_citation.title'] = [
+      'Highly accurate protein structure prediction with AlphaFold'
+  ]
+  cif['_citation.journal_full'] = ['Nature']
+  cif['_citation.journal_volume'] = ['596']
+  cif['_citation.page_first'] = ['583']
+  cif['_citation.page_last'] = ['589']
+  cif['_citation.year'] = ['2021']
+  cif['_citation.journal_id_ASTM'] = ['NATUAS']
+  cif['_citation.country'] = ['UK']
+  cif['_citation.journal_id_ISSN'] = ['0028-0836']
+  cif['_citation.journal_id_CSD'] = ['0006']
+  cif['_citation.book_publisher'] = ['?']
+  cif['_citation.pdbx_database_id_PubMed'] = ['34265844']
+  cif['_citation.pdbx_database_id_DOI'] = ['10.1038/s41586-021-03819-2']
+
+  # Type of data in the dataset including data used in the model generation.
+  cif['_ma_data.id'] = ['1']
+  cif['_ma_data.name'] = ['Model']
+  cif['_ma_data.content_type'] = ['model coordinates']
+
+  # Description of number of instances for each entity.
+  cif['_ma_target_entity_instance.asym_id'] = old_cif['_struct_asym.id']
+  cif['_ma_target_entity_instance.entity_id'] = old_cif[
+      '_struct_asym.entity_id'
+  ]
+  cif['_ma_target_entity_instance.details'] = ['.'] * len(
+      cif['_ma_target_entity_instance.entity_id']
+  )
+
+  # Details about the target entities.
+  cif['_ma_target_entity.entity_id'] = cif[
+      '_ma_target_entity_instance.entity_id'
+  ]
+  cif['_ma_target_entity.data_id'] = ['1'] * len(
+      cif['_ma_target_entity.entity_id']
+  )
+  cif['_ma_target_entity.origin'] = ['.'] * len(
+      cif['_ma_target_entity.entity_id']
+  )
+
+  # Details of the models being deposited.
+  cif['_ma_model_list.ordinal_id'] = ['1']
+  cif['_ma_model_list.model_id'] = ['1']
+  cif['_ma_model_list.model_group_id'] = ['1']
+  cif['_ma_model_list.model_name'] = ['Top ranked model']
+
+  cif['_ma_model_list.model_group_name'] = [
+      f'AlphaFold {model_type} v{version.__version__} model'
+  ]
+  cif['_ma_model_list.data_id'] = ['1']
+  cif['_ma_model_list.model_type'] = ['Ab initio model']
+
+  # Software used.
+  cif['_software.pdbx_ordinal'] = ['1']
+  cif['_software.name'] = ['AlphaFold']
+  cif['_software.version'] = [f'v{version.__version__}']
+  cif['_software.type'] = ['package']
+  cif['_software.description'] = ['Structure prediction']
+  cif['_software.classification'] = ['other']
+  cif['_software.date'] = ['?']
+
+  # Collection of software into groups.
+  cif['_ma_software_group.ordinal_id'] = ['1']
+  cif['_ma_software_group.group_id'] = ['1']
+  cif['_ma_software_group.software_id'] = ['1']
+
+  # Method description to conform with ModelCIF.
+  cif['_ma_protocol_step.ordinal_id'] = ['1', '2', '3']
+  cif['_ma_protocol_step.protocol_id'] = ['1', '1', '1']
+  cif['_ma_protocol_step.step_id'] = ['1', '2', '3']
+  cif['_ma_protocol_step.method_type'] = [
+      'coevolution MSA',
+      'template search',
+      'modeling',
+  ]
+
+  # Details of the metrics use to assess model confidence.
+  cif['_ma_qa_metric.id'] = ['1', '2']
+  cif['_ma_qa_metric.name'] = ['pLDDT', 'pLDDT']
+  # Accepted values are distance, energy, normalised score, other, zscore.
+  cif['_ma_qa_metric.type'] = ['pLDDT', 'pLDDT']
+  cif['_ma_qa_metric.mode'] = ['global', 'local']
+  cif['_ma_qa_metric.software_group_id'] = ['1', '1']
+
+  # Global model confidence metric value.
+  cif['_ma_qa_metric_global.ordinal_id'] = ['1']
+  cif['_ma_qa_metric_global.model_id'] = ['1']
+  cif['_ma_qa_metric_global.metric_id'] = ['1']
+  global_plddt = np.mean(
+      [float(v) for v in old_cif['_atom_site.B_iso_or_equiv']]
+  )
+  cif['_ma_qa_metric_global.metric_value'] = [f'{global_plddt:.2f}']
+
+  cif['_atom_type.symbol'] = sorted(set(old_cif['_atom_site.type_symbol']))
+
+  return cif
--- a/alphafold/common/protein.py
+++ b/alphafold/common/protein.py
@@ -13,11 +13,18 @@
 # limitations under the License.

 """Protein data type."""
+
+import collections
 import dataclasses
+import functools
 import io
-from typing import Any, Mapping, Optional
+from typing import Any, Dict, List, Mapping, Optional, Tuple
+from alphafold.common import mmcif_metadata
 from alphafold.common import residue_constants
+from Bio.PDB import MMCIFParser
 from Bio.PDB import PDBParser
+from Bio.PDB.mmcifio import MMCIFIO
+from Bio.PDB.Structure import Structure
 import numpy as np

 FeatureDict = Mapping[str, np.ndarray]
@@ -27,6 +34,32 @@ ModelOutput = Mapping[str, Any]  # Is a nested dict.
 PDB_CHAIN_IDS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
 PDB_MAX_CHAINS = len(PDB_CHAIN_IDS)  # := 62.

+# Data to fill the _chem_comp table when writing mmCIFs.
+_CHEM_COMP: Mapping[str, Tuple[Tuple[str, str], ...]] = {
+    'L-peptide linking': (
+        ('ALA', 'ALANINE'),
+        ('ARG', 'ARGININE'),
+        ('ASN', 'ASPARAGINE'),
+        ('ASP', 'ASPARTIC ACID'),
+        ('CYS', 'CYSTEINE'),
+        ('GLN', 'GLUTAMINE'),
+        ('GLU', 'GLUTAMIC ACID'),
+        ('HIS', 'HISTIDINE'),
+        ('ILE', 'ISOLEUCINE'),
+        ('LEU', 'LEUCINE'),
+        ('LYS', 'LYSINE'),
+        ('MET', 'METHIONINE'),
+        ('PHE', 'PHENYLALANINE'),
+        ('PRO', 'PROLINE'),
+        ('SER', 'SERINE'),
+        ('THR', 'THREONINE'),
+        ('TRP', 'TRYPTOPHAN'),
+        ('TYR', 'TYROSINE'),
+        ('VAL', 'VALINE'),
+    ),
+    'peptide linking': (('GLY', 'GLYCINE'),),
+}
+

 @dataclasses.dataclass(frozen=True)
 class Protein:
@@ -63,27 +96,32 @@ class Protein:
          'because these cannot be written to PDB format.')


-def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein:
-  """Takes a PDB string and constructs a Protein object.
+def _from_bio_structure(
+    structure: Structure, chain_id: Optional[str] = None
+) -> Protein:
+  """Takes a Biopython structure and creates a `Protein` instance.

  WARNING: All non-standard residue types will be converted into UNK. All
    non-standard atoms will be ignored.

  Args:
-    pdb_str: The contents of the pdb file
-    chain_id: If chain_id is specified (e.g. A), then only that chain
-      is parsed. Otherwise all chains are parsed.
+    structure: Structure from the Biopython library.
+    chain_id: If chain_id is specified (e.g. A), then only that chain is parsed.
+      Otherwise all chains are parsed.

  Returns:
-    A new `Protein` parsed from the pdb contents.
+    A new `Protein` created from the structure contents.
+
+  Raises:
+    ValueError: If the number of models included in the structure is not 1.
+    ValueError: If insertion code is detected at a residue.
  """
-  pdb_fh = io.StringIO(pdb_str)
-  parser = PDBParser(QUIET=True)
-  structure = parser.get_structure('none', pdb_fh)
  models = list(structure.get_models())
  if len(models) != 1:
    raise ValueError(
-        f'Only single model PDBs are supported. Found {len(models)} models.')
+        'Only single model PDBs/mmCIFs are supported. Found'
+        f' {len(models)} models.'
+    )
  model = models[0]

  atom_positions = []
@@ -99,8 +137,9 @@ def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein:
    for res in chain:
      if res.id[2] != ' ':
        raise ValueError(
-            f'PDB contains an insertion code at chain {chain.id} and residue '
-            f'index {res.id[1]}. These are not supported.')
+            f'PDB/mmCIF contains an insertion code at chain {chain.id} and'
+            f' residue index {res.id[1]}. These are not supported.'
+        )
      res_shortname = residue_constants.restype_3to1.get(res.resname, 'X')
      restype_idx = residue_constants.restype_order.get(
          res_shortname, residue_constants.restype_num)
@@ -137,6 +176,48 @@ def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein:
      b_factors=np.array(b_factors))


+def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein:
+  """Takes a PDB string and constructs a `Protein` object.
+
+  WARNING: All non-standard residue types will be converted into UNK. All
+    non-standard atoms will be ignored.
+
+  Args:
+    pdb_str: The contents of the pdb file
+    chain_id: If chain_id is specified (e.g. A), then only that chain is parsed.
+      Otherwise all chains are parsed.
+
+  Returns:
+    A new `Protein` parsed from the pdb contents.
+  """
+  with io.StringIO(pdb_str) as pdb_fh:
+    parser = PDBParser(QUIET=True)
+    structure = parser.get_structure(id='none', file=pdb_fh)
+    return _from_bio_structure(structure, chain_id)
+
+
+def from_mmcif_string(
+    mmcif_str: str, chain_id: Optional[str] = None
+) -> Protein:
+  """Takes a mmCIF string and constructs a `Protein` object.
+
+  WARNING: All non-standard residue types will be converted into UNK. All
+    non-standard atoms will be ignored.
+
+  Args:
+    mmcif_str: The contents of the mmCIF file
+    chain_id: If chain_id is specified (e.g. A), then only that chain is parsed.
+      Otherwise all chains are parsed.
+
+  Returns:
+    A new `Protein` parsed from the mmCIF contents.
+  """
+  with io.StringIO(mmcif_str) as mmcif_fh:
+    parser = MMCIFParser(QUIET=True)
+    structure = parser.get_structure(structure_id='none', filename=mmcif_fh)
+    return _from_bio_structure(structure, chain_id)
+
+
 def _chain_end(atom_index, end_resname, chain_name, residue_index) -> str:
  chain_end = 'TER'
  return (f'{chain_end:<6}{atom_index:>5}      {end_resname:>3} '
@@ -276,3 +357,223 @@ def from_prediction(
      residue_index=_maybe_remove_leading_dim(features['residue_index']) + 1,
      chain_index=chain_index,
      b_factors=b_factors)
+
+
+def to_mmcif(
+    prot: Protein,
+    file_id: str,
+    model_type: str,
+) -> str:
+  """Converts a `Protein` instance to an mmCIF string.
+
+  WARNING 1: The _entity_poly_seq is filled with unknown (UNK) residues for any
+    missing residue indices in the range from min(1, min(residue_index)) to
+    max(residue_index). E.g. for a protein object with positions for residues
+    2 (MET), 3 (LYS), 6 (GLY), this method would set the _entity_poly_seq to:
+    1 UNK
+    2 MET
+    3 LYS
+    4 UNK
+    5 UNK
+    6 GLY
+    This is done to preserve the residue numbering.
+
+  WARNING 2: Converting ground truth mmCIF file to Protein and then back to
+    mmCIF using this method will convert all non-standard residue types to UNK.
+    If you need this behaviour, you need to store more mmCIF metadata in the
+    Protein object (e.g. all fields except for the _atom_site loop).
+
+  WARNING 3: Converting ground truth mmCIF file to Protein and then back to
+    mmCIF using this method will not retain the original chain indices.
+
+  WARNING 4: In case of multiple identical chains, they are assigned different
+    `_atom_site.label_entity_id` values.
+
+  Args:
+    prot: A protein to convert to mmCIF string.
+    file_id: The file ID (usually the PDB ID) to be used in the mmCIF.
+    model_type: 'Multimer' or 'Monomer'.
+
+  Returns:
+    A valid mmCIF string.
+
+  Raises:
+    ValueError: If aminoacid types array contains entries with too many protein
+    types.
+  """
+  atom_mask = prot.atom_mask
+  aatype = prot.aatype
+  atom_positions = prot.atom_positions
+  residue_index = prot.residue_index.astype(np.int32)
+  chain_index = prot.chain_index.astype(np.int32)
+  b_factors = prot.b_factors
+
+  # Construct a mapping from chain integer indices to chain ID strings.
+  chain_ids = {}
+  # We count unknown residues as protein residues.
+  for entity_id in np.unique(chain_index):  # np.unique gives sorted output.
+    chain_ids[entity_id] = _int_id_to_str_id(entity_id + 1)
+
+  mmcif_dict = collections.defaultdict(list)
+
+  mmcif_dict['data_'] = file_id.upper()
+  mmcif_dict['_entry.id'] = file_id.upper()
+
+  label_asym_id_to_entity_id = {}
+  # Entity and chain information.
+  for entity_id, chain_id in chain_ids.items():
+    # Add all chain information to the _struct_asym table.
+    label_asym_id_to_entity_id[str(chain_id)] = str(entity_id)
+    mmcif_dict['_struct_asym.id'].append(chain_id)
+    mmcif_dict['_struct_asym.entity_id'].append(str(entity_id))
+    # Add information about the entity to the _entity_poly table.
+    mmcif_dict['_entity_poly.entity_id'].append(str(entity_id))
+    mmcif_dict['_entity_poly.type'].append(residue_constants.PROTEIN_CHAIN)
+    mmcif_dict['_entity_poly.pdbx_strand_id'].append(chain_id)
+    # Generate the _entity table.
+    mmcif_dict['_entity.id'].append(str(entity_id))
+    mmcif_dict['_entity.type'].append(residue_constants.POLYMER_CHAIN)
+
+  # Add the residues to the _entity_poly_seq table.
+  for entity_id, (res_ids, aas) in _get_entity_poly_seq(
+      aatype, residue_index, chain_index
+  ).items():
+    for res_id, aa in zip(res_ids, aas):
+      mmcif_dict['_entity_poly_seq.entity_id'].append(str(entity_id))
+      mmcif_dict['_entity_poly_seq.num'].append(str(res_id))
+      mmcif_dict['_entity_poly_seq.mon_id'].append(
+          residue_constants.resnames[aa]
+      )
+
+  # Populate the chem comp table.
+  for chem_type, chem_comp in _CHEM_COMP.items():
+    for chem_id, chem_name in chem_comp:
+      mmcif_dict['_chem_comp.id'].append(chem_id)
+      mmcif_dict['_chem_comp.type'].append(chem_type)
+      mmcif_dict['_chem_comp.name'].append(chem_name)
+
+  # Add all atom sites.
+  atom_index = 1
+  for i in range(aatype.shape[0]):
+    res_name_3 = residue_constants.resnames[aatype[i]]
+    if aatype[i] <= len(residue_constants.restypes):
+      atom_names = residue_constants.atom_types
+    else:
+      raise ValueError(
+          'Amino acid types array contains entries with too many protein types.'
+      )
+    for atom_name, pos, mask, b_factor in zip(
+        atom_names, atom_positions[i], atom_mask[i], b_factors[i]
+    ):
+      if mask < 0.5:
+        continue
+      type_symbol = residue_constants.atom_id_to_type(atom_name)
+
+      mmcif_dict['_atom_site.group_PDB'].append('ATOM')
+      mmcif_dict['_atom_site.id'].append(str(atom_index))
+      mmcif_dict['_atom_site.type_symbol'].append(type_symbol)
+      mmcif_dict['_atom_site.label_atom_id'].append(atom_name)
+      mmcif_dict['_atom_site.label_alt_id'].append('.')
+      mmcif_dict['_atom_site.label_comp_id'].append(res_name_3)
+      mmcif_dict['_atom_site.label_asym_id'].append(chain_ids[chain_index[i]])
+      mmcif_dict['_atom_site.label_entity_id'].append(
+          label_asym_id_to_entity_id[chain_ids[chain_index[i]]]
+      )
+      mmcif_dict['_atom_site.label_seq_id'].append(str(residue_index[i]))
+      mmcif_dict['_atom_site.pdbx_PDB_ins_code'].append('.')
+      mmcif_dict['_atom_site.Cartn_x'].append(f'{pos[0]:.3f}')
+      mmcif_dict['_atom_site.Cartn_y'].append(f'{pos[1]:.3f}')
+      mmcif_dict['_atom_site.Cartn_z'].append(f'{pos[2]:.3f}')
+      mmcif_dict['_atom_site.occupancy'].append('1.00')
+      mmcif_dict['_atom_site.B_iso_or_equiv'].append(f'{b_factor:.2f}')
+      mmcif_dict['_atom_site.auth_seq_id'].append(str(residue_index[i]))
+      mmcif_dict['_atom_site.auth_asym_id'].append(chain_ids[chain_index[i]])
+      mmcif_dict['_atom_site.pdbx_PDB_model_num'].append('1')
+
+      atom_index += 1
+
+  metadata_dict = mmcif_metadata.add_metadata_to_mmcif(mmcif_dict, model_type)
+  mmcif_dict.update(metadata_dict)
+
+  return _create_mmcif_string(mmcif_dict)
+
+
+@functools.lru_cache(maxsize=256)
+def _int_id_to_str_id(num: int) -> str:
+  """Encodes a number as a string, using reverse spreadsheet style naming.
+
+  Args:
+    num: A positive integer.
+
+  Returns:
+    A string that encodes the positive integer using reverse spreadsheet style,
+    naming e.g. 1 = A, 2 = B, ..., 27 = AA, 28 = BA, 29 = CA, ... This is the
+    usual way to encode chain IDs in mmCIF files.
+  """
+  if num <= 0:
+    raise ValueError(f'Only positive integers allowed, got {num}.')
+
+  num = num - 1  # 1-based indexing.
+  output = []
+  while num >= 0:
+    output.append(chr(num % 26 + ord('A')))
+    num = num // 26 - 1
+  return ''.join(output)
+
+
+def _get_entity_poly_seq(
+    aatypes: np.ndarray, residue_indices: np.ndarray, chain_indices: np.ndarray
+) -> Dict[int, Tuple[List[int], List[int]]]:
+  """Constructs gapless residue index and aatype lists for each chain.
+
+  Args:
+    aatypes: A numpy array with aatypes.
+    residue_indices: A numpy array with residue indices.
+    chain_indices: A numpy array with chain indices.
+
+  Returns:
+    A dictionary mapping chain indices to a tuple with list of residue indices
+    and a list of aatypes. Missing residues are filled with UNK residue type.
+  """
+  if (
+      aatypes.shape[0] != residue_indices.shape[0]
+      or aatypes.shape[0] != chain_indices.shape[0]
+  ):
+    raise ValueError(
+        'aatypes, residue_indices, chain_indices must have the same length.'
+    )
+
+  # Group the present residues by chain index.
+  present = collections.defaultdict(list)
+  for chain_id, res_id, aa in zip(chain_indices, residue_indices, aatypes):
+    present[chain_id].append((res_id, aa))
+
+  # Add any missing residues (from 1 to the first residue and for any gaps).
+  entity_poly_seq = {}
+  for chain_id, present_residues in present.items():
+    present_residue_indices = set([x[0] for x in present_residues])
+    min_res_id = min(present_residue_indices)  # Could be negative.
+    max_res_id = max(present_residue_indices)
+
+    new_residue_indices = []
+    new_aatypes = []
+    present_index = 0
+    for i in range(min(1, min_res_id), max_res_id + 1):
+      new_residue_indices.append(i)
+      if i in present_residue_indices:
+        new_aatypes.append(present_residues[present_index][1])
+        present_index += 1
+      else:
+        new_aatypes.append(20)  # Unknown amino acid type.
+    entity_poly_seq[chain_id] = (new_residue_indices, new_aatypes)
+  return entity_poly_seq
+
+
+def _create_mmcif_string(mmcif_dict: Dict[str, Any]) -> str:
+  """Converts mmCIF dictionary into mmCIF string."""
+  mmcifio = MMCIFIO()
+  mmcifio.set_dict(mmcif_dict)
+
+  with io.StringIO() as file_handle:
+    mmcifio.save(file_handle)
+    return file_handle.getvalue()
--- a/alphafold/common/protein_test.py
+++ b/alphafold/common/protein_test.py
@@ -82,16 +82,55 @@ class ProteinTest(parameterized.TestCase):
    np.testing.assert_array_almost_equal(
        prot_reconstr.b_factors, prot.b_factors)

+  @parameterized.named_parameters(
+      dict(
+          testcase_name='glucagon',
+          pdb_file='glucagon.pdb',
+          model_type='Monomer',
+      ),
+      dict(testcase_name='7bui', pdb_file='5nmu.pdb', model_type='Multimer'),
+  )
+  def test_to_mmcif(self, pdb_file, model_type):
+    with open(
+        os.path.join(
+            absltest.get_default_test_srcdir(), TEST_DATA_DIR, pdb_file
+        )
+    ) as f:
+      pdb_string = f.read()
+    prot = protein.from_pdb_string(pdb_string)
+
+    file_id = 'test'
+    mmcif_string = protein.to_mmcif(prot, file_id, model_type)
+    prot_reconstr = protein.from_mmcif_string(mmcif_string)
+
+    np.testing.assert_array_equal(prot_reconstr.aatype, prot.aatype)
+    np.testing.assert_array_almost_equal(
+        prot_reconstr.atom_positions, prot.atom_positions
+    )
+    np.testing.assert_array_almost_equal(
+        prot_reconstr.atom_mask, prot.atom_mask
+    )
+    np.testing.assert_array_equal(
+        prot_reconstr.residue_index, prot.residue_index
+    )
+    np.testing.assert_array_equal(prot_reconstr.chain_index, prot.chain_index)
+    np.testing.assert_array_almost_equal(
+        prot_reconstr.b_factors, prot.b_factors
+    )
+
  def test_ideal_atom_mask(self):
    with open(
-        os.path.join(absltest.get_default_test_srcdir(), TEST_DATA_DIR,
-                     '2rbg.pdb')) as f:
+        os.path.join(
+            absltest.get_default_test_srcdir(), TEST_DATA_DIR, '2rbg.pdb'
+        )
+    ) as f:
      pdb_string = f.read()
    prot = protein.from_pdb_string(pdb_string)
    ideal_mask = protein.ideal_atom_mask(prot)
    non_ideal_residues = set([102] + list(range(127, 286)))
    for i, (res, atom_mask) in enumerate(
-        zip(prot.residue_index, prot.atom_mask)):
+        zip(prot.residue_index, prot.atom_mask)
+    ):
      if res in non_ideal_residues:
        self.assertFalse(np.all(atom_mask == ideal_mask[i]), msg=f'{res}')
      else:

--- a/alphafold/common/residue_constants.py
+++ b/alphafold/common/residue_constants.py
@@ -17,7 +17,7 @@
 import collections
 import functools
 import os
-from typing import List, Mapping, Tuple
+from typing import Final, List, Mapping, Tuple

 import numpy as np
 import tree
@@ -609,6 +609,35 @@ restype_1to3 = {
    'V': 'VAL',
 }

+PROTEIN_CHAIN: Final[str] = 'polypeptide(L)'
+POLYMER_CHAIN: Final[str] = 'polymer'
+
+
+def atom_id_to_type(atom_id: str) -> str:
+  """Convert atom ID to atom type, works only for standard protein residues.
+
+  Args:
+    atom_id: Atom ID to be converted.
+
+  Returns:
+    String corresponding to atom type.
+
+  Raises:
+    ValueError: If atom ID not recognized.
+  """
+
+  if atom_id.startswith('C'):
+    return 'C'
+  elif atom_id.startswith('N'):
+    return 'N'
+  elif atom_id.startswith('O'):
+    return 'O'
+  elif atom_id.startswith('H'):
+    return 'H'
+  elif atom_id.startswith('S'):
+    return 'S'
+  raise ValueError('Atom ID not recognized.')
+

 # NB: restype_3to1 differs from Bio.PDB.protein_letters_3to1 by being a simple
 # 1-to-1 mapping of 3 letter names to one letter names. The latter contains

--- a/alphafold/common/testdata/5nmu.pdb
+++ b/alphafold/common/testdata/5nmu.pdb
--- a/alphafold/common/testdata/glucagon.pdb
+++ b/alphafold/common/testdata/glucagon.pdb
--- a/alphafold/data/mmcif_parsing.py
+++ b/alphafold/data/mmcif_parsing.py
@@ -315,6 +315,7 @@ def _get_header(parsed_info: MmCIFDict) -> PdbHeader:
      try:
        raw_resolution = parsed_info[res_key][0]
        header['resolution'] = float(raw_resolution)
+        break
      except ValueError:
        logging.debug('Invalid resolution format: %s', parsed_info[res_key])


--- a/alphafold/data/msa_pairing.py
+++ b/alphafold/data/msa_pairing.py
@@ -15,9 +15,7 @@
 """Pairing logic for multimer data pipeline."""

 import collections
-import functools
-import string
-from typing import Any, Dict, Iterable, List, Sequence
+from typing import cast, Dict, Iterable, List, Sequence

 from alphafold.common import residue_constants
 from alphafold.data import pipeline
@@ -135,7 +133,7 @@ def _create_species_dict(msa_df: pd.DataFrame) -> Dict[bytes, pd.DataFrame]:
  """Creates mapping from species to msa dataframe of that species."""
  species_lookup = {}
  for species, species_df in msa_df.groupby('msa_species_identifiers'):
-    species_lookup[species] = species_df
+    species_lookup[cast(bytes, species)] = species_df
  return species_lookup



--- a/alphafold/data/templates.py
+++ b/alphafold/data/templates.py
@@ -449,6 +449,7 @@ def _get_atom_positions(
    mask = np.zeros([residue_constants.atom_type_num], dtype=np.float32)
    res_at_position = mmcif_object.seqres_to_structure[auth_chain_id][res_index]
    if not res_at_position.is_missing:
+      assert res_at_position.position is not None
      res = chain[(res_at_position.hetflag,
                   res_at_position.position.residue_number,
                   res_at_position.position.insertion_code)]

--- a/alphafold/model/folding_multimer.py
+++ b/alphafold/model/folding_multimer.py
@@ -775,7 +775,7 @@ def compute_atom14_gt(
  gt_mask = (1. - use_alt) * gt_mask + use_alt * alt_gt_mask
  gt_positions = (1. - use_alt) * gt_positions + use_alt * alt_gt_positions

-  return gt_positions, alt_gt_mask, alt_naming_is_better
+  return gt_positions, gt_mask, alt_naming_is_better


 def backbone_loss(gt_rigid: geometry.Rigid3Array,

--- a/alphafold/model/geometry/test_utils.py
+++ b/alphafold/model/geometry/test_utils.py
@@ -61,9 +61,9 @@ def assert_vectors_equal(vec1: vector.Vec3Array, vec2: vector.Vec3Array):


 def assert_vectors_close(vec1: vector.Vec3Array, vec2: vector.Vec3Array):
-  np.testing.assert_allclose(vec1.x, vec2.x, atol=1e-6, rtol=0.)
-  np.testing.assert_allclose(vec1.y, vec2.y, atol=1e-6, rtol=0.)
-  np.testing.assert_allclose(vec1.z, vec2.z, atol=1e-6, rtol=0.)
+  np.testing.assert_allclose(vec1.x, vec2.x, atol=1e-5, rtol=0.)
+  np.testing.assert_allclose(vec1.y, vec2.y, atol=1e-5, rtol=0.)
+  np.testing.assert_allclose(vec1.z, vec2.z, atol=1e-5, rtol=0.)


 def assert_array_close_to_vector(array: jnp.ndarray, vec: vector.Vec3Array):

--- a/alphafold/model/prng_test.py
+++ b/alphafold/model/prng_test.py
@@ -29,8 +29,7 @@ class PrngTest(absltest.TestCase):

    raw_key = safe_key.get()

-    self.assertNotEqual(raw_key[0], init_key[0])
-    self.assertNotEqual(raw_key[1], init_key[1])
+    self.assertFalse((raw_key == init_key).all())

    with self.assertRaises(RuntimeError):
      safe_key.get()

--- a/alphafold/model/utils.py
+++ b/alphafold/model/utils.py
@@ -160,8 +160,14 @@ def padding_consistent_rng(f):
    return jax.vmap(functools.partial(grid_keys, shape=shape[1:]))(new_keys)

  def inner(key, shape, **kwargs):
+    keys = grid_keys(key, shape)
+    signature = (
+        '()->()'
+        if jax.dtypes.issubdtype(keys.dtype, jax.dtypes.prng_key)
+        else '(2)->()'
+    )
    return jnp.vectorize(
-        lambda key: f(key, shape=(), **kwargs),
-        signature='(2)->()')(
-            grid_keys(key, shape))
+        functools.partial(f, shape=(), **kwargs), signature=signature
+    )(keys)
+
  return inner
--- a/alphafold/notebooks/notebook_utils.py
+++ b/alphafold/notebooks/notebook_utils.py
@@ -13,7 +13,6 @@
 # limitations under the License.

 """Helper methods for the AlphaFold Colab notebook."""
-import json
 from typing import AbstractSet, Any, Mapping, Optional, Sequence

 from alphafold.common import residue_constants
@@ -143,31 +142,6 @@ def empty_placeholder_template_features(
  }


-def get_pae_json(pae: np.ndarray, max_pae: float) -> str:
-  """Returns the PAE in the same format as is used in the AFDB.
-
-  Note that the values are presented as floats to 1 decimal place,
-  whereas AFDB returns integer values.
-
-  Args:
-    pae: The n_res x n_res PAE array.
-    max_pae: The maximum possible PAE value.
-  Returns:
-    PAE output format as a JSON string.
-  """
-  # Check the PAE array is the correct shape.
-  if (pae.ndim != 2 or pae.shape[0] != pae.shape[1]):
-    raise ValueError(f'PAE must be a square matrix, got {pae.shape}')
-
-  # Round the predicted aligned errors to 1 decimal place.
-  rounded_errors = np.round(pae.astype(np.float64), decimals=1)
-  formatted_output = [{
-      'predicted_aligned_error': rounded_errors.tolist(),
-      'max_predicted_aligned_error': max_pae
-  }]
-  return json.dumps(formatted_output, indent=None, separators=(',', ':'))
-
-
 def check_cell_execution_order(
    cells_ran: AbstractSet[int], cell_number: int) -> None:
  """Check that the cell execution order is correct.