Use pLDDT in the B-factor column of the output PDBs.

PiperOrigin-RevId: 390566020 Change-Id: I3fafbe8246d0a5ad018f0398b39bf7dacee00468

Use pLDDT in the B-factor column of the output PDBs.
PiperOrigin-RevId: 390566020 Change-Id: I3fafbe8246d0a5ad018f0398b39bf7dacee00468
cef198e0 · Augustin Zidek · Copybara-Service · f65e94fc · cef198e0 · cef198e0
Commit cef198e0 authored Aug 13, 2021 by Augustin Zidek Committed by Copybara-Service Aug 13, 2021
Hide whitespace changes
Inline Side-by-side

Showing with 32 additions and 4 deletions

README.md README.md +4 -0

run_alphafold.py run_alphafold.py +11 -3

run_alphafold_test.py run_alphafold_test.py +17 -1

No files found.
--- a/README.md
+++ b/README.md
@@ -273,6 +273,10 @@ The contents of each output file are as follows:
        serve for a visualisation of domain packing confidence within the
        structure.

+The pLDDT confidence measure is stored in the B-factor field of the output PDB
+files (although, unlike a B-factor, a higher pLDDT is better, so care must be
+taken when using it for tasks such as molecular replacement).
+
 This code has been tested to match mean top-1 accuracy on a CASP14 test set with
 pLDDT ranking over 5 model predictions (some CASP targets were run with earlier
 versions of AlphaFold and some had manual interventions; see our forthcoming

--- a/run_alphafold.py
+++ b/run_alphafold.py
@@ -26,6 +26,7 @@ from absl import app
 from absl import flags
 from absl import logging
 from alphafold.common import protein
+from alphafold.common import residue_constants
 from alphafold.data import pipeline
 from alphafold.data import templates
 from alphafold.model import data
@@ -158,15 +159,22 @@ def predict_structure(
      timings[f'predict_benchmark_{model_name}'] = time.time() - t_0

    # Get mean pLDDT confidence metric.
-    plddts[model_name] = np.mean(prediction_result['plddt'])
+    plddt = prediction_result['plddt']
+    plddts[model_name] = np.mean(plddt)

    # Save the model outputs.
    result_output_path = os.path.join(output_dir, f'result_{model_name}.pkl')
    with open(result_output_path, 'wb') as f:
      pickle.dump(prediction_result, f, protocol=4)

-    unrelaxed_protein = protein.from_prediction(processed_feature_dict,
-                                                prediction_result)
+    # Store the predicted LDDT in the B-factor column.
+    # Note that a higher predicted LDDT value means higher model confidence.
+    plddt_b_factors = np.repeat(
+        plddt[:, None], residue_constants.atom_type_num, axis=-1)
+    unrelaxed_protein = protein.from_prediction(
+        features=processed_feature_dict,
+        result=prediction_result,
+        b_factors=plddt_b_factors)

    unrelaxed_pdb_path = os.path.join(output_dir, f'unrelaxed_{model_name}.pdb')
    with open(unrelaxed_pdb_path, 'w') as f:

--- a/run_alphafold_test.py
+++ b/run_alphafold_test.py
@@ -45,7 +45,7 @@ class RunAlphafoldTest(parameterized.TestCase):
        'predicted_lddt': {
            'logits': np.ones((10, 50)),
        },
-        'plddt': np.zeros(10),
+        'plddt': np.ones(10) * 42,
        'ptm': np.array(0.),
        'aligned_confidence_probs': np.zeros((10, 10, 50)),
        'predicted_aligned_error': np.zeros((10, 10)),
@@ -71,6 +71,22 @@ class RunAlphafoldTest(parameterized.TestCase):
        benchmark=False,
        random_seed=0)

+    base_output_files = os.listdir(out_dir)
+    self.assertIn('target.fasta', base_output_files)
+    self.assertIn('test', base_output_files)
+
+    target_output_files = os.listdir(os.path.join(out_dir, 'test'))
+    self.assertSequenceEqual(
+        ['features.pkl', 'msas', 'ranked_0.pdb', 'ranking_debug.json',
+         'relaxed_model1.pdb', 'result_model1.pkl', 'timings.json',
+         'unrelaxed_model1.pdb'], target_output_files)
+
+    # Check that pLDDT is set in the B-factor column.
+    with open(os.path.join(out_dir, 'test', 'unrelaxed_model1.pdb')) as f:
+      for line in f:
+        if line.startswith('ATOM'):
+          self.assertEqual(line[61:66], '42.00')
+

 if __name__ == '__main__':
  absltest.main()