Working implementation for 1 chain case

ba8de687 · Jose Duarte · 7bc13cfc · ba8de687
Commit ba8de687 authored Feb 20, 2023 by Jose Duarte
Show whitespace changes
Inline Side-by-side

Showing with 76 additions and 21 deletions

openfold/np/protein.py openfold/np/protein.py +76 -21

No files found.
--- a/openfold/np/protein.py
+++ b/openfold/np/protein.py
@@ -402,6 +402,10 @@ def to_modelcif(prot: Protein) -> str:
      ModelCIF string.
    """
+    restypes = residue_constants.restypes + ["X"]
+    res_1to3 = lambda r: residue_constants.restype_1to3.get(restypes[r], "UNK")
+    atom_types = residue_constants.atom_types
    atom_mask = prot.atom_mask
    aatype = prot.aatype
    atom_positions = prot.atom_positions
@@ -409,16 +413,14 @@ def to_modelcif(prot: Protein) -> str:
    b_factors = prot.b_factors
    chain_index = prot.chain_index
-    system = modelcif.System(title='Ligand example')
+    system = modelcif.System(title='OpenFold prediction')
-    # Describe the amino acid (polymer) sequences as Entity objects, for both
+    # sequence into entity
-    # template and model:
+    n = aatype.shape[0]
-    model_e = modelcif.Entity('AYVINDSCIA', description='Model subunit')
+    seq = [restypes[aatype[i]] for i in range(n)]
+    model_e = modelcif.Entity("".join(seq), description='Model subunit')
-    # Define the model assembly, as two AsymUnits. NonPolymerFromTemplate is a
+    # Define the model assembly
-    # subclass of AsymUnit that additionally notes the Template from which it
-    # was derived. In this case we state that the ligand was simply copied from
-    # the template into the target (explicit=False):
    asymA = modelcif.AsymUnit(model_e, details='Model subunit A', id='A')
    modeled_assembly = modelcif.Assembly((asymA,), name='Modeled assembly')
@@ -426,11 +428,55 @@ def to_modelcif(prot: Protein) -> str:
        asym_unit_map = {'A': asymA}
        def get_atoms(self):
-            for asym, seq_id, type_symbol, atom_id, x, y, z in atoms:
+            prev_chain_index = 0
+            chain_tags = string.ascii_uppercase
+            # Add all atom sites.
+            for i in range(n):
+                res_name_3 = res_1to3(aatype[i])
+                for atom_name, pos, mask, b_factor in zip(
+                        atom_types, atom_positions[i], atom_mask[i], b_factors[i]
+                ):
+                    if mask < 0.5:
+                        continue
+                    name = atom_name if len(atom_name) == 4 else f" {atom_name}"
+                    element = atom_name[0]  # Protein supports only C, N, O, S, this works.
+                    chain_tag = "A"
+                    if chain_index is not None:
+                        chain_tag = chain_tags[chain_index[i]]
+                    # TODO check that residue indices are ok
+                    # TODO how to set residue types? do they come from the entity sequence set above?
                    yield modelcif.model.Atom(
-                    asym_unit=self.asym_unit_map[asym], type_symbol=type_symbol,
+                        asym_unit=asymA, type_symbol=element,
-                    seq_id=seq_id, atom_id=atom_id, x=x, y=y, z=z,
+                        seq_id=residue_index[i], atom_id=name,
-                    het=seq_id is None)
+                        x=pos[0], y=pos[1], z=pos[2],
+                        het=False, biso=b_factor, occupancy=1.00)
+                # TODO multiple chains
+                # should_terminate = (i == n - 1)
+                # if(chain_index is not None):
+                #     if(i != n - 1 and chain_index[i + 1] != prev_chain_index):
+                #         should_terminate = True
+                #         prev_chain_index = chain_index[i + 1]
+                #
+                # if(should_terminate):
+                #     # Close the chain.
+                #     chain_end = "TER"
+                #     chain_termination_line = (
+                #         f"{chain_end:<6}{atom_index:>5}      "
+                #         f"{res_1to3(aatype[i]):>3} "
+                #         f"{chain_tag:>1}{residue_index[i]:>4}"
+                #     )
+                #     pdb_lines.append(chain_termination_line)
+                #     atom_index += 1
+                #
+                #     if(i != n - 1):
+                #         # "prev" is a misnomer here. This happens at the beginning of
+                #         # each new chain.
+                #         pdb_lines.extend(get_pdb_headers(prot, prev_chain_index))
    # Add the model and modeling protocol to the file and write them out:
    model = MyModel(assembly=modeled_assembly, name='Best scoring model')
@@ -442,10 +488,10 @@ def to_modelcif(prot: Protein) -> str:
    # protocol.steps.append(modelcif.protocol.ModelingStep(
    #     input_data=aln, output_data=model))
    system.protocols.append(protocol)
+    fh = io.StringIO()
-    with open('/Users/jose/output3.cif', 'w') as fh:
    modelcif.dumper.write(fh, [system])
+    modelcifstr = fh.getvalue()
+    return modelcifstr
 def ideal_atom_mask(prot: Protein) -> np.ndarray:
@@ -499,3 +545,12 @@ def from_prediction(
        parents=parents,
        parents_chain_index=parents_chain_index,
    )
+if __name__ == "__main__":
+    with open('/home/jose/Downloads/171l.pdb', 'r') as file:
+        pdbstr = file.read()
+    prot = from_pdb_string(pdbstr)
+    cifstr = to_modelcif(prot)
+    print(cifstr)
\ No newline at end of file