Skip to main content

Entities

Entities are the core data objects that tools consume and produce. They provide a consistent interface for working with biological data: protein structures, small molecules, and antibodies.
python
from proto_tools.entities import Structure, Ligands, Fragment

Structure

Represents a 3D molecular structure (protein, RNA, DNA, or complex). Supports both PDB and mmCIF formats.

Creating Structures

python
from proto_tools.entities import Structure

# From a file path
structure = Structure.from_file("protein.pdb")
structure = Structure.from_file("protein.cif")

# From a raw PDB/CIF string
structure = Structure(structure=pdb_string)

# With metadata (the structure= field accepts a file path or raw content)
structure = Structure(
    structure="prediction.cif",
    b_factor_type="pLDDT",
    source="esmfold",
)

Inspecting Structures

python
# Chain information
structure.get_chain_ids()              # ["A", "B"]
structure.get_chain_sequences()        # {"A": "MKTAYL...", "B": "GIVEQ..."}
structure.get_chain_sequence("A")      # "MKTAYL..."
structure.get_chain_types()            # {"A": "polymer", "B": "polymer"}
structure.num_chains                   # 2
structure.num_residues                 # 350

# Residue positions (1-indexed)
structure.get_chain_positions("A")     # [1, 2, 3, ..., 200]

# Raw content
structure.structure_format             # "cif" or "pdb"
structure.structure_pdb                # PDB format string
structure.structure_cif                # CIF format string

# Custom metrics (Metrics object; access by attribute, [], or .get())
structure.metrics.avg_plddt            # 0.85 (tool-populated; ESMFold/AlphaFold use a 0-1 scale)
structure.add_metric("rmsd", 1.23)

Saving Structures

python
structure.write_cif("output.cif")   # Preferred format
structure.write_pdb("output.pdb")   # PDB has format limitations

Visualizing Structures

python
# Interactive 3D visualization (in Jupyter notebooks)
structure.visualize(
    style="cartoon",        # "cartoon", "stick", "line", "sphere"
    color_by="bfactor",     # "bfactor" or "chain"
)

B-Factor Types

Structures carry a b_factor_type annotation that describes what the B-factor column represents:

| Type | Description |
| --- | --- |
| pLDDT | Per-residue confidence (0-100 scale) |
| normalized_pLDDT | Per-residue confidence (0-1 scale) |
| confidence | General confidence score |
| temperature_factor | Crystallographic B-factors |
| unknown | B-factor column present but its meaning could not be determined |
| unspecified | Default when not set |

Tools that produce structures (e.g., ESMFold, AlphaFold3) automatically set the appropriate B-factor type.

Structure Utilities

python
from proto_tools.entities import (
    load_structure_file,
    detect_structure_format,
    is_valid_structure,
    convert_pdb_str_to_cif_str,
    convert_cif_str_to_pdb_str,
)

# Format detection and validation
content = load_structure_file("protein.pdb")
fmt = detect_structure_format(content)         # "pdb"
valid = is_valid_structure("protein.pdb")      # True

# Format conversion
cif_str = convert_pdb_str_to_cif_str(pdb_str)
pdb_str = convert_cif_str_to_pdb_str(cif_str)

StructureEnsemble

Represents multiple conformations of the same sequence, produced by tools like BioEmu or structure sampling methods:
python
from proto_tools.entities import StructureEnsemble

ensemble = StructureEnsemble(
    structures=[struct1, struct2, struct3],
    sequence="MKTAYLLIGL..."
)

for structure in ensemble.structures:
    print(f"pLDDT: {structure.metrics.avg_plddt}")

Ligands

Represents small molecules (drugs, metabolites, cofactors). Built on RDKit.

Fragment (Single Molecule)

python
from proto_tools.entities import Fragment

# From SMILES
ethanol = Fragment(smiles="CCO", name="ethanol")
ibuprofen = Fragment(smiles="CC(C)CC1=CC=C(C=C1)C(C)C(O)=O")

# Properties
ethanol.smiles              # "CCO" (canonical)
ethanol.name                # "ethanol" (defaults to None unless passed; see lookup_name_via_pubchem)

# Generate 3D conformers
ethanol.generate_conformers(num_conformers=5)
ethanol.conformers           # List of 3D conformers

# Visualize
ethanol.visualize(style="stick")

Ligands (Collection)

python
from proto_tools.entities import Ligands

# From SMILES (dot-separated for multiple molecules)
ligands = Ligands(smiles="CCO.CC(C)O.CCCO")

# From a file
ligands = Ligands.from_file("molecules.smi")    # SMILES file
ligands = Ligands.from_file("molecules.sdf")    # SDF file

# From Fragment objects
ligands = Ligands(fragments=[Fragment(smiles="CCO"), Fragment(smiles="CC(C)O")])

# List-like interface
len(ligands)                # 3 (for the three-molecule SMILES example above)
ligands[0]                  # First Fragment
for frag in ligands:
    print(frag.smiles)

# Export
ligands.to_smi("output.smi")
ligands.to_sdf("output.sdf")
ligands.to_pdb("output.pdb")

Ligand Utilities

python
from proto_tools.entities import (
    is_smiles_valid,
    lookup_name_via_pubchem,
    lookup_smiles_via_pubchem,
    map_smiles_to_ccd_code,
    map_ccd_code_to_smiles,
)

# Validation
is_smiles_valid("CCO")                        # True
is_smiles_valid("not_a_smiles")               # False

# Name lookups (via PubChem)
lookup_name_via_pubchem("CCO")                # "ethanol"
lookup_smiles_via_pubchem("aspirin")          # "CC(=O)OC1=CC=CC=C1C(O)=O"

# CCD database mapping (wwPDB Chemical Component Dictionary)
map_smiles_to_ccd_code("CCO")                 # "EOH"
map_ccd_code_to_smiles("ATP")                 # SMILES for ATP

How Tools Use Entities

Tools consume and produce entities through their Input and Output Pydantic models:
python
from proto_tools import run_esmfold, ESMFoldInput, ESMFoldConfig, Complex, Chain

# Run prediction
output = run_esmfold(
    ESMFoldInput(
        complexes=[
            Complex(
                chains=[Chain(sequence="MKTAYLLIGLLAIAAFSPQVLA")]
            )
        ]
    )
)

# Output contains Structure entities
structure = output.structures[0]          # Structure object
structure.get_chain_sequences()           # {"A": "MKTAYLLIGLLAIAAFSPQVLA"}
structure.metrics.avg_plddt               # 0.82 (Metrics object, 0-1 scale)
structure.write_cif("predicted.cif")

Built-in Example

A GFP structure is included for testing:
python
from proto_tools.entities import get_gfp_structure, GFP_CIF_PATH

gfp = get_gfp_structure()
print(f"GFP has {gfp.num_residues} residues")

Next Steps

Tools

How tools consume and produce entities

Quickstart

Run your first tool