# Source code for genome_entropy.pipeline.types

"""Unified data types for pipeline output format.

This module defines the unified feature structure that eliminates redundancy
by consolidating ORF, protein, and 3Di data into a single hierarchical format.

The unified structure addresses the problem where:
- The old `proteins` list duplicated entire ORF objects
- The old `three_dis` list duplicated entire protein objects (which contained ORFs)
- Each level repeated sequences, coordinates, and metadata

The new structure stores each piece of biological information exactly once,
organized hierarchically by biological concept.
"""

from dataclasses import dataclass
from typing import Dict, Literal



[docs]
@dataclass
class FeatureLocation:
    """Where a feature (ORF) sits on its parent sequence.

    Coordinates form a half-open, 0-based interval: ``start`` is included
    and ``end`` is not.

    Attributes:
        start: 0-based position of the first base of the feature (inclusive).
        end: 0-based position just past the feature (exclusive).
        strand: Orientation of the feature, either ``'+'`` or ``'-'``.
        frame: Reading frame, documented as one of 0, 1, 2, or 3.
    """

    # Interval bounds; no validation is performed when an instance is built.
    start: int
    end: int
    # Only the two literal strand symbols are accepted by type checkers.
    strand: Literal["+", "-"]
    # NOTE(review): four frame values are documented above; confirm the valid
    # range against the code that produces these objects.
    frame: int




[docs]
@dataclass
class FeatureDNA:
    """Nucleotide-level view of a feature.

    Attributes:
        nt_sequence: The feature's nucleotide sequence.
        length: Length of the nucleotide sequence.
    """

    nt_sequence: str
    # Held as its own field: this class neither computes nor checks it
    # against ``nt_sequence``.
    length: int




[docs]
@dataclass
class FeatureProtein:
    """Amino-acid-level view of a feature.

    Attributes:
        aa_sequence: The feature's amino acid sequence.
        length: Length of the amino acid sequence.
    """

    aa_sequence: str
    # Held as its own field: this class neither computes nor checks it
    # against ``aa_sequence``.
    length: int




[docs]
@dataclass
class FeatureThreeDi:
    """3Di structural-alphabet encoding of a feature, with its provenance.

    Alongside the encoded sequence, this record keeps how the encoding was
    produced: the method, the model, and the device used for inference.

    Attributes:
        encoding: Sequence of 3Di tokens.
        length: Length of the 3Di sequence.
        method: Encoding method, e.g. ``"prostt5_aa2fold"``.
        model_name: Name of the model that generated the encoding.
        inference_device: Device inference ran on (``"cuda"``, ``"mps"``,
            or ``"cpu"``).
    """

    # The encoded sequence and its stored (not derived) length.
    encoding: str
    length: int
    # Provenance: plain strings, not restricted to a fixed set by the type.
    method: str
    model_name: str
    inference_device: str




[docs]
@dataclass
class FeatureMetadata:
    """Descriptive flags and identifiers attached to a feature.

    Attributes:
        parent_id: Identifier of the DNA sequence the feature belongs to.
        table_id: ID of the NCBI genetic code table that was used.
        has_start_codon: True when the ORF has a start codon.
        has_stop_codon: True when the ORF has a stop codon.
        in_genbank: True when the ORF matches a CDS annotated in GenBank.
    """

    # Origin of the feature and the genetic code table used.
    parent_id: str
    table_id: int
    # Boolean properties of the ORF.
    has_start_codon: bool
    has_stop_codon: bool
    in_genbank: bool




[docs]
@dataclass
class FeatureEntropy:
    """Shannon entropy of a feature at each representation level.

    The values are stored only; nothing in this class computes them.

    Attributes:
        dna_entropy: Shannon entropy of the nucleotide sequence.
        protein_entropy: Shannon entropy of the amino acid sequence.
        three_di_entropy: Shannon entropy of the 3Di encoding.
    """

    # One value per representation: DNA, protein, then 3Di.
    dna_entropy: float
    protein_entropy: float
    three_di_entropy: float




[docs]
@dataclass
class UnifiedFeature:
    """Everything known about one ORF, gathered into a single record.

    The older output format repeated data: the proteins list carried copies
    of the ORF data, and the three_dis list carried copies of the protein
    data. This class instead groups each kind of information into its own
    sub-record, all under one ``orf_id``.

    Attributes:
        orf_id: Unique identifier of the feature.
        location: Genomic coordinates of the feature.
        dna: DNA sequence information.
        protein: Protein sequence information.
        three_di: 3Di structural encoding.
        metadata: Additional metadata about the feature.
        entropy: Entropy values for every representation level.
    """

    orf_id: str
    # Position of the feature on its parent sequence.
    location: FeatureLocation
    # One sub-record per representation: DNA, protein, 3Di.
    dna: FeatureDNA
    protein: FeatureProtein
    three_di: FeatureThreeDi
    # Descriptive metadata and the stored entropy values.
    metadata: FeatureMetadata
    entropy: FeatureEntropy




[docs]
@dataclass
class UnifiedPipelineResult:
    """Output of a full DNA-to-3Di pipeline run, in the unified format.

    The unified format keeps one dictionary of features keyed by ``orf_id``
    in place of separate parallel lists for orfs, proteins, and three_dis,
    which removes the redundancy of the older layout.

    Attributes:
        schema_version: Output schema version, kept for compatibility
            tracking.
        input_id: Identifier of the input DNA sequence.
        input_dna_length: Length of the input DNA sequence.
        dna_entropy_global: Entropy of the whole input DNA sequence.
        alphabet_sizes: Alphabet size for each representation, keyed by
            string.
        features: ``UnifiedFeature`` objects keyed by their ``orf_id``.
    """

    schema_version: str
    # Facts about the input sequence as a whole.
    input_id: str
    input_dna_length: int
    dna_entropy_global: float
    # Alphabet sizes per representation.
    alphabet_sizes: Dict[str, int]
    # Per-ORF results, keyed by orf_id.
    features: Dict[str, UnifiedFeature]