Source code for genome_entropy.translate.translator

"""Translation of nucleotide sequences to amino acids."""

from dataclasses import dataclass
from typing import List

import PyGeneticCode

from ..config import DEFAULT_GENETIC_CODE_TABLE
from ..errors import TranslationError
from ..logging_config import get_logger
from ..orf.types import OrfRecord

logger = get_logger(__name__)


@dataclass
class ProteinRecord:
    """A protein sequence paired with the ORF it was translated from.

    Attributes:
        orf: The source OrfRecord.
        aa_sequence: The amino acid sequence as a string.
        aa_length: Declared length of ``aa_sequence``; checked on creation.
    """

    orf: OrfRecord
    aa_sequence: str
    aa_length: int

    def __post_init__(self) -> None:
        """Reject records whose declared length disagrees with the sequence.

        Raises:
            ValueError: If ``aa_length`` differs from ``len(aa_sequence)``.
        """
        actual_length = len(self.aa_sequence)
        if self.aa_length == actual_length:
            return
        raise ValueError(
            f"aa_length {self.aa_length} doesn't match sequence length "
            f"{actual_length}"
        )
def translate_orf(
    orf: OrfRecord, table_id: int = DEFAULT_GENETIC_CODE_TABLE
) -> ProteinRecord:
    """Translate an ORF to a protein sequence.

    The nucleotide sequence is translated with ``PyGeneticCode.translate``.
    The raw translation is then compared against ``orf.aa_sequence`` (the
    sequence supplied with the ORF); any difference is treated as a failure.
    Finally a single trailing stop symbol (``*``) is removed, if present.

    NOTE(review): the original documentation states that ambiguous codons
    (containing N or other IUPAC codes) are translated to 'X' by the
    library -- that behaviour is not visible in this function; confirm
    against the library.

    Args:
        orf: OrfRecord to translate.
        table_id: NCBI genetic code table ID (default: from config).

    Returns:
        ProteinRecord holding the translated sequence (without a trailing
        ``*``) and its length.

    Raises:
        TranslationError: If the translation call raises, or if the
            translated sequence differs from ``orf.aa_sequence``. The
            underlying exception is attached as ``__cause__``.
    """
    logger.debug(
        "Translating ORF %s (length=%d nt) with table %d",
        orf.orf_id,
        len(orf.nt_sequence),
        table_id,
    )

    try:
        # Translate sequence
        aa_sequence = PyGeneticCode.translate(orf.nt_sequence, table_id)

        # The comparison uses the raw translation, i.e. before any trailing
        # stop symbol is stripped below.
        if aa_sequence != orf.aa_sequence:
            errmsg = f"""
            Translated amino acid sequence is not the same as read from the file:
            Provided: {orf.aa_sequence}
            Translated: {aa_sequence}
            DNA sequence: {orf.nt_sequence}
            Location:
            Start: {orf.start}
            Stop: {orf.end}
            Frame: {orf.frame}
            Strand: {orf.strand}
            """
            logger.error("Translation mismatch for ORF %s", orf.orf_id)
            # Raised inside the try on purpose: the handler below converts
            # it into the TranslationError that callers expect.
            raise ValueError(errmsg)

        # Remove stop codon (*) if present at the end
        if aa_sequence.endswith("*"):
            aa_sequence = aa_sequence[:-1]

        logger.debug(
            "Successfully translated ORF %s to %d amino acids",
            orf.orf_id,
            len(aa_sequence),
        )

        return ProteinRecord(
            orf=orf,
            aa_sequence=aa_sequence,
            aa_length=len(aa_sequence),
        )

    except Exception as e:
        logger.error("Failed to translate ORF %s: %s", orf.orf_id, e)
        # Chain explicitly so the original failure is recorded as the cause
        # rather than as an incidental "during handling" context.
        raise TranslationError(
            f"Failed to translate ORF {orf.orf_id}: {e}"
        ) from e
def translate_orfs(
    orfs: List[OrfRecord], table_id: int = DEFAULT_GENETIC_CODE_TABLE
) -> List[ProteinRecord]:
    """Translate a batch of ORFs, one ``translate_orf`` call per record.

    Args:
        orfs: List of OrfRecord objects to translate.
        table_id: NCBI genetic code table ID passed to every translation.

    Returns:
        List of ProteinRecord objects, in the same order as ``orfs``.
    """
    orf_count = len(orfs)
    logger.info("Translating %d ORF(s) with table %d", orf_count, table_id)

    translated: List[ProteinRecord] = []
    for record in orfs:
        # Any exception from translate_orf propagates and aborts the batch.
        translated.append(translate_orf(record, table_id=table_id))

    logger.info("Successfully translated %d ORF(s) to proteins", len(translated))
    return translated