Source code for genome_entropy.io.genbank

"""GenBank file reading and parsing utilities."""

import gzip
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Union

from Bio import SeqIO

from ..logging_config import get_logger

logger = get_logger(__name__)


[docs] @dataclass class GenBankCDS: """Represents a CDS (Coding Sequence) feature from GenBank. Attributes: parent_id: ID of the parent sequence start: 0-based start position (inclusive) end: 0-based end position (exclusive) strand: Strand orientation ('+' or '-') protein_sequence: Translated protein sequence """ parent_id: str start: int end: int strand: str protein_sequence: str
[docs] def read_genbank(genbank_path: Union[str, Path]) -> Dict[str, str]: """Read a GenBank file and return a dictionary of sequence_id -> DNA sequence. Automatically detects and handles gzipped files (ending in .gz). Args: genbank_path: Path to GenBank file (plain text or gzipped) Returns: Dictionary mapping sequence IDs to DNA sequences Raises: FileNotFoundError: If the GenBank file doesn't exist ValueError: If the GenBank file is malformed """ genbank_path = Path(genbank_path) logger.info("Reading GenBank file: %s", genbank_path) if not genbank_path.exists(): logger.error("GenBank file not found: %s", genbank_path) raise FileNotFoundError(f"GenBank file not found: {genbank_path}") sequences = {} try: # Auto-detect gzipped files by extension is_gzipped = str(genbank_path).endswith(".gz") open_func = gzip.open if is_gzipped else open mode = "rt" if is_gzipped else "r" with open_func(genbank_path, mode) as handle: for record in SeqIO.parse(handle, "genbank"): seq_id = record.id dna_sequence = str(record.seq).upper() sequences[seq_id] = dna_sequence logger.debug( "Read sequence '%s' (length=%d)", seq_id, len(dna_sequence) ) except Exception as e: logger.error("Failed to parse GenBank file: %s", e) raise ValueError(f"Failed to parse GenBank file: {e}") if not sequences: logger.error("No sequences found in GenBank file: %s", genbank_path) raise ValueError(f"No sequences found in GenBank file: {genbank_path}") logger.info( "Successfully read %d sequence(s) from %s", len(sequences), genbank_path ) return sequences
[docs] def extract_cds_features(genbank_path: Union[str, Path]) -> List[GenBankCDS]: """Extract CDS features from a GenBank file. Automatically detects and handles gzipped files (ending in .gz). Args: genbank_path: Path to GenBank file (plain text or gzipped) Returns: List of GenBankCDS objects Raises: FileNotFoundError: If the GenBank file doesn't exist ValueError: If the GenBank file is malformed """ genbank_path = Path(genbank_path) logger.info("Extracting CDS features from GenBank file: %s", genbank_path) if not genbank_path.exists(): logger.error("GenBank file not found: %s", genbank_path) raise FileNotFoundError(f"GenBank file not found: {genbank_path}") cds_features = [] try: # Auto-detect gzipped files by extension is_gzipped = str(genbank_path).endswith(".gz") open_func = gzip.open if is_gzipped else open mode = "rt" if is_gzipped else "r" with open_func(genbank_path, mode) as handle: for record in SeqIO.parse(handle, "genbank"): seq_id = record.id for feature in record.features: if feature.type != "CDS": continue # Get protein translation if available protein_seq = "" if "translation" in feature.qualifiers: protein_seq = feature.qualifiers["translation"][0] # Convert strand strand = "+" if feature.location.strand == 1 else "-" # BioPython uses 0-based coordinates (inclusive start, exclusive end) start = int(feature.location.start) end = int(feature.location.end) cds = GenBankCDS( parent_id=seq_id, start=start, end=end, strand=strand, protein_sequence=protein_seq, ) cds_features.append(cds) logger.debug( "Extracted CDS: %s %s:%d-%d (protein_len=%d)", seq_id, strand, start, end, len(protein_seq), ) except Exception as e: logger.error("Failed to extract CDS features: %s", e) raise ValueError(f"Failed to extract CDS features: {e}") logger.info("Extracted %d CDS feature(s)", len(cds_features)) return cds_features
[docs] def match_orf_to_genbank_cds( orf_aa_sequence: str, genbank_cds_list: List[GenBankCDS], min_c_terminal_match: int = 10, ) -> bool: """Check if an ORF matches any GenBank CDS by C-terminal sequence. Matches are determined by comparing the C-terminal (end) sequences of the protein sequences. This accounts for cases where the predicted ORF may not exactly match the annotated CDS start position. Args: orf_aa_sequence: Amino acid sequence of the ORF genbank_cds_list: List of CDS features from GenBank min_c_terminal_match: Minimum length of C-terminal sequence to match (default: 10) Returns: True if the ORF C-terminal matches any GenBank CDS, False otherwise """ if not orf_aa_sequence or not genbank_cds_list: return False # Remove stop codon (*) from ORF sequence for comparison orf_seq_clean = orf_aa_sequence.rstrip("*") # Get C-terminal sequence (last N amino acids) orf_c_terminal = orf_seq_clean[-min_c_terminal_match:] # If ORF is shorter than min match length, use the whole sequence if len(orf_seq_clean) < min_c_terminal_match: orf_c_terminal = orf_seq_clean # Check against all GenBank CDS features for cds in genbank_cds_list: if not cds.protein_sequence: continue cds_seq_clean = cds.protein_sequence.rstrip("*") # Get C-terminal of CDS cds_c_terminal = cds_seq_clean[-min_c_terminal_match:] # If CDS is shorter than min match length, use the whole sequence if len(cds_seq_clean) < min_c_terminal_match: cds_c_terminal = cds_seq_clean # Check if C-terminals match if orf_c_terminal == cds_c_terminal: logger.debug( "ORF C-terminal matches GenBank CDS (match_length=%d)", len(orf_c_terminal), ) return True return False