""" Medical Terminology Module for HBV (Hepatitis B Virus) This module provides intelligent handling of HBV medical linguistic variability including: - Synonyms and alternate terms - Abbreviations and acronyms (with context awareness) - Regional spelling variations (US/UK/International) - Specialty-specific terminology - Dynamic learning from corpus """ import re import json from typing import List, Dict, Set, Tuple, Optional from collections import defaultdict from pathlib import Path from .config import logger # ============================================================================ # CORE HBV MEDICAL TERMINOLOGY MAPPINGS # ============================================================================ # Common HBV medical abbreviations with context-aware expansions MEDICAL_ABBREVIATIONS = { # HBV Terminology "hbv": ["hepatitis b virus", "hepatitis b"], "hbsag": ["hepatitis b surface antigen", "hbs antigen"], "hbeag": ["hepatitis b e antigen", "hbe antigen"], "hbcag": ["hepatitis b core antigen"], "anti-hbs": ["antibody to hepatitis b surface antigen", "anti-hbs antibody"], "anti-hbe": ["antibody to hepatitis b e antigen"], "anti-hbc": ["antibody to hepatitis b core antigen"], "hbv dna": ["hepatitis b virus dna", "hbv viral load"], # Liver Disease Terms "alt": ["alanine aminotransferase", "alanine transaminase", "sgpt"], "ast": ["aspartate aminotransferase", "aspartate transaminase", "sgot"], "alp": ["alkaline phosphatase"], "ggt": ["gamma-glutamyl transferase", "gamma glutamyl transpeptidase"], "inr": ["international normalized ratio"], "pt": ["prothrombin time"], "apri": ["ast to platelet ratio index"], "fib-4": ["fibrosis-4 index"], # Fibrosis Staging "f0": ["no fibrosis"], "f1": ["mild fibrosis", "portal fibrosis"], "f2": ["moderate fibrosis"], "f3": ["severe fibrosis", "advanced fibrosis"], "f4": ["cirrhosis", "liver cirrhosis"], # Necroinflammatory Activity "a0": ["no activity"], "a1": ["mild activity"], "a2": ["moderate activity"], "a3": ["severe activity"], # Treatment Terms "etv": ["entecavir"], "tdf": ["tenofovir disoproxil fumarate", "tenofovir df"], "taf": ["tenofovir alafenamide"], "lam": ["lamivudine", "3tc"], "adv": ["adefovir", "adefovir dipivoxil"], "ldv": ["telbivudine"], "peg-ifn": ["pegylated interferon", "peginterferon"], "ifn": ["interferon"], # Complications "hcc": ["hepatocellular carcinoma", "liver cancer"], "dc": ["decompensated cirrhosis"], "cc": ["compensated cirrhosis"], "esld": ["end-stage liver disease"], "alf": ["acute liver failure"], "aclf": ["acute-on-chronic liver failure"], # Coinfections "hiv": ["human immunodeficiency virus"], "hcv": ["hepatitis c virus", "hepatitis c"], "hdv": ["hepatitis d virus", "hepatitis delta"], "hav": ["hepatitis a virus", "hepatitis a"], # Clinical Terms "uln": ["upper limit of normal"], "iu/ml": ["international units per milliliter"], "log": ["logarithm", "log10"], "svr": ["sustained virological response"], "vr": ["virological response"], "br": ["biochemical response"], "sr": ["serological response"], } # Synonym mappings for HBV medical terms MEDICAL_SYNONYMS = { # HBV terminology "hepatitis b": ["hbv", "hepatitis b virus", "hep b", "hbv infection"], "chronic hepatitis b": ["chb", "chronic hbv", "chronic hbv infection"], "acute hepatitis b": ["ahb", "acute hbv"], "hbv dna": ["viral load", "hbv viral load", "serum hbv dna"], # Serological markers "hbsag positive": ["hbsag+", "hbs antigen positive"], "hbeag positive": ["hbeag+", "hbe antigen positive"], "hbsag negative": ["hbsag-", "hbs antigen negative"], "hbeag negative": ["hbeag-", "hbe antigen negative"], # Liver disease stages "cirrhosis": ["f4", "liver cirrhosis", "hepatic cirrhosis"], "fibrosis": ["liver fibrosis", "hepatic fibrosis"], "compensated cirrhosis": ["cc", "child-pugh a", "child-pugh b"], "decompensated cirrhosis": ["dc", "child-pugh c"], # Treatment terms "antiviral therapy": ["antiviral treatment", "nucleos(t)ide analogue", "na therapy"], "entecavir": ["etv", "baraclude"], "tenofovir": ["tdf", "taf", "viread", "vemlidy"], "interferon": ["ifn", "pegylated interferon", "peg-ifn"], # Clinical outcomes "treatment response": ["virological response", "biochemical response"], "viral suppression": ["undetectable hbv dna", "hbv dna < lloq"], "alt normalization": ["alt normal", "alt within normal limits"], # Complications "hepatocellular carcinoma": ["hcc", "liver cancer", "primary liver cancer"], "liver failure": ["hepatic failure", "end-stage liver disease", "esld"], "portal hypertension": ["esophageal varices", "ascites", "splenomegaly"], # Special populations "pregnant women": ["pregnancy", "pregnant patients"], "immunosuppressed": ["immunocompromised", "on immunosuppression"], "coinfection": ["co-infection", "dual infection"], } # Regional spelling variations (US/UK/International) SPELLING_VARIATIONS = { "fibrosis": ["fibrosis"], "cirrhosis": ["cirrhosis"], "anaemia": ["anemia"], "haemorrhage": ["hemorrhage"], "oesophageal": ["esophageal"], } # Context-specific term preferences CONTEXT_PREFERENCES = { "treatment": ["antiviral", "therapy", "regimen", "medication"], "diagnosis": ["hbsag", "hbeag", "hbv dna", "serology"], "monitoring": ["alt", "hbv dna", "liver function", "fibrosis"], "complications": ["hcc", "cirrhosis", "decompensation", "liver failure"], } # ============================================================================ # DYNAMIC TERMINOLOGY LEARNING # ============================================================================ class MedicalTerminologyExpander: """ Dynamically learns and expands medical terminology from corpus. Handles abbreviations, synonyms, and context-specific variations for HBV. """ def __init__(self, corpus_path: Optional[Path] = None): """Initialize with optional corpus for dynamic learning.""" self.abbreviations = MEDICAL_ABBREVIATIONS.copy() self.synonyms = MEDICAL_SYNONYMS.copy() self.spelling_vars = SPELLING_VARIATIONS.copy() self.learned_terms = defaultdict(set) if corpus_path and corpus_path.exists(): self._learn_from_corpus(corpus_path) def expand_query(self, query: str, context: Optional[str] = None) -> List[str]: """ Expand a query with medical synonyms and abbreviations. Args: query: Original query string context: Optional context hint (e.g., 'treatment', 'diagnosis') Returns: List of expanded query variations """ expansions = [query] query_lower = query.lower() # Expand abbreviations for abbrev, full_forms in self.abbreviations.items(): if abbrev in query_lower: for full_form in full_forms: expansions.append(query_lower.replace(abbrev, full_form)) # Expand synonyms for term, synonyms in self.synonyms.items(): if term in query_lower: for synonym in synonyms: expansions.append(query_lower.replace(term, synonym)) # Add context-specific preferences if context and context in CONTEXT_PREFERENCES: for pref_term in CONTEXT_PREFERENCES[context]: if pref_term not in query_lower: expansions.append(f"{query} {pref_term}") # Remove duplicates while preserving order seen = set() unique_expansions = [] for exp in expansions: if exp not in seen: seen.add(exp) unique_expansions.append(exp) return unique_expansions def normalize_term(self, term: str) -> str: """ Normalize a medical term to its canonical form. Args: term: Medical term to normalize Returns: Normalized canonical form """ term_lower = term.lower().strip() # Check if it's an abbreviation if term_lower in self.abbreviations: return self.abbreviations[term_lower][0] # Check if it's a synonym for canonical, synonyms in self.synonyms.items(): if term_lower in synonyms or term_lower == canonical: return canonical # Check spelling variations for canonical, variations in self.spelling_vars.items(): if term_lower in variations: return canonical return term def _learn_from_corpus(self, corpus_path: Path): """Learn new terminology patterns from corpus.""" try: # Implementation for dynamic learning from HBV guidelines logger.info(f"Learning terminology from corpus: {corpus_path}") # This would analyze the corpus and extract new term relationships except Exception as e: logger.warning(f"Could not learn from corpus: {e}") def get_related_terms(self, term: str, max_terms: int = 5) -> List[str]: """ Get related medical terms for a given term. Args: term: Medical term max_terms: Maximum number of related terms to return Returns: List of related terms """ related = set() term_lower = term.lower() # Find synonyms for canonical, synonyms in self.synonyms.items(): if term_lower == canonical or term_lower in synonyms: related.update(synonyms) related.add(canonical) # Find abbreviations if term_lower in self.abbreviations: related.update(self.abbreviations[term_lower]) # Remove the original term related.discard(term_lower) return list(related)[:max_terms] # Global instance for easy access _global_expander = None def get_terminology_expander() -> MedicalTerminologyExpander: """Get or create the global terminology expander instance.""" global _global_expander if _global_expander is None: _global_expander = MedicalTerminologyExpander() return _global_expander