Spaces:

chenzihong
/

GraphGen

Running

App Files Files Community

github-actions[bot] commited on 12 days ago

Commit

06c3276

1 Parent(s): 2bc66d4

Auto-sync from demo at Mon Dec 1 10:51:45 UTC 2025

Browse files

Files changed (9) hide show

graphgen/configs/search_dna_config.yaml +17 -0
graphgen/configs/{search_config.yaml → search_protein_config.yaml} +3 -2
graphgen/configs/search_rna_config.yaml +16 -0
graphgen/graphgen.py +1 -1
graphgen/models/__init__.py +2 -0
graphgen/models/searcher/db/ncbi_searcher.py +390 -0
graphgen/models/searcher/db/rnacentral_searcher.py +314 -0
graphgen/operators/search/search_all.py +36 -5
requirements.txt +1 -0

graphgen/configs/search_dna_config.yaml ADDED Viewed

	@@ -0,0 +1,17 @@

+pipeline:
+  - name: read_step
+    op_key: read
+    params:
+      input_file:  resources/input_examples/search_dna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+  - name: search_step
+    op_key: search
+    deps: [read_step] # search_step depends on read_step
+    params:
+      data_sources: [ncbi] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral
+      ncbi_params:
+        email: test@example.com # NCBI requires an email address
+        tool: GraphGen # tool name for NCBI API
+        use_local_blast: true # whether to use local blast for DNA search
+        local_blast_db: /your_path/refseq_241 # path to local BLAST database (without .nhr extension)

graphgen/configs/{search_config.yaml → search_protein_config.yaml} RENAMED Viewed

@@ -2,7 +2,7 @@ pipeline:
   - name: read_step
     op_key: read
     params:
-      input_file:  resources/input_examples/search_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
   - name: search_step
     op_key: search
@@ -11,4 +11,5 @@ pipeline:
       data_sources: [uniprot] # data source for searcher, support: wikipedia, google, uniprot
       uniprot_params:
         use_local_blast: true # whether to use local blast for uniprot search
-        local_blast_db: /your_path/uniprot_sprot

   - name: read_step
     op_key: read
     params:
+      input_file:  resources/input_examples/search_protein_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
   - name: search_step
     op_key: search
       data_sources: [uniprot] # data source for searcher, support: wikipedia, google, uniprot
       uniprot_params:
         use_local_blast: true # whether to use local blast for uniprot search
+        local_blast_db: /your_path/2024_01/uniprot_sprot # format: /path/to/${RELEASE}/uniprot_sprot
+        # options: uniprot_sprot (recommended, high quality), uniprot_trembl, or uniprot_${RELEASE} (merged database)

graphgen/configs/search_rna_config.yaml ADDED Viewed

	@@ -0,0 +1,16 @@

+pipeline:
+  - name: read_step
+    op_key: read
+    params:
+      input_file:  resources/input_examples/search_rna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+  - name: search_step
+    op_key: search
+    deps: [read_step] # search_step depends on read_step
+    params:
+      data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral
+      rnacentral_params:
+        use_local_blast: true # whether to use local blast for RNA search
+        local_blast_db: /your_path/refseq_rna_241 # format: /path/to/refseq_rna_${RELEASE}
+        # can also use DNA database with RNA sequences (if already built)

graphgen/graphgen.py CHANGED Viewed

@@ -45,7 +45,7 @@ class GraphGen:
         # llm
         self.tokenizer_instance: Tokenizer = tokenizer_instance or Tokenizer(
-            model_name=os.getenv("TOKENIZER_MODEL")
         )
         self.synthesizer_llm_client: BaseLLMWrapper = (

         # llm
         self.tokenizer_instance: Tokenizer = tokenizer_instance or Tokenizer(
+            model_name=os.getenv("TOKENIZER_MODEL", "cl100k_base")
         )
         self.synthesizer_llm_client: BaseLLMWrapper = (

graphgen/models/__init__.py CHANGED Viewed

@@ -26,6 +26,8 @@ from .reader import (
     RDFReader,
     TXTReader,
 )
 from .searcher.db.uniprot_searcher import UniProtSearch
 from .searcher.kg.wiki_search import WikiSearch
 from .searcher.web.bing_search import BingSearch

     RDFReader,
     TXTReader,
 )
+from .searcher.db.ncbi_searcher import NCBISearch
+from .searcher.db.rnacentral_searcher import RNACentralSearch
 from .searcher.db.uniprot_searcher import UniProtSearch
 from .searcher.kg.wiki_search import WikiSearch
 from .searcher.web.bing_search import BingSearch

graphgen/models/searcher/db/ncbi_searcher.py ADDED Viewed

	@@ -0,0 +1,390 @@

+import asyncio
+import os
+import re
+import subprocess
+import tempfile
+from concurrent.futures import ThreadPoolExecutor
+from functools import lru_cache
+from http.client import IncompleteRead
+from typing import Dict, Optional
+from Bio import Entrez, SeqIO
+from Bio.Blast import NCBIWWW, NCBIXML
+from requests.exceptions import RequestException
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)
+from graphgen.bases import BaseSearcher
+from graphgen.utils import logger
+@lru_cache(maxsize=None)
+def _get_pool():
+    return ThreadPoolExecutor(max_workers=10)
+# ensure only one NCBI request at a time
+_ncbi_lock = asyncio.Lock()
+class NCBISearch(BaseSearcher):
+    """
+    NCBI Search client to search DNA/GenBank/Entrez databases.
+    1) Get the gene/DNA by accession number or gene ID.
+    2) Search with keywords or gene names (fuzzy search).
+    3) Search with FASTA sequence (BLAST search for DNA sequences).
+    API Documentation: https://www.ncbi.nlm.nih.gov/home/develop/api/
+    Note: NCBI has rate limits (max 3 requests per second), delays are required between requests.
+    """
+    def __init__(
+        self,
+        use_local_blast: bool = False,
+        local_blast_db: str = "nt_db",
+        email: str = "email@example.com",
+        api_key: str = "",
+        tool: str = "GraphGen",
+    ):
+        """
+        Initialize the NCBI Search client.
+        Args:
+            use_local_blast (bool): Whether to use local BLAST database.
+            local_blast_db (str): Path to the local BLAST database.
+            email (str): Email address for NCBI API requests.
+            api_key (str): API key for NCBI API requests, see https://account.ncbi.nlm.nih.gov/settings/.
+            tool (str): Tool name for NCBI API requests.
+        """
+        super().__init__()
+        Entrez.timeout = 60  # 60 seconds timeout
+        Entrez.email = email
+        Entrez.tool = tool
+        if api_key:
+            Entrez.api_key = api_key
+        Entrez.max_tries = 10 if api_key else 3
+        Entrez.sleep_between_tries = 5
+        self.use_local_blast = use_local_blast
+        self.local_blast_db = local_blast_db
+        if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"):
+            logger.error("Local BLAST database files not found. Please check the path.")
+            self.use_local_blast = False
+    @staticmethod
+    def _nested_get(data: dict, *keys, default=None):
+        """Safely traverse nested dictionaries."""
+        for key in keys:
+            if not isinstance(data, dict):
+                return default
+            data = data.get(key, default)
+        return data
+    def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
+        """
+        Convert an Entrez gene record to a dictionary.
+        All extraction logic is inlined for maximum clarity and performance.
+        """
+        if not gene_record:
+            raise ValueError("Empty gene record")
+        data = gene_record[0]
+        locus = (data.get("Entrezgene_locus") or [{}])[0]
+        # Extract common nested paths once
+        gene_ref = self._nested_get(data, "Entrezgene_gene", "Gene-ref", default={})
+        biosource = self._nested_get(data, "Entrezgene_source", "BioSource", default={})
+        # Process synonyms
+        synonyms_raw = gene_ref.get("Gene-ref_syn", [])
+        gene_synonyms = []
+        if isinstance(synonyms_raw, list):
+            for syn in synonyms_raw:
+                gene_synonyms.append(syn.get("Gene-ref_syn_E") if isinstance(syn, dict) else str(syn))
+        elif synonyms_raw:
+            gene_synonyms.append(str(synonyms_raw))
+        # Extract location info
+        label = locus.get("Gene-commentary_label", "")
+        chromosome_match = re.search(r"Chromosome\s+(\S+)", str(label)) if label else None
+        seq_interval = self._nested_get(
+            locus, "Gene-commentary_seqs", 0, "Seq-loc_int", "Seq-interval", default={}
+        )
+        genomic_location = (
+            f"{seq_interval.get('Seq-interval_from')}-{seq_interval.get('Seq-interval_to')}"
+            if seq_interval.get('Seq-interval_from') and seq_interval.get('Seq-interval_to')
+            else None
+        )
+        # Extract representative accession
+        representative_accession = next(
+            (
+                product.get("Gene-commentary_accession")
+                for product in locus.get("Gene-commentary_products", [])
+                if product.get("Gene-commentary_type") == "3"
+            ),
+            None,
+        )
+        # Extract function
+        function = data.get("Entrezgene_summary") or next(
+            (
+                comment.get("Gene-commentary_comment")
+                for comment in data.get("Entrezgene_comments", [])
+                if isinstance(comment, dict)
+                and "function" in str(comment.get("Gene-commentary_heading", "")).lower()
+            ),
+            None,
+        )
+        return {
+            "molecule_type": "DNA",
+            "database": "NCBI",
+            "id": gene_id,
+            "gene_name": gene_ref.get("Gene-ref_locus", "N/A"),
+            "gene_description": gene_ref.get("Gene-ref_desc", "N/A"),
+            "organism": self._nested_get(
+                biosource, "BioSource_org", "Org-ref", "Org-ref_taxname", default="N/A"
+            ),
+            "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}",
+            "gene_synonyms": gene_synonyms or None,
+            "gene_type": {
+                "1": "protein-coding",
+                "2": "pseudo",
+                "3": "rRNA",
+                "4": "tRNA",
+                "5": "snRNA",
+                "6": "ncRNA",
+                "7": "other",
+            }.get(str(data.get("Entrezgene_type")), f"type_{data.get('Entrezgene_type')}"),
+            "chromosome": chromosome_match.group(1) if chromosome_match else None,
+            "genomic_location": genomic_location,
+            "function": function,
+            # Fields from accession-based queries
+            "title": None,
+            "sequence": None,
+            "sequence_length": None,
+            "gene_id": gene_id,
+            "molecule_type_detail": None,
+            "_representative_accession": representative_accession,
+        }
+    def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]:
+        """Get gene information by Gene ID."""
+        def _extract_from_genbank(result: dict, accession: str):
+            """Enrich result dictionary with sequence and summary information from accession."""
+            with Entrez.efetch(db="nuccore", id=accession, rettype="gb", retmode="text") as handle:
+                record = SeqIO.read(handle, "genbank")
+                result["sequence"] = str(record.seq)
+                result["sequence_length"] = len(record.seq)
+                result["title"] = record.description
+                result["molecule_type_detail"] = (
+                    "mRNA" if accession.startswith(("NM_", "XM_")) else
+                    "genomic DNA" if accession.startswith(("NC_", "NT_")) else
+                    "RNA" if accession.startswith(("NR_", "XR_")) else
+                    "genomic region" if accession.startswith("NG_") else "N/A"
+                )
+                for feature in record.features:
+                    if feature.type == "source":
+                        if 'chromosome' in feature.qualifiers:
+                            result["chromosome"] = feature.qualifiers['chromosome'][0]
+                        if feature.location:
+                            start = int(feature.location.start) + 1
+                            end = int(feature.location.end)
+                            result["genomic_location"] = f"{start}-{end}"
+                        break
+                if not result.get("organism") and 'organism' in record.annotations:
+                    result["organism"] = record.annotations['organism']
+            return result
+        try:
+            with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as handle:
+                gene_record = Entrez.read(handle)
+                if not gene_record:
+                    return None
+                result = self._gene_record_to_dict(gene_record, gene_id)
+                if accession := (preferred_accession or result.get("_representative_accession")):
+                    result = _extract_from_genbank(result, accession)
+                result.pop("_representative_accession", None)
+                return result
+        except (RequestException, IncompleteRead):
+            raise
+        except Exception as exc:
+            logger.error("Gene ID %s not found: %s", gene_id, exc)
+            return None
+    def get_by_accession(self, accession: str) -> Optional[dict]:
+        """Get sequence information by accession number."""
+        def _extract_gene_id(link_handle):
+            """Extract GeneID from elink results."""
+            links = Entrez.read(link_handle)
+            if not links or "LinkSetDb" not in links[0]:
+                return None
+            for link_set in links[0]["LinkSetDb"]:
+                if link_set.get("DbTo") != "gene":
+                    continue
+                link = (link_set.get("Link") or link_set.get("IdList", [{}]))[0]
+                return str(link.get("Id") if isinstance(link, dict) else link)
+        try:
+            # TODO: support accession number with version number (e.g., NM_000546.3)
+            with Entrez.elink(dbfrom="nuccore", db="gene", id=accession) as link_handle:
+                gene_id = _extract_gene_id(link_handle)
+            if not gene_id:
+                logger.warning("Accession %s has no associated GeneID", accession)
+                return None
+            result = self.get_by_gene_id(gene_id, preferred_accession=accession)
+            if result:
+                result["id"] = accession
+                result["url"] = f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}"
+            return result
+        except (RequestException, IncompleteRead):
+            raise
+        except Exception as exc:
+            logger.error("Accession %s not found: %s", accession, exc)
+            return None
+    def get_best_hit(self, keyword: str) -> Optional[dict]:
+        """Search NCBI Gene database with a keyword and return the best hit."""
+        if not keyword.strip():
+            return None
+        try:
+            for search_term in [f"{keyword}[Gene] OR {keyword}[All Fields]", keyword]:
+                with Entrez.esearch(db="gene", term=search_term, retmax=1, sort="relevance") as search_handle:
+                    search_results = Entrez.read(search_handle)
+                    if len(gene_id := search_results.get("IdList", [])) > 0:
+                        return self.get_by_gene_id(gene_id)
+        except (RequestException, IncompleteRead):
+            raise
+        except Exception as e:
+            logger.error("Keyword %s not found: %s", keyword, e)
+        return None
+    def _local_blast(self, seq: str, threshold: float) -> Optional[str]:
+        """Perform local BLAST search using local BLAST database."""
+        try:
+            with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp:
+                tmp.write(f">query\n{seq}\n")
+                tmp_name = tmp.name
+            cmd = [
+                "blastn", "-db", self.local_blast_db, "-query", tmp_name,
+                "-evalue", str(threshold), "-max_target_seqs", "1", "-outfmt", "6 sacc"
+            ]
+            logger.debug("Running local blastn: %s", " ".join(cmd))
+            out = subprocess.check_output(cmd, text=True).strip()
+            os.remove(tmp_name)
+            return out.split("\n", maxsplit=1)[0] if out else None
+        except Exception as exc:
+            logger.error("Local blastn failed: %s", exc)
+            return None
+    def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]:
+        """Search NCBI with a DNA sequence using BLAST."""
+        def _extract_and_normalize_sequence(sequence: str) -> Optional[str]:
+            """Extract and normalize DNA sequence from input."""
+            if sequence.startswith(">"):
+                seq = "".join(sequence.strip().split("\n")[1:])
+            else:
+                seq = sequence.strip().replace(" ", "").replace("\n", "")
+            return seq if re.fullmatch(r"[ATCGN]+", seq, re.I) else None
+        def _process_network_blast_result(blast_record, seq: str, threshold: float) -> Optional[dict]:
+            """Process network BLAST result and return dictionary or None."""
+            if not blast_record.alignments:
+                logger.info("No BLAST hits found for the given sequence.")
+                return None
+            best_alignment = blast_record.alignments[0]
+            best_hsp = best_alignment.hsps[0]
+            if best_hsp.expect > threshold:
+                logger.info("No BLAST hits below the threshold E-value.")
+                return None
+            hit_id = best_alignment.hit_id
+            if accession_match := re.search(r"ref\|([^|]+)", hit_id):
+                return self.get_by_accession(accession_match.group(1).split(".")[0])
+            # If unable to extract accession, return basic information
+            return {
+                "molecule_type": "DNA",
+                "database": "NCBI",
+                "id": hit_id,
+                "title": best_alignment.title,
+                "sequence_length": len(seq),
+                "e_value": best_hsp.expect,
+                "identity": best_hsp.identities / best_hsp.align_length if best_hsp.align_length > 0 else 0,
+                "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{hit_id}",
+            }
+        try:
+            if not (seq := _extract_and_normalize_sequence(sequence)):
+                logger.error("Empty or invalid DNA sequence provided.")
+                return None
+            # Try local BLAST first if enabled
+            if self.use_local_blast and (accession := self._local_blast(seq, threshold)):
+                logger.debug("Local BLAST found accession: %s", accession)
+                return self.get_by_accession(accession)
+            # Fall back to network BLAST
+            logger.debug("Falling back to NCBIWWW.qblast")
+            with NCBIWWW.qblast("blastn", "nr", seq, hitlist_size=1, expect=threshold) as result_handle:
+                return _process_network_blast_result(NCBIXML.read(result_handle), seq, threshold)
+        except (RequestException, IncompleteRead):
+            raise
+        except Exception as e:
+            logger.error("BLAST search failed: %s", e)
+            return None
+    @retry(
+        stop=stop_after_attempt(5),
+        wait=wait_exponential(multiplier=1, min=4, max=10),
+        retry=retry_if_exception_type((RequestException, IncompleteRead)),
+        reraise=True,
+    )
+    async def search(self, query: str, threshold: float = 0.01, **kwargs) -> Optional[Dict]:
+        """Search NCBI with either a gene ID, accession number, keyword, or DNA sequence."""
+        if not query or not isinstance(query, str):
+            logger.error("Empty or non-string input.")
+            return None
+        query = query.strip()
+        logger.debug("NCBI search query: %s", query)
+        loop = asyncio.get_running_loop()
+        # limit concurrent requests (NCBI rate limit: max 3 requests per second)
+        async with _ncbi_lock:
+            # Auto-detect query type and execute in thread pool
+            if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I):
+                result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold)
+            elif re.fullmatch(r"^\d+$", query):
+                result = await loop.run_in_executor(_get_pool(), self.get_by_gene_id, query)
+            elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I):
+                result = await loop.run_in_executor(_get_pool(), self.get_by_accession, query)
+            else:
+                result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query)
+        if result:
+            result["_search_query"] = query
+        return result

graphgen/models/searcher/db/rnacentral_searcher.py ADDED Viewed

	@@ -0,0 +1,314 @@

+import asyncio
+import os
+import re
+import subprocess
+from concurrent.futures import ThreadPoolExecutor
+from functools import lru_cache
+import tempfile
+from typing import Dict, Optional, List, Any, Set
+import hashlib
+import requests
+import aiohttp
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)
+from graphgen.bases import BaseSearcher
+from graphgen.utils import logger
+@lru_cache(maxsize=None)
+def _get_pool():
+    return ThreadPoolExecutor(max_workers=10)
+class RNACentralSearch(BaseSearcher):
+    """
+    RNAcentral Search client to search RNA databases.
+    1) Get RNA by RNAcentral ID.
+    2) Search with keywords or RNA names (fuzzy search).
+    3) Search with RNA sequence.
+    API Documentation: https://rnacentral.org/api/v1
+    """
+    def __init__(self, use_local_blast: bool = False, local_blast_db: str = "rna_db"):
+        super().__init__()
+        self.base_url = "https://rnacentral.org/api/v1"
+        self.headers = {"Accept": "application/json"}
+        self.use_local_blast = use_local_blast
+        self.local_blast_db = local_blast_db
+        if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"):
+            logger.error("Local BLAST database files not found. Please check the path.")
+            self.use_local_blast = False
+    @staticmethod
+    def _rna_data_to_dict(
+        rna_id: str,
+        rna_data: Dict[str, Any],
+        xrefs_data: Optional[List[Dict[str, Any]]] = None
+    ) -> Dict[str, Any]:
+        organisms, gene_names, so_terms = set(), set(), set()
+        modifications: List[Any] = []
+        for xref in xrefs_data or []:
+            acc = xref.get("accession", {})
+            if s := acc.get("species"):
+                organisms.add(s)
+            if g := acc.get("gene", "").strip():
+                gene_names.add(g)
+            if m := xref.get("modifications"):
+                modifications.extend(m)
+            if b := acc.get("biotype"):
+                so_terms.add(b)
+        def format_unique_values(values: Set[str]) -> Optional[str]:
+            if not values:
+                return None
+            if len(values) == 1:
+                return next(iter(values))
+            return ", ".join(sorted(values))
+        xrefs_info = {
+            "organism": format_unique_values(organisms),
+            "gene_name": format_unique_values(gene_names),
+            "related_genes": list(gene_names) if gene_names else None,
+            "modifications": modifications or None,
+            "so_term": format_unique_values(so_terms),
+        }
+        fallback_rules = {
+            "organism": ["organism", "species"],
+            "related_genes": ["related_genes", "genes"],
+            "gene_name": ["gene_name", "gene"],
+            "so_term": ["so_term"],
+            "modifications": ["modifications"],
+        }
+        def resolve_field(field_name: str) -> Any:
+            if (value := xrefs_info.get(field_name)) is not None:
+                return value
+            for key in fallback_rules[field_name]:
+                if (value := rna_data.get(key)) is not None:
+                    return value
+            return None
+        organism = resolve_field("organism")
+        gene_name = resolve_field("gene_name")
+        so_term = resolve_field("so_term")
+        modifications = resolve_field("modifications")
+        related_genes = resolve_field("related_genes")
+        if not related_genes and (single_gene := rna_data.get("gene_name")):
+            related_genes = [single_gene]
+        sequence = rna_data.get("sequence", "")
+        return {
+            "molecule_type": "RNA",
+            "database": "RNAcentral",
+            "id": rna_id,
+            "rnacentral_id": rna_data.get("rnacentral_id", rna_id),
+            "sequence": sequence,
+            "sequence_length": rna_data.get("length", len(sequence)),
+            "rna_type": rna_data.get("rna_type", "N/A"),
+            "description": rna_data.get("description", "N/A"),
+            "url": f"https://rnacentral.org/rna/{rna_id}",
+            "organism": organism,
+            "related_genes": related_genes or None,
+            "gene_name": gene_name,
+            "so_term": so_term,
+            "modifications": modifications,
+        }
+    @staticmethod
+    def _calculate_md5(sequence: str) -> str:
+        """
+        Calculate MD5 hash for RNA sequence as per RNAcentral spec.
+        - Replace U with T
+        - Convert to uppercase
+        - Encode as ASCII
+        """
+        # Normalize sequence
+        normalized_seq = sequence.replace("U", "T").replace("u", "t").upper()
+        if not re.fullmatch(r"[ATCGN]+", normalized_seq):
+            raise ValueError(f"Invalid sequence characters after normalization: {normalized_seq[:50]}...")
+        return hashlib.md5(normalized_seq.encode("ascii")).hexdigest()
+    def get_by_rna_id(self, rna_id: str) -> Optional[dict]:
+        """
+        Get RNA information by RNAcentral ID.
+        :param rna_id: RNAcentral ID (e.g., URS0000000001).
+        :return: A dictionary containing RNA information or None if not found.
+        """
+        try:
+            url = f"{self.base_url}/rna/{rna_id}"
+            url += "?flat=true"
+            resp = requests.get(url, headers=self.headers, timeout=30)
+            resp.raise_for_status()
+            rna_data = resp.json()
+            xrefs_data = rna_data.get("xrefs", [])
+            return self._rna_data_to_dict(rna_id, rna_data, xrefs_data)
+        except requests.RequestException as e:
+            logger.error("Network error getting RNA ID %s: %s", rna_id, e)
+            return None
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error("Unexpected error getting RNA ID %s: %s", rna_id, e)
+            return None
+    def get_best_hit(self, keyword: str) -> Optional[dict]:
+        """
+        Search RNAcentral with a keyword and return the best hit.
+        :param keyword: The search keyword (e.g., miRNA name, RNA name).
+        :return: Dictionary with RNA information or None.
+        """
+        keyword = keyword.strip()
+        if not keyword:
+            logger.warning("Empty keyword provided to get_best_hit")
+            return None
+        try:
+            url = f"{self.base_url}/rna"
+            params = {"search": keyword, "format": "json"}
+            resp = requests.get(url, params=params, headers=self.headers, timeout=30)
+            resp.raise_for_status()
+            data = resp.json()
+            results = data.get("results", [])
+            if not results:
+                logger.info("No search results for keyword: %s", keyword)
+                return None
+            first_result = results[0]
+            rna_id = first_result.get("rnacentral_id")
+            if rna_id:
+                detailed = self.get_by_rna_id(rna_id)
+                if detailed:
+                    return detailed
+            logger.debug("Using search result data for %s", rna_id or "unknown")
+            return self._rna_data_to_dict(rna_id or "", first_result)
+        except requests.RequestException as e:
+            logger.error("Network error searching keyword '%s': %s", keyword, e)
+            return None
+        except Exception as e:
+            logger.error("Unexpected error searching keyword '%s': %s", keyword, e)
+            return None
+    def _local_blast(self, seq: str, threshold: float) -> Optional[str]:
+        """Perform local BLAST search using local BLAST database."""
+        try:
+            with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp:
+                tmp.write(f">query\n{seq}\n")
+                tmp_name = tmp.name
+            cmd = [
+                "blastn", "-db", self.local_blast_db, "-query", tmp_name,
+                "-evalue", str(threshold), "-max_target_seqs", "1", "-outfmt", "6 sacc"
+            ]
+            logger.debug("Running local blastn for RNA: %s", " ".join(cmd))
+            out = subprocess.check_output(cmd, text=True).strip()
+            os.remove(tmp_name)
+            return out.split("\n", maxsplit=1)[0] if out else None
+        except Exception as exc:
+            logger.error("Local blastn failed: %s", exc)
+            return None
+    def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]:
+        """
+        Search RNAcentral with an RNA sequence.
+        Tries local BLAST first if enabled, falls back to RNAcentral API.
+        Unified approach: Find RNA ID from sequence search, then call get_by_rna_id() for complete information.
+        :param sequence: RNA sequence (FASTA format or raw sequence).
+        :param threshold: E-value threshold for BLAST search.
+        :return: A dictionary containing complete RNA information or None if not found.
+        """
+        def _extract_sequence(sequence: str) -> Optional[str]:
+            """Extract and normalize RNA sequence from input."""
+            if sequence.startswith(">"):
+                seq_lines = sequence.strip().split("\n")
+                seq = "".join(seq_lines[1:])
+            else:
+                seq = sequence.strip().replace(" ", "").replace("\n", "")
+            return seq if seq and re.fullmatch(r"[AUCGN\s]+", seq, re.I) else None
+        try:
+            seq = _extract_sequence(sequence)
+            if not seq:
+                logger.error("Empty or invalid RNA sequence provided.")
+                return None
+            # Try local BLAST first if enabled
+            if self.use_local_blast:
+                accession = self._local_blast(seq, threshold)
+                if accession:
+                    logger.debug("Local BLAST found accession: %s", accession)
+                    return self.get_by_rna_id(accession)
+            # Fall back to RNAcentral API if local BLAST didn't find result
+            logger.debug("Falling back to RNAcentral API.")
+            md5_hash = self._calculate_md5(seq)
+            search_url = f"{self.base_url}/rna"
+            params = {"md5": md5_hash, "format": "json"}
+            resp = requests.get(search_url, params=params, headers=self.headers, timeout=60)
+            resp.raise_for_status()
+            search_results = resp.json()
+            results = search_results.get("results", [])
+            if not results:
+                logger.info("No exact match found in RNAcentral for sequence")
+                return None
+            rna_id = results[0].get("rnacentral_id")
+            if not rna_id:
+                logger.error("No RNAcentral ID found in search results.")
+                return None
+            return self.get_by_rna_id(rna_id)
+        except Exception as e:
+            logger.error("Sequence search failed: %s", e)
+            return None
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=2, max=10),
+        retry=retry_if_exception_type((aiohttp.ClientError, asyncio.TimeoutError)),
+        reraise=True,
+    )
+    async def search(self, query: str, threshold: float = 0.1, **kwargs) -> Optional[Dict]:
+        """Search RNAcentral with either an RNAcentral ID, keyword, or RNA sequence."""
+        if not query or not isinstance(query, str):
+            logger.error("Empty or non-string input.")
+            return None
+        query = query.strip()
+        logger.debug("RNAcentral search query: %s", query)
+        loop = asyncio.get_running_loop()
+        # check if RNA sequence (AUCG characters, contains U)
+        if query.startswith(">") or (
+            re.fullmatch(r"[AUCGN\s]+", query, re.I) and "U" in query.upper()
+        ):
+            result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold)
+        # check if RNAcentral ID (typically starts with URS)
+        elif re.fullmatch(r"URS\d+", query, re.I):
+            result = await loop.run_in_executor(_get_pool(), self.get_by_rna_id, query)
+        else:
+            # otherwise treat as keyword
+            result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query)
+        if result:
+            result["_search_query"] = query
+        return result

graphgen/operators/search/search_all.py CHANGED Viewed

@@ -27,6 +27,10 @@ async def search_all(
     data_sources = search_config.get("data_sources", [])
     for data_source in data_sources:
         if data_source == "uniprot":
             from graphgen.models import UniProtSearch
@@ -34,19 +38,46 @@ async def search_all(
                 **search_config.get("uniprot_params", {})
             )
-            data = list(seed_data.values())
-            data = [d["content"] for d in data if "content" in d]
-            data = list(set(data))  # Remove duplicates
             uniprot_results = await run_concurrent(
                 uniprot_search_client.search,
                 data,
                 desc="Searching UniProt database",
                 unit="keyword",
             )
         else:
             logger.error("Data source %s not supported.", data_source)
             continue
-        results[data_source] = uniprot_results
     return results

     data_sources = search_config.get("data_sources", [])
     for data_source in data_sources:
+        data = list(seed_data.values())
+        data = [d["content"] for d in data if "content" in d]
+        data = list(set(data))  # Remove duplicates
         if data_source == "uniprot":
             from graphgen.models import UniProtSearch
                 **search_config.get("uniprot_params", {})
             )
             uniprot_results = await run_concurrent(
                 uniprot_search_client.search,
                 data,
                 desc="Searching UniProt database",
                 unit="keyword",
             )
+            results[data_source] = uniprot_results
+        elif data_source == "ncbi":
+            from graphgen.models import NCBISearch
+            ncbi_search_client = NCBISearch(
+                **search_config.get("ncbi_params", {})
+            )
+            ncbi_results = await run_concurrent(
+                ncbi_search_client.search,
+                data,
+                desc="Searching NCBI database",
+                unit="keyword",
+            )
+            results[data_source] = ncbi_results
+        elif data_source == "rnacentral":
+            from graphgen.models import RNACentralSearch
+            rnacentral_search_client = RNACentralSearch(
+                **search_config.get("rnacentral_params", {})
+            )
+            rnacentral_results = await run_concurrent(
+                rnacentral_search_client.search,
+                data,
+                desc="Searching RNAcentral database",
+                unit="keyword",
+            )
+            results[data_source] = rnacentral_results
         else:
             logger.error("Data source %s not supported.", data_source)
             continue
     return results

requirements.txt CHANGED Viewed

@@ -21,6 +21,7 @@ fastapi
 trafilatura
 aiohttp
 diskcache
 leidenalg
 igraph

 trafilatura
 aiohttp
 diskcache
+socksio
 leidenalg
 igraph