Spaces:
Running
Running
github-actions[bot]
committed on
Commit
·
10ba08f
1
Parent(s):
fa9fcab
Auto-sync from demo at Fri Dec 5 06:11:52 UTC 2025
Browse files
graphgen/configs/search_dna_config.yaml
CHANGED
|
@@ -13,5 +13,5 @@ pipeline:
|
|
| 13 |
email: test@example.com # NCBI requires an email address
|
| 14 |
tool: GraphGen # tool name for NCBI API
|
| 15 |
use_local_blast: true # whether to use local blast for DNA search
|
| 16 |
-
local_blast_db: /
|
| 17 |
|
|
|
|
| 13 |
email: test@example.com # NCBI requires an email address
|
| 14 |
tool: GraphGen # tool name for NCBI API
|
| 15 |
use_local_blast: true # whether to use local blast for DNA search
|
| 16 |
+
local_blast_db: refseq_release/refseq_release # path to local BLAST database (without .nhr extension)
|
| 17 |
|
graphgen/configs/search_rna_config.yaml
CHANGED
|
@@ -11,6 +11,4 @@ pipeline:
|
|
| 11 |
data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral
|
| 12 |
rnacentral_params:
|
| 13 |
use_local_blast: true # whether to use local blast for RNA search
|
| 14 |
-
local_blast_db: /
|
| 15 |
-
# can also use DNA database with RNA sequences (if already built)
|
| 16 |
-
|
|
|
|
| 11 |
data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral
|
| 12 |
rnacentral_params:
|
| 13 |
use_local_blast: true # whether to use local blast for RNA search
|
| 14 |
+
local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD # path to local BLAST database (without .nhr extension)
|
|
|
|
|
|
graphgen/models/searcher/db/ncbi_searcher.py
CHANGED
|
@@ -83,6 +83,29 @@ class NCBISearch(BaseSearcher):
|
|
| 83 |
data = data.get(key, default)
|
| 84 |
return data
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
|
| 87 |
"""
|
| 88 |
Convert an Entrez gene record to a dictionary.
|
|
@@ -120,7 +143,7 @@ class NCBISearch(BaseSearcher):
|
|
| 120 |
else None
|
| 121 |
)
|
| 122 |
|
| 123 |
-
# Extract representative accession
|
| 124 |
representative_accession = next(
|
| 125 |
(
|
| 126 |
product.get("Gene-commentary_accession")
|
|
@@ -129,6 +152,17 @@ class NCBISearch(BaseSearcher):
|
|
| 129 |
),
|
| 130 |
None,
|
| 131 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
|
| 133 |
# Extract function
|
| 134 |
function = data.get("Entrezgene_summary") or next(
|
|
@@ -169,18 +203,19 @@ class NCBISearch(BaseSearcher):
|
|
| 169 |
"sequence": None,
|
| 170 |
"sequence_length": None,
|
| 171 |
"gene_id": gene_id,
|
| 172 |
-
"molecule_type_detail":
|
|
|
|
|
|
|
| 173 |
"_representative_accession": representative_accession,
|
| 174 |
}
|
| 175 |
|
| 176 |
def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]:
|
| 177 |
"""Get gene information by Gene ID."""
|
| 178 |
-
def
|
| 179 |
-
"""
|
| 180 |
with Entrez.efetch(db="nuccore", id=accession, rettype="gb", retmode="text") as handle:
|
| 181 |
record = SeqIO.read(handle, "genbank")
|
| 182 |
-
|
| 183 |
-
result["sequence_length"] = len(record.seq)
|
| 184 |
result["title"] = record.description
|
| 185 |
result["molecule_type_detail"] = (
|
| 186 |
"mRNA" if accession.startswith(("NM_", "XM_")) else
|
|
@@ -206,6 +241,22 @@ class NCBISearch(BaseSearcher):
|
|
| 206 |
|
| 207 |
return result
|
| 208 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
try:
|
| 210 |
with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as handle:
|
| 211 |
gene_record = Entrez.read(handle)
|
|
@@ -214,7 +265,8 @@ class NCBISearch(BaseSearcher):
|
|
| 214 |
|
| 215 |
result = self._gene_record_to_dict(gene_record, gene_id)
|
| 216 |
if accession := (preferred_accession or result.get("_representative_accession")):
|
| 217 |
-
result =
|
|
|
|
| 218 |
|
| 219 |
result.pop("_representative_accession", None)
|
| 220 |
return result
|
|
|
|
| 83 |
data = data.get(key, default)
|
| 84 |
return data
|
| 85 |
|
| 86 |
+
@staticmethod
def _infer_molecule_type_detail(accession: Optional[str], gene_type: Optional[int] = None) -> Optional[str]:
    """
    Infer molecule_type_detail from accession prefix or gene type.

    The accession prefix is checked first; the gene type code is only used
    as a fallback when no accession is given or its prefix is not recognized.

    :param accession: Sequence accession (e.g. "NM_000546.6"); may be None or empty.
    :param gene_type: Entrezgene type code. An int or a numeric string is
        accepted, because values read from a parsed Entrez record may arrive
        as text rather than as int.
    :return: A molecule type label, or None when nothing can be inferred.
    """
    if accession:
        # str.startswith accepts a tuple, so each row covers several prefixes.
        prefix_labels = (
            (("NM_", "XM_"), "mRNA"),
            (("NC_", "NT_"), "genomic DNA"),
            (("NR_", "XR_"), "RNA"),
            (("NG_",), "genomic region"),
        )
        for prefixes, label in prefix_labels:
            if accession.startswith(prefixes):
                return label

    # Fallback: infer from gene type if available
    if gene_type is None:
        return None
    try:
        # Normalize numeric strings (e.g. "10") so the int-keyed map can match.
        type_code = int(gene_type)
    except (TypeError, ValueError):
        return None

    # Codes follow the NCBI Entrezgene ASN.1 `type` enumeration. Code 6 is
    # protein-coding there, so it is deliberately left unmapped instead of
    # being reported as an RNA type.
    gene_type_map = {
        1: "tRNA",
        2: "rRNA",
        3: "snRNA",
        4: "scRNA",
        5: "snoRNA",
        9: "miscRNA",
        10: "ncRNA",
    }
    return gene_type_map.get(type_code)
|
| 108 |
+
|
| 109 |
def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
|
| 110 |
"""
|
| 111 |
Convert an Entrez gene record to a dictionary.
|
|
|
|
| 143 |
else None
|
| 144 |
)
|
| 145 |
|
| 146 |
+
# Extract representative accession (prefer type 3 = mRNA/transcript)
|
| 147 |
representative_accession = next(
|
| 148 |
(
|
| 149 |
product.get("Gene-commentary_accession")
|
|
|
|
| 152 |
),
|
| 153 |
None,
|
| 154 |
)
|
| 155 |
+
# Fallback: if no type 3 accession, try any available accession
|
| 156 |
+
# This is needed for genes that don't have mRNA transcripts but have other sequence records
|
| 157 |
+
if not representative_accession:
|
| 158 |
+
representative_accession = next(
|
| 159 |
+
(
|
| 160 |
+
product.get("Gene-commentary_accession")
|
| 161 |
+
for product in locus.get("Gene-commentary_products", [])
|
| 162 |
+
if product.get("Gene-commentary_accession")
|
| 163 |
+
),
|
| 164 |
+
None,
|
| 165 |
+
)
|
| 166 |
|
| 167 |
# Extract function
|
| 168 |
function = data.get("Entrezgene_summary") or next(
|
|
|
|
| 203 |
"sequence": None,
|
| 204 |
"sequence_length": None,
|
| 205 |
"gene_id": gene_id,
|
| 206 |
+
"molecule_type_detail": self._infer_molecule_type_detail(
|
| 207 |
+
representative_accession, data.get("Entrezgene_type")
|
| 208 |
+
),
|
| 209 |
"_representative_accession": representative_accession,
|
| 210 |
}
|
| 211 |
|
| 212 |
def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]:
|
| 213 |
"""Get gene information by Gene ID."""
|
| 214 |
+
def _extract_metadata_from_genbank(result: dict, accession: str):
|
| 215 |
+
"""Extract metadata from GenBank format (title, features, organism, etc.)."""
|
| 216 |
with Entrez.efetch(db="nuccore", id=accession, rettype="gb", retmode="text") as handle:
|
| 217 |
record = SeqIO.read(handle, "genbank")
|
| 218 |
+
|
|
|
|
| 219 |
result["title"] = record.description
|
| 220 |
result["molecule_type_detail"] = (
|
| 221 |
"mRNA" if accession.startswith(("NM_", "XM_")) else
|
|
|
|
| 241 |
|
| 242 |
return result
|
| 243 |
|
| 244 |
+
def _extract_sequence_from_fasta(result: dict, accession: str):
    """
    Fill ``result["sequence"]`` and ``result["sequence_length"]`` from a FASTA fetch.

    The record for ``accession`` is requested from the ``nuccore`` database
    in FASTA text format and parsed as a single record. On any failure a
    warning is logged and both fields are set to None, so the caller still
    gets a usable ``result``.

    :param result: Dictionary to update in place.
    :param accession: Accession to fetch.
    :return: The same ``result`` dictionary.
    """
    try:
        with Entrez.efetch(
            db="nuccore", id=accession, rettype="fasta", retmode="text"
        ) as fasta_handle:
            seq = SeqIO.read(fasta_handle, "fasta").seq
        result.update(sequence=str(seq), sequence_length=len(seq))
    except Exception as fasta_exc:
        # Best-effort: a missing sequence must not abort the gene lookup.
        logger.warning(
            "Failed to extract sequence from accession %s using FASTA format: %s",
            accession, fasta_exc
        )
        result.update(sequence=None, sequence_length=None)
    return result
|
| 259 |
+
|
| 260 |
try:
|
| 261 |
with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as handle:
|
| 262 |
gene_record = Entrez.read(handle)
|
|
|
|
| 265 |
|
| 266 |
result = self._gene_record_to_dict(gene_record, gene_id)
|
| 267 |
if accession := (preferred_accession or result.get("_representative_accession")):
|
| 268 |
+
result = _extract_metadata_from_genbank(result, accession)
|
| 269 |
+
result = _extract_sequence_from_fasta(result, accession)
|
| 270 |
|
| 271 |
result.pop("_representative_accession", None)
|
| 272 |
return result
|