github-actions[bot] committed on
Commit
10ba08f
·
1 Parent(s): fa9fcab

Auto-sync from demo at Fri Dec 5 06:11:52 UTC 2025

Browse files
graphgen/configs/search_dna_config.yaml CHANGED
@@ -13,5 +13,5 @@ pipeline:
13
  email: test@example.com # NCBI requires an email address
14
  tool: GraphGen # tool name for NCBI API
15
  use_local_blast: true # whether to use local blast for DNA search
16
- local_blast_db: /your_path/refseq_241 # path to local BLAST database (without .nhr extension)
17
 
 
13
  email: test@example.com # NCBI requires an email address
14
  tool: GraphGen # tool name for NCBI API
15
  use_local_blast: true # whether to use local blast for DNA search
16
+ local_blast_db: refseq_release/refseq_release # path to local BLAST database (without .nhr extension)
17
 
graphgen/configs/search_rna_config.yaml CHANGED
@@ -11,6 +11,4 @@ pipeline:
11
  data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral
12
  rnacentral_params:
13
  use_local_blast: true # whether to use local blast for RNA search
14
- local_blast_db: /your_path/refseq_rna_241 # format: /path/to/refseq_rna_${RELEASE}
15
- # can also use DNA database with RNA sequences (if already built)
16
-
 
11
  data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral
12
  rnacentral_params:
13
  use_local_blast: true # whether to use local blast for RNA search
14
+ local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD # path to local BLAST database (without .nhr extension)
 
 
graphgen/models/searcher/db/ncbi_searcher.py CHANGED
@@ -83,6 +83,29 @@ class NCBISearch(BaseSearcher):
83
  data = data.get(key, default)
84
  return data
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
87
  """
88
  Convert an Entrez gene record to a dictionary.
@@ -120,7 +143,7 @@ class NCBISearch(BaseSearcher):
120
  else None
121
  )
122
 
123
- # Extract representative accession
124
  representative_accession = next(
125
  (
126
  product.get("Gene-commentary_accession")
@@ -129,6 +152,17 @@ class NCBISearch(BaseSearcher):
129
  ),
130
  None,
131
  )
 
 
 
 
 
 
 
 
 
 
 
132
 
133
  # Extract function
134
  function = data.get("Entrezgene_summary") or next(
@@ -169,18 +203,19 @@ class NCBISearch(BaseSearcher):
169
  "sequence": None,
170
  "sequence_length": None,
171
  "gene_id": gene_id,
172
- "molecule_type_detail": None,
 
 
173
  "_representative_accession": representative_accession,
174
  }
175
 
176
  def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]:
177
  """Get gene information by Gene ID."""
178
- def _extract_from_genbank(result: dict, accession: str):
179
- """Enrich result dictionary with sequence and summary information from accession."""
180
  with Entrez.efetch(db="nuccore", id=accession, rettype="gb", retmode="text") as handle:
181
  record = SeqIO.read(handle, "genbank")
182
- result["sequence"] = str(record.seq)
183
- result["sequence_length"] = len(record.seq)
184
  result["title"] = record.description
185
  result["molecule_type_detail"] = (
186
  "mRNA" if accession.startswith(("NM_", "XM_")) else
@@ -206,6 +241,22 @@ class NCBISearch(BaseSearcher):
206
 
207
  return result
208
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  try:
210
  with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as handle:
211
  gene_record = Entrez.read(handle)
@@ -214,7 +265,8 @@ class NCBISearch(BaseSearcher):
214
 
215
  result = self._gene_record_to_dict(gene_record, gene_id)
216
  if accession := (preferred_accession or result.get("_representative_accession")):
217
- result = _extract_from_genbank(result, accession)
 
218
 
219
  result.pop("_representative_accession", None)
220
  return result
 
83
  data = data.get(key, default)
84
  return data
85
 
86
@staticmethod
def _infer_molecule_type_detail(accession: Optional[str], gene_type: Optional[int] = None) -> Optional[str]:
    """
    Infer molecule_type_detail from an accession prefix or an Entrez gene type.

    The accession prefix takes precedence. When it is missing or not one of
    the recognised RefSeq prefixes, the Entrez gene type code is used instead.

    :param accession: Sequence accession such as "NM_000546", or None.
    :param gene_type: Entrezgene type code. Accepts an int or a numeric
        string-like value, since parsed Entrez records may deliver the code
        as text rather than as an int.
    :return: A molecule type label, or None when nothing can be inferred.
    """
    # Ordered (prefixes, label) pairs; the first matching prefix wins.
    prefix_labels = (
        (("NM_", "XM_"), "mRNA"),
        (("NC_", "NT_"), "genomic DNA"),
        (("NR_", "XR_"), "RNA"),
        (("NG_",), "genomic region"),
    )
    if accession:
        for prefixes, label in prefix_labels:
            if accession.startswith(prefixes):
                return label

    # Fallback: infer from gene type if available.
    if gene_type is None:
        return None
    try:
        type_code = int(gene_type)
    except (TypeError, ValueError):
        # Non-numeric gene type: nothing reliable can be inferred.
        return None

    # Codes follow NCBI's Entrezgene "type" enumeration. Only RNA gene classes
    # are mapped; protein-coding (6), pseudo (7), etc. intentionally yield None
    # because the gene type alone does not identify a molecule type for them.
    gene_type_map = {
        1: "tRNA",
        2: "rRNA",
        3: "snRNA",
        4: "scRNA",
        5: "snoRNA",
        9: "miscRNA",
        10: "ncRNA",
    }
    return gene_type_map.get(type_code)
108
+
109
  def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict:
110
  """
111
  Convert an Entrez gene record to a dictionary.
 
143
  else None
144
  )
145
 
146
+ # Extract representative accession (prefer type 3 = mRNA/transcript)
147
  representative_accession = next(
148
  (
149
  product.get("Gene-commentary_accession")
 
152
  ),
153
  None,
154
  )
155
+ # Fallback: if no type 3 accession, try any available accession
156
+ # This is needed for genes that don't have mRNA transcripts but have other sequence records
157
+ if not representative_accession:
158
+ representative_accession = next(
159
+ (
160
+ product.get("Gene-commentary_accession")
161
+ for product in locus.get("Gene-commentary_products", [])
162
+ if product.get("Gene-commentary_accession")
163
+ ),
164
+ None,
165
+ )
166
 
167
  # Extract function
168
  function = data.get("Entrezgene_summary") or next(
 
203
  "sequence": None,
204
  "sequence_length": None,
205
  "gene_id": gene_id,
206
+ "molecule_type_detail": self._infer_molecule_type_detail(
207
+ representative_accession, data.get("Entrezgene_type")
208
+ ),
209
  "_representative_accession": representative_accession,
210
  }
211
 
212
  def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]:
213
  """Get gene information by Gene ID."""
214
+ def _extract_metadata_from_genbank(result: dict, accession: str):
215
+ """Extract metadata from GenBank format (title, features, organism, etc.)."""
216
  with Entrez.efetch(db="nuccore", id=accession, rettype="gb", retmode="text") as handle:
217
  record = SeqIO.read(handle, "genbank")
218
+
 
219
  result["title"] = record.description
220
  result["molecule_type_detail"] = (
221
  "mRNA" if accession.startswith(("NM_", "XM_")) else
 
241
 
242
  return result
243
 
244
def _extract_sequence_from_fasta(result: dict, accession: str):
    """
    Fill in "sequence" and "sequence_length" on ``result`` from a FASTA fetch.

    The record is requested from the nuccore database in FASTA format. On any
    failure a warning is logged and both fields are set to None, so the caller
    always gets ``result`` back.

    :param result: Gene dictionary to update in place.
    :param accession: Nucleotide accession to fetch.
    :return: The same ``result`` dictionary.
    """
    sequence = None
    try:
        with Entrez.efetch(db="nuccore", id=accession, rettype="fasta", retmode="text") as fasta_handle:
            sequence = str(SeqIO.read(fasta_handle, "fasta").seq)
    except Exception as fasta_exc:
        # Best-effort: a missing sequence must not abort the gene lookup.
        logger.warning(
            "Failed to extract sequence from accession %s using FASTA format: %s",
            accession, fasta_exc
        )
        sequence = None

    result["sequence"] = sequence
    result["sequence_length"] = None if sequence is None else len(sequence)
    return result
259
+
260
  try:
261
  with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as handle:
262
  gene_record = Entrez.read(handle)
 
265
 
266
  result = self._gene_record_to_dict(gene_record, gene_id)
267
  if accession := (preferred_accession or result.get("_representative_accession")):
268
+ result = _extract_metadata_from_genbank(result, accession)
269
+ result = _extract_sequence_from_fasta(result, accession)
270
 
271
  result.pop("_representative_accession", None)
272
  return result