Spaces:
Running
Running
github-actions[bot]
committed on
Commit
·
65b9482
1
Parent(s):
e84bc8e
Auto-sync from demo at Tue Nov 4 07:20:05 UTC 2025
Browse files
graphgen/models/search/db/uniprot_search.py
CHANGED
|
@@ -1,61 +1,118 @@
|
|
| 1 |
-
import
|
| 2 |
-
from
|
| 3 |
|
| 4 |
-
from
|
|
|
|
| 5 |
|
| 6 |
-
|
| 7 |
|
| 8 |
|
| 9 |
class UniProtSearch:
|
| 10 |
"""
|
| 11 |
UniProt Search client to search with UniProt.
|
| 12 |
1) Get the protein by accession number.
|
| 13 |
-
2) Search with keywords or protein names.
|
|
|
|
| 14 |
"""
|
| 15 |
|
| 16 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
"""
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
"""
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
def search(
|
| 24 |
-
self,
|
| 25 |
-
query: str,
|
| 26 |
-
*,
|
| 27 |
-
size: int = 10,
|
| 28 |
-
cursor: str = None,
|
| 29 |
-
fields: list[str] = None,
|
| 30 |
-
) -> dict:
|
| 31 |
"""
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
:
|
| 36 |
-
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
"""
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from io import StringIO
|
| 2 |
+
from typing import Dict, Optional
|
| 3 |
|
| 4 |
+
from Bio import ExPASy, SeqIO, SwissProt, UniProt
|
| 5 |
+
from Bio.Blast import NCBIWWW, NCBIXML
|
| 6 |
|
| 7 |
+
from graphgen.utils import logger
|
| 8 |
|
| 9 |
|
| 10 |
class UniProtSearch:
    """
    UniProt Search client to search with UniProt.
    1) Get the protein by accession number.
    2) Search with keywords or protein names (fuzzy search).
    3) Search with FASTA sequence (BLAST search).

    Every public lookup logs its failure and returns ``None`` instead of
    raising, so callers only need to check for ``None``.
    """

    # Prefix that marks a functional annotation among SwissProt comment lines.
    _FUNCTION_PREFIX = "FUNCTION:"

    def get_by_accession(self, accession: str) -> Optional[dict]:
        """
        Fetch a Swiss-Prot entry by accession and convert it to a dictionary.

        :param accession: The UniProt accession number (e.g. ``P01308``).
        :return: The entry as a dictionary (see ``_swissprot_to_dict``) or
            None if the entry could not be fetched or parsed.
        """
        try:
            handle = ExPASy.get_sprot_raw(accession)
            try:
                record = SwissProt.read(handle)
            finally:
                # Close the handle even when parsing fails, so it is not leaked.
                handle.close()
            return self._swissprot_to_dict(record)
        except Exception as exc:  # pylint: disable=broad-except
            logger.error("Accession %s not found: %s", accession, exc)
            return None

    @staticmethod
    def _swissprot_to_dict(record: "SwissProt.Record") -> dict:
        """
        Convert a SwissProt.Record to a dictionary.

        :param record: The parsed record; its ``comments``, ``accessions``,
            ``entry_name``, ``gene_name``, ``description``, ``organism`` and
            ``sequence`` attributes are read.
        :return: A dictionary describing the protein. ``function`` holds the
            text of every comment that starts with ``FUNCTION:``.
        """
        prefix = UniProtSearch._FUNCTION_PREFIX
        functions = [
            comment[len(prefix):].strip()
            for comment in record.comments
            if comment.startswith(prefix)
        ]
        primary_accession = record.accessions[0]

        return {
            "molecule_type": "protein",
            "database": "UniProt",
            "id": primary_accession,
            "entry_name": record.entry_name,
            "gene_names": record.gene_name,
            # Text after the last "=" of the first ";"-separated field,
            # e.g. "RecName: Full=Insulin; ..." -> "Insulin".
            "protein_name": record.description.split(";")[0].split("=")[-1],
            # Keep only the text before the first " (" in the organism line.
            "organism": record.organism.split(" (")[0],
            "sequence": str(record.sequence),
            "function": functions,
            "url": f"https://www.uniprot.org/uniprot/{primary_accession}",
        }

    @staticmethod
    def _accession_from_hit_id(hit_id: str) -> str:
        """
        Extract the bare accession from a BLAST hit id.

        Handles ids like ``sp|P01308.1|INS_HUMAN`` as well as ids where the
        ``sp`` tag is not the first field (``gi|124617|sp|P01308.1|INS_HUMAN``).
        The version suffix (``.1``) is dropped. An id without ``|`` is
        returned unchanged.
        """
        if "|" not in hit_id:
            return hit_id
        fields = hit_id.split("|")
        if "sp" in fields and fields.index("sp") + 1 < len(fields):
            versioned = fields[fields.index("sp") + 1]
        else:
            versioned = fields[1]
        return versioned.split(".")[0]

    def get_best_hit(self, keyword: str) -> Optional[Dict]:
        """
        Search UniProt with a keyword and return the best hit.
        :param keyword: The search keyword.
        :return: A dictionary containing the best hit information or None if not found.
        """
        # Treat None, empty and whitespace-only keywords alike: nothing to search.
        if not keyword or not keyword.strip():
            return None

        try:
            iterator = UniProt.search(keyword, fields=None, batch_size=1)
            hit = next(iterator, None)
            if hit is None:
                return None
            return self.get_by_accession(hit["primaryAccession"])

        except Exception as e:  # pylint: disable=broad-except
            logger.error("Keyword %s not found: %s", keyword, e)
            return None

    def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]:
        """
        Search UniProt with a FASTA sequence and return the best hit.
        :param fasta_sequence: The FASTA sequence.
        :param threshold: E-value threshold for BLAST search.
        :return: A dictionary containing the best hit information or None if not found.
        """
        try:
            # Strip first so a record preceded by blank lines/spaces is still
            # recognised as FASTA instead of being sent with its header line.
            text = fasta_sequence.strip()
            if text.startswith(">"):
                first_record = next(SeqIO.parse(StringIO(text), "fasta"))
                seq = str(first_record.seq)
            else:
                seq = text
        except Exception as e:  # pylint: disable=broad-except
            logger.error("Invalid FASTA sequence: %s", e)
            return None

        if not seq:
            logger.error("Empty FASTA sequence provided.")
            return None

        # UniProtKB/Swiss-Prot BLAST API
        try:
            result_handle = NCBIWWW.qblast(
                program="blastp",
                database="swissprot",
                sequence=seq,
                hitlist_size=1,
                expect=threshold,
            )
            try:
                blast_record = NCBIXML.read(result_handle)
            finally:
                # Release the result handle whether or not parsing succeeded.
                result_handle.close()
        except Exception as e:  # pylint: disable=broad-except
            logger.error("BLAST search failed: %s", e)
            return None

        if not blast_record.alignments:
            logger.info("No BLAST hits found for the given sequence.")
            return None

        best_alignment = blast_record.alignments[0]
        if not best_alignment.hsps:
            # An alignment without HSPs carries no E-value to compare.
            logger.info("No BLAST hits found for the given sequence.")
            return None

        best_hsp = best_alignment.hsps[0]
        if best_hsp.expect > threshold:
            logger.info("No BLAST hits below the threshold E-value.")
            return None

        accession = self._accession_from_hit_id(best_alignment.hit_id)
        return self.get_by_accession(accession)
|
graphgen/operators/search/db/__init__.py
DELETED
|
File without changes
|
graphgen/operators/search/db/search_uniprot.py
DELETED
|
File without changes
|
requirements.txt
CHANGED
|
@@ -24,5 +24,8 @@ leidenalg
|
|
| 24 |
igraph
|
| 25 |
python-louvain
|
| 26 |
|
|
|
|
|
|
|
|
|
|
| 27 |
# For visualization
|
| 28 |
matplotlib
|
|
|
|
| 24 |
igraph
|
| 25 |
python-louvain
|
| 26 |
|
| 27 |
+
# Bioinformatics
|
| 28 |
+
biopython
|
| 29 |
+
|
| 30 |
# For visualization
|
| 31 |
matplotlib
|