Spaces:
Paused
Paused
Update chemietoolkit/interface.py
Browse files
- chemietoolkit/interface.py +2 -101
chemietoolkit/interface.py
CHANGED
|
@@ -8,7 +8,6 @@ from huggingface_hub import hf_hub_download, snapshot_download
|
|
| 8 |
from molscribe import MolScribe
|
| 9 |
from rxnscribe import RxnScribe, MolDetect
|
| 10 |
from chemiener import ChemNER
|
| 11 |
-
from .chemrxnextractor import ChemRxnExtractor
|
| 12 |
from .tableextractor import TableExtractor
|
| 13 |
from .utils import *
|
| 14 |
|
|
@@ -23,7 +22,6 @@ class ChemIEToolkit:
|
|
| 23 |
self._rxnscribe = None
|
| 24 |
self._pdfparser = None
|
| 25 |
self._moldet = None
|
| 26 |
-
self._chemrxnextractor = None
|
| 27 |
self._chemner = None
|
| 28 |
self._coref = None
|
| 29 |
|
|
@@ -116,22 +114,8 @@ class ChemIEToolkit:
|
|
| 116 |
self._coref = MolDetect(ckpt_path, device=self.device, coref=True)
|
| 117 |
|
| 118 |
|
| 119 |
-
@property
|
| 120 |
-
def chemrxnextractor(self):
|
| 121 |
-
if self._chemrxnextractor is None:
|
| 122 |
-
self.init_chemrxnextractor()
|
| 123 |
-
return self._chemrxnextractor
|
| 124 |
|
| 125 |
-
|
| 126 |
-
def init_chemrxnextractor(self, ckpt_path=None):
|
| 127 |
-
"""
|
| 128 |
-
Set model to custom checkpoint
|
| 129 |
-
Parameters:
|
| 130 |
-
ckpt_path: path to checkpoint to use, if None then will use default
|
| 131 |
-
"""
|
| 132 |
-
if ckpt_path is None:
|
| 133 |
-
ckpt_path = snapshot_download(repo_id="amberwang/chemrxnextractor-training-modules")
|
| 134 |
-
self._chemrxnextractor = ChemRxnExtractor("", None, ckpt_path, self.device.type)
|
| 135 |
|
| 136 |
|
| 137 |
@property
|
|
@@ -505,85 +489,7 @@ class ChemIEToolkit:
|
|
| 505 |
results.append(data)
|
| 506 |
return results
|
| 507 |
|
| 508 |
-
def extract_molecules_from_text_in_pdf(self, pdf, batch_size=16, num_pages=None):
|
| 509 |
-
"""
|
| 510 |
-
Get molecules in text of given pdf
|
| 511 |
-
|
| 512 |
-
Parameters:
|
| 513 |
-
pdf: path to pdf, or byte file
|
| 514 |
-
batch_size: batch size for inference in all models
|
| 515 |
-
num_pages: process only first `num_pages` pages, if `None` then process all
|
| 516 |
-
Returns:
|
| 517 |
-
list of sentences and found molecules in the following format
|
| 518 |
-
[
|
| 519 |
-
{
|
| 520 |
-
'molecules': [
|
| 521 |
-
{ # first paragraph
|
| 522 |
-
'text': str,
|
| 523 |
-
'labels': [
|
| 524 |
-
(str, int, int), # tuple of label, range start (inclusive), range end (exclusive)
|
| 525 |
-
# more labels
|
| 526 |
-
]
|
| 527 |
-
},
|
| 528 |
-
# more paragraphs
|
| 529 |
-
]
|
| 530 |
-
'page': int
|
| 531 |
-
},
|
| 532 |
-
# more pages
|
| 533 |
-
]
|
| 534 |
-
"""
|
| 535 |
-
self.chemrxnextractor.set_pdf_file(pdf)
|
| 536 |
-
self.chemrxnextractor.set_pages(num_pages)
|
| 537 |
-
text = self.chemrxnextractor.get_paragraphs_from_pdf(num_pages)
|
| 538 |
-
result = []
|
| 539 |
-
for data in text:
|
| 540 |
-
model_inp = []
|
| 541 |
-
for paragraph in data['paragraphs']:
|
| 542 |
-
model_inp.append(' '.join(paragraph).replace('\n', ''))
|
| 543 |
-
output = self.chemner.predict_strings(model_inp, batch_size=batch_size)
|
| 544 |
-
to_add = {
|
| 545 |
-
'molecules': [{
|
| 546 |
-
'text': t,
|
| 547 |
-
'labels': labels,
|
| 548 |
-
} for t, labels in zip(model_inp, output)],
|
| 549 |
-
'page': data['page']
|
| 550 |
-
}
|
| 551 |
-
result.append(to_add)
|
| 552 |
-
return result
|
| 553 |
-
|
| 554 |
|
| 555 |
-
def extract_reactions_from_text_in_pdf(self, pdf, num_pages=None):
|
| 556 |
-
"""
|
| 557 |
-
Get reaction information from text in pdf
|
| 558 |
-
Parameters:
|
| 559 |
-
pdf: path to pdf
|
| 560 |
-
num_pages: process only first `num_pages` pages, if `None` then process all
|
| 561 |
-
Returns:
|
| 562 |
-
list of pages and corresponding reaction info in the following format
|
| 563 |
-
[
|
| 564 |
-
{
|
| 565 |
-
'page': page number
|
| 566 |
-
'reactions': [
|
| 567 |
-
{
|
| 568 |
-
'tokens': list of words in relevant sentence,
|
| 569 |
-
'reactions' : [
|
| 570 |
-
{
|
| 571 |
-
# key, value pairs where key is the label and value is a tuple
|
| 572 |
-
# or list of tuples of the form (tokens, start index, end index)
|
| 573 |
-
# where indices are for the corresponding token list and start and end are inclusive
|
| 574 |
-
}
|
| 575 |
-
# more reactions
|
| 576 |
-
]
|
| 577 |
-
}
|
| 578 |
-
# more reactions in other sentences
|
| 579 |
-
]
|
| 580 |
-
},
|
| 581 |
-
# more pages
|
| 582 |
-
]
|
| 583 |
-
"""
|
| 584 |
-
self.chemrxnextractor.set_pdf_file(pdf)
|
| 585 |
-
self.chemrxnextractor.set_pages(num_pages)
|
| 586 |
-
return self.chemrxnextractor.extract_reactions_from_text()
|
| 587 |
|
| 588 |
def extract_reactions_from_text_in_pdf_combined(self, pdf, num_pages=None):
|
| 589 |
"""
|
|
@@ -735,15 +641,10 @@ class ChemIEToolkit:
|
|
| 735 |
images = [figure['figure']['image'] for figure in figures]
|
| 736 |
results = self.extract_reactions_from_figures(images, batch_size=batch_size, molscribe=True, ocr=True)
|
| 737 |
table_expanded_results = process_tables(figures, results, self.molscribe, batch_size=batch_size)
|
| 738 |
-
text_results = self.extract_reactions_from_text_in_pdf(pdf, num_pages=num_pages)
|
| 739 |
results_coref = self.extract_molecule_corefs_from_figures_in_pdf(pdf, num_pages=num_pages)
|
| 740 |
figure_results = replace_rgroups_in_figure(figures, table_expanded_results, results_coref, self.molscribe, batch_size=batch_size)
|
| 741 |
table_expanded_results = expand_reactions_with_backout(figure_results, results_coref, self.molscribe)
|
| 742 |
-
|
| 743 |
return {
|
| 744 |
'figures': table_expanded_results,
|
| 745 |
-
'text': coref_expanded_results,
|
| 746 |
}
|
| 747 |
-
|
| 748 |
-
if __name__=="__main__":
|
| 749 |
-
model = OpenChemIE()
|
|
|
|
| 8 |
from molscribe import MolScribe
|
| 9 |
from rxnscribe import RxnScribe, MolDetect
|
| 10 |
from chemiener import ChemNER
|
|
|
|
| 11 |
from .tableextractor import TableExtractor
|
| 12 |
from .utils import *
|
| 13 |
|
|
|
|
| 22 |
self._rxnscribe = None
|
| 23 |
self._pdfparser = None
|
| 24 |
self._moldet = None
|
|
|
|
| 25 |
self._chemner = None
|
| 26 |
self._coref = None
|
| 27 |
|
|
|
|
| 114 |
self._coref = MolDetect(ckpt_path, device=self.device, coref=True)
|
| 115 |
|
| 116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
|
| 118 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
|
| 121 |
@property
|
|
|
|
| 489 |
results.append(data)
|
| 490 |
return results
|
| 491 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 492 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 493 |
|
| 494 |
def extract_reactions_from_text_in_pdf_combined(self, pdf, num_pages=None):
|
| 495 |
"""
|
|
|
|
| 641 |
images = [figure['figure']['image'] for figure in figures]
|
| 642 |
results = self.extract_reactions_from_figures(images, batch_size=batch_size, molscribe=True, ocr=True)
|
| 643 |
table_expanded_results = process_tables(figures, results, self.molscribe, batch_size=batch_size)
|
|
|
|
| 644 |
results_coref = self.extract_molecule_corefs_from_figures_in_pdf(pdf, num_pages=num_pages)
|
| 645 |
figure_results = replace_rgroups_in_figure(figures, table_expanded_results, results_coref, self.molscribe, batch_size=batch_size)
|
| 646 |
table_expanded_results = expand_reactions_with_backout(figure_results, results_coref, self.molscribe)
|
| 647 |
+
|
| 648 |
return {
|
| 649 |
'figures': table_expanded_results,
|
|
|
|
| 650 |
}
|
|
|
|
|
|
|
|
|