Spaces:

CYF200127
/

ChemEagle_API

Paused

App Files Community

CYF200127 committed on May 17

Commit

565cbbc

verified ·

1 Parent(s): 5d415f0

Update chemietoolkit/interface.py

Browse files

Files changed (1) hide show

chemietoolkit/interface.py +2 -101

chemietoolkit/interface.py CHANGED Viewed

@@ -8,7 +8,6 @@ from huggingface_hub import hf_hub_download, snapshot_download
 from molscribe import MolScribe
 from rxnscribe import RxnScribe, MolDetect
 from chemiener import ChemNER
-from .chemrxnextractor import ChemRxnExtractor
 from .tableextractor import TableExtractor
 from .utils import *
@@ -23,7 +22,6 @@ class ChemIEToolkit:
         self._rxnscribe = None
         self._pdfparser = None
         self._moldet = None
-        self._chemrxnextractor = None
         self._chemner = None
         self._coref = None
@@ -116,22 +114,8 @@ class ChemIEToolkit:
         self._coref = MolDetect(ckpt_path, device=self.device, coref=True)
-    @property
-    def chemrxnextractor(self):
-        if self._chemrxnextractor is None:
-            self.init_chemrxnextractor()
-        return self._chemrxnextractor
-    @lru_cache(maxsize=None)
-    def init_chemrxnextractor(self, ckpt_path=None):
-        """
-        Set model to custom checkpoint
-        Parameters:
-            ckpt_path: path to checkpoint to use, if None then will use default
-        """
-        if ckpt_path is None:
-            ckpt_path = snapshot_download(repo_id="amberwang/chemrxnextractor-training-modules")
-        self._chemrxnextractor = ChemRxnExtractor("", None, ckpt_path, self.device.type)
     @property
@@ -505,85 +489,7 @@ class ChemIEToolkit:
             results.append(data)
         return results
-    def extract_molecules_from_text_in_pdf(self, pdf, batch_size=16, num_pages=None):
-        """
-        Get molecules in text of given pdf
-        Parameters:
-            pdf: path to pdf, or byte file
-            batch_size: batch size for inference in all models
-            num_pages: process only first `num_pages` pages, if `None` then process all
-        Returns:
-            list of sentences and found molecules in the following format
-            [
-                {
-                    'molecules': [
-                        { # first paragraph
-                            'text': str,
-                            'labels': [
-                                (str, int, int), # tuple of label, range start (inclusive), range end (exclusive)
-                                # more labels
-                            ]
-                        },
-                        # more paragraphs
-                    ]
-                    'page': int
-                },
-                # more pages
-            ]
-        """
-        self.chemrxnextractor.set_pdf_file(pdf)
-        self.chemrxnextractor.set_pages(num_pages)
-        text = self.chemrxnextractor.get_paragraphs_from_pdf(num_pages)
-        result = []
-        for data in text:
-            model_inp = []
-            for paragraph in data['paragraphs']:
-                model_inp.append(' '.join(paragraph).replace('\n', ''))
-            output = self.chemner.predict_strings(model_inp, batch_size=batch_size)
-            to_add = {
-                'molecules': [{
-                    'text': t,
-                    'labels': labels,
-                    } for t, labels in zip(model_inp, output)],
-                'page': data['page']
-            }
-            result.append(to_add)
-        return result
-    def extract_reactions_from_text_in_pdf(self, pdf, num_pages=None):
-        """
-        Get reaction information from text in pdf
-        Parameters:
-            pdf: path to pdf
-            num_pages: process only first `num_pages` pages, if `None` then process all
-        Returns:
-            list of pages and corresponding reaction info in the following format
-            [
-                {
-                    'page': page number
-                    'reactions': [
-                        {
-                            'tokens': list of words in relevant sentence,
-                            'reactions' : [
-                                {
-                                    # key, value pairs where key is the label and value is a tuple
-                                    # or list of tuples of the form (tokens, start index, end index)
-                                    # where indices are for the corresponding token list and start and end are inclusive
-                                }
-                                # more reactions
-                            ]
-                        }
-                        # more reactions in other sentences
-                    ]
-                },
-                # more pages
-            ]
-        """
-        self.chemrxnextractor.set_pdf_file(pdf)
-        self.chemrxnextractor.set_pages(num_pages)
-        return self.chemrxnextractor.extract_reactions_from_text()
     def extract_reactions_from_text_in_pdf_combined(self, pdf, num_pages=None):
         """
@@ -735,15 +641,10 @@ class ChemIEToolkit:
         images = [figure['figure']['image'] for figure in figures]
         results = self.extract_reactions_from_figures(images, batch_size=batch_size, molscribe=True, ocr=True)
         table_expanded_results = process_tables(figures, results, self.molscribe, batch_size=batch_size)
-        text_results = self.extract_reactions_from_text_in_pdf(pdf, num_pages=num_pages)
         results_coref = self.extract_molecule_corefs_from_figures_in_pdf(pdf, num_pages=num_pages)
         figure_results = replace_rgroups_in_figure(figures, table_expanded_results, results_coref, self.molscribe, batch_size=batch_size)
         table_expanded_results = expand_reactions_with_backout(figure_results, results_coref, self.molscribe)
-        coref_expanded_results = associate_corefs(text_results, results_coref)
         return {
             'figures': table_expanded_results,
-            'text': coref_expanded_results,
         }
-if __name__=="__main__":
-    model = OpenChemIE()

 from molscribe import MolScribe
 from rxnscribe import RxnScribe, MolDetect
 from chemiener import ChemNER
 from .tableextractor import TableExtractor
 from .utils import *
         self._rxnscribe = None
         self._pdfparser = None
         self._moldet = None
         self._chemner = None
         self._coref = None
         self._coref = MolDetect(ckpt_path, device=self.device, coref=True)
     @property
             results.append(data)
         return results
     def extract_reactions_from_text_in_pdf_combined(self, pdf, num_pages=None):
         """
         images = [figure['figure']['image'] for figure in figures]
         results = self.extract_reactions_from_figures(images, batch_size=batch_size, molscribe=True, ocr=True)
         table_expanded_results = process_tables(figures, results, self.molscribe, batch_size=batch_size)
         results_coref = self.extract_molecule_corefs_from_figures_in_pdf(pdf, num_pages=num_pages)
         figure_results = replace_rgroups_in_figure(figures, table_expanded_results, results_coref, self.molscribe, batch_size=batch_size)
         table_expanded_results = expand_reactions_with_backout(figure_results, results_coref, self.molscribe)
         return {
             'figures': table_expanded_results,
         }