Spaces:
Runtime error
Runtime error
| from deepdoc.parser import RAGFlowPdfParser, PlainParser | |
| import os | |
| from PIL import Image | |
| import json | |
class PDFprocess:
    """Parse a PDF and dump its text and table/figure results to disk.

    Outputs are written under ``<workdir>/preprocess``: a ``.txt`` file with
    the extracted text, an ``.html`` file and a ``.json`` file describing the
    table/figure entries, plus one PNG per entry.
    """

    def __init__(self, repodir, workdir):
        """Set up the input/output locations and the PDF parser.

        Args:
            repodir: Directory that contains the PDF files to process.
            workdir: Base working directory; results are written to its
                ``preprocess`` sub-directory.
        """
        self.repodir = repodir
        self.preprocessdir = os.path.join(workdir, 'preprocess')
        self.pdf_parser = RAGFlowPdfParser()

    def _save_image(self, image, path, name):
        """Save ``image`` as ``name`` inside ``path`` and return the file path.

        Args:
            image: Object exposing ``save(path)`` (presumably a PIL image --
                confirm against the parser's output).
            path: Target directory; created if it does not exist.
            name: File name to use inside ``path``.

        Returns:
            The full path the image was saved to.
        """
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(path, exist_ok=True)
        image_path = os.path.join(path, name)
        image.save(image_path)
        return image_path

    def save_all_image(self, preprocessdir, tables):
        """Save every entry's image and swap the image object for its path.

        Args:
            preprocessdir: Directory under which a ``saved_images`` folder is
                created for the PNG files.
            tables: List of ``(image, text)`` pairs. It is updated in place so
                each entry becomes ``(image_path, text)``.

        Returns:
            The same ``tables`` list, after the in-place update.
        """
        image_folder = os.path.join(preprocessdir, 'saved_images')
        # NOTE(review): assumes every entry is an (image, text) pair -- confirm
        # against what the parser actually returns.
        for index, (image, text) in enumerate(tables):
            image_path = self._save_image(image, image_folder, f'image_{index}.png')
            # Replace the image object with the path it was written to.
            tables[index] = (image_path, text)
        return tables

    def create_html_file(self, tables, html_file_path):
        """Write an HTML page with one ``<img>`` tag plus text per entry.

        Args:
            tables: Iterable of ``(image_path, text)`` pairs. ``text`` is
                inserted verbatim (not escaped).
            html_file_path: Destination path of the HTML file.
        """
        parts = ['<html><body>\n']
        parts.extend(
            f'<img src="{image_path}" alt="Image">\n{text}\n'
            for image_path, text in tables
        )
        parts.append('</body></html>')
        # Explicit UTF-8 so non-ASCII text does not depend on the locale default.
        with open(html_file_path, 'w', encoding='utf-8') as file:
            file.write(''.join(parts))

    def process(self, pdffilename):
        """Parse one PDF from ``repodir`` and write all derived files.

        Writes ``<stem>.txt``, ``<stem>.html`` and ``<stem>.json`` into the
        preprocess directory, and the images into
        ``<pdffilename>_images/saved_images`` below it.

        Args:
            pdffilename: File name of the PDF, relative to ``repodir``.
        """
        pdf_file_path = os.path.join(self.repodir, pdffilename)
        # NOTE(review): the parser is called with need_image=False, yet the
        # second result is later unpacked as (image, text) pairs -- confirm
        # the parser's return shape for this flag.
        text_content, tables = self.pdf_parser(
            pdf_file_path, need_image=False, zoomin=3, return_html=True
        )

        # Strip only the real extension: str.replace('.pdf', ...) would rewrite
        # every '.pdf' substring and leave names such as 'X.PDF' unchanged, so
        # the three outputs would overwrite one another.
        output_base = os.path.join(self.preprocessdir, os.path.splitext(pdffilename)[0])
        # The output directory is not guaranteed to exist before the first write.
        os.makedirs(os.path.dirname(output_base), exist_ok=True)

        with open(output_base + '.txt', 'w', encoding='utf-8') as f:
            f.write(text_content)

        image_folder = os.path.join(self.preprocessdir, f'{pdffilename}_images')
        tables = self.save_all_image(image_folder, tables)

        self.create_html_file(tables, output_base + '.html')

        # ensure_ascii=False emits raw non-ASCII characters, hence UTF-8 here.
        with open(output_base + '.json', 'w', encoding='utf-8') as f:
            json.dump(tables, f, indent=4, ensure_ascii=False)
if __name__ == '__main__':
    # Ad-hoc local run: parse 'test.pdf' from the repository directory.
    # PDFprocess appends 'preprocess' to the work directory it is given, so
    # with this value the results land in '.../preprocess/preprocess'.
    repo_root = '/Users/chen/Downloads/ReviewAgent2'
    work_root = '/Users/chen/Downloads/ReviewAgent2/preprocess'
    PDFprocess(repo_root, work_root).process('test.pdf')