Spaces:
No application file
No application file
| import requests | |
| import time | |
| import json | |
| import os | |
| from dotenv import load_dotenv | |
| load_dotenv() | |
| HEADERS = { | |
| 'app_id': os.environ.get('MATHPIX_APP_ID', 'default_app_id'), | |
| 'app_key': os.environ.get('MATHPIX_APP_KEY', 'default_app_key') | |
| } | |
| def extract_text(file_path: str) -> str: | |
| print("Parsing resume") | |
| if not os.path.exists(file_path): | |
| raise FileNotFoundError(f"The file at {file_path} does not exist.") | |
| file_name = os.path.basename(file_path) | |
| url1 = 'https://api.mathpix.com/v3/pdf' | |
| with open(file_path, 'rb') as file: | |
| files = {'file': file} | |
| data = {'options_json': json.dumps({ | |
| "conversion_formats": {"md": True}, | |
| "math_inline_delimiters": ["$", "$"], | |
| "rm_spaces": True | |
| })} | |
| status_resp = requests.post(url1, headers=HEADERS, files=files, data=data) | |
| if status_resp.status_code != 200: | |
| raise Exception(f"Failed to upload PDF: {status_resp.text}") | |
| status_resp_data = status_resp.json() | |
| pdf_id = status_resp_data.get('pdf_id') | |
| if not pdf_id: | |
| raise Exception("Failed to retrieve PDF ID from response.") | |
| time.sleep(1) | |
| url2 = f'https://api.mathpix.com/v3/pdf/{pdf_id}' | |
| while True: | |
| challenge_resp = requests.get(url2, headers=HEADERS) | |
| challenge_resp_data = challenge_resp.json() | |
| if challenge_resp_data.get('status') == 'completed': | |
| break | |
| time.sleep(1) | |
| url3 = f'https://api.mathpix.com/v3/pdf/{pdf_id}.mmd' | |
| contents = requests.get(url3, headers=HEADERS) | |
| if contents.status_code != 200: | |
| raise Exception(f"Failed to download converted file: {contents.text}") | |
| open(os.path.join(os.getcwd(),"resume_mmds", (str(file_name)+'.mmd')),"w").write(contents.text) | |
| return contents.text | |