Spaces:
Sleeping
Sleeping
| import os | |
| from abc import ABC, abstractmethod | |
| from typing import Any, Dict, List | |
| import requests | |
| class BaseReader(ABC): | |
| """ | |
| Abstract base class for reading and processing data. | |
| """ | |
| def __init__(self, text_column: str = "content"): | |
| self.text_column = text_column | |
| def read(self, file_path: str) -> List[Dict[str, Any]]: | |
| """ | |
| Read data from the specified file path. | |
| :param file_path: Path to the input file. | |
| :return: List of dictionaries containing the data. | |
| """ | |
| def filter(data: List[dict]) -> List[dict]: | |
| """ | |
| Filter out entries with empty or missing text in the specified column. | |
| :param data: List of dictionaries containing the data. | |
| :return: Filtered list of dictionaries. | |
| """ | |
| def _image_exists(path_or_url: str, timeout: int = 3) -> bool: | |
| """ | |
| Check if an image exists at the given local path or URL. | |
| :param path_or_url: Local file path or remote URL of the image. | |
| :param timeout: Timeout for remote URL requests in seconds. | |
| :return: True if the image exists, False otherwise. | |
| """ | |
| if not path_or_url: | |
| return False | |
| if not path_or_url.startswith(("http://", "https://", "ftp://")): | |
| path = path_or_url.replace("file://", "", 1) | |
| path = os.path.abspath(path) | |
| return os.path.isfile(path) | |
| try: | |
| resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout) | |
| return resp.status_code == 200 | |
| except requests.RequestException: | |
| return False | |
| filtered_data = [] | |
| for item in data: | |
| if item.get("type") == "text": | |
| content = item.get("content", "").strip() | |
| if content: | |
| filtered_data.append(item) | |
| elif item.get("type") in ("image", "table", "equation"): | |
| img_path = item.get("img_path") | |
| if _image_exists(img_path): | |
| filtered_data.append(item) | |
| else: | |
| filtered_data.append(item) | |
| return filtered_data | |