Spaces:
Runtime error
Runtime error
| """ | |
Data Collectors Module

Provides collector classes that load training data from plain-text, JSON, and CSV files and from HTTP API endpoints.
| """ | |
| import json | |
| from pathlib import Path | |
| from typing import List, Dict, Any, Optional | |
| from abc import ABC, abstractmethod | |
class DataCollector(ABC):
    """Abstract base class for data collectors.

    Concrete collectors subclass this and implement :meth:`collect`.
    The class cannot be instantiated directly, because ``collect`` is
    abstract.
    """

    @abstractmethod
    def collect(self) -> List[Dict[str, Any]]:
        """
        Collect data from source.

        Returns:
            List of data examples
        """
class TextDataCollector(DataCollector):
    """Line-oriented collector that turns a text file into examples."""

    def __init__(self, file_path: str):
        """
        Set up the collector for a single text file.

        Args:
            file_path: Location of the text file to read
        """
        self.file_path = Path(file_path)

    def collect(self) -> List[Dict[str, Any]]:
        """
        Read the file and emit one example per non-blank line.

        Each example holds the zero-based index of the line in the file
        (blank lines are skipped but still counted), the line's text with
        surrounding whitespace removed, and the source path as a string.

        Returns:
            List of text examples

        Raises:
            FileNotFoundError: If the configured path does not exist
        """
        path = self.file_path
        if not path.exists():
            raise FileNotFoundError(f"File not found: {path}")

        with open(path, 'r', encoding='utf-8') as handle:
            raw_lines = handle.readlines()

        origin = str(path)
        # Strip first, then enumerate, so ids stay equal to the original
        # line positions even though blank lines are filtered out.
        cleaned = (raw.strip() for raw in raw_lines)
        return [
            {"id": position, "text": text, "source": origin}
            for position, text in enumerate(cleaned)
            if text
        ]
class JSONDataCollector(DataCollector):
    """Collect data from JSON files."""

    def __init__(self, file_path: str):
        """
        Initialize JSON data collector.

        Args:
            file_path: Path to JSON file
        """
        self.file_path = Path(file_path)

    def collect(self) -> List[Dict[str, Any]]:
        """
        Collect data from JSON file.

        Expects format:
            [
                {"instruction": "...", "input": "...", "output": "..."},
                ...
            ]

        A top-level object is also accepted: the value under its "data"
        key (or, failing that, its "examples" key) is returned; an object
        with neither key is treated as a single example and wrapped in a
        list. Any other top-level value (string, number, boolean, null)
        contains no examples and yields an empty list.

        Returns:
            List of data examples

        Raises:
            FileNotFoundError: If the configured path does not exist
        """
        if not self.file_path.exists():
            raise FileNotFoundError(f"File not found: {self.file_path}")

        with open(self.file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if isinstance(data, list):
            return data

        if isinstance(data, dict):
            # Look for the examples under common wrapper keys, in priority order.
            for key in ("data", "examples"):
                if key in data:
                    return data[key]
            # Wrap single example in list
            return [data]

        # Scalars and null are valid JSON but are not examples; return an
        # empty list rather than leaking a non-list value to callers.
        return []
class APIDataCollector(DataCollector):
    """Collect data from API endpoints."""

    def __init__(
        self,
        api_url: str,
        headers: Optional[Dict] = None,
        timeout: Optional[float] = 30.0,
    ):
        """
        Initialize API data collector.

        Args:
            api_url: API endpoint URL
            headers: Optional HTTP headers
            timeout: Seconds to wait for the server before giving up.
                Pass None to wait indefinitely.
        """
        self.api_url = api_url
        self.headers = headers or {}
        self.timeout = timeout

    def collect(self) -> List[Dict[str, Any]]:
        """
        Collect data from API.

        Issues a GET request to the configured URL and normalizes the JSON
        response: a top-level list is returned as-is; for a top-level
        object the value under the first of "data", "results", "items" or
        "examples" that is present is returned, otherwise the object itself
        is wrapped in a list. Any other JSON value yields an empty list.

        Returns:
            List of data examples

        Raises:
            requests.HTTPError: If the server answers with an error status
            requests.Timeout: If the server does not respond within
                ``timeout`` seconds
        """
        # Imported lazily so the module can be used without requests
        # installed as long as this collector is never run.
        import requests

        # Without an explicit timeout, requests waits forever on a
        # stalled connection.
        response = requests.get(
            self.api_url,
            headers=self.headers,
            timeout=self.timeout,
        )
        response.raise_for_status()
        data = response.json()

        # Handle different response formats
        if isinstance(data, list):
            return data
        if isinstance(data, dict):
            # Try common keys
            for key in ("data", "results", "items", "examples"):
                if key in data:
                    return data[key]
            # Otherwise wrap in list
            return [data]
        return []
class CSVDataCollector(DataCollector):
    """Collect data from CSV files."""

    def __init__(self, file_path: str):
        """
        Initialize CSV data collector.

        Args:
            file_path: Path to CSV file
        """
        self.file_path = Path(file_path)

    def collect(self) -> List[Dict[str, Any]]:
        """
        Collect data from CSV file.

        The first row is used as the header; every following row becomes
        a dict keyed by the header names.

        Returns:
            List of data examples (dict per row)

        Raises:
            FileNotFoundError: If the configured path does not exist
        """
        import csv

        if not self.file_path.exists():
            raise FileNotFoundError(f"File not found: {self.file_path}")

        # newline='' hands line-ending handling to the csv module; without
        # it, newlines embedded in quoted fields are translated.
        with open(self.file_path, 'r', encoding='utf-8', newline='') as f:
            return [dict(row) for row in csv.DictReader(f)]