Spaces:
Runtime error
Runtime error
File size: 4,528 Bytes
ec8f374 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
"""
Data Collectors Module
Provides various data collection methods for training data.
"""
import json
from pathlib import Path
from typing import List, Dict, Any, Optional
from abc import ABC, abstractmethod
class DataCollector(ABC):
    """Abstract interface shared by every data collector.

    Concrete collectors subclass this and implement :meth:`collect`;
    the class itself cannot be instantiated because ``collect`` is abstract.
    """

    @abstractmethod
    def collect(self) -> List[Dict[str, Any]]:
        """
        Gather examples from the underlying source.

        Returns:
            List of data examples, one dict per example. This base
            implementation has no body of its own and yields ``None``
            if invoked directly.
        """
class TextDataCollector(DataCollector):
    """Collector that turns each non-blank line of a text file into an example."""

    def __init__(self, file_path: str):
        """
        Remember which text file to read.

        Args:
            file_path: Path to text file
        """
        self.file_path = Path(file_path)

    def collect(self) -> List[Dict[str, Any]]:
        """
        Read the text file and emit one example per non-blank line.

        Each example carries the zero-based position of the line in the
        file as ``id`` (blank lines are skipped but still counted), the
        whitespace-stripped line as ``text``, and the file path as
        ``source``.

        Returns:
            List of text examples

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        if not self.file_path.exists():
            raise FileNotFoundError(f"File not found: {self.file_path}")

        origin = str(self.file_path)
        with open(self.file_path, 'r', encoding='utf-8') as handle:
            stripped_lines = [raw.strip() for raw in handle]

        # Enumerate over *all* lines so ids keep matching file positions
        # even when blank lines are dropped.
        return [
            {"id": position, "text": content, "source": origin}
            for position, content in enumerate(stripped_lines)
            if content
        ]
class JSONDataCollector(DataCollector):
    """Collect data from JSON files."""

    def __init__(self, file_path: str):
        """
        Initialize JSON data collector.

        Args:
            file_path: Path to JSON file
        """
        self.file_path = Path(file_path)

    def collect(self) -> List[Dict[str, Any]]:
        """
        Collect data from JSON file.

        Expects format:
            [
                {"instruction": "...", "input": "...", "output": "..."},
                ...
            ]

        A top-level object is also accepted: if its ``"data"`` or
        ``"examples"`` key (checked in that order) holds a list, that list
        is returned; otherwise the object itself is treated as a single
        example. Any other top-level JSON value (string, number, boolean,
        null) yields an empty list, so the result is always a list.

        Returns:
            List of data examples

        Raises:
            FileNotFoundError: If the file does not exist.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        if not self.file_path.exists():
            raise FileNotFoundError(f"File not found: {self.file_path}")

        with open(self.file_path, 'r', encoding='utf-8') as f:
            payload = json.load(f)

        if isinstance(payload, list):
            return payload

        if isinstance(payload, dict):
            # Only unwrap a container key when it really holds a list; a
            # single example may legitimately have a scalar "data" field,
            # and returning that scalar would break the list contract.
            for key in ("data", "examples"):
                if isinstance(payload.get(key), list):
                    return payload[key]
            # Wrap single example in list
            return [payload]

        # Scalars / null carry no examples.
        return []
class APIDataCollector(DataCollector):
    """Collect data from API endpoints."""

    # Wrapper keys probed, in this order, when the response body is a JSON object.
    _WRAPPER_KEYS = ("data", "results", "items", "examples")

    def __init__(
        self,
        api_url: str,
        headers: Optional[Dict] = None,
        timeout: Optional[float] = 30.0,
    ):
        """
        Initialize API data collector.

        Args:
            api_url: API endpoint URL
            headers: Optional HTTP headers
            timeout: Seconds to wait for the server before giving up.
                Pass ``None`` to wait indefinitely.
        """
        self.api_url = api_url
        self.headers = headers or {}
        self.timeout = timeout

    def collect(self) -> List[Dict[str, Any]]:
        """
        Collect data from API.

        Issues a GET request to ``api_url`` with the configured headers,
        raises on an HTTP error status, and normalizes the decoded JSON
        body into a list of examples.

        Returns:
            List of data examples

        Raises:
            requests.HTTPError: If the response status indicates an error
                (via ``raise_for_status``).
        """
        # Imported lazily so the module can be used without ``requests``
        # installed unless this collector is actually run.
        import requests

        # Without a timeout ``requests`` may block forever on a stalled server.
        response = requests.get(
            self.api_url, headers=self.headers, timeout=self.timeout
        )
        response.raise_for_status()
        return self._extract_examples(response.json())

    @classmethod
    def _extract_examples(cls, payload: Any) -> List[Dict[str, Any]]:
        """Normalize a decoded JSON payload into a list of examples."""
        # Handle different response formats
        if isinstance(payload, list):
            return payload

        if isinstance(payload, dict):
            # Try common keys, but only unwrap values that are lists so the
            # caller never receives a bare scalar or dict.
            for key in cls._WRAPPER_KEYS:
                if isinstance(payload.get(key), list):
                    return payload[key]
            # Otherwise wrap in list
            return [payload]

        return []
class CSVDataCollector(DataCollector):
    """Collect data from CSV files."""

    def __init__(self, file_path: str):
        """
        Initialize CSV data collector.

        Args:
            file_path: Path to CSV file
        """
        self.file_path = Path(file_path)

    def collect(self) -> List[Dict[str, Any]]:
        """
        Collect data from CSV file.

        The first row is used as the header; every following row becomes
        a dict keyed by those header names.

        Returns:
            List of data examples (dict per row)

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        import csv

        if not self.file_path.exists():
            raise FileNotFoundError(f"File not found: {self.file_path}")

        # newline='' hands line-ending handling to the csv module; without
        # it, universal-newline translation rewrites "\r\n" sequences that
        # appear inside quoted fields.
        with open(self.file_path, 'r', encoding='utf-8', newline='') as f:
            return [dict(row) for row in csv.DictReader(f)]
|