yazoniak committed on
Commit b02839d · verified · 1 Parent(s): 061ca04

Update app.py

Files changed (1)
  1. app.py +88 -46
app.py CHANGED

@@ -28,9 +28,10 @@ import os
 import re
 import spaces
 from datetime import datetime
-from datasets import Dataset, load_dataset
-from huggingface_hub import HfApi
+from datasets import Dataset
+from huggingface_hub import HfApi, hf_hub_download, list_repo_files
 import pandas as pd
+import tempfile
 
 
 # Model configuration
@@ -64,6 +65,9 @@ class HFDatasetLogger:
 
     This provides persistent storage across space restarts by storing data
     directly to a HuggingFace dataset repository.
+
+    Uses direct parquet file download via hf_hub_download to bypass
+    any caching issues with load_dataset.
     """
 
     def __init__(self, dataset_name: str, hf_token: str, private: bool = True):
@@ -80,14 +84,66 @@ class HFDatasetLogger:
         self.private = private
         self.api = HfApi()
         self.dataset_exists = False
+        self.parquet_filename = None
 
-        # Check if dataset exists
+        # Check if dataset exists by listing files in the repo
         try:
-            load_dataset(dataset_name, split="train", token=hf_token, streaming=True)
-            self.dataset_exists = True
-        except Exception:
+            files = list_repo_files(
+                dataset_name,
+                repo_type="dataset",
+                token=hf_token,
+            )
+            files_list = list(files)  # Convert to list to allow multiple iterations
+            print(f" Files in repo: {files_list}")
+
+            # Find the parquet file(s)
+            parquet_files = [f for f in files_list if f.endswith(".parquet")]
+            if parquet_files:
+                # Use the first parquet file (could be at root or in data/ folder)
+                self.parquet_filename = parquet_files[0]
+                self.dataset_exists = True
+                print(f" ✓ Found existing parquet file: {self.parquet_filename}")
+            else:
+                print(f" No parquet files found in dataset repo (files: {files_list})")
+        except Exception as e:
+            print(f" Dataset repo not found or error: {type(e).__name__}: {e}")
             self.dataset_exists = False
 
+    def _download_existing_data(self) -> pd.DataFrame | None:
+        """
+        Download existing parquet data directly using hf_hub_download.
+
+        Uses force_download=True to bypass all caching.
+
+        Returns:
+            DataFrame with existing data, or None if download fails
+        """
+        if not self.parquet_filename:
+            print(" No parquet filename set, cannot download")
+            return None
+
+        try:
+            print(f" Downloading parquet file: {self.parquet_filename}")
+            # Create a unique temp directory for each download to avoid caching
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                local_path = hf_hub_download(
+                    repo_id=self.dataset_name,
+                    filename=self.parquet_filename,
+                    repo_type="dataset",
+                    token=self.hf_token,
+                    force_download=True,  # Force fresh download, bypass cache
+                    local_dir=tmp_dir,
+                )
+                print(f" Downloaded to: {local_path}")
+                df = pd.read_parquet(local_path)
+                print(f" ✓ Loaded existing data: {len(df)} rows")
+                return df
+        except Exception as e:
+            print(f" ✗ Error downloading existing data: {type(e).__name__}: {e}")
+            import traceback
+            traceback.print_exc()
+            return None
+
     def log(
         self,
         text: str,
@@ -100,8 +156,8 @@ class HFDatasetLogger:
         """
         Log a prediction to the HuggingFace dataset.
 
-        Uses pandas DataFrame as intermediate format to ensure proper
-        parquet compatibility when appending to existing datasets.
+        Downloads existing parquet directly (bypassing load_dataset cache),
+        appends new row, and pushes combined data back to Hub.
 
         Args:
            text: Input text
@@ -124,46 +180,31 @@ class HFDatasetLogger:
             }])
 
             if self.dataset_exists:
-                # Append to existing dataset
-                try:
-                    # Download existing dataset and convert to pandas
-                    existing_dataset = load_dataset(
-                        self.dataset_name,
-                        split="train",
-                        token=self.hf_token,
-                        download_mode="force_redownload",
-                    )
-                    existing_df = existing_dataset.to_pandas()
-
+                # Download existing data directly from parquet file
+                existing_df = self._download_existing_data()
+
+                if existing_df is not None and len(existing_df) > 0:
                     # Concatenate DataFrames
                     combined_df = pd.concat([existing_df, new_row], ignore_index=True)
-
-                    # Convert back to Dataset and push
-                    combined_dataset = Dataset.from_pandas(combined_df)
-                    combined_dataset.push_to_hub(
-                        self.dataset_name,
-                        token=self.hf_token,
-                        private=self.private,
-                        commit_message=f"Add prediction at {datetime.utcnow().isoformat()}",
-                    )
-                    print(f"✓ Appended prediction (total rows: {len(combined_df)})")
-
-                except FileNotFoundError:
-                    # Dataset doesn't exist yet despite our check - create it
-                    print("⚠ Dataset not found, creating new dataset")
-                    new_dataset = Dataset.from_pandas(new_row)
-                    new_dataset.push_to_hub(
-                        self.dataset_name,
-                        token=self.hf_token,
-                        private=self.private,
-                    )
-                    self.dataset_exists = True
-                except Exception as e:
-                    # For any other error, DO NOT fall back to push_to_hub
-                    # as that would REPLACE the entire dataset with just the new entry!
-                    print(f"⚠ Error appending to dataset (data not saved): {e}")
-                    import traceback
-                    traceback.print_exc()
+                    print(f" Combining {len(existing_df)} existing + 1 new = {len(combined_df)} rows")
+                else:
+                    # No existing data or download failed, use just the new row
+                    combined_df = new_row
+                    print(" No existing data found, starting fresh")
+
+                # Convert to Dataset and push
+                combined_dataset = Dataset.from_pandas(combined_df)
+                combined_dataset.push_to_hub(
+                    self.dataset_name,
+                    token=self.hf_token,
+                    private=self.private,
+                    commit_message=f"Add prediction at {datetime.utcnow().isoformat()}",
+                )
+                print(f"✓ Pushed dataset with {len(combined_df)} total rows")
+
+                # Update parquet filename if this was the first push
+                if not self.parquet_filename:
+                    self.parquet_filename = "data/train-00000-of-00001.parquet"
             else:
                 # Create new dataset
                 new_dataset = Dataset.from_pandas(new_row)
@@ -173,6 +214,7 @@ class HFDatasetLogger:
                     private=self.private,
                 )
                 self.dataset_exists = True
+                self.parquet_filename = "data/train-00000-of-00001.parquet"
                 print("✓ Created new dataset with first prediction")
 
         except Exception as e:
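
For readers following the change, here is a minimal standalone sketch of the append pattern the updated logger now uses: locate the dataset's parquet shard with list_repo_files, download it directly with hf_hub_download (bypassing the datasets cache), append a row with pandas, and push the combined table back with Dataset.push_to_hub. The repo id, token, and logged fields below are placeholders, not values taken from app.py.

import tempfile
from datetime import datetime

import pandas as pd
from datasets import Dataset
from huggingface_hub import hf_hub_download, list_repo_files

REPO_ID = "your-username/prediction-logs"  # hypothetical dataset repo
HF_TOKEN = "hf_..."                        # hypothetical token with write access

# Find the parquet shard that push_to_hub created (usually under data/).
files = list_repo_files(REPO_ID, repo_type="dataset", token=HF_TOKEN)
parquet_files = [f for f in files if f.endswith(".parquet")]

new_row = pd.DataFrame([{"text": "example input", "timestamp": datetime.utcnow().isoformat()}])

if parquet_files:
    # Fresh download into a throwaway directory so no cached copy is reused.
    with tempfile.TemporaryDirectory() as tmp_dir:
        local_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=parquet_files[0],
            repo_type="dataset",
            token=HF_TOKEN,
            force_download=True,
            local_dir=tmp_dir,
        )
        existing_df = pd.read_parquet(local_path)
    combined_df = pd.concat([existing_df, new_row], ignore_index=True)
else:
    combined_df = new_row

# Push the full table back; push_to_hub rewrites the dataset with all rows.
Dataset.from_pandas(combined_df).push_to_hub(REPO_ID, token=HF_TOKEN, private=True)

Because push_to_hub replaces the dataset contents rather than appending, the full table has to be downloaded and re-uploaded on every write, which is also why the old code warned against falling back to push_to_hub on error.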