Spaces:
Running
on
Zero
Add retry mechanism and local publication year approximation to data processing
Browse files
app.py
CHANGED
|
@@ -27,7 +27,7 @@ import colormaps
|
|
| 27 |
import matplotlib.colors as mcolors
|
| 28 |
from matplotlib.colors import Normalize
|
| 29 |
|
| 30 |
-
|
| 31 |
|
| 32 |
import opinionated # for fonts
|
| 33 |
plt.style.use("opinionated_rc")
|
|
@@ -254,15 +254,33 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
|
|
| 254 |
|
| 255 |
should_break = False
|
| 256 |
for page in query.paginate(per_page=200, n_max=None):
|
| 257 |
-
for
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
if should_break:
|
| 267 |
break
|
| 268 |
if should_break:
|
|
@@ -411,6 +429,8 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
|
|
| 411 |
export_df = records_df[['title', 'abstract', 'doi', 'publication_year', 'x', 'y','id','primary_topic']]
|
| 412 |
export_df['parsed_field'] = [get_field(row) for ix, row in export_df.iterrows()]
|
| 413 |
export_df['referenced_works'] = [', '.join(x) for x in records_df['referenced_works']]
|
|
|
|
|
|
|
| 414 |
export_df.to_csv(csv_file_path, index=False)
|
| 415 |
|
| 416 |
if download_png_checkbox:
|
|
|
|
| 27 |
import matplotlib.colors as mcolors
|
| 28 |
from matplotlib.colors import Normalize
|
| 29 |
|
| 30 |
+
import random
|
| 31 |
|
| 32 |
import opinionated # for fonts
|
| 33 |
plt.style.use("opinionated_rc")
|
|
|
|
| 254 |
|
| 255 |
should_break = False
|
| 256 |
for page in query.paginate(per_page=200, n_max=None):
|
| 257 |
+
# Add retry mechanism for processing each page
|
| 258 |
+
max_retries = 5
|
| 259 |
+
base_wait_time = 1 # Starting wait time in seconds
|
| 260 |
+
exponent = 1.5 # Exponential factor
|
| 261 |
+
|
| 262 |
+
for retry_attempt in range(max_retries):
|
| 263 |
+
try:
|
| 264 |
+
for record in page:
|
| 265 |
+
records.append(record)
|
| 266 |
+
records_per_query += 1
|
| 267 |
+
progress(0.1 + (0.2 * len(records) / (total_query_length)),
|
| 268 |
+
desc=f"Getting data from query {i+1}/{len(urls)}...")
|
| 269 |
+
|
| 270 |
+
if reduce_sample_checkbox and sample_reduction_method == "First n samples" and records_per_query >= target_size:
|
| 271 |
+
should_break = True
|
| 272 |
+
break
|
| 273 |
+
# If we get here without an exception, break the retry loop
|
| 274 |
break
|
| 275 |
+
except Exception as e:
|
| 276 |
+
print(f"Error processing page: {e}")
|
| 277 |
+
if retry_attempt < max_retries - 1:
|
| 278 |
+
wait_time = base_wait_time * (exponent ** retry_attempt) + random.random()
|
| 279 |
+
print(f"Retrying in {wait_time:.2f} seconds (attempt {retry_attempt + 1}/{max_retries})...")
|
| 280 |
+
time.sleep(wait_time)
|
| 281 |
+
else:
|
| 282 |
+
print(f"Maximum retries reached. Continuing with next page.")
|
| 283 |
+
|
| 284 |
if should_break:
|
| 285 |
break
|
| 286 |
if should_break:
|
|
|
|
| 429 |
export_df = records_df[['title', 'abstract', 'doi', 'publication_year', 'x', 'y','id','primary_topic']]
|
| 430 |
export_df['parsed_field'] = [get_field(row) for ix, row in export_df.iterrows()]
|
| 431 |
export_df['referenced_works'] = [', '.join(x) for x in records_df['referenced_works']]
|
| 432 |
+
if locally_approximate_publication_date_checkbox:
|
| 433 |
+
export_df['approximate_publication_year'] = local_years
|
| 434 |
export_df.to_csv(csv_file_path, index=False)
|
| 435 |
|
| 436 |
if download_png_checkbox:
|