manu committed · Commit b7aa4ec · verified · 1 Parent(s): ec684bb

Upload maa_jericho_scraper.txt

Files changed (1): maa_jericho_scraper.txt (+491, -0)
maa_jericho_scraper.txt ADDED
@@ -0,0 +1,491 @@
+#!/usr/bin/env python3
+"""
+maa_jericho_scraper.py
+~~~~~~~~~~~~~~~~~~~~~~~
+
+This script scrapes object records from the Museum of Archaeology
+and Anthropology (MAA) collection website at the University of
+Cambridge. It now also offers a Gradio-powered web interface so that
+records can be gathered and downloaded without using the command line.
+
+The scraper targets the search results for a user-provided keyword
+(default: "jericho") and collects detailed object pages containing
+fields such as Accession Number, Description, Place, Period, Source,
+Department, Reference Numbers, Cultural Affiliation, Material, Local
+Term, Measurements and a series of Events.
+
+Usage (CLI mode):
+    python maa_jericho_scraper.py --mode cli --keyword jericho --max-objects 100 --output jericho_objects.csv
+
+Usage (Gradio UI):
+    python maa_jericho_scraper.py
+
+Options:
+    --keyword: search keyword for filtering objects (default: jericho)
+    --max-objects: number of object pages to scrape (default: 100)
+    --output: path to the CSV file to write (default: jericho_objects.csv)
+    --start-id: starting numeric object identifier for fallback scraping
+    --mode: "cli" to run headless, "gradio" (default) to launch the UI
+
+Requirements:
+    - Python 3.7+
+    - requests
+    - beautifulsoup4
+    - gradio (for the UI)
+
+Note:
+    This script is provided for educational purposes. Always review
+    and respect the terms of use of any website you scrape. Use
+    responsibly and avoid overwhelming the target servers with rapid
+    requests.
+"""
+
+import argparse
+import concurrent.futures
+import csv
+import io
+import os
+import re
+import sys
+import tempfile
+import threading
+import time
+from typing import Dict, List, Optional, Tuple
+
+import requests
+from bs4 import BeautifulSoup
+
+try:
+    import gradio as gr
+except Exception:  # pragma: no cover - import guard for optional dependency
+    gr = None  # type: ignore[assignment]
+
+BASE_URL = "https://collections.maa.cam.ac.uk"
+SEARCH_PATH = "/objects/"
+DEFAULT_KEYWORD = "jericho"
+REQUEST_HEADERS = {
+    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0",
+    "Accept-Language": "en-US,en;q=0.9",
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+    "Connection": "keep-alive",
+}
+THREAD_LOCAL = threading.local()
+LOG_LOCK = threading.Lock()
+DEFAULT_MAX_WORKERS = max(4, min(16, (os.cpu_count() or 4)))
+MAX_FETCH_RETRIES = 3
+RETRY_BACKOFF_SECONDS = 1.0
+MINIMUM_VALID_FIELDS = ("Accession No", "Description")
+
+
+def create_session() -> requests.Session:
+    session = requests.Session()
+    session.headers.update(REQUEST_HEADERS)
+    return session
+
+
+def get_thread_session() -> requests.Session:
+    session = getattr(THREAD_LOCAL, "session", None)
+    if session is None:
+        session = create_session()
+        THREAD_LOCAL.session = session
+    return session
+
+
+def log_info(message: str) -> None:
+    with LOG_LOCK:
+        sys.stderr.write(message)
+        if not message.endswith("\n"):
+            sys.stderr.write("\n")
+        sys.stderr.flush()
+
+
+def get_search_page(session: requests.Session, keyword: str, page_num: int) -> Optional[BeautifulSoup]:
+    """Return a BeautifulSoup object for a given page of search results."""
+    params = {"query": keyword, "page": page_num}
+    try:
+        resp = session.get(f"{BASE_URL}{SEARCH_PATH}", params=params, timeout=30)
+        resp.raise_for_status()
+    except Exception as exc:  # pragma: no cover - network dependent
+        sys.stderr.write(f"[warning] Failed to fetch search page {page_num} for keyword '{keyword}': {exc}\n")
+        return None
+    return BeautifulSoup(resp.text, "html.parser")
+
+
+def extract_object_links(soup: BeautifulSoup) -> List[str]:
+    """Extract object page URLs from a search results page."""
+    links: List[str] = []
+    for a in soup.find_all("a", href=True):
+        href = a.get("href") or ""
+        if re.fullmatch(r"/objects/\d+/?", href):
+            full_url = f"{BASE_URL}{href.rstrip('/')}/"
+            if full_url not in links:
+                links.append(full_url)
+    return links
+
+
+def parse_object_page(url: str, session: Optional[requests.Session] = None) -> Optional[Dict[str, str]]:
+    """Retrieve and parse an individual object page."""
+    session = session or get_thread_session()
+    try:
+        resp = session.get(url, timeout=30)
+        resp.raise_for_status()
+    except Exception as exc:  # pragma: no cover - network dependent
+        sys.stderr.write(f"[warning] Failed to fetch object page {url}: {exc}\n")
+        return None
+
+    soup = BeautifulSoup(resp.text, "html.parser")
+    result: Dict[str, str] = {
+        "Accession No": "",
+        "Description": "",
+        "Place": "",
+        "Period": "",
+        "Source": "",
+        "Department": "",
+        "Reference Numbers": "",
+        "Cultural Affiliation": "",
+        "Material": "",
+        "Local Term": "",
+        "Measurements": "",
+        "Events": "",
+        "FM": "",
+        "URL": url,
+    }
+
+    data_divs = soup.find_all("div", class_=lambda c: c and "flex-wrap" in c and "flex-md-nowrap" in c)
+    for div in data_divs:
+        label_p = div.find("p", class_=lambda c: c and "fw-bold" in c)
+        if not label_p:
+            continue
+        label = label_p.get_text(strip=True).rstrip(":").strip()
+        if label == "Events":
+            events_container = div.find("div", class_=lambda c: c and "d-flex" in c and "flex-column" in c)
+            if events_container:
+                entries: List[str] = []
+                for p_tag in events_container.find_all("p", class_=lambda c: c and c.startswith("col-")):
+                    text = p_tag.get_text(separator=" ").strip()
+                    text = re.sub(r"\s+", " ", text)
+                    if text:
+                        entries.append(text)
+                result["Events"] = " || ".join(entries)
+        else:
+            value_p = label_p.find_next_sibling("p")
+            if value_p:
+                value_text = value_p.get_text(separator=" ").strip()
+                value_text = re.sub(r"\s+", " ", value_text)
+                value_text = re.sub(r";\s*", "; ", value_text)
+                result[label] = value_text
+
+    fm_tag = soup.find("p", class_=lambda c: c and c.startswith("fs-"))
+    if fm_tag:
+        result["FM"] = fm_tag.get_text(strip=True)
+    return result
+
+
+def is_record_valid(record: Dict[str, str]) -> bool:
+    """Check whether a scraped record contains the required fields."""
+    return all(record.get(field, "").strip() for field in MINIMUM_VALID_FIELDS)
+
+
+def fetch_object_with_retry(
+    url: str,
+    max_retries: int = MAX_FETCH_RETRIES,
+    backoff: float = RETRY_BACKOFF_SECONDS,
+) -> Optional[Dict[str, str]]:
+    """Fetch an object page with retries and basic validation."""
+    last_result: Optional[Dict[str, str]] = None
+    last_error: Optional[str] = None
+    for attempt in range(1, max_retries + 1):
+        result = parse_object_page(url)
+        if result and is_record_valid(result):
+            result["FetchStatus"] = "complete"
+            if attempt > 1:
+                log_info(f"[info] Successful retry for {url} on attempt {attempt}")
+            return result
+        if result:
+            result["FetchStatus"] = "partial"
+            last_result = result
+            last_error = "missing required fields"
+        else:
+            last_error = "request failed"
+        if attempt < max_retries:
+            sleep_for = backoff * attempt
+            log_info(
+                f"[warning] Attempt {attempt} for {url} failed ({last_error}); retrying in {sleep_for:.1f}s",
+            )
+            time.sleep(sleep_for)
+    if last_result:
+        log_info(f"[warning] Using partial data for {url} after {max_retries} attempts")
+        return last_result
+    log_info(f"[error] Giving up on {url} after {max_retries} attempts ({last_error})")
+    return None
+
+
+def scrape_objects(
+    max_objects: int = 100,
+    start_id: int = 431363,
+    keyword: str = DEFAULT_KEYWORD,
+    max_workers: int = DEFAULT_MAX_WORKERS,
+) -> List[Dict[str, str]]:
+    """Scrape object pages until a desired number of results is collected."""
+    session = create_session()
+
+    search_keyword = keyword.strip() or DEFAULT_KEYWORD
+    object_urls: List[str] = []
+    page = 1
+    seen_pages = set()
+    while len(object_urls) < max_objects:
+        if page in seen_pages:
+            break
+        seen_pages.add(page)
+        soup = get_search_page(session, search_keyword, page)
+        if soup is None:
+            break
+        new_links = extract_object_links(soup)
+        if not new_links:
+            break
+        added = 0
+        for link in new_links:
+            if link not in object_urls:
+                object_urls.append(link)
+                added += 1
+            if len(object_urls) >= max_objects:
+                break
+        if added == 0:
+            break
+        page += 1
+        time.sleep(0.2)
+
+    if len(object_urls) < max_objects:
+        current_id = start_id
+        while len(object_urls) < max_objects:
+            url = f"{BASE_URL}{SEARCH_PATH}{current_id}/"
+            if url not in object_urls:
+                object_urls.append(url)
+            current_id += 1
+
+    urls_to_fetch = object_urls[:max_objects]
+    records: List[Dict[str, str]] = []
+    log_info(f"[info] Dispatching {len(urls_to_fetch)} object requests with up to {max_workers} workers")
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+        future_to_url = {executor.submit(fetch_object_with_retry, url): url for url in urls_to_fetch}
+        for idx, future in enumerate(concurrent.futures.as_completed(future_to_url), start=1):
+            url = future_to_url[future]
+            try:
+                data = future.result()
+            except Exception as exc:  # pragma: no cover - concurrency guard
+                log_info(f"[error] Unexpected exception fetching {url}: {exc}")
+                data = None
+            log_info(f"[info] ({idx}/{len(urls_to_fetch)}) Fetched {url}")
+            if data:
+                records.append(data)
+            else:
+                records.append({"URL": url, "FetchStatus": "failed"})
+    return records
+
+
+def collect_fieldnames(records: List[Dict[str, str]]) -> List[str]:
+    fieldnames: List[str] = []
+    for rec in records:
+        for key in rec.keys():
+            if key not in fieldnames:
+                fieldnames.append(key)
+    return fieldnames or ["URL"]
+
+
+def records_to_csv_text(records: List[Dict[str, str]]) -> Tuple[List[str], str]:
+    fieldnames = collect_fieldnames(records)
+    buffer = io.StringIO()
+    writer = csv.DictWriter(buffer, fieldnames=fieldnames)
+    writer.writeheader()
+    for rec in records:
+        writer.writerow({key: rec.get(key, "") for key in fieldnames})
+    return fieldnames, buffer.getvalue()
+
+
+def prepare_table(records: List[Dict[str, str]], fieldnames: List[str]) -> List[List[str]]:
+    return [[rec.get(field, "") for field in fieldnames] for rec in records]
+
+
+def write_csv(records: List[Dict[str, str]], output_path: str) -> None:
+    """Write scraped records to a CSV file."""
+    _, csv_text = records_to_csv_text(records)
+    with open(output_path, "w", newline="", encoding="utf-8") as f:
+        f.write(csv_text)
+
+
+def run_scraper_interface(max_objects: int, start_id: int, keyword: str):
+    if gr is None:  # pragma: no cover - runtime guard
+        raise RuntimeError("Gradio is not installed. Install it with `pip install gradio` to use the UI mode.")
+
+    try:
+        max_int = max(1, int(max_objects))
+        start_int = int(start_id)
+    except Exception:
+        status = "Please provide valid numeric values for max objects and start ID."
+        return gr.update(value=[], headers=[]), "", status
+
+    search_keyword = (keyword or "").strip()
+    if not search_keyword:
+        status = "Please enter a search keyword."
+        return gr.update(value=[], headers=[]), "", status
+
+    records = scrape_objects(max_objects=max_int, start_id=start_int, keyword=search_keyword)
+    fieldnames, csv_text = records_to_csv_text(records)
+    table = prepare_table(records, fieldnames)
+    status = f"Scraped {len(records)} object(s) for keyword '{search_keyword}'."
+    return gr.update(value=table, headers=fieldnames), csv_text, status
+
+
+def prepare_csv_file(csv_text: str) -> Optional[str]:
+    if not csv_text:
+        return None
+    tmp_file = tempfile.NamedTemporaryFile(
+        delete=False,
+        suffix=".csv",
+        prefix="jericho_",
+        mode="w",
+        encoding="utf-8",
+    )
+    with tmp_file:
+        tmp_file.write(csv_text)
+    return tmp_file.name
+
+
+def launch_gradio_app(
+    default_max: int = 25,
+    default_start: int = 431363,
+    default_keyword: str = DEFAULT_KEYWORD,
+) -> None:
+    if gr is None:  # pragma: no cover - runtime guard
+        raise RuntimeError("Gradio is not installed. Install it with `pip install gradio` to use the UI mode.")
+
+    custom_css = """
+    @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@300;400;500;600&display=swap');
+    .gradio-container {
+        background: radial-gradient(circle at top, #f8f5ff 0%, #f5f7fb 55%, #eef1f6 100%);
+        font-family: 'IBM Plex Sans', 'Segoe UI', sans-serif;
+        color: #1f2937;
+    }
+    #header-card {
+        border-radius: 16px;
+        background: rgba(255, 255, 255, 0.85);
+        box-shadow: 0 12px 24px rgba(79, 59, 169, 0.15);
+        padding: 20px;
+    }
+    #status-card .gr-markdown {
+        background: rgba(255, 255, 255, 0.9);
+        padding: 12px 16px;
+        border-radius: 12px;
+        border-left: 4px solid #6c4ddb;
+    }
+    .launch-controls .gr-form {
+        gap: 16px !important;
+    }
+    """
+
+    # with gr.Blocks(title="MAA Jericho Scraper", css=custom_css) as demo:
+    with gr.Blocks(title="MAA Jericho Scraper") as demo:
+        gr.Markdown(
+            """<div id='header-card'>
+            <h1 style='margin-bottom: 0.4rem; color: #4f3ba9;'>MAA Jericho Scraper</h1>
+            <p style='margin: 0; color: #3b3b54;'>Scrape the Museum of Archaeology and Anthropology collection for artefacts using a keyword and export the results as CSV.</p>
+            </div>""",
+            elem_id="header-card",
+        )
+
+        with gr.Row(elem_classes="launch-controls"):
+            max_objects_input = gr.Slider(
+                minimum=1,
+                maximum=500,
+                value=default_max,
+                step=1,
+                label="Maximum objects to scrape",
+            )
+            start_id_input = gr.Number(
+                value=default_start,
+                precision=0,
+                label="Fallback starting object ID",
+            )
+            keyword_input = gr.Textbox(
+                value=default_keyword,
+                label="Search keyword",
+                placeholder="Try terms such as 'Jericho', 'pottery', 'beads'...",
+            )
+
+        scrape_button = gr.Button("Run scraper", variant="primary", size="lg")
+        status_markdown = gr.Markdown("Ready.", elem_id="status-card")
+        results_table = gr.Dataframe(
+            value=[],
+            datatype="str",
+            label="Scraped Records",
+            interactive=False,
+            wrap=True,
+            row_count=(0, "dynamic"),
+            col_count=(0, "dynamic"),
+        )
+        csv_state = gr.State("")
+        download_button = gr.DownloadButton(
+            label="Download CSV",
+            variant="secondary",
+            size="lg",
+        )
+
+        scrape_button.click(
+            fn=run_scraper_interface,
+            inputs=[max_objects_input, start_id_input, keyword_input],
+            outputs=[results_table, csv_state, status_markdown],
+        )
+        download_button.click(fn=prepare_csv_file, inputs=csv_state, outputs=download_button)
+
+    demo.launch()
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Scrape MAA object pages into a CSV file or launch the Gradio UI",
+    )
+    parser.add_argument(
+        "--keyword",
+        default=DEFAULT_KEYWORD,
+        help="Search keyword to filter objects (default: jericho)",
+    )
+    parser.add_argument(
+        "--max-objects",
+        type=int,
+        default=100,
+        help="Number of object pages to scrape when running in CLI mode (default: 100)",
+    )
+    parser.add_argument(
+        "--start-id",
+        type=int,
+        default=431363,
+        help="Fallback starting ID for sequential scraping",
+    )
+    parser.add_argument(
+        "--output",
+        default="jericho_objects.csv",
+        help="Output CSV file path when running in CLI mode",
+    )
+    parser.add_argument(
+        "--mode",
+        choices=["cli", "gradio"],
+        default="gradio",
+        help="Execution mode: 'cli' for command line, 'gradio' for the web UI",
+    )
+
+    args = parser.parse_args()
+
+    if args.mode == "cli":
+        records = scrape_objects(max_objects=args.max_objects, start_id=args.start_id, keyword=args.keyword)
+        write_csv(records, args.output)
+        print(f"Wrote {len(records)} records to {args.output}")
+    else:
+        launch_gradio_app(
+            default_max=args.max_objects,
+            default_start=args.start_id,
+            default_keyword=args.keyword,
+        )
+
+
+if __name__ == "__main__":
+    main()
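
For reference, the CLI flow above can also be driven programmatically. A minimal sketch, assuming the uploaded file is saved as maa_jericho_scraper.py (the module name used in its docstring) so it can be imported; the output filename here is only an example:

    # Hypothetical usage sketch: import the scraper and write a small CSV.
    # Assumes the file above has been saved as maa_jericho_scraper.py on the import path.
    from maa_jericho_scraper import scrape_objects, write_csv

    records = scrape_objects(max_objects=25, keyword="jericho")
    write_csv(records, "jericho_objects.csv")
    print(f"Collected {len(records)} record(s)")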