""" |
|
|
maa_jericho_scraper.py |
|
|
~~~~~~~~~~~~~~~~~~~~~~~ |
|
|
|
|
|
This script scrapes object records from the Museum of Archaeology |
|
|
and Anthropology (MAA) collection website at the University of |
|
|
Cambridge. It now also offers a Gradio-powered web interface so that |
|
|
records can be gathered and downloaded without using the command line. |
|
|
|
|
|
The scraper targets the search results for a user-provided keyword |
|
|
(default: "jericho") and collects detailed object pages containing |
|
|
fields such as Accession Number, Description, Place, Period, Source, |
|
|
Department, Reference Numbers, Cultural Affiliation, Material, Local |
|
|
Term, Measurements and a series of Events. |
|
|
|
|
|
Usage (CLI mode): |
|
|
python maa_jericho_scraper.py --mode cli --keyword jericho --max-objects 100 --output jericho_objects.csv |
|
|
|
|
|
Usage (Gradio UI): |
|
|
python maa_jericho_scraper.py |
|
|
|
|
|
Options: |
|
|
--keyword: search keyword for filtering objects (default: jericho) |
|
|
--max-objects: number of object pages to scrape (default: 100) |
|
|
--output: path to the CSV file to write (default: jericho_objects.csv) |
|
|
--start-id: starting numeric object identifier for fallback scraping |
|
|
--mode: "cli" to run headless, "gradio" (default) to launch the UI |
|
|
|
|
|
Requirements: |
|
|
- Python 3.7+ |
|
|
- requests |
|
|
- beautifulsoup4 |
|
|
- gradio (for the UI) |
|
|
|
|
|
Note: |
|
|
This script is provided for educational purposes. Always review |
|
|
and respect the terms of use of any website you scrape. Use |
|
|
responsibly and avoid overwhelming the target servers with rapid |
|
|
requests. |
|
|
""" |
|
|
|
|
|
import argparse
import concurrent.futures
import csv
import io
import os
import re
import sys
import tempfile
import threading
import time
from typing import Dict, List, Optional, Tuple

import requests
from bs4 import BeautifulSoup

try:
    import gradio as gr
except Exception:
    gr = None


BASE_URL = "https://collections.maa.cam.ac.uk"
SEARCH_PATH = "/objects/"
DEFAULT_KEYWORD = "jericho"
REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Connection": "keep-alive",
}
THREAD_LOCAL = threading.local()
LOG_LOCK = threading.Lock()
DEFAULT_MAX_WORKERS = max(4, min(16, (os.cpu_count() or 4)))
MAX_FETCH_RETRIES = 3
RETRY_BACKOFF_SECONDS = 1.0
MINIMUM_VALID_FIELDS = ("Accession No", "Description")


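# Each worker thread keeps its own requests.Session (stored on THREAD_LOCAL) so
# connection pools are never shared between threads; see get_thread_session().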
def create_session() -> requests.Session:
    """Create a requests session pre-configured with the shared headers."""
    session = requests.Session()
    session.headers.update(REQUEST_HEADERS)
    return session


def get_thread_session() -> requests.Session:
    """Return the current thread's session, creating one on first use."""
    session = getattr(THREAD_LOCAL, "session", None)
    if session is None:
        session = create_session()
        THREAD_LOCAL.session = session
    return session


def log_info(message: str) -> None:
    """Write a log line to stderr, serialised across threads."""
    with LOG_LOCK:
        sys.stderr.write(message)
        if not message.endswith("\n"):
            sys.stderr.write("\n")
        sys.stderr.flush()


def get_search_page(session: requests.Session, keyword: str, page_num: int) -> Optional[BeautifulSoup]:
    """Return a BeautifulSoup object for a given page of search results."""
    params = {"query": keyword, "page": page_num}
    try:
        resp = session.get(f"{BASE_URL}{SEARCH_PATH}", params=params, timeout=30)
        resp.raise_for_status()
    except Exception as exc:
        sys.stderr.write(f"[warning] Failed to fetch search page {page_num} for keyword '{keyword}': {exc}\n")
        return None
    return BeautifulSoup(resp.text, "html.parser")


def extract_object_links(soup: BeautifulSoup) -> List[str]:
    """Extract object page URLs from a search results page."""
    links: List[str] = []
    for a in soup.find_all("a", href=True):
        href = a.get("href") or ""
        if re.fullmatch(r"/objects/\d+/?", href):
            full_url = f"{BASE_URL}{href.rstrip('/')}/"
            if full_url not in links:
                links.append(full_url)
    return links


def parse_object_page(url: str, session: Optional[requests.Session] = None) -> Optional[Dict[str, str]]:
    """Retrieve and parse an individual object page."""
    session = session or get_thread_session()
    try:
        resp = session.get(url, timeout=30)
        resp.raise_for_status()
    except Exception as exc:
        sys.stderr.write(f"[warning] Failed to fetch object page {url}: {exc}\n")
        return None

    soup = BeautifulSoup(resp.text, "html.parser")
    result: Dict[str, str] = {
        "Accession No": "",
        "Description": "",
        "Place": "",
        "Period": "",
        "Source": "",
        "Department": "",
        "Reference Numbers": "",
        "Cultural Affiliation": "",
        "Material": "",
        "Local Term": "",
        "Measurements": "",
        "Events": "",
        "FM": "",
        "URL": url,
    }

    data_divs = soup.find_all("div", class_=lambda c: c and "flex-wrap" in c and "flex-md-nowrap" in c)
    for div in data_divs:
        label_p = div.find("p", class_=lambda c: c and "fw-bold" in c)
        if not label_p:
            continue
        label = label_p.get_text(strip=True).rstrip(":").strip()
        if label == "Events":
            events_container = div.find("div", class_=lambda c: c and "d-flex" in c and "flex-column" in c)
            if events_container:
                entries: List[str] = []
                for p_tag in events_container.find_all("p", class_=lambda c: c and c.startswith("col-")):
                    text = p_tag.get_text(separator=" ").strip()
                    text = re.sub(r"\s+", " ", text)
                    if text:
                        entries.append(text)
                result["Events"] = " || ".join(entries)
        else:
            value_p = label_p.find_next_sibling("p")
            if value_p:
                value_text = value_p.get_text(separator=" ").strip()
                value_text = re.sub(r"\s+", " ", value_text)
                value_text = re.sub(r";\s*", "; ", value_text)
                result[label] = value_text

    fm_tag = soup.find("p", class_=lambda c: c and c.startswith("fs-"))
    if fm_tag:
        result["FM"] = fm_tag.get_text(strip=True)
    return result


def is_record_valid(record: Dict[str, str]) -> bool:
    """Check whether a scraped record contains the required fields."""
    return all(record.get(field, "").strip() for field in MINIMUM_VALID_FIELDS)


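# A record counts as "complete" only when every field in MINIMUM_VALID_FIELDS is
# non-empty; fetch_object_with_retry retries with linear backoff and otherwise
# falls back to the best partial record it managed to parse.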
def fetch_object_with_retry(
    url: str,
    max_retries: int = MAX_FETCH_RETRIES,
    backoff: float = RETRY_BACKOFF_SECONDS,
) -> Optional[Dict[str, str]]:
    """Fetch an object page with retries and basic validation."""
    last_result: Optional[Dict[str, str]] = None
    last_error: Optional[str] = None
    for attempt in range(1, max_retries + 1):
        result = parse_object_page(url)
        if result and is_record_valid(result):
            result["FetchStatus"] = "complete"
            if attempt > 1:
                log_info(f"[info] Successful retry for {url} on attempt {attempt}")
            return result
        if result:
            result["FetchStatus"] = "partial"
            last_result = result
            last_error = "missing required fields"
        else:
            last_error = "request failed"
        if attempt < max_retries:
            sleep_for = backoff * attempt
            log_info(
                f"[warning] Attempt {attempt} for {url} failed ({last_error}); retrying in {sleep_for:.1f}s",
            )
            time.sleep(sleep_for)
    if last_result:
        log_info(f"[warning] Using partial data for {url} after {max_retries} attempts")
        return last_result
    log_info(f"[error] Giving up on {url} after {max_retries} attempts ({last_error})")
    return None


def scrape_objects(
    max_objects: int = 100,
    start_id: int = 431363,
    keyword: str = DEFAULT_KEYWORD,
    max_workers: int = DEFAULT_MAX_WORKERS,
) -> List[Dict[str, str]]:
    """Scrape object pages until a desired number of results is collected."""
    session = create_session()

    search_keyword = keyword.strip() or DEFAULT_KEYWORD
    object_urls: List[str] = []
    page = 1
    seen_pages = set()
    while len(object_urls) < max_objects:
        if page in seen_pages:
            break
        seen_pages.add(page)
        soup = get_search_page(session, search_keyword, page)
        if soup is None:
            break
        new_links = extract_object_links(soup)
        if not new_links:
            break
        added = 0
        for link in new_links:
            if link not in object_urls:
                object_urls.append(link)
                added += 1
            if len(object_urls) >= max_objects:
                break
        if added == 0:
            break
        page += 1
        time.sleep(0.2)

    if len(object_urls) < max_objects:
        current_id = start_id
        while len(object_urls) < max_objects:
            url = f"{BASE_URL}{SEARCH_PATH}{current_id}/"
            if url not in object_urls:
                object_urls.append(url)
            current_id += 1

    urls_to_fetch = object_urls[:max_objects]
    records: List[Dict[str, str]] = []
    log_info(f"[info] Dispatching {len(urls_to_fetch)} object requests with up to {max_workers} workers")
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(fetch_object_with_retry, url): url for url in urls_to_fetch}
        for idx, future in enumerate(concurrent.futures.as_completed(future_to_url), start=1):
            url = future_to_url[future]
            try:
                data = future.result()
            except Exception as exc:
                log_info(f"[error] Unexpected exception fetching {url}: {exc}")
                data = None
            log_info(f"[info] ({idx}/{len(urls_to_fetch)}) Fetched {url}")
            if data:
                records.append(data)
            else:
                records.append({"URL": url, "FetchStatus": "failed"})
    return records


def collect_fieldnames(records: List[Dict[str, str]]) -> List[str]:
    """Return the union of keys across all records, preserving first-seen order."""
    fieldnames: List[str] = []
    for rec in records:
        for key in rec.keys():
            if key not in fieldnames:
                fieldnames.append(key)
    return fieldnames or ["URL"]


def records_to_csv_text(records: List[Dict[str, str]]) -> Tuple[List[str], str]:
    """Serialise records to CSV text and return the fieldnames alongside it."""
    fieldnames = collect_fieldnames(records)
    buffer = io.StringIO()
    writer = csv.DictWriter(buffer, fieldnames=fieldnames)
    writer.writeheader()
    for rec in records:
        writer.writerow({key: rec.get(key, "") for key in fieldnames})
    return fieldnames, buffer.getvalue()


def prepare_table(records: List[Dict[str, str]], fieldnames: List[str]) -> List[List[str]]:
    """Convert records into a row-major table matching the given fieldnames."""
    return [[rec.get(field, "") for field in fieldnames] for rec in records]


def write_csv(records: List[Dict[str, str]], output_path: str) -> None:
    """Write scraped records to a CSV file."""
    _, csv_text = records_to_csv_text(records)
    with open(output_path, "w", newline="", encoding="utf-8") as f:
        f.write(csv_text)


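# --- Gradio interface -------------------------------------------------------
# run_scraper_interface is wired to the "Run scraper" button and returns the
# results table, the raw CSV text (kept in gr.State) and a status message.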
def run_scraper_interface(max_objects: int, start_id: int, keyword: str):
    """Validate the UI inputs, run the scraper, and return table, CSV text and status."""
    if gr is None:
        raise RuntimeError("Gradio is not installed. Install it with `pip install gradio` to use the UI mode.")

    try:
        max_int = max(1, int(max_objects))
        start_int = int(start_id)
    except Exception:
        status = "Please provide valid numeric values for max objects and start ID."
        return gr.update(value=[], headers=[]), "", status

    search_keyword = (keyword or "").strip()
    if not search_keyword:
        status = "Please enter a search keyword."
        return gr.update(value=[], headers=[]), "", status

    records = scrape_objects(max_objects=max_int, start_id=start_int, keyword=search_keyword)
    fieldnames, csv_text = records_to_csv_text(records)
    table = prepare_table(records, fieldnames)
    status = f"Scraped {len(records)} object(s) for keyword '{search_keyword}'."
    return gr.update(value=table, headers=fieldnames), csv_text, status


def prepare_csv_file(csv_text: str) -> Optional[str]:
    """Write the CSV text to a temporary file and return its path for download."""
    if not csv_text:
        return None
    tmp_file = tempfile.NamedTemporaryFile(
        delete=False,
        suffix=".csv",
        prefix="jericho_",
        mode="w",
        encoding="utf-8",
    )
    with tmp_file:
        tmp_file.write(csv_text)
    return tmp_file.name


def launch_gradio_app(
    default_max: int = 25,
    default_start: int = 431363,
    default_keyword: str = DEFAULT_KEYWORD,
) -> None:
    """Build and launch the Gradio web interface."""
    if gr is None:
        raise RuntimeError("Gradio is not installed. Install it with `pip install gradio` to use the UI mode.")

    custom_css = """
    @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@300;400;500;600&display=swap');
    .gradio-container {
        background: radial-gradient(circle at top, #f8f5ff 0%, #f5f7fb 55%, #eef1f6 100%);
        font-family: 'IBM Plex Sans', 'Segoe UI', sans-serif;
        color: #1f2937;
    }
    #header-card {
        border-radius: 16px;
        background: rgba(255, 255, 255, 0.85);
        box-shadow: 0 12px 24px rgba(79, 59, 169, 0.15);
        padding: 20px;
    }
    #status-card .gr-markdown {
        background: rgba(255, 255, 255, 0.9);
        padding: 12px 16px;
        border-radius: 12px;
        border-left: 4px solid #6c4ddb;
    }
    .launch-controls .gr-form {
        gap: 16px !important;
    }
    """

    # Pass the custom CSS to Blocks so the #header-card / #status-card rules apply.
    with gr.Blocks(title="MAA Jericho Scraper", css=custom_css) as demo:
        gr.Markdown(
            """<div id='header-card'>
            <h1 style='margin-bottom: 0.4rem; color: #4f3ba9;'>MAA Jericho Scraper</h1>
            <p style='margin: 0; color: #3b3b54;'>Scrape the Museum of Archaeology and Anthropology collection for artefacts using a keyword and export the results as CSV.</p>
            </div>""",
            elem_id="header-card",
        )

        with gr.Row(elem_classes="launch-controls"):
            max_objects_input = gr.Slider(
                minimum=1,
                maximum=10000,
                value=default_max,
                step=10,
                label="Maximum objects to scrape",
            )
            start_id_input = gr.Number(
                value=default_start,
                precision=0,
                label="Fallback starting object ID",
            )
            keyword_input = gr.Textbox(
                value=default_keyword,
                label="Search keyword",
                placeholder="Try terms such as 'Jericho', 'pottery', 'beads'...",
            )

        scrape_button = gr.Button("Run scraper", variant="primary", size="lg")
        status_markdown = gr.Markdown("Ready.", elem_id="status-card")
        results_table = gr.Dataframe(
            value=[],
            datatype="str",
            label="Scraped Records",
            interactive=False,
            wrap=True,
            row_count=(0, "dynamic"),
            col_count=(0, "dynamic"),
        )
        csv_state = gr.State("")
        download_button = gr.DownloadButton(
            label="Download CSV",
            variant="secondary",
            size="lg",
        )

        scrape_button.click(
            fn=run_scraper_interface,
            inputs=[max_objects_input, start_id_input, keyword_input],
            outputs=[results_table, csv_state, status_markdown],
        )
        # Clicking the download button writes the CSV text held in csv_state to a
        # temporary file and attaches that file to the button.
        download_button.click(fn=prepare_csv_file, inputs=csv_state, outputs=download_button)

    demo.launch()


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Scrape MAA object pages into a CSV file or launch the Gradio UI",
    )
    parser.add_argument(
        "--keyword",
        default=DEFAULT_KEYWORD,
        help="Search keyword to filter objects (default: jericho)",
    )
    parser.add_argument(
        "--max-objects",
        type=int,
        default=100,
        help="Number of object pages to scrape when running in CLI mode (default: 100)",
    )
    parser.add_argument(
        "--start-id",
        type=int,
        default=431363,
        help="Fallback starting ID for sequential scraping",
    )
    parser.add_argument(
        "--output",
        default="jericho_objects.csv",
        help="Output CSV file path when running in CLI mode",
    )
    parser.add_argument(
        "--mode",
        choices=["cli", "gradio"],
        default="gradio",
        help="Execution mode: 'cli' for command line, 'gradio' for the web UI",
    )

    args = parser.parse_args()

    if args.mode == "cli":
        records = scrape_objects(max_objects=args.max_objects, start_id=args.start_id, keyword=args.keyword)
        write_csv(records, args.output)
        print(f"Wrote {len(records)} records to {args.output}")
    else:
        launch_gradio_app(
            default_max=args.max_objects,
            default_start=args.start_id,
            default_keyword=args.keyword,
        )


if __name__ == "__main__":
    main()