#!/usr/bin/env python3
"""
maa_jericho_scraper.py
~~~~~~~~~~~~~~~~~~~~~~~

This script scrapes object records from the Museum of Archaeology
and Anthropology (MAA) collection website at the University of
Cambridge. It now also offers a Gradio-powered web interface so that
records can be gathered and downloaded without using the command line.

The scraper targets the search results for a user-provided keyword
(default: "jericho") and collects detailed object pages containing
fields such as Accession Number, Description, Place, Period, Source,
Department, Reference Numbers, Cultural Affiliation, Material, Local
Term, Measurements and a series of Events.

Usage (CLI mode):
    python maa_jericho_scraper.py --mode cli --keyword jericho --max-objects 100 --output jericho_objects.csv

Usage (Gradio UI):
    python maa_jericho_scraper.py

Options:
    --keyword: search keyword for filtering objects (default: jericho)
    --max-objects: number of object pages to scrape (default: 100)
    --output: path to the CSV file to write (default: jericho_objects.csv)
    --start-id: starting numeric object identifier for fallback scraping
    --mode: "cli" to run headless, "gradio" (default) to launch the UI

Requirements:
    - Python 3.7+
    - requests
    - beautifulsoup4
    - gradio (for the UI)

Note:
    This script is provided for educational purposes. Always review
    and respect the terms of use of any website you scrape. Use it
    responsibly and avoid overwhelming the target servers with rapid
    requests.
"""

import argparse
import concurrent.futures
import csv
import io
import os
import re
import sys
import tempfile
import threading
import time
from typing import Dict, List, Optional, Tuple

import requests
from bs4 import BeautifulSoup

try:
    import gradio as gr
except Exception:  # pragma: no cover - import guard for optional dependency
    gr = None  # type: ignore[assignment]

BASE_URL = "https://collections.maa.cam.ac.uk"
SEARCH_PATH = "/objects/"
DEFAULT_KEYWORD = "jericho"
REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Connection": "keep-alive",
}

THREAD_LOCAL = threading.local()
LOG_LOCK = threading.Lock()
DEFAULT_MAX_WORKERS = max(4, min(16, (os.cpu_count() or 4)))
MAX_FETCH_RETRIES = 3
RETRY_BACKOFF_SECONDS = 1.0
MINIMUM_VALID_FIELDS = ("Accession No", "Description")
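
# NOTE: each worker thread lazily creates its own requests.Session via
# get_thread_session() below, since a single Session is not documented as
# safe to share across threads; per-thread sessions still reuse connections.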


def create_session() -> requests.Session:
    session = requests.Session()
    session.headers.update(REQUEST_HEADERS)
    return session


def get_thread_session() -> requests.Session:
    session = getattr(THREAD_LOCAL, "session", None)
    if session is None:
        session = create_session()
        THREAD_LOCAL.session = session
    return session


def log_info(message: str) -> None:
    with LOG_LOCK:
        sys.stderr.write(message)
        if not message.endswith("\n"):
            sys.stderr.write("\n")
        sys.stderr.flush()


def get_search_page(session: requests.Session, keyword: str, page_num: int) -> Optional[BeautifulSoup]:
    """Return a BeautifulSoup object for a given page of search results."""
    params = {"query": keyword, "page": page_num}
    try:
        resp = session.get(f"{BASE_URL}{SEARCH_PATH}", params=params, timeout=30)
        resp.raise_for_status()
    except Exception as exc:  # pragma: no cover - network dependent
        sys.stderr.write(f"[warning] Failed to fetch search page {page_num} for keyword '{keyword}': {exc}\n")
        return None
    return BeautifulSoup(resp.text, "html.parser")


def extract_object_links(soup: BeautifulSoup) -> List[str]:
    """Extract object page URLs from a search results page."""
    links: List[str] = []
    for a in soup.find_all("a", href=True):
        href = a.get("href") or ""
        if re.fullmatch(r"/objects/\d+/?", href):
            full_url = f"{BASE_URL}{href.rstrip('/')}/"
            if full_url not in links:
                links.append(full_url)
    return links


def parse_object_page(url: str, session: Optional[requests.Session] = None) -> Optional[Dict[str, str]]:
    """Retrieve and parse an individual object page."""
    session = session or get_thread_session()
    try:
        resp = session.get(url, timeout=30)
        resp.raise_for_status()
    except Exception as exc:  # pragma: no cover - network dependent
        sys.stderr.write(f"[warning] Failed to fetch object page {url}: {exc}\n")
        return None
    soup = BeautifulSoup(resp.text, "html.parser")
    result: Dict[str, str] = {
        "Accession No": "",
        "Description": "",
        "Place": "",
        "Period": "",
        "Source": "",
        "Department": "",
        "Reference Numbers": "",
        "Cultural Affiliation": "",
        "Material": "",
        "Local Term": "",
        "Measurements": "",
        "Events": "",
        "FM": "",
        "URL": url,
    }
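    # The selectors below assume the MAA detail-page markup observed when this
    # scraper was written: each field sits in a flex row whose bold <p> holds
    # the label and whose sibling <p> holds the value, while "Events" is a
    # column of "col-*" paragraphs. If the site layout changes, these lookups
    # simply leave the corresponding fields empty rather than raising.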
    data_divs = soup.find_all("div", class_=lambda c: c and "flex-wrap" in c and "flex-md-nowrap" in c)
    for div in data_divs:
        label_p = div.find("p", class_=lambda c: c and "fw-bold" in c)
        if not label_p:
            continue
        label = label_p.get_text(strip=True).rstrip(":").strip()
        if label == "Events":
            events_container = div.find("div", class_=lambda c: c and "d-flex" in c and "flex-column" in c)
            if events_container:
                entries: List[str] = []
                for p_tag in events_container.find_all("p", class_=lambda c: c and c.startswith("col-")):
                    text = p_tag.get_text(separator=" ").strip()
                    text = re.sub(r"\s+", " ", text)
                    if text:
                        entries.append(text)
                result["Events"] = " || ".join(entries)
        else:
            value_p = label_p.find_next_sibling("p")
            if value_p:
                value_text = value_p.get_text(separator=" ").strip()
                value_text = re.sub(r"\s+", " ", value_text)
                value_text = re.sub(r";\s*", "; ", value_text)
                result[label] = value_text
    fm_tag = soup.find("p", class_=lambda c: c and c.startswith("fs-"))
    if fm_tag:
        result["FM"] = fm_tag.get_text(strip=True)
    return result


def is_record_valid(record: Dict[str, str]) -> bool:
    """Check whether a scraped record contains the required fields."""
    return all(record.get(field, "").strip() for field in MINIMUM_VALID_FIELDS)


def fetch_object_with_retry(
    url: str,
    max_retries: int = MAX_FETCH_RETRIES,
    backoff: float = RETRY_BACKOFF_SECONDS,
) -> Optional[Dict[str, str]]:
    """Fetch an object page with retries and basic validation."""
    last_result: Optional[Dict[str, str]] = None
    last_error: Optional[str] = None
    for attempt in range(1, max_retries + 1):
        result = parse_object_page(url)
        if result and is_record_valid(result):
            result["FetchStatus"] = "complete"
            if attempt > 1:
                log_info(f"[info] Successful retry for {url} on attempt {attempt}")
            return result
        if result:
            result["FetchStatus"] = "partial"
            last_result = result
            last_error = "missing required fields"
        else:
            last_error = "request failed"
        if attempt < max_retries:
            sleep_for = backoff * attempt
            log_info(
                f"[warning] Attempt {attempt} for {url} failed ({last_error}); retrying in {sleep_for:.1f}s",
            )
            time.sleep(sleep_for)
    if last_result:
        log_info(f"[warning] Using partial data for {url} after {max_retries} attempts")
        return last_result
    log_info(f"[error] Giving up on {url} after {max_retries} attempts ({last_error})")
    return None


def scrape_objects(
    max_objects: int = 100,
    start_id: int = 431363,
    keyword: str = DEFAULT_KEYWORD,
    max_workers: int = DEFAULT_MAX_WORKERS,
) -> List[Dict[str, str]]:
    """Scrape object pages until a desired number of results is collected."""
    session = create_session()
    search_keyword = keyword.strip() or DEFAULT_KEYWORD
    object_urls: List[str] = []
    page = 1
    seen_pages = set()
    while len(object_urls) < max_objects:
        if page in seen_pages:
            break
        seen_pages.add(page)
        soup = get_search_page(session, search_keyword, page)
        if soup is None:
            break
        new_links = extract_object_links(soup)
        if not new_links:
            break
        added = 0
        for link in new_links:
            if link not in object_urls:
                object_urls.append(link)
                added += 1
            if len(object_urls) >= max_objects:
                break
        if added == 0:
            break
        page += 1
        time.sleep(0.2)
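    # Fallback: if the keyword search yielded fewer links than requested, pad
    # the list with sequential object IDs starting at start_id. Some of these
    # guessed URLs may not exist; failed fetches are recorded below with
    # FetchStatus "failed".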
    if len(object_urls) < max_objects:
        current_id = start_id
        while len(object_urls) < max_objects:
            url = f"{BASE_URL}{SEARCH_PATH}{current_id}/"
            if url not in object_urls:
                object_urls.append(url)
            current_id += 1

    urls_to_fetch = object_urls[:max_objects]
    records: List[Dict[str, str]] = []
    log_info(f"[info] Dispatching {len(urls_to_fetch)} object requests with up to {max_workers} workers")
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(fetch_object_with_retry, url): url for url in urls_to_fetch}
        for idx, future in enumerate(concurrent.futures.as_completed(future_to_url), start=1):
            url = future_to_url[future]
            try:
                data = future.result()
            except Exception as exc:  # pragma: no cover - concurrency guard
                log_info(f"[error] Unexpected exception fetching {url}: {exc}")
                data = None
            log_info(f"[info] ({idx}/{len(urls_to_fetch)}) Fetched {url}")
            if data:
                records.append(data)
            else:
                records.append({"URL": url, "FetchStatus": "failed"})
    return records


def collect_fieldnames(records: List[Dict[str, str]]) -> List[str]:
    fieldnames: List[str] = []
    for rec in records:
        for key in rec.keys():
            if key not in fieldnames:
                fieldnames.append(key)
    return fieldnames or ["URL"]


def records_to_csv_text(records: List[Dict[str, str]]) -> Tuple[List[str], str]:
    fieldnames = collect_fieldnames(records)
    buffer = io.StringIO()
    writer = csv.DictWriter(buffer, fieldnames=fieldnames)
    writer.writeheader()
    for rec in records:
        writer.writerow({key: rec.get(key, "") for key in fieldnames})
    return fieldnames, buffer.getvalue()


def prepare_table(records: List[Dict[str, str]], fieldnames: List[str]) -> List[List[str]]:
    return [[rec.get(field, "") for field in fieldnames] for rec in records]


def write_csv(records: List[Dict[str, str]], output_path: str) -> None:
    """Write scraped records to a CSV file."""
    _, csv_text = records_to_csv_text(records)
    with open(output_path, "w", newline="", encoding="utf-8") as f:
        f.write(csv_text)


def run_scraper_interface(max_objects: int, start_id: int, keyword: str):
    if gr is None:  # pragma: no cover - runtime guard
        raise RuntimeError("Gradio is not installed. Install it with `pip install gradio` to use the UI mode.")
    try:
        max_int = max(1, int(max_objects))
        start_int = int(start_id)
    except Exception:
        status = "Please provide valid numeric values for max objects and start ID."
        return gr.update(value=[], headers=[]), "", status
    search_keyword = (keyword or "").strip()
    if not search_keyword:
        status = "Please enter a search keyword."
        return gr.update(value=[], headers=[]), "", status
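    # The three return values feed the Gradio outputs wired up in
    # launch_gradio_app: the results table, the raw CSV text held in gr.State,
    # and the status message.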
    records = scrape_objects(max_objects=max_int, start_id=start_int, keyword=search_keyword)
    fieldnames, csv_text = records_to_csv_text(records)
    table = prepare_table(records, fieldnames)
    status = f"Scraped {len(records)} object(s) for keyword '{search_keyword}'."
    return gr.update(value=table, headers=fieldnames), csv_text, status


def prepare_csv_file(csv_text: str) -> Optional[str]:
    if not csv_text:
        return None
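    # delete=False keeps the temporary file on disk so Gradio can serve it for
    # download; the file is not removed automatically afterwards.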
    tmp_file = tempfile.NamedTemporaryFile(
        delete=False,
        suffix=".csv",
        prefix="jericho_",
        mode="w",
        encoding="utf-8",
    )
    with tmp_file:
        tmp_file.write(csv_text)
    return tmp_file.name


def launch_gradio_app(
    default_max: int = 25,
    default_start: int = 431363,
    default_keyword: str = DEFAULT_KEYWORD,
) -> None:
    if gr is None:  # pragma: no cover - runtime guard
        raise RuntimeError("Gradio is not installed. Install it with `pip install gradio` to use the UI mode.")
    custom_css = """
    @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@300;400;500;600&display=swap');
    .gradio-container {
        background: radial-gradient(circle at top, #f8f5ff 0%, #f5f7fb 55%, #eef1f6 100%);
        font-family: 'IBM Plex Sans', 'Segoe UI', sans-serif;
        color: #1f2937;
    }
    #header-card {
        border-radius: 16px;
        background: rgba(255, 255, 255, 0.85);
        box-shadow: 0 12px 24px rgba(79, 59, 169, 0.15);
        padding: 20px;
    }
    #status-card .gr-markdown {
        background: rgba(255, 255, 255, 0.9);
        padding: 12px 16px;
        border-radius: 12px;
        border-left: 4px solid #6c4ddb;
    }
    .launch-controls .gr-form {
        gap: 16px !important;
    }
    """
    # with gr.Blocks(title="MAA Jericho Scraper", css=custom_css) as demo:
    with gr.Blocks(title="MAA Jericho Scraper") as demo:
        gr.Markdown(
            """<div id='header-card'>
            <h1 style='margin-bottom: 0.4rem; color: #4f3ba9;'>MAA Jericho Scraper</h1>
            <p style='margin: 0; color: #3b3b54;'>Scrape the Museum of Archaeology and Anthropology collection for artefacts using a keyword and export the results as CSV.</p>
            </div>""",
            elem_id="header-card",
        )
        with gr.Row(elem_classes="launch-controls"):
            max_objects_input = gr.Slider(
                minimum=1,
                maximum=500,
                value=default_max,
                step=1,
                label="Maximum objects to scrape",
            )
            start_id_input = gr.Number(
                value=default_start,
                precision=0,
                label="Fallback starting object ID",
            )
            keyword_input = gr.Textbox(
                value=default_keyword,
                label="Search keyword",
                placeholder="Try terms such as 'Jericho', 'pottery', 'beads'...",
            )
        scrape_button = gr.Button("Run scraper", variant="primary", size="lg")
        status_markdown = gr.Markdown("Ready.", elem_id="status-card")
        results_table = gr.Dataframe(
            value=[],
            datatype="str",
            label="Scraped Records",
            interactive=False,
            wrap=True,
            row_count=(0, "dynamic"),
            col_count=(0, "dynamic"),
        )
        csv_state = gr.State("")
        download_button = gr.DownloadButton(
            label="Download CSV",
            variant="secondary",
            size="lg",
        )
        scrape_button.click(
            fn=run_scraper_interface,
            inputs=[max_objects_input, start_id_input, keyword_input],
            outputs=[results_table, csv_state, status_markdown],
        )
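        # Clicking the download button writes the CSV text held in csv_state to
        # a temporary file and returns its path, which becomes the file served
        # by the DownloadButton.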
        download_button.click(fn=prepare_csv_file, inputs=csv_state, outputs=download_button)

    demo.launch()


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Scrape MAA object pages into a CSV file or launch the Gradio UI",
    )
    parser.add_argument(
        "--keyword",
        default=DEFAULT_KEYWORD,
        help="Search keyword to filter objects (default: jericho)",
    )
    parser.add_argument(
        "--max-objects",
        type=int,
        default=100,
        help="Number of object pages to scrape when running in CLI mode (default: 100)",
    )
    parser.add_argument(
        "--start-id",
        type=int,
        default=431363,
        help="Fallback starting ID for sequential scraping",
    )
    parser.add_argument(
        "--output",
        default="jericho_objects.csv",
        help="Output CSV file path when running in CLI mode",
    )
    parser.add_argument(
        "--mode",
        choices=["cli", "gradio"],
        default="gradio",
        help="Execution mode: 'cli' for command line, 'gradio' for the web UI",
    )
    args = parser.parse_args()
    if args.mode == "cli":
        records = scrape_objects(max_objects=args.max_objects, start_id=args.start_id, keyword=args.keyword)
        write_csv(records, args.output)
        print(f"Wrote {len(records)} records to {args.output}")
    else:
        launch_gradio_app(
            default_max=args.max_objects,
            default_start=args.start_id,
            default_keyword=args.keyword,
        )


if __name__ == "__main__":
    main()