manu committed · Commit b7aa4ec · verified · 1 Parent(s): ec684bb

Upload maa_jericho_scraper.txt

Files changed (1): maa_jericho_scraper.txt (+491, -0)
maa_jericho_scraper.txt ADDED
@@ -0,0 +1,491 @@
+#!/usr/bin/env python3
+"""
+maa_jericho_scraper.py
+~~~~~~~~~~~~~~~~~~~~~~~
+
+This script scrapes object records from the Museum of Archaeology
+and Anthropology (MAA) collection website at the University of
+Cambridge. It now also offers a Gradio-powered web interface so that
+records can be gathered and downloaded without using the command line.
+
+The scraper targets the search results for a user-provided keyword
+(default: "jericho") and collects detailed object pages containing
+fields such as Accession Number, Description, Place, Period, Source,
+Department, Reference Numbers, Cultural Affiliation, Material, Local
+Term, Measurements and a series of Events.
+
+Usage (CLI mode):
+    python maa_jericho_scraper.py --mode cli --keyword jericho --max-objects 100 --output jericho_objects.csv
+
+Usage (Gradio UI):
+    python maa_jericho_scraper.py
+
+Options:
+    --keyword: search keyword for filtering objects (default: jericho)
+    --max-objects: number of object pages to scrape (default: 100)
+    --output: path to the CSV file to write (default: jericho_objects.csv)
+    --start-id: starting numeric object identifier for fallback scraping
+    --mode: "cli" to run headless, "gradio" (default) to launch the UI
+
+Requirements:
+    - Python 3.7+
+    - requests
+    - beautifulsoup4
+    - gradio (for the UI)
+
+Note:
+    This script is provided for educational purposes. Always review
+    and respect the terms of use of any website you scrape. Use
+    responsibly and avoid overwhelming the target servers with rapid
+    requests.
+"""
+
+import argparse
+import concurrent.futures
+import csv
+import io
+import os
+import re
+import sys
+import tempfile
+import threading
+import time
+from typing import Dict, List, Optional, Tuple
+
+import requests
+from bs4 import BeautifulSoup
+
+try:
+    import gradio as gr
+except Exception:  # pragma: no cover - import guard for optional dependency
+    gr = None  # type: ignore[assignment]
+
+BASE_URL = "https://collections.maa.cam.ac.uk"
+SEARCH_PATH = "/objects/"
+DEFAULT_KEYWORD = "jericho"
+REQUEST_HEADERS = {
+    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0",
+    "Accept-Language": "en-US,en;q=0.9",
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+    "Connection": "keep-alive",
+}
+THREAD_LOCAL = threading.local()
+LOG_LOCK = threading.Lock()
+DEFAULT_MAX_WORKERS = max(4, min(16, (os.cpu_count() or 4)))
+MAX_FETCH_RETRIES = 3
+RETRY_BACKOFF_SECONDS = 1.0
+MINIMUM_VALID_FIELDS = ("Accession No", "Description")
+
+
+def create_session() -> requests.Session:
+    session = requests.Session()
+    session.headers.update(REQUEST_HEADERS)
+    return session
+
+
+def get_thread_session() -> requests.Session:
+    session = getattr(THREAD_LOCAL, "session", None)
+    if session is None:
+        session = create_session()
+        THREAD_LOCAL.session = session
+    return session
+
+
+def log_info(message: str) -> None:
+    with LOG_LOCK:
+        sys.stderr.write(message)
+        if not message.endswith("\n"):
+            sys.stderr.write("\n")
+        sys.stderr.flush()
+
+
+def get_search_page(session: requests.Session, keyword: str, page_num: int) -> Optional[BeautifulSoup]:
+    """Return a BeautifulSoup object for a given page of search results."""
+    params = {"query": keyword, "page": page_num}
+    try:
+        resp = session.get(f"{BASE_URL}{SEARCH_PATH}", params=params, timeout=30)
+        resp.raise_for_status()
+    except Exception as exc:  # pragma: no cover - network dependent
+        sys.stderr.write(f"[warning] Failed to fetch search page {page_num} for keyword '{keyword}': {exc}\n")
+        return None
+    return BeautifulSoup(resp.text, "html.parser")
+
+
+def extract_object_links(soup: BeautifulSoup) -> List[str]:
+    """Extract object page URLs from a search results page."""
+    links: List[str] = []
+    for a in soup.find_all("a", href=True):
+        href = a.get("href") or ""
+        if re.fullmatch(r"/objects/\d+/?", href):
+            full_url = f"{BASE_URL}{href.rstrip('/')}/"
+            if full_url not in links:
+                links.append(full_url)
+    return links
+
+
+def parse_object_page(url: str, session: Optional[requests.Session] = None) -> Optional[Dict[str, str]]:
+    """Retrieve and parse an individual object page."""
+    session = session or get_thread_session()
+    try:
+        resp = session.get(url, timeout=30)
+        resp.raise_for_status()
+    except Exception as exc:  # pragma: no cover - network dependent
+        sys.stderr.write(f"[warning] Failed to fetch object page {url}: {exc}\n")
+        return None
+
+    soup = BeautifulSoup(resp.text, "html.parser")
+    result: Dict[str, str] = {
+        "Accession No": "",
+        "Description": "",
+        "Place": "",
+        "Period": "",
+        "Source": "",
+        "Department": "",
+        "Reference Numbers": "",
+        "Cultural Affiliation": "",
+        "Material": "",
+        "Local Term": "",
+        "Measurements": "",
+        "Events": "",
+        "FM": "",
+        "URL": url,
+    }
+
+    data_divs = soup.find_all("div", class_=lambda c: c and "flex-wrap" in c and "flex-md-nowrap" in c)
+    for div in data_divs:
+        label_p = div.find("p", class_=lambda c: c and "fw-bold" in c)
+        if not label_p:
+            continue
+        label = label_p.get_text(strip=True).rstrip(":").strip()
+        if label == "Events":
+            events_container = div.find("div", class_=lambda c: c and "d-flex" in c and "flex-column" in c)
+            if events_container:
+                entries: List[str] = []
+                for p_tag in events_container.find_all("p", class_=lambda c: c and c.startswith("col-")):
+                    text = p_tag.get_text(separator=" ").strip()
+                    text = re.sub(r"\s+", " ", text)
+                    if text:
+                        entries.append(text)
+                result["Events"] = " || ".join(entries)
+        else:
+            value_p = label_p.find_next_sibling("p")
+            if value_p:
+                value_text = value_p.get_text(separator=" ").strip()
+                value_text = re.sub(r"\s+", " ", value_text)
+                value_text = re.sub(r";\s*", "; ", value_text)
+                result[label] = value_text
+
+    fm_tag = soup.find("p", class_=lambda c: c and c.startswith("fs-"))
+    if fm_tag:
+        result["FM"] = fm_tag.get_text(strip=True)
+    return result
+
+
+def is_record_valid(record: Dict[str, str]) -> bool:
+    """Check whether a scraped record contains the required fields."""
+    return all(record.get(field, "").strip() for field in MINIMUM_VALID_FIELDS)
+
+
+def fetch_object_with_retry(
+    url: str,
+    max_retries: int = MAX_FETCH_RETRIES,
+    backoff: float = RETRY_BACKOFF_SECONDS,
+) -> Optional[Dict[str, str]]:
+    """Fetch an object page with retries and basic validation."""
+    last_result: Optional[Dict[str, str]] = None
+    last_error: Optional[str] = None
+    for attempt in range(1, max_retries + 1):
+        result = parse_object_page(url)
+        if result and is_record_valid(result):
+            result["FetchStatus"] = "complete"
+            if attempt > 1:
+                log_info(f"[info] Successful retry for {url} on attempt {attempt}")
+            return result
+        if result:
+            result["FetchStatus"] = "partial"
+            last_result = result
+            last_error = "missing required fields"
+        else:
+            last_error = "request failed"
+        if attempt < max_retries:
+            sleep_for = backoff * attempt
+            log_info(
+                f"[warning] Attempt {attempt} for {url} failed ({last_error}); retrying in {sleep_for:.1f}s",
+            )
+            time.sleep(sleep_for)
+    if last_result:
+        log_info(f"[warning] Using partial data for {url} after {max_retries} attempts")
+        return last_result
+    log_info(f"[error] Giving up on {url} after {max_retries} attempts ({last_error})")
+    return None
+
+
+def scrape_objects(
+    max_objects: int = 100,
+    start_id: int = 431363,
+    keyword: str = DEFAULT_KEYWORD,
+    max_workers: int = DEFAULT_MAX_WORKERS,
+) -> List[Dict[str, str]]:
+    """Scrape object pages until a desired number of results is collected."""
+    session = create_session()
+
+    search_keyword = keyword.strip() or DEFAULT_KEYWORD
+    object_urls: List[str] = []
+    page = 1
+    seen_pages = set()
+    while len(object_urls) < max_objects:
+        if page in seen_pages:
+            break
+        seen_pages.add(page)
+        soup = get_search_page(session, search_keyword, page)
+        if soup is None:
+            break
+        new_links = extract_object_links(soup)
+        if not new_links:
+            break
+        added = 0
+        for link in new_links:
+            if link not in object_urls:
+                object_urls.append(link)
+                added += 1
+            if len(object_urls) >= max_objects:
+                break
+        if added == 0:
+            break
+        page += 1
+        time.sleep(0.2)
+
+    if len(object_urls) < max_objects:
+        current_id = start_id
+        while len(object_urls) < max_objects:
+            url = f"{BASE_URL}{SEARCH_PATH}{current_id}/"
+            if url not in object_urls:
+                object_urls.append(url)
+            current_id += 1
+
+    urls_to_fetch = object_urls[:max_objects]
+    records: List[Dict[str, str]] = []
+    log_info(f"[info] Dispatching {len(urls_to_fetch)} object requests with up to {max_workers} workers")
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+        future_to_url = {executor.submit(fetch_object_with_retry, url): url for url in urls_to_fetch}
+        for idx, future in enumerate(concurrent.futures.as_completed(future_to_url), start=1):
+            url = future_to_url[future]
+            try:
+                data = future.result()
+            except Exception as exc:  # pragma: no cover - concurrency guard
+                log_info(f"[error] Unexpected exception fetching {url}: {exc}")
+                data = None
+            log_info(f"[info] ({idx}/{len(urls_to_fetch)}) Fetched {url}")
+            if data:
+                records.append(data)
+            else:
+                records.append({"URL": url, "FetchStatus": "failed"})
+    return records
+
+
+def collect_fieldnames(records: List[Dict[str, str]]) -> List[str]:
+    fieldnames: List[str] = []
+    for rec in records:
+        for key in rec.keys():
+            if key not in fieldnames:
+                fieldnames.append(key)
+    return fieldnames or ["URL"]
+
+
+def records_to_csv_text(records: List[Dict[str, str]]) -> Tuple[List[str], str]:
+    fieldnames = collect_fieldnames(records)
+    buffer = io.StringIO()
+    writer = csv.DictWriter(buffer, fieldnames=fieldnames)
+    writer.writeheader()
+    for rec in records:
+        writer.writerow({key: rec.get(key, "") for key in fieldnames})
+    return fieldnames, buffer.getvalue()
+
+
+def prepare_table(records: List[Dict[str, str]], fieldnames: List[str]) -> List[List[str]]:
+    return [[rec.get(field, "") for field in fieldnames] for rec in records]
+
+
+def write_csv(records: List[Dict[str, str]], output_path: str) -> None:
+    """Write scraped records to a CSV file."""
+    _, csv_text = records_to_csv_text(records)
+    with open(output_path, "w", newline="", encoding="utf-8") as f:
+        f.write(csv_text)
+
+
+def run_scraper_interface(max_objects: int, start_id: int, keyword: str):
+    if gr is None:  # pragma: no cover - runtime guard
+        raise RuntimeError("Gradio is not installed. Install it with `pip install gradio` to use the UI mode.")
+
+    try:
+        max_int = max(1, int(max_objects))
+        start_int = int(start_id)
+    except Exception:
+        status = "Please provide valid numeric values for max objects and start ID."
+        return gr.update(value=[], headers=[]), "", status
+
+    search_keyword = (keyword or "").strip()
+    if not search_keyword:
+        status = "Please enter a search keyword."
+        return gr.update(value=[], headers=[]), "", status
+
+    records = scrape_objects(max_objects=max_int, start_id=start_int, keyword=search_keyword)
+    fieldnames, csv_text = records_to_csv_text(records)
+    table = prepare_table(records, fieldnames)
+    status = f"Scraped {len(records)} object(s) for keyword '{search_keyword}'."
+    return gr.update(value=table, headers=fieldnames), csv_text, status
+
+
+def prepare_csv_file(csv_text: str) -> Optional[str]:
+    if not csv_text:
+        return None
+    tmp_file = tempfile.NamedTemporaryFile(
+        delete=False,
+        suffix=".csv",
+        prefix="jericho_",
+        mode="w",
+        encoding="utf-8",
+    )
+    with tmp_file:
+        tmp_file.write(csv_text)
+    return tmp_file.name
+
+
+def launch_gradio_app(
+    default_max: int = 25,
+    default_start: int = 431363,
+    default_keyword: str = DEFAULT_KEYWORD,
+) -> None:
+    if gr is None:  # pragma: no cover - runtime guard
+        raise RuntimeError("Gradio is not installed. Install it with `pip install gradio` to use the UI mode.")
+
+    custom_css = """
+    @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@300;400;500;600&display=swap');
+    .gradio-container {
+        background: radial-gradient(circle at top, #f8f5ff 0%, #f5f7fb 55%, #eef1f6 100%);
+        font-family: 'IBM Plex Sans', 'Segoe UI', sans-serif;
+        color: #1f2937;
+    }
+    #header-card {
+        border-radius: 16px;
+        background: rgba(255, 255, 255, 0.85);
+        box-shadow: 0 12px 24px rgba(79, 59, 169, 0.15);
+        padding: 20px;
+    }
+    #status-card .gr-markdown {
+        background: rgba(255, 255, 255, 0.9);
+        padding: 12px 16px;
+        border-radius: 12px;
+        border-left: 4px solid #6c4ddb;
+    }
+    .launch-controls .gr-form {
+        gap: 16px !important;
+    }
+    """
+
+    # with gr.Blocks(title="MAA Jericho Scraper", css=custom_css) as demo:
+    with gr.Blocks(title="MAA Jericho Scraper") as demo:
+        gr.Markdown(
+            """<div id='header-card'>
+            <h1 style='margin-bottom: 0.4rem; color: #4f3ba9;'>MAA Jericho Scraper</h1>
+            <p style='margin: 0; color: #3b3b54;'>Scrape the Museum of Archaeology and Anthropology collection for artefacts using a keyword and export the results as CSV.</p>
+            </div>""",
+            elem_id="header-card",
+        )
+
+        with gr.Row(elem_classes="launch-controls"):
+            max_objects_input = gr.Slider(
+                minimum=1,
+                maximum=500,
+                value=default_max,
+                step=1,
+                label="Maximum objects to scrape",
+            )
+            start_id_input = gr.Number(
+                value=default_start,
+                precision=0,
+                label="Fallback starting object ID",
+            )
+            keyword_input = gr.Textbox(
+                value=default_keyword,
+                label="Search keyword",
+                placeholder="Try terms such as 'Jericho', 'pottery', 'beads'...",
+            )
+
+        scrape_button = gr.Button("Run scraper", variant="primary", size="lg")
+        status_markdown = gr.Markdown("Ready.", elem_id="status-card")
+        results_table = gr.Dataframe(
+            value=[],
+            datatype="str",
+            label="Scraped Records",
+            interactive=False,
+            wrap=True,
+            row_count=(0, "dynamic"),
+            col_count=(0, "dynamic"),
+        )
+        csv_state = gr.State("")
+        download_button = gr.DownloadButton(
+            label="Download CSV",
+            variant="secondary",
+            size="lg",
+        )
+
+        scrape_button.click(
+            fn=run_scraper_interface,
+            inputs=[max_objects_input, start_id_input, keyword_input],
+            outputs=[results_table, csv_state, status_markdown],
+        )
+        download_button.click(fn=prepare_csv_file, inputs=csv_state, outputs=download_button)
+
+    demo.launch()
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Scrape MAA object pages into a CSV file or launch the Gradio UI",
+    )
+    parser.add_argument(
+        "--keyword",
+        default=DEFAULT_KEYWORD,
+        help="Search keyword to filter objects (default: jericho)",
+    )
+    parser.add_argument(
+        "--max-objects",
+        type=int,
+        default=100,
+        help="Number of object pages to scrape when running in CLI mode (default: 100)",
+    )
+    parser.add_argument(
+        "--start-id",
+        type=int,
+        default=431363,
+        help="Fallback starting ID for sequential scraping",
+    )
+    parser.add_argument(
+        "--output",
+        default="jericho_objects.csv",
+        help="Output CSV file path when running in CLI mode",
+    )
+    parser.add_argument(
+        "--mode",
+        choices=["cli", "gradio"],
+        default="gradio",
+        help="Execution mode: 'cli' for command line, 'gradio' for the web UI",
+    )
+
+    args = parser.parse_args()
+
+    if args.mode == "cli":
+        records = scrape_objects(max_objects=args.max_objects, start_id=args.start_id, keyword=args.keyword)
+        write_csv(records, args.output)
+        print(f"Wrote {len(records)} records to {args.output}")
+    else:
+        launch_gradio_app(
+            default_max=args.max_objects,
+            default_start=args.start_id,
+            default_keyword=args.keyword,
+        )
+
+
+if __name__ == "__main__":
+    main()
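
For reference, the CLI flow above can also be driven programmatically. A minimal sketch, assuming the uploaded file is saved as maa_jericho_scraper.py (the module name used in its docstring) so it can be imported; the output filename here is only an example:

    # Hypothetical usage sketch: import the scraper and write a small CSV.
    # Assumes the file above has been saved as maa_jericho_scraper.py on the import path.
    from maa_jericho_scraper import scrape_objects, write_csv

    records = scrape_objects(max_objects=25, keyword="jericho")
    write_csv(records, "jericho_objects.csv")
    print(f"Collected {len(records)} record(s)")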