manu committed
Commit f5be1b0 · verified · 1 Parent(s): b7aa4ec

Delete app.py

Files changed (1)
  1. app.py +0 -401
app.py DELETED
@@ -1,401 +0,0 @@
#!/usr/bin/env python3
"""
maa_jericho_scraper.py
~~~~~~~~~~~~~~~~~~~~~~

This script scrapes object records from the Museum of Archaeology
and Anthropology (MAA) collection website at the University of
Cambridge. It also offers a Gradio-powered web interface so that
records can be gathered and downloaded without using the command line.

The scraper targets the search results for a user-provided keyword
(default: "jericho") and collects detailed object pages containing
fields such as Accession Number, Description, Place, Period, Source,
Department, Reference Numbers, Cultural Affiliation, Material, Local
Term, Measurements and a series of Events.

Usage (CLI mode):
    python app.py --mode cli --keyword jericho --max-objects 100 --output jericho_objects.csv

Usage (Gradio UI):
    python app.py

Options:
    --keyword: search keyword for filtering objects (default: jericho)
    --max-objects: number of object pages to scrape (default: 100)
    --output: path to the CSV file to write (default: jericho_objects.csv)
    --start-id: starting numeric object identifier for fallback scraping
    --mode: "cli" to run headless, "gradio" (default) to launch the UI

Requirements:
    - Python 3.7+
    - requests
    - beautifulsoup4
    - gradio (for the UI)

Note:
    This script is provided for educational purposes. Always review
    and respect the terms of use of any website you scrape. Use
    responsibly and avoid overwhelming the target servers with rapid
    requests.
"""

import argparse
import csv
import io
import re
import sys
import tempfile
import time
from typing import Dict, List, Optional, Tuple

import requests
from bs4 import BeautifulSoup

try:
    import gradio as gr
except Exception:  # pragma: no cover - import guard for optional dependency
    gr = None  # type: ignore[assignment]

BASE_URL = "https://collections.maa.cam.ac.uk"
SEARCH_PATH = "/objects/"
DEFAULT_KEYWORD = "jericho"


def get_search_page(session: requests.Session, keyword: str, page_num: int) -> Optional[BeautifulSoup]:
    """Return a BeautifulSoup object for a given page of search results."""
    params = {"query": keyword, "page": page_num}
    try:
        resp = session.get(f"{BASE_URL}{SEARCH_PATH}", params=params, timeout=30)
        resp.raise_for_status()
    except Exception as exc:  # pragma: no cover - network dependent
        sys.stderr.write(f"[warning] Failed to fetch search page {page_num} for keyword '{keyword}': {exc}\n")
        return None
    return BeautifulSoup(resp.text, "html.parser")


def extract_object_links(soup: BeautifulSoup) -> List[str]:
    """Extract object page URLs from a search results page."""
    links: List[str] = []
    for a in soup.find_all("a", href=True):
        href = a.get("href") or ""
        if re.fullmatch(r"/objects/\d+/?", href):
            full_url = f"{BASE_URL}{href.rstrip('/')}/"
            if full_url not in links:
                links.append(full_url)
    return links


def parse_object_page(session: requests.Session, url: str) -> Optional[Dict[str, str]]:
    """Retrieve and parse an individual object page."""
    try:
        resp = session.get(url, timeout=30)
        resp.raise_for_status()
    except Exception as exc:  # pragma: no cover - network dependent
        sys.stderr.write(f"[warning] Failed to fetch object page {url}: {exc}\n")
        return None

    soup = BeautifulSoup(resp.text, "html.parser")
    result: Dict[str, str] = {
        "Accession No": "",
        "Description": "",
        "Place": "",
        "Period": "",
        "Source": "",
        "Department": "",
        "Reference Numbers": "",
        "Cultural Affiliation": "",
        "Material": "",
        "Local Term": "",
        "Measurements": "",
        "Events": "",
        "FM": "",
        "URL": url,
    }

    data_divs = soup.find_all("div", class_=lambda c: c and "flex-wrap" in c and "flex-md-nowrap" in c)
    for div in data_divs:
        label_p = div.find("p", class_=lambda c: c and "fw-bold" in c)
        if not label_p:
            continue
        label = label_p.get_text(strip=True).rstrip(":").strip()
        if label == "Events":
            events_container = div.find("div", class_=lambda c: c and "d-flex" in c and "flex-column" in c)
            if events_container:
                entries: List[str] = []
                for p_tag in events_container.find_all("p", class_=lambda c: c and c.startswith("col-")):
                    text = p_tag.get_text(separator=" ").strip()
                    text = re.sub(r"\s+", " ", text)
                    if text:
                        entries.append(text)
                result["Events"] = " || ".join(entries)
        else:
            value_p = label_p.find_next_sibling("p")
            if value_p:
                value_text = value_p.get_text(separator=" ").strip()
                value_text = re.sub(r"\s+", " ", value_text)
                value_text = re.sub(r";\s*", "; ", value_text)
                result[label] = value_text

    fm_tag = soup.find("p", class_=lambda c: c and c.startswith("fs-"))
    if fm_tag:
        result["FM"] = fm_tag.get_text(strip=True)
    return result


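# Note: scrape_objects() below works in two phases. It first walks the keyword
# search pages (get_search_page + extract_object_links) and, if fewer than
# max_objects URLs are found, pads the list with sequential /objects/<id>/ URLs
# starting at start_id before fetching each page with parse_object_page.
# The 0.5 s sleeps keep the request rate polite.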
def scrape_objects(max_objects: int = 100, start_id: int = 431363, keyword: str = DEFAULT_KEYWORD) -> List[Dict[str, str]]:
    """Scrape object pages until a desired number of results is collected."""
    session = requests.Session()
    session.headers.update(
        {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Connection": "keep-alive",
        }
    )

    search_keyword = keyword.strip() or DEFAULT_KEYWORD
    object_urls: List[str] = []
    page = 1
    while len(object_urls) < max_objects:
        soup = get_search_page(session, search_keyword, page)
        if soup is None:
            break
        new_links = extract_object_links(soup)
        if not new_links:
            break
        for link in new_links:
            if link not in object_urls:
                object_urls.append(link)
            if len(object_urls) >= max_objects:
                break
        page += 1
        time.sleep(0.5)

    if len(object_urls) < max_objects:
        current_id = start_id
        while len(object_urls) < max_objects:
            url = f"{BASE_URL}{SEARCH_PATH}{current_id}/"
            if url not in object_urls:
                object_urls.append(url)
            current_id += 1

    results: List[Dict[str, str]] = []
    for idx, url in enumerate(object_urls[:max_objects], start=1):
        sys.stderr.write(f"[info] ({idx}/{max_objects}) Scraping {url}\n")
        data = parse_object_page(session, url)
        if data:
            results.append(data)
        else:
            results.append({"URL": url})
        time.sleep(0.5)
    return results


def collect_fieldnames(records: List[Dict[str, str]]) -> List[str]:
    fieldnames: List[str] = []
    for rec in records:
        for key in rec.keys():
            if key not in fieldnames:
                fieldnames.append(key)
    return fieldnames or ["URL"]


def records_to_csv_text(records: List[Dict[str, str]]) -> Tuple[List[str], str]:
    fieldnames = collect_fieldnames(records)
    buffer = io.StringIO()
    writer = csv.DictWriter(buffer, fieldnames=fieldnames)
    writer.writeheader()
    for rec in records:
        writer.writerow({key: rec.get(key, "") for key in fieldnames})
    return fieldnames, buffer.getvalue()


def prepare_table(records: List[Dict[str, str]], fieldnames: List[str]) -> List[List[str]]:
    return [[rec.get(field, "") for field in fieldnames] for rec in records]


def write_csv(records: List[Dict[str, str]], output_path: str) -> None:
    """Write scraped records to a CSV file."""
    _, csv_text = records_to_csv_text(records)
    with open(output_path, "w", newline="", encoding="utf-8") as f:
        f.write(csv_text)


def run_scraper_interface(max_objects: int, start_id: int, keyword: str):
    if gr is None:  # pragma: no cover - runtime guard
        raise RuntimeError("Gradio is not installed. Install it with `pip install gradio` to use the UI mode.")

    try:
        max_int = max(1, int(max_objects))
        start_int = int(start_id)
    except Exception:
        status = "Please provide valid numeric values for max objects and start ID."
        return gr.update(value=[], headers=[]), "", status

    search_keyword = (keyword or "").strip()
    if not search_keyword:
        status = "Please enter a search keyword."
        return gr.update(value=[], headers=[]), "", status

    records = scrape_objects(max_objects=max_int, start_id=start_int, keyword=search_keyword)
    fieldnames, csv_text = records_to_csv_text(records)
    table = prepare_table(records, fieldnames)
    status = f"Scraped {len(records)} object(s) for keyword '{search_keyword}'."
    return gr.update(value=table, headers=fieldnames), csv_text, status


def prepare_csv_file(csv_text: str) -> Optional[str]:
    if not csv_text:
        return None
    tmp_file = tempfile.NamedTemporaryFile(
        delete=False,
        suffix=".csv",
        prefix="jericho_",
        mode="w",
        encoding="utf-8",
    )
    with tmp_file:
        tmp_file.write(csv_text)
    return tmp_file.name


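# Note: in launch_gradio_app() below, the Slider/Number/Textbox inputs feed
# run_scraper_interface(), whose outputs are the results Dataframe, the CSV text
# held in a gr.State, and the status Markdown; the DownloadButton's click handler
# writes that CSV text to a temporary file via prepare_csv_file(). The custom_css
# string is currently unused because the gr.Blocks(css=custom_css) call is
# commented out.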
def launch_gradio_app(
    default_max: int = 25,
    default_start: int = 431363,
    default_keyword: str = DEFAULT_KEYWORD,
) -> None:
    if gr is None:  # pragma: no cover - runtime guard
        raise RuntimeError("Gradio is not installed. Install it with `pip install gradio` to use the UI mode.")

    custom_css = """
    @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@300;400;500;600&display=swap');
    .gradio-container {
        background: radial-gradient(circle at top, #f8f5ff 0%, #f5f7fb 55%, #eef1f6 100%);
        font-family: 'IBM Plex Sans', 'Segoe UI', sans-serif;
        color: #1f2937;
    }
    #header-card {
        border-radius: 16px;
        background: rgba(255, 255, 255, 0.85);
        box-shadow: 0 12px 24px rgba(79, 59, 169, 0.15);
        padding: 20px;
    }
    #status-card .gr-markdown {
        background: rgba(255, 255, 255, 0.9);
        padding: 12px 16px;
        border-radius: 12px;
        border-left: 4px solid #6c4ddb;
    }
    .launch-controls .gr-form {
        gap: 16px !important;
    }
    """

    # with gr.Blocks(title="MAA Jericho Scraper", css=custom_css) as demo:
    with gr.Blocks(title="MAA Jericho Scraper") as demo:
        gr.Markdown(
            """<div id='header-card'>
            <h1 style='margin-bottom: 0.4rem; color: #4f3ba9;'>MAA Jericho Scraper</h1>
            <p style='margin: 0; color: #3b3b54;'>Scrape the Museum of Archaeology and Anthropology collection for artefacts using a keyword and export the results as CSV.</p>
            </div>""",
            elem_id="header-card",
        )

        with gr.Row(elem_classes="launch-controls"):
            max_objects_input = gr.Slider(
                minimum=1,
                maximum=10000,
                value=default_max,
                step=10,
                label="Maximum objects to scrape",
            )
            start_id_input = gr.Number(
                value=default_start,
                precision=0,
                label="Fallback starting object ID",
            )
            keyword_input = gr.Textbox(
                value=default_keyword,
                label="Search keyword",
                placeholder="Try terms such as 'Jericho', 'pottery', 'beads'...",
            )

        scrape_button = gr.Button("Run scraper", variant="primary", size="lg")
        status_markdown = gr.Markdown("Ready.", elem_id="status-card")
        results_table = gr.Dataframe(
            value=[],
            datatype="str",
            label="Scraped Records",
            interactive=False,
            wrap=True,
            row_count=(0, "dynamic"),
            col_count=(0, "dynamic"),
        )
        csv_state = gr.State("")
        download_button = gr.DownloadButton(
            label="Download CSV",
            variant="secondary",
            size="lg",
        )

        scrape_button.click(
            fn=run_scraper_interface,
            inputs=[max_objects_input, start_id_input, keyword_input],
            outputs=[results_table, csv_state, status_markdown],
        )
        download_button.click(fn=prepare_csv_file, inputs=csv_state, outputs=download_button)

    demo.launch()


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Scrape MAA object pages into a CSV file or launch the Gradio UI",
    )
    parser.add_argument(
        "--keyword",
        default=DEFAULT_KEYWORD,
        help="Search keyword to filter objects (default: jericho)",
    )
    parser.add_argument(
        "--max-objects",
        type=int,
        default=100,
        help="Number of object pages to scrape when running in CLI mode (default: 100)",
    )
    parser.add_argument(
        "--start-id",
        type=int,
        default=431363,
        help="Fallback starting ID for sequential scraping",
    )
    parser.add_argument(
        "--output",
        default="jericho_objects.csv",
        help="Output CSV file path when running in CLI mode",
    )
    parser.add_argument(
        "--mode",
        choices=["cli", "gradio"],
        default="gradio",
        help="Execution mode: 'cli' for command line, 'gradio' for the web UI",
    )

    args = parser.parse_args()

    if args.mode == "cli":
        records = scrape_objects(max_objects=args.max_objects, start_id=args.start_id, keyword=args.keyword)
        write_csv(records, args.output)
        print(f"Wrote {len(records)} records to {args.output}")
    else:
        launch_gradio_app(
            default_max=args.max_objects,
            default_start=args.start_id,
            default_keyword=args.keyword,
        )


if __name__ == "__main__":
    main()
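
For reference, a minimal sketch of how the deleted module's helpers could be driven from another script if the file is restored locally (the `app` module name and the small batch size are assumptions, and the MAA site must be reachable):

    # Hypothetical usage of the deleted module's public helpers.
    from app import scrape_objects, write_csv

    # Mirrors the CLI path: python app.py --mode cli --keyword jericho --max-objects 5
    records = scrape_objects(max_objects=5, keyword="jericho")
    write_csv(records, "jericho_sample.csv")
    print(f"Saved {len(records)} record(s) to jericho_sample.csv")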