manu committed
Commit f5be1b0 · verified · 1 Parent(s): b7aa4ec

Delete app.py

Files changed (1)
  1. app.py +0 -401
app.py DELETED
@@ -1,401 +0,0 @@
#!/usr/bin/env python3
"""
maa_jericho_scraper.py
~~~~~~~~~~~~~~~~~~~~~~

This script scrapes object records from the Museum of Archaeology
and Anthropology (MAA) collection website at the University of
Cambridge. It also offers a Gradio-powered web interface so that
records can be gathered and downloaded without using the command line.

The scraper targets the search results for a user-provided keyword
(default: "jericho") and collects detailed object pages containing
fields such as Accession Number, Description, Place, Period, Source,
Department, Reference Numbers, Cultural Affiliation, Material, Local
Term, Measurements and a series of Events.

Usage (CLI mode):
    python app.py --mode cli --keyword jericho --max-objects 100 --output jericho_objects.csv

Usage (Gradio UI):
    python app.py

Options:
    --keyword: search keyword for filtering objects (default: jericho)
    --max-objects: number of object pages to scrape (default: 100)
    --output: path to the CSV file to write (default: jericho_objects.csv)
    --start-id: starting numeric object identifier for fallback scraping
    --mode: "cli" to run headless, "gradio" (default) to launch the UI

Requirements:
    - Python 3.7+
    - requests
    - beautifulsoup4
    - gradio (for the UI)

Note:
    This script is provided for educational purposes. Always review
    and respect the terms of use of any website you scrape. Use
    responsibly and avoid overwhelming the target servers with rapid
    requests.
"""

import argparse
import csv
import io
import re
import sys
import tempfile
import time
from typing import Dict, List, Optional, Tuple

import requests
from bs4 import BeautifulSoup

try:
    import gradio as gr
except Exception:  # pragma: no cover - import guard for optional dependency
    gr = None  # type: ignore[assignment]

BASE_URL = "https://collections.maa.cam.ac.uk"
SEARCH_PATH = "/objects/"
DEFAULT_KEYWORD = "jericho"


def get_search_page(session: requests.Session, keyword: str, page_num: int) -> Optional[BeautifulSoup]:
    """Return a BeautifulSoup object for a given page of search results."""
    params = {"query": keyword, "page": page_num}
    try:
        resp = session.get(f"{BASE_URL}{SEARCH_PATH}", params=params, timeout=30)
        resp.raise_for_status()
    except Exception as exc:  # pragma: no cover - network dependent
        sys.stderr.write(f"[warning] Failed to fetch search page {page_num} for keyword '{keyword}': {exc}\n")
        return None
    return BeautifulSoup(resp.text, "html.parser")


def extract_object_links(soup: BeautifulSoup) -> List[str]:
    """Extract object page URLs from a search results page."""
    links: List[str] = []
    for a in soup.find_all("a", href=True):
        href = a.get("href") or ""
        if re.fullmatch(r"/objects/\d+/?", href):
            full_url = f"{BASE_URL}{href.rstrip('/')}/"
            if full_url not in links:
                links.append(full_url)
    return links


def parse_object_page(session: requests.Session, url: str) -> Optional[Dict[str, str]]:
    """Retrieve and parse an individual object page."""
    try:
        resp = session.get(url, timeout=30)
        resp.raise_for_status()
    except Exception as exc:  # pragma: no cover - network dependent
        sys.stderr.write(f"[warning] Failed to fetch object page {url}: {exc}\n")
        return None

    soup = BeautifulSoup(resp.text, "html.parser")
    result: Dict[str, str] = {
        "Accession No": "",
        "Description": "",
        "Place": "",
        "Period": "",
        "Source": "",
        "Department": "",
        "Reference Numbers": "",
        "Cultural Affiliation": "",
        "Material": "",
        "Local Term": "",
        "Measurements": "",
        "Events": "",
        "FM": "",
        "URL": url,
    }

    data_divs = soup.find_all("div", class_=lambda c: c and "flex-wrap" in c and "flex-md-nowrap" in c)
    for div in data_divs:
        label_p = div.find("p", class_=lambda c: c and "fw-bold" in c)
        if not label_p:
            continue
        label = label_p.get_text(strip=True).rstrip(":").strip()
        if label == "Events":
            events_container = div.find("div", class_=lambda c: c and "d-flex" in c and "flex-column" in c)
            if events_container:
                entries: List[str] = []
                for p_tag in events_container.find_all("p", class_=lambda c: c and c.startswith("col-")):
                    text = p_tag.get_text(separator=" ").strip()
                    text = re.sub(r"\s+", " ", text)
                    if text:
                        entries.append(text)
                result["Events"] = " || ".join(entries)
        else:
            value_p = label_p.find_next_sibling("p")
            if value_p:
                value_text = value_p.get_text(separator=" ").strip()
                value_text = re.sub(r"\s+", " ", value_text)
                value_text = re.sub(r";\s*", "; ", value_text)
                result[label] = value_text

    fm_tag = soup.find("p", class_=lambda c: c and c.startswith("fs-"))
    if fm_tag:
        result["FM"] = fm_tag.get_text(strip=True)
    return result


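# Note: scrape_objects() below works in two phases. It first walks the keyword
# search pages (get_search_page + extract_object_links) and, if fewer than
# max_objects URLs are found, pads the list with sequential /objects/<id>/ URLs
# starting at start_id before fetching each page with parse_object_page.
# The 0.5 s sleeps keep the request rate polite.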
def scrape_objects(max_objects: int = 100, start_id: int = 431363, keyword: str = DEFAULT_KEYWORD) -> List[Dict[str, str]]:
    """Scrape object pages until a desired number of results is collected."""
    session = requests.Session()
    session.headers.update(
        {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Connection": "keep-alive",
        }
    )

    search_keyword = keyword.strip() or DEFAULT_KEYWORD
    object_urls: List[str] = []
    page = 1
    while len(object_urls) < max_objects:
        soup = get_search_page(session, search_keyword, page)
        if soup is None:
            break
        new_links = extract_object_links(soup)
        if not new_links:
            break
        for link in new_links:
            if link not in object_urls:
                object_urls.append(link)
            if len(object_urls) >= max_objects:
                break
        page += 1
        time.sleep(0.5)

    if len(object_urls) < max_objects:
        current_id = start_id
        while len(object_urls) < max_objects:
            url = f"{BASE_URL}{SEARCH_PATH}{current_id}/"
            if url not in object_urls:
                object_urls.append(url)
            current_id += 1

    results: List[Dict[str, str]] = []
    for idx, url in enumerate(object_urls[:max_objects], start=1):
        sys.stderr.write(f"[info] ({idx}/{max_objects}) Scraping {url}\n")
        data = parse_object_page(session, url)
        if data:
            results.append(data)
        else:
            results.append({"URL": url})
        time.sleep(0.5)
    return results


def collect_fieldnames(records: List[Dict[str, str]]) -> List[str]:
    fieldnames: List[str] = []
    for rec in records:
        for key in rec.keys():
            if key not in fieldnames:
                fieldnames.append(key)
    return fieldnames or ["URL"]


def records_to_csv_text(records: List[Dict[str, str]]) -> Tuple[List[str], str]:
    fieldnames = collect_fieldnames(records)
    buffer = io.StringIO()
    writer = csv.DictWriter(buffer, fieldnames=fieldnames)
    writer.writeheader()
    for rec in records:
        writer.writerow({key: rec.get(key, "") for key in fieldnames})
    return fieldnames, buffer.getvalue()


def prepare_table(records: List[Dict[str, str]], fieldnames: List[str]) -> List[List[str]]:
    return [[rec.get(field, "") for field in fieldnames] for rec in records]


def write_csv(records: List[Dict[str, str]], output_path: str) -> None:
    """Write scraped records to a CSV file."""
    _, csv_text = records_to_csv_text(records)
    with open(output_path, "w", newline="", encoding="utf-8") as f:
        f.write(csv_text)


def run_scraper_interface(max_objects: int, start_id: int, keyword: str):
    if gr is None:  # pragma: no cover - runtime guard
        raise RuntimeError("Gradio is not installed. Install it with `pip install gradio` to use the UI mode.")

    try:
        max_int = max(1, int(max_objects))
        start_int = int(start_id)
    except Exception:
        status = "Please provide valid numeric values for max objects and start ID."
        return gr.update(value=[], headers=[]), "", status

    search_keyword = (keyword or "").strip()
    if not search_keyword:
        status = "Please enter a search keyword."
        return gr.update(value=[], headers=[]), "", status

    records = scrape_objects(max_objects=max_int, start_id=start_int, keyword=search_keyword)
    fieldnames, csv_text = records_to_csv_text(records)
    table = prepare_table(records, fieldnames)
    status = f"Scraped {len(records)} object(s) for keyword '{search_keyword}'."
    return gr.update(value=table, headers=fieldnames), csv_text, status


def prepare_csv_file(csv_text: str) -> Optional[str]:
    if not csv_text:
        return None
    tmp_file = tempfile.NamedTemporaryFile(
        delete=False,
        suffix=".csv",
        prefix="jericho_",
        mode="w",
        encoding="utf-8",
    )
    with tmp_file:
        tmp_file.write(csv_text)
    return tmp_file.name


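# Note: in launch_gradio_app() below, the Slider/Number/Textbox inputs feed
# run_scraper_interface(), whose outputs are the results Dataframe, the CSV text
# held in a gr.State, and the status Markdown; the DownloadButton's click handler
# writes that CSV text to a temporary file via prepare_csv_file(). The custom_css
# string is currently unused because the gr.Blocks(css=custom_css) call is
# commented out.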
def launch_gradio_app(
    default_max: int = 25,
    default_start: int = 431363,
    default_keyword: str = DEFAULT_KEYWORD,
) -> None:
    if gr is None:  # pragma: no cover - runtime guard
        raise RuntimeError("Gradio is not installed. Install it with `pip install gradio` to use the UI mode.")

    custom_css = """
    @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@300;400;500;600&display=swap');
    .gradio-container {
        background: radial-gradient(circle at top, #f8f5ff 0%, #f5f7fb 55%, #eef1f6 100%);
        font-family: 'IBM Plex Sans', 'Segoe UI', sans-serif;
        color: #1f2937;
    }
    #header-card {
        border-radius: 16px;
        background: rgba(255, 255, 255, 0.85);
        box-shadow: 0 12px 24px rgba(79, 59, 169, 0.15);
        padding: 20px;
    }
    #status-card .gr-markdown {
        background: rgba(255, 255, 255, 0.9);
        padding: 12px 16px;
        border-radius: 12px;
        border-left: 4px solid #6c4ddb;
    }
    .launch-controls .gr-form {
        gap: 16px !important;
    }
    """

    # with gr.Blocks(title="MAA Jericho Scraper", css=custom_css) as demo:
    with gr.Blocks(title="MAA Jericho Scraper") as demo:
        gr.Markdown(
            """<div id='header-card'>
            <h1 style='margin-bottom: 0.4rem; color: #4f3ba9;'>MAA Jericho Scraper</h1>
            <p style='margin: 0; color: #3b3b54;'>Scrape the Museum of Archaeology and Anthropology collection for artefacts using a keyword and export the results as CSV.</p>
            </div>""",
            elem_id="header-card",
        )

        with gr.Row(elem_classes="launch-controls"):
            max_objects_input = gr.Slider(
                minimum=1,
                maximum=10000,
                value=default_max,
                step=10,
                label="Maximum objects to scrape",
            )
            start_id_input = gr.Number(
                value=default_start,
                precision=0,
                label="Fallback starting object ID",
            )
            keyword_input = gr.Textbox(
                value=default_keyword,
                label="Search keyword",
                placeholder="Try terms such as 'Jericho', 'pottery', 'beads'...",
            )

        scrape_button = gr.Button("Run scraper", variant="primary", size="lg")
        status_markdown = gr.Markdown("Ready.", elem_id="status-card")
        results_table = gr.Dataframe(
            value=[],
            datatype="str",
            label="Scraped Records",
            interactive=False,
            wrap=True,
            row_count=(0, "dynamic"),
            col_count=(0, "dynamic"),
        )
        csv_state = gr.State("")
        download_button = gr.DownloadButton(
            label="Download CSV",
            variant="secondary",
            size="lg",
        )

        scrape_button.click(
            fn=run_scraper_interface,
            inputs=[max_objects_input, start_id_input, keyword_input],
            outputs=[results_table, csv_state, status_markdown],
        )
        download_button.click(fn=prepare_csv_file, inputs=csv_state, outputs=download_button)

    demo.launch()


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Scrape MAA object pages into a CSV file or launch the Gradio UI",
    )
    parser.add_argument(
        "--keyword",
        default=DEFAULT_KEYWORD,
        help="Search keyword to filter objects (default: jericho)",
    )
    parser.add_argument(
        "--max-objects",
        type=int,
        default=100,
        help="Number of object pages to scrape when running in CLI mode (default: 100)",
    )
    parser.add_argument(
        "--start-id",
        type=int,
        default=431363,
        help="Fallback starting ID for sequential scraping",
    )
    parser.add_argument(
        "--output",
        default="jericho_objects.csv",
        help="Output CSV file path when running in CLI mode",
    )
    parser.add_argument(
        "--mode",
        choices=["cli", "gradio"],
        default="gradio",
        help="Execution mode: 'cli' for command line, 'gradio' for the web UI",
    )

    args = parser.parse_args()

    if args.mode == "cli":
        records = scrape_objects(max_objects=args.max_objects, start_id=args.start_id, keyword=args.keyword)
        write_csv(records, args.output)
        print(f"Wrote {len(records)} records to {args.output}")
    else:
        launch_gradio_app(
            default_max=args.max_objects,
            default_start=args.start_id,
            default_keyword=args.keyword,
        )


if __name__ == "__main__":
    main()
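
For reference, a minimal sketch of how the deleted module's helpers could be driven from another script if the file is restored locally (the `app` module name and the small batch size are assumptions, and the MAA site must be reachable):

    # Hypothetical usage of the deleted module's public helpers.
    from app import scrape_objects, write_csv

    # Mirrors the CLI path: python app.py --mode cli --keyword jericho --max-objects 5
    records = scrape_objects(max_objects=5, keyword="jericho")
    write_csv(records, "jericho_sample.csv")
    print(f"Saved {len(records)} record(s) to jericho_sample.csv")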