manu committed
Commit bc2554a · verified · 1 Parent(s): 02bf460

Upload maa_jericho_scraper.py

Files changed (1)
  1. maa_jericho_scraper.py +401 -0
maa_jericho_scraper.py ADDED
@@ -0,0 +1,401 @@
#!/usr/bin/env python3
"""
maa_jericho_scraper.py
~~~~~~~~~~~~~~~~~~~~~~

This script scrapes object records from the Museum of Archaeology
and Anthropology (MAA) collection website at the University of
Cambridge. It now also offers a Gradio-powered web interface so that
records can be gathered and downloaded without using the command line.

The scraper targets the search results for a user-provided keyword
(default: "jericho") and collects detailed object pages containing
fields such as Accession Number, Description, Place, Period, Source,
Department, Reference Numbers, Cultural Affiliation, Material, Local
Term, Measurements and a series of Events.

Usage (CLI mode):
    python maa_jericho_scraper.py --mode cli --keyword jericho --max-objects 100 --output jericho_objects.csv

Usage (Gradio UI):
    python maa_jericho_scraper.py

Options:
    --keyword: search keyword for filtering objects (default: jericho)
    --max-objects: number of object pages to scrape (default: 100)
    --output: path to the CSV file to write (default: jericho_objects.csv)
    --start-id: starting numeric object identifier for fallback scraping
    --mode: "cli" to run headless, "gradio" (default) to launch the UI

Requirements:
    - Python 3.7+
    - requests
    - beautifulsoup4
    - gradio (for the UI)

Note:
    This script is provided for educational purposes. Always review
    and respect the terms of use of any website you scrape. Use
    responsibly and avoid overwhelming the target servers with rapid
    requests.
"""
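
# Quick-start sketch (illustrative; assumes a fresh environment and the packages listed above):
#   pip install requests beautifulsoup4 gradio
#   python maa_jericho_scraper.py --mode cli --keyword jericho --max-objects 10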

import argparse
import csv
import io
import re
import sys
import tempfile
import time
from typing import Dict, List, Optional, Tuple

import requests
from bs4 import BeautifulSoup

try:
    import gradio as gr
except Exception:  # pragma: no cover - import guard for optional dependency
    gr = None  # type: ignore[assignment]

BASE_URL = "https://collections.maa.cam.ac.uk"
SEARCH_PATH = "/objects/"
DEFAULT_KEYWORD = "jericho"
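
# For reference, the URL shapes these constants produce (illustrative values):
#   search page: https://collections.maa.cam.ac.uk/objects/?query=jericho&page=1
#   object page: https://collections.maa.cam.ac.uk/objects/431363/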

def get_search_page(session: requests.Session, keyword: str, page_num: int) -> Optional[BeautifulSoup]:
    """Return a BeautifulSoup object for a given page of search results."""
    params = {"query": keyword, "page": page_num}
    try:
        resp = session.get(f"{BASE_URL}{SEARCH_PATH}", params=params, timeout=30)
        resp.raise_for_status()
    except Exception as exc:  # pragma: no cover - network dependent
        sys.stderr.write(f"[warning] Failed to fetch search page {page_num} for keyword '{keyword}': {exc}\n")
        return None
    return BeautifulSoup(resp.text, "html.parser")
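
# Note: the site is assumed to accept ?query=<keyword>&page=<n> as above; when a request fails,
# this returns None and scrape_objects() falls back to sequential object IDs (see below).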

def extract_object_links(soup: BeautifulSoup) -> List[str]:
    """Extract object page URLs from a search results page."""
    links: List[str] = []
    for a in soup.find_all("a", href=True):
        href = a.get("href") or ""
        if re.fullmatch(r"/objects/\d+/?", href):
            full_url = f"{BASE_URL}{href.rstrip('/')}/"
            if full_url not in links:
                links.append(full_url)
    return links
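
# Illustrative behaviour on a minimal snippet (assuming the listing markup links to /objects/<id>):
#   soup = BeautifulSoup('<a href="/objects/12345">item</a>', "html.parser")
#   extract_object_links(soup)  # -> ["https://collections.maa.cam.ac.uk/objects/12345/"]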

def parse_object_page(session: requests.Session, url: str) -> Optional[Dict[str, str]]:
    """Retrieve and parse an individual object page."""
    try:
        resp = session.get(url, timeout=30)
        resp.raise_for_status()
    except Exception as exc:  # pragma: no cover - network dependent
        sys.stderr.write(f"[warning] Failed to fetch object page {url}: {exc}\n")
        return None

    soup = BeautifulSoup(resp.text, "html.parser")
    result: Dict[str, str] = {
        "Accession No": "",
        "Description": "",
        "Place": "",
        "Period": "",
        "Source": "",
        "Department": "",
        "Reference Numbers": "",
        "Cultural Affiliation": "",
        "Material": "",
        "Local Term": "",
        "Measurements": "",
        "Events": "",
        "FM": "",
        "URL": url,
    }

    data_divs = soup.find_all("div", class_=lambda c: c and "flex-wrap" in c and "flex-md-nowrap" in c)
    for div in data_divs:
        label_p = div.find("p", class_=lambda c: c and "fw-bold" in c)
        if not label_p:
            continue
        label = label_p.get_text(strip=True).rstrip(":").strip()
        if label == "Events":
            events_container = div.find("div", class_=lambda c: c and "d-flex" in c and "flex-column" in c)
            if events_container:
                entries: List[str] = []
                for p_tag in events_container.find_all("p", class_=lambda c: c and c.startswith("col-")):
                    text = p_tag.get_text(separator=" ").strip()
                    text = re.sub(r"\s+", " ", text)
                    if text:
                        entries.append(text)
                result["Events"] = " || ".join(entries)
        else:
            value_p = label_p.find_next_sibling("p")
            if value_p:
                value_text = value_p.get_text(separator=" ").strip()
                value_text = re.sub(r"\s+", " ", value_text)
                value_text = re.sub(r";\s*", "; ", value_text)
                result[label] = value_text

    fm_tag = soup.find("p", class_=lambda c: c and c.startswith("fs-"))
    if fm_tag:
        result["FM"] = fm_tag.get_text(strip=True)
    return result
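
# Each record is a flat dict keyed by the labels above; multiple Events entries are joined
# with " || ", so a downstream consumer can recover the list with record["Events"].split(" || ").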

def scrape_objects(max_objects: int = 100, start_id: int = 431363, keyword: str = DEFAULT_KEYWORD) -> List[Dict[str, str]]:
    """Scrape object pages until a desired number of results is collected."""
    session = requests.Session()
    session.headers.update(
        {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Connection": "keep-alive",
        }
    )

    search_keyword = keyword.strip() or DEFAULT_KEYWORD
    object_urls: List[str] = []
    page = 1
    while len(object_urls) < max_objects:
        soup = get_search_page(session, search_keyword, page)
        if soup is None:
            break
        new_links = extract_object_links(soup)
        if not new_links:
            break
        for link in new_links:
            if link not in object_urls:
                object_urls.append(link)
            if len(object_urls) >= max_objects:
                break
        page += 1
        time.sleep(0.5)

    if len(object_urls) < max_objects:
        current_id = start_id
        while len(object_urls) < max_objects:
            url = f"{BASE_URL}{SEARCH_PATH}{current_id}/"
            if url not in object_urls:
                object_urls.append(url)
            current_id += 1

    results: List[Dict[str, str]] = []
    for idx, url in enumerate(object_urls[:max_objects], start=1):
        sys.stderr.write(f"[info] ({idx}/{max_objects}) Scraping {url}\n")
        data = parse_object_page(session, url)
        if data:
            results.append(data)
        else:
            results.append({"URL": url})
        time.sleep(0.5)
    return results
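
# Example call (sketch; this hits the live site, so keep max_objects small when experimenting):
#   records = scrape_objects(max_objects=5, keyword="jericho")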

def collect_fieldnames(records: List[Dict[str, str]]) -> List[str]:
    fieldnames: List[str] = []
    for rec in records:
        for key in rec.keys():
            if key not in fieldnames:
                fieldnames.append(key)
    return fieldnames or ["URL"]


def records_to_csv_text(records: List[Dict[str, str]]) -> Tuple[List[str], str]:
    fieldnames = collect_fieldnames(records)
    buffer = io.StringIO()
    writer = csv.DictWriter(buffer, fieldnames=fieldnames)
    writer.writeheader()
    for rec in records:
        writer.writerow({key: rec.get(key, "") for key in fieldnames})
    return fieldnames, buffer.getvalue()


def prepare_table(records: List[Dict[str, str]], fieldnames: List[str]) -> List[List[str]]:
    return [[rec.get(field, "") for field in fieldnames] for rec in records]


def write_csv(records: List[Dict[str, str]], output_path: str) -> None:
    """Write scraped records to a CSV file."""
    _, csv_text = records_to_csv_text(records)
    with open(output_path, "w", newline="", encoding="utf-8") as f:
        f.write(csv_text)
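
# CSV helpers in brief: column order follows the first-seen key order across records, e.g.
#   fieldnames, csv_text = records_to_csv_text(records)
#   write_csv(records, "jericho_objects.csv")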

def run_scraper_interface(max_objects: int, start_id: int, keyword: str):
    if gr is None:  # pragma: no cover - runtime guard
        raise RuntimeError("Gradio is not installed. Install it with `pip install gradio` to use the UI mode.")

    try:
        max_int = max(1, int(max_objects))
        start_int = int(start_id)
    except Exception:
        status = "Please provide valid numeric values for max objects and start ID."
        return gr.update(value=[], headers=[]), "", status

    search_keyword = (keyword or "").strip()
    if not search_keyword:
        status = "Please enter a search keyword."
        return gr.update(value=[], headers=[]), "", status

    records = scrape_objects(max_objects=max_int, start_id=start_int, keyword=search_keyword)
    fieldnames, csv_text = records_to_csv_text(records)
    table = prepare_table(records, fieldnames)
    status = f"Scraped {len(records)} object(s) for keyword '{search_keyword}'."
    return gr.update(value=table, headers=fieldnames), csv_text, status
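
# The three return values above map positionally onto outputs=[results_table, csv_state,
# status_markdown] as wired up in launch_gradio_app() below.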

def prepare_csv_file(csv_text: str) -> Optional[str]:
    if not csv_text:
        return None
    tmp_file = tempfile.NamedTemporaryFile(
        delete=False,
        suffix=".csv",
        prefix="jericho_",
        mode="w",
        encoding="utf-8",
    )
    with tmp_file:
        tmp_file.write(csv_text)
    return tmp_file.name
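
# delete=False keeps the temporary CSV on disk so the DownloadButton can serve it after this
# function returns; the file is left to normal temp-directory cleanup rather than removed here.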

def launch_gradio_app(
    default_max: int = 25,
    default_start: int = 431363,
    default_keyword: str = DEFAULT_KEYWORD,
) -> None:
    if gr is None:  # pragma: no cover - runtime guard
        raise RuntimeError("Gradio is not installed. Install it with `pip install gradio` to use the UI mode.")

    custom_css = """
    @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@300;400;500;600&display=swap');
    .gradio-container {
        background: radial-gradient(circle at top, #f8f5ff 0%, #f5f7fb 55%, #eef1f6 100%);
        font-family: 'IBM Plex Sans', 'Segoe UI', sans-serif;
        color: #1f2937;
    }
    #header-card {
        border-radius: 16px;
        background: rgba(255, 255, 255, 0.85);
        box-shadow: 0 12px 24px rgba(79, 59, 169, 0.15);
        padding: 20px;
    }
    #status-card .gr-markdown {
        background: rgba(255, 255, 255, 0.9);
        padding: 12px 16px;
        border-radius: 12px;
        border-left: 4px solid #6c4ddb;
    }
    .launch-controls .gr-form {
        gap: 16px !important;
    }
    """

    # with gr.Blocks(title="MAA Jericho Scraper", css=custom_css) as demo:
    with gr.Blocks(title="MAA Jericho Scraper") as demo:
        gr.Markdown(
            """<div id='header-card'>
            <h1 style='margin-bottom: 0.4rem; color: #4f3ba9;'>MAA Jericho Scraper</h1>
            <p style='margin: 0; color: #3b3b54;'>Scrape the Museum of Archaeology and Anthropology collection for artefacts using a keyword and export the results as CSV.</p>
            </div>""",
            elem_id="header-card",
        )

        with gr.Row(elem_classes="launch-controls"):
            max_objects_input = gr.Slider(
                minimum=1,
                maximum=500,
                value=default_max,
                step=1,
                label="Maximum objects to scrape",
            )
            start_id_input = gr.Number(
                value=default_start,
                precision=0,
                label="Fallback starting object ID",
            )
            keyword_input = gr.Textbox(
                value=default_keyword,
                label="Search keyword",
                placeholder="Try terms such as 'Jericho', 'pottery', 'beads'...",
            )

        scrape_button = gr.Button("Run scraper", variant="primary", size="lg")
        status_markdown = gr.Markdown("Ready.", elem_id="status-card")
        results_table = gr.Dataframe(
            value=[],
            datatype="str",
            label="Scraped Records",
            interactive=False,
            wrap=True,
            row_count=(0, "dynamic"),
            col_count=(0, "dynamic"),
        )
        csv_state = gr.State("")
        download_button = gr.DownloadButton(
            label="Download CSV",
            variant="secondary",
            size="lg",
        )

        scrape_button.click(
            fn=run_scraper_interface,
            inputs=[max_objects_input, start_id_input, keyword_input],
            outputs=[results_table, csv_state, status_markdown],
        )
        download_button.click(fn=prepare_csv_file, inputs=csv_state, outputs=download_button)

    demo.launch()

def main() -> None:
    parser = argparse.ArgumentParser(
        description="Scrape MAA object pages into a CSV file or launch the Gradio UI",
    )
    parser.add_argument(
        "--keyword",
        default=DEFAULT_KEYWORD,
        help="Search keyword to filter objects (default: jericho)",
    )
    parser.add_argument(
        "--max-objects",
        type=int,
        default=100,
        help="Number of object pages to scrape when running in CLI mode (default: 100)",
    )
    parser.add_argument(
        "--start-id",
        type=int,
        default=431363,
        help="Fallback starting ID for sequential scraping",
    )
    parser.add_argument(
        "--output",
        default="jericho_objects.csv",
        help="Output CSV file path when running in CLI mode",
    )
    parser.add_argument(
        "--mode",
        choices=["cli", "gradio"],
        default="gradio",
        help="Execution mode: 'cli' for command line, 'gradio' for the web UI",
    )

    args = parser.parse_args()

    if args.mode == "cli":
        records = scrape_objects(max_objects=args.max_objects, start_id=args.start_id, keyword=args.keyword)
        write_csv(records, args.output)
        print(f"Wrote {len(records)} records to {args.output}")
    else:
        launch_gradio_app(
            default_max=args.max_objects,
            default_start=args.start_id,
            default_keyword=args.keyword,
        )


if __name__ == "__main__":
    main()