LisaMegaWatts committed on
Commit
bc4e57c
·
verified ·
1 Parent(s): 20a2e9f

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +360 -0
app.py ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio frontend for the text processing pipeline.
3
+
4
+ Provides drag-and-drop file upload, URL fetching, Internet Archive
5
+ search/browse, and corpus management with HuggingFace push.
6
+
7
+ Usage:
8
+ python app.py # Launch on http://localhost:7860
9
+ python app.py --share # Launch with public Gradio link
10
+ """
11
+
12
+ import argparse
13
+ import logging
14
+ import os
15
+ import shutil
16
+ import sys
17
+ import tempfile
18
+ from pathlib import Path
19
+
20
+ # Ensure the script directory is on the path for imports
21
+ SCRIPT_DIR = Path(__file__).resolve().parent
22
+ sys.path.insert(0, str(SCRIPT_DIR))
23
+
24
+ from pipeline import Pipeline
25
+
26
+ logger = logging.getLogger("app")
27
+
28
+ # ---------------------------------------------------------------------------
29
+ # Pipeline singleton
30
+ # ---------------------------------------------------------------------------
31
+
32
+ _pipeline: Pipeline | None = None
33
+
34
+
35
def get_pipeline() -> Pipeline:
    """Return the shared Pipeline, constructing it on the first call.

    The instance is kept in the module-level ``_pipeline`` variable, so
    every later call hands back the same object.
    """
    global _pipeline
    if _pipeline is not None:
        return _pipeline
    _pipeline = Pipeline()
    return _pipeline
40
+
41
+
42
+ # ---------------------------------------------------------------------------
43
+ # Tab 1: Add Texts
44
+ # ---------------------------------------------------------------------------
45
+
46
def process_uploaded_files(files) -> str:
    """Copy uploaded files into the pipeline inbox and process them.

    Args:
        files: Uploads handed over by the file component. Each entry may be
            a filesystem path (``str`` or ``os.PathLike``) or an object that
            exposes its path through a ``name`` attribute.

    Returns:
        A multi-line report: one "Copied ..." line per file, followed by the
        number of new chunks and the train/val totals. If copying or
        processing fails, the report ends with an ``Error: ...`` line, the
        same convention the other handlers in this file use.
    """
    if not files:
        return "No files uploaded."

    pipeline = get_pipeline()
    results = []

    try:
        for file_obj in files:
            # A Path has a ``name`` attribute too (its basename), so path-like
            # entries must be recognised before falling back to ``.name``.
            if isinstance(file_obj, (str, os.PathLike)):
                src = Path(file_obj)
            else:
                src = Path(file_obj.name)
            dest = pipeline.inbox / src.name

            # Copy to inbox
            shutil.copy2(str(src), str(dest))
            results.append(f"Copied {src.name} to inbox/")

        # Process inbox
        new_chunks = pipeline.process_inbox()

        # Rebuild output
        train_n, val_n = pipeline.rebuild_output()
    except Exception as e:
        # UI boundary: report what was done so far plus the failure.
        results.append(f"Error: {e}")
        return "\n".join(results)

    results.append(f"\nProcessed: {new_chunks} new chunks")
    results.append(f"Total corpus: {train_n} train / {val_n} val")

    return "\n".join(results)
72
+
73
+
74
def _inbox_name_for_url(url: str) -> str:
    """Return the inbox filename to use for the text downloaded from *url*."""
    from urllib.parse import unquote, urlsplit

    # Only the path component names the resource. The query string and
    # fragment are dropped so characters such as "?" never end up in the
    # filename, and a trailing "/" no longer produces an empty name.
    path = unquote(urlsplit(url).path)
    fname = path.rstrip("/").rsplit("/", 1)[-1] or "download"
    if not fname.endswith(".txt"):
        fname = fname.replace(".", "_") + ".txt"
    return fname


def fetch_url(url: str) -> str:
    """Download text from a URL, save it to the inbox, and process it.

    Args:
        url: Address of the text to download. Surrounding whitespace is
            ignored.

    Returns:
        A status report (downloaded filename and size, new chunk count,
        train/val totals), a prompt when *url* is empty, or ``Error: ...``
        when the download or processing raises.
    """
    if not url or not url.strip():
        return "Please enter a URL."

    import requests

    pipeline = get_pipeline()
    url = url.strip()

    try:
        resp = requests.get(url, timeout=30, headers={
            "User-Agent": "PhilosophyCorpus-Pipeline/1.0",
        })
        resp.raise_for_status()

        # Determine filename from the URL path
        fname = _inbox_name_for_url(url)

        # Save to inbox
        dest = pipeline.inbox / fname
        dest.write_text(resp.text, encoding="utf-8")

        # Process
        new_chunks = pipeline.process_inbox()
        train_n, val_n = pipeline.rebuild_output()

        return (
            f"Downloaded: {fname} ({len(resp.text):,} chars)\n"
            f"Processed: {new_chunks} new chunks\n"
            f"Total corpus: {train_n} train / {val_n} val"
        )
    except Exception as e:
        # UI boundary: surface the failure as text instead of raising.
        return f"Error: {e}"
110
+
111
+
112
+ # ---------------------------------------------------------------------------
113
+ # Tab 2: Internet Archive Search
114
+ # ---------------------------------------------------------------------------
115
+
116
def _ia_result_row(result) -> list:
    """Turn one search result mapping into a row for the results table."""

    def field(key):
        # The result schema is defined by search_ia, which is not visible
        # here; treat an absent key like an empty value instead of failing.
        try:
            return result[key]
        except KeyError:
            return None

    creator = field("creator")
    if isinstance(creator, list):
        creator = ", ".join(creator)
    date = field("date")
    downloads = field("downloads")

    return [
        field("identifier") or "",
        field("title") or "",
        creator or "",
        str(date)[:10] if date else "",
        str(downloads) if downloads is not None else "",
    ]


def search_archive(query: str, subject: str) -> list[list]:
    """Search Internet Archive and return results as table rows.

    Args:
        query: Search text. A blank query returns no rows without searching.
        subject: Subject filter label. ``"All"`` (or no selection) disables
            the filter; any other value is lower-cased before being passed
            to ``search_ia``.

    Returns:
        One ``[identifier, title, creator, date, downloads]`` row per result,
        with list-valued creators joined by ", ", dates cut to their first
        10 characters, and missing fields shown as empty strings.
    """
    if not query.strip():
        return []

    from sources.ia_search import search_ia

    subject_key = subject.lower() if subject and subject != "All" else None
    results = search_ia(query, subject=subject_key, rows=20)

    return [_ia_result_row(r) for r in results]
140
+
141
+
142
def add_ia_text(identifier: str) -> str:
    """Download an IA text and process it through the pipeline.

    Args:
        identifier: Internet Archive item identifier. Surrounding whitespace
            is ignored.

    Returns:
        A status report (identifier and size, new chunk count, train/val
        totals), a prompt when *identifier* is blank, or ``Error: ...`` when
        the identifier is unusable or the download/processing raises.
    """
    ident = identifier.strip() if identifier else ""
    if not ident:
        return "Please enter an Internet Archive identifier."

    # The identifier becomes part of a filename inside the inbox, so it must
    # not be able to point somewhere else on disk.
    if any(ch in ident for ch in ("/", "\\", "\0")):
        return f"Error: invalid Internet Archive identifier: {ident!r}"

    from sources.ia_search import get_ia_text

    pipeline = get_pipeline()

    try:
        text = get_ia_text(ident)

        fname = f"ia_{ident}.txt"
        dest = pipeline.inbox / fname
        dest.write_text(text, encoding="utf-8")

        new_chunks = pipeline.process_inbox()
        train_n, val_n = pipeline.rebuild_output()

        return (
            f"Downloaded: {ident} ({len(text):,} chars)\n"
            f"Processed: {new_chunks} new chunks\n"
            f"Total corpus: {train_n} train / {val_n} val"
        )
    except Exception as e:
        # UI boundary: surface the failure as text instead of raising.
        return f"Error: {e}"
168
+
169
+
170
+ # ---------------------------------------------------------------------------
171
+ # Tab 3: Corpus Management
172
+ # ---------------------------------------------------------------------------
173
+
174
def get_corpus_stats() -> str:
    """Get current corpus statistics.

    Returns:
        A plain-text report with one row per ``*.txt`` file in the pipeline's
        parsed directory (non-blank line count and character count), a TOTAL
        row and the average chunk length. When both ``train.txt`` and
        ``val.txt`` exist in the output directory, the report also gives the
        split sizes and the set of characters found in ``train.txt``. If
        nothing has been parsed yet, a short hint is returned instead.
    """
    pipeline = get_pipeline()
    parsed_files = sorted(pipeline.parsed.glob("*.txt"))

    if not parsed_files:
        return "No parsed files yet. Add texts to get started."

    # The header uses the same column widths as the rows below
    # (40 + 1 + 8 + 1 + 10 = 60, the width of the rule).
    lines_out = [f"{'File':<40} {'Chunks':>8} {'Chars':>10}", "-" * 60]
    total_chunks = 0
    total_chars = 0

    for pf in parsed_files:
        # One chunk per non-blank line.
        file_lines = [
            line
            for line in pf.read_text(encoding="utf-8").splitlines()
            if line.strip()
        ]
        chars = sum(len(line) for line in file_lines)
        total_chunks += len(file_lines)
        total_chars += chars
        lines_out.append(f"{pf.name:<40} {len(file_lines):>8} {chars:>10}")

    lines_out.append("-" * 60)
    lines_out.append(f"{'TOTAL':<40} {total_chunks:>8} {total_chars:>10}")

    if total_chunks > 0:
        avg = total_chars / total_chunks
        lines_out.append(f"\nAverage chunk length: {avg:.0f} chars")

    # Output split info
    train_path = pipeline.output / "train.txt"
    val_path = pipeline.output / "val.txt"
    if train_path.exists() and val_path.exists():
        # Read each file once; the training text is reused for the
        # vocabulary below.
        train_text = train_path.read_text(encoding="utf-8")
        val_text = val_path.read_text(encoding="utf-8")
        train_n = sum(1 for line in train_text.splitlines() if line.strip())
        val_n = sum(1 for line in val_text.splitlines() if line.strip())
        lines_out.append(f"\nOutput split: {train_n} train / {val_n} val")

        # Vocabulary check
        vocab = sorted(set(train_text) - {"\n"})
        lines_out.append(f"Vocabulary: {len(vocab)} chars -> {''.join(vocab)}")

    return "\n".join(lines_out)
214
+
215
+
216
def get_sample_chunks() -> str:
    """Return up to ten randomly chosen chunks from ``train.txt``.

    Chunks are the stripped, non-blank lines of the training file. The
    result numbers each sample and separates them with ``---`` rules; a
    short message is returned when the file is missing or has no chunks.
    """
    import random

    train_path = get_pipeline().output / "train.txt"
    if not train_path.exists():
        return "No training data yet. Process some texts first."

    chunks = []
    for raw in train_path.read_text(encoding="utf-8").splitlines():
        stripped = raw.strip()
        if stripped:
            chunks.append(stripped)

    if not chunks:
        return "Training file is empty."

    picked = random.sample(chunks, min(10, len(chunks)))
    numbered = [f"[{pos}] {chunk}" for pos, chunk in enumerate(picked, start=1)]
    return "\n\n---\n\n".join(numbered)
232
+
233
+
234
def rebuild_dataset() -> str:
    """Regenerate the train/val output and report the resulting sizes.

    Delegates to the pipeline's ``rebuild_output`` and formats the two
    counts it returns.
    """
    counts = get_pipeline().rebuild_output()
    train_n, val_n = counts
    return f"Rebuilt: {train_n} train / {val_n} val chunks"
239
+
240
+
241
def push_to_hf(repo_id: str) -> str:
    """Push the dataset to the given HuggingFace Hub repository.

    Returns a success message containing whatever ``push_to_hub`` returned,
    a prompt when *repo_id* is blank, or ``Error: ...`` when the push raises.
    """
    target = repo_id.strip()
    if target == "":
        return "Please enter a HuggingFace repo ID (e.g. username/philosophy-corpus)."

    pipeline = get_pipeline()

    try:
        message = f"Dataset pushed successfully!\n{pipeline.push_to_hub(repo_id=target)}"
    except Exception as exc:
        message = f"Error: {exc}"
    return message
253
+
254
+
255
+ # ---------------------------------------------------------------------------
256
+ # Gradio UI
257
+ # ---------------------------------------------------------------------------
258
+
259
def build_ui():
    """Assemble the three-tab Gradio Blocks interface and return it.

    Tabs: "Add Texts" (file upload and URL fetch), "Search Internet Archive"
    (search table plus download-by-identifier) and "Corpus" (statistics,
    samples, rebuild and HuggingFace push). Each button is wired to the
    handler function of the same purpose defined in this module.

    NOTE(review): the extent of each ``gr.Row()`` block below is
    reconstructed -- confirm which components belong inside each row.
    """
    import gradio as gr

    with gr.Blocks(title="Philosophy Corpus Pipeline", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Philosophy Corpus Pipeline\nBuild training data for JuliaGPT")

        # ---- Tab 1: uploads and URL fetching --------------------------------
        with gr.Tab("Add Texts"):
            gr.Markdown("### Upload Files")
            uploads = gr.File(
                label="Drag and drop .txt, .epub, or .zip files",
                file_count="multiple",
                file_types=[".txt", ".epub", ".zip"],
            )
            process_button = gr.Button("Process Uploaded Files", variant="primary")
            process_result = gr.Textbox(label="Result", lines=6)
            process_button.click(
                fn=process_uploaded_files,
                inputs=[uploads],
                outputs=[process_result],
            )

            gr.Markdown("### Fetch from URL")
            url_box = gr.Textbox(
                label="Text URL (Gutenberg, MIT Classics, Internet Archive, or any .txt URL)",
                placeholder="https://www.gutenberg.org/cache/epub/21076/pg21076.txt",
            )
            fetch_button = gr.Button("Fetch and Process")
            fetch_result = gr.Textbox(label="Result", lines=4)
            fetch_button.click(fn=fetch_url, inputs=[url_box], outputs=[fetch_result])

        # ---- Tab 2: Internet Archive search ---------------------------------
        with gr.Tab("Search Internet Archive"):
            gr.Markdown("### Search the Internet Archive for classical texts")
            # NOTE(review): assumes only the query box and the subject filter
            # share this row -- confirm.
            with gr.Row():
                query_box = gr.Textbox(label="Search Query", placeholder="aristotle philosophy")
                subject_choice = gr.Dropdown(
                    choices=["All", "Philosophy", "Mathematics", "Rhetoric",
                             "Logic", "Ethics", "Metaphysics", "Politics", "Classical"],
                    value="Philosophy",
                    label="Subject Filter",
                )
            search_button = gr.Button("Search", variant="primary")
            results_table = gr.Dataframe(
                headers=["Identifier", "Title", "Author", "Date", "Downloads"],
                label="Search Results",
                interactive=False,
            )
            search_button.click(
                fn=search_archive,
                inputs=[query_box, subject_choice],
                outputs=[results_table],
            )

            gr.Markdown("### Add a text to the corpus")
            identifier_box = gr.Textbox(
                label="Internet Archive Identifier",
                placeholder="Paste an identifier from the search results above",
            )
            download_button = gr.Button("Download and Process")
            download_result = gr.Textbox(label="Result", lines=4)
            download_button.click(
                fn=add_ia_text,
                inputs=[identifier_box],
                outputs=[download_result],
            )

        # ---- Tab 3: corpus inspection and actions ---------------------------
        with gr.Tab("Corpus"):
            gr.Markdown("### Corpus Statistics")
            # The callable is passed as the initial value of the textbox.
            stats_box = gr.Textbox(label="Statistics", lines=15, value=get_corpus_stats)
            stats_button = gr.Button("Refresh Stats")
            stats_button.click(fn=get_corpus_stats, outputs=[stats_box])

            gr.Markdown("### Sample Chunks")
            samples_box = gr.Textbox(label="Random samples from training data", lines=15)
            samples_button = gr.Button("Show Samples")
            samples_button.click(fn=get_sample_chunks, outputs=[samples_box])

            gr.Markdown("### Actions")
            # NOTE(review): assumes the rebuild button and its result box
            # share this row -- confirm.
            with gr.Row():
                rebuild_button = gr.Button("Rebuild Dataset")
                rebuild_result = gr.Textbox(label="Result", lines=2)
            rebuild_button.click(fn=rebuild_dataset, outputs=[rebuild_result])

            # NOTE(review): assumes the repo box and push button share this
            # row, with the result box below it -- confirm.
            with gr.Row():
                repo_box = gr.Textbox(
                    label="HuggingFace Repo ID",
                    placeholder="username/philosophy-corpus",
                )
                push_button = gr.Button("Push to HuggingFace", variant="primary")
            push_result = gr.Textbox(label="Result", lines=2)
            push_button.click(fn=push_to_hf, inputs=[repo_box], outputs=[push_result])

    return demo
343
+
344
+
345
+ # ---------------------------------------------------------------------------
346
+ # Entry point
347
+ # ---------------------------------------------------------------------------
348
+
349
def main():
    """Parse the command line and launch the Gradio app.

    Options: ``--share`` creates a public Gradio link; ``--port`` selects
    the server port (default 7860).
    """
    cli = argparse.ArgumentParser(description="Philosophy Corpus Pipeline UI")
    cli.add_argument("--share", action="store_true", help="Create a public Gradio link")
    cli.add_argument("--port", type=int, default=7860, help="Port to run on")
    options = cli.parse_args()

    build_ui().launch(share=options.share, server_port=options.port)


if __name__ == "__main__":
    main()