Spaces: Running on Zero
Bellok committed
Commit · ec38897
1 Parent(s): a2c1773
refactor(app): improve code formatting and add background ingestion status display
- Remove unused 'hashlib' import
- Update typing import by removing unused 'List'
- Standardize string quotes to double quotes for consistency
- Reformat long print statements into multi-line for readability
- Simplify thread creation arguments on single line
- Adjust long string concatenations and metadata formatting
- Add memory garbage collection after large ingestion batches
- Include background pack ingestion details in system stats output for better monitoring
This commit enhances code maintainability through consistent formatting and adds informative status reporting for ingestion processes.
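For readers skimming the commit, the ingestion-status reporting described above comes down to a module-level status dict that a daemon worker thread updates and the stats view reads, with a periodic `gc.collect()` to cap memory during large batches. A minimal sketch of that pattern, not the exact app.py implementation (the `ingest_one` helper is hypothetical; the field names mirror the `ingestion_status` dict in the app.py diff below):

```python
import gc
import threading
import time

# Module-level status shared between the worker thread and the stats view.
ingestion_status = {
    "running": False, "processed": 0, "total_docs": 0,
    "rate": 0.0, "eta": 0.0, "failed": 0,
}

def background_ingest(docs):
    """Worker: ingest docs, publish progress, and free memory periodically."""
    ingestion_status.update(running=True, total_docs=len(docs), processed=0, failed=0)
    start = time.time()
    for i, doc in enumerate(docs, 1):
        try:
            ingest_one(doc)  # hypothetical per-document ingestion call
        except Exception:
            ingestion_status["failed"] += 1
        rate = i / max(time.time() - start, 1e-6)
        ingestion_status.update(processed=i, rate=rate, eta=(len(docs) - i) / rate)
        if i % 10_000 == 0:
            gc.collect()  # force garbage collection after large batches
    ingestion_status["running"] = False

def start_background_ingest(docs):
    # Daemon thread so ingestion never blocks app shutdown.
    threading.Thread(target=background_ingest, args=(docs,), daemon=True).start()
```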
- .gitignore +1 -0
- app.py +45 -14
- compress_packs.py +17 -19
- final_fix.py +2 -4
- fix_theme.py +3 -3
- package-lock.json +15 -1
- package.json +2 -1
- packs/warbler-pack-npc-dialog/src/index.ts +27 -3
- packs/warbler-pack-npc-dialog/warbler-pack-core.jsonl +0 -2
- packs/warbler-pack-wisdom-scrolls/README.md +22 -4
- packs/warbler-pack-wisdom-scrolls/README_HF_DATASET.md +7 -1
- test_app.py +3 -1
- test_compressed_pack.py +3 -1
- test_embedding_integration.py +2 -2
- test_fixes.py +2 -2
- test_pack_loading.py +4 -1
- tests/test_new_mit_datasets.py +2 -4
- tests/test_pdf_ingestion.py +1 -4
- tests/test_rag_e2e.py +12 -22
- tests/test_retrieval_api.py +1 -1
- tsconfig.base.json +14 -0
- validate_new_transformers.py +2 -2
- verify_pack_ingestion.py +0 -1
- warbler_cda/api/cli.py +4 -6
- warbler_cda/api/service.py +9 -13
- warbler_cda/castle_graph.py +1 -1
- warbler_cda/conflict_detector.py +5 -3
- warbler_cda/embeddings/openai_provider.py +0 -1
- warbler_cda/embeddings/sentence_transformer_provider.py +3 -5
- warbler_cda/evaporation.py +4 -3
- warbler_cda/pack_loader.py +1 -3
- warbler_cda/pack_sync.py +2 -2
- warbler_cda/retrieval_api.py +11 -6
- warbler_cda/semantic_anchors.py +2 -4
- warbler_cda/stat7_entity.py +6 -11
- warbler_cda/stat7_experiments.py +9 -17
- warbler_cda/stat7_rag_bridge.py +5 -13
- warbler_cda/stat7_visualization.py +3 -7
- warbler_cda/summarization_ladder.py +2 -2
- warbler_cda/utils/hf_warbler_ingest.py +3 -4
- warbler_cda/utils/load_warbler_packs.py +1 -3
- warbler_cda/utils/transformers/base.py +49 -65
- warbler_cda/utils/transformers/edustories.py +14 -18
- warbler_cda/utils/transformers/enterprise.py +10 -14
- warbler_cda/utils/transformers/multi_character.py +37 -50
- warbler_cda/utils/transformers/novels.py +21 -29
- warbler_cda/utils/transformers/npc_dialogue.py +16 -8
- warbler_cda/utils/transformers/portuguese_education.py +21 -22
.gitignore
CHANGED
@@ -661,3 +661,4 @@ node_modules/wrappy/LICENSE
 node_modules/wrappy/package.json
 node_modules/wrappy/README.md
 node_modules/wrappy/wrappy.js
+TODO.md
app.py
CHANGED
@@ -8,13 +8,12 @@ import time
 import os
 import threading
 import gradio as gr
-import hashlib
 import spaces
 from pathlib import Path
-from typing import
+from typing import Tuple, Optional, Dict
 
 # Set TOKENIZERS_PARALLELISM to avoid warnings with SentenceTransformers
-os.environ[
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 
 # Global variables for background ingestion tracking
@@ -76,20 +75,25 @@ def background_ingest_packs(api, pack_docs, pack_manager):
     ingestion_status["rate"] = rate
     ingestion_status["eta"] = eta
 
-    print(
+    print(
+        f"[PROGRESS] {processed}/{total_docs} documents ingested "
+        f"({processed/total_docs*100:.1f}%) - "
+        f"{rate:.1f} docs/sec - ETA: {eta/60:.1f} min"
+    )
 
     # Force garbage collection after large batches to free memory
     if processed % 10000 == 0:
         import gc
+
         gc.collect()
 
     packs_loaded = processed
     pack_manager.mark_packs_ingested(1, packs_loaded)
     total_time = time.time() - start_time
-    print(
+    print(
+        f"[OK] Loaded {packs_loaded} documents from Warbler packs "
+        f"({failed} failed) in {total_time:.1f} seconds"
+    )
 
     # Mark ingestion complete
     ingestion_status["running"] = False
@@ -259,9 +263,7 @@ if WARBLER_AVAILABLE:
     if pack_docs and pack_manager.should_ingest_packs(api, len(pack_docs)):
         # Start background ingestion
         ingestion_thread = threading.Thread(
-            target=background_ingest_packs,
-            args=(api, pack_docs, pack_manager),
-            daemon=True
+            target=background_ingest_packs, args=(api, pack_docs, pack_manager), daemon=True
        )
        ingestion_thread.start()
        packs_loaded = 0  # Will be updated asynchronously
@@ -338,7 +340,7 @@ def query_warbler(
     elapsed_ms = (time.time() - start_time) * 1000
 
     # Format results
-    results_text =
+    results_text = "# Query Results\n\n"
     results_text += f"**Query:** {query_text}\n\n"
     results_text += (
         f"**Mode:** {'Hybrid (Semantic + STAT7)' if use_hybrid else 'Semantic Only'}\n\n"
@@ -361,7 +363,7 @@ def query_warbler(
     results_text += f"**Type:** {result.content_type}\n\n"
 
     if result.metadata:
-        results_text +=
+        results_text += "**Metadata:**\n"
         for key, value in result.metadata.items():
             if key != "stat7":  # Skip complex STAT7 object
                 results_text += f"- {key}: {value}\n"
@@ -428,7 +430,7 @@ def get_system_stats() -> str:
     try:
         metrics = api.get_retrieval_metrics()
 
-        stats =
+        stats = "# System Statistics\n\n"
         stats += f"**Total Documents:** {metrics['context_store_size']}\n\n"
         stats += f"**Total Queries:** {metrics['retrieval_metrics']['total_queries']}\n\n"
         stats += f"**Cache Hit Rate:** {metrics['cache_performance']['hit_rate']:.1%}\n\n"
@@ -440,6 +442,35 @@ def get_system_stats() -> str:
         for quality, count in metrics["retrieval_metrics"]["quality_distribution"].items():
             stats += f"- {quality.capitalize()}: {count}\n"
 
+        # Add ingestion status information
+        global ingestion_status
+        stats += "\n## Background Pack Ingestion\n\n"
+
+        if ingestion_status["running"]:
+            # Currently ingesting
+            progress_percent = (ingestion_status["processed"] / ingestion_status["total_docs"] * 100) if ingestion_status["total_docs"] > 0 else 0
+            eta_minutes = ingestion_status["eta"] / 60 if ingestion_status["eta"] > 0 else 0
+
+            stats += "**Status:** 🟢 **ACTIVE** - Ingesting documents...\n\n"
+            stats += "```\n"
+            stats += f"Progress: {ingestion_status['processed']}/{ingestion_status['total_docs']} documents\n"
+            stats += f"Complete: {progress_percent:.1f}%\n"
+            stats += f"Rate: {ingestion_status['rate']:.1f} docs/sec\n"
+            stats += f"ETA: {eta_minutes:.1f} minutes\n"
+            if ingestion_status['failed'] > 0:
+                stats += f"Failed: {ingestion_status['failed']} documents\n"
+            stats += "```\n\n"
+        elif ingestion_status["total_docs"] > 0:
+            # Completed ingestion (has totals but not running)
+            stats += "**Status:** ✅ **COMPLETE**\n\n"
+            stats += f"**Last Ingestion:** Processed {ingestion_status['processed']} documents"
+            if ingestion_status['failed'] > 0:
+                stats += f" ({ingestion_status['failed']} failed)"
+            stats += "\n\n"
+        else:
+            # No background ingestion detected
+            stats += "**Status:** ⚪ **IDLE** - No background ingestion active\n\n"
+
         return stats
 
     except Exception as e:
compress_packs.py
CHANGED
@@ -7,7 +7,6 @@ compressed proto-thoughts generated by the evaporation engine.
 """
 
 import json
-import os
 import sys
 from pathlib import Path
 from typing import Dict, Any, List
@@ -22,7 +21,7 @@ from warbler_cda.evaporation import EvaporationEngine, CloudStore
 def load_jsonl_file(filepath: str) -> List[Dict[str, Any]]:
     """Load a JSONL file and return list of documents."""
     documents = []
-    with open(filepath,
+    with open(filepath, "r", encoding="utf-8") as f:
         for line in f:
             line = line.strip()
             if line:
@@ -32,9 +31,9 @@ def load_jsonl_file(filepath: str) -> List[Dict[str, Any]]:
 
 def save_jsonl_file(filepath: str, documents: List[Dict[str, Any]]) -> None:
     """Save list of documents to a JSONL file."""
-    with open(filepath,
+    with open(filepath, "w", encoding="utf-8") as f:
         for doc in documents:
-            f.write(json.dumps(doc, ensure_ascii=False) +
+            f.write(json.dumps(doc, ensure_ascii=False) + "\n")
 
 
 def compress_pack(pack_path: str, output_suffix: str = "_compressed") -> None:
@@ -70,39 +69,38 @@ def compress_pack(pack_path: str, output_suffix: str = "_compressed") -> None:
     compressed_documents = []
 
     for doc in documents:
-        if
-            print(
+        if "content" not in doc:
+            print("Warning: Document missing 'content' field, skipping")
             continue
 
-        content = doc[
+        content = doc["content"]
         if not content or not isinstance(content, str):
-            print(
+            print("Warning: Empty or invalid content, skipping")
            continue
 
        try:
            # Create a fragment from the document content
-            fragment = {
-                'id': doc.get('content_id', f'doc_{compressed_docs}'),
-                'text': content
-            }
+            fragment = {"id": doc.get("content_id", f"doc_{compressed_docs}"), "text": content}
 
            # Create glyph from the single fragment
+            melt_layer.retire_cluster({"fragments": [fragment]})
 
            # Evaporate to get proto-thought
            mist_lines = evaporation_engine.evaporate(limit=1)
 
            if mist_lines:
-                proto_thought = mist_lines[0][
+                proto_thought = mist_lines[0]["proto_thought"]
                # Replace content with compressed proto-thought
                compressed_doc = doc.copy()
-                compressed_doc[
-                compressed_doc[
-                compressed_doc[
+                compressed_doc["content"] = proto_thought
+                compressed_doc["original_content_length"] = len(content)
+                compressed_doc["compressed_content_length"] = len(proto_thought)
                compressed_documents.append(compressed_doc)
                compressed_docs += 1
            else:
-                print(
+                print(
+                    f"Warning: Failed to evaporate glyph for document {doc.get('content_id', 'unknown')}"
+                )
                # Keep original document if evaporation fails
                compressed_documents.append(doc)
@@ -116,7 +114,7 @@ def compress_pack(pack_path: str, output_suffix: str = "_compressed") -> None:
     save_jsonl_file(str(output_file), compressed_documents)
     print(f"Saved compressed file: {output_file}")
 
-    print(
+    print("Compression complete:")
     print(f"  Total documents processed: {total_docs}")
     print(f"  Documents compressed: {compressed_docs}")
     if total_docs > 0:
final_fix.py
CHANGED
@@ -2,27 +2,25 @@
 """Final fixes for stat7_entity.py and verify the fixes work"""
 
 # Fix the stat7_entity.py bug
-with open(
+with open("warbler_cda/stat7_entity.py", "r", encoding="utf-8") as f:
     content = f.read()
 
 # Fix the description reference bug
 content = content.replace('"description": description,', '"description": self.description,')
 
 # Write back the fixed content
-with open(
+with open("warbler_cda/stat7_entity.py", "w", encoding="utf-8") as f:
     f.write(content)
 
 print("Fixed stat7_entity.py description bug")
 
 # Test import to make sure everything works
 try:
-    import warbler_cda.stat7_entity
     print("✅ stat7_entity imports successfully")
 except Exception as e:
     print(f"❌ stat7_entity import failed: {e}")
 
 try:
-    import warbler_cda.stat7_rag_bridge
     print("✅ stat7_rag_bridge imports successfully")
 except Exception as e:
     print(f"❌ stat7_rag_bridge import failed: {e}")
fix_theme.py
CHANGED
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """Fix the theme issue in app.py"""
 
-with open(
+with open("app.py", "r", encoding="utf-8") as f:
     content = f.read()
 
 old_line = 'with gr.Blocks(title="Warbler CDA - RAG System Demo", theme=gr.themes.Soft()) as demo:'
@@ -9,7 +9,7 @@ new_line = 'with gr.Blocks(title="Warbler CDA - RAG System Demo") as demo:'
 
 content = content.replace(old_line, new_line)
 
-with open(
+with open("app.py", "w", encoding="utf-8") as f:
     f.write(content)
 
-print(
+print("Fixed theme issue")
package-lock.json
CHANGED
@@ -9,7 +9,8 @@
       "version": "1.0.0",
       "license": "ISC",
       "dependencies": {
-        "express": "^5.1.0"
+        "express": "^5.1.0",
+        "typescript": "^5.9.3"
       }
     },
     "node_modules/accepts": {
@@ -819,6 +820,19 @@
         "node": ">= 0.6"
       }
     },
+    "node_modules/typescript": {
+      "version": "5.9.3",
+      "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz",
+      "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==",
+      "license": "Apache-2.0",
+      "bin": {
+        "tsc": "bin/tsc",
+        "tsserver": "bin/tsserver"
+      },
+      "engines": {
+        "node": ">=14.17"
+      }
+    },
     "node_modules/unpipe": {
       "version": "1.0.0",
       "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz",
package.json
CHANGED
@@ -13,6 +13,7 @@
   "author": "",
   "license": "ISC",
   "dependencies": {
-    "express": "^5.1.0"
+    "express": "^5.1.0",
+    "typescript": "^5.9.3"
   }
 }
packs/warbler-pack-npc-dialog/src/index.ts
CHANGED
@@ -1,12 +1,36 @@
 /**
  * Warbler NPC Dialog Pack - Essential conversation templates
- *
+ *
  * Re-exports templates for dynamic loading in the Warbler conversation system
  */
 
-import { WarblerTemplate, WarblerPackMetadata } from 'warbler-npc';
 import templatesData from '../pack/templates.json';
 
+// Type definitions for Warbler pack types
+export interface WarblerTemplate {
+  id: string;
+  version: string;
+  title: string;
+  description: string;
+  content: string;
+  requiredSlots: Array<{
+    name: string;
+    type: 'string' | 'number' | 'boolean' | 'object';
+    required: boolean;
+    description?: string;
+  }>;
+  tags: string[];
+  maxLength?: number;
+}
+
+export interface WarblerPackMetadata {
+  name: string;
+  version: string;
+  description: string;
+  author: string;
+  templates: WarblerTemplate[];
+}
+
 // Transform JSON data to proper WarblerTemplate objects
 export const templates: WarblerTemplate[] = templatesData.templates.map(template => ({
   ...template,
@@ -48,4 +72,4 @@ export default {
   tradeInquiryWelcome,
   generalConversation,
   unknownResponse
-};
+};
packs/warbler-pack-npc-dialog/warbler-pack-core.jsonl
DELETED
@@ -1,2 +0,0 @@
-"packInfo"
-"templates"
packs/warbler-pack-wisdom-scrolls/README.md
CHANGED
@@ -1,6 +1,6 @@
 # 🎭 Warbler Pack: Wisdom Scrolls
 
-**Dynamic wisdom generation templates for the Secret Art of the Living Dev**
+## **Dynamic wisdom generation templates for the Secret Art of the Living Dev**
 
 This Warbler content pack provides mystical wisdom generation templates that create fresh quotes in the authentic style of the Sacred Scrolls, breathing new life into the ancient wisdom while maintaining the sacred atmosphere of the Cheekdom.
 
@@ -23,32 +23,44 @@ scripts/lda-quote --warbler
 ## Template Categories
 
 ### 🧙♂️ Development Wisdom (`wisdom_development_insight`)
+
 Generates profound insights about development practices using philosophical structure:
+
 - **Pattern**: `{action} is not {misconception}; it's {deeper_truth}. Like {metaphor}, but for {domain}.`
 - **Example**: *"Refactoring is not admitting failure; it's evolution of understanding. Like pruning a garden, but for algorithms."*
 
-### 📜 Sacred Attribution (`scroll_attribution_template`)
+### 📜 Sacred Attribution (`scroll_attribution_template`)
+
 Creates mystical attribution in the style of ancient texts:
+
 - **Pattern**: `— {author_title}, {source_title}, {volume_designation}`
 - **Example**: *"— The Great Validator, Secret Art of the Living Dev, Vol. III"*
 
 ### 🐛 Debugging Proverbs (`debugging_proverb_template`)
+
 Humorous debugging wisdom using classical proverb structure:
+
 - **Pattern**: `The {problem_type} you can't {action_verb} is like the {creature} under the {location}—{reality_statement}.`
 - **Example**: *"The bug you can't reproduce is like the monster under the bed—real, but only when no one's looking."*
 
 ### 📖 Documentation Philosophy (`documentation_philosophy`)
+
 Profound insights about documentation practices:
+
 - **Pattern**: `Documentation is not {what_its_not}; it's {what_it_really_is}.`
 - **Example**: *"Documentation is not what you write for others; it's what you write for the you of six months from now."*
 
 ### 🏰 Cheekdom Lore (`cheekdom_lore_template`)
+
 Epic lore about the Cheekdom and its sacred mission:
+
 - **Pattern**: `In the {realm} of {domain}, the {guardian_class} stands between {civilization} and {threat_type}.`
 - **Example**: *"In the kingdom of Software Development, the Buttwarden stands between comfortable development and runtime catastrophe."*
 
 ### 🍑 Buttsafe Wisdom (`buttsafe_wisdom`)
+
 Sacred wisdom about ergonomic development practices:
+
 - **Pattern**: `Every developer's {body_part} is {sacred_designation}. {protection_action} with {protection_means}.`
 - **Example**: *"Every developer's posterior is sacred. Protect it with ergonomic wisdom and comfortable seating."*
 
@@ -138,6 +150,7 @@ scripts/weekly-wisdom-oracle.sh stats
 All generated quotes maintain the Sacred Code Standards:
 
 ### ✅ **Buttsafe Certified Requirements**
+
 - Professional workplace appropriateness
 - Dry, witty humor style (never offensive)
 - Development-focused insights
@@ -145,12 +158,14 @@ All generated quotes maintain the Sacred Code Standards:
 - Maximum length: 200 characters per template
 
 ### 🎭 **Authenticity Standards**
+
 - Maintains mystical atmosphere of original quotes
 - Uses consistent Sacred Art terminology
 - Preserves philosophical depth and wisdom
 - Integrates seamlessly with static quote database
 
 ### 📊 **Quality Assurance**
+
 - All templates validated for structure and content
 - Slot combinations tested for coherent output
 - Generated quotes pass content filtering
@@ -160,7 +175,7 @@ All generated quotes maintain the Sacred Code Standards:
 
 The Wisdom Scrolls pack integrates with the Living Dev Agent ecosystem through multiple layers:
 
-```
+```none
 ┌─────────────────────────────────────────────────┐
 │            Weekly Oracle Workflow               │
 │         (GitHub Actions Automation)             │
@@ -185,6 +200,7 @@ The Wisdom Scrolls pack integrates with the Living Dev Agent ecosystem through m
 ## Versioning and Evolution
 
 ### Current Version: 1.0.0
+
 - ✅ Six core template categories
 - ✅ Complete slot value libraries
 - ✅ Integration with Warbler Quote Engine
@@ -192,12 +208,14 @@ The Wisdom Scrolls pack integrates with the Living Dev Agent ecosystem through m
 - ✅ CLI integration
 
 ### Planned Enhancements (v1.1.0)
+
 - 🔄 Additional template categories (CI/CD wisdom, workflow philosophy)
 - 🔄 Context-aware slot selection
 - 🔄 Machine learning-enhanced quote quality
 - 🔄 Cross-reference generation with existing quotes
 
 ### Future Vision (v2.0.0)
+
 - 🌟 Dynamic template creation based on repository context
 - 🌟 Personalized wisdom generation
 - 🌟 Integration with Git commit analysis
@@ -228,7 +246,7 @@ scripts/lda-quote --warbler --stats
 
 ## Sacred Mission
 
-
+-*"The Wisdom Scrolls pack transforms static sacred texts into living oracles, ensuring that fresh insights flow continuously through the channels of development wisdom while preserving the mystical essence of the original teachings."*
 
 — **Pack Philosophy**, Living Oracle Manifesto, Sacred Design Document
 
packs/warbler-pack-wisdom-scrolls/README_HF_DATASET.md
CHANGED
@@ -34,7 +34,7 @@ This dataset contains mystical wisdom generation templates that create fresh quo
 
 ## Dataset Structure
 
-```
+```py
 {
     "template_id": str,
     "category": str,
@@ -49,26 +49,32 @@ This dataset contains mystical wisdom generation templates that create fresh quo
 ## Template Categories
 
 ### 🧙♂️ Development Wisdom
+
 Generates profound insights about development practices using philosophical structure.
 *Example*: "Refactoring is not admitting failure; it's evolution of understanding. Like pruning a garden, but for algorithms."
 
 ### 📜 Sacred Attribution
+
 Creates mystical attribution in the style of ancient texts.
 *Example*: "— The Great Validator, Secret Art of the Living Dev, Vol. III"
 
 ### 🐛 Debugging Proverbs
+
 Humorous debugging wisdom using classical proverb structure.
 *Example*: "The bug you can't reproduce is like the monster under the bed—real, but only when no one's looking."
 
 ### 📖 Documentation Philosophy
+
 Profound insights about documentation practices.
 *Example*: "Documentation is not what you write for others; it's what you write for the you of six months from now."
 
 ### 🏰 Cheekdom Lore
+
 Epic lore about the Cheekdom and its sacred mission.
 *Example*: "In the kingdom of Software Development, the Buttwarden stands between comfortable development and runtime catastrophe."
 
 ### 🍑 Buttsafe Wisdom
+
 Sacred wisdom about ergonomic development practices.
 *Example*: "Every developer's posterior is sacred. Protect it with ergonomic wisdom and comfortable seating."
 
test_app.py
CHANGED
@@ -4,7 +4,8 @@ Test script to debug app.py initialization issues
 """
 
 import os
-
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 try:
     from warbler_cda import (
@@ -54,6 +55,7 @@ if WARBLER_AVAILABLE:
 except Exception as e:
     print(f"[ERROR] Failed to initialize Warbler CDA: {e}")
     import traceback
+
     traceback.print_exc()
     api = None
test_compressed_pack.py
CHANGED
@@ -11,6 +11,7 @@ sys.path.insert(0, str(Path(__file__).parent))
 
 from warbler_cda.pack_loader import PackLoader
 
+
 def test_compressed_pack_loading():
     """Test loading the compressed novels pack"""
     packs_dir = Path("packs")
@@ -47,7 +48,7 @@ def test_compressed_pack_loading():
     print()
 
     # Check that content is compressed (should be short proto-thoughts)
-    avg_content_length = sum(len(doc[
+    avg_content_length = sum(len(doc["content"]) for doc in documents) / len(documents)
     print(f"Average content length: {avg_content_length:.1f} characters")
 
     if avg_content_length > 200:  # Original was ~1100, compressed should be much shorter
@@ -57,6 +58,7 @@ def test_compressed_pack_loading():
     print("✓ Compressed pack loading test passed!")
     return True
 
+
 if __name__ == "__main__":
     success = test_compressed_pack_loading()
     sys.exit(0 if success else 1)
test_embedding_integration.py
CHANGED
@@ -116,10 +116,10 @@ def test_embedding_cache():
 
     text = "Cache test document"
 
-
+    provider.embed_text(text)
     hits_before = provider.cache_stats["hits"]
 
-
+    provider.embed_text(text)
     hits_after = provider.cache_stats["hits"]
 
     if hits_after > hits_before:
test_fixes.py
CHANGED
@@ -17,7 +17,7 @@ def test_load_warbler_packs():
 
     print("Testing WarblerPackLoader...")
     try:
-
+        WarblerPackLoader()
         print("✓ WarblerPackLoader instantiated successfully")
 
         print("✓ JSONL parsing fix applied")
@@ -36,7 +36,7 @@ def test_sentence_transformer():
     print("\nTesting SentenceTransformerEmbeddingProvider...")
     try:
         config = {"model_name": "all-MiniLM-L6-v2", "batch_size": 32}
-
+        SentenceTransformerEmbeddingProvider(config)
         print("✓ Provider initialized with proper type annotations")
         return True
     except Exception as e:
test_pack_loading.py
CHANGED
@@ -4,7 +4,8 @@ Test pack loading to debug app.py issues
 """
 
 import os
-
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 try:
     from warbler_cda import (
@@ -78,11 +79,13 @@ if WARBLER_AVAILABLE:
 except Exception as e:
     print(f"[ERROR] Pack loading failed: {e}")
     import traceback
+
     traceback.print_exc()
 
 except Exception as e:
     print(f"[ERROR] Failed to initialize Warbler CDA: {e}")
     import traceback
+
     traceback.print_exc()
     api = None
tests/test_new_mit_datasets.py
CHANGED
@@ -22,11 +22,9 @@ from warbler_cda.utils.transformers import (
     WarblerPackBuilder,
 )
 import pytest
-import json
 import sys
 from pathlib import Path
-from unittest.mock import
-from typing import Dict, List, Any
+from unittest.mock import patch, MagicMock
 
 sys.path.insert(0, str(Path(__file__).parent.parent))
 
@@ -517,7 +515,7 @@ class TestNewDatasetsIntegrationWithRetrieval:
         """Test that packs can be created from new datasets"""
         builder = WarblerPackBuilder()
 
-
+        [
             {
                 "content_id": f"test_{i}",
                 "content": f"Test content {i}",
tests/test_pdf_ingestion.py
CHANGED
@@ -13,12 +13,9 @@ from warbler_cda.utils.transformers import (
     PromptReportTransformer,
     ManualsTransformer,
 )
-import pytest
-import json
 import sys
 from pathlib import Path
-from unittest.mock import
-from typing import Dict, List, Any
+from unittest.mock import patch, MagicMock
 
 sys.path.insert(0, str(Path(__file__).parent.parent))
 
tests/test_rag_e2e.py
CHANGED
@@ -38,11 +38,9 @@ class TestEndToEndRAG:
         print("RAG SYSTEM METRICS")
         print("=" * 60)
         print(f"Embedding Provider: {self.embedding_provider.provider_id}")
-        print(
-            f"Embedding Dimension: {self.embedding_provider.get_dimension()}")
+        print(f"Embedding Dimension: {self.embedding_provider.get_dimension()}")
         print(f"Documents in Store: {metrics['context_store_size']}")
-        print(
-            f"Total Queries: {metrics['retrieval_metrics']['total_queries']}")
+        print(f"Total Queries: {metrics['retrieval_metrics']['total_queries']}")
         print("=" * 60)
 
     def test_01_embedding_generation(self):
@@ -128,8 +126,7 @@ class TestEndToEndRAG:
 
         print(f"[PASS] Retrieved {len(assembly.results)} relevant documents")
         for i, result in enumerate(assembly.results, 1):
-            print(
-                f" {i}. [{result.relevance_score:.4f}] {result.content[:50]}")
+            print(f" {i}. [{result.relevance_score:.4f}] {result.content[:50]}")
 
     def test_05_max_results_respected(self):
         """Test 05: Verify max_results parameter is respected."""
@@ -149,10 +146,7 @@ class TestEndToEndRAG:
         assembly = self.api.retrieve_context(query)
 
         assert len(assembly.results) <= 3
-        print(
-            "[PASS] Query returned:"
-            f"{len(assembly.results)} results",
-            "(max 3 requested)")
+        print("[PASS] Query returned:" f"{len(assembly.results)} results", "(max 3 requested)")
 
     def test_06_confidence_threshold(self):
         """Test 06: Verify confidence threshold filtering."""
@@ -186,12 +180,8 @@ class TestEndToEndRAG:
         strict_results = self.api.retrieve_context(query_strict)
         loose_results = self.api.retrieve_context(query_loose)
 
-        print(
-
-            f"{len(strict_results.results)} results")
-        print(
-            "[PASS] Loose threshold (0.2):",
-            f"{len(loose_results.results)} results")
+        print("[PASS] Strict threshold (0.8):", f"{len(strict_results.results)} results")
+        print("[PASS] Loose threshold (0.2):", f"{len(loose_results.results)} results")
 
         assert len(strict_results.results) <= len(loose_results.results)
 
@@ -207,8 +197,7 @@ class TestEndToEndRAG:
             provider = SentenceTransformerEmbeddingProvider()
 
             hybrid_api = RetrievalAPI(
-                embedding_provider=provider, config={
-                    "enable_stat7_hybrid": True}
+                embedding_provider=provider, config={"enable_stat7_hybrid": True}
             )
         except ImportError:
             pytest.skip("SentenceTransformer not installed for STAT7 testing")
@@ -242,7 +231,8 @@ class TestEndToEndRAG:
         print(
             "[PASS] Result:",
             f"semantic={result.semantic_similarity:.4f}",
-            f"STAT7={result.stat7_resonance:.4f}"
+            f"STAT7={result.stat7_resonance:.4f}",
+        )
 
     def test_08_temporal_retrieval(self):
         """Test 08: Verify temporal retrieval works."""
@@ -268,8 +258,7 @@ class TestEndToEndRAG:
         assembly = self.api.retrieve_context(query)
 
         assert assembly is not None
-        print(
-            f"[PASS] Temporal query retrieved {len(assembly.results)} results")
+        print(f"[PASS] Temporal query retrieved {len(assembly.results)} results")
 
     def test_09_retrieval_metrics(self):
         """Test 09: Verify retrieval metrics are tracked."""
@@ -294,7 +283,8 @@ class TestEndToEndRAG:
 
         print(
             f"[PASS] Metrics tracked: {
-                metrics['retrieval_metrics']['total_queries']} queries"
+                metrics['retrieval_metrics']['total_queries']} queries"
+        )
 
     def test_10_full_rag_pipeline(self):
         """Test 10: Complete RAG pipeline end-to-end."""
tests/test_retrieval_api.py
CHANGED
@@ -331,7 +331,7 @@ class TestRetrievalMetrics:
             max_results=5,
         )
 
-
+        self.api.get_retrieval_metrics()
 
         self.api.retrieve_context(query)
         self.api.retrieve_context(query)
tsconfig.base.json
ADDED
@@ -0,0 +1,14 @@
+{
+  "compilerOptions": {
+    "target": "ES2020",
+    "module": "commonjs",
+    "esModuleInterop": true,
+    "allowSyntheticDefaultImports": true,
+    "strict": true,
+    "skipLibCheck": true,
+    "forceConsistentCasingInFileNames": true,
+    "declaration": true,
+    "moduleResolution": "node",
+    "resolveJsonModule": true
+  }
+}
validate_new_transformers.py
CHANGED
@@ -110,9 +110,9 @@ def main():
         chunks = ingestor._chunk_text(test_text, chunk_size=100)
         print(f" ✓ Successfully chunked text into {len(chunks)} chunks")
         if all(isinstance(chunk, str) for chunk in chunks):
-            print(
+            print(" ✓ All chunks are strings")
         else:
-            print(
+            print(" ✗ Some chunks are not strings")
             all_good = False
     except Exception as e:
         print(f" ✗ _chunk_text failed: {e}")
verify_pack_ingestion.py
CHANGED
@@ -7,7 +7,6 @@ Run this locally before deploying to HuggingFace.
 """
 
 import sys
-from pathlib import Path
 import logging
 
 # Setup logging
warbler_cda/api/cli.py
CHANGED
@@ -11,10 +11,8 @@ import json
 import requests
 import time
 from typing import List, Dict, Any
-from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime
 import logging
-from pathlib import Path
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -178,7 +176,7 @@ def query(
     # Show narrative analysis
     if result.get("narrative_analysis"):
         narr = result["narrative_analysis"]
-        click.echo(
+        click.echo("\nNarrative Analysis:")
         click.echo(f" Coherence Score: {narr.get('coherence_score', 0):.3f}")
         click.echo(f" Narrative Threads: {narr.get('narrative_threads', 0)}")
         click.echo(f" Analysis: {narr.get('analysis')}")
@@ -249,7 +247,7 @@ def bulk(ctx, num_queries, concurrency, semantic, hybrid, json_output):
 
     result = client.bulk_query(query_data, concurrency=concurrency, include_narrative=True)
 
-
+    time.time() - start_time
 
     if json_output:
         click.echo(json.dumps(result, indent=2))
@@ -270,7 +268,7 @@ def bulk(ctx, num_queries, concurrency, semantic, hybrid, json_output):
     # Narrative analysis for entire batch
     if result.get("batch_narrative_analysis"):
         narr = result["batch_narrative_analysis"]
-        click.echo(
+        click.echo("\nBatch Narrative Analysis:")
         click.echo(f" Coherence Score: {narr.get('coherence_score', 0):.3f}")
         click.echo(
             f" Total Narrative Threads: {
@@ -282,7 +280,7 @@ def bulk(ctx, num_queries, concurrency, semantic, hybrid, json_output):
     click.echo(f" Analysis: {narr.get('analysis')}")
 
     # Per-query summary
-    click.echo(
+    click.echo("\nPer-Query Summary (first 3):")
     for res in result.get("results", [])[:3]:
         click.echo(
             f" {
warbler_cda/api/service.py
CHANGED

@@ -495,8 +495,10 @@ async def startup_event():
 @app.get("/health", response_model=HealthResponse)
 async def health_check(service: RetrievalService = Depends(get_retrieval_service)):
     """Health check endpoint"""
-
-    uptime = (
+    service.init_api()
+    uptime = (
+        datetime.now() - datetime.fromisoformat(service.metrics["start_time"])
+    ).total_seconds()
 
     return HealthResponse(
         status="healthy",
@@ -511,8 +513,7 @@ async def health_check(service: RetrievalService = Depends(get_retrieval_service)):
 
 @app.post("/query", response_model=QueryResult)
 async def single_query(
-    request: QueryRequest,
-    service: RetrievalService = Depends(get_retrieval_service)
+    request: QueryRequest, service: RetrievalService = Depends(get_retrieval_service)
 ):
     """Execute a single retrieval query"""
     api = service.init_api()
@@ -579,10 +580,7 @@ async def single_query(
 
     # Bob the Skeptic: Verify suspiciously perfect results
     bob_status, bob_verification_log = await _bob_skeptic_filter(
-        narrative_analysis=narrative_analysis,
-        results_data=results_data,
-        query=query,
-        api=api
+        narrative_analysis=narrative_analysis, results_data=results_data, query=query, api=api
    )
 
    return QueryResult(
@@ -611,11 +609,10 @@ async def single_query(
 
 @app.post("/bulk_query")
 async def bulk_concurrent_queries(
-    request: BulkQueryRequest,
-    service: RetrievalService = Depends(get_retrieval_service)
+    request: BulkQueryRequest, service: RetrievalService = Depends(get_retrieval_service)
 ):
     """Execute multiple queries concurrently"""
-
+    service.init_api()
     logger.info(
         f"Executing {len(request.queries)} queries with concurrency level {request.concurrency_level}"
     )
@@ -669,8 +666,7 @@ async def bulk_concurrent_queries(
 
 @app.post("/ingest")
 async def ingest_documents(
-    request: Dict[str, Any],
-    service: RetrievalService = Depends(get_retrieval_service)
+    request: Dict[str, Any], service: RetrievalService = Depends(get_retrieval_service)
 ):
     """Ingest documents into the RetrievalAPI"""
     api = service.init_api()
warbler_cda/castle_graph.py
CHANGED

@@ -1,5 +1,5 @@
 from __future__ import annotations
-from typing import List, Dict, Any,
+from typing import List, Dict, Any, Optional, Set
 import time
 import re
 import math
warbler_cda/conflict_detector.py
CHANGED

@@ -5,10 +5,10 @@ Detects conflicting or contradictory statements using semantic similarity and
 logical opposition analysis for the Cognitive Geo-Thermal Lore Engine v0.3.
 """
 
-from typing import List, Dict, Any, Optional,
+from typing import List, Dict, Any, Optional, Set
 import time
 import hashlib
-from dataclasses import dataclass
+from dataclasses import dataclass
 from enum import Enum
 
 
@@ -580,7 +580,9 @@ class ConflictDetector:
 
     def _generate_conflict_id(self, conflict: ConflictEvidence) -> str:
         """Generate unique ID for a conflict."""
-        content =
+        content = (
+            f"{conflict.statement_a_id}_{conflict.statement_b_id}_{conflict.conflict_type.value}"
+        )
         return hashlib.md5(content.encode()).hexdigest()[:12]
 
     def _generate_conflict_recommendation(
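Note: the `_generate_conflict_id` hunk above hashes the two statement IDs plus the conflict type into a short deterministic identifier. A minimal self-contained sketch of that pattern (the `ConflictEvidence` shape and enum value here are simplified stand-ins for illustration, not the project's full classes):

    import hashlib
    from dataclasses import dataclass
    from enum import Enum


    class ConflictType(Enum):
        CONTRADICTION = "contradiction"  # hypothetical value for this sketch


    @dataclass
    class ConflictEvidence:
        statement_a_id: str
        statement_b_id: str
        conflict_type: ConflictType


    def generate_conflict_id(conflict: ConflictEvidence) -> str:
        # Same pair and type always hash to the same 12-character ID,
        # so repeated detections collapse to one conflict record.
        content = f"{conflict.statement_a_id}_{conflict.statement_b_id}_{conflict.conflict_type.value}"
        return hashlib.md5(content.encode()).hexdigest()[:12]


    print(generate_conflict_id(ConflictEvidence("stmt_a", "stmt_b", ConflictType.CONTRADICTION)))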
warbler_cda/embeddings/openai_provider.py
CHANGED

@@ -3,7 +3,6 @@ OpenAI Embedding Provider - Cloud-based Semantic Grounding
 """
 
 from typing import List, Dict, Any, Optional
-import time
 from warbler_cda.embeddings.base_provider import EmbeddingProvider
 
 
warbler_cda/embeddings/sentence_transformer_provider.py
CHANGED

@@ -4,9 +4,7 @@ High-quality embeddings using pre-trained transformer models with CUDA support
 """
 
 from typing import List, Dict, Any, Optional, Tuple
-import os
 import json
-import time
 import hashlib
 from pathlib import Path
 from warbler_cda.embeddings.base_provider import EmbeddingProvider
@@ -118,7 +116,7 @@ class SentenceTransformerEmbeddingProvider(EmbeddingProvider):
 
         import numpy as np
 
-
+        np.array(query_embedding)
         embed_vecs = np.array(embeddings)
 
         similarities = []
@@ -205,8 +203,8 @@
         seg2 = emb_array[2 * seg_size : 3 * seg_size]
         seg3 = emb_array[3 * seg_size : 4 * seg_size]
         seg4 = emb_array[4 * seg_size : 5 * seg_size]
-
-
+        emb_array[5 * seg_size : 6 * seg_size]
+        emb_array[6 * seg_size :]
 
         lineage = float(np.mean(seg0**2))
 
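Note: the last hunk above slices the embedding array into equal `seg_size` windows (seg0 through seg4, plus two unused tails) and reduces a segment to a mean-square value such as `lineage`. A rough standalone sketch of that segmentation, assuming seven segments over an arbitrary-length vector (function and variable names here are illustrative, not the provider's API):

    import numpy as np


    def segment_energies(embedding, n_segments: int = 7):
        # Split the vector into equal-width segments and reduce each to a
        # mean-square "energy", mirroring the seg0..seg4 slices above.
        emb_array = np.asarray(embedding, dtype=float)
        seg_size = len(emb_array) // n_segments
        energies = []
        for k in range(n_segments):
            start = k * seg_size
            end = (k + 1) * seg_size if k < n_segments - 1 else len(emb_array)
            energies.append(float(np.mean(emb_array[start:end] ** 2)))
        return energies


    print(segment_energies(np.random.rand(384)))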
warbler_cda/evaporation.py
CHANGED

@@ -1,9 +1,8 @@
 from __future__ import annotations
-from typing import List, Dict, Any, Optional
+from typing import List, Dict, Any, Optional
 import time
 import random
 import re
-from collections import Counter
 
 
 class EvaporationEngine:
@@ -299,7 +298,9 @@ class EvaporationEngine:
         if len(concepts) == 1:
             return f"[Balanced] Reflection on {concepts[0]} reveals deeper meaning."
         else:
-            return
+            return (
+                f"[Balanced] The interplay between {concepts[0]} and {concepts[1]} creates harmony."
+            )
 
     def _apply_affect_coloring(self, proto_thought: str, affect: Dict[str, Any]) -> str:
         """Apply affect-based coloring to proto-thought."""
warbler_cda/pack_loader.py
CHANGED

@@ -149,9 +149,7 @@ class PackLoader:
                 chunk_docs = self._load_jsonl_file(chunk_file, pack_name)
                 documents.extend(chunk_docs)
 
-            logger.info(
-                f"Loaded {len(documents)} total documents from {len(chunk_files)} chunks"
-            )
+            logger.info(f"Loaded {len(documents)} total documents from {len(chunk_files)} chunks")
         else:
             # Load single-file pack (backward compatibility)
             jsonl_file = pack_dir / f"{pack_name}.jsonl"
warbler_cda/pack_sync.py
CHANGED

@@ -141,6 +141,6 @@ class PackSync:
         """Return reingest command if packs are missing"""
         status = self.verify_packs()
         if status["missing"]:
-
-            return
+            ", ".join(status["missing"])
+            return "python -m warbler_cda.utils.hf_warbler_ingest ingest --datasets all"
         return None
warbler_cda/retrieval_api.py
CHANGED

@@ -8,7 +8,7 @@ for the Cognitive Geo-Thermal Lore Engine v0.3.
 from typing import List, Dict, Any, Optional, Tuple, Union
 import time
 import hashlib
-from dataclasses import dataclass
+from dataclasses import dataclass
 from enum import Enum
 
 
@@ -377,8 +377,14 @@ class RetrievalAPI:
         # DEBUG
         import sys
 
-        print(
-
+        print(
+            f"DEBUG: _retrieve_semantic_similarity called with query='{query.semantic_query}'",
+            file=sys.stderr,
+        )
+        print(
+            f"DEBUG: embedding_provider={self.embedding_provider}, semantic_anchors={self.semantic_anchors}",
+            file=sys.stderr,
+        )
         print(f"DEBUG: context_store size={len(self._context_store)}", file=sys.stderr)
 
         # If embedding provider available, use it
@@ -467,7 +473,7 @@ class RetrievalAPI:
         try:
             if self.embedding_provider and hasattr(self.embedding_provider, "semantic_search"):
                 return self._search_context_store_semantic(query)
-        except Exception
+        except Exception:
             pass
 
         return self._search_context_store_keyword(query)
@@ -527,7 +533,7 @@
                     stat7_resonance=stat7_resonance,
                 )
                 results.append(result)
-        except Exception
+        except Exception:
            return self._search_context_store_keyword(query)
 
        return results
@@ -846,7 +852,6 @@ class RetrievalAPI:
        filtered = [r for r in results if r.relevance_score >= query.confidence_threshold]
 
        # Apply temporal decay
-       current_time = query.query_timestamp
        for result in filtered:
            age_hours = result.temporal_distance / 3600
            decay_factor = max(0.1, 1.0 - (age_hours / self.temporal_decay_hours))
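Note: the final hunk above drops an unused `current_time` variable but keeps the temporal decay loop, where relevance fades linearly with age down to a 0.1 floor. A small sketch of that decay curve (the 24-hour window is an assumed default for illustration, not the API's configured value):

    def temporal_decay_factor(temporal_distance_seconds: float, temporal_decay_hours: float = 24.0) -> float:
        # Linear decay from 1.0 toward a 0.1 floor as the result ages.
        age_hours = temporal_distance_seconds / 3600
        return max(0.1, 1.0 - (age_hours / temporal_decay_hours))


    print(temporal_decay_factor(12 * 3600))  # 0.5: a 12-hour-old result keeps half its weight
    print(temporal_decay_factor(72 * 3600))  # 0.1: very old results bottom out at the floor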
warbler_cda/semantic_anchors.py
CHANGED

@@ -2,11 +2,9 @@
 Enhanced Anchor System with Semantic Grounding and Provenance
 """
 
-from typing import List, Dict, Any, Optional
+from typing import List, Dict, Any, Optional
 import time
 import hashlib
-import json
-from dataclasses import dataclass, asdict
 from warbler_cda.embeddings import EmbeddingProvider, EmbeddingProviderFactory
 from warbler_cda.anchor_memory_pool import AnchorMemoryPool, get_global_anchor_pool
 from warbler_cda.anchor_data_classes import SemanticAnchor, AnchorProvenance
@@ -242,7 +240,7 @@ class SemanticAnchorGraph:
         """Apply aging, consolidation, and eviction policies."""
         actions = {"aged": 0, "consolidated": 0, "evicted": 0, "evicted_anchors": []}
 
-
+        time.time()
         anchors_to_evict = []
 
         # Apply aging
warbler_cda/stat7_entity.py
CHANGED

@@ -11,7 +11,7 @@ Features:
 - Entanglement detection and management
 """
 
-from dataclasses import dataclass, field
+from dataclasses import dataclass, field
 from datetime import datetime
 from enum import Enum
 from typing import Dict, List, Optional, Any, Tuple
@@ -286,9 +286,7 @@ class STAT7Entity(ABC):
         self.entangled_entities.append(other_entity_id)
         self.entanglement_strength.append(strength)
         self._record_event(
-            "entanglement_added",
-            f"Entangled with {other_entity_id}",
-            {"strength": strength}
+            "entanglement_added", f"Entangled with {other_entity_id}", {"strength": strength}
        )
 
    def remove_entanglement(self, other_entity_id: str):
@@ -297,10 +295,7 @@
            idx = self.entangled_entities.index(other_entity_id)
            self.entangled_entities.pop(idx)
            self.entanglement_strength.pop(idx)
-            self._record_event(
-                "entanglement_removed",
-                f"Untangled from {other_entity_id}"
-            )
+            self._record_event("entanglement_removed", f"Untangled from {other_entity_id}")
 
    def get_entanglements(self) -> List[Tuple[str, float]]:
        """Get all entangled entities with strength"""
@@ -315,8 +310,8 @@
        self._record_event(
            "entanglement_updated",
            f"{other_entity_id} entanglement strength changed",
-            {"old_strength": old_strength, "new_strength": new_strength}
-
+            {"old_strength": old_strength, "new_strength": new_strength},
+        )
 
    # ========================================================================
    # LUCA Bootstrap
@@ -423,7 +418,7 @@
    def load_from_file(cls, path: Path) -> "STAT7Entity":
        """Load entity from JSON file (must know concrete type)"""
        with open(path, "r") as f:
-
+            json.load(f)
            # Note: In practice, would need factory pattern to instantiate correct
            # subclass
            raise NotImplementedError("Use subclass load methods")
warbler_cda/stat7_experiments.py
CHANGED

@@ -327,7 +327,7 @@ class EXP01_AddressUniqueness:
            Tuple of (results list, overall success boolean)
        """
        print(f"\n{'=' * 70}")
-        print(
+        print("EXP-01: ADDRESS UNIQUENESS TEST")
        print(f"{'=' * 70}")
        print(f"Sample size: {self.sample_size} bit-chains")
        print(f"Iterations: {self.iterations}")
@@ -383,9 +383,7 @@
                print(f" ⚠️ Collision on {addr[:16]}... : {len(ids)} entries")
 
        print()
-        print(
-            f"OVERALL RESULT: {'✅ ALL PASS' if all_success else '❌ SOME FAILED'}"
-        )
+        print(f"OVERALL RESULT: {'✅ ALL PASS' if all_success else '❌ SOME FAILED'}")
        print(f"Success rate: {sum(1 for r in self.results if r.success)}/{self.iterations}")
 
        return self.results, all_success
@@ -452,7 +450,7 @@ class EXP02_RetrievalEfficiency:
            Tuple of (results list, overall success boolean)
        """
        print(f"\n{'=' * 70}")
-        print(
+        print("EXP-02: RETRIEVAL EFFICIENCY TEST")
        print(f"{'=' * 70}")
        print(f"Query count per scale: {self.query_count}")
        print(f"Scales: {self.scales}")
@@ -520,9 +518,7 @@
            print(f" Target: < {threshold}ms")
            print()
 
-        print(
-            f"OVERALL RESULT: {'✅ ALL PASS' if all_success else '❌ SOME FAILED'}"
-        )
+        print(f"OVERALL RESULT: {'✅ ALL PASS' if all_success else '❌ SOME FAILED'}")
 
        return self.results, all_success
 
@@ -589,7 +585,7 @@ class EXP03_DimensionNecessity:
            Tuple of (results list, overall success boolean)
        """
        print(f"\n{'=' * 70}")
-        print(
+        print("EXP-03: DIMENSION NECESSITY TEST")
        print(f"{'=' * 70}")
        print(f"Sample size: {self.sample_size} bit-chains")
        print()
@@ -618,9 +614,7 @@
        self.results.append(result)
 
        status = "✅ PASS" if result.acceptable else "❌ FAIL"
-        print(
-            f" {status} | Collisions: {collisions} | Rate: {baseline_collision_rate * 100:.4f}%"
-        )
+        print(f" {status} | Collisions: {collisions} | Rate: {baseline_collision_rate * 100:.4f}%")
        print()
 
        # Ablation: remove each dimension
@@ -662,13 +656,11 @@
            # when removing dims
            necessity = not acceptable  # Should show collisions
            status = "✅ NECESSARY" if necessity else "⚠️ OPTIONAL"
-            print(
-                f" {status} | Collisions: {collisions} | Rate: {collision_rate * 100:.4f}%"
-            )
+            print(f" {status} | Collisions: {collisions} | Rate: {collision_rate * 100:.4f}%")
 
        print()
        print(
-
+            "OVERALL RESULT: All 7 dimensions are necessary (all show > 0.1% collisions when removed)"
        )
 
        return self.results, all_success
@@ -733,7 +725,7 @@ def run_all_experiments(
 
    # Summary
    print(f"\n{'=' * 70}")
-    print(
+    print("PHASE 1 VALIDATION SUMMARY")
    print(f"{'=' * 70}")
    print(
        f"EXP-01 (Address Uniqueness): {'✅ PASS' if results['EXP-01']['success'] else '❌ FAIL'}"
warbler_cda/stat7_rag_bridge.py
CHANGED

@@ -55,18 +55,10 @@ class STAT7Address:
 
    def __post_init__(self):
        """Validate STAT8 constraints."""
-        assert
-
-
-        assert
-            0.0 <= self.luminosity <= 1.0
-        ), f"luminosity must be [0,1], got {self.luminosity}"
-        assert (
-            0.0 <= self.polarity <= 1.0
-        ), f"polarity must be [0,1], got {self.polarity}"
-        assert (
-            0.0 <= self.entropy <= 1.0
-        ), f"entropy must be [0,1], got {self.entropy}"
+        assert 0.0 <= self.adjacency <= 1.0, f"adjacency must be [0,1], got {self.adjacency}"
+        assert 0.0 <= self.luminosity <= 1.0, f"luminosity must be [0,1], got {self.luminosity}"
+        assert 0.0 <= self.polarity <= 1.0, f"polarity must be [0,1], got {self.polarity}"
+        assert 0.0 <= self.entropy <= 1.0, f"entropy must be [0,1], got {self.entropy}"
        assert self.lineage >= 0, f"lineage must be >= 0, got {self.lineage}"
        assert (
            1 <= self.dimensionality <= 8
@@ -164,7 +156,7 @@ def stat7_resonance(query_stat7: STAT7Address, doc_stat7: STAT7Address) -> float
    luminosity_diff = abs(query_stat7.luminosity - doc_stat7.luminosity)
    polarity_diff = abs(query_stat7.polarity - doc_stat7.polarity)
    entropy_diff = abs(query_stat7.entropy - doc_stat7.entropy)
-    signal_score = 1.0 - (1/3) * (luminosity_diff + polarity_diff + entropy_diff)
+    signal_score = 1.0 - (1 / 3) * (luminosity_diff + polarity_diff + entropy_diff)
    signal_score = max(0.0, signal_score)
 
    # Adjacency/Dimensionality bonus: connectivity + complexity
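Note: the `stat7_resonance` hunk above computes its signal term as one minus the mean absolute difference over luminosity, polarity, and entropy, clamped at zero. A compact sketch of just that term (plain dicts stand in for `STAT7Address` here; the full function also adds an adjacency/dimensionality bonus not shown):

    def signal_score(query: dict, doc: dict) -> float:
        # 1 - mean absolute difference over the three signal dimensions,
        # clamped so very dissimilar pairs never go negative.
        diffs = [abs(query[k] - doc[k]) for k in ("luminosity", "polarity", "entropy")]
        return max(0.0, 1.0 - (1 / 3) * sum(diffs))


    print(signal_score(
        {"luminosity": 0.8, "polarity": 0.5, "entropy": 0.2},
        {"luminosity": 0.6, "polarity": 0.5, "entropy": 0.4},
    ))  # ~0.867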
warbler_cda/stat7_visualization.py
CHANGED

@@ -21,11 +21,7 @@ from typing import Optional, Dict, Any
 from pathlib import Path
 
 from warbler_cda.stat7_experiments import (
-    BitChain,
-    generate_random_bitchain,
     EXP01_AddressUniqueness,
-    EXP02_RetrievalEfficiency,
-    EXP03_DimensionNecessity,
 )
 
 # Import the visualization components
@@ -75,7 +71,7 @@ class STAT7VisualizationManager:
        self.is_running = True
 
        print(f"🚀 STAT7 Visualization Server started on ws://{self.host}:{self.port}")
-        print(
+        print("🌐 Open stat7threejs.html in your browser to view visualization")
 
    def _run_server(self):
        """Run the WebSocket server in asyncio event loop."""
@@ -201,7 +197,7 @@ def create_jupyter_widget(width: str = "100%", height: str = "600px") -> str:
        return create_inline_jupyter_widget(width, height)
 
    with open(html_path, "r", encoding="utf-8") as f:
-
+        f.read()
 
    # Wrap in iframe for Jupyter
    widget_html = f"""
@@ -307,7 +303,7 @@ def display_in_jupyter(width: str = "100%", height: str = "600px"):
        display(HTML(widget_html))
    except ImportError:
        print("IPython not available. Cannot display in Jupyter notebook.")
-        print(
+        print("Open stat7threejs.html in your browser instead.")
 
 
 # Convenience functions for quick start
warbler_cda/summarization_ladder.py
CHANGED

@@ -5,10 +5,10 @@ Implements rolling N-window micro-summaries and pipeline macro distillation
 for the Cognitive Geo-Thermal Lore Engine v0.3.
 """
 
-from typing import List, Dict, Any, Optional
+from typing import List, Dict, Any, Optional
 import time
 import hashlib
-from dataclasses import dataclass
+from dataclasses import dataclass
 from collections import deque
 
 
warbler_cda/utils/hf_warbler_ingest.py
CHANGED

@@ -8,7 +8,6 @@ for NPC intelligence training via the magma layer self-training system.
 
 import logging
 from pathlib import Path
-from typing import Dict, Any, Optional
 
 import click
 
@@ -97,12 +96,12 @@ def ingest(datasets, pack_prefix, arxiv_limit, max_docs_per_chunk, max_pdf_pages
    if max_docs_per_chunk > 0:
        click.echo(f"[PACK] Chunking enabled: {max_docs_per_chunk} documents per chunk")
    else:
-        click.echo(
+        click.echo("[PACK] Chunking disabled: single file per pack")
 
    if max_pdf_pages is not None:
        click.echo(f"[PDF] PDF extraction limit: {max_pdf_pages} pages")
    else:
-        click.echo(
+        click.echo("[PDF] PDF extraction: unlimited pages")
 
    click.echo()
 
@@ -165,7 +164,7 @@
    if results:
        report_file = builder.save_report(results)
 
-        click.echo(
+        click.echo("\n[SUCCESS] Ingestion Complete!")
        click.echo(f"[STATS] Total Documents: {sum(r['documents'] for r in results.values())}")
        click.echo(f"[STATS] Packs Created: {len(results)}")
        click.echo(f"[STATS] Report saved to: {report_file}")
warbler_cda/utils/load_warbler_packs.py
CHANGED

@@ -233,9 +233,7 @@ def discover(api_url):
        for doc in documents:
            click.echo(f" - {doc['content_id']}")
            if "metadata" in doc:
-                click.echo(
-                    f" Realm: {doc['metadata'].get('realm_type','unknown')}"
-                )
+                click.echo(f" Realm: {doc['metadata'].get('realm_type','unknown')}")
 
    click.echo(f"\n[STATS] Total discovered: {total} documents\n")
 
warbler_cda/utils/transformers/base.py
CHANGED

@@ -33,9 +33,8 @@ class BaseWarblerTransformer(ABC):
    """Base class for all dataset transformers"""
 
    def __init__(
-
-
-        max_pdf_pages: Optional[int] = None):
+        self, tokenizer_name: str = "microsoft/DialoGPT-medium", max_pdf_pages: Optional[int] = None
+    ):
        self.max_pdf_pages = max_pdf_pages
 
    @abstractmethod
@@ -47,10 +46,7 @@
        """Check if PDF extraction is available"""
        return PDF_AVAILABLE
 
-    def extract_pdf_text(
-            self,
-            pdf_data: Any,
-            max_pages: Optional[int] = None) -> Optional[str]:
+    def extract_pdf_text(self, pdf_data: Any, max_pages: Optional[int] = None) -> Optional[str]:
        """
        Extract text from PDF data (bytes, file path, PDF object, or file-like object)
 
@@ -62,23 +58,19 @@
            Extracted text or None if extraction fails
        """
        if not PDF_AVAILABLE:
-            logger.debug(
-                "PDF extraction unavailable - pdfplumber not installed")
+            logger.debug("PDF extraction unavailable - pdfplumber not installed")
            return None
 
        try:
            if hasattr(pdf_data, "pages") and hasattr(pdf_data, "metadata"):
-                logger.info(
-                    "PDF data is already a pdfplumber.PDF object, extracting text...")
+                logger.info("PDF data is already a pdfplumber.PDF object, extracting text...")
                text_parts = []
                total_pages = len(pdf_data.pages)
 
                if max_pages is None:
-                    logger.info(
-                        f"PDF has {total_pages} pages, extracting all pages")
+                    logger.info(f"PDF has {total_pages} pages, extracting all pages")
                else:
-                    logger.info(
-                        f"PDF has {total_pages} pages, extracting up to {max_pages} pages")
+                    logger.info(f"PDF has {total_pages} pages, extracting up to {max_pages} pages")
 
                try:
                    for page_num, page in enumerate(pdf_data.pages, 1):
@@ -88,35 +80,35 @@
                            text_parts.append(page_text)
                            logger.debug(
                                f"Extracted {
-                                    len(page_text)} chars from page {page_num}"
+                                    len(page_text)} chars from page {page_num}"
+                            )
                        else:
-                            logger.debug(
-                                f"Page {page_num} has no extractable text")
+                            logger.debug(f"Page {page_num} has no extractable text")
                    except Exception as page_error:
-                        logger.warning(
-                            f"Error extracting page {page_num}: {page_error}")
+                        logger.warning(f"Error extracting page {page_num}: {page_error}")
                        continue
 
                    if max_pages is not None and page_num >= max_pages:
                        logger.info(
-                            f"Truncated PDF extraction at {page_num} pages (max: {max_pages})"
+                            f"Truncated PDF extraction at {page_num} pages (max: {max_pages})"
+                        )
                        break
 
-                extracted_text = "\n\n".join(
-                    text_parts) if text_parts else None
+                extracted_text = "\n\n".join(text_parts) if text_parts else None
                if extracted_text:
                    logger.info(
                        f"Successfully extracted {
                            len(extracted_text)} total characters from {
-                            len(text_parts)} pages"
+                            len(text_parts)} pages"
+                    )
                else:
-                    logger.warning(
-                        "No text could be extracted from PDF object")
+                    logger.warning("No text could be extracted from PDF object")
                return extracted_text
            except Exception as e:
                logger.warning(
                    f"Error extracting from PDF object: {
-                        type(e).__name__}: {e}"
+                        type(e).__name__}: {e}"
+                )
                return None
 
        if isinstance(pdf_data, dict) and "bytes" in pdf_data:
@@ -129,7 +121,8 @@
        if isinstance(pdf_data, bytes):
            logger.info(
                f"PDF data is bytes ({
-                    len(pdf_data)} bytes), creating BytesIO"
+                    len(pdf_data)} bytes), creating BytesIO"
+            )
            pdf_file = io.BytesIO(pdf_data)
        elif isinstance(pdf_data, str) and os.path.exists(pdf_data):
            logger.info(f"PDF data is file path: {pdf_data}")
@@ -138,8 +131,7 @@
            logger.info(f"PDF data is file-like object: {type(pdf_data)}")
            pdf_file = pdf_data
        else:
-            logger.warning(
-                f"Unknown PDF data type: {type(pdf_data)}, cannot extract")
+            logger.warning(f"Unknown PDF data type: {type(pdf_data)}, cannot extract")
            return None
 
        text_parts = []
@@ -147,11 +139,11 @@
            total_pages = len(pdf.pages)
 
            if max_pages is None:
-                logger.info(
-                    f"Opened PDF with {total_pages} pages, extracting all pages")
+                logger.info(f"Opened PDF with {total_pages} pages, extracting all pages")
            else:
                logger.info(
-                    f"Opened PDF with {total_pages} pages, extracting up to {max_pages} pages"
+                    f"Opened PDF with {total_pages} pages, extracting up to {max_pages} pages"
+                )
 
            for page_num, page in enumerate(pdf.pages, 1):
                try:
@@ -160,18 +152,18 @@
                        text_parts.append(page_text)
                        logger.debug(
                            f"Extracted {
-                                len(page_text)} chars from page {page_num}"
+                                len(page_text)} chars from page {page_num}"
+                        )
                    else:
-                        logger.debug(
-                            f"Page {page_num} has no extractable text")
+                        logger.debug(f"Page {page_num} has no extractable text")
                except Exception as page_error:
-                    logger.warning(
-                        f"Error extracting page {page_num}: {page_error}")
+                    logger.warning(f"Error extracting page {page_num}: {page_error}")
                    continue
 
                if max_pages is not None and page_num >= max_pages:
                    logger.info(
-                        f"Truncated PDF extraction at {page_num} pages (max: {max_pages})"
+                        f"Truncated PDF extraction at {page_num} pages (max: {max_pages})"
+                    )
                    break
 
            extracted_text = "\n\n".join(text_parts) if text_parts else None
@@ -179,7 +171,8 @@
                logger.info(
                    f"Successfully extracted {
                        len(extracted_text)} total characters from {
-                        len(text_parts)} pages"
+                        len(text_parts)} pages"
+                )
            else:
                logger.warning("No text could be extracted from PDF")
            return extracted_text
@@ -192,8 +185,7 @@
        """Split text into chunks"""
        if not text:
            return []
-        return [text[i: i + chunk_size]
-                for i in range(0, len(text), chunk_size)]
+        return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]
 
    def extract_dataset_items(self, dataset: Any) -> List[Dict[str, Any]]:
        """
@@ -211,13 +203,7 @@
            pass
 
        try:
-            if hasattr(
-                    dataset,
-                    "keys") and callable(
-                    getattr(
-                        dataset,
-                        "keys",
-                        None)):
+            if hasattr(dataset, "keys") and callable(getattr(dataset, "keys", None)):
                keys = list(dataset.keys())
                if keys:
                    first_split = keys[0]
@@ -242,8 +228,7 @@ class WarblerPackBuilder:
 
    def __init__(self, output_dir: Optional[Path] = None):
        if output_dir is None:
-            output_dir = Path(__file__).resolve(
-            ).parent.parent / "results" / "hf_ingest"
+            output_dir = Path(__file__).resolve().parent.parent / "results" / "hf_ingest"
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True, parents=True)
 
@@ -259,8 +244,7 @@
 
        total_docs = len(docs)
 
-        if max_docs_per_chunk == float(
-                "inf") or total_docs <= max_docs_per_chunk:
+        if max_docs_per_chunk == float("inf") or total_docs <= max_docs_per_chunk:
            pack_file = pack_dir / f"{pack_name}.jsonl"
 
            with open(pack_file, "w", encoding="utf-8") as f:
@@ -279,21 +263,21 @@
            }
 
            logger.info(
-                f"✓ Created Warbler pack: {pack_name} with {total_docs} documents (single file)"
+                f"✓ Created Warbler pack: {pack_name} with {total_docs} documents (single file)"
+            )
        else:
-            chunk_count = (total_docs + max_docs_per_chunk -
-                           1) // max_docs_per_chunk
+            chunk_count = (total_docs + max_docs_per_chunk - 1) // max_docs_per_chunk
 
            logger.info(
-                f"Creating chunked pack: {pack_name} with {total_docs} documents across {chunk_count} chunks"
+                f"Creating chunked pack: {pack_name} with {total_docs} documents across {chunk_count} chunks"
+            )
 
            for chunk_idx in range(chunk_count):
                start_idx = chunk_idx * max_docs_per_chunk
                end_idx = min(start_idx + max_docs_per_chunk, total_docs)
                chunk_docs = docs[start_idx:end_idx]
 
-                chunk_file = pack_dir /
-                    f"{pack_name}-chunk-{chunk_idx + 1:03d}.jsonl"
+                chunk_file = pack_dir / f"{pack_name}-chunk-{chunk_idx + 1:03d}.jsonl"
 
                with open(chunk_file, "w", encoding="utf-8") as f:
                    for doc in chunk_docs:
@@ -303,7 +287,8 @@
                    f" ✓ Wrote chunk {
                        chunk_idx + 1}/{chunk_count}: {
                        len(chunk_docs)} documents to {
-                        chunk_file.name}"
+                        chunk_file.name}"
+                )
 
        metadata = {
            "name": pack_name,
@@ -320,7 +305,8 @@
        }
 
        logger.info(
-            f"✓ Created chunked Warbler pack: {pack_name} with {total_docs} documents across {chunk_count} chunks"
+            f"✓ Created chunked Warbler pack: {pack_name} with {total_docs} documents across {chunk_count} chunks"
+        )
 
        metadata_file = pack_dir / "package.json"
        with open(metadata_file, "w", encoding="utf-8") as f:
@@ -335,16 +321,14 @@
            "timestamp": datetime.now().isoformat(),
            "results": results,
            "total_documents": sum(
-                result.get("documents", 0) if isinstance(
-                    result, dict) else len(result)
+                result.get("documents", 0) if isinstance(result, dict) else len(result)
                for result in results.values()
            ),
            "packs_created": len(results),
        }
 
        report_file = (
-            self.output_dir /
-            f"ingestion_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+            self.output_dir / f"ingestion_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        )
        with open(report_file, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2, ensure_ascii=False)
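Note: the chunked branch of `save_pack` above sizes the pack with ceiling division and writes `[start_idx, end_idx)` slices to numbered chunk files. A small sketch of that planning step (the chunk file-name pattern follows the diff; the helper itself is illustrative, not part of `WarblerPackBuilder`):

    def plan_chunks(pack_name: str, total_docs: int, max_docs_per_chunk: int):
        # Ceiling division: any remainder spills into one final, smaller chunk.
        chunk_count = (total_docs + max_docs_per_chunk - 1) // max_docs_per_chunk
        for chunk_idx in range(chunk_count):
            start_idx = chunk_idx * max_docs_per_chunk
            end_idx = min(start_idx + max_docs_per_chunk, total_docs)
            yield f"{pack_name}-chunk-{chunk_idx + 1:03d}.jsonl", start_idx, end_idx


    print(list(plan_chunks("warbler-pack-demo", 2500, 1000)))
    # [('warbler-pack-demo-chunk-001.jsonl', 0, 1000),
    #  ('warbler-pack-demo-chunk-002.jsonl', 1000, 2000),
    #  ('warbler-pack-demo-chunk-003.jsonl', 2000, 2500)]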
warbler_cda/utils/transformers/edustories.py
CHANGED

@@ -16,8 +16,7 @@ logger = logging.getLogger(__name__)
 class EdustoriesTransformer(BaseWarblerTransformer):
    """Transform MU-NLPC/Edustories-en dataset"""
 
-    def transform(
-            self, dataset_name: str = "MU-NLPC/Edustories-en") -> List[Dict[str, Any]]:
+    def transform(self, dataset_name: str = "MU-NLPC/Edustories-en") -> List[Dict[str, Any]]:
        """
        Transform MU-NLPC/Edustories-en dataset
        Format: Educational case studies with structured teaching situations
@@ -49,8 +48,7 @@
 
        for idx, item in enumerate(items):
            if isinstance(item, str):
-                logger.warning(
-                    f"Edustory {idx + 1}: Item is a string, skipping")
+                logger.warning(f"Edustory {idx + 1}: Item is a string, skipping")
                continue
 
            if isinstance(item, dict) or hasattr(item, "__getitem__"):
@@ -75,7 +73,8 @@
                logger.warning(
                    f"Edustory {
                        idx +
-                        1}: No case study content found, skipping"
+                        1}: No case study content found, skipping"
+                )
                continue
 
            entry_id = safe_get("id", str(idx))
@@ -91,11 +90,9 @@
            problems_annotated = safe_get("problems_annotated", "")
            problems_possible = safe_get("problems_possible_annotated", "")
            solutions_annotated = safe_get("solutions_annotated", "")
-            solutions_possible = safe_get(
-                "solutions_possible_annotated", "")
+            solutions_possible = safe_get("solutions_possible_annotated", "")
            implications_annotated = safe_get("implications_annotated", "")
-            implications_possible = safe_get(
-                "implications_possible_annotated", "")
+            implications_possible = safe_get("implications_possible_annotated", "")
 
            annotator_id = safe_get("annotator_id", "")
 
@@ -131,7 +128,8 @@
 
        logger.info(
            f"✓ Transformed {
-                len(warbler_docs)} educational case study entries"
+                len(warbler_docs)} educational case study entries"
+        )
        return warbler_docs
 
    @staticmethod
@@ -149,8 +147,7 @@
            return default
 
        description = safe_get("description", "[No background provided]")
-        anamnesis = safe_get(
-            "anamnesis", "[No situation description provided]")
+        anamnesis = safe_get("anamnesis", "[No situation description provided]")
        solution = safe_get("solution", "[No intervention described]")
        outcome = safe_get("outcome", "[No outcome reported]")
 
@@ -181,16 +178,15 @@
 
        annotation_parts = []
        if problems_annotated:
-            annotation_parts.append(
-                f"Problems Identified: {problems_annotated}")
+            annotation_parts.append(f"Problems Identified: {problems_annotated}")
        if solutions_annotated:
-            annotation_parts.append(
-                f"Solutions Applied: {solutions_annotated}")
+            annotation_parts.append(f"Solutions Applied: {solutions_annotated}")
        if implications_annotated:
            annotation_parts.append(f"Implications: {implications_annotated}")
 
-        annotations = (
-
+        annotations = (
+            "\n".join(annotation_parts) if annotation_parts else "[No annotations available]"
+        )
 
        content = f"""TEACHING CASE STUDY
 
warbler_cda/utils/transformers/enterprise.py CHANGED

@@ -16,8 +16,7 @@ logger = logging.getLogger(__name__)
 class EnterpriseTransformer(BaseWarblerTransformer):
     """Transform SustcZhangYX/ChatEnv dataset"""

-    def transform(
-            self, dataset_name: str = "SustcZhangYX/ChatEnv") -> List[Dict[str, Any]]:
+    def transform(self, dataset_name: str = "SustcZhangYX/ChatEnv") -> List[Dict[str, Any]]:
         """
         Transform SustcZhangYX/ChatEnv dataset
         Format: Software development chat conversations and collaborative coding scenarios

@@ -36,22 +35,20 @@ class EnterpriseTransformer(BaseWarblerTransformer):
                     items = list(dataset[split_name])
                     logger.info(
                         f"Loaded {
-                            len(items)} items from '{split_name}' split"
+                        len(items)} items from '{split_name}' split"
+                    )
                     break
                 except Exception as split_error:
-                    logger.debug(
-                        f"Could not load split '{split_name}': {split_error}")
+                    logger.debug(f"Could not load split '{split_name}': {split_error}")
                     continue

             if not items:
                 items = self.extract_dataset_items(dataset)
                 if items:
-                    logger.info(
-                        f"Extracted {len(items)} items from dataset")
+                    logger.info(f"Extracted {len(items)} items from dataset")
         except Exception as e:
             logger.warning(f"Failed to load {dataset_name}: {e}")
-            logger.info(
-                f"Skipping {dataset_name} - dataset has loading issues")
+            logger.info(f"Skipping {dataset_name} - dataset has loading issues")
             return []

         if not items:

@@ -99,14 +96,12 @@ class EnterpriseTransformer(BaseWarblerTransformer):
             )

             task = (
-                item.get("task", item.get(
-                    "scenario", "Software development chat"))
+                item.get("task", item.get("scenario", "Software development chat"))
                 if isinstance(item, dict)
                 else "Software development chat"
             )
             scenario = (
-                item.get("scenario", item.get(
-                    "task", f"ChatEnv Scenario #{idx + 1}"))
+                item.get("scenario", item.get("task", f"ChatEnv Scenario #{idx + 1}"))
                 if isinstance(item, dict)
                 else f"ChatEnv Scenario #{idx + 1}"
             )

@@ -138,7 +133,8 @@ class EnterpriseTransformer(BaseWarblerTransformer):

         logger.info(
             f"✓ Transformed {
-                len(warbler_docs)} ChatEnv software development chat entries"
+            len(warbler_docs)} ChatEnv software development chat entries"
+        )
         return warbler_docs

     @staticmethod
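The reflowed transform signature keeps the default dataset name on a single line. A hypothetical invocation of the transformer, assuming EnterpriseTransformer can be constructed without arguments (its constructor is not part of this diff):

# Hypothetical usage sketch; only transform()'s signature appears in the diff,
# so the no-argument constructor here is an assumption.
from warbler_cda.utils.transformers.enterprise import EnterpriseTransformer

transformer = EnterpriseTransformer()
docs = transformer.transform(dataset_name="SustcZhangYX/ChatEnv")
print(f"Transformed {len(docs)} ChatEnv entries into warbler docs")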
warbler_cda/utils/transformers/multi_character.py CHANGED

@@ -35,15 +35,12 @@ class MultiCharacterTransformer(BaseWarblerTransformer):

         try:
             if "train" not in dataset:
-                logger.warning(
-                    f"Multi-char: No 'train' split found in dataset")
+                logger.warning("Multi-char: No 'train' split found in dataset")
                 return []

             train_data = dataset["train"]
-            total_items = len(train_data) if hasattr(
-            logger.info(
-                f"Processing {total_items} multi-character dialogue items...")
+            total_items = len(train_data) if hasattr(train_data, "__len__") else 0
+            logger.info(f"Processing {total_items} multi-character dialogue items...")

             for idx, item in enumerate(train_data):
                 if idx > 0 and idx % 1000 == 0:

@@ -53,8 +50,7 @@ class MultiCharacterTransformer(BaseWarblerTransformer):

                 try:
                     if item is None:
-                        logger.warning(
-                            f"Multi-char {idx + 1}: Item is None, skipping")
+                        logger.warning(f"Multi-char {idx + 1}: Item is None, skipping")
                         continue

                     if not isinstance(item, dict):

@@ -75,12 +71,12 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
                     conversation = [] if conversation is None else [conversation]

                     if not setting and not conversation:
-                        logger.warning(
-                            f"Multi-char {idx + 1}: Missing essential data, skipping")
+                        logger.warning(f"Multi-char {idx + 1}: Missing essential data, skipping")
                         continue

-                    if conversation and not all(
+                    if conversation and not all(
+                        isinstance(msg, (dict, str)) for msg in conversation[:10]
+                    ):
                         logger.warning(
                             f"Multi-char {idx + 1}: Invalid conversation structure, skipping"
                         )

@@ -102,12 +98,10 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
                         "source_dataset": dataset_name,
                         "setting": setting[:150] + "..." if len(setting) > 150 else setting,
                         "character_count": (
-                            len(characters) if isinstance(
-                                characters, list) else 0
+                            len(characters) if isinstance(characters, list) else 0
                         ),
                         "conversation_length": (
-                            len(conversation) if isinstance(
-                                conversation, list) else 0
+                            len(conversation) if isinstance(conversation, list) else 0
                         ),
                         "realm_type": "narrative",
                         "realm_label": "multi_character_dialogue",

@@ -129,8 +123,7 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
                         )
                         continue
                 except (KeyboardInterrupt, SystemExit):
-                    logger.warning(
-                        f"Multi-char: Processing interrupted at item {idx + 1}")
+                    logger.warning(f"Multi-char: Processing interrupted at item {idx + 1}")
                     raise
                 except Exception as e:
                     logger.warning(

@@ -141,10 +134,12 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
         except (MemoryError, RecursionError) as critical_error:
             logger.error(
                 f"Multi-char: Critical error during iteration: {
-                    type(critical_error).__name__}: {critical_error}"
+                type(critical_error).__name__}: {critical_error}"
+            )
             logger.info(
                 f"Returning {
-                    len(warbler_docs)} documents processed before error"
+                len(warbler_docs)} documents processed before error"
+            )
         except (KeyboardInterrupt, SystemExit):
             logger.warning(
                 f"Multi-char: Processing interrupted, returning {len(warbler_docs)} documents"

@@ -153,13 +148,14 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
         except Exception as outer_error:
             logger.error(
                 f"Multi-char: Unexpected error during dataset iteration: {
-                    type(outer_error).__name__}: {outer_error}"
+                type(outer_error).__name__}: {outer_error}"
+            )
             logger.info(
                 f"Returning {
-                    len(warbler_docs)} documents processed before error"
+                len(warbler_docs)} documents processed before error"
+            )

-        logger.info(
-            f"✓ Transformed {len(warbler_docs)} multi-character entries")
+        logger.info(f"✓ Transformed {len(warbler_docs)} multi-character entries")
         return warbler_docs

     @staticmethod

@@ -185,18 +181,14 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
                         message_field = msg.get("message", "")

                         if not isinstance(from_field, str):
-                            from_field = str(
-                                from_field) if from_field is not None else "Unknown"
+                            from_field = str(from_field) if from_field is not None else "Unknown"
                         if not isinstance(message_field, str):
-                            message_field = str(
-                                message_field) if message_field is not None else ""
+                            message_field = str(message_field) if message_field is not None else ""

                         if len(message_field) > 5000:
-                            message_field = message_field[:5000] +
-                                "... [truncated]"
+                            message_field = message_field[:5000] + "... [truncated]"

-                        conversation_lines.append(
-                            f"{from_field}: {message_field}")
+                        conversation_lines.append(f"{from_field}: {message_field}")

                     elif isinstance(msg, str):
                         if len(msg) > 5000:

@@ -204,33 +196,31 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
                             conversation_lines.append(msg)

                         else:
-                            conversation_lines.append(
-                                f"[Message {msg_idx + 1}: {type(msg).__name__}]")
+                            conversation_lines.append(f"[Message {msg_idx + 1}: {type(msg).__name__}]")

                     except (RecursionError, MemoryError) as critical_err:
                         logger.warning(
-                            f"Critical error processing conversation message {msg_idx}: {critical_err}"
+                            f"Critical error processing conversation message {msg_idx}: {critical_err}"
+                        )
                         break
                     except Exception as msg_err:
-                        logger.debug(
-                            f"Error processing conversation message {msg_idx}: {msg_err}")
+                        logger.debug(f"Error processing conversation message {msg_idx}: {msg_err}")
                         continue

                 if len(conversation) > max_conversation_items:
                     conversation_lines.append(
                         f"\n[... {
                             len(conversation) -
-                            max_conversation_items} more messages truncated]"
+                        max_conversation_items} more messages truncated]"
+                    )

                 conversation_text = (
-                    "\n".join(
-                        conversation_lines) if conversation_lines else "[No conversation available]"
+                    "\n".join(conversation_lines) if conversation_lines else "[No conversation available]"
                 )

                 setting = item.get("setting", "[No setting provided]")
                 if not isinstance(setting, str):
-                    setting = str(
-                        setting) if setting is not None else "[No setting provided]"
+                    setting = str(setting) if setting is not None else "[No setting provided]"

                 if len(setting) > 2000:
                     setting = setting[:2000] + "... [truncated]"

@@ -240,8 +230,8 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
                 characters = [] if characters is None else [characters]

                 setting_after = item.get(
-                    "setting after interaction",
+                    "setting after interaction", "[No setting after interaction provided]"
+                )
                 if not isinstance(setting_after, str):
                     setting_after = (
                         str(setting_after)

@@ -257,13 +247,11 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
                     if len(characters) > 100:
                         characters = characters[:100]
                         characters_str = (
-                            json.dumps(characters, indent=2,
-                                       ensure_ascii=False) + "\n[... truncated]"
+                            json.dumps(characters, indent=2, ensure_ascii=False) + "\n[... truncated]"
                         )
                     else:
                         characters_str = (
-                            json.dumps(characters, indent=2,
-                                       ensure_ascii=False) if characters else "[]"
+                            json.dumps(characters, indent=2, ensure_ascii=False) if characters else "[]"
                         )
                 except (TypeError, ValueError, RecursionError) as json_err:
                     logger.debug(f"Error serializing characters to JSON: {json_err}")

@@ -283,8 +271,7 @@ After Interaction: {setting_after}
 This represents a multi-character narrative scenario for NPC interaction training."""

             if len(content) > 50000:
-                content = content[:50000] + \
-                    "\n\n[Content truncated due to size]"
+                content = content[:50000] + "\n\n[Content truncated due to size]"

             return content
         except Exception as final_err:
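The message-formatting loop above normalizes each conversation entry into a "speaker: message" line, coercing non-string fields and truncating very long messages. A standalone sketch of that normalization follows; the sample conversation and the "from" key name are assumptions, while the coercion and truncation mirror the diff:

# Standalone sketch of the per-message normalization shown in the diff.
# The sample conversation and the "from" key are hypothetical.
conversation = [
    {"from": "Guard", "message": "Halt! State your business."},
    {"from": 42, "message": None},  # non-string fields are coerced to str/""
    "A narrator line given as a bare string",
]

conversation_lines = []
for msg_idx, msg in enumerate(conversation):
    if isinstance(msg, dict):
        from_field = msg.get("from", "Unknown")
        message_field = msg.get("message", "")
        if not isinstance(from_field, str):
            from_field = str(from_field) if from_field is not None else "Unknown"
        if not isinstance(message_field, str):
            message_field = str(message_field) if message_field is not None else ""
        if len(message_field) > 5000:
            message_field = message_field[:5000] + "... [truncated]"
        conversation_lines.append(f"{from_field}: {message_field}")
    elif isinstance(msg, str):
        conversation_lines.append(msg)
    else:
        conversation_lines.append(f"[Message {msg_idx + 1}: {type(msg).__name__}]")

conversation_text = "\n".join(conversation_lines) if conversation_lines else "[No conversation available]"
print(conversation_text)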
warbler_cda/utils/transformers/novels.py CHANGED

@@ -16,8 +16,7 @@ logger = logging.getLogger(__name__)
 class NovelsTransformer(BaseWarblerTransformer):
     """Transform GOAT-AI/generated-novels dataset"""

-    def transform(
-            self, dataset_name: str = "GOAT-AI/generated-novels") -> List[Dict[str, Any]]:
+    def transform(self, dataset_name: str = "GOAT-AI/generated-novels") -> List[Dict[str, Any]]:
         """
         Transform GOAT-AI/generated-novels dataset
         Format: Full-length generated novels (PDF-based, treated as narrative metadata)

@@ -61,13 +60,7 @@ class NovelsTransformer(BaseWarblerTransformer):
             except Exception:
                 item_keys = []

-            for field in [
-                    "text",
-                    "story",
-                    "content",
-                    "novel",
-                    "body",
-                    "full_text"]:
+            for field in ["text", "story", "content", "novel", "body", "full_text"]:
                 try:
                     if isinstance(item, dict):
                         if field in item and item[field]:

@@ -84,9 +77,9 @@ class NovelsTransformer(BaseWarblerTransformer):
                 logger.info(
                     f"Novel {
                         idx +
-                        1}: No text field found, attempting PDF extraction..."
+                    1}: No text field found, attempting PDF extraction..."
+                )
+                for pdf_field in ["pdf", "file", "document", "content", "data"]:
                     try:
                         pdf_data = None
                         if isinstance(item, dict):

@@ -101,33 +94,37 @@ class NovelsTransformer(BaseWarblerTransformer):
                                 f"Novel {
                                     idx +
                                     1}: Found PDF data in field '{pdf_field}' (type: {
-                                        type(pdf_data).__name__})"
+                                type(pdf_data).__name__})"
+                            )
+                            text = self.extract_pdf_text(pdf_data, max_pages=self.max_pdf_pages)
                             if text:
                                 logger.info(
                                     f"Novel {
                                         idx +
                                         1}: Successfully extracted {
-                                            len(text)} chars from PDF field '{pdf_field}'"
+                                    len(text)} chars from PDF field '{pdf_field}'"
+                                )
                                 break
                             else:
                                 logger.warning(
                                     f"Novel {
                                         idx +
-                                        1}: PDF field '{pdf_field}' extraction returned no text"
+                                    1}: PDF field '{pdf_field}' extraction returned no text"
+                                )
                     except Exception as e:
                         logger.warning(
                             f"Novel {
                                 idx +
                                 1}: PDF extraction from field '{pdf_field}' failed: {
-                                    type(e).__name__}: {e}"
+                            type(e).__name__}: {e}"
+                        )

                 if not text:
                     logger.warning(
                         f"Novel {
                             idx +
-                            1}: No text content found. Available fields: {item_keys}"
+                        1}: No text content found. Available fields: {item_keys}"
+                    )
                     pdf_status = (
                         "Enabled"
                         if self.has_pdf_support()

@@ -149,11 +146,9 @@ This entry serves as a placeholder for retrieval system testing."""
             title = f"Generated Novel #{idx + 1}"
             try:
                 if isinstance(item, dict):
-                    title = item.get("title", item.get(
-                        "name", f"Generated Novel #{idx + 1}"))
+                    title = item.get("title", item.get("name", f"Generated Novel #{idx + 1}"))
                 elif hasattr(item, "get"):
-                    title = item.get("title", item.get(
-                        "name", f"Generated Novel #{idx + 1}"))
+                    title = item.get("title", item.get("name", f"Generated Novel #{idx + 1}"))
                 elif hasattr(item, "__getitem__"):
                     title = (
                         item.get("title", f"Generated Novel #{idx + 1}")

@@ -193,15 +188,12 @@ This entry serves as a placeholder for retrieval system testing."""
         logger.info(
             f"✓ Transformed {
                 len(warbler_docs)} novel chunks from {
-                len(items)} novels"
+            len(items)} novels"
+        )
         return warbler_docs

     @staticmethod
-    def _create_content(
-            title: str,
-            text_chunk: str,
-            chunk_idx: int,
-            total_chunks: int) -> str:
+    def _create_content(title: str, text_chunk: str, chunk_idx: int, total_chunks: int) -> str:
         """Create content string for novel chunk"""
         return f"""Novel: {title}
 Part: {chunk_idx + 1} of {total_chunks}
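The flattened _create_content signature makes the chunk-labelling helper easier to scan. A small sketch of labelling a chunked novel with it; the stand-in helper keeps only the header lines visible in the diff and appends the chunk text as an assumption, and the chunk size is illustrative:

# Illustrative chunking sketch; _create_content's full body is not shown in the diff,
# so this stand-in reproduces just the visible header lines plus the chunk text.
from typing import List


def _create_content(title: str, text_chunk: str, chunk_idx: int, total_chunks: int) -> str:
    return f"""Novel: {title}
Part: {chunk_idx + 1} of {total_chunks}

{text_chunk}"""


def chunk_novel(title: str, text: str, chunk_size: int = 2000) -> List[str]:
    # Split the novel into fixed-size chunks and label each one.
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    return [_create_content(title, chunk, i, len(chunks)) for i, chunk in enumerate(chunks)]


print(chunk_novel("Example Novel", "word " * 1500)[0][:80])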
warbler_cda/utils/transformers/npc_dialogue.py CHANGED

@@ -25,13 +25,17 @@ class NPCDialogueTransformer(BaseWarblerTransformer):
     {
         "Name": "Elandra the Merchant",
         "Biography": "A seasoned trader who has traveled across kingdoms, known for her sharp wit.",
-        "Queries": ["What do you sell?", "Can you lower the price?", "Any rare items today?"],
+        "Queries": [
+            "What do you sell?",
+            "Can you lower the price?",
+            "Any rare items today?",
+        ],
         "Responses": [
             "I have wares from distant lands, take a look.",
             "Prices are firm, but quality is unmatched.",
-            "Indeed, a relic from the old empire just arrived."
+            "Indeed, a relic from the old empire just arrived.",
         ],
-        "Emotions": ["neutral", "greedy", "excited"]
+        "Emotions": ["neutral", "greedy", "excited"],
     },
     {
         "Name": "Tharos the Guard",

@@ -40,20 +44,24 @@ class NPCDialogueTransformer(BaseWarblerTransformer):
         "Responses": [
             "Only citizens may pass without a writ.",
             "Bandits lurk beyond the hills, stay vigilant.",
-            "I serve the crown, keeping watch at dawn."
+            "I serve the crown, keeping watch at dawn.",
         ],
-        "Emotions": ["serious", "cautious", "stern"]
+        "Emotions": ["serious", "cautious", "stern"],
     },
     {
         "Name": "Lyra the Healer",
         "Biography": "A gentle soul who tends to the wounded, guided by compassion and faith.",
-        "Queries": ["Can you heal me?", "What herbs do you use?", "Why do you help strangers?"],
+        "Queries": [
+            "Can you heal me?",
+            "What herbs do you use?",
+            "Why do you help strangers?",
+        ],
         "Responses": [
             "Rest easy, I will mend your wounds.",
             "Chamomile and sage, nature’s gift to us.",
-            "Because every life is sacred, no matter the path."
+            "Because every life is sacred, no matter the path.",
         ],
-        "Emotions": ["kind", "calm", "hopeful"]
+        "Emotions": ["kind", "calm", "hopeful"],
     },
 ]
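With trailing commas and one query per line, each sample NPC record reads like the document it becomes. A sketch of pairing each query with its response and emotion by position; the zip-based pairing is an illustration only, not necessarily how the transformer consumes these records:

# Illustrative positional pairing of the sample NPC fields shown above;
# the transformer's actual use of this sample data is not shown in the diff.
SAMPLE_NPC = {
    "Name": "Elandra the Merchant",
    "Queries": ["What do you sell?", "Can you lower the price?", "Any rare items today?"],
    "Responses": [
        "I have wares from distant lands, take a look.",
        "Prices are firm, but quality is unmatched.",
        "Indeed, a relic from the old empire just arrived.",
    ],
    "Emotions": ["neutral", "greedy", "excited"],
}

for query, response, emotion in zip(
    SAMPLE_NPC["Queries"], SAMPLE_NPC["Responses"], SAMPLE_NPC["Emotions"]
):
    print(f'{SAMPLE_NPC["Name"]} [{emotion}] Q: {query} -> A: {response}')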
warbler_cda/utils/transformers/portuguese_education.py CHANGED

@@ -35,8 +35,7 @@ class PortugueseEducationTransformer(BaseWarblerTransformer):
         items = []
         if hasattr(dataset, "__getitem__") and "train" in dataset:
             items = list(dataset["train"])
-            logger.info(
-                f"Loaded {len(items)} items from dataset['train']")
+            logger.info(f"Loaded {len(items)} items from dataset['train']")
         else:
             items = self.extract_dataset_items(dataset)
             logger.info(f"Extracted {len(items)} items from dataset")

@@ -48,8 +47,7 @@ class PortugueseEducationTransformer(BaseWarblerTransformer):

         for idx, item in enumerate(items):
             if isinstance(item, str):
-                logger.warning(
-                    f"Portuguese doc {idx + 1}: Item is a string, skipping")
+                logger.warning(f"Portuguese doc {idx + 1}: Item is a string, skipping")
                 continue

             if isinstance(item, dict) or hasattr(item, "__getitem__"):

@@ -63,12 +61,7 @@ class PortugueseEducationTransformer(BaseWarblerTransformer):
                 item_keys = []

             content = None
-            for field in [
-                    "content",
-                    "text",
-                    "body",
-                    "document",
-                    "passage"]:
+            for field in ["content", "text", "body", "document", "passage"]:
                 try:
                     if isinstance(item, dict):
                         if field in item and item[field]:

@@ -93,14 +86,14 @@ class PortugueseEducationTransformer(BaseWarblerTransformer):
                         pdf_data = item[pdf_field]

                         if pdf_data:
-                            if isinstance(
-                                    pdf_data, dict) and "bytes" in pdf_data:
+                            if isinstance(pdf_data, dict) and "bytes" in pdf_data:
                                 pdf_bytes = pdf_data["bytes"]
                                 logger.info(
                                     f"Portuguese doc {
                                         idx +
                                         1}: Found PDF bytes ({
-                                            len(pdf_bytes)} bytes), extracting..."
+                                    len(pdf_bytes)} bytes), extracting..."
+                                )
                                 content = self.extract_pdf_text(
                                     pdf_bytes, max_pages=self.max_pdf_pages
                                 )

@@ -109,7 +102,8 @@ class PortugueseEducationTransformer(BaseWarblerTransformer):
                                     f"Portuguese doc {
                                         idx +
                                         1}: Found PDF bytes ({
-                                            len(pdf_data)} bytes), extracting..."
+                                    len(pdf_data)} bytes), extracting..."
+                                )
                                 content = self.extract_pdf_text(
                                     pdf_data, max_pages=self.max_pdf_pages
                                 )

@@ -118,7 +112,8 @@ class PortugueseEducationTransformer(BaseWarblerTransformer):
                                     f"Portuguese doc {
                                         idx +
                                         1}: Found PDF data (type: {
-                                            type(pdf_data)}), attempting extraction..."
+                                    type(pdf_data)}), attempting extraction..."
+                                )
                                 content = self.extract_pdf_text(
                                     pdf_data, max_pages=self.max_pdf_pages
                                 )

@@ -128,24 +123,28 @@ class PortugueseEducationTransformer(BaseWarblerTransformer):
                                         f"Portuguese doc {
                                             idx +
                                             1}: Successfully extracted {
-                                                len(content)} chars from PDF"
+                                        len(content)} chars from PDF"
+                                    )
                                     break
                                 else:
                                     logger.warning(
                                         f"Portuguese doc {
-                                            idx + 1}: PDF extraction returned no text"
+                                        idx + 1}: PDF extraction returned no text"
+                                    )
                         except Exception as e:
                             logger.warning(
                                 f"Portuguese doc {
                                     idx +
                                     1}: PDF extraction error: {
-                                        type(e).__name__}: {e}"
+                                type(e).__name__}: {e}"
+                            )

             if not content:
                 logger.warning(
                     f"Portuguese doc {
                         idx +
-                        1}: No content found. Available fields: {item_keys}"
+                    1}: No content found. Available fields: {item_keys}"
+                )
                 content = f"""[Conteúdo Indisponível]

Este documento (#{idx + 1}) faz parte do dataset Solshine/Portuguese_Language_Education_Texts.

@@ -186,7 +185,8 @@ Esta entrada serve como placeholder para testes do sistema de recuperação."""
                 except Exception as e:
                     logger.warning(
                         f"Portuguese doc {
-                            idx + 1}: Could not convert item to dict: {e}"
+                        idx + 1}: Could not convert item to dict: {e}"
+                    )
                     item_with_content = {}

                 item_with_content["content"] = content

@@ -222,8 +222,7 @@ Esta entrada serve como placeholder para testes do sistema de recuperação."""
             }
             warbler_docs.append(doc)

-        logger.info(
-            f"✓ Transformed {len(warbler_docs)} Portuguese education entries")
+        logger.info(f"✓ Transformed {len(warbler_docs)} Portuguese education entries")
         return warbler_docs

     @staticmethod
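The collapsed field list makes the content-discovery order explicit: the transformer tries each candidate field in turn and only falls back to PDF extraction (and then to a Portuguese placeholder) when none yields text. A minimal sketch of that fallback order; the sample item and the direct placeholder fallback are assumptions:

# Minimal sketch of the field-fallback order shown in the diff; the sample item is
# hypothetical, and the real transformer attempts PDF extraction before the placeholder.
item = {"document": "Texto de exemplo sobre educação.", "title": "Exemplo"}

content = None
for field in ["content", "text", "body", "document", "passage"]:
    value = item.get(field)
    if isinstance(value, str) and value.strip():
        content = value
        break

if not content:
    content = "[Conteúdo Indisponível]"

print(content)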