Bellok committed on
Commit ec38897 · 1 Parent(s): a2c1773

refactor(app): improve code formatting and add background ingestion status display


- Remove unused 'hashlib' import
- Update typing import by removing unused 'List'
- Standardize string quotes to double quotes for consistency
- Reformat long print statements into multi-line for readability
- Simplify thread creation arguments on single line
- Adjust long string concatenations and metadata formatting
- Add memory garbage collection after large ingestion batches
- Include background pack ingestion details in system stats output for better monitoring

This commit enhances code maintainability through consistent formatting and adds informative status reporting for ingestion processes.
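As a rough, hypothetical sketch of the pattern the bullets above describe (not the committed code itself; `api.ingest_document` and the batch size are assumptions), background ingestion with progress tracking and periodic garbage collection can look like this:

```python
import gc
import threading
import time

ingestion_status = {"running": False, "processed": 0, "total_docs": 0,
                    "failed": 0, "rate": 0.0, "eta": 0.0}

def background_ingest(api, docs):
    """Hypothetical worker: ingest documents one by one and publish progress."""
    ingestion_status.update(running=True, total_docs=len(docs))
    start = time.time()
    for i, doc in enumerate(docs, 1):
        try:
            api.ingest_document(doc)  # assumed method name; the real API may differ
        except Exception:
            ingestion_status["failed"] += 1
        ingestion_status["processed"] = i
        ingestion_status["rate"] = i / max(time.time() - start, 1e-6)
        ingestion_status["eta"] = (len(docs) - i) / max(ingestion_status["rate"], 1e-6)
        if i % 10000 == 0:
            gc.collect()  # free memory after large batches
    ingestion_status["running"] = False

# Launched without blocking startup, mirroring the single-line Thread(...) call in app.py:
# threading.Thread(target=background_ingest, args=(api, pack_docs), daemon=True).start()
```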

Files changed (48)
  1. .gitignore +1 -0
  2. app.py +45 -14
  3. compress_packs.py +17 -19
  4. final_fix.py +2 -4
  5. fix_theme.py +3 -3
  6. package-lock.json +15 -1
  7. package.json +2 -1
  8. packs/warbler-pack-npc-dialog/src/index.ts +27 -3
  9. packs/warbler-pack-npc-dialog/warbler-pack-core.jsonl +0 -2
  10. packs/warbler-pack-wisdom-scrolls/README.md +22 -4
  11. packs/warbler-pack-wisdom-scrolls/README_HF_DATASET.md +7 -1
  12. test_app.py +3 -1
  13. test_compressed_pack.py +3 -1
  14. test_embedding_integration.py +2 -2
  15. test_fixes.py +2 -2
  16. test_pack_loading.py +4 -1
  17. tests/test_new_mit_datasets.py +2 -4
  18. tests/test_pdf_ingestion.py +1 -4
  19. tests/test_rag_e2e.py +12 -22
  20. tests/test_retrieval_api.py +1 -1
  21. tsconfig.base.json +14 -0
  22. validate_new_transformers.py +2 -2
  23. verify_pack_ingestion.py +0 -1
  24. warbler_cda/api/cli.py +4 -6
  25. warbler_cda/api/service.py +9 -13
  26. warbler_cda/castle_graph.py +1 -1
  27. warbler_cda/conflict_detector.py +5 -3
  28. warbler_cda/embeddings/openai_provider.py +0 -1
  29. warbler_cda/embeddings/sentence_transformer_provider.py +3 -5
  30. warbler_cda/evaporation.py +4 -3
  31. warbler_cda/pack_loader.py +1 -3
  32. warbler_cda/pack_sync.py +2 -2
  33. warbler_cda/retrieval_api.py +11 -6
  34. warbler_cda/semantic_anchors.py +2 -4
  35. warbler_cda/stat7_entity.py +6 -11
  36. warbler_cda/stat7_experiments.py +9 -17
  37. warbler_cda/stat7_rag_bridge.py +5 -13
  38. warbler_cda/stat7_visualization.py +3 -7
  39. warbler_cda/summarization_ladder.py +2 -2
  40. warbler_cda/utils/hf_warbler_ingest.py +3 -4
  41. warbler_cda/utils/load_warbler_packs.py +1 -3
  42. warbler_cda/utils/transformers/base.py +49 -65
  43. warbler_cda/utils/transformers/edustories.py +14 -18
  44. warbler_cda/utils/transformers/enterprise.py +10 -14
  45. warbler_cda/utils/transformers/multi_character.py +37 -50
  46. warbler_cda/utils/transformers/novels.py +21 -29
  47. warbler_cda/utils/transformers/npc_dialogue.py +16 -8
  48. warbler_cda/utils/transformers/portuguese_education.py +21 -22
.gitignore CHANGED
@@ -661,3 +661,4 @@ node_modules/wrappy/LICENSE
  node_modules/wrappy/package.json
  node_modules/wrappy/README.md
  node_modules/wrappy/wrappy.js
+ TODO.md
app.py CHANGED
@@ -8,13 +8,12 @@ import time
  import os
  import threading
  import gradio as gr
- import hashlib
  import spaces
  from pathlib import Path
- from typing import List, Tuple, Optional, Dict
+ from typing import Tuple, Optional, Dict

  # Set TOKENIZERS_PARALLELISM to avoid warnings with SentenceTransformers
- os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"


  # Global variables for background ingestion tracking
@@ -76,20 +75,25 @@ def background_ingest_packs(api, pack_docs, pack_manager):
  ingestion_status["rate"] = rate
  ingestion_status["eta"] = eta

- print(f"[PROGRESS] {processed}/{total_docs} documents ingested "
-       f"({processed/total_docs*100:.1f}%) - "
-       f"{rate:.1f} docs/sec - ETA: {eta/60:.1f} min")
+ print(
+     f"[PROGRESS] {processed}/{total_docs} documents ingested "
+     f"({processed/total_docs*100:.1f}%) - "
+     f"{rate:.1f} docs/sec - ETA: {eta/60:.1f} min"
+ )

  # Force garbage collection after large batches to free memory
  if processed % 10000 == 0:
      import gc
+
      gc.collect()

  packs_loaded = processed
  pack_manager.mark_packs_ingested(1, packs_loaded)
  total_time = time.time() - start_time
- print(f"[OK] Loaded {packs_loaded} documents from Warbler packs "
-       f"({failed} failed) in {total_time:.1f} seconds")
+ print(
+     f"[OK] Loaded {packs_loaded} documents from Warbler packs "
+     f"({failed} failed) in {total_time:.1f} seconds"
+ )

  # Mark ingestion complete
  ingestion_status["running"] = False
@@ -259,9 +263,7 @@ if WARBLER_AVAILABLE:
  if pack_docs and pack_manager.should_ingest_packs(api, len(pack_docs)):
      # Start background ingestion
      ingestion_thread = threading.Thread(
-         target=background_ingest_packs,
-         args=(api, pack_docs, pack_manager),
-         daemon=True
+         target=background_ingest_packs, args=(api, pack_docs, pack_manager), daemon=True
      )
      ingestion_thread.start()
      packs_loaded = 0  # Will be updated asynchronously
@@ -338,7 +340,7 @@ def query_warbler(
  elapsed_ms = (time.time() - start_time) * 1000

  # Format results
- results_text = f"# Query Results\n\n"
+ results_text = "# Query Results\n\n"
  results_text += f"**Query:** {query_text}\n\n"
  results_text += (
      f"**Mode:** {'Hybrid (Semantic + STAT7)' if use_hybrid else 'Semantic Only'}\n\n"
@@ -361,7 +363,7 @@
  results_text += f"**Type:** {result.content_type}\n\n"

  if result.metadata:
-     results_text += f"**Metadata:**\n"
+     results_text += "**Metadata:**\n"
      for key, value in result.metadata.items():
          if key != "stat7":  # Skip complex STAT7 object
              results_text += f"- {key}: {value}\n"
@@ -428,7 +430,7 @@ def get_system_stats() -> str:
  try:
      metrics = api.get_retrieval_metrics()

-     stats = f"# System Statistics\n\n"
+     stats = "# System Statistics\n\n"
      stats += f"**Total Documents:** {metrics['context_store_size']}\n\n"
      stats += f"**Total Queries:** {metrics['retrieval_metrics']['total_queries']}\n\n"
      stats += f"**Cache Hit Rate:** {metrics['cache_performance']['hit_rate']:.1%}\n\n"
@@ -440,6 +442,35 @@ def get_system_stats() -> str:
  for quality, count in metrics["retrieval_metrics"]["quality_distribution"].items():
      stats += f"- {quality.capitalize()}: {count}\n"

+ # Add ingestion status information
+ global ingestion_status
+ stats += "\n## Background Pack Ingestion\n\n"
+
+ if ingestion_status["running"]:
+     # Currently ingesting
+     progress_percent = (ingestion_status["processed"] / ingestion_status["total_docs"] * 100) if ingestion_status["total_docs"] > 0 else 0
+     eta_minutes = ingestion_status["eta"] / 60 if ingestion_status["eta"] > 0 else 0
+
+     stats += "**Status:** 🟢 **ACTIVE** - Ingesting documents...\n\n"
+     stats += "```\n"
+     stats += f"Progress: {ingestion_status['processed']}/{ingestion_status['total_docs']} documents\n"
+     stats += f"Complete: {progress_percent:.1f}%\n"
+     stats += f"Rate: {ingestion_status['rate']:.1f} docs/sec\n"
+     stats += f"ETA: {eta_minutes:.1f} minutes\n"
+     if ingestion_status['failed'] > 0:
+         stats += f"Failed: {ingestion_status['failed']} documents\n"
+     stats += "```\n\n"
+ elif ingestion_status["total_docs"] > 0:
+     # Completed ingestion (has totals but not running)
+     stats += "**Status:** ✅ **COMPLETE**\n\n"
+     stats += f"**Last Ingestion:** Processed {ingestion_status['processed']} documents"
+     if ingestion_status['failed'] > 0:
+         stats += f" ({ingestion_status['failed']} failed)"
+     stats += "\n\n"
+ else:
+     # No background ingestion detected
+     stats += "**Status:** ⚪ **IDLE** - No background ingestion active\n\n"
+
  return stats

  except Exception as e:
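For quick reference, a condensed sketch of how the status block added above can be rendered, assuming the same `ingestion_status` keys used in this diff (`running`, `processed`, `total_docs`, `rate`, `eta`, `failed`); the helper name is hypothetical:

```python
def format_ingestion_status(ingestion_status: dict) -> str:
    """Condensed restatement of the stats section added to get_system_stats()."""
    out = "\n## Background Pack Ingestion\n\n"
    if ingestion_status["running"]:
        total = ingestion_status["total_docs"]
        pct = ingestion_status["processed"] / total * 100 if total else 0.0
        out += "**Status:** ACTIVE\n\n"
        out += f"Progress: {ingestion_status['processed']}/{total} ({pct:.1f}%)\n"
        out += f"Rate: {ingestion_status['rate']:.1f} docs/sec, ETA: {ingestion_status['eta'] / 60:.1f} min\n"
        if ingestion_status["failed"]:
            out += f"Failed: {ingestion_status['failed']} documents\n"
    elif ingestion_status["total_docs"] > 0:
        out += f"**Status:** COMPLETE, processed {ingestion_status['processed']} documents\n"
    else:
        out += "**Status:** IDLE, no background ingestion active\n"
    return out
```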
compress_packs.py CHANGED
@@ -7,7 +7,6 @@ compressed proto-thoughts generated by the evaporation engine.
  """

  import json
- import os
  import sys
  from pathlib import Path
  from typing import Dict, Any, List
@@ -22,7 +21,7 @@ from warbler_cda.evaporation import EvaporationEngine, CloudStore
  def load_jsonl_file(filepath: str) -> List[Dict[str, Any]]:
      """Load a JSONL file and return list of documents."""
      documents = []
-     with open(filepath, 'r', encoding='utf-8') as f:
+     with open(filepath, "r", encoding="utf-8") as f:
          for line in f:
              line = line.strip()
              if line:
@@ -32,9 +31,9 @@ def load_jsonl_file(filepath: str) -> List[Dict[str, Any]]:
  def save_jsonl_file(filepath: str, documents: List[Dict[str, Any]]) -> None:
      """Save list of documents to a JSONL file."""
-     with open(filepath, 'w', encoding='utf-8') as f:
+     with open(filepath, "w", encoding="utf-8") as f:
          for doc in documents:
-             f.write(json.dumps(doc, ensure_ascii=False) + '\n')
+             f.write(json.dumps(doc, ensure_ascii=False) + "\n")


  def compress_pack(pack_path: str, output_suffix: str = "_compressed") -> None:
@@ -70,39 +69,38 @@ def compress_pack(pack_path: str, output_suffix: str = "_compressed") -> None:
  compressed_documents = []

  for doc in documents:
-     if 'content' not in doc:
-         print(f"Warning: Document missing 'content' field, skipping")
+     if "content" not in doc:
+         print("Warning: Document missing 'content' field, skipping")
          continue

-     content = doc['content']
+     content = doc["content"]
      if not content or not isinstance(content, str):
-         print(f"Warning: Empty or invalid content, skipping")
+         print("Warning: Empty or invalid content, skipping")
          continue

      try:
          # Create a fragment from the document content
-         fragment = {
-             'id': doc.get('content_id', f'doc_{compressed_docs}'),
-             'text': content
-         }
+         fragment = {"id": doc.get("content_id", f"doc_{compressed_docs}"), "text": content}

          # Create glyph from the single fragment
-         glyph = melt_layer.retire_cluster({'fragments': [fragment]})
+         melt_layer.retire_cluster({"fragments": [fragment]})

          # Evaporate to get proto-thought
          mist_lines = evaporation_engine.evaporate(limit=1)

          if mist_lines:
-             proto_thought = mist_lines[0]['proto_thought']
+             proto_thought = mist_lines[0]["proto_thought"]
              # Replace content with compressed proto-thought
              compressed_doc = doc.copy()
-             compressed_doc['content'] = proto_thought
-             compressed_doc['original_content_length'] = len(content)
-             compressed_doc['compressed_content_length'] = len(proto_thought)
+             compressed_doc["content"] = proto_thought
+             compressed_doc["original_content_length"] = len(content)
+             compressed_doc["compressed_content_length"] = len(proto_thought)
              compressed_documents.append(compressed_doc)
              compressed_docs += 1
          else:
-             print(f"Warning: Failed to evaporate glyph for document {doc.get('content_id', 'unknown')}")
+             print(
+                 f"Warning: Failed to evaporate glyph for document {doc.get('content_id', 'unknown')}"
+             )
              # Keep original document if evaporation fails
              compressed_documents.append(doc)
@@ -116,7 +114,7 @@ def compress_pack(pack_path: str, output_suffix: str = "_compressed") -> None:
  save_jsonl_file(str(output_file), compressed_documents)
  print(f"Saved compressed file: {output_file}")

- print(f"Compression complete:")
+ print("Compression complete:")
  print(f" Total documents processed: {total_docs}")
  print(f" Documents compressed: {compressed_docs}")
  if total_docs > 0:
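A hedged usage sketch for the helpers diffed above, assuming `compress_packs.py` is importable from the repository root; the pack path is hypothetical and the output naming follows the `_compressed` suffix default of `compress_pack()`:

```python
from compress_packs import load_jsonl_file, save_jsonl_file, compress_pack

# Hypothetical pack location; point this at a real JSONL pack in your checkout.
docs = load_jsonl_file("packs/example-pack/warbler-pack-core.jsonl")
print(f"Loaded {len(docs)} documents")

# Round-trip unchanged, exercising the double-quoted open()/write path after the refactor.
save_jsonl_file("packs/example-pack/warbler-pack-core.copy.jsonl", docs)

# Compress a pack; the output file name gets the "_compressed" suffix by default.
compress_pack("packs/example-pack/warbler-pack-core.jsonl")
```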
final_fix.py CHANGED
@@ -2,27 +2,25 @@
  """Final fixes for stat7_entity.py and verify the fixes work"""

  # Fix the stat7_entity.py bug
- with open('warbler_cda/stat7_entity.py', 'r', encoding='utf-8') as f:
+ with open("warbler_cda/stat7_entity.py", "r", encoding="utf-8") as f:
      content = f.read()

  # Fix the description reference bug
  content = content.replace('"description": description,', '"description": self.description,')

  # Write back the fixed content
- with open('warbler_cda/stat7_entity.py', 'w', encoding='utf-8') as f:
+ with open("warbler_cda/stat7_entity.py", "w", encoding="utf-8") as f:
      f.write(content)

  print("Fixed stat7_entity.py description bug")

  # Test import to make sure everything works
  try:
-     import warbler_cda.stat7_entity
      print("✅ stat7_entity imports successfully")
  except Exception as e:
      print(f"❌ stat7_entity import failed: {e}")

  try:
-     import warbler_cda.stat7_rag_bridge
      print("✅ stat7_rag_bridge imports successfully")
  except Exception as e:
      print(f"❌ stat7_rag_bridge import failed: {e}")
fix_theme.py CHANGED
@@ -1,7 +1,7 @@
  #!/usr/bin/env python3
  """Fix the theme issue in app.py"""

- with open('app.py', 'r', encoding='utf-8') as f:
+ with open("app.py", "r", encoding="utf-8") as f:
      content = f.read()

  old_line = 'with gr.Blocks(title="Warbler CDA - RAG System Demo", theme=gr.themes.Soft()) as demo:'
@@ -9,7 +9,7 @@ new_line = 'with gr.Blocks(title="Warbler CDA - RAG System Demo") as demo:'

  content = content.replace(old_line, new_line)

- with open('app.py', 'w', encoding='utf-8') as f:
+ with open("app.py", "w", encoding="utf-8") as f:
      f.write(content)

- print('Fixed theme issue')
+ print("Fixed theme issue")
package-lock.json CHANGED
@@ -9,7 +9,8 @@
  "version": "1.0.0",
  "license": "ISC",
  "dependencies": {
-     "express": "^5.1.0"
+     "express": "^5.1.0",
+     "typescript": "^5.9.3"
  }
  },
  "node_modules/accepts": {
@@ -819,6 +820,19 @@
      "node": ">= 0.6"
  }
  },
+ "node_modules/typescript": {
+     "version": "5.9.3",
+     "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz",
+     "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==",
+     "license": "Apache-2.0",
+     "bin": {
+         "tsc": "bin/tsc",
+         "tsserver": "bin/tsserver"
+     },
+     "engines": {
+         "node": ">=14.17"
+     }
+ },
  "node_modules/unpipe": {
      "version": "1.0.0",
      "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz",
package.json CHANGED
@@ -13,6 +13,7 @@
  "author": "",
  "license": "ISC",
  "dependencies": {
-     "express": "^5.1.0"
+     "express": "^5.1.0",
+     "typescript": "^5.9.3"
  }
  }
packs/warbler-pack-npc-dialog/src/index.ts CHANGED
@@ -1,12 +1,36 @@
  /**
   * Warbler NPC Dialog Pack - Essential conversation templates
-  *
+  *
   * Re-exports templates for dynamic loading in the Warbler conversation system
   */

- import { WarblerTemplate, WarblerPackMetadata } from 'warbler-npc';
  import templatesData from '../pack/templates.json';

+ // Type definitions for Warbler pack types
+ export interface WarblerTemplate {
+   id: string;
+   version: string;
+   title: string;
+   description: string;
+   content: string;
+   requiredSlots: Array<{
+     name: string;
+     type: 'string' | 'number' | 'boolean' | 'object';
+     required: boolean;
+     description?: string;
+   }>;
+   tags: string[];
+   maxLength?: number;
+ }
+
+ export interface WarblerPackMetadata {
+   name: string;
+   version: string;
+   description: string;
+   author: string;
+   templates: WarblerTemplate[];
+ }
+
  // Transform JSON data to proper WarblerTemplate objects
  export const templates: WarblerTemplate[] = templatesData.templates.map(template => ({
    ...template,
@@ -48,4 +72,4 @@ export default {
    tradeInquiryWelcome,
    generalConversation,
    unknownResponse
- };
+ };
packs/warbler-pack-npc-dialog/warbler-pack-core.jsonl DELETED
@@ -1,2 +0,0 @@
- "packInfo"
- "templates"
packs/warbler-pack-wisdom-scrolls/README.md CHANGED
@@ -1,6 +1,6 @@
  # 🎭 Warbler Pack: Wisdom Scrolls

- **Dynamic wisdom generation templates for the Secret Art of the Living Dev**
+ ## **Dynamic wisdom generation templates for the Secret Art of the Living Dev**

  This Warbler content pack provides mystical wisdom generation templates that create fresh quotes in the authentic style of the Sacred Scrolls, breathing new life into the ancient wisdom while maintaining the sacred atmosphere of the Cheekdom.

@@ -23,32 +23,44 @@ scripts/lda-quote --warbler
  ## Template Categories

  ### 🧙‍♂️ Development Wisdom (`wisdom_development_insight`)
+
  Generates profound insights about development practices using philosophical structure:
+
  - **Pattern**: `{action} is not {misconception}; it's {deeper_truth}. Like {metaphor}, but for {domain}.`
  - **Example**: *"Refactoring is not admitting failure; it's evolution of understanding. Like pruning a garden, but for algorithms."*

- ### 📜 Sacred Attribution (`scroll_attribution_template`)
+ ### 📜 Sacred Attribution (`scroll_attribution_template`)
+
  Creates mystical attribution in the style of ancient texts:
+
  - **Pattern**: `— {author_title}, {source_title}, {volume_designation}`
  - **Example**: *"— The Great Validator, Secret Art of the Living Dev, Vol. III"*

  ### 🐛 Debugging Proverbs (`debugging_proverb_template`)
+
  Humorous debugging wisdom using classical proverb structure:
+
  - **Pattern**: `The {problem_type} you can't {action_verb} is like the {creature} under the {location}—{reality_statement}.`
  - **Example**: *"The bug you can't reproduce is like the monster under the bed—real, but only when no one's looking."*

  ### 📖 Documentation Philosophy (`documentation_philosophy`)
+
  Profound insights about documentation practices:
+
  - **Pattern**: `Documentation is not {what_its_not}; it's {what_it_really_is}.`
  - **Example**: *"Documentation is not what you write for others; it's what you write for the you of six months from now."*

  ### 🏰 Cheekdom Lore (`cheekdom_lore_template`)
+
  Epic lore about the Cheekdom and its sacred mission:
+
  - **Pattern**: `In the {realm} of {domain}, the {guardian_class} stands between {civilization} and {threat_type}.`
  - **Example**: *"In the kingdom of Software Development, the Buttwarden stands between comfortable development and runtime catastrophe."*

  ### 🍑 Buttsafe Wisdom (`buttsafe_wisdom`)
+
  Sacred wisdom about ergonomic development practices:
+
  - **Pattern**: `Every developer's {body_part} is {sacred_designation}. {protection_action} with {protection_means}.`
  - **Example**: *"Every developer's posterior is sacred. Protect it with ergonomic wisdom and comfortable seating."*

@@ -138,6 +150,7 @@ scripts/weekly-wisdom-oracle.sh stats
  All generated quotes maintain the Sacred Code Standards:

  ### ✅ **Buttsafe Certified Requirements**
+
  - Professional workplace appropriateness
  - Dry, witty humor style (never offensive)
  - Development-focused insights
@@ -145,12 +158,14 @@ All generated quotes maintain the Sacred Code Standards:
  - Maximum length: 200 characters per template

  ### 🎭 **Authenticity Standards**
+
  - Maintains mystical atmosphere of original quotes
  - Uses consistent Sacred Art terminology
  - Preserves philosophical depth and wisdom
  - Integrates seamlessly with static quote database

  ### 📊 **Quality Assurance**
+
  - All templates validated for structure and content
  - Slot combinations tested for coherent output
  - Generated quotes pass content filtering
@@ -160,7 +175,7 @@ All generated quotes maintain the Sacred Code Standards:

  The Wisdom Scrolls pack integrates with the Living Dev Agent ecosystem through multiple layers:

- ```
+ ```none
  ┌─────────────────────────────────────────────────┐
  │ Weekly Oracle Workflow │
  │ (GitHub Actions Automation) │
@@ -185,6 +200,7 @@ The Wisdom Scrolls pack integrates with the Living Dev Agent ecosystem through m
  ## Versioning and Evolution

  ### Current Version: 1.0.0
+
  - ✅ Six core template categories
  - ✅ Complete slot value libraries
  - ✅ Integration with Warbler Quote Engine
@@ -192,12 +208,14 @@ The Wisdom Scrolls pack integrates with the Living Dev Agent ecosystem through m
  - ✅ CLI integration

  ### Planned Enhancements (v1.1.0)
+
  - 🔄 Additional template categories (CI/CD wisdom, workflow philosophy)
  - 🔄 Context-aware slot selection
  - 🔄 Machine learning-enhanced quote quality
  - 🔄 Cross-reference generation with existing quotes

  ### Future Vision (v2.0.0)
+
  - 🌟 Dynamic template creation based on repository context
  - 🌟 Personalized wisdom generation
  - 🌟 Integration with Git commit analysis
@@ -228,7 +246,7 @@ scripts/lda-quote --warbler --stats

  ## Sacred Mission

- *"The Wisdom Scrolls pack transforms static sacred texts into living oracles, ensuring that fresh insights flow continuously through the channels of development wisdom while preserving the mystical essence of the original teachings."*
+ -*"The Wisdom Scrolls pack transforms static sacred texts into living oracles, ensuring that fresh insights flow continuously through the channels of development wisdom while preserving the mystical essence of the original teachings."*

  — **Pack Philosophy**, Living Oracle Manifesto, Sacred Design Document

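To make the slot mechanics in the README above concrete, a small illustrative snippet (not part of the pack) that fills the development-wisdom pattern with the slot values from the README's own example:

```python
pattern = "{action} is not {misconception}; it's {deeper_truth}. Like {metaphor}, but for {domain}."

slots = {
    "action": "Refactoring",
    "misconception": "admitting failure",
    "deeper_truth": "evolution of understanding",
    "metaphor": "pruning a garden",
    "domain": "algorithms",
}

# Prints the README's example quote: "Refactoring is not admitting failure; ..."
print(pattern.format(**slots))
```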
packs/warbler-pack-wisdom-scrolls/README_HF_DATASET.md CHANGED
@@ -34,7 +34,7 @@ This dataset contains mystical wisdom generation templates that create fresh quo

  ## Dataset Structure

- ```
+ ```py
  {
      "template_id": str,
      "category": str,
@@ -49,26 +49,32 @@ This dataset contains mystical wisdom generation templates that create fresh quo
  ## Template Categories

  ### 🧙‍♂️ Development Wisdom
+
  Generates profound insights about development practices using philosophical structure.
  *Example*: "Refactoring is not admitting failure; it's evolution of understanding. Like pruning a garden, but for algorithms."

  ### 📜 Sacred Attribution
+
  Creates mystical attribution in the style of ancient texts.
  *Example*: "— The Great Validator, Secret Art of the Living Dev, Vol. III"

  ### 🐛 Debugging Proverbs
+
  Humorous debugging wisdom using classical proverb structure.
  *Example*: "The bug you can't reproduce is like the monster under the bed—real, but only when no one's looking."

  ### 📖 Documentation Philosophy
+
  Profound insights about documentation practices.
  *Example*: "Documentation is not what you write for others; it's what you write for the you of six months from now."

  ### 🏰 Cheekdom Lore
+
  Epic lore about the Cheekdom and its sacred mission.
  *Example*: "In the kingdom of Software Development, the Buttwarden stands between comfortable development and runtime catastrophe."

  ### 🍑 Buttsafe Wisdom
+
  Sacred wisdom about ergonomic development practices.
  *Example*: "Every developer's posterior is sacred. Protect it with ergonomic wisdom and comfortable seating."

test_app.py CHANGED
@@ -4,7 +4,8 @@ Test script to debug app.py initialization issues
  """

  import os
- os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"

  try:
      from warbler_cda import (
@@ -54,6 +55,7 @@ if WARBLER_AVAILABLE:
  except Exception as e:
      print(f"[ERROR] Failed to initialize Warbler CDA: {e}")
      import traceback
+
      traceback.print_exc()
      api = None

test_compressed_pack.py CHANGED
@@ -11,6 +11,7 @@ sys.path.insert(0, str(Path(__file__).parent))

  from warbler_cda.pack_loader import PackLoader

+
  def test_compressed_pack_loading():
      """Test loading the compressed novels pack"""
      packs_dir = Path("packs")
@@ -47,7 +48,7 @@ def test_compressed_pack_loading():
      print()

      # Check that content is compressed (should be short proto-thoughts)
-     avg_content_length = sum(len(doc['content']) for doc in documents) / len(documents)
+     avg_content_length = sum(len(doc["content"]) for doc in documents) / len(documents)
      print(f"Average content length: {avg_content_length:.1f} characters")

      if avg_content_length > 200:  # Original was ~1100, compressed should be much shorter
@@ -57,6 +58,7 @@ def test_compressed_pack_loading():
      print("✓ Compressed pack loading test passed!")
      return True

+
  if __name__ == "__main__":
      success = test_compressed_pack_loading()
      sys.exit(0 if success else 1)
test_embedding_integration.py CHANGED
@@ -116,10 +116,10 @@ def test_embedding_cache():

      text = "Cache test document"

-     emb1 = provider.embed_text(text)
+     provider.embed_text(text)
      hits_before = provider.cache_stats["hits"]

-     emb2 = provider.embed_text(text)
+     provider.embed_text(text)
      hits_after = provider.cache_stats["hits"]

      if hits_after > hits_before:
test_fixes.py CHANGED
@@ -17,7 +17,7 @@ def test_load_warbler_packs():

      print("Testing WarblerPackLoader...")
      try:
-         loader = WarblerPackLoader()
+         WarblerPackLoader()
          print("✓ WarblerPackLoader instantiated successfully")

          print("✓ JSONL parsing fix applied")
@@ -36,7 +36,7 @@ def test_sentence_transformer():
      print("\nTesting SentenceTransformerEmbeddingProvider...")
      try:
          config = {"model_name": "all-MiniLM-L6-v2", "batch_size": 32}
-         provider = SentenceTransformerEmbeddingProvider(config)
+         SentenceTransformerEmbeddingProvider(config)
          print("✓ Provider initialized with proper type annotations")
          return True
      except Exception as e:
test_pack_loading.py CHANGED
@@ -4,7 +4,8 @@ Test pack loading to debug app.py issues
  """

  import os
- os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"

  try:
      from warbler_cda import (
@@ -78,11 +79,13 @@ if WARBLER_AVAILABLE:
  except Exception as e:
      print(f"[ERROR] Pack loading failed: {e}")
      import traceback
+
      traceback.print_exc()

  except Exception as e:
      print(f"[ERROR] Failed to initialize Warbler CDA: {e}")
      import traceback
+
      traceback.print_exc()
      api = None

tests/test_new_mit_datasets.py CHANGED
@@ -22,11 +22,9 @@ from warbler_cda.utils.transformers import (
  WarblerPackBuilder,
  )
  import pytest
- import json
  import sys
  from pathlib import Path
- from unittest.mock import Mock, patch, MagicMock
- from typing import Dict, List, Any
+ from unittest.mock import patch, MagicMock

  sys.path.insert(0, str(Path(__file__).parent.parent))

@@ -517,7 +515,7 @@ class TestNewDatasetsIntegrationWithRetrieval:
      """Test that packs can be created from new datasets"""
      builder = WarblerPackBuilder()

-     test_docs = [
+     [
          {
              "content_id": f"test_{i}",
              "content": f"Test content {i}",
tests/test_pdf_ingestion.py CHANGED
@@ -13,12 +13,9 @@ from warbler_cda.utils.transformers import (
  PromptReportTransformer,
  ManualsTransformer,
  )
- import pytest
- import json
  import sys
  from pathlib import Path
- from unittest.mock import Mock, patch, MagicMock
- from typing import Dict, List, Any
+ from unittest.mock import patch, MagicMock

  sys.path.insert(0, str(Path(__file__).parent.parent))

tests/test_rag_e2e.py CHANGED
@@ -38,11 +38,9 @@ class TestEndToEndRAG:
      print("RAG SYSTEM METRICS")
      print("=" * 60)
      print(f"Embedding Provider: {self.embedding_provider.provider_id}")
-     print(
-         f"Embedding Dimension: {self.embedding_provider.get_dimension()}")
+     print(f"Embedding Dimension: {self.embedding_provider.get_dimension()}")
      print(f"Documents in Store: {metrics['context_store_size']}")
-     print(
-         f"Total Queries: {metrics['retrieval_metrics']['total_queries']}")
+     print(f"Total Queries: {metrics['retrieval_metrics']['total_queries']}")
      print("=" * 60)

  def test_01_embedding_generation(self):
@@ -128,8 +126,7 @@ class TestEndToEndRAG:

      print(f"[PASS] Retrieved {len(assembly.results)} relevant documents")
      for i, result in enumerate(assembly.results, 1):
-         print(
-             f" {i}. [{result.relevance_score:.4f}] {result.content[:50]}")
+         print(f" {i}. [{result.relevance_score:.4f}] {result.content[:50]}")

  def test_05_max_results_respected(self):
      """Test 05: Verify max_results parameter is respected."""
@@ -149,10 +146,7 @@ class TestEndToEndRAG:
      assembly = self.api.retrieve_context(query)

      assert len(assembly.results) <= 3
-     print(
-         "[PASS] Query returned:"
-         f"{len(assembly.results)} results",
-         "(max 3 requested)")
+     print("[PASS] Query returned:" f"{len(assembly.results)} results", "(max 3 requested)")

  def test_06_confidence_threshold(self):
      """Test 06: Verify confidence threshold filtering."""
@@ -186,12 +180,8 @@ class TestEndToEndRAG:
      strict_results = self.api.retrieve_context(query_strict)
      loose_results = self.api.retrieve_context(query_loose)

-     print(
-         "[PASS] Strict threshold (0.8):",
-         f"{len(strict_results.results)} results")
-     print(
-         "[PASS] Loose threshold (0.2):",
-         f"{len(loose_results.results)} results")
+     print("[PASS] Strict threshold (0.8):", f"{len(strict_results.results)} results")
+     print("[PASS] Loose threshold (0.2):", f"{len(loose_results.results)} results")

      assert len(strict_results.results) <= len(loose_results.results)

@@ -207,8 +197,7 @@ class TestEndToEndRAG:
      provider = SentenceTransformerEmbeddingProvider()

      hybrid_api = RetrievalAPI(
-         embedding_provider=provider, config={
-             "enable_stat7_hybrid": True}
+         embedding_provider=provider, config={"enable_stat7_hybrid": True}
      )
  except ImportError:
      pytest.skip("SentenceTransformer not installed for STAT7 testing")
@@ -242,7 +231,8 @@ class TestEndToEndRAG:
      print(
          "[PASS] Result:",
          f"semantic={result.semantic_similarity:.4f}",
-         f"STAT7={result.stat7_resonance:.4f}")
+         f"STAT7={result.stat7_resonance:.4f}",
+     )

  def test_08_temporal_retrieval(self):
      """Test 08: Verify temporal retrieval works."""
@@ -268,8 +258,7 @@ class TestEndToEndRAG:
      assembly = self.api.retrieve_context(query)

      assert assembly is not None
-     print(
-         f"[PASS] Temporal query retrieved {len(assembly.results)} results")
+     print(f"[PASS] Temporal query retrieved {len(assembly.results)} results")

  def test_09_retrieval_metrics(self):
      """Test 09: Verify retrieval metrics are tracked."""
@@ -294,7 +283,8 @@ class TestEndToEndRAG:

      print(
          f"[PASS] Metrics tracked: {
-             metrics['retrieval_metrics']['total_queries']} queries")
+             metrics['retrieval_metrics']['total_queries']} queries"
+     )

  def test_10_full_rag_pipeline(self):
      """Test 10: Complete RAG pipeline end-to-end."""
tests/test_retrieval_api.py CHANGED
@@ -331,7 +331,7 @@ class TestRetrievalMetrics:
      max_results=5,
  )

- initial_metrics = self.api.get_retrieval_metrics()
+ self.api.get_retrieval_metrics()

  self.api.retrieve_context(query)
  self.api.retrieve_context(query)
tsconfig.base.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "compilerOptions": {
+     "target": "ES2020",
+     "module": "commonjs",
+     "esModuleInterop": true,
+     "allowSyntheticDefaultImports": true,
+     "strict": true,
+     "skipLibCheck": true,
+     "forceConsistentCasingInFileNames": true,
+     "declaration": true,
+     "moduleResolution": "node",
+     "resolveJsonModule": true
+   }
+ }
validate_new_transformers.py CHANGED
@@ -110,9 +110,9 @@ def main():
      chunks = ingestor._chunk_text(test_text, chunk_size=100)
      print(f" ✓ Successfully chunked text into {len(chunks)} chunks")
      if all(isinstance(chunk, str) for chunk in chunks):
-         print(f" ✓ All chunks are strings")
+         print(" ✓ All chunks are strings")
      else:
-         print(f" ✗ Some chunks are not strings")
+         print(" ✗ Some chunks are not strings")
          all_good = False
  except Exception as e:
      print(f" ✗ _chunk_text failed: {e}")
verify_pack_ingestion.py CHANGED
@@ -7,7 +7,6 @@ Run this locally before deploying to HuggingFace.
  """

  import sys
- from pathlib import Path
  import logging

  # Setup logging
warbler_cda/api/cli.py CHANGED
@@ -11,10 +11,8 @@ import json
  import requests
  import time
  from typing import List, Dict, Any
- from concurrent.futures import ThreadPoolExecutor, as_completed
  from datetime import datetime
  import logging
- from pathlib import Path

  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)
@@ -178,7 +176,7 @@ def query(
  # Show narrative analysis
  if result.get("narrative_analysis"):
      narr = result["narrative_analysis"]
-     click.echo(f"\nNarrative Analysis:")
+     click.echo("\nNarrative Analysis:")
      click.echo(f" Coherence Score: {narr.get('coherence_score', 0):.3f}")
      click.echo(f" Narrative Threads: {narr.get('narrative_threads', 0)}")
      click.echo(f" Analysis: {narr.get('analysis')}")
@@ -249,7 +247,7 @@ def bulk(ctx, num_queries, concurrency, semantic, hybrid, json_output):

  result = client.bulk_query(query_data, concurrency=concurrency, include_narrative=True)

- elapsed = time.time() - start_time
+ time.time() - start_time

  if json_output:
      click.echo(json.dumps(result, indent=2))
@@ -270,7 +268,7 @@ def bulk(ctx, num_queries, concurrency, semantic, hybrid, json_output):
  # Narrative analysis for entire batch
  if result.get("batch_narrative_analysis"):
      narr = result["batch_narrative_analysis"]
-     click.echo(f"\nBatch Narrative Analysis:")
+     click.echo("\nBatch Narrative Analysis:")
      click.echo(f" Coherence Score: {narr.get('coherence_score', 0):.3f}")
      click.echo(
          f" Total Narrative Threads: {
@@ -282,7 +280,7 @@ def bulk(ctx, num_queries, concurrency, semantic, hybrid, json_output):
      click.echo(f" Analysis: {narr.get('analysis')}")

  # Per-query summary
- click.echo(f"\nPer-Query Summary (first 3):")
+ click.echo("\nPer-Query Summary (first 3):")
  for res in result.get("results", [])[:3]:
      click.echo(
          f" {
warbler_cda/api/service.py CHANGED
@@ -495,8 +495,10 @@ async def startup_event():
  @app.get("/health", response_model=HealthResponse)
  async def health_check(service: RetrievalService = Depends(get_retrieval_service)):
      """Health check endpoint"""
-     api = service.init_api()
-     uptime = (datetime.now() - datetime.fromisoformat(service.metrics["start_time"])).total_seconds()
+     service.init_api()
+     uptime = (
+         datetime.now() - datetime.fromisoformat(service.metrics["start_time"])
+     ).total_seconds()

      return HealthResponse(
          status="healthy",
@@ -511,8 +513,7 @@ async def health_check(service: RetrievalService = Depends(get_retrieval_service

  @app.post("/query", response_model=QueryResult)
  async def single_query(
-     request: QueryRequest,
-     service: RetrievalService = Depends(get_retrieval_service)
+     request: QueryRequest, service: RetrievalService = Depends(get_retrieval_service)
  ):
      """Execute a single retrieval query"""
      api = service.init_api()
@@ -579,10 +580,7 @@ async def single_query(

      # Bob the Skeptic: Verify suspiciously perfect results
      bob_status, bob_verification_log = await _bob_skeptic_filter(
-         narrative_analysis=narrative_analysis,
-         results_data=results_data,
-         query=query,
-         api=api
+         narrative_analysis=narrative_analysis, results_data=results_data, query=query, api=api
      )

      return QueryResult(
@@ -611,11 +609,10 @@ async def single_query(

  @app.post("/bulk_query")
  async def bulk_concurrent_queries(
-     request: BulkQueryRequest,
-     service: RetrievalService = Depends(get_retrieval_service)
+     request: BulkQueryRequest, service: RetrievalService = Depends(get_retrieval_service)
  ):
      """Execute multiple queries concurrently"""
-     api = service.init_api()
+     service.init_api()
      logger.info(
          f"Executing {len(request.queries)} queries with concurrency level {request.concurrency_level}"
      )
@@ -669,8 +666,7 @@ async def bulk_concurrent_queries(

  @app.post("/ingest")
  async def ingest_documents(
-     request: Dict[str, Any],
-     service: RetrievalService = Depends(get_retrieval_service)
+     request: Dict[str, Any], service: RetrievalService = Depends(get_retrieval_service)
  ):
      """Ingest documents into the RetrievalAPI"""
      api = service.init_api()
warbler_cda/castle_graph.py CHANGED
@@ -1,5 +1,5 @@
  from __future__ import annotations
- from typing import List, Dict, Any, Tuple, Optional, Set
+ from typing import List, Dict, Any, Optional, Set
  import time
  import re
  import math
warbler_cda/conflict_detector.py CHANGED
@@ -5,10 +5,10 @@ Detects conflicting or contradictory statements using semantic similarity and
 logical opposition analysis for the Cognitive Geo-Thermal Lore Engine v0.3.
 """

-from typing import List, Dict, Any, Optional, Tuple, Set
+from typing import List, Dict, Any, Optional, Set
 import time
 import hashlib
-from dataclasses import dataclass, asdict
+from dataclasses import dataclass
 from enum import Enum


@@ -580,7 +580,9 @@ class ConflictDetector:

     def _generate_conflict_id(self, conflict: ConflictEvidence) -> str:
         """Generate unique ID for a conflict."""
-        content = f"{conflict.statement_a_id}_{conflict.statement_b_id}_{conflict.conflict_type.value}"
+        content = (
+            f"{conflict.statement_a_id}_{conflict.statement_b_id}_{conflict.conflict_type.value}"
+        )
         return hashlib.md5(content.encode()).hexdigest()[:12]

     def _generate_conflict_recommendation(
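The conflict ID above is an MD5 digest of the two statement IDs plus the conflict type, truncated to 12 characters. A small standalone sketch of the same idea, with illustrative inputs:

```python
import hashlib


def conflict_id(statement_a_id: str, statement_b_id: str, conflict_type: str) -> str:
    """Derive a stable 12-character ID from the statement pair and the conflict type."""
    content = f"{statement_a_id}_{statement_b_id}_{conflict_type}"
    return hashlib.md5(content.encode()).hexdigest()[:12]


# Same inputs always yield the same ID, so a re-detected conflict deduplicates naturally.
# The "direct_negation" label below is an example value, not a project constant.
print(conflict_id("stmt-001", "stmt-002", "direct_negation"))
```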
warbler_cda/embeddings/openai_provider.py CHANGED
@@ -3,7 +3,6 @@ OpenAI Embedding Provider - Cloud-based Semantic Grounding
 """

 from typing import List, Dict, Any, Optional
-import time
 from warbler_cda.embeddings.base_provider import EmbeddingProvider


warbler_cda/embeddings/sentence_transformer_provider.py CHANGED
@@ -4,9 +4,7 @@ High-quality embeddings using pre-trained transformer models with CUDA support
 """

 from typing import List, Dict, Any, Optional, Tuple
-import os
 import json
-import time
 import hashlib
 from pathlib import Path
 from warbler_cda.embeddings.base_provider import EmbeddingProvider
@@ -118,7 +116,7 @@ class SentenceTransformerEmbeddingProvider(EmbeddingProvider):

         import numpy as np

-        query_vec = np.array(query_embedding)
+        np.array(query_embedding)
         embed_vecs = np.array(embeddings)

         similarities = []
@@ -205,8 +203,8 @@ class SentenceTransformerEmbeddingProvider(EmbeddingProvider):
         seg2 = emb_array[2 * seg_size : 3 * seg_size]
         seg3 = emb_array[3 * seg_size : 4 * seg_size]
         seg4 = emb_array[4 * seg_size : 5 * seg_size]
-        seg5 = emb_array[5 * seg_size : 6 * seg_size]
-        seg6 = emb_array[6 * seg_size :]
+        emb_array[5 * seg_size : 6 * seg_size]
+        emb_array[6 * seg_size :]

         lineage = float(np.mean(seg0**2))

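The similarity path touched above compares a query embedding against the stored embedding vectors. A minimal sketch of cosine-similarity ranking with NumPy, as one plausible reading of that step (simplified, not the provider's exact implementation):

```python
import numpy as np


def rank_by_cosine(query_embedding, embeddings):
    """Return stored-embedding indices sorted by cosine similarity to the query."""
    query_vec = np.asarray(query_embedding, dtype=float)
    embed_vecs = np.asarray(embeddings, dtype=float)

    # Normalize both sides, guarding against zero vectors.
    q_norm = query_vec / (np.linalg.norm(query_vec) + 1e-12)
    e_norms = embed_vecs / (np.linalg.norm(embed_vecs, axis=1, keepdims=True) + 1e-12)

    similarities = e_norms @ q_norm
    return np.argsort(similarities)[::-1], similarities


order, sims = rank_by_cosine([0.1, 0.9], [[0.1, 0.8], [0.9, 0.1], [0.0, 1.0]])
print(order, np.round(sims, 3))
```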
 
warbler_cda/evaporation.py CHANGED
@@ -1,9 +1,8 @@
 from __future__ import annotations
-from typing import List, Dict, Any, Optional, Tuple
+from typing import List, Dict, Any, Optional
 import time
 import random
 import re
-from collections import Counter


 class EvaporationEngine:
@@ -299,7 +298,9 @@ class EvaporationEngine:
         if len(concepts) == 1:
             return f"[Balanced] Reflection on {concepts[0]} reveals deeper meaning."
         else:
-            return f"[Balanced] The interplay between {concepts[0]} and {concepts[1]} creates harmony."
+            return (
+                f"[Balanced] The interplay between {concepts[0]} and {concepts[1]} creates harmony."
+            )

     def _apply_affect_coloring(self, proto_thought: str, affect: Dict[str, Any]) -> str:
         """Apply affect-based coloring to proto-thought."""
warbler_cda/pack_loader.py CHANGED
@@ -149,9 +149,7 @@ class PackLoader:
                 chunk_docs = self._load_jsonl_file(chunk_file, pack_name)
                 documents.extend(chunk_docs)

-            logger.info(
-                f"Loaded {len(documents)} total documents from {len(chunk_files)} chunks"
-            )
+            logger.info(f"Loaded {len(documents)} total documents from {len(chunk_files)} chunks")
         else:
             # Load single-file pack (backward compatibility)
             jsonl_file = pack_dir / f"{pack_name}.jsonl"
warbler_cda/pack_sync.py CHANGED
@@ -141,6 +141,6 @@ class PackSync:
         """Return reingest command if packs are missing"""
         status = self.verify_packs()
         if status["missing"]:
-            missing = ", ".join(status["missing"])
-            return f"python -m warbler_cda.utils.hf_warbler_ingest ingest --datasets all"
+            ", ".join(status["missing"])
+            return "python -m warbler_cda.utils.hf_warbler_ingest ingest --datasets all"
         return None
warbler_cda/retrieval_api.py CHANGED
@@ -8,7 +8,7 @@ for the Cognitive Geo-Thermal Lore Engine v0.3.
 from typing import List, Dict, Any, Optional, Tuple, Union
 import time
 import hashlib
-from dataclasses import dataclass, asdict
+from dataclasses import dataclass
 from enum import Enum


@@ -377,8 +377,14 @@ class RetrievalAPI:
         # DEBUG
         import sys

-        print(f"DEBUG: _retrieve_semantic_similarity called with query='{query.semantic_query}'", file=sys.stderr)
-        print(f"DEBUG: embedding_provider={self.embedding_provider}, semantic_anchors={self.semantic_anchors}", file=sys.stderr)
+        print(
+            f"DEBUG: _retrieve_semantic_similarity called with query='{query.semantic_query}'",
+            file=sys.stderr,
+        )
+        print(
+            f"DEBUG: embedding_provider={self.embedding_provider}, semantic_anchors={self.semantic_anchors}",
+            file=sys.stderr,
+        )
         print(f"DEBUG: context_store size={len(self._context_store)}", file=sys.stderr)

         # If embedding provider available, use it
@@ -467,7 +473,7 @@ class RetrievalAPI:
         try:
             if self.embedding_provider and hasattr(self.embedding_provider, "semantic_search"):
                 return self._search_context_store_semantic(query)
-        except Exception as e:
+        except Exception:
             pass

         return self._search_context_store_keyword(query)
@@ -527,7 +533,7 @@ class RetrievalAPI:
                     stat7_resonance=stat7_resonance,
                 )
                 results.append(result)
-        except Exception as e:
+        except Exception:
             return self._search_context_store_keyword(query)

         return results
@@ -846,7 +852,6 @@ class RetrievalAPI:
         filtered = [r for r in results if r.relevance_score >= query.confidence_threshold]

         # Apply temporal decay
-        current_time = query.query_timestamp
         for result in filtered:
             age_hours = result.temporal_distance / 3600
             decay_factor = max(0.1, 1.0 - (age_hours / self.temporal_decay_hours))
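The last hunk keeps the temporal-decay loop while dropping an unused timestamp; the decay itself is a linear falloff toward a 0.1 floor over `temporal_decay_hours`. A standalone sketch of that formula (the 24-hour window below is an illustrative default, not necessarily the engine's configured value):

```python
def temporal_decay(temporal_distance_seconds: float, decay_hours: float = 24.0) -> float:
    """Linear decay from 1.0 toward a floor of 0.1 as a result ages."""
    age_hours = temporal_distance_seconds / 3600
    return max(0.1, 1.0 - (age_hours / decay_hours))


# A 6-hour-old result keeps 75% of its weight under a 24-hour decay window.
print(temporal_decay(6 * 3600))   # 0.75
# Anything older than about 21.6 hours bottoms out at the 0.1 floor.
print(temporal_decay(48 * 3600))  # 0.1
```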
warbler_cda/semantic_anchors.py CHANGED
@@ -2,11 +2,9 @@
 Enhanced Anchor System with Semantic Grounding and Provenance
 """

-from typing import List, Dict, Any, Optional, Tuple
+from typing import List, Dict, Any, Optional
 import time
 import hashlib
-import json
-from dataclasses import dataclass, asdict
 from warbler_cda.embeddings import EmbeddingProvider, EmbeddingProviderFactory
 from warbler_cda.anchor_memory_pool import AnchorMemoryPool, get_global_anchor_pool
 from warbler_cda.anchor_data_classes import SemanticAnchor, AnchorProvenance
@@ -242,7 +240,7 @@ class SemanticAnchorGraph:
         """Apply aging, consolidation, and eviction policies."""
         actions = {"aged": 0, "consolidated": 0, "evicted": 0, "evicted_anchors": []}

-        current_time = time.time()
+        time.time()
         anchors_to_evict = []

         # Apply aging
warbler_cda/stat7_entity.py CHANGED
@@ -11,7 +11,7 @@ Features:
 - Entanglement detection and management
 """

-from dataclasses import dataclass, field, asdict
+from dataclasses import dataclass, field
 from datetime import datetime
 from enum import Enum
 from typing import Dict, List, Optional, Any, Tuple
@@ -286,9 +286,7 @@ class STAT7Entity(ABC):
         self.entangled_entities.append(other_entity_id)
         self.entanglement_strength.append(strength)
         self._record_event(
-            "entanglement_added",
-            f"Entangled with {other_entity_id}",
-            {"strength": strength}
+            "entanglement_added", f"Entangled with {other_entity_id}", {"strength": strength}
         )

     def remove_entanglement(self, other_entity_id: str):
@@ -297,10 +295,7 @@ class STAT7Entity(ABC):
             idx = self.entangled_entities.index(other_entity_id)
             self.entangled_entities.pop(idx)
             self.entanglement_strength.pop(idx)
-            self._record_event(
-                "entanglement_removed",
-                f"Untangled from {other_entity_id}"
-            )
+            self._record_event("entanglement_removed", f"Untangled from {other_entity_id}")

     def get_entanglements(self) -> List[Tuple[str, float]]:
         """Get all entangled entities with strength"""
@@ -315,8 +310,8 @@ class STAT7Entity(ABC):
         self._record_event(
             "entanglement_updated",
             f"{other_entity_id} entanglement strength changed",
-            {"old_strength": old_strength, "new_strength": new_strength}
-        )
+            {"old_strength": old_strength, "new_strength": new_strength},
+        )

     # ========================================================================
     # LUCA Bootstrap
@@ -423,7 +418,7 @@ class STAT7Entity(ABC):
     def load_from_file(cls, path: Path) -> "STAT7Entity":
         """Load entity from JSON file (must know concrete type)"""
         with open(path, "r") as f:
-            data = json.load(f)
+            json.load(f)
         # Note: In practice, would need factory pattern to instantiate correct
         # subclass
         raise NotImplementedError("Use subclass load methods")
warbler_cda/stat7_experiments.py CHANGED
@@ -327,7 +327,7 @@ class EXP01_AddressUniqueness:
             Tuple of (results list, overall success boolean)
         """
         print(f"\n{'=' * 70}")
-        print(f"EXP-01: ADDRESS UNIQUENESS TEST")
+        print("EXP-01: ADDRESS UNIQUENESS TEST")
         print(f"{'=' * 70}")
         print(f"Sample size: {self.sample_size} bit-chains")
         print(f"Iterations: {self.iterations}")
@@ -383,9 +383,7 @@ class EXP01_AddressUniqueness:
                 print(f"  ⚠️ Collision on {addr[:16]}... : {len(ids)} entries")

        print()
-        print(
-            f"OVERALL RESULT: {'✅ ALL PASS' if all_success else '❌ SOME FAILED'}"
-        )
+        print(f"OVERALL RESULT: {'✅ ALL PASS' if all_success else '❌ SOME FAILED'}")
        print(f"Success rate: {sum(1 for r in self.results if r.success)}/{self.iterations}")

        return self.results, all_success
@@ -452,7 +450,7 @@ class EXP02_RetrievalEfficiency:
             Tuple of (results list, overall success boolean)
         """
         print(f"\n{'=' * 70}")
-        print(f"EXP-02: RETRIEVAL EFFICIENCY TEST")
+        print("EXP-02: RETRIEVAL EFFICIENCY TEST")
         print(f"{'=' * 70}")
         print(f"Query count per scale: {self.query_count}")
         print(f"Scales: {self.scales}")
@@ -520,9 +518,7 @@ class EXP02_RetrievalEfficiency:
            print(f"  Target: < {threshold}ms")
            print()

-        print(
-            f"OVERALL RESULT: {'✅ ALL PASS' if all_success else '❌ SOME FAILED'}"
-        )
+        print(f"OVERALL RESULT: {'✅ ALL PASS' if all_success else '❌ SOME FAILED'}")

        return self.results, all_success

@@ -589,7 +585,7 @@ class EXP03_DimensionNecessity:
             Tuple of (results list, overall success boolean)
         """
         print(f"\n{'=' * 70}")
-        print(f"EXP-03: DIMENSION NECESSITY TEST")
+        print("EXP-03: DIMENSION NECESSITY TEST")
         print(f"{'=' * 70}")
         print(f"Sample size: {self.sample_size} bit-chains")
         print()
@@ -618,9 +614,7 @@ class EXP03_DimensionNecessity:
        self.results.append(result)

        status = "✅ PASS" if result.acceptable else "❌ FAIL"
-        print(
-            f"  {status} | Collisions: {collisions} | Rate: {baseline_collision_rate * 100:.4f}%"
-        )
+        print(f"  {status} | Collisions: {collisions} | Rate: {baseline_collision_rate * 100:.4f}%")
        print()

        # Ablation: remove each dimension
@@ -662,13 +656,11 @@ class EXP03_DimensionNecessity:
            # when removing dims
            necessity = not acceptable  # Should show collisions
            status = "✅ NECESSARY" if necessity else "⚠️ OPTIONAL"
-            print(
-                f"  {status} | Collisions: {collisions} | Rate: {collision_rate * 100:.4f}%"
-            )
+            print(f"  {status} | Collisions: {collisions} | Rate: {collision_rate * 100:.4f}%")

        print()
        print(
-            f"OVERALL RESULT: All 7 dimensions are necessary (all show > 0.1% collisions when removed)"
+            "OVERALL RESULT: All 7 dimensions are necessary (all show > 0.1% collisions when removed)"
        )

        return self.results, all_success
@@ -733,7 +725,7 @@ def run_all_experiments(

     # Summary
     print(f"\n{'=' * 70}")
-    print(f"PHASE 1 VALIDATION SUMMARY")
+    print("PHASE 1 VALIDATION SUMMARY")
     print(f"{'=' * 70}")
     print(
         f"EXP-01 (Address Uniqueness): {'✅ PASS' if results['EXP-01']['success'] else '❌ FAIL'}"
warbler_cda/stat7_rag_bridge.py CHANGED
@@ -55,18 +55,10 @@ class STAT7Address:

     def __post_init__(self):
         """Validate STAT8 constraints."""
-        assert (
-            0.0 <= self.adjacency <= 1.0
-        ), f"adjacency must be [0,1], got {self.adjacency}"
-        assert (
-            0.0 <= self.luminosity <= 1.0
-        ), f"luminosity must be [0,1], got {self.luminosity}"
-        assert (
-            0.0 <= self.polarity <= 1.0
-        ), f"polarity must be [0,1], got {self.polarity}"
-        assert (
-            0.0 <= self.entropy <= 1.0
-        ), f"entropy must be [0,1], got {self.entropy}"
+        assert 0.0 <= self.adjacency <= 1.0, f"adjacency must be [0,1], got {self.adjacency}"
+        assert 0.0 <= self.luminosity <= 1.0, f"luminosity must be [0,1], got {self.luminosity}"
+        assert 0.0 <= self.polarity <= 1.0, f"polarity must be [0,1], got {self.polarity}"
+        assert 0.0 <= self.entropy <= 1.0, f"entropy must be [0,1], got {self.entropy}"
         assert self.lineage >= 0, f"lineage must be >= 0, got {self.lineage}"
         assert (
             1 <= self.dimensionality <= 8
@@ -164,7 +156,7 @@ def stat7_resonance(query_stat7: STAT7Address, doc_stat7: STAT7Address) -> float
     luminosity_diff = abs(query_stat7.luminosity - doc_stat7.luminosity)
     polarity_diff = abs(query_stat7.polarity - doc_stat7.polarity)
     entropy_diff = abs(query_stat7.entropy - doc_stat7.entropy)
-    signal_score = 1.0 - (1/3) * (luminosity_diff + polarity_diff + entropy_diff)
+    signal_score = 1.0 - (1 / 3) * (luminosity_diff + polarity_diff + entropy_diff)
    signal_score = max(0.0, signal_score)

     # Adjacency/Dimensionality bonus: connectivity + complexity
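The reformatted resonance line averages the three per-dimension differences and clamps the result at zero. A worked sketch of just that signal-score term, using plain dicts as stand-ins rather than the full `stat7_resonance` implementation:

```python
def signal_score(query: dict, doc: dict) -> float:
    """1.0 when luminosity, polarity, and entropy match exactly; decays toward 0.0 as they diverge."""
    luminosity_diff = abs(query["luminosity"] - doc["luminosity"])
    polarity_diff = abs(query["polarity"] - doc["polarity"])
    entropy_diff = abs(query["entropy"] - doc["entropy"])
    score = 1.0 - (1 / 3) * (luminosity_diff + polarity_diff + entropy_diff)
    return max(0.0, score)


q = {"luminosity": 0.8, "polarity": 0.5, "entropy": 0.2}
d = {"luminosity": 0.6, "polarity": 0.5, "entropy": 0.3}
print(signal_score(q, d))  # 1.0 - (0.2 + 0.0 + 0.1) / 3 = 0.9
```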
warbler_cda/stat7_visualization.py CHANGED
@@ -21,11 +21,7 @@ from typing import Optional, Dict, Any
 from pathlib import Path

 from warbler_cda.stat7_experiments import (
-    BitChain,
-    generate_random_bitchain,
     EXP01_AddressUniqueness,
-    EXP02_RetrievalEfficiency,
-    EXP03_DimensionNecessity,
 )

 # Import the visualization components
@@ -75,7 +71,7 @@ class STAT7VisualizationManager:
         self.is_running = True

         print(f"🚀 STAT7 Visualization Server started on ws://{self.host}:{self.port}")
-        print(f"🌐 Open stat7threejs.html in your browser to view visualization")
+        print("🌐 Open stat7threejs.html in your browser to view visualization")

     def _run_server(self):
         """Run the WebSocket server in asyncio event loop."""
@@ -201,7 +197,7 @@ def create_jupyter_widget(width: str = "100%", height: str = "600px") -> str:
         return create_inline_jupyter_widget(width, height)

     with open(html_path, "r", encoding="utf-8") as f:
-        html_content = f.read()
+        f.read()

     # Wrap in iframe for Jupyter
     widget_html = f"""
@@ -307,7 +303,7 @@ def display_in_jupyter(width: str = "100%", height: str = "600px"):
         display(HTML(widget_html))
     except ImportError:
         print("IPython not available. Cannot display in Jupyter notebook.")
-        print(f"Open stat7threejs.html in your browser instead.")
+        print("Open stat7threejs.html in your browser instead.")


 # Convenience functions for quick start
warbler_cda/summarization_ladder.py CHANGED
@@ -5,10 +5,10 @@ Implements rolling N-window micro-summaries and pipeline macro distillation
 for the Cognitive Geo-Thermal Lore Engine v0.3.
 """

-from typing import List, Dict, Any, Optional, Tuple
+from typing import List, Dict, Any, Optional
 import time
 import hashlib
-from dataclasses import dataclass, asdict
+from dataclasses import dataclass
 from collections import deque


warbler_cda/utils/hf_warbler_ingest.py CHANGED
@@ -8,7 +8,6 @@ for NPC intelligence training via the magma layer self-training system.

 import logging
 from pathlib import Path
-from typing import Dict, Any, Optional

 import click

@@ -97,12 +96,12 @@ def ingest(datasets, pack_prefix, arxiv_limit, max_docs_per_chunk, max_pdf_pages
     if max_docs_per_chunk > 0:
         click.echo(f"[PACK] Chunking enabled: {max_docs_per_chunk} documents per chunk")
     else:
-        click.echo(f"[PACK] Chunking disabled: single file per pack")
+        click.echo("[PACK] Chunking disabled: single file per pack")

     if max_pdf_pages is not None:
         click.echo(f"[PDF] PDF extraction limit: {max_pdf_pages} pages")
     else:
-        click.echo(f"[PDF] PDF extraction: unlimited pages")
+        click.echo("[PDF] PDF extraction: unlimited pages")

     click.echo()

@@ -165,7 +164,7 @@ def ingest(datasets, pack_prefix, arxiv_limit, max_docs_per_chunk, max_pdf_pages
     if results:
         report_file = builder.save_report(results)

-        click.echo(f"\n[SUCCESS] Ingestion Complete!")
+        click.echo("\n[SUCCESS] Ingestion Complete!")
         click.echo(f"[STATS] Total Documents: {sum(r['documents'] for r in results.values())}")
         click.echo(f"[STATS] Packs Created: {len(results)}")
         click.echo(f"[STATS] Report saved to: {report_file}")
warbler_cda/utils/load_warbler_packs.py CHANGED
@@ -233,9 +233,7 @@ def discover(api_url):
         for doc in documents:
             click.echo(f"  - {doc['content_id']}")
             if "metadata" in doc:
-                click.echo(
-                    f"    Realm: {doc['metadata'].get('realm_type','unknown')}"
-                )
+                click.echo(f"    Realm: {doc['metadata'].get('realm_type','unknown')}")

     click.echo(f"\n[STATS] Total discovered: {total} documents\n")

warbler_cda/utils/transformers/base.py CHANGED
@@ -33,9 +33,8 @@ class BaseWarblerTransformer(ABC):
33
  """Base class for all dataset transformers"""
34
 
35
  def __init__(
36
- self,
37
- tokenizer_name: str = "microsoft/DialoGPT-medium",
38
- max_pdf_pages: Optional[int] = None):
39
  self.max_pdf_pages = max_pdf_pages
40
 
41
  @abstractmethod
@@ -47,10 +46,7 @@ class BaseWarblerTransformer(ABC):
47
  """Check if PDF extraction is available"""
48
  return PDF_AVAILABLE
49
 
50
- def extract_pdf_text(
51
- self,
52
- pdf_data: Any,
53
- max_pages: Optional[int] = None) -> Optional[str]:
54
  """
55
  Extract text from PDF data (bytes, file path, PDF object, or file-like object)
56
 
@@ -62,23 +58,19 @@ class BaseWarblerTransformer(ABC):
62
  Extracted text or None if extraction fails
63
  """
64
  if not PDF_AVAILABLE:
65
- logger.debug(
66
- "PDF extraction unavailable - pdfplumber not installed")
67
  return None
68
 
69
  try:
70
  if hasattr(pdf_data, "pages") and hasattr(pdf_data, "metadata"):
71
- logger.info(
72
- "PDF data is already a pdfplumber.PDF object, extracting text...")
73
  text_parts = []
74
  total_pages = len(pdf_data.pages)
75
 
76
  if max_pages is None:
77
- logger.info(
78
- f"PDF has {total_pages} pages, extracting all pages")
79
  else:
80
- logger.info(
81
- f"PDF has {total_pages} pages, extracting up to {max_pages} pages")
82
 
83
  try:
84
  for page_num, page in enumerate(pdf_data.pages, 1):
@@ -88,35 +80,35 @@ class BaseWarblerTransformer(ABC):
88
  text_parts.append(page_text)
89
  logger.debug(
90
  f"Extracted {
91
- len(page_text)} chars from page {page_num}")
 
92
  else:
93
- logger.debug(
94
- f"Page {page_num} has no extractable text")
95
  except Exception as page_error:
96
- logger.warning(
97
- f"Error extracting page {page_num}: {page_error}")
98
  continue
99
 
100
  if max_pages is not None and page_num >= max_pages:
101
  logger.info(
102
- f"Truncated PDF extraction at {page_num} pages (max: {max_pages})")
 
103
  break
104
 
105
- extracted_text = "\n\n".join(
106
- text_parts) if text_parts else None
107
  if extracted_text:
108
  logger.info(
109
  f"Successfully extracted {
110
  len(extracted_text)} total characters from {
111
- len(text_parts)} pages")
 
112
  else:
113
- logger.warning(
114
- "No text could be extracted from PDF object")
115
  return extracted_text
116
  except Exception as e:
117
  logger.warning(
118
  f"Error extracting from PDF object: {
119
- type(e).__name__}: {e}")
 
120
  return None
121
 
122
  if isinstance(pdf_data, dict) and "bytes" in pdf_data:
@@ -129,7 +121,8 @@ class BaseWarblerTransformer(ABC):
129
  if isinstance(pdf_data, bytes):
130
  logger.info(
131
  f"PDF data is bytes ({
132
- len(pdf_data)} bytes), creating BytesIO")
 
133
  pdf_file = io.BytesIO(pdf_data)
134
  elif isinstance(pdf_data, str) and os.path.exists(pdf_data):
135
  logger.info(f"PDF data is file path: {pdf_data}")
@@ -138,8 +131,7 @@ class BaseWarblerTransformer(ABC):
138
  logger.info(f"PDF data is file-like object: {type(pdf_data)}")
139
  pdf_file = pdf_data
140
  else:
141
- logger.warning(
142
- f"Unknown PDF data type: {type(pdf_data)}, cannot extract")
143
  return None
144
 
145
  text_parts = []
@@ -147,11 +139,11 @@ class BaseWarblerTransformer(ABC):
147
  total_pages = len(pdf.pages)
148
 
149
  if max_pages is None:
150
- logger.info(
151
- f"Opened PDF with {total_pages} pages, extracting all pages")
152
  else:
153
  logger.info(
154
- f"Opened PDF with {total_pages} pages, extracting up to {max_pages} pages")
 
155
 
156
  for page_num, page in enumerate(pdf.pages, 1):
157
  try:
@@ -160,18 +152,18 @@ class BaseWarblerTransformer(ABC):
160
  text_parts.append(page_text)
161
  logger.debug(
162
  f"Extracted {
163
- len(page_text)} chars from page {page_num}")
 
164
  else:
165
- logger.debug(
166
- f"Page {page_num} has no extractable text")
167
  except Exception as page_error:
168
- logger.warning(
169
- f"Error extracting page {page_num}: {page_error}")
170
  continue
171
 
172
  if max_pages is not None and page_num >= max_pages:
173
  logger.info(
174
- f"Truncated PDF extraction at {page_num} pages (max: {max_pages})")
 
175
  break
176
 
177
  extracted_text = "\n\n".join(text_parts) if text_parts else None
@@ -179,7 +171,8 @@ class BaseWarblerTransformer(ABC):
179
  logger.info(
180
  f"Successfully extracted {
181
  len(extracted_text)} total characters from {
182
- len(text_parts)} pages")
 
183
  else:
184
  logger.warning("No text could be extracted from PDF")
185
  return extracted_text
@@ -192,8 +185,7 @@ class BaseWarblerTransformer(ABC):
192
  """Split text into chunks"""
193
  if not text:
194
  return []
195
- return [text[i: i + chunk_size]
196
- for i in range(0, len(text), chunk_size)]
197
 
198
  def extract_dataset_items(self, dataset: Any) -> List[Dict[str, Any]]:
199
  """
@@ -211,13 +203,7 @@ class BaseWarblerTransformer(ABC):
211
  pass
212
 
213
  try:
214
- if hasattr(
215
- dataset,
216
- "keys") and callable(
217
- getattr(
218
- dataset,
219
- "keys",
220
- None)):
221
  keys = list(dataset.keys())
222
  if keys:
223
  first_split = keys[0]
@@ -242,8 +228,7 @@ class WarblerPackBuilder:
242
 
243
  def __init__(self, output_dir: Optional[Path] = None):
244
  if output_dir is None:
245
- output_dir = Path(__file__).resolve(
246
- ).parent.parent / "results" / "hf_ingest"
247
  self.output_dir = Path(output_dir)
248
  self.output_dir.mkdir(exist_ok=True, parents=True)
249
 
@@ -259,8 +244,7 @@ class WarblerPackBuilder:
259
 
260
  total_docs = len(docs)
261
 
262
- if max_docs_per_chunk == float(
263
- "inf") or total_docs <= max_docs_per_chunk:
264
  pack_file = pack_dir / f"{pack_name}.jsonl"
265
 
266
  with open(pack_file, "w", encoding="utf-8") as f:
@@ -279,21 +263,21 @@ class WarblerPackBuilder:
279
  }
280
 
281
  logger.info(
282
- f"✓ Created Warbler pack: {pack_name} with {total_docs} documents (single file)")
 
283
  else:
284
- chunk_count = (total_docs + max_docs_per_chunk -
285
- 1) // max_docs_per_chunk
286
 
287
  logger.info(
288
- f"Creating chunked pack: {pack_name} with {total_docs} documents across {chunk_count} chunks")
 
289
 
290
  for chunk_idx in range(chunk_count):
291
  start_idx = chunk_idx * max_docs_per_chunk
292
  end_idx = min(start_idx + max_docs_per_chunk, total_docs)
293
  chunk_docs = docs[start_idx:end_idx]
294
 
295
- chunk_file = pack_dir / \
296
- f"{pack_name}-chunk-{chunk_idx + 1:03d}.jsonl"
297
 
298
  with open(chunk_file, "w", encoding="utf-8") as f:
299
  for doc in chunk_docs:
@@ -303,7 +287,8 @@ class WarblerPackBuilder:
303
  f" ✓ Wrote chunk {
304
  chunk_idx + 1}/{chunk_count}: {
305
  len(chunk_docs)} documents to {
306
- chunk_file.name}")
 
307
 
308
  metadata = {
309
  "name": pack_name,
@@ -320,7 +305,8 @@ class WarblerPackBuilder:
320
  }
321
 
322
  logger.info(
323
- f"✓ Created chunked Warbler pack: {pack_name} with {total_docs} documents across {chunk_count} chunks")
 
324
 
325
  metadata_file = pack_dir / "package.json"
326
  with open(metadata_file, "w", encoding="utf-8") as f:
@@ -335,16 +321,14 @@ class WarblerPackBuilder:
335
  "timestamp": datetime.now().isoformat(),
336
  "results": results,
337
  "total_documents": sum(
338
- result.get("documents", 0) if isinstance(
339
- result, dict) else len(result)
340
  for result in results.values()
341
  ),
342
  "packs_created": len(results),
343
  }
344
 
345
  report_file = (
346
- self.output_dir /
347
- f"ingestion_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
348
  )
349
  with open(report_file, "w", encoding="utf-8") as f:
350
  json.dump(report, f, indent=2, ensure_ascii=False)
 
33
  """Base class for all dataset transformers"""
34
 
35
  def __init__(
36
+ self, tokenizer_name: str = "microsoft/DialoGPT-medium", max_pdf_pages: Optional[int] = None
37
+ ):
 
38
  self.max_pdf_pages = max_pdf_pages
39
 
40
  @abstractmethod
 
46
  """Check if PDF extraction is available"""
47
  return PDF_AVAILABLE
48
 
49
+ def extract_pdf_text(self, pdf_data: Any, max_pages: Optional[int] = None) -> Optional[str]:
 
 
 
50
  """
51
  Extract text from PDF data (bytes, file path, PDF object, or file-like object)
52
 
 
58
  Extracted text or None if extraction fails
59
  """
60
  if not PDF_AVAILABLE:
61
+ logger.debug("PDF extraction unavailable - pdfplumber not installed")
 
62
  return None
63
 
64
  try:
65
  if hasattr(pdf_data, "pages") and hasattr(pdf_data, "metadata"):
66
+ logger.info("PDF data is already a pdfplumber.PDF object, extracting text...")
 
67
  text_parts = []
68
  total_pages = len(pdf_data.pages)
69
 
70
  if max_pages is None:
71
+ logger.info(f"PDF has {total_pages} pages, extracting all pages")
 
72
  else:
73
+ logger.info(f"PDF has {total_pages} pages, extracting up to {max_pages} pages")
 
74
 
75
  try:
76
  for page_num, page in enumerate(pdf_data.pages, 1):
 
80
  text_parts.append(page_text)
81
  logger.debug(
82
  f"Extracted {
83
+ len(page_text)} chars from page {page_num}"
84
+ )
85
  else:
86
+ logger.debug(f"Page {page_num} has no extractable text")
 
87
  except Exception as page_error:
88
+ logger.warning(f"Error extracting page {page_num}: {page_error}")
 
89
  continue
90
 
91
  if max_pages is not None and page_num >= max_pages:
92
  logger.info(
93
+ f"Truncated PDF extraction at {page_num} pages (max: {max_pages})"
94
+ )
95
  break
96
 
97
+ extracted_text = "\n\n".join(text_parts) if text_parts else None
 
98
  if extracted_text:
99
  logger.info(
100
  f"Successfully extracted {
101
  len(extracted_text)} total characters from {
102
+ len(text_parts)} pages"
103
+ )
104
  else:
105
+ logger.warning("No text could be extracted from PDF object")
 
106
  return extracted_text
107
  except Exception as e:
108
  logger.warning(
109
  f"Error extracting from PDF object: {
110
+ type(e).__name__}: {e}"
111
+ )
112
  return None
113
 
114
  if isinstance(pdf_data, dict) and "bytes" in pdf_data:
 
121
  if isinstance(pdf_data, bytes):
122
  logger.info(
123
  f"PDF data is bytes ({
124
+ len(pdf_data)} bytes), creating BytesIO"
125
+ )
126
  pdf_file = io.BytesIO(pdf_data)
127
  elif isinstance(pdf_data, str) and os.path.exists(pdf_data):
128
  logger.info(f"PDF data is file path: {pdf_data}")
 
131
  logger.info(f"PDF data is file-like object: {type(pdf_data)}")
132
  pdf_file = pdf_data
133
  else:
134
+ logger.warning(f"Unknown PDF data type: {type(pdf_data)}, cannot extract")
 
135
  return None
136
 
137
  text_parts = []
 
139
  total_pages = len(pdf.pages)
140
 
141
  if max_pages is None:
142
+ logger.info(f"Opened PDF with {total_pages} pages, extracting all pages")
 
143
  else:
144
  logger.info(
145
+ f"Opened PDF with {total_pages} pages, extracting up to {max_pages} pages"
146
+ )
147
 
148
  for page_num, page in enumerate(pdf.pages, 1):
149
  try:
 
152
  text_parts.append(page_text)
153
  logger.debug(
154
  f"Extracted {
155
+ len(page_text)} chars from page {page_num}"
156
+ )
157
  else:
158
+ logger.debug(f"Page {page_num} has no extractable text")
 
159
  except Exception as page_error:
160
+ logger.warning(f"Error extracting page {page_num}: {page_error}")
 
161
  continue
162
 
163
  if max_pages is not None and page_num >= max_pages:
164
  logger.info(
165
+ f"Truncated PDF extraction at {page_num} pages (max: {max_pages})"
166
+ )
167
  break
168
 
169
  extracted_text = "\n\n".join(text_parts) if text_parts else None
 
171
  logger.info(
172
  f"Successfully extracted {
173
  len(extracted_text)} total characters from {
174
+ len(text_parts)} pages"
175
+ )
176
  else:
177
  logger.warning("No text could be extracted from PDF")
178
  return extracted_text
 
185
  """Split text into chunks"""
186
  if not text:
187
  return []
188
+ return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]
 
189
 
190
  def extract_dataset_items(self, dataset: Any) -> List[Dict[str, Any]]:
191
  """
 
203
  pass
204
 
205
  try:
206
+ if hasattr(dataset, "keys") and callable(getattr(dataset, "keys", None)):
 
 
 
 
 
 
207
  keys = list(dataset.keys())
208
  if keys:
209
  first_split = keys[0]
 
228
 
229
  def __init__(self, output_dir: Optional[Path] = None):
230
  if output_dir is None:
231
+ output_dir = Path(__file__).resolve().parent.parent / "results" / "hf_ingest"
 
232
  self.output_dir = Path(output_dir)
233
  self.output_dir.mkdir(exist_ok=True, parents=True)
234
 
 
244
 
245
  total_docs = len(docs)
246
 
247
+ if max_docs_per_chunk == float("inf") or total_docs <= max_docs_per_chunk:
 
248
  pack_file = pack_dir / f"{pack_name}.jsonl"
249
 
250
  with open(pack_file, "w", encoding="utf-8") as f:
 
263
  }
264
 
265
  logger.info(
266
+ f"✓ Created Warbler pack: {pack_name} with {total_docs} documents (single file)"
267
+ )
268
  else:
269
+ chunk_count = (total_docs + max_docs_per_chunk - 1) // max_docs_per_chunk
 
270
 
271
  logger.info(
272
+ f"Creating chunked pack: {pack_name} with {total_docs} documents across {chunk_count} chunks"
273
+ )
274
 
275
  for chunk_idx in range(chunk_count):
276
  start_idx = chunk_idx * max_docs_per_chunk
277
  end_idx = min(start_idx + max_docs_per_chunk, total_docs)
278
  chunk_docs = docs[start_idx:end_idx]
279
 
280
+ chunk_file = pack_dir / f"{pack_name}-chunk-{chunk_idx + 1:03d}.jsonl"
 
281
 
282
  with open(chunk_file, "w", encoding="utf-8") as f:
283
  for doc in chunk_docs:
 
287
  f" ✓ Wrote chunk {
288
  chunk_idx + 1}/{chunk_count}: {
289
  len(chunk_docs)} documents to {
290
+ chunk_file.name}"
291
+ )
292
 
293
  metadata = {
294
  "name": pack_name,
 
305
  }
306
 
307
  logger.info(
308
+ f"✓ Created chunked Warbler pack: {pack_name} with {total_docs} documents across {chunk_count} chunks"
309
+ )
310
 
311
  metadata_file = pack_dir / "package.json"
312
  with open(metadata_file, "w", encoding="utf-8") as f:
 
321
  "timestamp": datetime.now().isoformat(),
322
  "results": results,
323
  "total_documents": sum(
324
+ result.get("documents", 0) if isinstance(result, dict) else len(result)
 
325
  for result in results.values()
326
  ),
327
  "packs_created": len(results),
328
  }
329
 
330
  report_file = (
331
+ self.output_dir / f"ingestion_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
 
332
  )
333
  with open(report_file, "w", encoding="utf-8") as f:
334
  json.dump(report, f, indent=2, ensure_ascii=False)
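Among the reflowed lines in this file is the pack-chunking math, which uses ceiling division to decide how many chunk files to write. A minimal standalone sketch of that pattern (illustrative helper name, not the builder's full logic):

```python
def split_into_chunks(docs, max_docs_per_chunk):
    """Split docs into consecutive chunks of at most max_docs_per_chunk items each."""
    total_docs = len(docs)
    # Ceiling division: 250 docs at 100 per chunk -> 3 chunks (100, 100, 50).
    chunk_count = (total_docs + max_docs_per_chunk - 1) // max_docs_per_chunk
    return [
        docs[idx * max_docs_per_chunk : (idx + 1) * max_docs_per_chunk]
        for idx in range(chunk_count)
    ]


chunks = split_into_chunks(list(range(250)), 100)
print([len(c) for c in chunks])  # [100, 100, 50]
```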
warbler_cda/utils/transformers/edustories.py CHANGED
@@ -16,8 +16,7 @@ logger = logging.getLogger(__name__)
16
  class EdustoriesTransformer(BaseWarblerTransformer):
17
  """Transform MU-NLPC/Edustories-en dataset"""
18
 
19
- def transform(
20
- self, dataset_name: str = "MU-NLPC/Edustories-en") -> List[Dict[str, Any]]:
21
  """
22
  Transform MU-NLPC/Edustories-en dataset
23
  Format: Educational case studies with structured teaching situations
@@ -49,8 +48,7 @@ class EdustoriesTransformer(BaseWarblerTransformer):
49
 
50
  for idx, item in enumerate(items):
51
  if isinstance(item, str):
52
- logger.warning(
53
- f"Edustory {idx + 1}: Item is a string, skipping")
54
  continue
55
 
56
  if isinstance(item, dict) or hasattr(item, "__getitem__"):
@@ -75,7 +73,8 @@ class EdustoriesTransformer(BaseWarblerTransformer):
75
  logger.warning(
76
  f"Edustory {
77
  idx +
78
- 1}: No case study content found, skipping")
 
79
  continue
80
 
81
  entry_id = safe_get("id", str(idx))
@@ -91,11 +90,9 @@ class EdustoriesTransformer(BaseWarblerTransformer):
91
  problems_annotated = safe_get("problems_annotated", "")
92
  problems_possible = safe_get("problems_possible_annotated", "")
93
  solutions_annotated = safe_get("solutions_annotated", "")
94
- solutions_possible = safe_get(
95
- "solutions_possible_annotated", "")
96
  implications_annotated = safe_get("implications_annotated", "")
97
- implications_possible = safe_get(
98
- "implications_possible_annotated", "")
99
 
100
  annotator_id = safe_get("annotator_id", "")
101
 
@@ -131,7 +128,8 @@ class EdustoriesTransformer(BaseWarblerTransformer):
131
 
132
  logger.info(
133
  f"✓ Transformed {
134
- len(warbler_docs)} educational case study entries")
 
135
  return warbler_docs
136
 
137
  @staticmethod
@@ -149,8 +147,7 @@ class EdustoriesTransformer(BaseWarblerTransformer):
149
  return default
150
 
151
  description = safe_get("description", "[No background provided]")
152
- anamnesis = safe_get(
153
- "anamnesis", "[No situation description provided]")
154
  solution = safe_get("solution", "[No intervention described]")
155
  outcome = safe_get("outcome", "[No outcome reported]")
156
 
@@ -181,16 +178,15 @@ class EdustoriesTransformer(BaseWarblerTransformer):
181
 
182
  annotation_parts = []
183
  if problems_annotated:
184
- annotation_parts.append(
185
- f"Problems Identified: {problems_annotated}")
186
  if solutions_annotated:
187
- annotation_parts.append(
188
- f"Solutions Applied: {solutions_annotated}")
189
  if implications_annotated:
190
  annotation_parts.append(f"Implications: {implications_annotated}")
191
 
192
- annotations = ("\n".join(annotation_parts)
193
- if annotation_parts else "[No annotations available]")
 
194
 
195
  content = f"""TEACHING CASE STUDY
196
 
 
16
  class EdustoriesTransformer(BaseWarblerTransformer):
17
  """Transform MU-NLPC/Edustories-en dataset"""
18
 
19
+ def transform(self, dataset_name: str = "MU-NLPC/Edustories-en") -> List[Dict[str, Any]]:
 
20
  """
21
  Transform MU-NLPC/Edustories-en dataset
22
  Format: Educational case studies with structured teaching situations
 
48
 
49
  for idx, item in enumerate(items):
50
  if isinstance(item, str):
51
+ logger.warning(f"Edustory {idx + 1}: Item is a string, skipping")
 
52
  continue
53
 
54
  if isinstance(item, dict) or hasattr(item, "__getitem__"):
 
73
  logger.warning(
74
  f"Edustory {
75
  idx +
76
+ 1}: No case study content found, skipping"
77
+ )
78
  continue
79
 
80
  entry_id = safe_get("id", str(idx))
 
90
  problems_annotated = safe_get("problems_annotated", "")
91
  problems_possible = safe_get("problems_possible_annotated", "")
92
  solutions_annotated = safe_get("solutions_annotated", "")
93
+ solutions_possible = safe_get("solutions_possible_annotated", "")
 
94
  implications_annotated = safe_get("implications_annotated", "")
95
+ implications_possible = safe_get("implications_possible_annotated", "")
 
96
 
97
  annotator_id = safe_get("annotator_id", "")
98
 
 
128
 
129
  logger.info(
130
  f"✓ Transformed {
131
+ len(warbler_docs)} educational case study entries"
132
+ )
133
  return warbler_docs
134
 
135
  @staticmethod
 
147
  return default
148
 
149
  description = safe_get("description", "[No background provided]")
150
+ anamnesis = safe_get("anamnesis", "[No situation description provided]")
 
151
  solution = safe_get("solution", "[No intervention described]")
152
  outcome = safe_get("outcome", "[No outcome reported]")
153
 
 
178
 
179
  annotation_parts = []
180
  if problems_annotated:
181
+ annotation_parts.append(f"Problems Identified: {problems_annotated}")
 
182
  if solutions_annotated:
183
+ annotation_parts.append(f"Solutions Applied: {solutions_annotated}")
 
184
  if implications_annotated:
185
  annotation_parts.append(f"Implications: {implications_annotated}")
186
 
187
+ annotations = (
188
+ "\n".join(annotation_parts) if annotation_parts else "[No annotations available]"
189
+ )
190
 
191
  content = f"""TEACHING CASE STUDY
192
 
warbler_cda/utils/transformers/enterprise.py CHANGED
@@ -16,8 +16,7 @@ logger = logging.getLogger(__name__)
16
  class EnterpriseTransformer(BaseWarblerTransformer):
17
  """Transform SustcZhangYX/ChatEnv dataset"""
18
 
19
- def transform(
20
- self, dataset_name: str = "SustcZhangYX/ChatEnv") -> List[Dict[str, Any]]:
21
  """
22
  Transform SustcZhangYX/ChatEnv dataset
23
  Format: Software development chat conversations and collaborative coding scenarios
@@ -36,22 +35,20 @@ class EnterpriseTransformer(BaseWarblerTransformer):
36
  items = list(dataset[split_name])
37
  logger.info(
38
  f"Loaded {
39
- len(items)} items from '{split_name}' split")
 
40
  break
41
  except Exception as split_error:
42
- logger.debug(
43
- f"Could not load split '{split_name}': {split_error}")
44
  continue
45
 
46
  if not items:
47
  items = self.extract_dataset_items(dataset)
48
  if items:
49
- logger.info(
50
- f"Extracted {len(items)} items from dataset")
51
  except Exception as e:
52
  logger.warning(f"Failed to load {dataset_name}: {e}")
53
- logger.info(
54
- f"Skipping {dataset_name} - dataset has loading issues")
55
  return []
56
 
57
  if not items:
@@ -99,14 +96,12 @@ class EnterpriseTransformer(BaseWarblerTransformer):
99
  )
100
 
101
  task = (
102
- item.get("task", item.get(
103
- "scenario", "Software development chat"))
104
  if isinstance(item, dict)
105
  else "Software development chat"
106
  )
107
  scenario = (
108
- item.get("scenario", item.get(
109
- "task", f"ChatEnv Scenario #{idx + 1}"))
110
  if isinstance(item, dict)
111
  else f"ChatEnv Scenario #{idx + 1}"
112
  )
@@ -138,7 +133,8 @@ class EnterpriseTransformer(BaseWarblerTransformer):
138
 
139
  logger.info(
140
  f"✓ Transformed {
141
- len(warbler_docs)} ChatEnv software development chat entries")
 
142
  return warbler_docs
143
 
144
  @staticmethod
 
16
  class EnterpriseTransformer(BaseWarblerTransformer):
17
  """Transform SustcZhangYX/ChatEnv dataset"""
18
 
19
+ def transform(self, dataset_name: str = "SustcZhangYX/ChatEnv") -> List[Dict[str, Any]]:
 
20
  """
21
  Transform SustcZhangYX/ChatEnv dataset
22
  Format: Software development chat conversations and collaborative coding scenarios
 
35
  items = list(dataset[split_name])
36
  logger.info(
37
  f"Loaded {
38
+ len(items)} items from '{split_name}' split"
39
+ )
40
  break
41
  except Exception as split_error:
42
+ logger.debug(f"Could not load split '{split_name}': {split_error}")
 
43
  continue
44
 
45
  if not items:
46
  items = self.extract_dataset_items(dataset)
47
  if items:
48
+ logger.info(f"Extracted {len(items)} items from dataset")
 
49
  except Exception as e:
50
  logger.warning(f"Failed to load {dataset_name}: {e}")
51
+ logger.info(f"Skipping {dataset_name} - dataset has loading issues")
 
52
  return []
53
 
54
  if not items:
 
96
  )
97
 
98
  task = (
99
+ item.get("task", item.get("scenario", "Software development chat"))
 
100
  if isinstance(item, dict)
101
  else "Software development chat"
102
  )
103
  scenario = (
104
+ item.get("scenario", item.get("task", f"ChatEnv Scenario #{idx + 1}"))
 
105
  if isinstance(item, dict)
106
  else f"ChatEnv Scenario #{idx + 1}"
107
  )
 
133
 
134
  logger.info(
135
  f"✓ Transformed {
136
+ len(warbler_docs)} ChatEnv software development chat entries"
137
+ )
138
  return warbler_docs
139
 
140
  @staticmethod
warbler_cda/utils/transformers/multi_character.py CHANGED
@@ -35,15 +35,12 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
35
 
36
  try:
37
  if "train" not in dataset:
38
- logger.warning(
39
- f"Multi-char: No 'train' split found in dataset")
40
  return []
41
 
42
  train_data = dataset["train"]
43
- total_items = len(train_data) if hasattr(
44
- train_data, "__len__") else 0
45
- logger.info(
46
- f"Processing {total_items} multi-character dialogue items...")
47
 
48
  for idx, item in enumerate(train_data):
49
  if idx > 0 and idx % 1000 == 0:
@@ -53,8 +50,7 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
53
 
54
  try:
55
  if item is None:
56
- logger.warning(
57
- f"Multi-char {idx + 1}: Item is None, skipping")
58
  continue
59
 
60
  if not isinstance(item, dict):
@@ -75,12 +71,12 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
75
  conversation = [] if conversation is None else [conversation]
76
 
77
  if not setting and not conversation:
78
- logger.warning(
79
- f"Multi-char {idx + 1}: Missing essential data, skipping")
80
  continue
81
 
82
- if conversation and not all(isinstance(
83
- msg, (dict, str)) for msg in conversation[:10]):
 
84
  logger.warning(
85
  f"Multi-char {idx + 1}: Invalid conversation structure, skipping"
86
  )
@@ -102,12 +98,10 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
102
  "source_dataset": dataset_name,
103
  "setting": setting[:150] + "..." if len(setting) > 150 else setting,
104
  "character_count": (
105
- len(characters) if isinstance(
106
- characters, list) else 0
107
  ),
108
  "conversation_length": (
109
- len(conversation) if isinstance(
110
- conversation, list) else 0
111
  ),
112
  "realm_type": "narrative",
113
  "realm_label": "multi_character_dialogue",
@@ -129,8 +123,7 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
129
  )
130
  continue
131
  except (KeyboardInterrupt, SystemExit):
132
- logger.warning(
133
- f"Multi-char: Processing interrupted at item {idx + 1}")
134
  raise
135
  except Exception as e:
136
  logger.warning(
@@ -141,10 +134,12 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
141
  except (MemoryError, RecursionError) as critical_error:
142
  logger.error(
143
  f"Multi-char: Critical error during iteration: {
144
- type(critical_error).__name__}: {critical_error}")
 
145
  logger.info(
146
  f"Returning {
147
- len(warbler_docs)} documents processed before error")
 
148
  except (KeyboardInterrupt, SystemExit):
149
  logger.warning(
150
  f"Multi-char: Processing interrupted, returning {len(warbler_docs)} documents"
@@ -153,13 +148,14 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
153
  except Exception as outer_error:
154
  logger.error(
155
  f"Multi-char: Unexpected error during dataset iteration: {
156
- type(outer_error).__name__}: {outer_error}")
 
157
  logger.info(
158
  f"Returning {
159
- len(warbler_docs)} documents processed before error")
 
160
 
161
- logger.info(
162
- f"✓ Transformed {len(warbler_docs)} multi-character entries")
163
  return warbler_docs
164
 
165
  @staticmethod
@@ -185,18 +181,14 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
185
  message_field = msg.get("message", "")
186
 
187
  if not isinstance(from_field, str):
188
- from_field = str(
189
- from_field) if from_field is not None else "Unknown"
190
  if not isinstance(message_field, str):
191
- message_field = str(
192
- message_field) if message_field is not None else ""
193
 
194
  if len(message_field) > 5000:
195
- message_field = message_field[:5000] + \
196
- "... [truncated]"
197
 
198
- conversation_lines.append(
199
- f"{from_field}: {message_field}")
200
 
201
  elif isinstance(msg, str):
202
  if len(msg) > 5000:
@@ -204,33 +196,31 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
204
  conversation_lines.append(msg)
205
 
206
  else:
207
- conversation_lines.append(
208
- f"[Message {msg_idx + 1}: {type(msg).__name__}]")
209
 
210
  except (RecursionError, MemoryError) as critical_err:
211
  logger.warning(
212
- f"Critical error processing conversation message {msg_idx}: {critical_err}")
 
213
  break
214
  except Exception as msg_err:
215
- logger.debug(
216
- f"Error processing conversation message {msg_idx}: {msg_err}")
217
  continue
218
 
219
  if len(conversation) > max_conversation_items:
220
  conversation_lines.append(
221
  f"\n[... {
222
  len(conversation) -
223
- max_conversation_items} more messages truncated]")
 
224
 
225
  conversation_text = (
226
- "\n".join(
227
- conversation_lines) if conversation_lines else "[No conversation available]"
228
  )
229
 
230
  setting = item.get("setting", "[No setting provided]")
231
  if not isinstance(setting, str):
232
- setting = str(
233
- setting) if setting is not None else "[No setting provided]"
234
 
235
  if len(setting) > 2000:
236
  setting = setting[:2000] + "... [truncated]"
@@ -240,8 +230,8 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
 characters = [] if characters is None else [characters]
 
 setting_after = item.get(
- "setting after interaction",
- "[No setting after interaction provided]")
+ "setting after interaction", "[No setting after interaction provided]"
+ )
 if not isinstance(setting_after, str):
 setting_after = (
 str(setting_after)
@@ -257,13 +247,11 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
 if len(characters) > 100:
 characters = characters[:100]
 characters_str = (
- json.dumps(characters, indent=2,
- ensure_ascii=False) + "\n[... truncated]"
+ json.dumps(characters, indent=2, ensure_ascii=False) + "\n[... truncated]"
 )
 else:
 characters_str = (
- json.dumps(characters, indent=2,
- ensure_ascii=False) if characters else "[]"
+ json.dumps(characters, indent=2, ensure_ascii=False) if characters else "[]"
 )
 except (TypeError, ValueError, RecursionError) as json_err:
 logger.debug(f"Error serializing characters to JSON: {json_err}")
@@ -283,8 +271,7 @@ After Interaction: {setting_after}
 This represents a multi-character narrative scenario for NPC interaction training."""
 
 if len(content) > 50000:
- content = content[:50000] + \
- "\n\n[Content truncated due to size]"
+ content = content[:50000] + "\n\n[Content truncated due to size]"
 
 return content
 except Exception as final_err:
 
 try:
 if "train" not in dataset:
+ logger.warning("Multi-char: No 'train' split found in dataset")
 return []
 
 train_data = dataset["train"]
+ total_items = len(train_data) if hasattr(train_data, "__len__") else 0
+ logger.info(f"Processing {total_items} multi-character dialogue items...")
 
 for idx, item in enumerate(train_data):
 if idx > 0 and idx % 1000 == 0:
 
 try:
 if item is None:
+ logger.warning(f"Multi-char {idx + 1}: Item is None, skipping")
 continue
 
 if not isinstance(item, dict):
 
 conversation = [] if conversation is None else [conversation]
 
 if not setting and not conversation:
+ logger.warning(f"Multi-char {idx + 1}: Missing essential data, skipping")
 continue
 
+ if conversation and not all(
+ isinstance(msg, (dict, str)) for msg in conversation[:10]
+ ):
 logger.warning(
 f"Multi-char {idx + 1}: Invalid conversation structure, skipping"
 )
 
 "source_dataset": dataset_name,
 "setting": setting[:150] + "..." if len(setting) > 150 else setting,
 "character_count": (
+ len(characters) if isinstance(characters, list) else 0
 ),
 "conversation_length": (
+ len(conversation) if isinstance(conversation, list) else 0
 ),
 "realm_type": "narrative",
 "realm_label": "multi_character_dialogue",
 
 )
 continue
 except (KeyboardInterrupt, SystemExit):
+ logger.warning(f"Multi-char: Processing interrupted at item {idx + 1}")
 raise
 except Exception as e:
 logger.warning(
 
 except (MemoryError, RecursionError) as critical_error:
 logger.error(
 f"Multi-char: Critical error during iteration: {
+ type(critical_error).__name__}: {critical_error}"
+ )
 logger.info(
 f"Returning {
+ len(warbler_docs)} documents processed before error"
+ )
 except (KeyboardInterrupt, SystemExit):
 logger.warning(
 f"Multi-char: Processing interrupted, returning {len(warbler_docs)} documents"
 
 except Exception as outer_error:
 logger.error(
 f"Multi-char: Unexpected error during dataset iteration: {
+ type(outer_error).__name__}: {outer_error}"
+ )
 logger.info(
 f"Returning {
+ len(warbler_docs)} documents processed before error"
+ )
 
+ logger.info(f"✓ Transformed {len(warbler_docs)} multi-character entries")
 return warbler_docs
 
 @staticmethod
 
 message_field = msg.get("message", "")
 
 if not isinstance(from_field, str):
+ from_field = str(from_field) if from_field is not None else "Unknown"
 if not isinstance(message_field, str):
+ message_field = str(message_field) if message_field is not None else ""
 
 if len(message_field) > 5000:
+ message_field = message_field[:5000] + "... [truncated]"
 
+ conversation_lines.append(f"{from_field}: {message_field}")
 
 elif isinstance(msg, str):
 if len(msg) > 5000:
 conversation_lines.append(msg)
 
 else:
+ conversation_lines.append(f"[Message {msg_idx + 1}: {type(msg).__name__}]")
 
 except (RecursionError, MemoryError) as critical_err:
 logger.warning(
+ f"Critical error processing conversation message {msg_idx}: {critical_err}"
+ )
 break
 except Exception as msg_err:
+ logger.debug(f"Error processing conversation message {msg_idx}: {msg_err}")
 continue
 
 if len(conversation) > max_conversation_items:
 conversation_lines.append(
 f"\n[... {
 len(conversation) -
+ max_conversation_items} more messages truncated]"
+ )
 
 conversation_text = (
+ "\n".join(conversation_lines) if conversation_lines else "[No conversation available]"
 )
 
 setting = item.get("setting", "[No setting provided]")
 if not isinstance(setting, str):
+ setting = str(setting) if setting is not None else "[No setting provided]"
 
 if len(setting) > 2000:
 setting = setting[:2000] + "... [truncated]"
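The multi_character.py hunks above only reflow existing truncation logic; the limits themselves are unchanged. For reference, a minimal standalone sketch of those limits follows (100 entries per character list, 50,000 characters per rendered document). The helper names serialize_characters and cap_content are illustrative and are not part of the module.

import json
from typing import Any

# Illustrative sketch only: helper names are hypothetical, limits mirror the diff.
MAX_CHARACTERS = 100
MAX_CONTENT_CHARS = 50_000


def serialize_characters(characters: Any) -> str:
    """Serialize a character list defensively, truncating oversized inputs."""
    if characters is None:
        characters = []
    if not isinstance(characters, list):
        characters = [characters]
    try:
        if len(characters) > MAX_CHARACTERS:
            return (
                json.dumps(characters[:MAX_CHARACTERS], indent=2, ensure_ascii=False)
                + "\n[... truncated]"
            )
        return json.dumps(characters, indent=2, ensure_ascii=False) if characters else "[]"
    except (TypeError, ValueError, RecursionError):
        return "[Unserializable character data]"


def cap_content(content: str) -> str:
    """Cap a rendered document at 50,000 characters, as the reformatted line does."""
    if len(content) > MAX_CONTENT_CHARS:
        return content[:MAX_CONTENT_CHARS] + "\n\n[Content truncated due to size]"
    return content

The bounds, not the helper structure, are what the diff guarantees; the module applies them inline rather than through separate functions.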
warbler_cda/utils/transformers/novels.py CHANGED
@@ -16,8 +16,7 @@ logger = logging.getLogger(__name__)
 class NovelsTransformer(BaseWarblerTransformer):
 """Transform GOAT-AI/generated-novels dataset"""
 
- def transform(
- self, dataset_name: str = "GOAT-AI/generated-novels") -> List[Dict[str, Any]]:
+ def transform(self, dataset_name: str = "GOAT-AI/generated-novels") -> List[Dict[str, Any]]:
 """
 Transform GOAT-AI/generated-novels dataset
 Format: Full-length generated novels (PDF-based, treated as narrative metadata)
@@ -61,13 +60,7 @@ class NovelsTransformer(BaseWarblerTransformer):
 except Exception:
 item_keys = []
 
- for field in [
- "text",
- "story",
- "content",
- "novel",
- "body",
- "full_text"]:
+ for field in ["text", "story", "content", "novel", "body", "full_text"]:
 try:
 if isinstance(item, dict):
 if field in item and item[field]:
@@ -84,9 +77,9 @@ class NovelsTransformer(BaseWarblerTransformer):
 logger.info(
 f"Novel {
 idx +
- 1}: No text field found, attempting PDF extraction...")
- for pdf_field in [
- "pdf", "file", "document", "content", "data"]:
+ 1}: No text field found, attempting PDF extraction..."
+ )
+ for pdf_field in ["pdf", "file", "document", "content", "data"]:
 try:
 pdf_data = None
 if isinstance(item, dict):
@@ -101,33 +94,37 @@ class NovelsTransformer(BaseWarblerTransformer):
 f"Novel {
 idx +
 1}: Found PDF data in field '{pdf_field}' (type: {
- type(pdf_data).__name__})")
- text = self.extract_pdf_text(
- pdf_data, max_pages=self.max_pdf_pages)
+ type(pdf_data).__name__})"
+ )
+ text = self.extract_pdf_text(pdf_data, max_pages=self.max_pdf_pages)
 if text:
 logger.info(
 f"Novel {
 idx +
 1}: Successfully extracted {
- len(text)} chars from PDF field '{pdf_field}'")
+ len(text)} chars from PDF field '{pdf_field}'"
+ )
 break
 else:
 logger.warning(
 f"Novel {
 idx +
- 1}: PDF field '{pdf_field}' extraction returned no text")
+ 1}: PDF field '{pdf_field}' extraction returned no text"
+ )
 except Exception as e:
 logger.warning(
 f"Novel {
 idx +
 1}: PDF extraction from field '{pdf_field}' failed: {
- type(e).__name__}: {e}")
+ type(e).__name__}: {e}"
+ )
 
 if not text:
 logger.warning(
 f"Novel {
 idx +
- 1}: No text content found. Available fields: {item_keys}")
+ 1}: No text content found. Available fields: {item_keys}"
+ )
 pdf_status = (
 "Enabled"
 if self.has_pdf_support()
@@ -149,11 +146,9 @@ This entry serves as a placeholder for retrieval system testing."""
 title = f"Generated Novel #{idx + 1}"
 try:
 if isinstance(item, dict):
- title = item.get("title", item.get(
- "name", f"Generated Novel #{idx + 1}"))
+ title = item.get("title", item.get("name", f"Generated Novel #{idx + 1}"))
 elif hasattr(item, "get"):
- title = item.get("title", item.get(
- "name", f"Generated Novel #{idx + 1}"))
+ title = item.get("title", item.get("name", f"Generated Novel #{idx + 1}"))
 elif hasattr(item, "__getitem__"):
 title = (
 item.get("title", f"Generated Novel #{idx + 1}")
@@ -193,15 +188,12 @@ This entry serves as a placeholder for retrieval system testing."""
 logger.info(
 f"✓ Transformed {
 len(warbler_docs)} novel chunks from {
- len(items)} novels")
+ len(items)} novels"
+ )
 return warbler_docs
 
 @staticmethod
- def _create_content(
- title: str,
- text_chunk: str,
- chunk_idx: int,
- total_chunks: int) -> str:
+ def _create_content(title: str, text_chunk: str, chunk_idx: int, total_chunks: int) -> str:
 """Create content string for novel chunk"""
 return f"""Novel: {title}
 Part: {chunk_idx + 1} of {total_chunks}
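The novels.py changes above likewise collapse a text-field scan and a PDF fallback onto single lines without changing the lookup order. The sketch below summarizes that order under simplifying assumptions: find_novel_text is an illustrative name, extract_pdf_text is passed in as a stand-in for the transformer's own method, and only string-valued text fields are handled (the module also tolerates other value types).

from typing import Any, Callable, Dict, Optional

# Candidate fields, in the order the diff shows them being tried.
TEXT_FIELDS = ["text", "story", "content", "novel", "body", "full_text"]
PDF_FIELDS = ["pdf", "file", "document", "content", "data"]


def find_novel_text(
    item: Dict[str, Any],
    extract_pdf_text: Callable[..., Optional[str]],
    max_pdf_pages: int,
) -> Optional[str]:
    """Return text from the first populated text field, else fall back to PDF fields."""
    for field in TEXT_FIELDS:
        value = item.get(field)
        if isinstance(value, str) and value.strip():
            return value
    for pdf_field in PDF_FIELDS:
        pdf_data = item.get(pdf_field)
        if not pdf_data:
            continue
        text = extract_pdf_text(pdf_data, max_pages=max_pdf_pages)
        if text:
            return text
    return None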
warbler_cda/utils/transformers/npc_dialogue.py CHANGED
@@ -25,13 +25,17 @@ class NPCDialogueTransformer(BaseWarblerTransformer):
 {
 "Name": "Elandra the Merchant",
 "Biography": "A seasoned trader who has traveled across kingdoms, known for her sharp wit.",
- "Queries": ["What do you sell?", "Can you lower the price?", "Any rare items today?"],
+ "Queries": [
+ "What do you sell?",
+ "Can you lower the price?",
+ "Any rare items today?",
+ ],
 "Responses": [
 "I have wares from distant lands, take a look.",
 "Prices are firm, but quality is unmatched.",
- "Indeed, a relic from the old empire just arrived."
+ "Indeed, a relic from the old empire just arrived.",
 ],
- "Emotions": ["neutral", "greedy", "excited"]
+ "Emotions": ["neutral", "greedy", "excited"],
 },
 {
 "Name": "Tharos the Guard",
@@ -40,20 +44,24 @@ class NPCDialogueTransformer(BaseWarblerTransformer):
 "Responses": [
 "Only citizens may pass without a writ.",
 "Bandits lurk beyond the hills, stay vigilant.",
- "I serve the crown, keeping watch at dawn."
+ "I serve the crown, keeping watch at dawn.",
 ],
- "Emotions": ["serious", "cautious", "stern"]
+ "Emotions": ["serious", "cautious", "stern"],
 },
 {
 "Name": "Lyra the Healer",
 "Biography": "A gentle soul who tends to the wounded, guided by compassion and faith.",
- "Queries": ["Can you heal me?", "What herbs do you use?", "Why do you help strangers?"],
+ "Queries": [
+ "Can you heal me?",
+ "What herbs do you use?",
+ "Why do you help strangers?",
+ ],
 "Responses": [
 "Rest easy, I will mend your wounds.",
 "Chamomile and sage, nature’s gift to us.",
- "Because every life is sacred, no matter the path."
+ "Because every life is sacred, no matter the path.",
 ],
- "Emotions": ["kind", "calm", "hopeful"]
+ "Emotions": ["kind", "calm", "hopeful"],
 },
 ]
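The sample records above gain trailing commas and reflowed Queries lists but keep the same shape. Assuming the Queries, Responses, and Emotions lists are parallel, which the three samples suggest but the diff does not state, they can be rendered as dialogue turns; npc_dialogue_pairs below is an illustrative helper and not part of the transformer.

from typing import Any, Dict, List

# Illustrative helper (not module code): pairs the parallel Queries/Responses/Emotions
# lists from one sample NPC record into dialogue turns.
def npc_dialogue_pairs(npc: Dict[str, Any]) -> List[str]:
    """Render query/response/emotion triples for one NPC record."""
    lines = [f"NPC: {npc.get('Name', 'Unknown')}"]
    triples = zip(npc.get("Queries", []), npc.get("Responses", []), npc.get("Emotions", []))
    for query, response, emotion in triples:
        lines.append(f"Player: {query}")
        lines.append(f"NPC ({emotion}): {response}")
    return lines


example = {
    "Name": "Elandra the Merchant",
    "Queries": ["What do you sell?"],
    "Responses": ["I have wares from distant lands, take a look."],
    "Emotions": ["neutral"],
}
print("\n".join(npc_dialogue_pairs(example)))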
 
warbler_cda/utils/transformers/portuguese_education.py CHANGED
@@ -35,8 +35,7 @@ class PortugueseEducationTransformer(BaseWarblerTransformer):
 items = []
 if hasattr(dataset, "__getitem__") and "train" in dataset:
 items = list(dataset["train"])
- logger.info(
- f"Loaded {len(items)} items from dataset['train']")
+ logger.info(f"Loaded {len(items)} items from dataset['train']")
 else:
 items = self.extract_dataset_items(dataset)
 logger.info(f"Extracted {len(items)} items from dataset")
@@ -48,8 +47,7 @@ class PortugueseEducationTransformer(BaseWarblerTransformer):
 
 for idx, item in enumerate(items):
 if isinstance(item, str):
- logger.warning(
- f"Portuguese doc {idx + 1}: Item is a string, skipping")
+ logger.warning(f"Portuguese doc {idx + 1}: Item is a string, skipping")
 continue
 
 if isinstance(item, dict) or hasattr(item, "__getitem__"):
@@ -63,12 +61,7 @@ class PortugueseEducationTransformer(BaseWarblerTransformer):
 item_keys = []
 
 content = None
- for field in [
- "content",
- "text",
- "body",
- "document",
- "passage"]:
+ for field in ["content", "text", "body", "document", "passage"]:
 try:
 if isinstance(item, dict):
 if field in item and item[field]:
@@ -93,14 +86,14 @@ class PortugueseEducationTransformer(BaseWarblerTransformer):
 pdf_data = item[pdf_field]
 
 if pdf_data:
- if isinstance(
- pdf_data, dict) and "bytes" in pdf_data:
+ if isinstance(pdf_data, dict) and "bytes" in pdf_data:
 pdf_bytes = pdf_data["bytes"]
 logger.info(
 f"Portuguese doc {
 idx +
 1}: Found PDF bytes ({
- len(pdf_bytes)} bytes), extracting...")
+ len(pdf_bytes)} bytes), extracting..."
+ )
 content = self.extract_pdf_text(
 pdf_bytes, max_pages=self.max_pdf_pages
 )
@@ -109,7 +102,8 @@ class PortugueseEducationTransformer(BaseWarblerTransformer):
 f"Portuguese doc {
 idx +
 1}: Found PDF bytes ({
- len(pdf_data)} bytes), extracting...")
+ len(pdf_data)} bytes), extracting..."
+ )
 content = self.extract_pdf_text(
 pdf_data, max_pages=self.max_pdf_pages
 )
@@ -118,7 +112,8 @@ class PortugueseEducationTransformer(BaseWarblerTransformer):
 f"Portuguese doc {
 idx +
 1}: Found PDF data (type: {
- type(pdf_data)}), attempting extraction...")
+ type(pdf_data)}), attempting extraction..."
+ )
 content = self.extract_pdf_text(
 pdf_data, max_pages=self.max_pdf_pages
 )
@@ -128,24 +123,28 @@ class PortugueseEducationTransformer(BaseWarblerTransformer):
 f"Portuguese doc {
 idx +
 1}: Successfully extracted {
- len(content)} chars from PDF")
+ len(content)} chars from PDF"
+ )
 break
 else:
 logger.warning(
 f"Portuguese doc {
- idx + 1}: PDF extraction returned no text")
+ idx + 1}: PDF extraction returned no text"
+ )
 except Exception as e:
 logger.warning(
 f"Portuguese doc {
 idx +
 1}: PDF extraction error: {
- type(e).__name__}: {e}")
+ type(e).__name__}: {e}"
+ )
 
 if not content:
 logger.warning(
 f"Portuguese doc {
 idx +
- 1}: No content found. Available fields: {item_keys}")
+ 1}: No content found. Available fields: {item_keys}"
+ )
 content = f"""[Conteúdo Indisponível]
 
 Este documento (#{idx + 1}) faz parte do dataset Solshine/Portuguese_Language_Education_Texts.
@@ -186,7 +185,8 @@ Esta entrada serve como placeholder para testes do sistema de recuperação."""
 except Exception as e:
 logger.warning(
 f"Portuguese doc {
- idx + 1}: Could not convert item to dict: {e}")
+ idx + 1}: Could not convert item to dict: {e}"
+ )
 item_with_content = {}
 
 item_with_content["content"] = content
@@ -222,8 +222,7 @@ Esta entrada serve como placeholder para testes do sistema de recuperação."""
 }
 warbler_docs.append(doc)
 
- logger.info(
- f"✓ Transformed {len(warbler_docs)} Portuguese education entries")
+ logger.info(f"✓ Transformed {len(warbler_docs)} Portuguese education entries")
 return warbler_docs
 
 @staticmethod
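The portuguese_education.py hunks reformat the PDF-payload branching without changing which payload shapes are accepted: a dict wrapping raw bytes, bare bytes, or anything else handed straight to the extractor. A sketch of that dispatch follows; extract_pdf_content is an illustrative wrapper name and extract_pdf_text stands in for the transformer's own helper.

from typing import Any, Callable, Optional

# Sketch of the PDF payload handling shown in the hunks above (illustrative, not module code).
def extract_pdf_content(
    pdf_data: Any,
    extract_pdf_text: Callable[..., Optional[str]],
    max_pdf_pages: int,
) -> Optional[str]:
    """Dispatch on the three payload shapes the diff distinguishes."""
    if isinstance(pdf_data, dict) and "bytes" in pdf_data:
        # Hugging Face datasets commonly wrap binary columns as {"bytes": ..., "path": ...}.
        return extract_pdf_text(pdf_data["bytes"], max_pages=max_pdf_pages)
    if isinstance(pdf_data, (bytes, bytearray)):
        return extract_pdf_text(pdf_data, max_pages=max_pdf_pages)
    # Any other type: hand it to the helper and let it decide.
    return extract_pdf_text(pdf_data, max_pages=max_pdf_pages)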