Bellok committed on
Commit ec38897 · 1 Parent(s): a2c1773

refactor(app): improve code formatting and add background ingestion status display


- Remove unused 'hashlib' import
- Update typing import by removing unused 'List'
- Standardize string quotes to double quotes for consistency
- Reformat long print statements into multi-line for readability
- Simplify thread creation arguments on single line
- Adjust long string concatenations and metadata formatting
- Add memory garbage collection after large ingestion batches
- Include background pack ingestion details in system stats output for better monitoring

This commit enhances code maintainability through consistent formatting and adds informative status reporting for ingestion processes.
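As a rough, hypothetical sketch of the pattern the bullets above describe (not the committed code itself; `api.ingest_document` and the batch size are assumptions), background ingestion with progress tracking and periodic garbage collection can look like this:

```python
import gc
import threading
import time

ingestion_status = {"running": False, "processed": 0, "total_docs": 0,
                    "failed": 0, "rate": 0.0, "eta": 0.0}

def background_ingest(api, docs):
    """Hypothetical worker: ingest documents one by one and publish progress."""
    ingestion_status.update(running=True, total_docs=len(docs))
    start = time.time()
    for i, doc in enumerate(docs, 1):
        try:
            api.ingest_document(doc)  # assumed method name; the real API may differ
        except Exception:
            ingestion_status["failed"] += 1
        ingestion_status["processed"] = i
        ingestion_status["rate"] = i / max(time.time() - start, 1e-6)
        ingestion_status["eta"] = (len(docs) - i) / max(ingestion_status["rate"], 1e-6)
        if i % 10000 == 0:
            gc.collect()  # free memory after large batches
    ingestion_status["running"] = False

# Launched without blocking startup, mirroring the single-line Thread(...) call in app.py:
# threading.Thread(target=background_ingest, args=(api, pack_docs), daemon=True).start()
```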

Files changed (48)
  1. .gitignore +1 -0
  2. app.py +45 -14
  3. compress_packs.py +17 -19
  4. final_fix.py +2 -4
  5. fix_theme.py +3 -3
  6. package-lock.json +15 -1
  7. package.json +2 -1
  8. packs/warbler-pack-npc-dialog/src/index.ts +27 -3
  9. packs/warbler-pack-npc-dialog/warbler-pack-core.jsonl +0 -2
  10. packs/warbler-pack-wisdom-scrolls/README.md +22 -4
  11. packs/warbler-pack-wisdom-scrolls/README_HF_DATASET.md +7 -1
  12. test_app.py +3 -1
  13. test_compressed_pack.py +3 -1
  14. test_embedding_integration.py +2 -2
  15. test_fixes.py +2 -2
  16. test_pack_loading.py +4 -1
  17. tests/test_new_mit_datasets.py +2 -4
  18. tests/test_pdf_ingestion.py +1 -4
  19. tests/test_rag_e2e.py +12 -22
  20. tests/test_retrieval_api.py +1 -1
  21. tsconfig.base.json +14 -0
  22. validate_new_transformers.py +2 -2
  23. verify_pack_ingestion.py +0 -1
  24. warbler_cda/api/cli.py +4 -6
  25. warbler_cda/api/service.py +9 -13
  26. warbler_cda/castle_graph.py +1 -1
  27. warbler_cda/conflict_detector.py +5 -3
  28. warbler_cda/embeddings/openai_provider.py +0 -1
  29. warbler_cda/embeddings/sentence_transformer_provider.py +3 -5
  30. warbler_cda/evaporation.py +4 -3
  31. warbler_cda/pack_loader.py +1 -3
  32. warbler_cda/pack_sync.py +2 -2
  33. warbler_cda/retrieval_api.py +11 -6
  34. warbler_cda/semantic_anchors.py +2 -4
  35. warbler_cda/stat7_entity.py +6 -11
  36. warbler_cda/stat7_experiments.py +9 -17
  37. warbler_cda/stat7_rag_bridge.py +5 -13
  38. warbler_cda/stat7_visualization.py +3 -7
  39. warbler_cda/summarization_ladder.py +2 -2
  40. warbler_cda/utils/hf_warbler_ingest.py +3 -4
  41. warbler_cda/utils/load_warbler_packs.py +1 -3
  42. warbler_cda/utils/transformers/base.py +49 -65
  43. warbler_cda/utils/transformers/edustories.py +14 -18
  44. warbler_cda/utils/transformers/enterprise.py +10 -14
  45. warbler_cda/utils/transformers/multi_character.py +37 -50
  46. warbler_cda/utils/transformers/novels.py +21 -29
  47. warbler_cda/utils/transformers/npc_dialogue.py +16 -8
  48. warbler_cda/utils/transformers/portuguese_education.py +21 -22
.gitignore CHANGED
@@ -661,3 +661,4 @@ node_modules/wrappy/LICENSE
  node_modules/wrappy/package.json
  node_modules/wrappy/README.md
  node_modules/wrappy/wrappy.js
+ TODO.md
app.py CHANGED
@@ -8,13 +8,12 @@ import time
  import os
  import threading
  import gradio as gr
- import hashlib
  import spaces
  from pathlib import Path
- from typing import List, Tuple, Optional, Dict
+ from typing import Tuple, Optional, Dict

  # Set TOKENIZERS_PARALLELISM to avoid warnings with SentenceTransformers
- os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"


  # Global variables for background ingestion tracking
@@ -76,20 +75,25 @@ def background_ingest_packs(api, pack_docs, pack_manager):
  ingestion_status["rate"] = rate
  ingestion_status["eta"] = eta

- print(f"[PROGRESS] {processed}/{total_docs} documents ingested "
-       f"({processed/total_docs*100:.1f}%) - "
-       f"{rate:.1f} docs/sec - ETA: {eta/60:.1f} min")
+ print(
+     f"[PROGRESS] {processed}/{total_docs} documents ingested "
+     f"({processed/total_docs*100:.1f}%) - "
+     f"{rate:.1f} docs/sec - ETA: {eta/60:.1f} min"
+ )

  # Force garbage collection after large batches to free memory
  if processed % 10000 == 0:
      import gc
+
      gc.collect()

  packs_loaded = processed
  pack_manager.mark_packs_ingested(1, packs_loaded)
  total_time = time.time() - start_time
- print(f"[OK] Loaded {packs_loaded} documents from Warbler packs "
-       f"({failed} failed) in {total_time:.1f} seconds")
+ print(
+     f"[OK] Loaded {packs_loaded} documents from Warbler packs "
+     f"({failed} failed) in {total_time:.1f} seconds"
+ )

  # Mark ingestion complete
  ingestion_status["running"] = False
@@ -259,9 +263,7 @@ if WARBLER_AVAILABLE:
  if pack_docs and pack_manager.should_ingest_packs(api, len(pack_docs)):
      # Start background ingestion
      ingestion_thread = threading.Thread(
-         target=background_ingest_packs,
-         args=(api, pack_docs, pack_manager),
-         daemon=True
+         target=background_ingest_packs, args=(api, pack_docs, pack_manager), daemon=True
      )
      ingestion_thread.start()
      packs_loaded = 0  # Will be updated asynchronously
@@ -338,7 +340,7 @@ def query_warbler(
  elapsed_ms = (time.time() - start_time) * 1000

  # Format results
- results_text = f"# Query Results\n\n"
+ results_text = "# Query Results\n\n"
  results_text += f"**Query:** {query_text}\n\n"
  results_text += (
      f"**Mode:** {'Hybrid (Semantic + STAT7)' if use_hybrid else 'Semantic Only'}\n\n"
@@ -361,7 +363,7 @@
  results_text += f"**Type:** {result.content_type}\n\n"

  if result.metadata:
-     results_text += f"**Metadata:**\n"
+     results_text += "**Metadata:**\n"
      for key, value in result.metadata.items():
          if key != "stat7":  # Skip complex STAT7 object
              results_text += f"- {key}: {value}\n"
@@ -428,7 +430,7 @@ def get_system_stats() -> str:
  try:
      metrics = api.get_retrieval_metrics()

-     stats = f"# System Statistics\n\n"
+     stats = "# System Statistics\n\n"
      stats += f"**Total Documents:** {metrics['context_store_size']}\n\n"
      stats += f"**Total Queries:** {metrics['retrieval_metrics']['total_queries']}\n\n"
      stats += f"**Cache Hit Rate:** {metrics['cache_performance']['hit_rate']:.1%}\n\n"
@@ -440,6 +442,35 @@ def get_system_stats() -> str:
  for quality, count in metrics["retrieval_metrics"]["quality_distribution"].items():
      stats += f"- {quality.capitalize()}: {count}\n"

+ # Add ingestion status information
+ global ingestion_status
+ stats += "\n## Background Pack Ingestion\n\n"
+
+ if ingestion_status["running"]:
+     # Currently ingesting
+     progress_percent = (ingestion_status["processed"] / ingestion_status["total_docs"] * 100) if ingestion_status["total_docs"] > 0 else 0
+     eta_minutes = ingestion_status["eta"] / 60 if ingestion_status["eta"] > 0 else 0
+
+     stats += "**Status:** 🟢 **ACTIVE** - Ingesting documents...\n\n"
+     stats += "```\n"
+     stats += f"Progress: {ingestion_status['processed']}/{ingestion_status['total_docs']} documents\n"
+     stats += f"Complete: {progress_percent:.1f}%\n"
+     stats += f"Rate: {ingestion_status['rate']:.1f} docs/sec\n"
+     stats += f"ETA: {eta_minutes:.1f} minutes\n"
+     if ingestion_status['failed'] > 0:
+         stats += f"Failed: {ingestion_status['failed']} documents\n"
+     stats += "```\n\n"
+ elif ingestion_status["total_docs"] > 0:
+     # Completed ingestion (has totals but not running)
+     stats += "**Status:** ✅ **COMPLETE**\n\n"
+     stats += f"**Last Ingestion:** Processed {ingestion_status['processed']} documents"
+     if ingestion_status['failed'] > 0:
+         stats += f" ({ingestion_status['failed']} failed)"
+     stats += "\n\n"
+ else:
+     # No background ingestion detected
+     stats += "**Status:** ⚪ **IDLE** - No background ingestion active\n\n"
+
  return stats

  except Exception as e:
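For quick reference, a condensed sketch of how the status block added above can be rendered, assuming the same `ingestion_status` keys used in this diff (`running`, `processed`, `total_docs`, `rate`, `eta`, `failed`); the helper name is hypothetical:

```python
def format_ingestion_status(ingestion_status: dict) -> str:
    """Condensed restatement of the stats section added to get_system_stats()."""
    out = "\n## Background Pack Ingestion\n\n"
    if ingestion_status["running"]:
        total = ingestion_status["total_docs"]
        pct = ingestion_status["processed"] / total * 100 if total else 0.0
        out += "**Status:** ACTIVE\n\n"
        out += f"Progress: {ingestion_status['processed']}/{total} ({pct:.1f}%)\n"
        out += f"Rate: {ingestion_status['rate']:.1f} docs/sec, ETA: {ingestion_status['eta'] / 60:.1f} min\n"
        if ingestion_status["failed"]:
            out += f"Failed: {ingestion_status['failed']} documents\n"
    elif ingestion_status["total_docs"] > 0:
        out += f"**Status:** COMPLETE, processed {ingestion_status['processed']} documents\n"
    else:
        out += "**Status:** IDLE, no background ingestion active\n"
    return out
```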
compress_packs.py CHANGED
@@ -7,7 +7,6 @@ compressed proto-thoughts generated by the evaporation engine.
  """

  import json
- import os
  import sys
  from pathlib import Path
  from typing import Dict, Any, List
@@ -22,7 +21,7 @@ from warbler_cda.evaporation import EvaporationEngine, CloudStore
  def load_jsonl_file(filepath: str) -> List[Dict[str, Any]]:
      """Load a JSONL file and return list of documents."""
      documents = []
-     with open(filepath, 'r', encoding='utf-8') as f:
+     with open(filepath, "r", encoding="utf-8") as f:
          for line in f:
              line = line.strip()
              if line:
@@ -32,9 +31,9 @@ def load_jsonl_file(filepath: str) -> List[Dict[str, Any]]:
  def save_jsonl_file(filepath: str, documents: List[Dict[str, Any]]) -> None:
      """Save list of documents to a JSONL file."""
-     with open(filepath, 'w', encoding='utf-8') as f:
+     with open(filepath, "w", encoding="utf-8") as f:
          for doc in documents:
-             f.write(json.dumps(doc, ensure_ascii=False) + '\n')
+             f.write(json.dumps(doc, ensure_ascii=False) + "\n")


  def compress_pack(pack_path: str, output_suffix: str = "_compressed") -> None:
@@ -70,39 +69,38 @@ def compress_pack(pack_path: str, output_suffix: str = "_compressed") -> None:
  compressed_documents = []

  for doc in documents:
-     if 'content' not in doc:
-         print(f"Warning: Document missing 'content' field, skipping")
+     if "content" not in doc:
+         print("Warning: Document missing 'content' field, skipping")
          continue

-     content = doc['content']
+     content = doc["content"]
      if not content or not isinstance(content, str):
-         print(f"Warning: Empty or invalid content, skipping")
+         print("Warning: Empty or invalid content, skipping")
          continue

      try:
          # Create a fragment from the document content
-         fragment = {
-             'id': doc.get('content_id', f'doc_{compressed_docs}'),
-             'text': content
-         }
+         fragment = {"id": doc.get("content_id", f"doc_{compressed_docs}"), "text": content}

          # Create glyph from the single fragment
-         glyph = melt_layer.retire_cluster({'fragments': [fragment]})
+         melt_layer.retire_cluster({"fragments": [fragment]})

          # Evaporate to get proto-thought
          mist_lines = evaporation_engine.evaporate(limit=1)

          if mist_lines:
-             proto_thought = mist_lines[0]['proto_thought']
+             proto_thought = mist_lines[0]["proto_thought"]
              # Replace content with compressed proto-thought
              compressed_doc = doc.copy()
-             compressed_doc['content'] = proto_thought
-             compressed_doc['original_content_length'] = len(content)
-             compressed_doc['compressed_content_length'] = len(proto_thought)
+             compressed_doc["content"] = proto_thought
+             compressed_doc["original_content_length"] = len(content)
+             compressed_doc["compressed_content_length"] = len(proto_thought)
              compressed_documents.append(compressed_doc)
              compressed_docs += 1
          else:
-             print(f"Warning: Failed to evaporate glyph for document {doc.get('content_id', 'unknown')}")
+             print(
+                 f"Warning: Failed to evaporate glyph for document {doc.get('content_id', 'unknown')}"
+             )
              # Keep original document if evaporation fails
              compressed_documents.append(doc)
@@ -116,7 +114,7 @@ def compress_pack(pack_path: str, output_suffix: str = "_compressed") -> None:
  save_jsonl_file(str(output_file), compressed_documents)
  print(f"Saved compressed file: {output_file}")

- print(f"Compression complete:")
+ print("Compression complete:")
  print(f" Total documents processed: {total_docs}")
  print(f" Documents compressed: {compressed_docs}")
  if total_docs > 0:
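A hedged usage sketch for the helpers diffed above, assuming `compress_packs.py` is importable from the repository root; the pack path is hypothetical and the output naming follows the `_compressed` suffix default of `compress_pack()`:

```python
from compress_packs import load_jsonl_file, save_jsonl_file, compress_pack

# Hypothetical pack location; point this at a real JSONL pack in your checkout.
docs = load_jsonl_file("packs/example-pack/warbler-pack-core.jsonl")
print(f"Loaded {len(docs)} documents")

# Round-trip unchanged, exercising the double-quoted open()/write path after the refactor.
save_jsonl_file("packs/example-pack/warbler-pack-core.copy.jsonl", docs)

# Compress a pack; the output file name gets the "_compressed" suffix by default.
compress_pack("packs/example-pack/warbler-pack-core.jsonl")
```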
final_fix.py CHANGED
@@ -2,27 +2,25 @@
  """Final fixes for stat7_entity.py and verify the fixes work"""

  # Fix the stat7_entity.py bug
- with open('warbler_cda/stat7_entity.py', 'r', encoding='utf-8') as f:
+ with open("warbler_cda/stat7_entity.py", "r", encoding="utf-8") as f:
      content = f.read()

  # Fix the description reference bug
  content = content.replace('"description": description,', '"description": self.description,')

  # Write back the fixed content
- with open('warbler_cda/stat7_entity.py', 'w', encoding='utf-8') as f:
+ with open("warbler_cda/stat7_entity.py", "w", encoding="utf-8") as f:
      f.write(content)

  print("Fixed stat7_entity.py description bug")

  # Test import to make sure everything works
  try:
-     import warbler_cda.stat7_entity
      print("✅ stat7_entity imports successfully")
  except Exception as e:
      print(f"❌ stat7_entity import failed: {e}")

  try:
-     import warbler_cda.stat7_rag_bridge
      print("✅ stat7_rag_bridge imports successfully")
  except Exception as e:
      print(f"❌ stat7_rag_bridge import failed: {e}")
fix_theme.py CHANGED
@@ -1,7 +1,7 @@
  #!/usr/bin/env python3
  """Fix the theme issue in app.py"""

- with open('app.py', 'r', encoding='utf-8') as f:
+ with open("app.py", "r", encoding="utf-8") as f:
      content = f.read()

  old_line = 'with gr.Blocks(title="Warbler CDA - RAG System Demo", theme=gr.themes.Soft()) as demo:'
@@ -9,7 +9,7 @@ new_line = 'with gr.Blocks(title="Warbler CDA - RAG System Demo") as demo:'

  content = content.replace(old_line, new_line)

- with open('app.py', 'w', encoding='utf-8') as f:
+ with open("app.py", "w", encoding="utf-8") as f:
      f.write(content)

- print('Fixed theme issue')
+ print("Fixed theme issue")
package-lock.json CHANGED
@@ -9,7 +9,8 @@
  "version": "1.0.0",
  "license": "ISC",
  "dependencies": {
-     "express": "^5.1.0"
+     "express": "^5.1.0",
+     "typescript": "^5.9.3"
  }
  },
  "node_modules/accepts": {
@@ -819,6 +820,19 @@
      "node": ">= 0.6"
  }
  },
+ "node_modules/typescript": {
+     "version": "5.9.3",
+     "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz",
+     "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==",
+     "license": "Apache-2.0",
+     "bin": {
+         "tsc": "bin/tsc",
+         "tsserver": "bin/tsserver"
+     },
+     "engines": {
+         "node": ">=14.17"
+     }
+ },
  "node_modules/unpipe": {
      "version": "1.0.0",
      "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz",
package.json CHANGED
@@ -13,6 +13,7 @@
  "author": "",
  "license": "ISC",
  "dependencies": {
-     "express": "^5.1.0"
+     "express": "^5.1.0",
+     "typescript": "^5.9.3"
  }
  }
packs/warbler-pack-npc-dialog/src/index.ts CHANGED
@@ -1,12 +1,36 @@
  /**
   * Warbler NPC Dialog Pack - Essential conversation templates
-  *
+  *
   * Re-exports templates for dynamic loading in the Warbler conversation system
   */

- import { WarblerTemplate, WarblerPackMetadata } from 'warbler-npc';
  import templatesData from '../pack/templates.json';

+ // Type definitions for Warbler pack types
+ export interface WarblerTemplate {
+   id: string;
+   version: string;
+   title: string;
+   description: string;
+   content: string;
+   requiredSlots: Array<{
+     name: string;
+     type: 'string' | 'number' | 'boolean' | 'object';
+     required: boolean;
+     description?: string;
+   }>;
+   tags: string[];
+   maxLength?: number;
+ }
+
+ export interface WarblerPackMetadata {
+   name: string;
+   version: string;
+   description: string;
+   author: string;
+   templates: WarblerTemplate[];
+ }
+
  // Transform JSON data to proper WarblerTemplate objects
  export const templates: WarblerTemplate[] = templatesData.templates.map(template => ({
    ...template,
@@ -48,4 +72,4 @@ export default {
    tradeInquiryWelcome,
    generalConversation,
    unknownResponse
- };
+ };
packs/warbler-pack-npc-dialog/warbler-pack-core.jsonl DELETED
@@ -1,2 +0,0 @@
- "packInfo"
- "templates"
packs/warbler-pack-wisdom-scrolls/README.md CHANGED
@@ -1,6 +1,6 @@
  # 🎭 Warbler Pack: Wisdom Scrolls

- **Dynamic wisdom generation templates for the Secret Art of the Living Dev**
+ ## **Dynamic wisdom generation templates for the Secret Art of the Living Dev**

  This Warbler content pack provides mystical wisdom generation templates that create fresh quotes in the authentic style of the Sacred Scrolls, breathing new life into the ancient wisdom while maintaining the sacred atmosphere of the Cheekdom.

@@ -23,32 +23,44 @@ scripts/lda-quote --warbler
  ## Template Categories

  ### 🧙‍♂️ Development Wisdom (`wisdom_development_insight`)
+
  Generates profound insights about development practices using philosophical structure:
+
  - **Pattern**: `{action} is not {misconception}; it's {deeper_truth}. Like {metaphor}, but for {domain}.`
  - **Example**: *"Refactoring is not admitting failure; it's evolution of understanding. Like pruning a garden, but for algorithms."*

- ### 📜 Sacred Attribution (`scroll_attribution_template`)
+ ### 📜 Sacred Attribution (`scroll_attribution_template`)
+
  Creates mystical attribution in the style of ancient texts:
+
  - **Pattern**: `— {author_title}, {source_title}, {volume_designation}`
  - **Example**: *"— The Great Validator, Secret Art of the Living Dev, Vol. III"*

  ### 🐛 Debugging Proverbs (`debugging_proverb_template`)
+
  Humorous debugging wisdom using classical proverb structure:
+
  - **Pattern**: `The {problem_type} you can't {action_verb} is like the {creature} under the {location}—{reality_statement}.`
  - **Example**: *"The bug you can't reproduce is like the monster under the bed—real, but only when no one's looking."*

  ### 📖 Documentation Philosophy (`documentation_philosophy`)
+
  Profound insights about documentation practices:
+
  - **Pattern**: `Documentation is not {what_its_not}; it's {what_it_really_is}.`
  - **Example**: *"Documentation is not what you write for others; it's what you write for the you of six months from now."*

  ### 🏰 Cheekdom Lore (`cheekdom_lore_template`)
+
  Epic lore about the Cheekdom and its sacred mission:
+
  - **Pattern**: `In the {realm} of {domain}, the {guardian_class} stands between {civilization} and {threat_type}.`
  - **Example**: *"In the kingdom of Software Development, the Buttwarden stands between comfortable development and runtime catastrophe."*

  ### 🍑 Buttsafe Wisdom (`buttsafe_wisdom`)
+
  Sacred wisdom about ergonomic development practices:
+
  - **Pattern**: `Every developer's {body_part} is {sacred_designation}. {protection_action} with {protection_means}.`
  - **Example**: *"Every developer's posterior is sacred. Protect it with ergonomic wisdom and comfortable seating."*

@@ -138,6 +150,7 @@ scripts/weekly-wisdom-oracle.sh stats
  All generated quotes maintain the Sacred Code Standards:

  ### ✅ **Buttsafe Certified Requirements**
+
  - Professional workplace appropriateness
  - Dry, witty humor style (never offensive)
  - Development-focused insights
@@ -145,12 +158,14 @@ All generated quotes maintain the Sacred Code Standards:
  - Maximum length: 200 characters per template

  ### 🎭 **Authenticity Standards**
+
  - Maintains mystical atmosphere of original quotes
  - Uses consistent Sacred Art terminology
  - Preserves philosophical depth and wisdom
  - Integrates seamlessly with static quote database

  ### 📊 **Quality Assurance**
+
  - All templates validated for structure and content
  - Slot combinations tested for coherent output
  - Generated quotes pass content filtering
@@ -160,7 +175,7 @@ All generated quotes maintain the Sacred Code Standards:

  The Wisdom Scrolls pack integrates with the Living Dev Agent ecosystem through multiple layers:

- ```
+ ```none
  ┌─────────────────────────────────────────────────┐
  │ Weekly Oracle Workflow │
  │ (GitHub Actions Automation) │
@@ -185,6 +200,7 @@ The Wisdom Scrolls pack integrates with the Living Dev Agent ecosystem through m
  ## Versioning and Evolution

  ### Current Version: 1.0.0
+
  - ✅ Six core template categories
  - ✅ Complete slot value libraries
  - ✅ Integration with Warbler Quote Engine
@@ -192,12 +208,14 @@ The Wisdom Scrolls pack integrates with the Living Dev Agent ecosystem through m
  - ✅ CLI integration

  ### Planned Enhancements (v1.1.0)
+
  - 🔄 Additional template categories (CI/CD wisdom, workflow philosophy)
  - 🔄 Context-aware slot selection
  - 🔄 Machine learning-enhanced quote quality
  - 🔄 Cross-reference generation with existing quotes

  ### Future Vision (v2.0.0)
+
  - 🌟 Dynamic template creation based on repository context
  - 🌟 Personalized wisdom generation
  - 🌟 Integration with Git commit analysis
@@ -228,7 +246,7 @@ scripts/lda-quote --warbler --stats

  ## Sacred Mission

- *"The Wisdom Scrolls pack transforms static sacred texts into living oracles, ensuring that fresh insights flow continuously through the channels of development wisdom while preserving the mystical essence of the original teachings."*
+ -*"The Wisdom Scrolls pack transforms static sacred texts into living oracles, ensuring that fresh insights flow continuously through the channels of development wisdom while preserving the mystical essence of the original teachings."*

  — **Pack Philosophy**, Living Oracle Manifesto, Sacred Design Document

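To make the slot mechanics in the README above concrete, a small illustrative snippet (not part of the pack) that fills the development-wisdom pattern with the slot values from the README's own example:

```python
pattern = "{action} is not {misconception}; it's {deeper_truth}. Like {metaphor}, but for {domain}."

slots = {
    "action": "Refactoring",
    "misconception": "admitting failure",
    "deeper_truth": "evolution of understanding",
    "metaphor": "pruning a garden",
    "domain": "algorithms",
}

# Prints the README's example quote: "Refactoring is not admitting failure; ..."
print(pattern.format(**slots))
```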
packs/warbler-pack-wisdom-scrolls/README_HF_DATASET.md CHANGED
@@ -34,7 +34,7 @@ This dataset contains mystical wisdom generation templates that create fresh quo

  ## Dataset Structure

- ```
+ ```py
  {
      "template_id": str,
      "category": str,
@@ -49,26 +49,32 @@ This dataset contains mystical wisdom generation templates that create fresh quo
  ## Template Categories

  ### 🧙‍♂️ Development Wisdom
+
  Generates profound insights about development practices using philosophical structure.
  *Example*: "Refactoring is not admitting failure; it's evolution of understanding. Like pruning a garden, but for algorithms."

  ### 📜 Sacred Attribution
+
  Creates mystical attribution in the style of ancient texts.
  *Example*: "— The Great Validator, Secret Art of the Living Dev, Vol. III"

  ### 🐛 Debugging Proverbs
+
  Humorous debugging wisdom using classical proverb structure.
  *Example*: "The bug you can't reproduce is like the monster under the bed—real, but only when no one's looking."

  ### 📖 Documentation Philosophy
+
  Profound insights about documentation practices.
  *Example*: "Documentation is not what you write for others; it's what you write for the you of six months from now."

  ### 🏰 Cheekdom Lore
+
  Epic lore about the Cheekdom and its sacred mission.
  *Example*: "In the kingdom of Software Development, the Buttwarden stands between comfortable development and runtime catastrophe."

  ### 🍑 Buttsafe Wisdom
+
  Sacred wisdom about ergonomic development practices.
  *Example*: "Every developer's posterior is sacred. Protect it with ergonomic wisdom and comfortable seating."

test_app.py CHANGED
@@ -4,7 +4,8 @@ Test script to debug app.py initialization issues
  """

  import os
- os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"

  try:
      from warbler_cda import (
@@ -54,6 +55,7 @@ if WARBLER_AVAILABLE:
  except Exception as e:
      print(f"[ERROR] Failed to initialize Warbler CDA: {e}")
      import traceback
+
      traceback.print_exc()
      api = None

test_compressed_pack.py CHANGED
@@ -11,6 +11,7 @@ sys.path.insert(0, str(Path(__file__).parent))

  from warbler_cda.pack_loader import PackLoader

+
  def test_compressed_pack_loading():
      """Test loading the compressed novels pack"""
      packs_dir = Path("packs")
@@ -47,7 +48,7 @@ def test_compressed_pack_loading():
      print()

      # Check that content is compressed (should be short proto-thoughts)
-     avg_content_length = sum(len(doc['content']) for doc in documents) / len(documents)
+     avg_content_length = sum(len(doc["content"]) for doc in documents) / len(documents)
      print(f"Average content length: {avg_content_length:.1f} characters")

      if avg_content_length > 200:  # Original was ~1100, compressed should be much shorter
@@ -57,6 +58,7 @@ def test_compressed_pack_loading():
      print("✓ Compressed pack loading test passed!")
      return True

+
  if __name__ == "__main__":
      success = test_compressed_pack_loading()
      sys.exit(0 if success else 1)
test_embedding_integration.py CHANGED
@@ -116,10 +116,10 @@ def test_embedding_cache():

      text = "Cache test document"

-     emb1 = provider.embed_text(text)
+     provider.embed_text(text)
      hits_before = provider.cache_stats["hits"]

-     emb2 = provider.embed_text(text)
+     provider.embed_text(text)
      hits_after = provider.cache_stats["hits"]

      if hits_after > hits_before:
test_fixes.py CHANGED
@@ -17,7 +17,7 @@ def test_load_warbler_packs():

      print("Testing WarblerPackLoader...")
      try:
-         loader = WarblerPackLoader()
+         WarblerPackLoader()
          print("✓ WarblerPackLoader instantiated successfully")

          print("✓ JSONL parsing fix applied")
@@ -36,7 +36,7 @@ def test_sentence_transformer():
      print("\nTesting SentenceTransformerEmbeddingProvider...")
      try:
          config = {"model_name": "all-MiniLM-L6-v2", "batch_size": 32}
-         provider = SentenceTransformerEmbeddingProvider(config)
+         SentenceTransformerEmbeddingProvider(config)
          print("✓ Provider initialized with proper type annotations")
          return True
      except Exception as e:
test_pack_loading.py CHANGED
@@ -4,7 +4,8 @@ Test pack loading to debug app.py issues
  """

  import os
- os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"

  try:
      from warbler_cda import (
@@ -78,11 +79,13 @@ if WARBLER_AVAILABLE:
  except Exception as e:
      print(f"[ERROR] Pack loading failed: {e}")
      import traceback
+
      traceback.print_exc()

  except Exception as e:
      print(f"[ERROR] Failed to initialize Warbler CDA: {e}")
      import traceback
+
      traceback.print_exc()
      api = None

tests/test_new_mit_datasets.py CHANGED
@@ -22,11 +22,9 @@ from warbler_cda.utils.transformers import (
  WarblerPackBuilder,
  )
  import pytest
- import json
  import sys
  from pathlib import Path
- from unittest.mock import Mock, patch, MagicMock
- from typing import Dict, List, Any
+ from unittest.mock import patch, MagicMock

  sys.path.insert(0, str(Path(__file__).parent.parent))

@@ -517,7 +515,7 @@ class TestNewDatasetsIntegrationWithRetrieval:
      """Test that packs can be created from new datasets"""
      builder = WarblerPackBuilder()

-     test_docs = [
+     [
          {
              "content_id": f"test_{i}",
              "content": f"Test content {i}",
tests/test_pdf_ingestion.py CHANGED
@@ -13,12 +13,9 @@ from warbler_cda.utils.transformers import (
  PromptReportTransformer,
  ManualsTransformer,
  )
- import pytest
- import json
  import sys
  from pathlib import Path
- from unittest.mock import Mock, patch, MagicMock
- from typing import Dict, List, Any
+ from unittest.mock import patch, MagicMock

  sys.path.insert(0, str(Path(__file__).parent.parent))

tests/test_rag_e2e.py CHANGED
@@ -38,11 +38,9 @@ class TestEndToEndRAG:
      print("RAG SYSTEM METRICS")
      print("=" * 60)
      print(f"Embedding Provider: {self.embedding_provider.provider_id}")
-     print(
-         f"Embedding Dimension: {self.embedding_provider.get_dimension()}")
+     print(f"Embedding Dimension: {self.embedding_provider.get_dimension()}")
      print(f"Documents in Store: {metrics['context_store_size']}")
-     print(
-         f"Total Queries: {metrics['retrieval_metrics']['total_queries']}")
+     print(f"Total Queries: {metrics['retrieval_metrics']['total_queries']}")
      print("=" * 60)

  def test_01_embedding_generation(self):
@@ -128,8 +126,7 @@ class TestEndToEndRAG:

      print(f"[PASS] Retrieved {len(assembly.results)} relevant documents")
      for i, result in enumerate(assembly.results, 1):
-         print(
-             f" {i}. [{result.relevance_score:.4f}] {result.content[:50]}")
+         print(f" {i}. [{result.relevance_score:.4f}] {result.content[:50]}")

  def test_05_max_results_respected(self):
      """Test 05: Verify max_results parameter is respected."""
@@ -149,10 +146,7 @@ class TestEndToEndRAG:
      assembly = self.api.retrieve_context(query)

      assert len(assembly.results) <= 3
-     print(
-         "[PASS] Query returned:"
-         f"{len(assembly.results)} results",
-         "(max 3 requested)")
+     print("[PASS] Query returned:" f"{len(assembly.results)} results", "(max 3 requested)")

  def test_06_confidence_threshold(self):
      """Test 06: Verify confidence threshold filtering."""
@@ -186,12 +180,8 @@ class TestEndToEndRAG:
      strict_results = self.api.retrieve_context(query_strict)
      loose_results = self.api.retrieve_context(query_loose)

-     print(
-         "[PASS] Strict threshold (0.8):",
-         f"{len(strict_results.results)} results")
-     print(
-         "[PASS] Loose threshold (0.2):",
-         f"{len(loose_results.results)} results")
+     print("[PASS] Strict threshold (0.8):", f"{len(strict_results.results)} results")
+     print("[PASS] Loose threshold (0.2):", f"{len(loose_results.results)} results")

      assert len(strict_results.results) <= len(loose_results.results)

@@ -207,8 +197,7 @@ class TestEndToEndRAG:
      provider = SentenceTransformerEmbeddingProvider()

      hybrid_api = RetrievalAPI(
-         embedding_provider=provider, config={
-             "enable_stat7_hybrid": True}
+         embedding_provider=provider, config={"enable_stat7_hybrid": True}
      )
  except ImportError:
      pytest.skip("SentenceTransformer not installed for STAT7 testing")
@@ -242,7 +231,8 @@ class TestEndToEndRAG:
      print(
          "[PASS] Result:",
          f"semantic={result.semantic_similarity:.4f}",
-         f"STAT7={result.stat7_resonance:.4f}")
+         f"STAT7={result.stat7_resonance:.4f}",
+     )

  def test_08_temporal_retrieval(self):
      """Test 08: Verify temporal retrieval works."""
@@ -268,8 +258,7 @@ class TestEndToEndRAG:
      assembly = self.api.retrieve_context(query)

      assert assembly is not None
-     print(
-         f"[PASS] Temporal query retrieved {len(assembly.results)} results")
+     print(f"[PASS] Temporal query retrieved {len(assembly.results)} results")

  def test_09_retrieval_metrics(self):
      """Test 09: Verify retrieval metrics are tracked."""
@@ -294,7 +283,8 @@ class TestEndToEndRAG:

      print(
          f"[PASS] Metrics tracked: {
-             metrics['retrieval_metrics']['total_queries']} queries")
+             metrics['retrieval_metrics']['total_queries']} queries"
+     )

  def test_10_full_rag_pipeline(self):
      """Test 10: Complete RAG pipeline end-to-end."""
tests/test_retrieval_api.py CHANGED
@@ -331,7 +331,7 @@ class TestRetrievalMetrics:
      max_results=5,
  )

- initial_metrics = self.api.get_retrieval_metrics()
+ self.api.get_retrieval_metrics()

  self.api.retrieve_context(query)
  self.api.retrieve_context(query)
tsconfig.base.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "compilerOptions": {
+     "target": "ES2020",
+     "module": "commonjs",
+     "esModuleInterop": true,
+     "allowSyntheticDefaultImports": true,
+     "strict": true,
+     "skipLibCheck": true,
+     "forceConsistentCasingInFileNames": true,
+     "declaration": true,
+     "moduleResolution": "node",
+     "resolveJsonModule": true
+   }
+ }
validate_new_transformers.py CHANGED
@@ -110,9 +110,9 @@ def main():
      chunks = ingestor._chunk_text(test_text, chunk_size=100)
      print(f" ✓ Successfully chunked text into {len(chunks)} chunks")
      if all(isinstance(chunk, str) for chunk in chunks):
-         print(f" ✓ All chunks are strings")
+         print(" ✓ All chunks are strings")
      else:
-         print(f" ✗ Some chunks are not strings")
+         print(" ✗ Some chunks are not strings")
          all_good = False
  except Exception as e:
      print(f" ✗ _chunk_text failed: {e}")
verify_pack_ingestion.py CHANGED
@@ -7,7 +7,6 @@ Run this locally before deploying to HuggingFace.
  """

  import sys
- from pathlib import Path
  import logging

  # Setup logging
warbler_cda/api/cli.py CHANGED
@@ -11,10 +11,8 @@ import json
  import requests
  import time
  from typing import List, Dict, Any
- from concurrent.futures import ThreadPoolExecutor, as_completed
  from datetime import datetime
  import logging
- from pathlib import Path

  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)
@@ -178,7 +176,7 @@ def query(
  # Show narrative analysis
  if result.get("narrative_analysis"):
      narr = result["narrative_analysis"]
-     click.echo(f"\nNarrative Analysis:")
+     click.echo("\nNarrative Analysis:")
      click.echo(f" Coherence Score: {narr.get('coherence_score', 0):.3f}")
      click.echo(f" Narrative Threads: {narr.get('narrative_threads', 0)}")
      click.echo(f" Analysis: {narr.get('analysis')}")
@@ -249,7 +247,7 @@ def bulk(ctx, num_queries, concurrency, semantic, hybrid, json_output):

  result = client.bulk_query(query_data, concurrency=concurrency, include_narrative=True)

- elapsed = time.time() - start_time
+ time.time() - start_time

  if json_output:
      click.echo(json.dumps(result, indent=2))
@@ -270,7 +268,7 @@ def bulk(ctx, num_queries, concurrency, semantic, hybrid, json_output):
  # Narrative analysis for entire batch
  if result.get("batch_narrative_analysis"):
      narr = result["batch_narrative_analysis"]
-     click.echo(f"\nBatch Narrative Analysis:")
+     click.echo("\nBatch Narrative Analysis:")
      click.echo(f" Coherence Score: {narr.get('coherence_score', 0):.3f}")
      click.echo(
          f" Total Narrative Threads: {
@@ -282,7 +280,7 @@ def bulk(ctx, num_queries, concurrency, semantic, hybrid, json_output):
      click.echo(f" Analysis: {narr.get('analysis')}")

  # Per-query summary
- click.echo(f"\nPer-Query Summary (first 3):")
+ click.echo("\nPer-Query Summary (first 3):")
  for res in result.get("results", [])[:3]:
      click.echo(
          f" {
warbler_cda/api/service.py CHANGED
@@ -495,8 +495,10 @@ async def startup_event():
  @app.get("/health", response_model=HealthResponse)
  async def health_check(service: RetrievalService = Depends(get_retrieval_service)):
      """Health check endpoint"""
-     api = service.init_api()
-     uptime = (datetime.now() - datetime.fromisoformat(service.metrics["start_time"])).total_seconds()
+     service.init_api()
+     uptime = (
+         datetime.now() - datetime.fromisoformat(service.metrics["start_time"])
+     ).total_seconds()

      return HealthResponse(
          status="healthy",
@@ -511,8 +513,7 @@ async def health_check(service: RetrievalService = Depends(get_retrieval_service

  @app.post("/query", response_model=QueryResult)
  async def single_query(
-     request: QueryRequest,
-     service: RetrievalService = Depends(get_retrieval_service)
+     request: QueryRequest, service: RetrievalService = Depends(get_retrieval_service)
  ):
      """Execute a single retrieval query"""
      api = service.init_api()
@@ -579,10 +580,7 @@ async def single_query(

      # Bob the Skeptic: Verify suspiciously perfect results
      bob_status, bob_verification_log = await _bob_skeptic_filter(
-         narrative_analysis=narrative_analysis,
-         results_data=results_data,
-         query=query,
-         api=api
+         narrative_analysis=narrative_analysis, results_data=results_data, query=query, api=api
      )

      return QueryResult(
@@ -611,11 +609,10 @@ async def single_query(

  @app.post("/bulk_query")
  async def bulk_concurrent_queries(
-     request: BulkQueryRequest,
-     service: RetrievalService = Depends(get_retrieval_service)
+     request: BulkQueryRequest, service: RetrievalService = Depends(get_retrieval_service)
  ):
      """Execute multiple queries concurrently"""
-     api = service.init_api()
+     service.init_api()
      logger.info(
          f"Executing {len(request.queries)} queries with concurrency level {request.concurrency_level}"
      )
@@ -669,8 +666,7 @@ async def bulk_concurrent_queries(

  @app.post("/ingest")
  async def ingest_documents(
-     request: Dict[str, Any],
-     service: RetrievalService = Depends(get_retrieval_service)
+     request: Dict[str, Any], service: RetrievalService = Depends(get_retrieval_service)
  ):
      """Ingest documents into the RetrievalAPI"""
      api = service.init_api()
warbler_cda/castle_graph.py CHANGED
@@ -1,5 +1,5 @@
  from __future__ import annotations
- from typing import List, Dict, Any, Tuple, Optional, Set
+ from typing import List, Dict, Any, Optional, Set
  import time
  import re
  import math
warbler_cda/conflict_detector.py CHANGED
@@ -5,10 +5,10 @@ Detects conflicting or contradictory statements using semantic similarity and
 logical opposition analysis for the Cognitive Geo-Thermal Lore Engine v0.3.
 """

-from typing import List, Dict, Any, Optional, Tuple, Set
+from typing import List, Dict, Any, Optional, Set
 import time
 import hashlib
-from dataclasses import dataclass, asdict
+from dataclasses import dataclass
 from enum import Enum


@@ -580,7 +580,9 @@ class ConflictDetector:

     def _generate_conflict_id(self, conflict: ConflictEvidence) -> str:
         """Generate unique ID for a conflict."""
-        content = f"{conflict.statement_a_id}_{conflict.statement_b_id}_{conflict.conflict_type.value}"
+        content = (
+            f"{conflict.statement_a_id}_{conflict.statement_b_id}_{conflict.conflict_type.value}"
+        )
         return hashlib.md5(content.encode()).hexdigest()[:12]

     def _generate_conflict_recommendation(
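The conflict ID above is an MD5 digest of the two statement IDs plus the conflict type, truncated to 12 characters. A small standalone sketch of the same idea, with illustrative inputs:

```python
import hashlib


def conflict_id(statement_a_id: str, statement_b_id: str, conflict_type: str) -> str:
    """Derive a stable 12-character ID from the statement pair and the conflict type."""
    content = f"{statement_a_id}_{statement_b_id}_{conflict_type}"
    return hashlib.md5(content.encode()).hexdigest()[:12]


# Same inputs always yield the same ID, so a re-detected conflict deduplicates naturally.
# The "direct_negation" label below is an example value, not a project constant.
print(conflict_id("stmt-001", "stmt-002", "direct_negation"))
```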
warbler_cda/embeddings/openai_provider.py CHANGED
@@ -3,7 +3,6 @@ OpenAI Embedding Provider - Cloud-based Semantic Grounding
 """

 from typing import List, Dict, Any, Optional
-import time
 from warbler_cda.embeddings.base_provider import EmbeddingProvider


warbler_cda/embeddings/sentence_transformer_provider.py CHANGED
@@ -4,9 +4,7 @@ High-quality embeddings using pre-trained transformer models with CUDA support
 """

 from typing import List, Dict, Any, Optional, Tuple
-import os
 import json
-import time
 import hashlib
 from pathlib import Path
 from warbler_cda.embeddings.base_provider import EmbeddingProvider
@@ -118,7 +116,7 @@ class SentenceTransformerEmbeddingProvider(EmbeddingProvider):

         import numpy as np

-        query_vec = np.array(query_embedding)
+        np.array(query_embedding)
         embed_vecs = np.array(embeddings)

         similarities = []
@@ -205,8 +203,8 @@ class SentenceTransformerEmbeddingProvider(EmbeddingProvider):
         seg2 = emb_array[2 * seg_size : 3 * seg_size]
         seg3 = emb_array[3 * seg_size : 4 * seg_size]
         seg4 = emb_array[4 * seg_size : 5 * seg_size]
-        seg5 = emb_array[5 * seg_size : 6 * seg_size]
-        seg6 = emb_array[6 * seg_size :]
+        emb_array[5 * seg_size : 6 * seg_size]
+        emb_array[6 * seg_size :]

         lineage = float(np.mean(seg0**2))

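The similarity path touched above compares a query embedding against the stored embedding vectors. A minimal sketch of cosine-similarity ranking with NumPy, as one plausible reading of that step (simplified, not the provider's exact implementation):

```python
import numpy as np


def rank_by_cosine(query_embedding, embeddings):
    """Return stored-embedding indices sorted by cosine similarity to the query."""
    query_vec = np.asarray(query_embedding, dtype=float)
    embed_vecs = np.asarray(embeddings, dtype=float)

    # Normalize both sides, guarding against zero vectors.
    q_norm = query_vec / (np.linalg.norm(query_vec) + 1e-12)
    e_norms = embed_vecs / (np.linalg.norm(embed_vecs, axis=1, keepdims=True) + 1e-12)

    similarities = e_norms @ q_norm
    return np.argsort(similarities)[::-1], similarities


order, sims = rank_by_cosine([0.1, 0.9], [[0.1, 0.8], [0.9, 0.1], [0.0, 1.0]])
print(order, np.round(sims, 3))
```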
 
warbler_cda/evaporation.py CHANGED
@@ -1,9 +1,8 @@
 from __future__ import annotations
-from typing import List, Dict, Any, Optional, Tuple
+from typing import List, Dict, Any, Optional
 import time
 import random
 import re
-from collections import Counter


 class EvaporationEngine:
@@ -299,7 +298,9 @@ class EvaporationEngine:
         if len(concepts) == 1:
             return f"[Balanced] Reflection on {concepts[0]} reveals deeper meaning."
         else:
-            return f"[Balanced] The interplay between {concepts[0]} and {concepts[1]} creates harmony."
+            return (
+                f"[Balanced] The interplay between {concepts[0]} and {concepts[1]} creates harmony."
+            )

     def _apply_affect_coloring(self, proto_thought: str, affect: Dict[str, Any]) -> str:
         """Apply affect-based coloring to proto-thought."""
warbler_cda/pack_loader.py CHANGED
@@ -149,9 +149,7 @@ class PackLoader:
                 chunk_docs = self._load_jsonl_file(chunk_file, pack_name)
                 documents.extend(chunk_docs)

-            logger.info(
-                f"Loaded {len(documents)} total documents from {len(chunk_files)} chunks"
-            )
+            logger.info(f"Loaded {len(documents)} total documents from {len(chunk_files)} chunks")
         else:
             # Load single-file pack (backward compatibility)
             jsonl_file = pack_dir / f"{pack_name}.jsonl"
warbler_cda/pack_sync.py CHANGED
@@ -141,6 +141,6 @@ class PackSync:
         """Return reingest command if packs are missing"""
         status = self.verify_packs()
         if status["missing"]:
-            missing = ", ".join(status["missing"])
-            return f"python -m warbler_cda.utils.hf_warbler_ingest ingest --datasets all"
+            ", ".join(status["missing"])
+            return "python -m warbler_cda.utils.hf_warbler_ingest ingest --datasets all"
         return None
warbler_cda/retrieval_api.py CHANGED
@@ -8,7 +8,7 @@ for the Cognitive Geo-Thermal Lore Engine v0.3.
 from typing import List, Dict, Any, Optional, Tuple, Union
 import time
 import hashlib
-from dataclasses import dataclass, asdict
+from dataclasses import dataclass
 from enum import Enum


@@ -377,8 +377,14 @@ class RetrievalAPI:
         # DEBUG
         import sys

-        print(f"DEBUG: _retrieve_semantic_similarity called with query='{query.semantic_query}'", file=sys.stderr)
-        print(f"DEBUG: embedding_provider={self.embedding_provider}, semantic_anchors={self.semantic_anchors}", file=sys.stderr)
+        print(
+            f"DEBUG: _retrieve_semantic_similarity called with query='{query.semantic_query}'",
+            file=sys.stderr,
+        )
+        print(
+            f"DEBUG: embedding_provider={self.embedding_provider}, semantic_anchors={self.semantic_anchors}",
+            file=sys.stderr,
+        )
         print(f"DEBUG: context_store size={len(self._context_store)}", file=sys.stderr)

         # If embedding provider available, use it
@@ -467,7 +473,7 @@ class RetrievalAPI:
         try:
             if self.embedding_provider and hasattr(self.embedding_provider, "semantic_search"):
                 return self._search_context_store_semantic(query)
-        except Exception as e:
+        except Exception:
             pass

         return self._search_context_store_keyword(query)
@@ -527,7 +533,7 @@ class RetrievalAPI:
                     stat7_resonance=stat7_resonance,
                 )
                 results.append(result)
-        except Exception as e:
+        except Exception:
             return self._search_context_store_keyword(query)

         return results
@@ -846,7 +852,6 @@ class RetrievalAPI:
         filtered = [r for r in results if r.relevance_score >= query.confidence_threshold]

         # Apply temporal decay
-        current_time = query.query_timestamp
         for result in filtered:
             age_hours = result.temporal_distance / 3600
             decay_factor = max(0.1, 1.0 - (age_hours / self.temporal_decay_hours))
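The last hunk keeps the temporal-decay loop while dropping an unused timestamp; the decay itself is a linear falloff toward a 0.1 floor over `temporal_decay_hours`. A standalone sketch of that formula (the 24-hour window below is an illustrative default, not necessarily the engine's configured value):

```python
def temporal_decay(temporal_distance_seconds: float, decay_hours: float = 24.0) -> float:
    """Linear decay from 1.0 toward a floor of 0.1 as a result ages."""
    age_hours = temporal_distance_seconds / 3600
    return max(0.1, 1.0 - (age_hours / decay_hours))


# A 6-hour-old result keeps 75% of its weight under a 24-hour decay window.
print(temporal_decay(6 * 3600))   # 0.75
# Anything older than about 21.6 hours bottoms out at the 0.1 floor.
print(temporal_decay(48 * 3600))  # 0.1
```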
warbler_cda/semantic_anchors.py CHANGED
@@ -2,11 +2,9 @@
 Enhanced Anchor System with Semantic Grounding and Provenance
 """

-from typing import List, Dict, Any, Optional, Tuple
+from typing import List, Dict, Any, Optional
 import time
 import hashlib
-import json
-from dataclasses import dataclass, asdict
 from warbler_cda.embeddings import EmbeddingProvider, EmbeddingProviderFactory
 from warbler_cda.anchor_memory_pool import AnchorMemoryPool, get_global_anchor_pool
 from warbler_cda.anchor_data_classes import SemanticAnchor, AnchorProvenance
@@ -242,7 +240,7 @@ class SemanticAnchorGraph:
         """Apply aging, consolidation, and eviction policies."""
         actions = {"aged": 0, "consolidated": 0, "evicted": 0, "evicted_anchors": []}

-        current_time = time.time()
+        time.time()
         anchors_to_evict = []

         # Apply aging
warbler_cda/stat7_entity.py CHANGED
@@ -11,7 +11,7 @@ Features:
 - Entanglement detection and management
 """

-from dataclasses import dataclass, field, asdict
+from dataclasses import dataclass, field
 from datetime import datetime
 from enum import Enum
 from typing import Dict, List, Optional, Any, Tuple
@@ -286,9 +286,7 @@ class STAT7Entity(ABC):
         self.entangled_entities.append(other_entity_id)
         self.entanglement_strength.append(strength)
         self._record_event(
-            "entanglement_added",
-            f"Entangled with {other_entity_id}",
-            {"strength": strength}
+            "entanglement_added", f"Entangled with {other_entity_id}", {"strength": strength}
         )

     def remove_entanglement(self, other_entity_id: str):
@@ -297,10 +295,7 @@ class STAT7Entity(ABC):
             idx = self.entangled_entities.index(other_entity_id)
             self.entangled_entities.pop(idx)
             self.entanglement_strength.pop(idx)
-            self._record_event(
-                "entanglement_removed",
-                f"Untangled from {other_entity_id}"
-            )
+            self._record_event("entanglement_removed", f"Untangled from {other_entity_id}")

     def get_entanglements(self) -> List[Tuple[str, float]]:
         """Get all entangled entities with strength"""
@@ -315,8 +310,8 @@ class STAT7Entity(ABC):
         self._record_event(
             "entanglement_updated",
             f"{other_entity_id} entanglement strength changed",
-            {"old_strength": old_strength, "new_strength": new_strength}
-        )
+            {"old_strength": old_strength, "new_strength": new_strength},
+        )

     # ========================================================================
     # LUCA Bootstrap
@@ -423,7 +418,7 @@ class STAT7Entity(ABC):
     def load_from_file(cls, path: Path) -> "STAT7Entity":
         """Load entity from JSON file (must know concrete type)"""
         with open(path, "r") as f:
-            data = json.load(f)
+            json.load(f)
         # Note: In practice, would need factory pattern to instantiate correct
         # subclass
         raise NotImplementedError("Use subclass load methods")
warbler_cda/stat7_experiments.py CHANGED
@@ -327,7 +327,7 @@ class EXP01_AddressUniqueness:
             Tuple of (results list, overall success boolean)
         """
         print(f"\n{'=' * 70}")
-        print(f"EXP-01: ADDRESS UNIQUENESS TEST")
+        print("EXP-01: ADDRESS UNIQUENESS TEST")
         print(f"{'=' * 70}")
         print(f"Sample size: {self.sample_size} bit-chains")
         print(f"Iterations: {self.iterations}")
@@ -383,9 +383,7 @@ class EXP01_AddressUniqueness:
                 print(f"  ⚠️ Collision on {addr[:16]}... : {len(ids)} entries")

        print()
-        print(
-            f"OVERALL RESULT: {'✅ ALL PASS' if all_success else '❌ SOME FAILED'}"
-        )
+        print(f"OVERALL RESULT: {'✅ ALL PASS' if all_success else '❌ SOME FAILED'}")
        print(f"Success rate: {sum(1 for r in self.results if r.success)}/{self.iterations}")

        return self.results, all_success
@@ -452,7 +450,7 @@ class EXP02_RetrievalEfficiency:
             Tuple of (results list, overall success boolean)
         """
         print(f"\n{'=' * 70}")
-        print(f"EXP-02: RETRIEVAL EFFICIENCY TEST")
+        print("EXP-02: RETRIEVAL EFFICIENCY TEST")
         print(f"{'=' * 70}")
         print(f"Query count per scale: {self.query_count}")
         print(f"Scales: {self.scales}")
@@ -520,9 +518,7 @@ class EXP02_RetrievalEfficiency:
            print(f"  Target: < {threshold}ms")
            print()

-        print(
-            f"OVERALL RESULT: {'✅ ALL PASS' if all_success else '❌ SOME FAILED'}"
-        )
+        print(f"OVERALL RESULT: {'✅ ALL PASS' if all_success else '❌ SOME FAILED'}")

        return self.results, all_success

@@ -589,7 +585,7 @@ class EXP03_DimensionNecessity:
             Tuple of (results list, overall success boolean)
         """
         print(f"\n{'=' * 70}")
-        print(f"EXP-03: DIMENSION NECESSITY TEST")
+        print("EXP-03: DIMENSION NECESSITY TEST")
         print(f"{'=' * 70}")
         print(f"Sample size: {self.sample_size} bit-chains")
         print()
@@ -618,9 +614,7 @@ class EXP03_DimensionNecessity:
        self.results.append(result)

        status = "✅ PASS" if result.acceptable else "❌ FAIL"
-        print(
-            f"  {status} | Collisions: {collisions} | Rate: {baseline_collision_rate * 100:.4f}%"
-        )
+        print(f"  {status} | Collisions: {collisions} | Rate: {baseline_collision_rate * 100:.4f}%")
        print()

        # Ablation: remove each dimension
@@ -662,13 +656,11 @@ class EXP03_DimensionNecessity:
            # when removing dims
            necessity = not acceptable  # Should show collisions
            status = "✅ NECESSARY" if necessity else "⚠️ OPTIONAL"
-            print(
-                f"  {status} | Collisions: {collisions} | Rate: {collision_rate * 100:.4f}%"
-            )
+            print(f"  {status} | Collisions: {collisions} | Rate: {collision_rate * 100:.4f}%")

        print()
        print(
-            f"OVERALL RESULT: All 7 dimensions are necessary (all show > 0.1% collisions when removed)"
+            "OVERALL RESULT: All 7 dimensions are necessary (all show > 0.1% collisions when removed)"
        )

        return self.results, all_success
@@ -733,7 +725,7 @@ def run_all_experiments(

     # Summary
     print(f"\n{'=' * 70}")
-    print(f"PHASE 1 VALIDATION SUMMARY")
+    print("PHASE 1 VALIDATION SUMMARY")
     print(f"{'=' * 70}")
     print(
         f"EXP-01 (Address Uniqueness): {'✅ PASS' if results['EXP-01']['success'] else '❌ FAIL'}"
warbler_cda/stat7_rag_bridge.py CHANGED
@@ -55,18 +55,10 @@ class STAT7Address:

     def __post_init__(self):
         """Validate STAT8 constraints."""
-        assert (
-            0.0 <= self.adjacency <= 1.0
-        ), f"adjacency must be [0,1], got {self.adjacency}"
-        assert (
-            0.0 <= self.luminosity <= 1.0
-        ), f"luminosity must be [0,1], got {self.luminosity}"
-        assert (
-            0.0 <= self.polarity <= 1.0
-        ), f"polarity must be [0,1], got {self.polarity}"
-        assert (
-            0.0 <= self.entropy <= 1.0
-        ), f"entropy must be [0,1], got {self.entropy}"
+        assert 0.0 <= self.adjacency <= 1.0, f"adjacency must be [0,1], got {self.adjacency}"
+        assert 0.0 <= self.luminosity <= 1.0, f"luminosity must be [0,1], got {self.luminosity}"
+        assert 0.0 <= self.polarity <= 1.0, f"polarity must be [0,1], got {self.polarity}"
+        assert 0.0 <= self.entropy <= 1.0, f"entropy must be [0,1], got {self.entropy}"
         assert self.lineage >= 0, f"lineage must be >= 0, got {self.lineage}"
         assert (
             1 <= self.dimensionality <= 8
@@ -164,7 +156,7 @@ def stat7_resonance(query_stat7: STAT7Address, doc_stat7: STAT7Address) -> float
     luminosity_diff = abs(query_stat7.luminosity - doc_stat7.luminosity)
     polarity_diff = abs(query_stat7.polarity - doc_stat7.polarity)
     entropy_diff = abs(query_stat7.entropy - doc_stat7.entropy)
-    signal_score = 1.0 - (1/3) * (luminosity_diff + polarity_diff + entropy_diff)
+    signal_score = 1.0 - (1 / 3) * (luminosity_diff + polarity_diff + entropy_diff)
    signal_score = max(0.0, signal_score)

     # Adjacency/Dimensionality bonus: connectivity + complexity
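The reformatted resonance line averages the three per-dimension differences and clamps the result at zero. A worked sketch of just that signal-score term, using plain dicts as stand-ins rather than the full `stat7_resonance` implementation:

```python
def signal_score(query: dict, doc: dict) -> float:
    """1.0 when luminosity, polarity, and entropy match exactly; decays toward 0.0 as they diverge."""
    luminosity_diff = abs(query["luminosity"] - doc["luminosity"])
    polarity_diff = abs(query["polarity"] - doc["polarity"])
    entropy_diff = abs(query["entropy"] - doc["entropy"])
    score = 1.0 - (1 / 3) * (luminosity_diff + polarity_diff + entropy_diff)
    return max(0.0, score)


q = {"luminosity": 0.8, "polarity": 0.5, "entropy": 0.2}
d = {"luminosity": 0.6, "polarity": 0.5, "entropy": 0.3}
print(signal_score(q, d))  # 1.0 - (0.2 + 0.0 + 0.1) / 3 = 0.9
```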
warbler_cda/stat7_visualization.py CHANGED
@@ -21,11 +21,7 @@ from typing import Optional, Dict, Any
 from pathlib import Path

 from warbler_cda.stat7_experiments import (
-    BitChain,
-    generate_random_bitchain,
     EXP01_AddressUniqueness,
-    EXP02_RetrievalEfficiency,
-    EXP03_DimensionNecessity,
 )

 # Import the visualization components
@@ -75,7 +71,7 @@ class STAT7VisualizationManager:
         self.is_running = True

         print(f"🚀 STAT7 Visualization Server started on ws://{self.host}:{self.port}")
-        print(f"🌐 Open stat7threejs.html in your browser to view visualization")
+        print("🌐 Open stat7threejs.html in your browser to view visualization")

     def _run_server(self):
         """Run the WebSocket server in asyncio event loop."""
@@ -201,7 +197,7 @@ def create_jupyter_widget(width: str = "100%", height: str = "600px") -> str:
         return create_inline_jupyter_widget(width, height)

     with open(html_path, "r", encoding="utf-8") as f:
-        html_content = f.read()
+        f.read()

     # Wrap in iframe for Jupyter
     widget_html = f"""
@@ -307,7 +303,7 @@ def display_in_jupyter(width: str = "100%", height: str = "600px"):
         display(HTML(widget_html))
     except ImportError:
         print("IPython not available. Cannot display in Jupyter notebook.")
-        print(f"Open stat7threejs.html in your browser instead.")
+        print("Open stat7threejs.html in your browser instead.")


 # Convenience functions for quick start
warbler_cda/summarization_ladder.py CHANGED
@@ -5,10 +5,10 @@ Implements rolling N-window micro-summaries and pipeline macro distillation
 for the Cognitive Geo-Thermal Lore Engine v0.3.
 """

-from typing import List, Dict, Any, Optional, Tuple
+from typing import List, Dict, Any, Optional
 import time
 import hashlib
-from dataclasses import dataclass, asdict
+from dataclasses import dataclass
 from collections import deque


warbler_cda/utils/hf_warbler_ingest.py CHANGED
@@ -8,7 +8,6 @@ for NPC intelligence training via the magma layer self-training system.

 import logging
 from pathlib import Path
-from typing import Dict, Any, Optional

 import click

@@ -97,12 +96,12 @@ def ingest(datasets, pack_prefix, arxiv_limit, max_docs_per_chunk, max_pdf_pages
     if max_docs_per_chunk > 0:
         click.echo(f"[PACK] Chunking enabled: {max_docs_per_chunk} documents per chunk")
     else:
-        click.echo(f"[PACK] Chunking disabled: single file per pack")
+        click.echo("[PACK] Chunking disabled: single file per pack")

     if max_pdf_pages is not None:
         click.echo(f"[PDF] PDF extraction limit: {max_pdf_pages} pages")
     else:
-        click.echo(f"[PDF] PDF extraction: unlimited pages")
+        click.echo("[PDF] PDF extraction: unlimited pages")

     click.echo()

@@ -165,7 +164,7 @@ def ingest(datasets, pack_prefix, arxiv_limit, max_docs_per_chunk, max_pdf_pages
     if results:
         report_file = builder.save_report(results)

-        click.echo(f"\n[SUCCESS] Ingestion Complete!")
+        click.echo("\n[SUCCESS] Ingestion Complete!")
         click.echo(f"[STATS] Total Documents: {sum(r['documents'] for r in results.values())}")
         click.echo(f"[STATS] Packs Created: {len(results)}")
         click.echo(f"[STATS] Report saved to: {report_file}")
warbler_cda/utils/load_warbler_packs.py CHANGED
@@ -233,9 +233,7 @@ def discover(api_url):
         for doc in documents:
             click.echo(f"  - {doc['content_id']}")
             if "metadata" in doc:
-                click.echo(
-                    f"    Realm: {doc['metadata'].get('realm_type','unknown')}"
-                )
+                click.echo(f"    Realm: {doc['metadata'].get('realm_type','unknown')}")

     click.echo(f"\n[STATS] Total discovered: {total} documents\n")

warbler_cda/utils/transformers/base.py CHANGED
@@ -33,9 +33,8 @@ class BaseWarblerTransformer(ABC):
33
  """Base class for all dataset transformers"""
34
 
35
  def __init__(
36
- self,
37
- tokenizer_name: str = "microsoft/DialoGPT-medium",
38
- max_pdf_pages: Optional[int] = None):
39
  self.max_pdf_pages = max_pdf_pages
40
 
41
  @abstractmethod
@@ -47,10 +46,7 @@ class BaseWarblerTransformer(ABC):
47
  """Check if PDF extraction is available"""
48
  return PDF_AVAILABLE
49
 
50
- def extract_pdf_text(
51
- self,
52
- pdf_data: Any,
53
- max_pages: Optional[int] = None) -> Optional[str]:
54
  """
55
  Extract text from PDF data (bytes, file path, PDF object, or file-like object)
56
 
@@ -62,23 +58,19 @@ class BaseWarblerTransformer(ABC):
62
  Extracted text or None if extraction fails
63
  """
64
  if not PDF_AVAILABLE:
65
- logger.debug(
66
- "PDF extraction unavailable - pdfplumber not installed")
67
  return None
68
 
69
  try:
70
  if hasattr(pdf_data, "pages") and hasattr(pdf_data, "metadata"):
71
- logger.info(
72
- "PDF data is already a pdfplumber.PDF object, extracting text...")
73
  text_parts = []
74
  total_pages = len(pdf_data.pages)
75
 
76
  if max_pages is None:
77
- logger.info(
78
- f"PDF has {total_pages} pages, extracting all pages")
79
  else:
80
- logger.info(
81
- f"PDF has {total_pages} pages, extracting up to {max_pages} pages")
82
 
83
  try:
84
  for page_num, page in enumerate(pdf_data.pages, 1):
@@ -88,35 +80,35 @@ class BaseWarblerTransformer(ABC):
88
  text_parts.append(page_text)
89
  logger.debug(
90
  f"Extracted {
91
- len(page_text)} chars from page {page_num}")
 
92
  else:
93
- logger.debug(
94
- f"Page {page_num} has no extractable text")
95
  except Exception as page_error:
96
- logger.warning(
97
- f"Error extracting page {page_num}: {page_error}")
98
  continue
99
 
100
  if max_pages is not None and page_num >= max_pages:
101
  logger.info(
102
- f"Truncated PDF extraction at {page_num} pages (max: {max_pages})")
 
103
  break
104
 
105
- extracted_text = "\n\n".join(
106
- text_parts) if text_parts else None
107
  if extracted_text:
108
  logger.info(
109
  f"Successfully extracted {
110
  len(extracted_text)} total characters from {
111
- len(text_parts)} pages")
 
112
  else:
113
- logger.warning(
114
- "No text could be extracted from PDF object")
115
  return extracted_text
116
  except Exception as e:
117
  logger.warning(
118
  f"Error extracting from PDF object: {
119
- type(e).__name__}: {e}")
 
120
  return None
121
 
122
  if isinstance(pdf_data, dict) and "bytes" in pdf_data:
@@ -129,7 +121,8 @@ class BaseWarblerTransformer(ABC):
129
  if isinstance(pdf_data, bytes):
130
  logger.info(
131
  f"PDF data is bytes ({
132
- len(pdf_data)} bytes), creating BytesIO")
 
133
  pdf_file = io.BytesIO(pdf_data)
134
  elif isinstance(pdf_data, str) and os.path.exists(pdf_data):
135
  logger.info(f"PDF data is file path: {pdf_data}")
@@ -138,8 +131,7 @@ class BaseWarblerTransformer(ABC):
138
  logger.info(f"PDF data is file-like object: {type(pdf_data)}")
139
  pdf_file = pdf_data
140
  else:
141
- logger.warning(
142
- f"Unknown PDF data type: {type(pdf_data)}, cannot extract")
143
  return None
144
 
145
  text_parts = []
@@ -147,11 +139,11 @@ class BaseWarblerTransformer(ABC):
147
  total_pages = len(pdf.pages)
148
 
149
  if max_pages is None:
150
- logger.info(
151
- f"Opened PDF with {total_pages} pages, extracting all pages")
152
  else:
153
  logger.info(
154
- f"Opened PDF with {total_pages} pages, extracting up to {max_pages} pages")
 
155
 
156
  for page_num, page in enumerate(pdf.pages, 1):
157
  try:
@@ -160,18 +152,18 @@ class BaseWarblerTransformer(ABC):
160
  text_parts.append(page_text)
161
  logger.debug(
162
  f"Extracted {
163
- len(page_text)} chars from page {page_num}")
 
164
  else:
165
- logger.debug(
166
- f"Page {page_num} has no extractable text")
167
  except Exception as page_error:
168
- logger.warning(
169
- f"Error extracting page {page_num}: {page_error}")
170
  continue
171
 
172
  if max_pages is not None and page_num >= max_pages:
173
  logger.info(
174
- f"Truncated PDF extraction at {page_num} pages (max: {max_pages})")
 
175
  break
176
 
177
  extracted_text = "\n\n".join(text_parts) if text_parts else None
@@ -179,7 +171,8 @@ class BaseWarblerTransformer(ABC):
179
  logger.info(
180
  f"Successfully extracted {
181
  len(extracted_text)} total characters from {
182
- len(text_parts)} pages")
 
183
  else:
184
  logger.warning("No text could be extracted from PDF")
185
  return extracted_text
@@ -192,8 +185,7 @@ class BaseWarblerTransformer(ABC):
192
  """Split text into chunks"""
193
  if not text:
194
  return []
195
- return [text[i: i + chunk_size]
196
- for i in range(0, len(text), chunk_size)]
197
 
198
  def extract_dataset_items(self, dataset: Any) -> List[Dict[str, Any]]:
199
  """
@@ -211,13 +203,7 @@ class BaseWarblerTransformer(ABC):
211
  pass
212
 
213
  try:
214
- if hasattr(
215
- dataset,
216
- "keys") and callable(
217
- getattr(
218
- dataset,
219
- "keys",
220
- None)):
221
  keys = list(dataset.keys())
222
  if keys:
223
  first_split = keys[0]
@@ -242,8 +228,7 @@ class WarblerPackBuilder:
242
 
243
  def __init__(self, output_dir: Optional[Path] = None):
244
  if output_dir is None:
245
- output_dir = Path(__file__).resolve(
246
- ).parent.parent / "results" / "hf_ingest"
247
  self.output_dir = Path(output_dir)
248
  self.output_dir.mkdir(exist_ok=True, parents=True)
249
 
@@ -259,8 +244,7 @@ class WarblerPackBuilder:
259
 
260
  total_docs = len(docs)
261
 
262
- if max_docs_per_chunk == float(
263
- "inf") or total_docs <= max_docs_per_chunk:
264
  pack_file = pack_dir / f"{pack_name}.jsonl"
265
 
266
  with open(pack_file, "w", encoding="utf-8") as f:
@@ -279,21 +263,21 @@ class WarblerPackBuilder:
279
  }
280
 
281
  logger.info(
282
- f"✓ Created Warbler pack: {pack_name} with {total_docs} documents (single file)")
 
283
  else:
284
- chunk_count = (total_docs + max_docs_per_chunk -
285
- 1) // max_docs_per_chunk
286
 
287
  logger.info(
288
- f"Creating chunked pack: {pack_name} with {total_docs} documents across {chunk_count} chunks")
 
289
 
290
  for chunk_idx in range(chunk_count):
291
  start_idx = chunk_idx * max_docs_per_chunk
292
  end_idx = min(start_idx + max_docs_per_chunk, total_docs)
293
  chunk_docs = docs[start_idx:end_idx]
294
 
295
- chunk_file = pack_dir / \
296
- f"{pack_name}-chunk-{chunk_idx + 1:03d}.jsonl"
297
 
298
  with open(chunk_file, "w", encoding="utf-8") as f:
299
  for doc in chunk_docs:
@@ -303,7 +287,8 @@ class WarblerPackBuilder:
303
  f" ✓ Wrote chunk {
304
  chunk_idx + 1}/{chunk_count}: {
305
  len(chunk_docs)} documents to {
306
- chunk_file.name}")
 
307
 
308
  metadata = {
309
  "name": pack_name,
@@ -320,7 +305,8 @@ class WarblerPackBuilder:
320
  }
321
 
322
  logger.info(
323
- f"✓ Created chunked Warbler pack: {pack_name} with {total_docs} documents across {chunk_count} chunks")
 
324
 
325
  metadata_file = pack_dir / "package.json"
326
  with open(metadata_file, "w", encoding="utf-8") as f:
@@ -335,16 +321,14 @@ class WarblerPackBuilder:
335
  "timestamp": datetime.now().isoformat(),
336
  "results": results,
337
  "total_documents": sum(
338
- result.get("documents", 0) if isinstance(
339
- result, dict) else len(result)
340
  for result in results.values()
341
  ),
342
  "packs_created": len(results),
343
  }
344
 
345
  report_file = (
346
- self.output_dir /
347
- f"ingestion_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
348
  )
349
  with open(report_file, "w", encoding="utf-8") as f:
350
  json.dump(report, f, indent=2, ensure_ascii=False)
 
33
  """Base class for all dataset transformers"""
34
 
35
  def __init__(
36
+ self, tokenizer_name: str = "microsoft/DialoGPT-medium", max_pdf_pages: Optional[int] = None
37
+ ):
 
38
  self.max_pdf_pages = max_pdf_pages
39
 
40
  @abstractmethod
 
46
  """Check if PDF extraction is available"""
47
  return PDF_AVAILABLE
48
 
49
+ def extract_pdf_text(self, pdf_data: Any, max_pages: Optional[int] = None) -> Optional[str]:
 
 
 
50
  """
51
  Extract text from PDF data (bytes, file path, PDF object, or file-like object)
52
 
 
58
  Extracted text or None if extraction fails
59
  """
60
  if not PDF_AVAILABLE:
61
+ logger.debug("PDF extraction unavailable - pdfplumber not installed")
 
62
  return None
63
 
64
  try:
65
  if hasattr(pdf_data, "pages") and hasattr(pdf_data, "metadata"):
66
+ logger.info("PDF data is already a pdfplumber.PDF object, extracting text...")
 
67
  text_parts = []
68
  total_pages = len(pdf_data.pages)
69
 
70
  if max_pages is None:
71
+ logger.info(f"PDF has {total_pages} pages, extracting all pages")
 
72
  else:
73
+ logger.info(f"PDF has {total_pages} pages, extracting up to {max_pages} pages")
 
74
 
75
  try:
76
  for page_num, page in enumerate(pdf_data.pages, 1):
 
80
  text_parts.append(page_text)
81
  logger.debug(
82
  f"Extracted {
83
+ len(page_text)} chars from page {page_num}"
84
+ )
85
  else:
86
+ logger.debug(f"Page {page_num} has no extractable text")
 
87
  except Exception as page_error:
88
+ logger.warning(f"Error extracting page {page_num}: {page_error}")
 
89
  continue
90
 
91
  if max_pages is not None and page_num >= max_pages:
92
  logger.info(
93
+ f"Truncated PDF extraction at {page_num} pages (max: {max_pages})"
94
+ )
95
  break
96
 
97
+ extracted_text = "\n\n".join(text_parts) if text_parts else None
 
98
  if extracted_text:
99
  logger.info(
100
  f"Successfully extracted {
101
  len(extracted_text)} total characters from {
102
+ len(text_parts)} pages"
103
+ )
104
  else:
105
+ logger.warning("No text could be extracted from PDF object")
 
106
  return extracted_text
107
  except Exception as e:
108
  logger.warning(
109
  f"Error extracting from PDF object: {
110
+ type(e).__name__}: {e}"
111
+ )
112
  return None
113
 
114
  if isinstance(pdf_data, dict) and "bytes" in pdf_data:
 
121
  if isinstance(pdf_data, bytes):
122
  logger.info(
123
  f"PDF data is bytes ({
124
+ len(pdf_data)} bytes), creating BytesIO"
125
+ )
126
  pdf_file = io.BytesIO(pdf_data)
127
  elif isinstance(pdf_data, str) and os.path.exists(pdf_data):
128
  logger.info(f"PDF data is file path: {pdf_data}")
 
131
  logger.info(f"PDF data is file-like object: {type(pdf_data)}")
132
  pdf_file = pdf_data
133
  else:
134
+ logger.warning(f"Unknown PDF data type: {type(pdf_data)}, cannot extract")
 
135
  return None
136
 
137
  text_parts = []
 
139
  total_pages = len(pdf.pages)
140
 
141
  if max_pages is None:
142
+ logger.info(f"Opened PDF with {total_pages} pages, extracting all pages")
 
143
  else:
144
  logger.info(
145
+ f"Opened PDF with {total_pages} pages, extracting up to {max_pages} pages"
146
+ )
147
 
148
  for page_num, page in enumerate(pdf.pages, 1):
149
  try:
 
152
  text_parts.append(page_text)
153
  logger.debug(
154
  f"Extracted {
155
+ len(page_text)} chars from page {page_num}"
156
+ )
157
  else:
158
+ logger.debug(f"Page {page_num} has no extractable text")
 
159
  except Exception as page_error:
160
+ logger.warning(f"Error extracting page {page_num}: {page_error}")
 
161
  continue
162
 
163
  if max_pages is not None and page_num >= max_pages:
164
  logger.info(
165
+ f"Truncated PDF extraction at {page_num} pages (max: {max_pages})"
166
+ )
167
  break
168
 
169
  extracted_text = "\n\n".join(text_parts) if text_parts else None
 
171
  logger.info(
172
  f"Successfully extracted {
173
  len(extracted_text)} total characters from {
174
+ len(text_parts)} pages"
175
+ )
176
  else:
177
  logger.warning("No text could be extracted from PDF")
178
  return extracted_text
 
185
  """Split text into chunks"""
186
  if not text:
187
  return []
188
+ return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]
 
189
 
190
  def extract_dataset_items(self, dataset: Any) -> List[Dict[str, Any]]:
191
  """
 
203
  pass
204
 
205
  try:
206
+ if hasattr(dataset, "keys") and callable(getattr(dataset, "keys", None)):
 
 
 
 
 
 
207
  keys = list(dataset.keys())
208
  if keys:
209
  first_split = keys[0]
 
228
 
229
  def __init__(self, output_dir: Optional[Path] = None):
230
  if output_dir is None:
231
+ output_dir = Path(__file__).resolve().parent.parent / "results" / "hf_ingest"
 
232
  self.output_dir = Path(output_dir)
233
  self.output_dir.mkdir(exist_ok=True, parents=True)
234
 
 
244
 
245
  total_docs = len(docs)
246
 
247
+ if max_docs_per_chunk == float("inf") or total_docs <= max_docs_per_chunk:
 
248
  pack_file = pack_dir / f"{pack_name}.jsonl"
249
 
250
  with open(pack_file, "w", encoding="utf-8") as f:
 
263
  }
264
 
265
  logger.info(
266
+ f"✓ Created Warbler pack: {pack_name} with {total_docs} documents (single file)"
267
+ )
268
  else:
269
+ chunk_count = (total_docs + max_docs_per_chunk - 1) // max_docs_per_chunk
 
270
 
271
  logger.info(
272
+ f"Creating chunked pack: {pack_name} with {total_docs} documents across {chunk_count} chunks"
273
+ )
274
 
275
  for chunk_idx in range(chunk_count):
276
  start_idx = chunk_idx * max_docs_per_chunk
277
  end_idx = min(start_idx + max_docs_per_chunk, total_docs)
278
  chunk_docs = docs[start_idx:end_idx]
279
 
280
+ chunk_file = pack_dir / f"{pack_name}-chunk-{chunk_idx + 1:03d}.jsonl"
 
281
 
282
  with open(chunk_file, "w", encoding="utf-8") as f:
283
  for doc in chunk_docs:
 
287
  f" ✓ Wrote chunk {
288
  chunk_idx + 1}/{chunk_count}: {
289
  len(chunk_docs)} documents to {
290
+ chunk_file.name}"
291
+ )
292
 
293
  metadata = {
294
  "name": pack_name,
 
305
  }
306
 
307
  logger.info(
308
+ f"✓ Created chunked Warbler pack: {pack_name} with {total_docs} documents across {chunk_count} chunks"
309
+ )
310
 
311
  metadata_file = pack_dir / "package.json"
312
  with open(metadata_file, "w", encoding="utf-8") as f:
 
321
  "timestamp": datetime.now().isoformat(),
322
  "results": results,
323
  "total_documents": sum(
324
+ result.get("documents", 0) if isinstance(result, dict) else len(result)
 
325
  for result in results.values()
326
  ),
327
  "packs_created": len(results),
328
  }
329
 
330
  report_file = (
331
+ self.output_dir / f"ingestion_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
 
332
  )
333
  with open(report_file, "w", encoding="utf-8") as f:
334
  json.dump(report, f, indent=2, ensure_ascii=False)
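Among the reflowed lines in this file is the pack-chunking math, which uses ceiling division to decide how many chunk files to write. A minimal standalone sketch of that pattern (illustrative helper name, not the builder's full logic):

```python
def split_into_chunks(docs, max_docs_per_chunk):
    """Split docs into consecutive chunks of at most max_docs_per_chunk items each."""
    total_docs = len(docs)
    # Ceiling division: 250 docs at 100 per chunk -> 3 chunks (100, 100, 50).
    chunk_count = (total_docs + max_docs_per_chunk - 1) // max_docs_per_chunk
    return [
        docs[idx * max_docs_per_chunk : (idx + 1) * max_docs_per_chunk]
        for idx in range(chunk_count)
    ]


chunks = split_into_chunks(list(range(250)), 100)
print([len(c) for c in chunks])  # [100, 100, 50]
```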
warbler_cda/utils/transformers/edustories.py CHANGED
@@ -16,8 +16,7 @@ logger = logging.getLogger(__name__)
16
  class EdustoriesTransformer(BaseWarblerTransformer):
17
  """Transform MU-NLPC/Edustories-en dataset"""
18
 
19
- def transform(
20
- self, dataset_name: str = "MU-NLPC/Edustories-en") -> List[Dict[str, Any]]:
21
  """
22
  Transform MU-NLPC/Edustories-en dataset
23
  Format: Educational case studies with structured teaching situations
@@ -49,8 +48,7 @@ class EdustoriesTransformer(BaseWarblerTransformer):
49
 
50
  for idx, item in enumerate(items):
51
  if isinstance(item, str):
52
- logger.warning(
53
- f"Edustory {idx + 1}: Item is a string, skipping")
54
  continue
55
 
56
  if isinstance(item, dict) or hasattr(item, "__getitem__"):
@@ -75,7 +73,8 @@ class EdustoriesTransformer(BaseWarblerTransformer):
75
  logger.warning(
76
  f"Edustory {
77
  idx +
78
- 1}: No case study content found, skipping")
 
79
  continue
80
 
81
  entry_id = safe_get("id", str(idx))
@@ -91,11 +90,9 @@ class EdustoriesTransformer(BaseWarblerTransformer):
91
  problems_annotated = safe_get("problems_annotated", "")
92
  problems_possible = safe_get("problems_possible_annotated", "")
93
  solutions_annotated = safe_get("solutions_annotated", "")
94
- solutions_possible = safe_get(
95
- "solutions_possible_annotated", "")
96
  implications_annotated = safe_get("implications_annotated", "")
97
- implications_possible = safe_get(
98
- "implications_possible_annotated", "")
99
 
100
  annotator_id = safe_get("annotator_id", "")
101
 
@@ -131,7 +128,8 @@ class EdustoriesTransformer(BaseWarblerTransformer):
131
 
132
  logger.info(
133
  f"✓ Transformed {
134
- len(warbler_docs)} educational case study entries")
 
135
  return warbler_docs
136
 
137
  @staticmethod
@@ -149,8 +147,7 @@ class EdustoriesTransformer(BaseWarblerTransformer):
149
  return default
150
 
151
  description = safe_get("description", "[No background provided]")
152
- anamnesis = safe_get(
153
- "anamnesis", "[No situation description provided]")
154
  solution = safe_get("solution", "[No intervention described]")
155
  outcome = safe_get("outcome", "[No outcome reported]")
156
 
@@ -181,16 +178,15 @@ class EdustoriesTransformer(BaseWarblerTransformer):
181
 
182
  annotation_parts = []
183
  if problems_annotated:
184
- annotation_parts.append(
185
- f"Problems Identified: {problems_annotated}")
186
  if solutions_annotated:
187
- annotation_parts.append(
188
- f"Solutions Applied: {solutions_annotated}")
189
  if implications_annotated:
190
  annotation_parts.append(f"Implications: {implications_annotated}")
191
 
192
- annotations = ("\n".join(annotation_parts)
193
- if annotation_parts else "[No annotations available]")
 
194
 
195
  content = f"""TEACHING CASE STUDY
196
 
 
16
  class EdustoriesTransformer(BaseWarblerTransformer):
17
  """Transform MU-NLPC/Edustories-en dataset"""
18
 
19
+ def transform(self, dataset_name: str = "MU-NLPC/Edustories-en") -> List[Dict[str, Any]]:
 
20
  """
21
  Transform MU-NLPC/Edustories-en dataset
22
  Format: Educational case studies with structured teaching situations
 
48
 
49
  for idx, item in enumerate(items):
50
  if isinstance(item, str):
51
+ logger.warning(f"Edustory {idx + 1}: Item is a string, skipping")
 
52
  continue
53
 
54
  if isinstance(item, dict) or hasattr(item, "__getitem__"):
 
73
  logger.warning(
74
  f"Edustory {
75
  idx +
76
+ 1}: No case study content found, skipping"
77
+ )
78
  continue
79
 
80
  entry_id = safe_get("id", str(idx))
 
90
  problems_annotated = safe_get("problems_annotated", "")
91
  problems_possible = safe_get("problems_possible_annotated", "")
92
  solutions_annotated = safe_get("solutions_annotated", "")
93
+ solutions_possible = safe_get("solutions_possible_annotated", "")
 
94
  implications_annotated = safe_get("implications_annotated", "")
95
+ implications_possible = safe_get("implications_possible_annotated", "")
 
96
 
97
  annotator_id = safe_get("annotator_id", "")
98
 
 
128
 
129
  logger.info(
130
  f"✓ Transformed {
131
+ len(warbler_docs)} educational case study entries"
132
+ )
133
  return warbler_docs
134
 
135
  @staticmethod
 
147
  return default
148
 
149
  description = safe_get("description", "[No background provided]")
150
+ anamnesis = safe_get("anamnesis", "[No situation description provided]")
 
151
  solution = safe_get("solution", "[No intervention described]")
152
  outcome = safe_get("outcome", "[No outcome reported]")
153
 
 
178
 
179
  annotation_parts = []
180
  if problems_annotated:
181
+ annotation_parts.append(f"Problems Identified: {problems_annotated}")
 
182
  if solutions_annotated:
183
+ annotation_parts.append(f"Solutions Applied: {solutions_annotated}")
 
184
  if implications_annotated:
185
  annotation_parts.append(f"Implications: {implications_annotated}")
186
 
187
+ annotations = (
188
+ "\n".join(annotation_parts) if annotation_parts else "[No annotations available]"
189
+ )
190
 
191
  content = f"""TEACHING CASE STUDY
192
 
warbler_cda/utils/transformers/enterprise.py CHANGED
@@ -16,8 +16,7 @@ logger = logging.getLogger(__name__)
16
  class EnterpriseTransformer(BaseWarblerTransformer):
17
  """Transform SustcZhangYX/ChatEnv dataset"""
18
 
19
- def transform(
20
- self, dataset_name: str = "SustcZhangYX/ChatEnv") -> List[Dict[str, Any]]:
21
  """
22
  Transform SustcZhangYX/ChatEnv dataset
23
  Format: Software development chat conversations and collaborative coding scenarios
@@ -36,22 +35,20 @@ class EnterpriseTransformer(BaseWarblerTransformer):
36
  items = list(dataset[split_name])
37
  logger.info(
38
  f"Loaded {
39
- len(items)} items from '{split_name}' split")
 
40
  break
41
  except Exception as split_error:
42
- logger.debug(
43
- f"Could not load split '{split_name}': {split_error}")
44
  continue
45
 
46
  if not items:
47
  items = self.extract_dataset_items(dataset)
48
  if items:
49
- logger.info(
50
- f"Extracted {len(items)} items from dataset")
51
  except Exception as e:
52
  logger.warning(f"Failed to load {dataset_name}: {e}")
53
- logger.info(
54
- f"Skipping {dataset_name} - dataset has loading issues")
55
  return []
56
 
57
  if not items:
@@ -99,14 +96,12 @@ class EnterpriseTransformer(BaseWarblerTransformer):
99
  )
100
 
101
  task = (
102
- item.get("task", item.get(
103
- "scenario", "Software development chat"))
104
  if isinstance(item, dict)
105
  else "Software development chat"
106
  )
107
  scenario = (
108
- item.get("scenario", item.get(
109
- "task", f"ChatEnv Scenario #{idx + 1}"))
110
  if isinstance(item, dict)
111
  else f"ChatEnv Scenario #{idx + 1}"
112
  )
@@ -138,7 +133,8 @@ class EnterpriseTransformer(BaseWarblerTransformer):
138
 
139
  logger.info(
140
  f"✓ Transformed {
141
- len(warbler_docs)} ChatEnv software development chat entries")
 
142
  return warbler_docs
143
 
144
  @staticmethod
 
16
  class EnterpriseTransformer(BaseWarblerTransformer):
17
  """Transform SustcZhangYX/ChatEnv dataset"""
18
 
19
+ def transform(self, dataset_name: str = "SustcZhangYX/ChatEnv") -> List[Dict[str, Any]]:
 
20
  """
21
  Transform SustcZhangYX/ChatEnv dataset
22
  Format: Software development chat conversations and collaborative coding scenarios
 
35
  items = list(dataset[split_name])
36
  logger.info(
37
  f"Loaded {
38
+ len(items)} items from '{split_name}' split"
39
+ )
40
  break
41
  except Exception as split_error:
42
+ logger.debug(f"Could not load split '{split_name}': {split_error}")
 
43
  continue
44
 
45
  if not items:
46
  items = self.extract_dataset_items(dataset)
47
  if items:
48
+ logger.info(f"Extracted {len(items)} items from dataset")
 
49
  except Exception as e:
50
  logger.warning(f"Failed to load {dataset_name}: {e}")
51
+ logger.info(f"Skipping {dataset_name} - dataset has loading issues")
 
52
  return []
53
 
54
  if not items:
 
96
  )
97
 
98
  task = (
99
+ item.get("task", item.get("scenario", "Software development chat"))
 
100
  if isinstance(item, dict)
101
  else "Software development chat"
102
  )
103
  scenario = (
104
+ item.get("scenario", item.get("task", f"ChatEnv Scenario #{idx + 1}"))
 
105
  if isinstance(item, dict)
106
  else f"ChatEnv Scenario #{idx + 1}"
107
  )
 
133
 
134
  logger.info(
135
  f"✓ Transformed {
136
+ len(warbler_docs)} ChatEnv software development chat entries"
137
+ )
138
  return warbler_docs
139
 
140
  @staticmethod
warbler_cda/utils/transformers/multi_character.py CHANGED
@@ -35,15 +35,12 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
35
 
36
  try:
37
  if "train" not in dataset:
38
- logger.warning(
39
- f"Multi-char: No 'train' split found in dataset")
40
  return []
41
 
42
  train_data = dataset["train"]
43
- total_items = len(train_data) if hasattr(
44
- train_data, "__len__") else 0
45
- logger.info(
46
- f"Processing {total_items} multi-character dialogue items...")
47
 
48
  for idx, item in enumerate(train_data):
49
  if idx > 0 and idx % 1000 == 0:
@@ -53,8 +50,7 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
53
 
54
  try:
55
  if item is None:
56
- logger.warning(
57
- f"Multi-char {idx + 1}: Item is None, skipping")
58
  continue
59
 
60
  if not isinstance(item, dict):
@@ -75,12 +71,12 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
75
  conversation = [] if conversation is None else [conversation]
76
 
77
  if not setting and not conversation:
78
- logger.warning(
79
- f"Multi-char {idx + 1}: Missing essential data, skipping")
80
  continue
81
 
82
- if conversation and not all(isinstance(
83
- msg, (dict, str)) for msg in conversation[:10]):
 
84
  logger.warning(
85
  f"Multi-char {idx + 1}: Invalid conversation structure, skipping"
86
  )
@@ -102,12 +98,10 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
102
  "source_dataset": dataset_name,
103
  "setting": setting[:150] + "..." if len(setting) > 150 else setting,
104
  "character_count": (
105
- len(characters) if isinstance(
106
- characters, list) else 0
107
  ),
108
  "conversation_length": (
109
- len(conversation) if isinstance(
110
- conversation, list) else 0
111
  ),
112
  "realm_type": "narrative",
113
  "realm_label": "multi_character_dialogue",
@@ -129,8 +123,7 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
129
  )
130
  continue
131
  except (KeyboardInterrupt, SystemExit):
132
- logger.warning(
133
- f"Multi-char: Processing interrupted at item {idx + 1}")
134
  raise
135
  except Exception as e:
136
  logger.warning(
@@ -141,10 +134,12 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
141
  except (MemoryError, RecursionError) as critical_error:
142
  logger.error(
143
  f"Multi-char: Critical error during iteration: {
144
- type(critical_error).__name__}: {critical_error}")
 
145
  logger.info(
146
  f"Returning {
147
- len(warbler_docs)} documents processed before error")
 
148
  except (KeyboardInterrupt, SystemExit):
149
  logger.warning(
150
  f"Multi-char: Processing interrupted, returning {len(warbler_docs)} documents"
@@ -153,13 +148,14 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
153
  except Exception as outer_error:
154
  logger.error(
155
  f"Multi-char: Unexpected error during dataset iteration: {
156
- type(outer_error).__name__}: {outer_error}")
 
157
  logger.info(
158
  f"Returning {
159
- len(warbler_docs)} documents processed before error")
 
160
 
161
- logger.info(
162
- f"✓ Transformed {len(warbler_docs)} multi-character entries")
163
  return warbler_docs
164
 
165
  @staticmethod
@@ -185,18 +181,14 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
185
  message_field = msg.get("message", "")
186
 
187
  if not isinstance(from_field, str):
188
- from_field = str(
189
- from_field) if from_field is not None else "Unknown"
190
  if not isinstance(message_field, str):
191
- message_field = str(
192
- message_field) if message_field is not None else ""
193
 
194
  if len(message_field) > 5000:
195
- message_field = message_field[:5000] + \
196
- "... [truncated]"
197
 
198
- conversation_lines.append(
199
- f"{from_field}: {message_field}")
200
 
201
  elif isinstance(msg, str):
202
  if len(msg) > 5000:
@@ -204,33 +196,31 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
204
  conversation_lines.append(msg)
205
 
206
  else:
207
- conversation_lines.append(
208
- f"[Message {msg_idx + 1}: {type(msg).__name__}]")
209
 
210
  except (RecursionError, MemoryError) as critical_err:
211
  logger.warning(
212
- f"Critical error processing conversation message {msg_idx}: {critical_err}")
 
213
  break
214
  except Exception as msg_err:
215
- logger.debug(
216
- f"Error processing conversation message {msg_idx}: {msg_err}")
217
  continue
218
 
219
  if len(conversation) > max_conversation_items:
220
  conversation_lines.append(
221
  f"\n[... {
222
  len(conversation) -
223
- max_conversation_items} more messages truncated]")
 
224
 
225
  conversation_text = (
226
- "\n".join(
227
- conversation_lines) if conversation_lines else "[No conversation available]"
228
  )
229
 
230
  setting = item.get("setting", "[No setting provided]")
231
  if not isinstance(setting, str):
232
- setting = str(
233
- setting) if setting is not None else "[No setting provided]"
234
 
235
  if len(setting) > 2000:
236
  setting = setting[:2000] + "... [truncated]"
@@ -240,8 +230,8 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
 characters = [] if characters is None else [characters]
 
 setting_after = item.get(
- "setting after interaction",
- "[No setting after interaction provided]")
+ "setting after interaction", "[No setting after interaction provided]"
+ )
 if not isinstance(setting_after, str):
 setting_after = (
 str(setting_after)
@@ -257,13 +247,11 @@ class MultiCharacterTransformer(BaseWarblerTransformer):
 if len(characters) > 100:
 characters = characters[:100]
 characters_str = (
- json.dumps(characters, indent=2,
- ensure_ascii=False) + "\n[... truncated]"
+ json.dumps(characters, indent=2, ensure_ascii=False) + "\n[... truncated]"
 )
 else:
 characters_str = (
- json.dumps(characters, indent=2,
- ensure_ascii=False) if characters else "[]"
+ json.dumps(characters, indent=2, ensure_ascii=False) if characters else "[]"
 )
 except (TypeError, ValueError, RecursionError) as json_err:
 logger.debug(f"Error serializing characters to JSON: {json_err}")
@@ -283,8 +271,7 @@ After Interaction: {setting_after}
 This represents a multi-character narrative scenario for NPC interaction training."""
 
 if len(content) > 50000:
- content = content[:50000] + \
- "\n\n[Content truncated due to size]"
+ content = content[:50000] + "\n\n[Content truncated due to size]"
 
 return content
 except Exception as final_err:
 
 try:
 if "train" not in dataset:
+ logger.warning("Multi-char: No 'train' split found in dataset")
 return []
 
 train_data = dataset["train"]
+ total_items = len(train_data) if hasattr(train_data, "__len__") else 0
+ logger.info(f"Processing {total_items} multi-character dialogue items...")
 
 for idx, item in enumerate(train_data):
 if idx > 0 and idx % 1000 == 0:
 
 try:
 if item is None:
+ logger.warning(f"Multi-char {idx + 1}: Item is None, skipping")
 continue
 
 if not isinstance(item, dict):
 
 conversation = [] if conversation is None else [conversation]
 
 if not setting and not conversation:
+ logger.warning(f"Multi-char {idx + 1}: Missing essential data, skipping")
 continue
 
+ if conversation and not all(
+ isinstance(msg, (dict, str)) for msg in conversation[:10]
+ ):
 logger.warning(
 f"Multi-char {idx + 1}: Invalid conversation structure, skipping"
 )
 
 "source_dataset": dataset_name,
 "setting": setting[:150] + "..." if len(setting) > 150 else setting,
 "character_count": (
+ len(characters) if isinstance(characters, list) else 0
 ),
 "conversation_length": (
+ len(conversation) if isinstance(conversation, list) else 0
 ),
 "realm_type": "narrative",
 "realm_label": "multi_character_dialogue",
 
 )
 continue
 except (KeyboardInterrupt, SystemExit):
+ logger.warning(f"Multi-char: Processing interrupted at item {idx + 1}")
 raise
 except Exception as e:
 logger.warning(
 
 except (MemoryError, RecursionError) as critical_error:
 logger.error(
 f"Multi-char: Critical error during iteration: {
+ type(critical_error).__name__}: {critical_error}"
+ )
 logger.info(
 f"Returning {
+ len(warbler_docs)} documents processed before error"
+ )
 except (KeyboardInterrupt, SystemExit):
 logger.warning(
 f"Multi-char: Processing interrupted, returning {len(warbler_docs)} documents"
 
 except Exception as outer_error:
 logger.error(
 f"Multi-char: Unexpected error during dataset iteration: {
+ type(outer_error).__name__}: {outer_error}"
+ )
 logger.info(
 f"Returning {
+ len(warbler_docs)} documents processed before error"
+ )
 
+ logger.info(f"✓ Transformed {len(warbler_docs)} multi-character entries")
 return warbler_docs
 
 @staticmethod
 
 message_field = msg.get("message", "")
 
 if not isinstance(from_field, str):
+ from_field = str(from_field) if from_field is not None else "Unknown"
 if not isinstance(message_field, str):
+ message_field = str(message_field) if message_field is not None else ""
 
 if len(message_field) > 5000:
+ message_field = message_field[:5000] + "... [truncated]"
 
+ conversation_lines.append(f"{from_field}: {message_field}")
 
 elif isinstance(msg, str):
 if len(msg) > 5000:
 conversation_lines.append(msg)
 
 else:
+ conversation_lines.append(f"[Message {msg_idx + 1}: {type(msg).__name__}]")
 
 except (RecursionError, MemoryError) as critical_err:
 logger.warning(
+ f"Critical error processing conversation message {msg_idx}: {critical_err}"
+ )
 break
 except Exception as msg_err:
+ logger.debug(f"Error processing conversation message {msg_idx}: {msg_err}")
 continue
 
 if len(conversation) > max_conversation_items:
 conversation_lines.append(
 f"\n[... {
 len(conversation) -
+ max_conversation_items} more messages truncated]"
+ )
 
 conversation_text = (
+ "\n".join(conversation_lines) if conversation_lines else "[No conversation available]"
 )
 
 setting = item.get("setting", "[No setting provided]")
 if not isinstance(setting, str):
+ setting = str(setting) if setting is not None else "[No setting provided]"
 
 if len(setting) > 2000:
 setting = setting[:2000] + "... [truncated]"
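The multi_character.py hunks above only reflow existing truncation logic; the limits themselves are unchanged. For reference, a minimal standalone sketch of those limits follows (100 entries per character list, 50,000 characters per rendered document). The helper names serialize_characters and cap_content are illustrative and are not part of the module.

import json
from typing import Any

# Illustrative sketch only: helper names are hypothetical, limits mirror the diff.
MAX_CHARACTERS = 100
MAX_CONTENT_CHARS = 50_000


def serialize_characters(characters: Any) -> str:
    """Serialize a character list defensively, truncating oversized inputs."""
    if characters is None:
        characters = []
    if not isinstance(characters, list):
        characters = [characters]
    try:
        if len(characters) > MAX_CHARACTERS:
            return (
                json.dumps(characters[:MAX_CHARACTERS], indent=2, ensure_ascii=False)
                + "\n[... truncated]"
            )
        return json.dumps(characters, indent=2, ensure_ascii=False) if characters else "[]"
    except (TypeError, ValueError, RecursionError):
        return "[Unserializable character data]"


def cap_content(content: str) -> str:
    """Cap a rendered document at 50,000 characters, as the reformatted line does."""
    if len(content) > MAX_CONTENT_CHARS:
        return content[:MAX_CONTENT_CHARS] + "\n\n[Content truncated due to size]"
    return content

The bounds, not the helper structure, are what the diff guarantees; the module applies them inline rather than through separate functions.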
warbler_cda/utils/transformers/novels.py CHANGED
@@ -16,8 +16,7 @@ logger = logging.getLogger(__name__)
 class NovelsTransformer(BaseWarblerTransformer):
 """Transform GOAT-AI/generated-novels dataset"""
 
- def transform(
- self, dataset_name: str = "GOAT-AI/generated-novels") -> List[Dict[str, Any]]:
+ def transform(self, dataset_name: str = "GOAT-AI/generated-novels") -> List[Dict[str, Any]]:
 """
 Transform GOAT-AI/generated-novels dataset
 Format: Full-length generated novels (PDF-based, treated as narrative metadata)
@@ -61,13 +60,7 @@ class NovelsTransformer(BaseWarblerTransformer):
 except Exception:
 item_keys = []
 
- for field in [
- "text",
- "story",
- "content",
- "novel",
- "body",
- "full_text"]:
+ for field in ["text", "story", "content", "novel", "body", "full_text"]:
 try:
 if isinstance(item, dict):
 if field in item and item[field]:
@@ -84,9 +77,9 @@ class NovelsTransformer(BaseWarblerTransformer):
 logger.info(
 f"Novel {
 idx +
- 1}: No text field found, attempting PDF extraction...")
- for pdf_field in [
- "pdf", "file", "document", "content", "data"]:
+ 1}: No text field found, attempting PDF extraction..."
+ )
+ for pdf_field in ["pdf", "file", "document", "content", "data"]:
 try:
 pdf_data = None
 if isinstance(item, dict):
@@ -101,33 +94,37 @@ class NovelsTransformer(BaseWarblerTransformer):
 f"Novel {
 idx +
 1}: Found PDF data in field '{pdf_field}' (type: {
- type(pdf_data).__name__})")
- text = self.extract_pdf_text(
- pdf_data, max_pages=self.max_pdf_pages)
+ type(pdf_data).__name__})"
+ )
+ text = self.extract_pdf_text(pdf_data, max_pages=self.max_pdf_pages)
 if text:
 logger.info(
 f"Novel {
 idx +
 1}: Successfully extracted {
- len(text)} chars from PDF field '{pdf_field}'")
+ len(text)} chars from PDF field '{pdf_field}'"
+ )
 break
 else:
 logger.warning(
 f"Novel {
 idx +
- 1}: PDF field '{pdf_field}' extraction returned no text")
+ 1}: PDF field '{pdf_field}' extraction returned no text"
+ )
 except Exception as e:
 logger.warning(
 f"Novel {
 idx +
 1}: PDF extraction from field '{pdf_field}' failed: {
- type(e).__name__}: {e}")
+ type(e).__name__}: {e}"
+ )
 
 if not text:
 logger.warning(
 f"Novel {
 idx +
- 1}: No text content found. Available fields: {item_keys}")
+ 1}: No text content found. Available fields: {item_keys}"
+ )
 pdf_status = (
 "Enabled"
 if self.has_pdf_support()
@@ -149,11 +146,9 @@ This entry serves as a placeholder for retrieval system testing."""
 title = f"Generated Novel #{idx + 1}"
 try:
 if isinstance(item, dict):
- title = item.get("title", item.get(
- "name", f"Generated Novel #{idx + 1}"))
+ title = item.get("title", item.get("name", f"Generated Novel #{idx + 1}"))
 elif hasattr(item, "get"):
- title = item.get("title", item.get(
- "name", f"Generated Novel #{idx + 1}"))
+ title = item.get("title", item.get("name", f"Generated Novel #{idx + 1}"))
 elif hasattr(item, "__getitem__"):
 title = (
 item.get("title", f"Generated Novel #{idx + 1}")
@@ -193,15 +188,12 @@ This entry serves as a placeholder for retrieval system testing."""
 logger.info(
 f"✓ Transformed {
 len(warbler_docs)} novel chunks from {
- len(items)} novels")
+ len(items)} novels"
+ )
 return warbler_docs
 
 @staticmethod
- def _create_content(
- title: str,
- text_chunk: str,
- chunk_idx: int,
- total_chunks: int) -> str:
+ def _create_content(title: str, text_chunk: str, chunk_idx: int, total_chunks: int) -> str:
 """Create content string for novel chunk"""
 return f"""Novel: {title}
 Part: {chunk_idx + 1} of {total_chunks}
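The novels.py changes above likewise collapse a text-field scan and a PDF fallback onto single lines without changing the lookup order. The sketch below summarizes that order under simplifying assumptions: find_novel_text is an illustrative name, extract_pdf_text is passed in as a stand-in for the transformer's own method, and only string-valued text fields are handled (the module also tolerates other value types).

from typing import Any, Callable, Dict, Optional

# Candidate fields, in the order the diff shows them being tried.
TEXT_FIELDS = ["text", "story", "content", "novel", "body", "full_text"]
PDF_FIELDS = ["pdf", "file", "document", "content", "data"]


def find_novel_text(
    item: Dict[str, Any],
    extract_pdf_text: Callable[..., Optional[str]],
    max_pdf_pages: int,
) -> Optional[str]:
    """Return text from the first populated text field, else fall back to PDF fields."""
    for field in TEXT_FIELDS:
        value = item.get(field)
        if isinstance(value, str) and value.strip():
            return value
    for pdf_field in PDF_FIELDS:
        pdf_data = item.get(pdf_field)
        if not pdf_data:
            continue
        text = extract_pdf_text(pdf_data, max_pages=max_pdf_pages)
        if text:
            return text
    return None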
warbler_cda/utils/transformers/npc_dialogue.py CHANGED
@@ -25,13 +25,17 @@ class NPCDialogueTransformer(BaseWarblerTransformer):
 {
 "Name": "Elandra the Merchant",
 "Biography": "A seasoned trader who has traveled across kingdoms, known for her sharp wit.",
- "Queries": ["What do you sell?", "Can you lower the price?", "Any rare items today?"],
+ "Queries": [
+ "What do you sell?",
+ "Can you lower the price?",
+ "Any rare items today?",
+ ],
 "Responses": [
 "I have wares from distant lands, take a look.",
 "Prices are firm, but quality is unmatched.",
- "Indeed, a relic from the old empire just arrived."
+ "Indeed, a relic from the old empire just arrived.",
 ],
- "Emotions": ["neutral", "greedy", "excited"]
+ "Emotions": ["neutral", "greedy", "excited"],
 },
 {
 "Name": "Tharos the Guard",
@@ -40,20 +44,24 @@ class NPCDialogueTransformer(BaseWarblerTransformer):
 "Responses": [
 "Only citizens may pass without a writ.",
 "Bandits lurk beyond the hills, stay vigilant.",
- "I serve the crown, keeping watch at dawn."
+ "I serve the crown, keeping watch at dawn.",
 ],
- "Emotions": ["serious", "cautious", "stern"]
+ "Emotions": ["serious", "cautious", "stern"],
 },
 {
 "Name": "Lyra the Healer",
 "Biography": "A gentle soul who tends to the wounded, guided by compassion and faith.",
- "Queries": ["Can you heal me?", "What herbs do you use?", "Why do you help strangers?"],
+ "Queries": [
+ "Can you heal me?",
+ "What herbs do you use?",
+ "Why do you help strangers?",
+ ],
 "Responses": [
 "Rest easy, I will mend your wounds.",
 "Chamomile and sage, nature’s gift to us.",
- "Because every life is sacred, no matter the path."
+ "Because every life is sacred, no matter the path.",
 ],
- "Emotions": ["kind", "calm", "hopeful"]
+ "Emotions": ["kind", "calm", "hopeful"],
 },
 ]
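The sample records above gain trailing commas and reflowed Queries lists but keep the same shape. Assuming the Queries, Responses, and Emotions lists are parallel, which the three samples suggest but the diff does not state, they can be rendered as dialogue turns; npc_dialogue_pairs below is an illustrative helper and not part of the transformer.

from typing import Any, Dict, List

# Illustrative helper (not module code): pairs the parallel Queries/Responses/Emotions
# lists from one sample NPC record into dialogue turns.
def npc_dialogue_pairs(npc: Dict[str, Any]) -> List[str]:
    """Render query/response/emotion triples for one NPC record."""
    lines = [f"NPC: {npc.get('Name', 'Unknown')}"]
    triples = zip(npc.get("Queries", []), npc.get("Responses", []), npc.get("Emotions", []))
    for query, response, emotion in triples:
        lines.append(f"Player: {query}")
        lines.append(f"NPC ({emotion}): {response}")
    return lines


example = {
    "Name": "Elandra the Merchant",
    "Queries": ["What do you sell?"],
    "Responses": ["I have wares from distant lands, take a look."],
    "Emotions": ["neutral"],
}
print("\n".join(npc_dialogue_pairs(example)))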
 
warbler_cda/utils/transformers/portuguese_education.py CHANGED
@@ -35,8 +35,7 @@ class PortugueseEducationTransformer(BaseWarblerTransformer):
 items = []
 if hasattr(dataset, "__getitem__") and "train" in dataset:
 items = list(dataset["train"])
- logger.info(
- f"Loaded {len(items)} items from dataset['train']")
+ logger.info(f"Loaded {len(items)} items from dataset['train']")
 else:
 items = self.extract_dataset_items(dataset)
 logger.info(f"Extracted {len(items)} items from dataset")
@@ -48,8 +47,7 @@ class PortugueseEducationTransformer(BaseWarblerTransformer):
 
 for idx, item in enumerate(items):
 if isinstance(item, str):
- logger.warning(
- f"Portuguese doc {idx + 1}: Item is a string, skipping")
+ logger.warning(f"Portuguese doc {idx + 1}: Item is a string, skipping")
 continue
 
 if isinstance(item, dict) or hasattr(item, "__getitem__"):
@@ -63,12 +61,7 @@ class PortugueseEducationTransformer(BaseWarblerTransformer):
 item_keys = []
 
 content = None
- for field in [
- "content",
- "text",
- "body",
- "document",
- "passage"]:
+ for field in ["content", "text", "body", "document", "passage"]:
 try:
 if isinstance(item, dict):
 if field in item and item[field]:
@@ -93,14 +86,14 @@ class PortugueseEducationTransformer(BaseWarblerTransformer):
 pdf_data = item[pdf_field]
 
 if pdf_data:
- if isinstance(
- pdf_data, dict) and "bytes" in pdf_data:
+ if isinstance(pdf_data, dict) and "bytes" in pdf_data:
 pdf_bytes = pdf_data["bytes"]
 logger.info(
 f"Portuguese doc {
 idx +
 1}: Found PDF bytes ({
- len(pdf_bytes)} bytes), extracting...")
+ len(pdf_bytes)} bytes), extracting..."
+ )
 content = self.extract_pdf_text(
 pdf_bytes, max_pages=self.max_pdf_pages
 )
@@ -109,7 +102,8 @@ class PortugueseEducationTransformer(BaseWarblerTransformer):
 f"Portuguese doc {
 idx +
 1}: Found PDF bytes ({
- len(pdf_data)} bytes), extracting...")
+ len(pdf_data)} bytes), extracting..."
+ )
 content = self.extract_pdf_text(
 pdf_data, max_pages=self.max_pdf_pages
 )
@@ -118,7 +112,8 @@ class PortugueseEducationTransformer(BaseWarblerTransformer):
 f"Portuguese doc {
 idx +
 1}: Found PDF data (type: {
- type(pdf_data)}), attempting extraction...")
+ type(pdf_data)}), attempting extraction..."
+ )
 content = self.extract_pdf_text(
 pdf_data, max_pages=self.max_pdf_pages
 )
@@ -128,24 +123,28 @@ class PortugueseEducationTransformer(BaseWarblerTransformer):
 f"Portuguese doc {
 idx +
 1}: Successfully extracted {
- len(content)} chars from PDF")
+ len(content)} chars from PDF"
+ )
 break
 else:
 logger.warning(
 f"Portuguese doc {
- idx + 1}: PDF extraction returned no text")
+ idx + 1}: PDF extraction returned no text"
+ )
 except Exception as e:
 logger.warning(
 f"Portuguese doc {
 idx +
 1}: PDF extraction error: {
- type(e).__name__}: {e}")
+ type(e).__name__}: {e}"
+ )
 
 if not content:
 logger.warning(
 f"Portuguese doc {
 idx +
- 1}: No content found. Available fields: {item_keys}")
+ 1}: No content found. Available fields: {item_keys}"
+ )
 content = f"""[Conteúdo Indisponível]
 
 Este documento (#{idx + 1}) faz parte do dataset Solshine/Portuguese_Language_Education_Texts.
@@ -186,7 +185,8 @@ Esta entrada serve como placeholder para testes do sistema de recuperação."""
 except Exception as e:
 logger.warning(
 f"Portuguese doc {
- idx + 1}: Could not convert item to dict: {e}")
+ idx + 1}: Could not convert item to dict: {e}"
+ )
 item_with_content = {}
 
 item_with_content["content"] = content
@@ -222,8 +222,7 @@ Esta entrada serve como placeholder para testes do sistema de recuperação."""
 }
 warbler_docs.append(doc)
 
- logger.info(
- f"✓ Transformed {len(warbler_docs)} Portuguese education entries")
+ logger.info(f"✓ Transformed {len(warbler_docs)} Portuguese education entries")
 return warbler_docs
 
 @staticmethod
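The portuguese_education.py hunks reformat the PDF-payload branching without changing which payload shapes are accepted: a dict wrapping raw bytes, bare bytes, or anything else handed straight to the extractor. A sketch of that dispatch follows; extract_pdf_content is an illustrative wrapper name and extract_pdf_text stands in for the transformer's own helper.

from typing import Any, Callable, Optional

# Sketch of the PDF payload handling shown in the hunks above (illustrative, not module code).
def extract_pdf_content(
    pdf_data: Any,
    extract_pdf_text: Callable[..., Optional[str]],
    max_pdf_pages: int,
) -> Optional[str]:
    """Dispatch on the three payload shapes the diff distinguishes."""
    if isinstance(pdf_data, dict) and "bytes" in pdf_data:
        # Hugging Face datasets commonly wrap binary columns as {"bytes": ..., "path": ...}.
        return extract_pdf_text(pdf_data["bytes"], max_pages=max_pdf_pages)
    if isinstance(pdf_data, (bytes, bytearray)):
        return extract_pdf_text(pdf_data, max_pages=max_pdf_pages)
    # Any other type: hand it to the helper and let it decide.
    return extract_pdf_text(pdf_data, max_pages=max_pdf_pages)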