Spaces:

tomvaillant
/

graphics-llm

Running

App Files Files Community

Add Vanna

by remdms - opened Nov 11

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+109

-929

Files changed (8) hide show

.gitignore +0 -1
513935c4d2db2d2d/query_results_661f24f3.csv +0 -5
513935c4d2db2d2d/query_results_8b61c5d0.csv +0 -2
513935c4d2db2d2d/query_results_c6e0aed3.csv +0 -9
app.py +75 -268
src/query_intent_classifier.py +0 -240
src/vanna.py +34 -104
src/vanna_query_functions.py +0 -300

.gitignore CHANGED Viewed

@@ -48,4 +48,3 @@ flagged/
 # Logs
 *.log
-memory/chroma.sqlite3


48
49	# Logs
50	*.log

513935c4d2db2d2d/query_results_661f24f3.csv DELETED Viewed

@@ -1,5 +0,0 @@
-id,title,source_url,author,published_date,image_url,type
-1242,These preteen go-kart drivers are spending millions for a shot at F1 racing,https://www.washingtonpost.com/world/interactive/2024/formula-1-karting-children-parents-racing-costs/,The Washington Post,2025-07-17,,spotlight
-1912,A Formula 1 pistop: 2 seconds of adrenaline and pressure,https://www.washingtonpost.com/sports/interactive/2023/formula-one-pitstop-haas-red-bull/,The Washington Post,2023-07-17,,spotlight
-7047,Racing Against History,http://www.nytimes.com/interactive/2012/08/01/sports/olympics/racing-against-history.html?gwh=2D12538F1CD4F05B39F50285EFA1313E,The New York Times,2012-07-17,,spotlight
-442,75 years of innovation: How F1 has evolved since 1950 and where it's headed,https://www.espn.com/espn/feature/story/_/id/43832710/how-f1-evolved-1950-where-headed-2026,ESPN,,,spotlight

513935c4d2db2d2d/query_results_8b61c5d0.csv DELETED Viewed

@@ -1,2 +0,0 @@
-id,title,source_url,author,published_date,image_url,type
-391,Our World | Justdiggit,https://ourworld.justdiggit.org/en/,Just Digg It,2024-01-19,https://towumekminbldlabbyss.supabase.co/storage/v1/object/public/images/posts/share-ourworld-justdiggit.jpg,spotlight

513935c4d2db2d2d/query_results_c6e0aed3.csv DELETED Viewed

@@ -1,9 +0,0 @@
-id,title,source_url,author,published_date,image_url,type
-1242,These preteen go-kart drivers are spending millions for a shot at F1 racing,https://www.washingtonpost.com/world/interactive/2024/formula-1-karting-children-parents-racing-costs/,The Washington Post,2025-07-17,,spotlight
-925,Weed drinks are everywhere in Minnesota. Other states are now embracing them.,https://www.politico.com/news/2024/07/10/minnesota-weed-drinks-00165375,POLITICO,2025-07-17,,spotlight
-1912,A Formula 1 pistop: 2 seconds of adrenaline and pressure,https://www.washingtonpost.com/sports/interactive/2023/formula-one-pitstop-haas-red-bull/,The Washington Post,2023-07-17,,spotlight
-3122,Rising Reality: A look at the difficulties facing communities bracing for climate change all along San Francisco Bay,https://www.sfchronicle.com/projects/2021/san-francisco-bay-area-sea-level-rise-2021/mission-creek,San Francisco Chronicle,2021-07-17,,spotlight
-7047,Racing Against History,http://www.nytimes.com/interactive/2012/08/01/sports/olympics/racing-against-history.html?gwh=2D12538F1CD4F05B39F50285EFA1313E,The New York Times,2012-07-17,,spotlight
-3754,For embracing responsive design,http://www.bostonglobe.com/arts/specials/gardner,Boston Globe,2011-07-17,,spotlight
-46,Privacy Preserving Proximity Tracing,https://tracing.ft0.ch/#/,Privacy Preserving Proximity Tracing,,,spotlight
-442,75 years of innovation: How F1 has evolved since 1950 and where it's headed,https://www.espn.com/espn/feature/story/_/id/43832710/how-f1-evolved-1950-where-headed-2026,ESPN,,,spotlight

app.py CHANGED Viewed

@@ -10,7 +10,6 @@ Now with Datawrapper integration for chart generation!
 import os
 import io
 import asyncio
-import time
 import pandas as pd
 import gradio as gr
 from dotenv import load_dotenv
@@ -19,7 +18,6 @@ from src.datawrapper_client import create_and_publish_chart, get_iframe_html
 from datetime import datetime, timedelta
 from collections import defaultdict
 from src.vanna import VannaComponent
-from src.query_intent_classifier import classify_query, IntentClassifier
 # Load environment variables
 load_dotenv()
@@ -56,32 +54,6 @@ except Exception as e:
     print(f"✗ Error initializing Vanna: {e}")
     raise
-# CSV cleanup function
-def cleanup_old_csv_files():
-    """Delete CSV files older than 24 hours to prevent accumulation"""
-    folder = "513935c4d2db2d2d"
-    if not os.path.exists(folder):
-        return
-    cleaned = 0
-    for file in os.listdir(folder):
-        if file.endswith(".csv"):
-            file_path = os.path.join(folder, file)
-            try:
-                # Check if file is older than 24 hours
-                if os.path.getmtime(file_path) < time.time() - 86400:
-                    os.remove(file_path)
-                    cleaned += 1
-            except Exception as e:
-                print(f"Warning: Could not delete {file_path}: {e}")
-    if cleaned > 0:
-        print(f"✓ Cleaned up {cleaned} old CSV files")
-# Run cleanup on startup
-print("Cleaning up old CSV files...")
-cleanup_old_csv_files()
 def check_rate_limit(request: gr.Request) -> tuple[bool, int]:
     """Check if user has exceeded rate limit"""
     if request is None:
@@ -138,41 +110,23 @@ def recommend_stream(message: str, history: list, request: gr.Request):
         yield f"Error generating response: {str(e)}\n\nPlease check your environment variables (HF_TOKEN, SUPABASE_URL, SUPABASE_KEY) and try again."
-def generate_chart_from_csv(csv_file, user_prompt, api_key):
     """
-    Generate a Datawrapper chart from uploaded CSV and user prompt using user's API key.
     Args:
         csv_file: Uploaded CSV file
         user_prompt: User's description of the chart
-        api_key: User's Datawrapper API key
     Returns:
         HTML string with iframe or error message
     """
-    # Validate API key first
-    if not api_key or api_key.strip() == "":
-        return """
-        <div style='padding: 50px; text-align: center; color: #d9534f;'>
-            <h3>❌ No API Key Provided</h3>
-            <p>Please enter your Datawrapper API key above to generate charts.</p>
-            <p style='margin-top: 15px;'>
-                <a href='https://app.datawrapper.de/account/api-tokens' target='_blank'
-                   style='color: #1976d2; text-decoration: underline;'>Get your API key →</a>
-            </p>
-        </div>
-        """
     if not csv_file:
         return "<div style='padding: 50px; text-align: center;'>Please upload a CSV file to generate a chart.</div>"
     if not user_prompt or user_prompt.strip() == "":
         return "<div style='padding: 50px; text-align: center;'>Please describe what chart you want to create.</div>"
-    # Temporarily set the API key in environment for this request
-    original_key = os.environ.get("DATAWRAPPER_ACCESS_TOKEN")
-    os.environ["DATAWRAPPER_ACCESS_TOKEN"] = api_key
     try:
         # Show loading message
         loading_html = """
@@ -238,15 +192,9 @@ def generate_chart_from_csv(csv_file, user_prompt, api_key):
         <div style='padding: 50px; text-align: center; color: red;'>
             <h3>❌ Error</h3>
             <p>{str(e)}</p>
-            <p style='font-size: 0.9em; color: #666;'>Please ensure your CSV is properly formatted and your API key is correct.</p>
         </div>
         """
-    finally:
-        # Restore original API key or remove it
-        if original_key:
-            os.environ["DATAWRAPPER_ACCESS_TOKEN"] = original_key
-        elif "DATAWRAPPER_ACCESS_TOKEN" in os.environ:
-            del os.environ["DATAWRAPPER_ACCESS_TOKEN"]
 def csv_to_cards_html(csv_text: str) -> str:
     """
@@ -263,7 +211,11 @@ def csv_to_cards_html(csv_text: str) -> str:
             source_url = row.get("source_url", "#")
             author = row.get("author", "Inconnu")
             published_date = row.get("published_date", "")
-            image_url = row.get("image_url", "https://fpoimg.com/800x600?text=Image+not+found")
             cards_html += f"""
             <div style="background: white; border-radius: 10px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);
@@ -275,7 +227,7 @@ def csv_to_cards_html(csv_text: str) -> str:
                     <p style="margin:0; color:#999; font-size:0.8em;">{published_date}</p>
                     <a href="{source_url}" target="_blank"
                        style="display:inline-block; margin-top:8px; font-size:0.9em; color:#1976d2; text-decoration:none;">
-                       🔗 Source
                     </a>
                 </div>
             </div>
@@ -310,60 +262,20 @@ async def search_inspiration_from_database(user_prompt):
         """
     try:
-        # Classify user intent
-        print(f"\n{'='*60}")
-        print(f"[SEARCH] User prompt: {user_prompt}")
-        classifier = IntentClassifier()
-        classification = classifier.classify(user_prompt)
-        print(f"[INTENT] Type: {classification['intent'].value}")
-        print(f"[INTENT] Keywords: {classification['keywords']}")
-        print(f"[INTENT] Inferred tags: {classification['tags']}")
-        print(f"[INTENT] Short query: {classification['is_short_query']}")
-        # Enhance prompt with intent guidance
-        enhanced_prompt = classifier.format_for_vanna(classification)
-        full_prompt = f"{user_prompt}\n\n{enhanced_prompt}"
-        print(f"[VANNA] Sending enhanced prompt to Vanna...")
-        response = await vanna.ask(full_prompt)
-        print(f"[VANNA] Response received: {repr(response)[:200]}...")
-        print(f"{'='*60}\n")
         clean_response = response.strip()
-        # Check for empty query results (0 rows returned)
-        if "No rows returned" in clean_response or "0 rows" in clean_response.lower():
-            return f"""
-            <div style='padding: 50px; text-align: center; color: #f0ad4e;'>
-                <h3>🔍 No Results Found</h3>
-                <p>Your query was executed successfully, but no posts matched your criteria.</p>
-                <p style='margin-top: 15px; font-weight: 600;'>Suggestions:</p>
-                <ul style='list-style: none; padding: 0; text-align: left; display: inline-block;'>
-                    <li>• Try broader keywords (e.g., "visualization" instead of "F1 dataviz")</li>
-                    <li>• Search by author names (e.g., "New York Times")</li>
-                    <li>• Use simple terms (e.g., "interactive", "maps")</li>
-                </ul>
-                <p style='margin-top: 15px; font-style: italic; color: #666; font-size: 0.9em;'>
-                    <strong>Note:</strong> Most posts are currently being enriched with tags.<br/>
-                    Keyword search works for all {classification.get('total_posts', '7,000+')} posts in the database.
-                </p>
-            </div>
-            """
-        # Check for errors or warnings
-        if clean_response.startswith("⚠️") or clean_response.startswith("❌") or "Aucun CSV détecté" in clean_response:
             return f"""
             <div style='padding: 50px; text-align: center; color: #d9534f;'>
-                <h3>❌ Query Error</h3>
-                <p>The AI encountered an issue processing your request.</p>
-                <p style='margin-top: 10px; font-size: 0.9em; color: #666;'>{clean_response[:200]}</p>
-                <p style='margin-top: 15px;'>Try rephrasing your query or being more specific.</p>
             </div>
             """
-        # Process CSV response
         csv_text = (
             clean_response
             .strip("```")
@@ -371,15 +283,11 @@ async def search_inspiration_from_database(user_prompt):
             .replace("CSV", "")
         )
-        # Check if response contains CSV data
-        if "," not in csv_text or "id,title" not in csv_text.lower():
             return f"""
             <div style='padding: 50px; text-align: center; color: #d9534f;'>
-                <h3>❌ Invalid Response Format</h3>
-                <p>The database query didn't return structured data.</p>
-                <p style='margin-top: 10px; font-size: 0.9em; color: #666;'>
-                    This might be a temporary issue. Please try again.
-                </p>
             </div>
             """
@@ -387,17 +295,11 @@ async def search_inspiration_from_database(user_prompt):
         return cards_html
     except Exception as e:
-        print(f"❌ Exception in search_inspiration_from_database: {str(e)}")
-        import traceback
-        traceback.print_exc()
         return f"""
         <div style='padding: 50px; text-align: center; color: red;'>
-            <h3>❌ System Error</h3>
-            <p style='margin-bottom: 10px;'>An unexpected error occurred:</p>
-            <p style='font-family: monospace; font-size: 0.85em; color: #666;'>{str(e)}</p>
-            <p style='margin-top: 15px; font-size: 0.9em; color: #666;'>
-                Please check the console logs for more details.
-            </p>
         </div>
         """
@@ -430,63 +332,18 @@ with gr.Blocks(
     gr.Markdown("""
     # 📊 Viz LLM
-    Discover inspiring visualizations, refine your design ideas, or generate charts using Datawrapper.
     """)
-    # JavaScript for localStorage persistence
-    gr.HTML("""
-    <script>
-        // Save API key to localStorage when it changes
-        function saveApiKeyToStorage(key) {
-            if (key && key.trim() !== '') {
-                localStorage.setItem('datawrapper_api_key', key);
-            }
-        }
-        // Load API key from localStorage on page load
-        function loadApiKeyFromStorage() {
-            return localStorage.getItem('datawrapper_api_key') || '';
-        }
-        // Auto-load API key when the page loads
-        window.addEventListener('DOMContentLoaded', function() {
-            setTimeout(function() {
-                const savedKey = loadApiKeyFromStorage();
-                if (savedKey) {
-                    const apiKeyInput = document.querySelector('input[type="password"]');
-                    if (apiKeyInput) {
-                        apiKeyInput.value = savedKey;
-                        // Trigger change event to update Gradio state
-                        apiKeyInput.dispatchEvent(new Event('input', { bubbles: true }));
-                    }
-                }
-            }, 1000);
-        });
-    </script>
-    """)
-    # Mode selector buttons (reordered: Inspiration, Refinement, Chart)
     with gr.Row():
-        inspiration_btn = gr.Button("✨ Inspiration", variant="primary", elem_classes="mode-button")
-        ideation_btn = gr.Button("💡 Refinement", variant="secondary", elem_classes="mode-button")
-        chart_gen_btn = gr.Button("📊 Chart", variant="secondary", elem_classes="mode-button")
-    # Inspiration Mode: Search interface (shown by default)
-    with gr.Column(visible=True) as inspiration_container:
-      with gr.Row():
-        inspiration_prompt_input = gr.Textbox(
-            placeholder="Search for inspiration (e.g., 'F1', 'interactive maps')...",
-            show_label=False,
-            scale=4,
-            container=False
-        )
-        inspiration_search_btn = gr.Button("🔍 Search", variant="primary", scale=1)
-      inspiration_cards_html = gr.HTML("")
-    # Refinement Mode: Chat interface (hidden by default, wrapped in Column)
-    with gr.Column(visible=False) as ideation_container:
         ideation_interface = gr.ChatInterface(
             fn=recommend_stream,
             type="messages",
@@ -503,32 +360,6 @@ with gr.Blocks(
     # Chart Generation Mode: Chart controls and output (hidden by default)
     with gr.Column(visible=False) as chart_gen_container:
-        gr.Markdown("### Chart Generator")
-        # API Key Input (collapsible)
-        with gr.Accordion("🔑 Datawrapper API Key", open=False):
-            gr.Markdown("""
-            Enter your Datawrapper API key to generate charts. Your key is stored in your browser and persists across sessions.
-            **Get your key**: [Datawrapper Account Settings](https://app.datawrapper.de/account/api-tokens)
-            """)
-            # Warning about permissions
-            gr.HTML("""
-            <div style="background: #fff3cd; border: 1px solid #ffc107; border-radius: 5px; padding: 12px; margin: 10px 0;">
-                <strong>⚠️ Important:</strong> When creating your API key, toggle <strong>ALL permissions</strong> (Read & Write for Charts, Tables, Folders, etc.) otherwise chart generation will fail.
-            </div>
-            """)
-            api_key_input = gr.Textbox(
-                label="API Key",
-                placeholder="Paste your Datawrapper API key here...",
-                type="password",
-                value=""
-            )
-            api_key_status = gr.Markdown("⚠️ Status: No API key provided")
         csv_upload = gr.File(
             label="📁 Upload CSV File",
             file_types=[".csv"],
@@ -548,111 +379,79 @@ with gr.Blocks(
             label="Generated Chart"
         )
-    # API key state management
-    api_key_state = gr.State(value="")
-    def validate_api_key(api_key: str) -> tuple[str, str]:
-        """Validate and store API key"""
-        if not api_key or api_key.strip() == "":
-            return "", "⚠️ Status: No API key provided"
-        # Basic validation (check format)
-        if len(api_key) < 20:
-            return "", "❌ Status: Invalid API key format (too short)"
-        # Key looks valid - it will be saved to localStorage via JavaScript
-        masked_key = f"...{api_key[-6:]}" if len(api_key) > 6 else "***"
-        return api_key, f"✅ Status: API key saved to browser storage (ends with {masked_key})"
-    # Mode switching functions (updated for new order: Inspiration, Refinement, Chart)
-    def switch_to_inspiration():
-        return [
-            gr.update(variant="primary"),  # inspiration_btn
-            gr.update(variant="secondary"),  # ideation_btn
-            gr.update(variant="secondary"),  # chart_gen_btn
-            gr.update(visible=True),  # inspiration_container
-            gr.update(visible=False),  # ideation_container
-            gr.update(visible=False),  # chart_gen_container
-        ]
     def switch_to_ideation():
         return [
-            gr.update(variant="secondary"),  # inspiration_btn
             gr.update(variant="primary"),  # ideation_btn
             gr.update(variant="secondary"),  # chart_gen_btn
-            gr.update(visible=False),  # inspiration_container
             gr.update(visible=True),  # ideation_container
             gr.update(visible=False),  # chart_gen_container
         ]
     def switch_to_chart_gen():
         return [
-            gr.update(variant="secondary"),  # inspiration_btn
             gr.update(variant="secondary"),  # ideation_btn
             gr.update(variant="primary"),  # chart_gen_btn
-            gr.update(visible=False),  # inspiration_container
             gr.update(visible=False),  # ideation_container
             gr.update(visible=True),  # chart_gen_container
         ]
-    # Wire up mode switching (updated order: inspiration, ideation, chart)
-    inspiration_btn.click(
-        fn=switch_to_inspiration,
-        inputs=[],
-        outputs=[inspiration_btn, ideation_btn, chart_gen_btn, inspiration_container, ideation_container, chart_gen_container]
-    )
     ideation_btn.click(
         fn=switch_to_ideation,
         inputs=[],
-        outputs=[inspiration_btn, ideation_btn, chart_gen_btn, inspiration_container, ideation_container, chart_gen_container]
     )
     chart_gen_btn.click(
         fn=switch_to_chart_gen,
         inputs=[],
-        outputs=[inspiration_btn, ideation_btn, chart_gen_btn, inspiration_container, ideation_container, chart_gen_container]
     )
-    # Connect API key validation and localStorage save
-    api_key_input.change(
-        fn=validate_api_key,
-        inputs=[api_key_input],
-        outputs=[api_key_state, api_key_status],
-        js="(key) => { saveApiKeyToStorage(key); return key; }"
     )
-    # Generate chart when button is clicked (now with API key)
     generate_chart_btn.click(
         fn=generate_chart_from_csv,
-        inputs=[csv_upload, chart_prompt_input, api_key_state],
         outputs=[chart_output]
     )
-    # Search inspiration with loading state
-    def search_with_loading(prompt):
-        """Wrapper to show loading state"""
-        if not prompt or not prompt.strip():
-            return """
-            <div style='padding: 50px; text-align: center;'>
-                Please enter a search query.
-            </div>
-            """
-        # Show loading immediately (Gradio will display this first)
-        yield """
-        <div style='padding: 50px; text-align: center;'>
-            <div style='font-size: 2em; margin-bottom: 20px;'>🔍</div>
-            <h3>Searching database...</h3>
-            <p style='color: #666;'>Analyzing your query and generating SQL...</p>
-        </div>
-        """
-        # Run the actual search
-        import asyncio
-        result = asyncio.run(search_inspiration_from_database(prompt))
-        yield result
     inspiration_search_btn.click(
-        fn=search_with_loading,
         inputs=[inspiration_prompt_input],
         outputs=[inspiration_cards_html]
     )
@@ -661,6 +460,12 @@ with gr.Blocks(
     gr.Markdown("""
     ### About Viz LLM
     **Credits:** Special thanks to the researchers whose work informed this model: Robert Kosara, Edward Segel, Jeffrey Heer, Matthew Conlen, John Maeda, Kennedy Elliott, Scott McCloud, and many others.
     ---
@@ -668,19 +473,21 @@ with gr.Blocks(
     **Usage Limits:** This service is limited to 20 queries per day per user to manage costs. Responses are optimized for English.
     <div style="text-align: center; margin-top: 20px; opacity: 0.6; font-size: 0.9em;">
-    Embeddings: Jina-CLIP-v2 | Charts: Datawrapper API | Database: Nuanced
     </div>
     """)
 # Launch configuration
 if __name__ == "__main__":
-    # Check for required environment variables (Datawrapper key now user-provided)
-    required_vars = ["SUPABASE_URL", "SUPABASE_KEY", "HF_TOKEN"]
     missing_vars = [var for var in required_vars if not os.getenv(var)]
     if missing_vars:
         print(f"⚠️  Warning: Missing environment variables: {', '.join(missing_vars)}")
         print("Please set these in your .env file or as environment variables")
     # Launch the app
     demo.launch(

 import os
 import io
 import asyncio
 import pandas as pd
 import gradio as gr
 from dotenv import load_dotenv
 from datetime import datetime, timedelta
 from collections import defaultdict
 from src.vanna import VannaComponent
 # Load environment variables
 load_dotenv()
     print(f"✗ Error initializing Vanna: {e}")
     raise
 def check_rate_limit(request: gr.Request) -> tuple[bool, int]:
     """Check if user has exceeded rate limit"""
     if request is None:
         yield f"Error generating response: {str(e)}\n\nPlease check your environment variables (HF_TOKEN, SUPABASE_URL, SUPABASE_KEY) and try again."
+def generate_chart_from_csv(csv_file, user_prompt):
     """
+    Generate a Datawrapper chart from uploaded CSV and user prompt.
     Args:
         csv_file: Uploaded CSV file
         user_prompt: User's description of the chart
     Returns:
         HTML string with iframe or error message
     """
     if not csv_file:
         return "<div style='padding: 50px; text-align: center;'>Please upload a CSV file to generate a chart.</div>"
     if not user_prompt or user_prompt.strip() == "":
         return "<div style='padding: 50px; text-align: center;'>Please describe what chart you want to create.</div>"
     try:
         # Show loading message
         loading_html = """
         <div style='padding: 50px; text-align: center; color: red;'>
             <h3>❌ Error</h3>
             <p>{str(e)}</p>
+            <p style='font-size: 0.9em; color: #666;'>Please ensure your CSV is properly formatted and try again.</p>
         </div>
         """
 def csv_to_cards_html(csv_text: str) -> str:
     """
             source_url = row.get("source_url", "#")
             author = row.get("author", "Inconnu")
             published_date = row.get("published_date", "")
+            if not published_date == "nan":
+                published_date = ""
+            image_url = row.get("image_url", "")
+            if not image_url == "nan":
+                image_url = "https://fpoimg.com/800x600?text=Image+not+found"
             cards_html += f"""
             <div style="background: white; border-radius: 10px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);
                     <p style="margin:0; color:#999; font-size:0.8em;">{published_date}</p>
                     <a href="{source_url}" target="_blank"
                        style="display:inline-block; margin-top:8px; font-size:0.9em; color:#1976d2; text-decoration:none;">
+                       🔗 Voir la source
                     </a>
                 </div>
             </div>
         """
     try:
+        response = await vanna.ask(user_prompt)
+        print("response :", repr(response))
         clean_response = response.strip()
+        if clean_response.startswith("⚠️") or "Aucun CSV détecté" in clean_response:
             return f"""
             <div style='padding: 50px; text-align: center; color: #d9534f;'>
+                <h3>❌ No valid data found</h3>
+                <p>The AI couldn't generate any data for this request. Try being more specific — for example:
+                   <em>"Show me spotlights from 2020 about design"</em>.</p>
             </div>
             """
         csv_text = (
             clean_response
             .strip("```")
             .replace("CSV", "")
         )
+        if "," not in csv_text:
             return f"""
             <div style='padding: 50px; text-align: center; color: #d9534f;'>
+                <h3>❌ No valid CSV detected</h3>
+                <p>The model didn't return any structured data. Try rephrasing your query to be more precise.</p>
             </div>
             """
         return cards_html
     except Exception as e:
         return f"""
         <div style='padding: 50px; text-align: center; color: red;'>
+            <h3>❌ Error</h3>
+            <p>{str(e)}</p>
+            <p style='font-size: 0.9em; color: #666;'>Please try again.</p>
         </div>
         """
     gr.Markdown("""
     # 📊 Viz LLM
+    Get design recommendations or generate charts with AI-powered data visualization assistance.
     """)
+    # Mode selector buttons
     with gr.Row():
+        ideation_btn = gr.Button("💡 Ideation Mode", variant="primary", elem_classes="mode-button")
+        chart_gen_btn = gr.Button("📊 Chart Generation Mode", variant="secondary", elem_classes="mode-button")
+        inspiration_btn = gr.Button("✨ Inspiration Mode", variant="secondary", elem_classes="mode-button")
+    # Ideation Mode: Chat interface (shown by default, wrapped in Column)
+    with gr.Column(visible=True) as ideation_container:
         ideation_interface = gr.ChatInterface(
             fn=recommend_stream,
             type="messages",
     # Chart Generation Mode: Chart controls and output (hidden by default)
     with gr.Column(visible=False) as chart_gen_container:
         csv_upload = gr.File(
             label="📁 Upload CSV File",
             file_types=[".csv"],
             label="Generated Chart"
         )
+    # Inspiration Mode:
+    with gr.Column(visible=False) as inspiration_container:
+      with gr.Row():
+        inspiration_prompt_input = gr.Textbox(
+            placeholder="Ask for an inspiration...",
+            show_label=False,
+            scale=4,
+            container=False
+        )
+        inspiration_search_btn = gr.Button("🔍 Search", variant="primary", scale=1)
+      inspiration_cards_html = gr.HTML("")
+    # Mode switching functions
     def switch_to_ideation():
         return [
             gr.update(variant="primary"),  # ideation_btn
             gr.update(variant="secondary"),  # chart_gen_btn
+            gr.update(variant="secondary"),  # inspiration_btn
             gr.update(visible=True),  # ideation_container
             gr.update(visible=False),  # chart_gen_container
+            gr.update(visible=False),  # inspiration_container
         ]
     def switch_to_chart_gen():
         return [
             gr.update(variant="secondary"),  # ideation_btn
             gr.update(variant="primary"),  # chart_gen_btn
+            gr.update(variant="secondary"),  # inspiration_btn
             gr.update(visible=False),  # ideation_container
             gr.update(visible=True),  # chart_gen_container
+            gr.update(visible=False),  # inspiration_container
         ]
+    def switch_to_inspiration():
+        return [
+            gr.update(variant="secondary"),  # ideation_btn
+            gr.update(variant="secondary"),  # chart_gen_btn
+            gr.update(variant="primary"),  # inspiration_btn
+            gr.update(visible=False),  # ideation_container
+            gr.update(visible=False),  # chart_gen_container
+            gr.update(visible=True),  # inspiration_container
+        ]
+    # Wire up mode switching
     ideation_btn.click(
         fn=switch_to_ideation,
         inputs=[],
+        outputs=[ideation_btn, chart_gen_btn, inspiration_btn, ideation_container, chart_gen_container, inspiration_container]
     )
     chart_gen_btn.click(
         fn=switch_to_chart_gen,
         inputs=[],
+        outputs=[ideation_btn, chart_gen_btn, inspiration_btn, ideation_container, chart_gen_container, inspiration_container]
     )
+    inspiration_btn.click(
+        fn=switch_to_inspiration,
+        inputs=[],
+        outputs=[ideation_btn, chart_gen_btn, inspiration_btn, ideation_container, chart_gen_container, inspiration_container]
     )
+    # Generate chart when button is clicked
     generate_chart_btn.click(
         fn=generate_chart_from_csv,
+        inputs=[csv_upload, chart_prompt_input],
         outputs=[chart_output]
     )
+    # Search inspiration when button is clicked
     inspiration_search_btn.click(
+        fn=search_inspiration_from_database,
         inputs=[inspiration_prompt_input],
         outputs=[inspiration_cards_html]
     )
     gr.Markdown("""
     ### About Viz LLM
+    **Ideation Mode:** Get design recommendations based on research papers, design principles, and examples from the field of information graphics and data visualization.
+    **Chart Generation Mode:** Upload your CSV data and describe your visualization goal. The AI will analyze your data, select the optimal chart type, and generate a publication-ready chart using Datawrapper.
+    **Inspiration Mode:** Coming soon.
     **Credits:** Special thanks to the researchers whose work informed this model: Robert Kosara, Edward Segel, Jeffrey Heer, Matthew Conlen, John Maeda, Kennedy Elliott, Scott McCloud, and many others.
     ---
     **Usage Limits:** This service is limited to 20 queries per day per user to manage costs. Responses are optimized for English.
     <div style="text-align: center; margin-top: 20px; opacity: 0.6; font-size: 0.9em;">
+    Embeddings: Jina-CLIP-v2 | Charts: Datawrapper API
     </div>
     """)
 # Launch configuration
 if __name__ == "__main__":
+    # Check for required environment variables
+    required_vars = ["SUPABASE_URL", "SUPABASE_KEY", "HF_TOKEN", "DATAWRAPPER_ACCESS_TOKEN"]
     missing_vars = [var for var in required_vars if not os.getenv(var)]
     if missing_vars:
         print(f"⚠️  Warning: Missing environment variables: {', '.join(missing_vars)}")
         print("Please set these in your .env file or as environment variables")
+        if "DATAWRAPPER_ACCESS_TOKEN" in missing_vars:
+            print("Note: DATAWRAPPER_ACCESS_TOKEN is required for chart generation mode")
     # Launch the app
     demo.launch(

src/query_intent_classifier.py DELETED Viewed

@@ -1,240 +0,0 @@
-"""
-Query Intent Classifier for Hybrid Search
-Analyzes user queries to determine the best search strategy:
-- keyword: Full-text search on title/author/provider (works for all posts)
-- tag: Tag-based search (works only for tagged posts)
-- hybrid: Try both approaches
-"""
-import re
-from typing import Dict, List
-from enum import Enum
-class QueryIntent(Enum):
-    KEYWORD = "keyword"
-    TAG = "tag"
-    HYBRID = "hybrid"
-class IntentClassifier:
-    """
-    Classifies user queries and extracts relevant search parameters.
-    """
-    # Keywords that suggest tag search
-    TAG_INDICATORS = ["tagged", "category", "topic", "theme", "type", "about"]
-    # Common keywords to expand for better matching
-    KEYWORD_EXPANSIONS = {
-        "f1": ["f1", "formula 1", "formula one", "racing"],
-        "dataviz": ["dataviz", "data visualization", "visualization", "chart", "graph"],
-        "interactive": ["interactive", "interaction", "explore"],
-        "map": ["map", "maps", "mapping", "geographic", "geo"],
-        "nyt": ["new york times", "nyt", "ny times"],
-    }
-    def __init__(self):
-        pass
-    def classify(self, user_prompt: str) -> Dict:
-        """
-        Classify user intent and extract search parameters.
-        Args:
-            user_prompt: The user's search query
-        Returns:
-            Dict with:
-                - intent: QueryIntent enum
-                - keywords: List of keywords to search
-                - tags: List of potential tags to search
-                - original_query: Original user prompt
-        """
-        prompt_lower = user_prompt.lower().strip()
-        # Detect intent
-        intent = self._detect_intent(prompt_lower)
-        # Extract keywords
-        keywords = self._extract_keywords(prompt_lower)
-        # Infer potential tags
-        tags = self._infer_tags(prompt_lower, keywords)
-        return {
-            "intent": intent,
-            "keywords": keywords,
-            "tags": tags,
-            "original_query": user_prompt,
-            "is_short_query": len(prompt_lower.split()) <= 3
-        }
-    def _detect_intent(self, prompt: str) -> QueryIntent:
-        """
-        Determine if user wants tag search, keyword search, or hybrid.
-        """
-        # Check for tag indicators
-        has_tag_indicator = any(indicator in prompt for indicator in self.TAG_INDICATORS)
-        # Short queries (1-3 words) should try hybrid approach
-        word_count = len(prompt.split())
-        if has_tag_indicator:
-            return QueryIntent.TAG
-        elif word_count <= 3:
-            # Short queries: try both tag and keyword search
-            return QueryIntent.HYBRID
-        else:
-            # Longer natural language queries: keyword search first
-            return QueryIntent.KEYWORD
-    def _extract_keywords(self, prompt: str) -> List[str]:
-        """
-        Extract meaningful keywords from the prompt.
-        """
-        # Remove common stop words
-        stop_words = {
-            "show", "me", "find", "get", "search", "for", "the", "a", "an",
-            "with", "about", "of", "in", "on", "at", "to", "from", "by",
-            "what", "where", "when", "who", "how", "is", "are", "was", "were"
-        }
-        # Split and clean
-        words = re.findall(r'\b\w+\b', prompt.lower())
-        # Allow 2-character words like "F1", "AI", "3D"
-        keywords = [w for w in words if w not in stop_words and len(w) >= 2]
-        # Expand known keywords
-        expanded_keywords = []
-        for keyword in keywords:
-            if keyword in self.KEYWORD_EXPANSIONS:
-                expanded_keywords.extend(self.KEYWORD_EXPANSIONS[keyword])
-            else:
-                expanded_keywords.append(keyword)
-        # Remove duplicates while preserving order
-        return list(dict.fromkeys(expanded_keywords))
-    def _infer_tags(self, prompt: str, keywords: List[str]) -> List[str]:
-        """
-        Infer potential tag names from keywords.
-        Since we have limited tags in the database, we map common terms
-        to likely tag names.
-        """
-        # Common tag mappings based on the database
-        tag_mappings = {
-            "f1": ["f1", "racing", "motorsport", "sports"],
-            "formula": ["f1", "racing", "motorsport"],
-            "racing": ["racing", "motorsport", "f1"],
-            "dataviz": ["dataviz", "visualization"],
-            "visualization": ["dataviz", "visualization"],
-            "interactive": ["interactive"],
-            "map": ["maps", "geographic"],
-            "maps": ["maps", "geographic"],
-            "math": ["mathematics", "statistics"],
-            "statistics": ["statistics", "mathematics"],
-            "africa": ["africa", "kenya", "tanzania"],
-            "sustainability": ["sustainability", "regreening"],
-            "documentary": ["documentary", "cinematic"],
-            "education": ["students", "researchers"],
-        }
-        inferred_tags = []
-        for keyword in keywords:
-            if keyword in tag_mappings:
-                inferred_tags.extend(tag_mappings[keyword])
-        # If no specific mapping, use the keyword as-is
-        if not inferred_tags:
-            inferred_tags = keywords[:3]  # Limit to top 3 keywords
-        # Remove duplicates
-        return list(dict.fromkeys(inferred_tags))
-    def format_for_vanna(self, classification: Dict) -> str:
-        """
-        Format the classification result for Vanna's prompt.
-        Returns a string that guides Vanna's SQL generation.
-        """
-        intent = classification["intent"]
-        keywords = classification["keywords"]
-        tags = classification["tags"]
-        if intent == QueryIntent.KEYWORD:
-            keyword_example = keywords[0] if keywords else "keyword"
-            return f"""
-Search using KEYWORD approach:
-- Search terms: {', '.join(keywords)}
-- Search in: posts.title, posts.author, providers.name
-- Use LOWER(column) LIKE '%keyword%' for flexible matching
-- Example: LOWER(p.title) LIKE '%{keyword_example}%'
-- This matches word variants: '{keyword_example}', '{keyword_example}n', '{keyword_example}\\'s', etc.
-"""
-        elif intent == QueryIntent.TAG:
-            return f"""
-Search using TAG approach:
-- Tag names: {', '.join(tags)}
-- 88% of posts (3,362) have tags - tag search is highly effective!
-- Use LOWER(t.name) LIKE '%tagname%' for flexible matching
-- Join with post_tags and tags tables
-"""
-        else:  # HYBRID
-            return f"""
-Search using HYBRID approach (RECOMMENDED):
-- Tags to search: {', '.join(tags)}
-- Keywords to search: {', '.join(keywords)}
-- Use OR logic: tag matches OR keyword matches in title/author
-- 88% of posts have tags, so tag search is primary
-Recommended SQL pattern:
-SELECT DISTINCT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type
-FROM posts p
-LEFT JOIN post_tags pt ON p.id = pt.post_id
-LEFT JOIN tags t ON pt.tag_id = t.id
-LEFT JOIN providers pr ON p.provider_id = pr.id
-WHERE (
-    {' OR '.join(f"LOWER(t.name) LIKE '%{tag}%'" for tag in tags)}
-    OR {' OR '.join(f"LOWER(p.title) LIKE '%{kw}%'" for kw in keywords)}
-    OR {' OR '.join(f"LOWER(p.author) LIKE '%{kw}%'" for kw in keywords)}
-)
-ORDER BY p.published_date DESC NULLS LAST
-LIMIT 50
-"""
-# Convenience function
-def classify_query(user_prompt: str) -> Dict:
-    """
-    Classify a user query and return search parameters.
-    """
-    classifier = IntentClassifier()
-    return classifier.classify(user_prompt)
-# Example usage
-if __name__ == "__main__":
-    # Test cases
-    test_queries = [
-        "F1",
-        "Show me F1 content",
-        "interactive visualizations",
-        "New York Times articles",
-        "content tagged with dataviz",
-        "recent sustainability projects in Africa",
-    ]
-    classifier = IntentClassifier()
-    for query in test_queries:
-        result = classifier.classify(query)
-        print(f"\nQuery: '{query}'")
-        print(f"Intent: {result['intent'].value}")
-        print(f"Keywords: {result['keywords']}")
-        print(f"Tags: {result['tags']}")
-        print(f"Short query: {result['is_short_query']}")

src/vanna.py CHANGED Viewed

@@ -55,6 +55,9 @@ class CustomSQLSystemPromptBuilder(SystemPromptBuilder):
             "- Never use SELECT *\n"
             "- Prefer window functions over subqueries when possible\n"
             "- Always include a LIMIT for exploratory queries\n"
             "- Format dates and numbers for readability\n"
         )
@@ -68,7 +71,7 @@ class CustomSQLSystemPromptBuilder(SystemPromptBuilder):
             prompt += (
                 "\n## Database Schema\n"
                 "Tables:\n"
-                "- posts (id, title, source_url, author, published_date, image_url, type, provider_id, created_at, updated_at, content_markdown, fts)\n"
                 "- providers (id, name)\n"
                 "- provider_attributes (id, provider_id, type, name)\n"
                 "- post_provider_attributes (post_id, attribute_id)\n"
@@ -96,7 +99,6 @@ class CustomSQLSystemPromptBuilder(SystemPromptBuilder):
             "- `providers.name`: name of the publishing organization (e.g., 'Nuanced', 'SND').\n"
             "- `tags.name`: thematic keyword or topic (e.g., '3D', 'AI', 'Design').\n"
             "- `post_tags.weight`: relevance score between a post and a tag.\n"
-            "- `posts.fts`: tsvector column for full-text search (auto-generated from title and author).\n"
         )
         # ======================
@@ -104,38 +106,15 @@ class CustomSQLSystemPromptBuilder(SystemPromptBuilder):
         # ======================
         prompt += (
             "\n## Business Logic\n"
             "- A query mentioning an organization (e.g., 'New York Times') should search both `posts.author` and `providers.name`.\n"
-            "- Return all post types (spotlight, resource, insight) unless the user specifies otherwise.\n"
             "- Tags link posts to specific themes or disciplines.\n"
             "- A single post may have multiple tags, awards, or categories.\n"
             "- If the user mentions a year (e.g., 'in 2021'), filter with `EXTRACT(YEAR FROM published_date) = 2021`.\n"
             "- If the user says 'recently', filter posts from the last 90 days.\n"
-            "- Default limit is 50 rows for search results. Use OFFSET for pagination if needed.\n"
-            "\n"
-            "## Search Strategy\n"
-            "**TAG COVERAGE**: 3,362 posts (88%) have tags. Tag-based search is highly effective!\n"
-            "- 9,105 tags available including countries (russia, china, usa), topics (climate change, politics), and formats (interactive, dataviz)\n"
-            "- Use tag matching as PRIMARY search for topic-based queries\n"
-            "\n"
-            "**Hybrid Search Approach (RECOMMENDED)**:\n"
-            "- Combine tag search AND keyword search with OR logic for maximum coverage\n"
-            "- Use LEFT JOINs for tags to also include the 12% of untagged posts\n"
-            "\n"
-            "**Keyword Matching - Use ILIKE for Flexible Matching**:\n"
-            "- Use LOWER(column) LIKE '%keyword%' for case-insensitive substring matching\n"
-            "- Example: LOWER(p.title) LIKE '%russia%' matches 'Russia', 'Russian', 'Russia\\'s', etc.\n"
-            "- This ensures word variants are captured (much better than exact word boundary matching)\n"
-            "- For multi-word searches: LOWER(p.title) LIKE '%new york%'\n"
-            "\n"
-            "**Full-Text Search (for relevance ranking)**:\n"
-            "- The posts table has an 'fts' column (tsvector) for full-text search\n"
-            "- Use: p.fts @@ plainto_tsquery('english', 'search terms')\n"
-            "- For relevance-ranked results: ORDER BY ts_rank(p.fts, plainto_tsquery('english', 'search terms')) DESC\n"
-            "- FTS handles stemming automatically: 'visualization' matches 'visualizations'\n"
-            "- Combine FTS with ILIKE fallback: WHERE p.fts @@ query OR LOWER(p.title) LIKE '%keyword%'\n"
-            "\n"
-            "**When to use tag-only search**: Only if user explicitly mentions 'tagged with' or 'tag:'.\n"
-            "**When to use keyword-only search**: For author/organization names.\n"
         )
         # ======================
@@ -166,38 +145,21 @@ class CustomSQLSystemPromptBuilder(SystemPromptBuilder):
         # ======================
         prompt += (
             "\n## Example Interactions\n"
-            "User: 'F1' or 'Show me F1 content'\n"
-            "Assistant: [call run_sql with \"SELECT DISTINCT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
             "FROM posts p "
-            "LEFT JOIN post_tags pt ON p.id = pt.post_id "
-            "LEFT JOIN tags t ON pt.tag_id = t.id "
-            "LEFT JOIN providers pr ON p.provider_id = pr.id "
-            "WHERE (LOWER(t.name) LIKE '%f1%' OR LOWER(t.name) LIKE '%formula%' "
-            "OR LOWER(p.title) LIKE '%f1%' OR LOWER(p.title) LIKE '%formula%' "
-            "OR LOWER(p.author) LIKE '%f1%') "
-            "ORDER BY p.published_date DESC NULLS LAST LIMIT 50;\"]\n"
             "\nUser: 'Show me posts from The New York Times'\n"
-            "Assistant: [call run_sql with \"SELECT DISTINCT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
             "FROM posts p "
-            "LEFT JOIN providers pr ON p.provider_id = pr.id "
-            "WHERE (LOWER(p.author) LIKE '%new york times%' OR LOWER(pr.name) LIKE '%new york times%') "
-            "ORDER BY p.published_date DESC NULLS LAST LIMIT 50;\"]\n"
-            "\nUser: 'Russia' or 'Show me Russia content'\n"
-            "Assistant: [call run_sql with \"SELECT DISTINCT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
-            "FROM posts p "
-            "LEFT JOIN post_tags pt ON p.id = pt.post_id "
-            "LEFT JOIN tags t ON pt.tag_id = t.id "
-            "WHERE (LOWER(t.name) LIKE '%russia%' "
-            "OR LOWER(p.title) LIKE '%russia%' OR LOWER(p.author) LIKE '%russia%') "
-            "ORDER BY p.published_date DESC NULLS LAST LIMIT 50;\"]\n"
-            "\nUser: 'interactive visualizations'\n"
-            "Assistant: [call run_sql with \"SELECT DISTINCT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
-            "FROM posts p "
-            "LEFT JOIN post_tags pt ON p.id = pt.post_id "
-            "LEFT JOIN tags t ON pt.tag_id = t.id "
-            "WHERE (LOWER(t.name) LIKE '%interactive%' OR LOWER(p.title) LIKE '%interactive%' "
-            "OR LOWER(p.title) LIKE '%visualization%' OR LOWER(t.name) LIKE '%dataviz%') "
-            "ORDER BY p.published_date DESC NULLS LAST LIMIT 50;\"]\n"
         )
         # ======================
@@ -205,6 +167,8 @@ class CustomSQLSystemPromptBuilder(SystemPromptBuilder):
         # ======================
         prompt += (
             "\nIMPORTANT:\n"
             "- Always return **only the raw CSV result** — no explanations, no JSON, no commentary.\n"
             "- Stop tool execution once the query result is obtained.\n"
         )
@@ -233,8 +197,8 @@ class VannaComponent:
         db_tool = RunSqlTool(sql_runner=self.sql_runner)
         agent_memory = DemoAgentMemory(max_items=1000)
-        save_memory_tool = SaveQuestionToolArgsTool()
-        search_memory_tool = SearchSavedCorrectToolUsesTool()
         self.user_resolver = SimpleUserResolver()
@@ -247,46 +211,32 @@ class VannaComponent:
             llm_service=llm,
             tool_registry=tools,
             user_resolver=self.user_resolver,
-            agent_memory=agent_memory,
             system_prompt_builder=CustomSQLSystemPromptBuilder("CoJournalist", self.sql_runner),
-            config=AgentConfig(stream_responses=False, max_tool_iterations=3)
         )
     async def ask(self, prompt_for_llm: str):
         ctx = RequestContext()
-        print(f"\n{'='*80}")
-        print(f"🙋 User Query: {prompt_for_llm}")
-        print(f"{'='*80}\n")
         final_text = ""
         seen_texts = set()
-        query_executed = False
-        result_row_count = 0
         async for component in self.agent.send_message(request_context=ctx, message=prompt_for_llm):
             simple = getattr(component, "simple_component", None)
             text = getattr(simple, "text", "") if simple else ""
             if text and text not in seen_texts:
-                print(f"💬 LLM Response: {text[:300]}...")
                 final_text += text + "\n"
                 seen_texts.add(text)
             sql_query = getattr(component, "sql", None)
             if sql_query:
-                query_executed = True
-                print(f"\n🧾 SQL Query Generated:")
-                print(f"{'-'*80}")
-                print(f"{sql_query}")
-                print(f"{'-'*80}\n")
             metadata = getattr(component, "metadata", None)
             if metadata:
-                print(f"📋 Query Metadata: {metadata}")
-                result_row_count = metadata.get("row_count", 0)
-                if result_row_count == 0:
-                    print(f"⚠️  Query returned 0 rows - no data matched the criteria")
-                else:
-                    print(f"✅ Query returned {result_row_count} rows")
             component_type = getattr(component, "type", None)
             if component_type:
@@ -295,36 +245,16 @@ class VannaComponent:
             match = re.search(r"query_results_[\w-]+\.csv", final_text)
             if match:
                 filename = match.group(0)
-                # Calculate the user-specific folder based on the default user ID
-                import hashlib
-                user_hash = hashlib.sha256("guest@example.com".encode()).hexdigest()[:16]
-                folder = user_hash
                 full_path = os.path.join(folder, filename)
-                print(f"\n📁 Looking for CSV file: {full_path}")
-                # Create folder if it doesn't exist
-                if not os.path.exists(folder):
-                    print(f"📂 Creating user directory: {folder}")
-                    os.makedirs(folder, exist_ok=True)
                 if os.path.exists(full_path):
-                    print(f"✅ Found CSV file, reading contents...")
                     with open(full_path, "r", encoding="utf-8") as f:
                         csv_data = f.read().strip()
-                    print(f"📊 CSV Data Preview: {csv_data[:200]}...")
-                    print(f"{'='*80}\n")
                     return csv_data
                 else:
-                    print(f"❌ CSV file not found at: {full_path}")
-                    # List files in the directory to help debug
-                    if os.path.exists(folder):
-                        files = os.listdir(folder)
-                        print(f"📂 Files in {folder}: {files}")
-        print(f"\n{'='*80}")
-        if not query_executed:
-            print(f"⚠️  No SQL query was executed by the LLM")
-        print(f"📤 Returning final response to user")
-        print(f"{'='*80}\n")
         return final_text

             "- Never use SELECT *\n"
             "- Prefer window functions over subqueries when possible\n"
             "- Always include a LIMIT for exploratory queries\n"
+            "- Exclude posts where provider = 'SND'\n"
+            "- Exclude posts where type = 'resource'\n"
+            "- Exclude posts where type = 'insight'\n"
             "- Format dates and numbers for readability\n"
         )
             prompt += (
                 "\n## Database Schema\n"
                 "Tables:\n"
+                "- posts (id, title, source_url, author, published_date, image_url, type, provider_id, created_at, updated_at)\n"
                 "- providers (id, name)\n"
                 "- provider_attributes (id, provider_id, type, name)\n"
                 "- post_provider_attributes (post_id, attribute_id)\n"
             "- `providers.name`: name of the publishing organization (e.g., 'Nuanced', 'SND').\n"
             "- `tags.name`: thematic keyword or topic (e.g., '3D', 'AI', 'Design').\n"
             "- `post_tags.weight`: relevance score between a post and a tag.\n"
         )
         # ======================
         # ======================
         prompt += (
             "\n## Business Logic\n"
+            "- Providers named 'SND' must always be excluded.\n"
             "- A query mentioning an organization (e.g., 'New York Times') should search both `posts.author` and `providers.name`.\n"
+            "- By default, only posts with `type = 'spotlight'` are returned.\n"
+            "- Posts of type `resource` or `insight` are excluded unless explicitly requested.\n"
             "- Tags link posts to specific themes or disciplines.\n"
             "- A single post may have multiple tags, awards, or categories.\n"
             "- If the user mentions a year (e.g., 'in 2021'), filter with `EXTRACT(YEAR FROM published_date) = 2021`.\n"
             "- If the user says 'recently', filter posts from the last 90 days.\n"
+            "- Always limit exploratory results to 9 rows.\n"
         )
         # ======================
         # ======================
         prompt += (
             "\n## Example Interactions\n"
+            "User: 'Show me posts related to 3D'\n"
+            "Assistant: [call run_sql with \"SELECT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
             "FROM posts p "
+            "JOIN post_tags pt ON p.id = pt.post_id "
+            "JOIN tags t ON pt.tag_id = t.id "
+            "JOIN providers pr ON p.provider_id = pr.id "
+            "WHERE t.name ILIKE '%3D%' AND pr.name != 'SND' AND p.type = 'spotlight' "
+            "LIMIT 9;\"]\n"
             "\nUser: 'Show me posts from The New York Times'\n"
+            "Assistant: [call run_sql with \"SELECT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
             "FROM posts p "
+            "LEFT JOIN providers pr ON pr.id = p.provider_id "
+            "WHERE (LOWER(p.author) LIKE '%new york times%' OR LOWER(pr.name) LIKE '%new york times%') "
+            "AND pr.name != 'SND' AND p.type = 'spotlight' "
+            "LIMIT 9;\"]\n"
         )
         # ======================
         # ======================
         prompt += (
             "\nIMPORTANT:\n"
+            "- Always exclude posts with provider = 'SND'.\n"
+            "- Always exclude posts with type = 'resource' or 'insight'.\n"
             "- Always return **only the raw CSV result** — no explanations, no JSON, no commentary.\n"
             "- Stop tool execution once the query result is obtained.\n"
         )
         db_tool = RunSqlTool(sql_runner=self.sql_runner)
         agent_memory = DemoAgentMemory(max_items=1000)
+        save_memory_tool = SaveQuestionToolArgsTool(agent_memory)
+        search_memory_tool = SearchSavedCorrectToolUsesTool(agent_memory)
         self.user_resolver = SimpleUserResolver()
             llm_service=llm,
             tool_registry=tools,
             user_resolver=self.user_resolver,
             system_prompt_builder=CustomSQLSystemPromptBuilder("CoJournalist", self.sql_runner),
+            config=AgentConfig(stream_responses=False, max_tool_iterations=1)
         )
     async def ask(self, prompt_for_llm: str):
         ctx = RequestContext()
+        print(f"🙋 Prompt sent to LLM: {prompt_for_llm}")
         final_text = ""
         seen_texts = set()
         async for component in self.agent.send_message(request_context=ctx, message=prompt_for_llm):
             simple = getattr(component, "simple_component", None)
             text = getattr(simple, "text", "") if simple else ""
             if text and text not in seen_texts:
+                print(f"💬 LLM says (part): {text[:200]}...")
                 final_text += text + "\n"
                 seen_texts.add(text)
             sql_query = getattr(component, "sql", None)
             if sql_query:
+                print(f"🧾 SQL Query Generated: {sql_query}")
             metadata = getattr(component, "metadata", None)
             if metadata:
+                print(f"📋 Metadata: {metadata}")
             component_type = getattr(component, "type", None)
             if component_type:
             match = re.search(r"query_results_[\w-]+\.csv", final_text)
             if match:
                 filename = match.group(0)
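+                # NOTE(review): hard-coded results folder; presumably the first 16 hex chars of
+                # sha256("guest@example.com"), which the previous revision computed at runtime — confirm.
+                folder = "513935c4d2db2d2d"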
                 full_path = os.path.join(folder, filename)
                 if os.path.exists(full_path):
+                    print(f"📂 Reading result file: {full_path}")
                     with open(full_path, "r", encoding="utf-8") as f:
                         csv_data = f.read().strip()
+                    print("🤖 Response sent to user (from file):", csv_data[:300])
                     return csv_data
                 else:
+                    print(f"⚠️ File not found: {full_path}")
         return final_text

src/vanna_query_functions.py DELETED Viewed

@@ -1,300 +0,0 @@
-"""
-Vanna Query Function Templates
-Defines SQL templates for different search strategies.
-These are used by Vanna to generate accurate, performant SQL queries.
-"""
-from typing import Dict, List
-class QueryFunctions:
-    """
-    Collection of SQL query templates for different search strategies.
-    """
-    @staticmethod
-    def keyword_search(keywords: List[str], limit: int = 9) -> str:
-        """
-        Full-text keyword search across title, author, and provider.
-        Works for all posts in the database (7,248 posts).
-        Args:
-            keywords: List of keywords to search for
-            limit: Maximum number of results
-        Returns:
-            SQL query string
-        """
-        # Build regex conditions for each keyword with word boundaries
-        # Use PostgreSQL ~* operator for case-insensitive regex matching
-        # \m and \M are word boundary markers (start/end of word)
-        keyword_conditions = []
-        for keyword in keywords:
-            keyword_lower = keyword.lower()
-            # Escape special regex characters
-            keyword_escaped = keyword_lower.replace('\\', '\\\\').replace('.', '\\.').replace('+', '\\+')
-            keyword_conditions.append(f"""
-                (p.title ~* '\\m{keyword_escaped}\\M'
-                OR p.author ~* '\\m{keyword_escaped}\\M'
-                OR pr.name ~* '\\m{keyword_escaped}\\M')
-            """)
-        where_clause = " OR ".join(keyword_conditions)
-        return f"""
-SELECT DISTINCT
-    p.id,
-    p.title,
-    p.source_url,
-    p.author,
-    p.published_date,
-    p.image_url,
-    p.type,
-    pr.name as provider_name
-FROM posts p
-LEFT JOIN providers pr ON p.provider_id = pr.id
-WHERE {where_clause}
-ORDER BY p.published_date DESC NULLS LAST
-LIMIT {limit};
-        """
-    @staticmethod
-    def tag_search(tags: List[str], limit: int = 9) -> str:
-        """
-        Tag-based search.
-        Currently works for only 3 posts with tags.
-        As more posts are tagged, this will return more results.
-        Args:
-            tags: List of tag names to search for
-            limit: Maximum number of results
-        Returns:
-            SQL query string
-        """
-        # Format tag array for SQL
-        tags_lower = [f"'{tag.lower()}'" for tag in tags]
-        tags_array = f"ARRAY[{', '.join(tags_lower)}]"
-        return f"""
-SELECT DISTINCT
-    p.id,
-    p.title,
-    p.source_url,
-    p.author,
-    p.published_date,
-    p.image_url,
-    p.type,
-    pr.name as provider_name,
-    string_agg(DISTINCT t.name, ', ') as tags
-FROM posts p
-JOIN post_tags pt ON p.id = pt.post_id
-JOIN tags t ON pt.tag_id = t.id
-LEFT JOIN providers pr ON p.provider_id = pr.id
-WHERE LOWER(t.name) = ANY({tags_array})
-GROUP BY p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type, pr.name
-ORDER BY p.published_date DESC NULLS LAST
-LIMIT {limit};
-        """
-    @staticmethod
-    def hybrid_search(keywords: List[str], tags: List[str], limit: int = 9) -> str:
-        """
-        Hybrid search combining tags AND keywords.
-        Best of both worlds:
-        - Finds tagged posts (currently 3)
-        - Falls back to keyword search for untagged posts (7,245)
-        Args:
-            keywords: List of keywords to search for
-            tags: List of tag names to search for
-            limit: Maximum number of results
-        Returns:
-            SQL query string
-        """
-        # Build tag conditions
-        tags_lower = [f"'{tag.lower()}'" for tag in tags]
-        tags_array = f"ARRAY[{', '.join(tags_lower)}]"
-        # Build regex keyword conditions with word boundaries
-        keyword_conditions = []
-        for keyword in keywords:
-            keyword_lower = keyword.lower()
-            # Escape special regex characters
-            keyword_escaped = keyword_lower.replace('\\', '\\\\').replace('.', '\\.').replace('+', '\\+')
-            keyword_conditions.append(f"""
-                (p.title ~* '\\m{keyword_escaped}\\M'
-                OR p.author ~* '\\m{keyword_escaped}\\M'
-                OR pr.name ~* '\\m{keyword_escaped}\\M')
-            """)
-        keyword_where = " OR ".join(keyword_conditions)
-        return f"""
-SELECT DISTINCT
-    p.id,
-    p.title,
-    p.source_url,
-    p.author,
-    p.published_date,
-    p.image_url,
-    p.type,
-    pr.name as provider_name,
-    string_agg(DISTINCT t.name, ', ') as tags
-FROM posts p
-LEFT JOIN post_tags pt ON p.id = pt.post_id
-LEFT JOIN tags t ON pt.tag_id = t.id
-LEFT JOIN providers pr ON p.provider_id = pr.id
-WHERE
-    LOWER(t.name) = ANY({tags_array})
-    OR ({keyword_where})
-GROUP BY p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type, pr.name
-ORDER BY p.published_date DESC NULLS LAST
-LIMIT {limit};
-        """
-    @staticmethod
-    def search_by_author(author: str, limit: int = 9) -> str:
-        """
-        Search posts by specific author or organization.
-        Args:
-            author: Author name to search for
-            limit: Maximum number of results
-        Returns:
-            SQL query string
-        """
-        # Escape special regex characters
-        author_escaped = author.lower().replace('\\', '\\\\').replace('.', '\\.').replace('+', '\\+')
-        return f"""
-SELECT DISTINCT
-    p.id,
-    p.title,
-    p.source_url,
-    p.author,
-    p.published_date,
-    p.image_url,
-    p.type,
-    pr.name as provider_name
-FROM posts p
-LEFT JOIN providers pr ON p.provider_id = pr.id
-WHERE
-    p.author ~* '\\m{author_escaped}\\M'
-    OR pr.name ~* '\\m{author_escaped}\\M'
-ORDER BY p.published_date DESC NULLS LAST
-LIMIT {limit};
-        """
-    @staticmethod
-    def search_recent(days: int = 90, limit: int = 9) -> str:
-        """
-        Search for recent posts within the last N days.
-        Args:
-            days: Number of days to look back
-            limit: Maximum number of results
-        Returns:
-            SQL query string
-        """
-        return f"""
-SELECT DISTINCT
-    p.id,
-    p.title,
-    p.source_url,
-    p.author,
-    p.published_date,
-    p.image_url,
-    p.type,
-    pr.name as provider_name
-FROM posts p
-LEFT JOIN providers pr ON p.provider_id = pr.id
-WHERE
-    p.published_date >= CURRENT_DATE - INTERVAL '{days} days'
-ORDER BY p.published_date DESC
-LIMIT {limit};
-        """
-    @staticmethod
-    def search_by_type(post_type: str, limit: int = 9) -> str:
-        """
-        Search by post type (spotlight, insight, resource).
-        Args:
-            post_type: Type of post (spotlight, insight, resource)
-            limit: Maximum number of results
-        Returns:
-            SQL query string
-        """
-        return f"""
-SELECT DISTINCT
-    p.id,
-    p.title,
-    p.source_url,
-    p.author,
-    p.published_date,
-    p.image_url,
-    p.type,
-    pr.name as provider_name
-FROM posts p
-LEFT JOIN providers pr ON p.provider_id = pr.id
-WHERE p.type = '{post_type}'
-ORDER BY p.published_date DESC NULLS LAST
-LIMIT {limit};
-        """
-def generate_query(search_type: str, **kwargs) -> str:
-    """
-    Generate SQL query based on search type.
-    Args:
-        search_type: Type of search (keyword, tag, hybrid, author, recent, type)
-        **kwargs: Parameters for the specific search type
-    Returns:
-        SQL query string
-    """
-    functions = {
-        "keyword": QueryFunctions.keyword_search,
-        "tag": QueryFunctions.tag_search,
-        "hybrid": QueryFunctions.hybrid_search,
-        "author": QueryFunctions.search_by_author,
-        "recent": QueryFunctions.search_recent,
-        "type": QueryFunctions.search_by_type,
-    }
-    if search_type not in functions:
-        raise ValueError(f"Unknown search type: {search_type}")
-    return functions[search_type](**kwargs)
-# Example usage
-if __name__ == "__main__":
-    # Test keyword search
-    print("=== KEYWORD SEARCH ===")
-    print(QueryFunctions.keyword_search(["F1", "racing"]))
-    print("\n=== TAG SEARCH ===")
-    print(QueryFunctions.tag_search(["dataviz", "interactive"]))
-    print("\n=== HYBRID SEARCH ===")
-    print(QueryFunctions.hybrid_search(
-        keywords=["visualization"],
-        tags=["dataviz", "interactive"]
-    ))
-    print("\n=== AUTHOR SEARCH ===")
-    print(QueryFunctions.search_by_author("New York Times"))
-    print("\n=== RECENT POSTS ===")
-    print(QueryFunctions.search_recent(days=30))