Spaces:

tomvaillant
/

graphics-llm

Running

App Files Community

remdms committed on Nov 9

Commit

e6c7182

1 Parent(s): 9db289b

Add Vanna

Browse files

Files changed (1) hide show

src/vanna.py +76 -132

src/vanna.py CHANGED Viewed

@@ -14,8 +14,9 @@ from vanna.core.system_prompt import SystemPromptBuilder
 from vanna.core.registry import ToolSchema
 from datetime import datetime
 class CustomSQLSystemPromptBuilder(SystemPromptBuilder):
-    """System prompt builder complet pour SQL assistant Vanna v2."""
     VERSION = "2.2.0"
@@ -33,50 +34,50 @@ class CustomSQLSystemPromptBuilder(SystemPromptBuilder):
         username = getattr(user, "username", user.id)
         # ======================
-        # BASE DU PROMPT
         # ======================
         prompt = f"[System Prompt v{self.VERSION}]\n\n"
-        prompt += f"Vous êtes un assistant SQL expert pour l'entreprise {self.company_name}.\n"
-        prompt += f"Date : {today}\nUtilisateur : {username}\nGroupes : {', '.join(user.group_memberships)}\n\n"
         prompt += (
-            "Votre rôle : générer des requêtes SQL correctes et efficaces à partir du langage naturel.\n"
-            "Vous répondez toujours au format CSV brut, sans explication ni texte additionnel.\n"
-            "Vous avez accès à toutes les tables et relations décrites dans le schéma.\n"
         )
         # ======================
-        # DIRECTIVES SQL
         # ======================
         prompt += (
-            "\n## Directives SQL\n"
-            "- Toujours utiliser des alias de table dans les JOINs\n"
-            "- Ne jamais utiliser SELECT *\n"
-            "- Préférer les fonctions fenêtres plutôt que les sous-requêtes quand possible\n"
-            "- Toujours inclure un LIMIT pour les requêtes exploratoires\n"
-            "- Exclure les posts dont le provider = 'SND'\n"
-            "- Exclure les posts dont le type = 'resource'\n"
-            "- Exclure les posts dont le type = 'insight'\n"
-            "- Formater les dates et nombres pour la lisibilité\n"
         )
         # ======================
-        # SCHÉMA DE LA BASE
         # ======================
         if context and "database_schema" in context:
-            prompt += "\n## Schéma de la base de données\n"
             prompt += context["database_schema"]
         else:
             prompt += (
-                "\n## Schéma de la base de données\n"
-                "Tables :\n"
                 "- posts (id, title, source_url, author, published_date, image_url, type, provider_id, created_at, updated_at)\n"
                 "- providers (id, name)\n"
                 "- provider_attributes (id, provider_id, type, name)\n"
                 "- post_provider_attributes (post_id, attribute_id)\n"
                 "- tags (id, name)\n"
                 "- post_tags (post_id, tag_id, weight)\n"
-                "\nRelations :\n"
                 "  - posts.provider_id → providers.id\n"
                 "  - post_provider_attributes.post_id → posts.id\n"
                 "  - post_provider_attributes.attribute_id → provider_attributes.id\n"
@@ -86,132 +87,90 @@ class CustomSQLSystemPromptBuilder(SystemPromptBuilder):
             )
         # ======================
-        # INFORMATIONS SÉMANTIQUES
-        # ======================
-        prompt += (
-            "\n## Informations sémantiques\n"
-            "- `posts.title` : titre du contenu (souvent descriptif, peut contenir des mots-clés thématiques).\n"
-            "- `posts.source_url` : lien externe vers la ressource ou article.\n"
-            "- `posts.author` : nom du journaliste, du média, ou de l’organisation (ex: \"The New York Times\").\n"
-            "- `posts.published_date` : date de publication du post.\n"
-            "- `posts.type` : type du contenu, de type ENUM ('spotlight', 'resource', 'insight').\n"
-            "- `posts.provider_id` : identifiant de la source (provider) ayant publié le contenu.\n"
-            "- `providers.name` : nom de l’organisation source (ex: 'Nuanced', 'SND').\n"
-            "- `provider_attributes.type` : type d’attribut du provider (ENUM : 'award', 'category').\n"
-            "- `provider_attributes.name` : valeur de l’attribut (ex: 'Best Design', 'Investigation').\n"
-            "- `tags.name` : thématique ou mot-clé associé au post (ex: '3D', 'AI', 'Investigation').\n"
-            "- `post_tags.weight` : poids d’association entre un post et un tag (pertinence).\n"
-        )
-        # ======================
-        # LOGIQUE SÉMANTIQUE MÉTIER
         # ======================
         prompt += (
-            "\n## Logique sémantique métier\n"
-            "- Les providers nommés 'SND' représentent des contenus internes à exclure systématiquement.\n"
-            "- Une recherche mentionnant une organisation (ex: 'New York Times') doit interroger à la fois `posts.author` et `providers.name`.\n"
-            "- Par défaut, les posts à renvoyer sont uniquement ceux de type `spotlight`.\n"
-            "- Les posts de type `resource` ou `insight` sont exclus de toutes les requêtes, sauf si l’utilisateur demande explicitement des 'resources'.\n"
-            "- Les attributs de type 'award' indiquent des récompenses reçues par le provider.\n"
-            "- Les attributs de type 'category' définissent le domaine éditorial (ex: 'Tech', 'Investigation').\n"
-            "- Les tags permettent de relier les posts à des thématiques ou disciplines précises.\n"
-            "- Un même post peut avoir plusieurs tags, awards ou catégories.\n"
-            "- Les requêtes doivent agréger ou joindre les tables en fonction du besoin utilisateur :\n"
-            "   * Par tag : via `post_tags` et `tags`\n"
-            "   * Par provider : via `posts.provider_id` → `providers.id`\n"
-            "   * Par award/category : via `post_provider_attributes` et `provider_attributes`\n"
-            "- Si l’utilisateur parle de “posts récents”, filtrer sur `published_date >= CURRENT_DATE - INTERVAL '90 days'`.\n"
-            "- Lorsqu'une recherche mentionne un sujet (ex: '3D', 'design', 'AI'), cela correspond à un ou plusieurs `tags.name`.\n"
-            "- Lorsqu'une recherche mentionne un auteur ou une organisation, chercher dans `author` et `provider.name`.\n"
-            "- Si l'utilisateur mentionne une année (ex: \"en 2021\"), filtrer avec EXTRACT(YEAR FROM published_date) = 2021.\n"
-            "- Si l'utilisateur mentionne un mois et une année (ex: \"en mai 2021\"), filtrer avec EXTRACT(MONTH FROM published_date) = 5 ET EXTRACT(YEAR FROM published_date) = 2021.\n"
-            "- Si l'utilisateur dit \"récemment\" ou \"dernièrement\", sélectionner les posts des 90 derniers jours.\n"
-            "- Si l'utilisateur dit \"cette année\", filtrer avec EXTRACT(YEAR FROM published_date) = EXTRACT(YEAR FROM CURRENT_DATE).\n"
-            "- Ne jamais comparer directement published_date à une chaîne comme '2021' ou 'mai 2021'.\n"
-            "- Toujours limiter les résultats à 9 lignes maximum pour les requêtes exploratoires.\n"
         )
         # ======================
-        # DÉFINITIONS MÉTIER
         # ======================
         prompt += (
-            "\n## Définitions métier\n"
-            "- **Post** : post mettant en avant un sujet ou projet particulier.\n"
-            "- **Spotlight** : post mettant en avant un sujet ou projet particulier.\n"
-            "- **Provider** : entité éditoriale (média, organisation, auteur collectif) responsable du contenu.\n"
-            "- **Tag** : thématique, domaine ou mot-clé associé à un post.\n"
-            "- **Provider Attribute** : métadonnée du provider (peut être une récompense, une catégorie).\n"
-            "- **Award** : distinction reçue par un post.\n"
-            "- **Category** : domaine éditorial du post.\n"
         )
         # ======================
-        # OUTILS DISPONIBLES
         # ======================
         if tool_schemas:
-            prompt += "\n## Outils disponibles\n"
             for tool in tool_schemas:
-                prompt += f"- {tool.name}: {getattr(tool, 'description', 'Pas de description')}\n"
-                prompt += f"  Paramètres: {getattr(tool, 'parameters', 'N/A')}\n"
         # ======================
-        # MÉMOIRE & CONTEXTE
         # ======================
         tool_names = [t.name for t in tool_schemas]
         has_search = "search_saved_correct_tool_uses" in tool_names
         has_save = "save_question_tool_args" in tool_names
-        has_text_memory = "save_text_memory" in tool_names
-        if has_search or has_save or has_text_memory:
-            prompt += "\n## Système mémoire\n"
         if has_search or has_save:
-            prompt += "\n• Workflow mémoire des outils :\n"
             if has_search:
-                prompt += "  - Avant exécution, utilisez search_saved_correct_tool_uses pour détecter les patterns existants.\n"
             if has_save:
-                prompt += "  - Après succès, utilisez save_question_tool_args pour enregistrer la correspondance.\n"
-        if has_text_memory:
-            prompt += "\n• Mémoire textuelle :\n"
-            prompt += "  - Conservez les schémas, terminologies métier, patterns SQL et préférences utilisateur.\n"
         # ======================
-        # EXEMPLES D’INTERACTIONS
         # ======================
         prompt += (
-            "\n## Exemples d'interactions\n"
-            "Utilisateur : \"Montre-moi les posts liés à la 3D\"\n"
-            "Assistant : [call run_sql with \"SELECT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
             "FROM posts p "
             "JOIN post_tags pt ON p.id = pt.post_id "
             "JOIN tags t ON pt.tag_id = t.id "
             "JOIN providers pr ON p.provider_id = pr.id "
-            "WHERE t.name ILIKE '%3D%' AND pr.name != 'SND'  AND p.type = 'spotlight' "
             "LIMIT 9;\"]\n"
-            "Résultat : \"id,title,source_url,author,published_date,image_url,type\"\n"
-            "\nUtilisateur : \"Montre-moi les posts du New York Times\"\n"
-            "Assistant : [call run_sql with \"SELECT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
             "FROM posts p "
             "LEFT JOIN providers pr ON pr.id = p.provider_id "
             "WHERE LOWER(p.author) LIKE '%new york times%' OR LOWER(pr.name) LIKE '%new york times%' "
-            "AND pr.name != 'SND'  AND p.type = 'spotlight' "
             "LIMIT 9;\"]\n"
-            "Résultat : \"id,title,source_url,author,published_date,image_url\"\n"
         )
         # ======================
-        # INSTRUCTIONS FINALES
         # ======================
         prompt += (
-            "\nIMPORTANT :\n"
-            "- Toujours exclure les posts dont provider = 'SND'.\n"
-            "- Toujours exclure les posts dont type = 'resource'.\n"
-            "- Toujours exclure les posts dont type = 'insight'.\n"
-            "- Toujours renvoyer uniquement le résultat brut CSV, sans texte ni commentaires.\n"
-            "- Ne pas inclure de JSON, d’analyse, ni de messages explicatifs.\n"
-            "- Ignorer les itérations supplémentaires ou réflexions internes.\n"
-            "- Une fois le résultat obtenu, arrêtez l’exécution du tool.\n"
         )
         return prompt
@@ -232,30 +191,22 @@ class VannaComponent:
         hf_provider: str,
         connection_string: str,
     ):
-        # Configure LLM
         llm = VannaHuggingFaceLlmService(model=hf_model, token=hf_token, provider=hf_provider)
-        # Configure database tool
         self.sql_runner = PostgresRunner(connection_string=connection_string)
-        db_tool = RunSqlTool(
-            sql_runner=self.sql_runner,
-        )
-        # Configure agent memory
         agent_memory = DemoAgentMemory(max_items=1000)
         save_memory_tool = SaveQuestionToolArgsTool(agent_memory)
         search_memory_tool = SearchSavedCorrectToolUsesTool(agent_memory)
-        # Configure user resolver
         self.user_resolver = SimpleUserResolver()
-        # Register tools with access control
         tools = ToolRegistry()
         tools.register_local_tool(db_tool, access_groups=['admin', 'user'])
         tools.register_local_tool(save_memory_tool, access_groups=['admin'])
         tools.register_local_tool(search_memory_tool, access_groups=['admin', 'user'])
-        # Create the agent
         self.agent = Agent(
             llm_service=llm,
             tool_registry=tools,
@@ -266,14 +217,12 @@ class VannaComponent:
     async def ask(self, prompt_for_llm: str):
         ctx = RequestContext()
-        print(f"🙋 Prompt envoyé au LLM : {prompt_for_llm}")
         final_text = ""
         seen_texts = set()
-        # 🔁 Boucle sur les composants produits par l'agent
         async for component in self.agent.send_message(request_context=ctx, message=prompt_for_llm):
-            # Texte simple produit par l'agent
             simple = getattr(component, "simple_component", None)
             text = getattr(simple, "text", "") if simple else ""
             if text and text not in seen_texts:
@@ -281,22 +230,18 @@ class VannaComponent:
                 final_text += text + "\n"
                 seen_texts.add(text)
-            # Requête SQL générée (si présente)
             sql_query = getattr(component, "sql", None)
             if sql_query:
-                print(f"🧾 Requête SQL générée : {sql_query}")
-            # Métadonnées et autres infos associées au composant
             metadata = getattr(component, "metadata", None)
             if metadata:
-                print(f"📋 Métadonnées composant : {metadata}")
-            # Type de composant utile pour debug
             component_type = getattr(component, "type", None)
             if component_type:
-                print(f"🔖 Type composant : {component_type}")
             match = re.search(r"query_results_[\w-]+\.csv", final_text)
             if match:
                 filename = match.group(0)
@@ -304,13 +249,12 @@ class VannaComponent:
                 full_path = os.path.join(folder, filename)
                 if os.path.exists(full_path):
-                    print(f"📂 Lecture directe du fichier CSV : {full_path}")
                     with open(full_path, "r", encoding="utf-8") as f:
                         csv_data = f.read().strip()
-                    print("🤖 Réponse envoyée à l'user (depuis fichier) :", csv_data[:300])
                     return csv_data
                 else:
-                    print(f"⚠️ Fichier non trouvé : {full_path}")
         return final_text

 from vanna.core.registry import ToolSchema
 from datetime import datetime
 class CustomSQLSystemPromptBuilder(SystemPromptBuilder):
+    """Complete system prompt builder for Vanna SQL assistant v2."""
     VERSION = "2.2.0"
         username = getattr(user, "username", user.id)
         # ======================
+        # BASE PROMPT
         # ======================
         prompt = f"[System Prompt v{self.VERSION}]\n\n"
+        prompt += f"You are an expert SQL assistant for the company {self.company_name}.\n"
+        prompt += f"Date: {today}\nUser: {username}\nGroups: {', '.join(user.group_memberships)}\n\n"
         prompt += (
+            "Your role: generate correct and efficient SQL queries from natural language.\n"
+            "You always respond in **raw CSV format**, with no explanation or extra text.\n"
+            "You have full access to all tables and relationships described in the schema.\n"
         )
         # ======================
+        # SQL DIRECTIVES
         # ======================
         prompt += (
+            "\n## SQL Directives\n"
+            "- Always use table aliases in JOINs\n"
+            "- Never use SELECT *\n"
+            "- Prefer window functions over subqueries when possible\n"
+            "- Always include a LIMIT for exploratory queries\n"
+            "- Exclude posts where provider = 'SND'\n"
+            "- Exclude posts where type = 'resource'\n"
+            "- Exclude posts where type = 'insight'\n"
+            "- Format dates and numbers for readability\n"
         )
         # ======================
+        # DATABASE SCHEMA
         # ======================
         if context and "database_schema" in context:
+            prompt += "\n## Database Schema\n"
             prompt += context["database_schema"]
         else:
             prompt += (
+                "\n## Database Schema\n"
+                "Tables:\n"
                 "- posts (id, title, source_url, author, published_date, image_url, type, provider_id, created_at, updated_at)\n"
                 "- providers (id, name)\n"
                 "- provider_attributes (id, provider_id, type, name)\n"
                 "- post_provider_attributes (post_id, attribute_id)\n"
                 "- tags (id, name)\n"
                 "- post_tags (post_id, tag_id, weight)\n"
+                "\nRelationships:\n"
                 "  - posts.provider_id → providers.id\n"
                 "  - post_provider_attributes.post_id → posts.id\n"
                 "  - post_provider_attributes.attribute_id → provider_attributes.id\n"
             )
         # ======================
+        # SEMANTIC INFORMATION
         # ======================
         prompt += (
+            "\n## Semantic Information\n"
+            "- `posts.title`: title of the content (often descriptive, may contain keywords).\n"
+            "- `posts.source_url`: external link to the article or resource.\n"
+            "- `posts.author`: author, journalist, or organization name (e.g., 'The New York Times').\n"
+            "- `posts.published_date`: publication date.\n"
+            "- `posts.type`: content type ENUM ('spotlight', 'resource', 'insight').\n"
+            "- `providers.name`: name of the publishing organization (e.g., 'Nuanced', 'SND').\n"
+            "- `tags.name`: thematic keyword or topic (e.g., '3D', 'AI', 'Design').\n"
+            "- `post_tags.weight`: relevance score between a post and a tag.\n"
         )
         # ======================
+        # BUSINESS LOGIC
         # ======================
         prompt += (
+            "\n## Business Logic\n"
+            "- Providers named 'SND' must always be excluded.\n"
+            "- A query mentioning an organization (e.g., 'New York Times') should search both `posts.author` and `providers.name`.\n"
+            "- By default, only posts with `type = 'spotlight'` are returned.\n"
+            "- Posts of type `resource` or `insight` are excluded unless explicitly requested.\n"
+            "- Tags link posts to specific themes or disciplines.\n"
+            "- A single post may have multiple tags, awards, or categories.\n"
+            "- If the user mentions a year (e.g., 'in 2021'), filter with `EXTRACT(YEAR FROM published_date) = 2021`.\n"
+            "- If the user says 'recently', filter posts from the last 90 days.\n"
+            "- Always limit exploratory results to 9 rows.\n"
         )
         # ======================
+        # AVAILABLE TOOLS
         # ======================
         if tool_schemas:
+            prompt += "\n## Available Tools\n"
             for tool in tool_schemas:
+                prompt += f"- {tool.name}: {getattr(tool, 'description', 'No description')}\n"
+                prompt += f"  Parameters: {getattr(tool, 'parameters', 'N/A')}\n"
         # ======================
+        # MEMORY SYSTEM
         # ======================
         tool_names = [t.name for t in tool_schemas]
         has_search = "search_saved_correct_tool_uses" in tool_names
         has_save = "save_question_tool_args" in tool_names
         if has_search or has_save:
+            prompt += "\n## Memory System\n"
             if has_search:
+                prompt += "- Use `search_saved_correct_tool_uses` to detect past patterns.\n"
             if has_save:
+                prompt += "- Use `save_question_tool_args` to store successful pairs.\n"
         # ======================
+        # EXAMPLES
         # ======================
         prompt += (
+            "\n## Example Interactions\n"
+            "User: 'Show me posts related to 3D'\n"
+            "Assistant: [call run_sql with \"SELECT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
             "FROM posts p "
             "JOIN post_tags pt ON p.id = pt.post_id "
             "JOIN tags t ON pt.tag_id = t.id "
             "JOIN providers pr ON p.provider_id = pr.id "
+            "WHERE t.name ILIKE '%3D%' AND pr.name != 'SND' AND p.type = 'spotlight' "
             "LIMIT 9;\"]\n"
+            "\nUser: 'Show me posts from The New York Times'\n"
+            "Assistant: [call run_sql with \"SELECT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
             "FROM posts p "
             "LEFT JOIN providers pr ON pr.id = p.provider_id "
             "WHERE LOWER(p.author) LIKE '%new york times%' OR LOWER(pr.name) LIKE '%new york times%' "
+            "AND pr.name != 'SND' AND p.type = 'spotlight' "
             "LIMIT 9;\"]\n"
         )
         # ======================
+        # FINAL INSTRUCTIONS
         # ======================
         prompt += (
+            "\nIMPORTANT:\n"
+            "- Always exclude posts with provider = 'SND'.\n"
+            "- Always exclude posts with type = 'resource' or 'insight'.\n"
+            "- Always return **only the raw CSV result** — no explanations, no JSON, no commentary.\n"
+            "- Stop tool execution once the query result is obtained.\n"
         )
         return prompt
         hf_provider: str,
         connection_string: str,
     ):
         llm = VannaHuggingFaceLlmService(model=hf_model, token=hf_token, provider=hf_provider)
         self.sql_runner = PostgresRunner(connection_string=connection_string)
+        db_tool = RunSqlTool(sql_runner=self.sql_runner)
         agent_memory = DemoAgentMemory(max_items=1000)
         save_memory_tool = SaveQuestionToolArgsTool(agent_memory)
         search_memory_tool = SearchSavedCorrectToolUsesTool(agent_memory)
         self.user_resolver = SimpleUserResolver()
         tools = ToolRegistry()
         tools.register_local_tool(db_tool, access_groups=['admin', 'user'])
         tools.register_local_tool(save_memory_tool, access_groups=['admin'])
         tools.register_local_tool(search_memory_tool, access_groups=['admin', 'user'])
         self.agent = Agent(
             llm_service=llm,
             tool_registry=tools,
     async def ask(self, prompt_for_llm: str):
         ctx = RequestContext()
+        print(f"🙋 Prompt sent to LLM: {prompt_for_llm}")
         final_text = ""
         seen_texts = set()
         async for component in self.agent.send_message(request_context=ctx, message=prompt_for_llm):
             simple = getattr(component, "simple_component", None)
             text = getattr(simple, "text", "") if simple else ""
             if text and text not in seen_texts:
                 final_text += text + "\n"
                 seen_texts.add(text)
             sql_query = getattr(component, "sql", None)
             if sql_query:
+                print(f"🧾 SQL Query Generated: {sql_query}")
             metadata = getattr(component, "metadata", None)
             if metadata:
+                print(f"📋 Metadata: {metadata}")
             component_type = getattr(component, "type", None)
             if component_type:
+                print(f"🔖 Component Type: {component_type}")
             match = re.search(r"query_results_[\w-]+\.csv", final_text)
             if match:
                 filename = match.group(0)
                 full_path = os.path.join(folder, filename)
                 if os.path.exists(full_path):
+                    print(f"📂 Reading result file: {full_path}")
                     with open(full_path, "r", encoding="utf-8") as f:
                         csv_data = f.read().strip()
+                    print("🤖 Response sent to user (from file):", csv_data[:300])
                     return csv_data
                 else:
+                    print(f"⚠️ File not found: {full_path}")
         return final_text