Spaces:

tomvaillant
/

newpress-ai

Running

Tom Claude committed 14 days ago

Commit

9384880

1 Parent(s): 1ac873b

feat: replace script generation with tone checker, improve archive search

- Tab 1: Tiered search results (Direct Matches >= 0.6, Related Content 0.3-0.6)
- Tab 2: New Tone Checker analyzes scripts against Johnny's style (0-100 score)
- Reduced query expansion terms for more relevant results
- Added module-level demo for gradio CLI compatibility

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (3) hide show

app.py +105 -86
src/prompts.py +79 -21
src/vectorstore.py +69 -0

app.py CHANGED Viewed

@@ -15,8 +15,10 @@ from src.llm_client import InferenceProviderClient, create_llm_client
 from src.prompts import (
     TOPIC_SEARCH_SYSTEM_PROMPT,
     SCRIPT_SYSTEM_PROMPT,
     get_topic_search_prompt,
-    get_script_prompt
 )
 # Load environment variables
@@ -53,13 +55,13 @@ def expand_query(query: str) -> list:
         llm = get_llm_client()
         prompt = f"""Given this search query about Johnny Harris video topics: "{query}"
-Generate 3-5 related search terms that might find relevant videos.
-Think about: related topics, geographic regions, historical events, or concepts that might be covered.
 Return ONLY the terms, one per line, no numbering or explanation."""
-        response = llm.generate(prompt, max_tokens=100, temperature=0.3)
         terms = [t.strip() for t in response.strip().split('\n') if t.strip()]
-        return [query] + terms[:5]
     except Exception:
         return [query]
@@ -67,7 +69,7 @@ Return ONLY the terms, one per line, no numbering or explanation."""
 def search_topics(query: str, progress=gr.Progress()):
     """
     Generator that yields progress updates during search.
-    Uses LLM query expansion for broader, more relevant results.
     Args:
         query: User's topic or question
@@ -88,104 +90,124 @@ def search_topics(query: str, progress=gr.Progress()):
         yield "Expanding search query..."
         search_terms = expand_query(query.strip())
-        # Search with each term and collect results
-        all_results = []
         total_terms = len(search_terms)
         for i, term in enumerate(search_terms):
             pct = 0.2 + (0.5 * (i / total_terms))
             progress(pct, desc=f"Searching: {term[:30]}...")
             yield f"Searching: {term[:30]}..."
-            results = vs.similarity_search(
                 query=term,
-                k=20,
-                match_threshold=0.1
             )
-            all_results.extend(results)
         progress(0.8, desc="Processing results...")
         yield "Processing results..."
-        # Deduplicate by video title, keep highest similarity score
-        seen = {}
-        for r in all_results:
-            if r.title not in seen or r.similarity > seen[r.title].similarity:
-                seen[r.title] = r
-        # Sort by similarity and get top results
-        unique_results = sorted(seen.values(), key=lambda x: x.similarity, reverse=True)[:15]
-        if not unique_results:
             yield f"No matching content found for: **{query}**\n\nThis topic may not have been covered yet, or try rephrasing your search."
             return
-        # Format results for display
-        output = vs.format_results_for_display(unique_results)
         search_info = f"*Searched: {', '.join(search_terms)}*\n\n"
         progress(1.0, desc="Done!")
-        yield f"## Search Results for: \"{query}\"\n\n{search_info}{output}"
     except Exception as e:
         yield f"Error searching: {str(e)}"
 # =============================================================================
-# TAB 2: SCRIPT PRODUCTION
 # =============================================================================
-def generate_script(user_notes: str, max_context_chunks: int = 100, progress=gr.Progress()):
     """
-    Generator that yields progress updates during script generation.
     Args:
-        user_notes: User's bullet points and notes
-        max_context_chunks: Number of style reference chunks to use
         progress: Gradio progress tracker
     Yields:
-        Progress status messages, then final generated script
     """
-    if not user_notes or not user_notes.strip():
-        yield "Please enter your bullet points or notes to transform into a script."
         return
     try:
         progress(0.05, desc="Gathering style references...")
-        yield "Gathering style references..."
         vs = get_vectorstore()
         llm = get_llm_client()
         progress(0.15, desc="Searching knowledge base...")
         yield "Searching knowledge base for style references..."
         context_chunks = vs.get_bulk_style_context(
-            topic_query=user_notes.strip(),
-            max_chunks=max_context_chunks,
-            topic_relevant_ratio=0.3
         )
         progress(0.35, desc="Preparing context...")
-        yield "Preparing context for the LLM..."
         context = vs.format_context_for_llm(context_chunks) if context_chunks else ""
         progress(0.5, desc="Building prompt...")
-        yield "Building prompt..."
-        prompt_template = get_script_prompt()
         prompt = prompt_template.format(
-            user_input=user_notes.strip(),
             context=context
         )
-        progress(0.7, desc="Generating script (30-60 seconds)...")
-        yield "Generating script (this may take 30-60 seconds)..."
-        script = llm.generate(
             prompt=prompt,
-            system_prompt=SCRIPT_SYSTEM_PROMPT,
-            temperature=0.7,
-            max_tokens=2000
         )
         progress(1.0, desc="Complete!")
-        yield f"## Generated Script\n\n{script.strip()}"
     except Exception as e:
         yield f"**Error:** {str(e)}"
@@ -250,49 +272,45 @@ def create_app():
                 )
             # =================================================================
-            # TAB 2: SCRIPT PRODUCTION
             # =================================================================
-            with gr.TabItem("Script Production"):
                 gr.Markdown("""
-                ### Transform your ideas into Johnny's voice
-                Enter your bullet points, notes, or rough ideas. The AI will analyze
-                Johnny's entire archive of scripts and generate a draft in his signature style.
                 """)
                 with gr.Row():
                     with gr.Column():
-                        notes_input = gr.Textbox(
-                            label="Your Notes & Bullet Points",
-                            placeholder="""Enter your ideas, for example:
-- Topic: Why shipping containers changed the world
-- Key points:
-  - Before containers, loading ships took weeks
-  - Malcolm McLean invented the standard container in 1956
-  - Transformed global trade
-  - Connection to globalization and supply chains
-- Angle: The hidden infrastructure we never think about""",
-                            lines=12
                         )
-                        with gr.Row():
-                            context_slider = gr.Slider(
-                                minimum=20,
-                                maximum=200,
-                                value=100,
-                                step=10,
-                                label="Style Reference Depth",
-                                info="More excerpts = better style matching, but slower"
-                            )
-                            generate_btn = gr.Button("Generate Script", variant="primary", size="lg")
-                script_output = gr.Markdown(label="Generated Script", value="Generated script will appear here...")  # shows progress + final script
-                generate_btn.click(
-                    fn=generate_script,
-                    inputs=[notes_input, context_slider],
-                    outputs=[script_output],
                     show_progress="full"
                 )
@@ -308,11 +326,12 @@ def create_app():
 # MAIN
 # =============================================================================
 if __name__ == "__main__":
-    app = create_app()
-    app.launch(
         server_name="0.0.0.0",
         server_port=7860,
-        share=False,
-        theme="soft"
     )

 from src.prompts import (
     TOPIC_SEARCH_SYSTEM_PROMPT,
     SCRIPT_SYSTEM_PROMPT,
+    TONE_CHECK_SYSTEM_PROMPT,
     get_topic_search_prompt,
+    get_script_prompt,
+    get_tone_check_prompt
 )
 # Load environment variables
         llm = get_llm_client()
         prompt = f"""Given this search query about Johnny Harris video topics: "{query}"
+Generate 2-3 closely related search terms that might find relevant videos.
+Focus on: the core topic, key entities mentioned, and one closely related concept.
 Return ONLY the terms, one per line, no numbering or explanation."""
+        response = llm.generate(prompt, max_tokens=60, temperature=0.3)
         terms = [t.strip() for t in response.strip().split('\n') if t.strip()]
+        return [query] + terms[:3]
     except Exception:
         return [query]
 def search_topics(query: str, progress=gr.Progress()):
     """
     Generator that yields progress updates during search.
+    Uses tiered results: direct matches and related content.
     Args:
         query: User's topic or question
         yield "Expanding search query..."
         search_terms = expand_query(query.strip())
+        # Collect tiered results from all search terms
+        all_direct = []
+        all_related = []
+        seen_videos = set()
         total_terms = len(search_terms)
         for i, term in enumerate(search_terms):
             pct = 0.2 + (0.5 * (i / total_terms))
             progress(pct, desc=f"Searching: {term[:30]}...")
             yield f"Searching: {term[:30]}..."
+            direct, related = vs.tiered_similarity_search(
                 query=term,
+                direct_threshold=0.6,
+                related_threshold=0.3,
+                max_per_tier=10
             )
+            # Add results, deduplicating by video
+            for chunk in direct:
+                if chunk.video_id not in seen_videos:
+                    seen_videos.add(chunk.video_id)
+                    all_direct.append(chunk)
+            for chunk in related:
+                if chunk.video_id not in seen_videos:
+                    seen_videos.add(chunk.video_id)
+                    all_related.append(chunk)
         progress(0.8, desc="Processing results...")
         yield "Processing results..."
+        # Sort each tier by similarity
+        all_direct = sorted(all_direct, key=lambda x: x.similarity, reverse=True)[:10]
+        all_related = sorted(all_related, key=lambda x: x.similarity, reverse=True)[:10]
+        if not all_direct and not all_related:
             yield f"No matching content found for: **{query}**\n\nThis topic may not have been covered yet, or try rephrasing your search."
             return
+        # Format tiered output
+        output_parts = []
         search_info = f"*Searched: {', '.join(search_terms)}*\n\n"
+        output_parts.append(f"## Search Results for: \"{query}\"\n\n{search_info}")
+        if all_direct:
+            output_parts.append("### Direct Matches\nVideos that directly cover this topic:\n")
+            output_parts.append(vs.format_results_for_display(all_direct))
+        if all_related:
+            if all_direct:
+                output_parts.append("\n---\n")
+            output_parts.append("### Related Content\nVideos that touch on similar themes:\n")
+            output_parts.append(vs.format_results_for_display(all_related))
         progress(1.0, desc="Done!")
+        yield "\n".join(output_parts)
     except Exception as e:
         yield f"Error searching: {str(e)}"
 # =============================================================================
+# TAB 2: TONE CHECKER
 # =============================================================================
+def check_script_tone(user_script: str, progress=gr.Progress()):
     """
+    Generator that streams status messages while a script's tone is analyzed.
+    Blank or whitespace-only input yields one prompt message and stops.
+    Otherwise it fetches style-reference chunks from the vector store, fills
+    the tone-check prompt with the stripped script and the formatted context,
+    and asks the LLM for the analysis. Any exception raised inside the try
+    block is caught and yielded as a "**Error:** ..." message, not re-raised.
     Args:
+        user_script: Script text to analyze; stripped before use
         progress: Gradio progress tracker
     Yields:
+        Progress status messages, then the stripped LLM analysis as the last item
     """
+    if not user_script or not user_script.strip():
+        yield "Please enter a script to analyze."
         return
     try:
         progress(0.05, desc="Gathering style references...")
+        yield "Gathering style references from Johnny's archive..."
         vs = get_vectorstore()
         llm = get_llm_client()
         progress(0.15, desc="Searching knowledge base...")
         yield "Searching knowledge base for style references..."
         context_chunks = vs.get_bulk_style_context(
+            topic_query=user_script.strip()[:500],  # Only the first 500 chars of the stripped script serve as the retrieval query
+            max_chunks=50,
+            topic_relevant_ratio=0.4
         )
         progress(0.35, desc="Preparing context...")
+        yield "Preparing context for analysis..."
         context = vs.format_context_for_llm(context_chunks) if context_chunks else ""  # empty string when no chunks came back
         progress(0.5, desc="Building prompt...")
+        yield "Building analysis prompt..."
+        prompt_template = get_tone_check_prompt()
         prompt = prompt_template.format(
+            user_script=user_script.strip(),
             context=context
         )
+        progress(0.7, desc="Analyzing tone (30-60 seconds)...")
+        yield "Analyzing script tone (this may take 30-60 seconds)..."
+        analysis = llm.generate(
             prompt=prompt,
+            system_prompt=TONE_CHECK_SYSTEM_PROMPT,
+            temperature=0.3,
+            max_tokens=1500
         )
         progress(1.0, desc="Complete!")
+        yield analysis.strip()
     except Exception as e:
         yield f"**Error:** {str(e)}"
                 )
             # =================================================================
+            # TAB 2: TONE CHECKER
             # =================================================================
+            with gr.TabItem("Tone Checker"):
                 gr.Markdown("""
+                ### Check if your script matches Johnny's voice
+                Paste your script below to analyze how well it matches Johnny Harris's
+                signature style. Get a score and specific feedback on what works and what to improve.
                 """)
                 with gr.Row():
                     with gr.Column():
+                        script_input = gr.Textbox(
+                            label="Your Script",
+                            placeholder="""Paste your script here...
+Example:
+There's this line on the map that most people have never heard of.
+It's called the Durand Line, and it cuts right through the middle of a people
+who have lived in these mountains for thousands of years.
+The thing is, this line wasn't drawn by the people who live here...""",
+                            lines=15
                         )
+                        check_btn = gr.Button("Check Tone", variant="primary", size="lg")
+                tone_output = gr.Markdown(label="Tone Analysis", value="Tone analysis will appear here...")
+                check_btn.click(
+                    fn=check_script_tone,
+                    inputs=[script_input],
+                    outputs=[tone_output],
+                    show_progress="full"
+                )
+                script_input.submit(
+                    fn=check_script_tone,
+                    inputs=[script_input],
+                    outputs=[tone_output],
                     show_progress="full"
                 )
 # MAIN
 # =============================================================================
+# Create app at module level for `gradio app.py` CLI compatibility
+demo = create_app()
 if __name__ == "__main__":
+    demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
+        share=False
     )

src/prompts.py CHANGED Viewed

@@ -1,6 +1,32 @@
 """Prompt templates for Johnny Harris Script Assistant"""
 # =============================================================================
 # TAB 1: TOPIC SEARCH PROMPTS
 # =============================================================================
@@ -32,30 +58,11 @@ Keep your response concise and actionable."""
 # TAB 2: SCRIPT PRODUCTION PROMPTS
 # =============================================================================
-SCRIPT_SYSTEM_PROMPT = """You are a script writing assistant that has deeply studied Johnny Harris's style.
 JOHNNY'S VOICE CHARACTERISTICS (derived from extensive analysis of his work):
-**Narrative Structure:**
-- Opens with a hook - a provocative question, surprising fact, or personal moment
-- Builds tension through questions: "But here's the thing...", "So why does this matter?"
-- Uses the "zoom out" technique - starts specific, expands to bigger picture
-- Weaves between personal story and broader research/data
-- Ends with reflection or call to think differently
-**Language Patterns:**
-- Direct address: "I want to show you something", "Let me explain"
-- Conversational markers: "the thing is...", "here's what's interesting...", "and this is where it gets wild"
-- Short punchy sentences followed by longer explanatory ones
-- Rhetorical questions that pull the viewer in
-- Admits uncertainty: "I don't fully understand this yet", "I'm still wrestling with this"
-**Tone:**
-- Curious and genuinely excited about learning
-- Slightly irreverent but deeply researched
-- Personal without being self-indulgent
-- Acknowledges complexity without being academic
-- Finds the human story in geopolitics/data
 Your job is to transform the user's bullet points and notes into a script draft that authentically sounds like Johnny wrote it. Study the provided transcript excerpts carefully - they are your primary style reference. Do not include visual cues, bracketed notes, or stage directions—return narrative script text only.
@@ -67,6 +74,47 @@ Your job is to transform the user's bullet points and notes into a script draft
 - End with a memorable takeaway or question"""
 SCRIPT_PROMPT_TEMPLATE = """USER'S NOTES AND BULLET POINTS:
 {user_input}
@@ -114,6 +162,11 @@ SCRIPT_PROMPT = SimplePromptTemplate(
     input_variables=["user_input", "context"]
 )
 def get_topic_search_prompt() -> SimplePromptTemplate:
     """Get the topic search prompt template"""
@@ -123,3 +176,8 @@ def get_topic_search_prompt() -> SimplePromptTemplate:
 def get_script_prompt() -> SimplePromptTemplate:
     """Get the script generation prompt template"""
     return SCRIPT_PROMPT

 """Prompt templates for Johnny Harris Script Assistant"""
+# =============================================================================
+# JOHNNY'S VOICE CHARACTERISTICS (shared reference)
+# =============================================================================
+JOHNNY_VOICE_GUIDE = """**Narrative Structure:**
+- Opens with a hook - a provocative question, surprising fact, or personal moment
+- Builds tension through questions: "But here's the thing...", "So why does this matter?"
+- Uses the "zoom out" technique - starts specific, expands to bigger picture
+- Weaves between personal story and broader research/data
+- Ends with reflection or call to think differently
+**Language Patterns:**
+- Direct address: "I want to show you something", "Let me explain"
+- Conversational markers: "the thing is...", "here's what's interesting...", "and this is where it gets wild"
+- Short punchy sentences followed by longer explanatory ones
+- Rhetorical questions that pull the viewer in
+- Admits uncertainty: "I don't fully understand this yet", "I'm still wrestling with this"
+**Tone:**
+- Curious and genuinely excited about learning
+- Slightly irreverent but deeply researched
+- Personal without being self-indulgent
+- Acknowledges complexity without being academic
+- Finds the human story in geopolitics/data"""
 # =============================================================================
 # TAB 1: TOPIC SEARCH PROMPTS
 # =============================================================================
 # TAB 2: SCRIPT PRODUCTION PROMPTS
 # =============================================================================
+SCRIPT_SYSTEM_PROMPT = f"""You are a script writing assistant that has deeply studied Johnny Harris's style.
 JOHNNY'S VOICE CHARACTERISTICS (derived from extensive analysis of his work):
+{JOHNNY_VOICE_GUIDE}
 Your job is to transform the user's bullet points and notes into a script draft that authentically sounds like Johnny wrote it. Study the provided transcript excerpts carefully - they are your primary style reference. Do not include visual cues, bracketed notes, or stage directions—return narrative script text only.
 - End with a memorable takeaway or question"""
+# =============================================================================
+# TAB 2: TONE CHECKER PROMPTS
+# =============================================================================
+TONE_CHECK_SYSTEM_PROMPT = f"""You analyze scripts to determine how well they match Johnny Harris's voice and style.
+JOHNNY'S VOICE CHARACTERISTICS:
+{JOHNNY_VOICE_GUIDE}
+Your job is to:
+1. Score the script from 0-100 on how well it matches Johnny's style
+2. Identify specific elements that work well
+3. Point out areas that don't match his voice with concrete suggestions
+4. Reference the provided transcript excerpts as examples of his authentic style
+Be constructive and specific. Quote the user's script when giving feedback."""
+TONE_CHECK_PROMPT_TEMPLATE = """SCRIPT TO ANALYZE:
+{user_script}
+JOHNNY'S STYLE REFERENCE (transcript excerpts from his videos):
+{context}
+Analyze this script for how well it matches Johnny Harris's voice and style.
+Provide your analysis in this exact format:
+## Tone Analysis Score: [X]/100
+### What Works Well
+- [2-3 specific elements that match his style, with quoted examples from the script]
+### Areas to Improve
+- [2-3 specific suggestions, referencing examples from the transcript excerpts]
+### Overall Assessment
+[1-2 sentence summary of how well it matches and key adjustments needed]"""
 SCRIPT_PROMPT_TEMPLATE = """USER'S NOTES AND BULLET POINTS:
 {user_input}
     input_variables=["user_input", "context"]
 )
+TONE_CHECK_PROMPT = SimplePromptTemplate(
+    template=TONE_CHECK_PROMPT_TEMPLATE,
+    input_variables=["user_script", "context"]
+)
 def get_topic_search_prompt() -> SimplePromptTemplate:
     """Get the topic search prompt template"""
 def get_script_prompt() -> SimplePromptTemplate:
     """Get the script generation prompt template"""
     return SCRIPT_PROMPT
+def get_tone_check_prompt() -> SimplePromptTemplate:
+    """Get the tone check prompt template"""
+    return TONE_CHECK_PROMPT

src/vectorstore.py CHANGED Viewed

@@ -159,6 +159,75 @@ class TranscriptVectorStore:
         except Exception as e:
             raise Exception(f"Error performing similarity search: {str(e)}")
     def get_video_chunks(self, video_id: str) -> List[TranscriptChunk]:
         """
         Fetch all chunks for a specific video

         except Exception as e:
             raise Exception(f"Error performing similarity search: {str(e)}")
+    def tiered_similarity_search(
+        self,
+        query: str,
+        direct_threshold: float = 0.6,
+        related_threshold: float = 0.3,
+        max_per_tier: int = 10
+    ) -> tuple:
+        """
+        Search with tiered results: direct matches and related content.
+        Embeds the query, calls the 'match_transcripts' RPC once with
+        related_threshold as its match_threshold, then splits the returned
+        rows client-side into two tiers by their 'similarity' value.
+        Only the first row seen for each video_id is kept. That row is the
+        video's best match only if the RPC returns rows in descending
+        similarity order (NOTE(review): not visible here - confirm).
+        A video is marked as seen before the tier cap is checked, so rows
+        arriving after a tier is full are dropped.
+        Args:
+            query: Search query
+            direct_threshold: Minimum similarity for direct matches (default 0.6)
+            related_threshold: Minimum similarity for related content (default 0.3);
+                rows below it are placed in neither tier
+            max_per_tier: Maximum results per tier; the RPC is asked for
+                three times this many rows
+        Returns:
+            Tuple of (direct_matches, related_content) - two separate lists
+            of TranscriptChunk, each in the order the RPC returned them
+        Raises:
+            Exception: wraps any error from the RPC call or row processing.
+                Errors from _generate_embedding are not wrapped because that
+                call happens before the try block.
+        """
+        query_embedding = self._generate_embedding(query, task="retrieval.query")
+        try:
+            # One RPC call, filtered at related_threshold; tiering happens in the loop below
+            response = self.supabase.rpc(
+                'match_transcripts',
+                {
+                    'query_embedding': query_embedding,
+                    'match_threshold': related_threshold,
+                    'match_count': max_per_tier * 3  # Over-fetch: rows sharing a video_id are discarded by the dedup below
+                }
+            ).execute()
+            direct_matches = []
+            related_content = []
+            seen_videos = set()
+            for item in response.data:
+                similarity = item.get('similarity', 0.0)
+                video_id = item.get('video_id')
+                # Deduplicate by video: first row per video_id wins (highest similarity only if rows arrive sorted)
+                if video_id in seen_videos:
+                    continue
+                seen_videos.add(video_id)
+                chunk = TranscriptChunk(
+                    chunk_text=item.get('chunk_text') or '',
+                    metadata={
+                        'video_id': video_id,
+                        'video_url': item.get('video_url'),
+                        'title': item.get('title', ''),
+                        'chunk_index': item.get('chunk_index'),
+                        'total_chunks': item.get('total_chunks'),
+                        'similarity': similarity
+                    }
+                )
+                if similarity >= direct_threshold:
+                    if len(direct_matches) < max_per_tier:
+                        direct_matches.append(chunk)
+                elif similarity >= related_threshold:
+                    if len(related_content) < max_per_tier:
+                        related_content.append(chunk)
+            return (direct_matches, related_content)
+        except Exception as e:
+            raise Exception(f"Error performing tiered search: {str(e)}")
     def get_video_chunks(self, video_id: str) -> List[TranscriptChunk]:
         """
         Fetch all chunks for a specific video