# Source: Hugging Face Space tomvaillant/newpress-ai (status: Running)
# File size: 11,719 bytes

"""
NewPress AI - Johnny Harris Script Assistant

A Gradio app that uses a Supabase vector database of Johnny Harris transcripts to:
1. Search if topics have been covered before
2. Check whether a draft script matches Johnny's voice and tone
"""

import logging
import os
import re

import gradio as gr
from dotenv import load_dotenv

from src.vectorstore import TranscriptVectorStore, create_vectorstore
from src.llm_client import InferenceProviderClient, create_llm_client
from src.prompts import (
    TOPIC_SEARCH_SYSTEM_PROMPT,
    SCRIPT_SYSTEM_PROMPT,
    TONE_CHECK_SYSTEM_PROMPT,
    get_topic_search_prompt,
    get_script_prompt,
    get_tone_check_prompt
)

# Load environment variables before any component is created
load_dotenv()

# Lazily-created shared instances; populated on first use by
# get_vectorstore() and get_llm_client()
vectorstore = llm_client = None


def get_vectorstore() -> TranscriptVectorStore:
    """Return the shared vector store, creating it on the first call."""
    global vectorstore
    if vectorstore is not None:
        return vectorstore
    vectorstore = create_vectorstore()
    return vectorstore


def get_llm_client() -> InferenceProviderClient:
    """Return the shared LLM client, creating it on the first call."""
    global llm_client
    if llm_client is not None:
        return llm_client
    llm_client = create_llm_client()
    return llm_client


# =============================================================================
# TAB 1: TOPIC SEARCH
# =============================================================================

# Leading bullet ("-", "*", "•") or numbering ("1.", "2)") that an LLM may
# prepend to a line even when asked not to.
_LIST_MARKER_RE = re.compile(r"^\s*(?:[-*\u2022]+|\d+[.)])\s+")


def _parse_search_terms(response: str, query: str, limit: int = 3) -> list:
    """Extract up to *limit* distinct extra search terms from raw LLM output.

    Blank lines, list markers, repeats of *query* and duplicate terms
    (compared case-insensitively) are dropped so no search is run twice.
    """
    seen = {query.casefold()}
    terms = []
    for line in response.splitlines():
        term = _LIST_MARKER_RE.sub("", line).strip()
        key = term.casefold()
        if not term or key in seen:
            continue
        seen.add(key)
        terms.append(term)
        if len(terms) == limit:
            break
    return terms


def expand_query(query: str) -> list:
    """Use LLM to generate related search terms for broader coverage.

    Args:
        query: The user's search query.

    Returns:
        A list whose first element is always ``query``, followed by at most
        three distinct related terms suggested by the LLM. If the LLM call
        fails for any reason, the list contains only ``query``.
    """
    try:
        llm = get_llm_client()
        prompt = f"""Given this search query about Johnny Harris video topics: "{query}"

Generate 2-3 closely related search terms that might find relevant videos.
Focus on: the core topic, key entities mentioned, and one closely related concept.
Return ONLY the terms, one per line, no numbering or explanation."""

        response = llm.generate(prompt, max_tokens=60, temperature=0.3)
        return [query] + _parse_search_terms(response, query)
    except Exception:
        # Expansion is best-effort: record why it failed, then fall back to
        # searching with the original query only.
        logging.getLogger(__name__).warning(
            "Query expansion failed; using the original query only",
            exc_info=True,
        )
        return [query]


def _keep_best_per_video(best: dict, chunks) -> None:
    """Store each chunk in *best* unless its video already has a higher-similarity one."""
    for chunk in chunks:
        current = best.get(chunk.video_id)
        if current is None or chunk.similarity > current.similarity:
            best[chunk.video_id] = chunk


def search_topics(query: str, progress=gr.Progress()):
    """
    Generator that yields progress updates during search.
    Uses tiered results: direct matches and related content.

    Each video appears at most once in the output. A video that is a direct
    match for any search term is listed as a direct match, regardless of the
    order in which the terms were searched, and each video is represented by
    its highest-similarity chunk.

    Args:
        query: User's topic or question
        progress: Gradio progress tracker

    Yields:
        Progress status messages, then final search results (or an error /
        "no results" message)
    """
    if not query or not query.strip():
        yield "Please enter a topic or question to search."
        return

    try:
        vs = get_vectorstore()

        # Expand query using LLM
        progress(0.1, desc="Expanding search query...")
        yield "Expanding search query..."
        search_terms = expand_query(query.strip())

        # Best chunk per video, tracked separately for each tier
        best_direct = {}
        best_related = {}

        total_terms = len(search_terms)
        for i, term in enumerate(search_terms):
            pct = 0.2 + (0.5 * (i / total_terms))
            progress(pct, desc=f"Searching: {term[:30]}...")
            yield f"Searching: {term[:30]}..."

            direct, related = vs.tiered_similarity_search(
                query=term,
                direct_threshold=0.6,
                related_threshold=0.3,
                max_per_tier=10
            )

            _keep_best_per_video(best_direct, direct)
            _keep_best_per_video(best_related, related)

        progress(0.8, desc="Processing results...")
        yield "Processing results..."

        # Sort each tier by similarity; a direct match takes precedence over
        # a related hit for the same video, whichever term produced it.
        all_direct = sorted(
            best_direct.values(), key=lambda x: x.similarity, reverse=True
        )[:10]
        all_related = sorted(
            (
                chunk
                for video_id, chunk in best_related.items()
                if video_id not in best_direct
            ),
            key=lambda x: x.similarity,
            reverse=True,
        )[:10]

        if not all_direct and not all_related:
            yield f"No matching content found for: **{query}**\n\nThis topic may not have been covered yet, or try rephrasing your search."
            return

        # Format tiered output
        output_parts = []
        search_info = f"*Searched: {', '.join(search_terms)}*\n\n"
        output_parts.append(f"## Search Results for: \"{query}\"\n\n{search_info}")

        if all_direct:
            output_parts.append("### Direct Matches\nVideos that directly cover this topic:\n")
            output_parts.append(vs.format_results_for_display(all_direct))

        if all_related:
            if all_direct:
                output_parts.append("\n---\n")
            output_parts.append("### Related Content\nVideos that touch on similar themes:\n")
            output_parts.append(vs.format_results_for_display(all_related))

        progress(1.0, desc="Done!")
        yield "\n".join(output_parts)

    except Exception as e:
        # Keep the traceback in the logs; the user only sees the message.
        logging.getLogger(__name__).exception("Topic search failed")
        yield f"Error searching: {str(e)}"


# =============================================================================
# TAB 2: TONE CHECKER
# =============================================================================

def check_script_tone(user_script: str, progress=gr.Progress()):
    """
    Stream status messages while a script is compared against Johnny's style.

    Args:
        user_script: The script text to analyze
        progress: Gradio progress tracker

    Yields:
        Status messages for each stage, then the final tone analysis
        (or an error message if any stage raises)
    """
    script_text = (user_script or "").strip()
    if not script_text:
        yield "Please enter a script to analyze."
        return

    try:
        progress(0.05, desc="Gathering style references...")
        yield "Gathering style references from Johnny's archive..."
        store = get_vectorstore()
        client = get_llm_client()

        progress(0.15, desc="Searching knowledge base...")
        yield "Searching knowledge base for style references..."
        # Only the opening 500 characters serve as the topic hint
        style_chunks = store.get_bulk_style_context(
            topic_query=script_text[:500],
            max_chunks=50,
            topic_relevant_ratio=0.4,
        )

        progress(0.35, desc="Preparing context...")
        yield "Preparing context for analysis..."
        if style_chunks:
            style_context = store.format_context_for_llm(style_chunks)
        else:
            style_context = ""

        progress(0.5, desc="Building prompt...")
        yield "Building analysis prompt..."
        full_prompt = get_tone_check_prompt().format(
            user_script=script_text,
            context=style_context,
        )

        progress(0.7, desc="Analyzing tone (30-60 seconds)...")
        yield "Analyzing script tone (this may take 30-60 seconds)..."
        raw_analysis = client.generate(
            prompt=full_prompt,
            system_prompt=TONE_CHECK_SYSTEM_PROMPT,
            temperature=0.3,
            max_tokens=1500,
        )

        progress(1.0, desc="Complete!")
        yield raw_analysis.strip()

    except Exception as exc:
        yield f"**Error:** {str(exc)}"


# =============================================================================
# GRADIO INTERFACE
# =============================================================================

def create_app():
    """Create and configure the Gradio application.

    Builds a Blocks layout with two tabs: "Topic Search" (wired to
    ``search_topics``) and "Tone Checker" (wired to ``check_script_tone``).
    In each tab the button click and the textbox submit event run the same
    handler.

    Returns:
        The configured ``gr.Blocks`` app; it is not launched here.
    """

    with gr.Blocks(
        title="NewPress AI - Johnny Harris Script Assistant"
    ) as app:
        app.queue()  # Enable queue before defining event handlers for progress to work

        # Intro text lists the features the tabs below actually provide
        gr.Markdown("""
        # NewPress AI
        ### Johnny Harris Script Assistant

        Use Johnny's archive of hundreds of video transcripts to:
        - **Search** if a topic has been covered before
        - **Check** whether your script matches Johnny's voice and tone
        """)

        with gr.Tabs():
            # =================================================================
            # TAB 1: TOPIC SEARCH
            # =================================================================
            with gr.TabItem("Topic Search"):
                gr.Markdown("""
                ### Has Johnny covered this topic?

                Search the archive to see if a topic has been addressed in previous videos.
                """)

                with gr.Row():
                    with gr.Column(scale=3):
                        topic_input = gr.Textbox(
                            label="Topic or Question",
                            placeholder="e.g., Why do borders exist? or US immigration policy",
                            lines=2
                        )
                    with gr.Column(scale=1):
                        search_btn = gr.Button("Search", variant="primary", size="lg")

                search_output = gr.Markdown(label="Search Results", value="Search results will appear here...")

                # Button click and textbox submit run the same search
                for listen in (search_btn.click, topic_input.submit):
                    listen(
                        fn=search_topics,
                        inputs=[topic_input],
                        outputs=[search_output],
                        show_progress="full"
                    )

            # =================================================================
            # TAB 2: TONE CHECKER
            # =================================================================
            with gr.TabItem("Tone Checker"):
                gr.Markdown("""
                ### Check if your script matches Johnny's voice

                Paste your script below to analyze how well it matches Johnny Harris's
                signature style. Get a score and specific feedback on what works and what to improve.
                """)

                with gr.Row():
                    with gr.Column():
                        script_input = gr.Textbox(
                            label="Your Script",
                            placeholder="""Paste your script here...

Example:
There's this line on the map that most people have never heard of.
It's called the Durand Line, and it cuts right through the middle of a people
who have lived in these mountains for thousands of years.
The thing is, this line wasn't drawn by the people who live here...""",
                            lines=15
                        )

                        check_btn = gr.Button("Check Tone", variant="primary", size="lg")

                tone_output = gr.Markdown(label="Tone Analysis", value="Tone analysis will appear here...")

                # Button click and textbox submit run the same analysis
                for listen in (check_btn.click, script_input.submit):
                    listen(
                        fn=check_script_tone,
                        inputs=[script_input],
                        outputs=[tone_output],
                        show_progress="full"
                    )

        gr.Markdown("""
        ---
        *Powered by Johnny Harris's transcript archive, Jina AI embeddings, and Qwen-2.5-72B*
        """)

    return app


# =============================================================================
# MAIN
# =============================================================================

# Build the app at import time so the `gradio app.py` CLI can find `demo`
demo = create_app()

if __name__ == "__main__":
    # Direct execution: serve on all interfaces, port 7860, no share link
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)