"""
NewPress AI - Johnny Harris Script Assistant
A Gradio app that uses a Supabase vector database of Johnny Harris transcripts to:
1. Search whether a topic has been covered in previous videos
2. Check how closely a draft script matches Johnny's tone and style
"""
import os
import gradio as gr
from dotenv import load_dotenv
from src.vectorstore import TranscriptVectorStore, create_vectorstore
from src.llm_client import InferenceProviderClient, create_llm_client
from src.prompts import (
TOPIC_SEARCH_SYSTEM_PROMPT,
SCRIPT_SYSTEM_PROMPT,
TONE_CHECK_SYSTEM_PROMPT,
get_topic_search_prompt,
get_script_prompt,
get_tone_check_prompt
)
# Load environment variables
load_dotenv()
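# Assumption: .env supplies the Supabase and inference-provider credentials
# consumed by src.vectorstore and src.llm_client (the exact variable names are
# defined in those modules, not here).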
# Initialize components (lazy loading)
vectorstore = None
llm_client = None
def get_vectorstore() -> TranscriptVectorStore:
"""Get or create the vector store instance"""
global vectorstore
if vectorstore is None:
vectorstore = create_vectorstore()
return vectorstore
def get_llm_client() -> InferenceProviderClient:
"""Get or create the LLM client instance"""
global llm_client
if llm_client is None:
llm_client = create_llm_client()
return llm_client
# =============================================================================
# TAB 1: TOPIC SEARCH
# =============================================================================
def expand_query(query: str) -> list:
"""Use LLM to generate related search terms for broader coverage"""
try:
llm = get_llm_client()
prompt = f"""Given this search query about Johnny Harris video topics: "{query}"
Generate 2-3 closely related search terms that might find relevant videos.
Focus on: the core topic, key entities mentioned, and one closely related concept.
Return ONLY the terms, one per line, no numbering or explanation."""
response = llm.generate(prompt, max_tokens=60, temperature=0.3)
terms = [t.strip() for t in response.strip().split('\n') if t.strip()]
return [query] + terms[:3]
except Exception:
return [query]
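# Illustrative example (the actual terms depend on the LLM; output below is an assumption):
#   expand_query("US immigration policy")
#   -> ["US immigration policy", "immigration reform", "border policy", "asylum seekers"]
# On any LLM error the function degrades gracefully to [query] alone.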
def search_topics(query: str, progress=gr.Progress()):
"""
Generator that yields progress updates during search.
Uses tiered results: direct matches and related content.
Args:
query: User's topic or question
progress: Gradio progress tracker
Yields:
Progress status messages, then final search results
"""
if not query or not query.strip():
yield "Please enter a topic or question to search."
return
try:
vs = get_vectorstore()
# Expand query using LLM
progress(0.1, desc="Expanding search query...")
yield "Expanding search query..."
search_terms = expand_query(query.strip())
# Collect tiered results from all search terms
all_direct = []
all_related = []
seen_videos = set()
total_terms = len(search_terms)
for i, term in enumerate(search_terms):
pct = 0.2 + (0.5 * (i / total_terms))
progress(pct, desc=f"Searching: {term[:30]}...")
yield f"Searching: {term[:30]}..."
direct, related = vs.tiered_similarity_search(
query=term,
direct_threshold=0.6,
related_threshold=0.3,
max_per_tier=10
)
# Add results, deduplicating by video
for chunk in direct:
if chunk.video_id not in seen_videos:
seen_videos.add(chunk.video_id)
all_direct.append(chunk)
for chunk in related:
if chunk.video_id not in seen_videos:
seen_videos.add(chunk.video_id)
all_related.append(chunk)
progress(0.8, desc="Processing results...")
yield "Processing results..."
# Sort each tier by similarity
all_direct = sorted(all_direct, key=lambda x: x.similarity, reverse=True)[:10]
all_related = sorted(all_related, key=lambda x: x.similarity, reverse=True)[:10]
if not all_direct and not all_related:
yield f"No matching content found for: **{query}**\n\nThis topic may not have been covered yet, or try rephrasing your search."
return
# Format tiered output
output_parts = []
search_info = f"*Searched: {', '.join(search_terms)}*\n\n"
output_parts.append(f"## Search Results for: \"{query}\"\n\n{search_info}")
if all_direct:
output_parts.append("### Direct Matches\nVideos that directly cover this topic:\n")
output_parts.append(vs.format_results_for_display(all_direct))
if all_related:
if all_direct:
output_parts.append("\n---\n")
output_parts.append("### Related Content\nVideos that touch on similar themes:\n")
output_parts.append(vs.format_results_for_display(all_related))
progress(1.0, desc="Done!")
yield "\n".join(output_parts)
except Exception as e:
yield f"Error searching: {str(e)}"
# =============================================================================
# TAB 2: TONE CHECKER
# =============================================================================
def check_script_tone(user_script: str, progress=gr.Progress()):
"""
Generator that yields progress updates during tone analysis.
Args:
user_script: User's script to analyze
progress: Gradio progress tracker
Yields:
Progress status messages, then final tone analysis
"""
if not user_script or not user_script.strip():
yield "Please enter a script to analyze."
return
try:
progress(0.05, desc="Gathering style references...")
yield "Gathering style references from Johnny's archive..."
vs = get_vectorstore()
llm = get_llm_client()
progress(0.15, desc="Searching knowledge base...")
yield "Searching knowledge base for style references..."
context_chunks = vs.get_bulk_style_context(
topic_query=user_script.strip()[:500], # Use first 500 chars as topic hint
max_chunks=50,
topic_relevant_ratio=0.4
)
progress(0.35, desc="Preparing context...")
yield "Preparing context for analysis..."
context = vs.format_context_for_llm(context_chunks) if context_chunks else ""
progress(0.5, desc="Building prompt...")
yield "Building analysis prompt..."
prompt_template = get_tone_check_prompt()
prompt = prompt_template.format(
user_script=user_script.strip(),
context=context
)
progress(0.7, desc="Analyzing tone (30-60 seconds)...")
yield "Analyzing script tone (this may take 30-60 seconds)..."
analysis = llm.generate(
prompt=prompt,
system_prompt=TONE_CHECK_SYSTEM_PROMPT,
temperature=0.3,
max_tokens=1500
)
progress(1.0, desc="Complete!")
yield analysis.strip()
except Exception as e:
yield f"**Error:** {str(e)}"
# =============================================================================
# GRADIO INTERFACE
# =============================================================================
def create_app():
"""Create and configure the Gradio application"""
with gr.Blocks(
title="NewPress AI - Johnny Harris Script Assistant"
) as app:
app.queue() # Enable queue before defining event handlers for progress to work
gr.Markdown("""
# NewPress AI
### Johnny Harris Script Assistant
Use Johnny's archive of hundreds of video transcripts to:
- **Search** whether a topic has been covered before
- **Check** how closely your draft script matches Johnny's tone
""")
with gr.Tabs():
# =================================================================
# TAB 1: TOPIC SEARCH
# =================================================================
with gr.TabItem("Topic Search"):
gr.Markdown("""
### Has Johnny covered this topic?
Search the archive to see if a topic has been addressed in previous videos.
""")
with gr.Row():
with gr.Column(scale=3):
topic_input = gr.Textbox(
label="Topic or Question",
placeholder="e.g., Why do borders exist? or US immigration policy",
lines=2
)
with gr.Column(scale=1):
search_btn = gr.Button("Search", variant="primary", size="lg")
search_output = gr.Markdown(label="Search Results", value="Search results will appear here...")
search_btn.click(
fn=search_topics,
inputs=[topic_input],
outputs=[search_output],
show_progress="full"
)
topic_input.submit(
fn=search_topics,
inputs=[topic_input],
outputs=[search_output],
show_progress="full"
)
# =================================================================
# TAB 2: TONE CHECKER
# =================================================================
with gr.TabItem("Tone Checker"):
gr.Markdown("""
### Check if your script matches Johnny's voice
Paste your script below to analyze how well it matches Johnny Harris's
signature style. Get a score and specific feedback on what works and what to improve.
""")
with gr.Row():
with gr.Column():
script_input = gr.Textbox(
label="Your Script",
placeholder="""Paste your script here...
Example:
There's this line on the map that most people have never heard of.
It's called the Durand Line, and it cuts right through the middle of a people
who have lived in these mountains for thousands of years.
The thing is, this line wasn't drawn by the people who live here...""",
lines=15
)
check_btn = gr.Button("Check Tone", variant="primary", size="lg")
tone_output = gr.Markdown(label="Tone Analysis", value="Tone analysis will appear here...")
check_btn.click(
fn=check_script_tone,
inputs=[script_input],
outputs=[tone_output],
show_progress="full"
)
script_input.submit(
fn=check_script_tone,
inputs=[script_input],
outputs=[tone_output],
show_progress="full"
)
gr.Markdown("""
---
*Powered by Johnny Harris's transcript archive, Jina AI embeddings, and Qwen-2.5-72B*
""")
return app
# =============================================================================
# MAIN
# =============================================================================
# Create app at module level for `gradio app.py` CLI compatibility
demo = create_app()
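# Launch notes: server_name="0.0.0.0" binds all network interfaces (needed when
# running inside a container or hosted Space), and 7860 is Gradio's default port.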
if __name__ == "__main__":
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=False
)
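# Local smoke test (hypothetical snippet; assumes valid credentials in .env and
# that the handlers behave as plain generators when called outside a Gradio event):
#   from app import search_topics
#   for update in search_topics("Why do borders exist?"):
#       print(update)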