"""
NewPress AI - Johnny Harris Script Assistant
A Gradio app that uses a Supabase vector database of Johnny Harris transcripts to:
1. Search whether a topic has been covered in previous videos
2. Check how closely a draft script matches Johnny's tone and style
"""
import os
import gradio as gr
from dotenv import load_dotenv
from src.vectorstore import TranscriptVectorStore, create_vectorstore
from src.llm_client import InferenceProviderClient, create_llm_client
from src.prompts import (
TOPIC_SEARCH_SYSTEM_PROMPT,
SCRIPT_SYSTEM_PROMPT,
TONE_CHECK_SYSTEM_PROMPT,
get_topic_search_prompt,
get_script_prompt,
get_tone_check_prompt
)
# Load environment variables
load_dotenv()
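# Assumption: .env supplies the Supabase and inference-provider credentials
# consumed by src.vectorstore and src.llm_client (the exact variable names are
# defined in those modules, not here).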
# Initialize components (lazy loading)
vectorstore = None
llm_client = None
def get_vectorstore() -> TranscriptVectorStore:
"""Get or create the vector store instance"""
global vectorstore
if vectorstore is None:
vectorstore = create_vectorstore()
return vectorstore
def get_llm_client() -> InferenceProviderClient:
"""Get or create the LLM client instance"""
global llm_client
if llm_client is None:
llm_client = create_llm_client()
return llm_client
# =============================================================================
# TAB 1: TOPIC SEARCH
# =============================================================================
def expand_query(query: str) -> list:
"""Use LLM to generate related search terms for broader coverage"""
try:
llm = get_llm_client()
prompt = f"""Given this search query about Johnny Harris video topics: "{query}"
Generate 2-3 closely related search terms that might find relevant videos.
Focus on: the core topic, key entities mentioned, and one closely related concept.
Return ONLY the terms, one per line, no numbering or explanation."""
response = llm.generate(prompt, max_tokens=60, temperature=0.3)
terms = [t.strip() for t in response.strip().split('\n') if t.strip()]
return [query] + terms[:3]
except Exception:
return [query]
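# Illustrative example (the actual terms depend on the LLM; output below is an assumption):
#   expand_query("US immigration policy")
#   -> ["US immigration policy", "immigration reform", "border policy", "asylum seekers"]
# On any LLM error the function degrades gracefully to [query] alone.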
def search_topics(query: str, progress=gr.Progress()):
"""
Generator that yields progress updates during search.
Uses tiered results: direct matches and related content.
Args:
query: User's topic or question
progress: Gradio progress tracker
Yields:
Progress status messages, then final search results
"""
if not query or not query.strip():
yield "Please enter a topic or question to search."
return
try:
vs = get_vectorstore()
# Expand query using LLM
progress(0.1, desc="Expanding search query...")
yield "Expanding search query..."
search_terms = expand_query(query.strip())
# Collect tiered results from all search terms
all_direct = []
all_related = []
seen_videos = set()
total_terms = len(search_terms)
for i, term in enumerate(search_terms):
pct = 0.2 + (0.5 * (i / total_terms))
progress(pct, desc=f"Searching: {term[:30]}...")
yield f"Searching: {term[:30]}..."
direct, related = vs.tiered_similarity_search(
query=term,
direct_threshold=0.6,
related_threshold=0.3,
max_per_tier=10
)
# Add results, deduplicating by video
for chunk in direct:
if chunk.video_id not in seen_videos:
seen_videos.add(chunk.video_id)
all_direct.append(chunk)
for chunk in related:
if chunk.video_id not in seen_videos:
seen_videos.add(chunk.video_id)
all_related.append(chunk)
progress(0.8, desc="Processing results...")
yield "Processing results..."
# Sort each tier by similarity
all_direct = sorted(all_direct, key=lambda x: x.similarity, reverse=True)[:10]
all_related = sorted(all_related, key=lambda x: x.similarity, reverse=True)[:10]
if not all_direct and not all_related:
yield f"No matching content found for: **{query}**\n\nThis topic may not have been covered yet, or try rephrasing your search."
return
# Format tiered output
output_parts = []
search_info = f"*Searched: {', '.join(search_terms)}*\n\n"
output_parts.append(f"## Search Results for: \"{query}\"\n\n{search_info}")
if all_direct:
output_parts.append("### Direct Matches\nVideos that directly cover this topic:\n")
output_parts.append(vs.format_results_for_display(all_direct))
if all_related:
if all_direct:
output_parts.append("\n---\n")
output_parts.append("### Related Content\nVideos that touch on similar themes:\n")
output_parts.append(vs.format_results_for_display(all_related))
progress(1.0, desc="Done!")
yield "\n".join(output_parts)
except Exception as e:
yield f"Error searching: {str(e)}"
# =============================================================================
# TAB 2: TONE CHECKER
# =============================================================================
def check_script_tone(user_script: str, progress=gr.Progress()):
"""
Generator that yields progress updates during tone analysis.
Args:
user_script: User's script to analyze
progress: Gradio progress tracker
Yields:
Progress status messages, then final tone analysis
"""
if not user_script or not user_script.strip():
yield "Please enter a script to analyze."
return
try:
progress(0.05, desc="Gathering style references...")
yield "Gathering style references from Johnny's archive..."
vs = get_vectorstore()
llm = get_llm_client()
progress(0.15, desc="Searching knowledge base...")
yield "Searching knowledge base for style references..."
context_chunks = vs.get_bulk_style_context(
topic_query=user_script.strip()[:500], # Use first 500 chars as topic hint
max_chunks=50,
topic_relevant_ratio=0.4
)
progress(0.35, desc="Preparing context...")
yield "Preparing context for analysis..."
context = vs.format_context_for_llm(context_chunks) if context_chunks else ""
progress(0.5, desc="Building prompt...")
yield "Building analysis prompt..."
prompt_template = get_tone_check_prompt()
prompt = prompt_template.format(
user_script=user_script.strip(),
context=context
)
progress(0.7, desc="Analyzing tone (30-60 seconds)...")
yield "Analyzing script tone (this may take 30-60 seconds)..."
analysis = llm.generate(
prompt=prompt,
system_prompt=TONE_CHECK_SYSTEM_PROMPT,
temperature=0.3,
max_tokens=1500
)
progress(1.0, desc="Complete!")
yield analysis.strip()
except Exception as e:
yield f"**Error:** {str(e)}"
# =============================================================================
# GRADIO INTERFACE
# =============================================================================
def create_app():
"""Create and configure the Gradio application"""
with gr.Blocks(
title="NewPress AI - Johnny Harris Script Assistant"
) as app:
app.queue() # Enable queue before defining event handlers for progress to work
gr.Markdown("""
# NewPress AI
### Johnny Harris Script Assistant
Use Johnny's archive of hundreds of video transcripts to:
- **Search** whether a topic has been covered before
- **Check** how closely your draft script matches Johnny's tone
""")
with gr.Tabs():
# =================================================================
# TAB 1: TOPIC SEARCH
# =================================================================
with gr.TabItem("Topic Search"):
gr.Markdown("""
### Has Johnny covered this topic?
Search the archive to see if a topic has been addressed in previous videos.
""")
with gr.Row():
with gr.Column(scale=3):
topic_input = gr.Textbox(
label="Topic or Question",
placeholder="e.g., Why do borders exist? or US immigration policy",
lines=2
)
with gr.Column(scale=1):
search_btn = gr.Button("Search", variant="primary", size="lg")
search_output = gr.Markdown(label="Search Results", value="Search results will appear here...")
search_btn.click(
fn=search_topics,
inputs=[topic_input],
outputs=[search_output],
show_progress="full"
)
topic_input.submit(
fn=search_topics,
inputs=[topic_input],
outputs=[search_output],
show_progress="full"
)
# =================================================================
# TAB 2: TONE CHECKER
# =================================================================
with gr.TabItem("Tone Checker"):
gr.Markdown("""
### Check if your script matches Johnny's voice
Paste your script below to analyze how well it matches Johnny Harris's
signature style. Get a score and specific feedback on what works and what to improve.
""")
with gr.Row():
with gr.Column():
script_input = gr.Textbox(
label="Your Script",
placeholder="""Paste your script here...
Example:
There's this line on the map that most people have never heard of.
It's called the Durand Line, and it cuts right through the middle of a people
who have lived in these mountains for thousands of years.
The thing is, this line wasn't drawn by the people who live here...""",
lines=15
)
check_btn = gr.Button("Check Tone", variant="primary", size="lg")
tone_output = gr.Markdown(label="Tone Analysis", value="Tone analysis will appear here...")
check_btn.click(
fn=check_script_tone,
inputs=[script_input],
outputs=[tone_output],
show_progress="full"
)
script_input.submit(
fn=check_script_tone,
inputs=[script_input],
outputs=[tone_output],
show_progress="full"
)
gr.Markdown("""
---
*Powered by Johnny Harris's transcript archive, Jina AI embeddings, and Qwen-2.5-72B*
""")
return app
# =============================================================================
# MAIN
# =============================================================================
# Create app at module level for `gradio app.py` CLI compatibility
demo = create_app()
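# Launch notes: server_name="0.0.0.0" binds all network interfaces (needed when
# running inside a container or hosted Space), and 7860 is Gradio's default port.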
if __name__ == "__main__":
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=False
)
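# Local smoke test (hypothetical snippet; assumes valid credentials in .env and
# that the handlers behave as plain generators when called outside a Gradio event):
#   from app import search_topics
#   for update in search_topics("Why do borders exist?"):
#       print(update)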