Final_Assignment_Template

Sleeping

App Files Files Community

santimber committed on Jun 30, 2025

Commit

cf93357

1 Parent(s): f206914

handle images and youtube

Browse files

Files changed (4) hide show

__pycache__/app.cpython-311.pyc +0 -0
__pycache__/tools.cpython-311.pyc +0 -0
test_youtube_question.py +71 -0
tools.py +62 -16

__pycache__/app.cpython-311.pyc CHANGED Viewed

Binary files a/__pycache__/app.cpython-311.pyc and b/__pycache__/app.cpython-311.pyc differ

__pycache__/tools.cpython-311.pyc CHANGED Viewed

Binary files a/__pycache__/tools.cpython-311.pyc and b/__pycache__/tools.cpython-311.pyc differ

test_youtube_question.py ADDED Viewed

	@@ -0,0 +1,71 @@

+#!/usr/bin/env python3
+"""
+Test script to verify video analysis for a GAIA YouTube question.
+"""
+import requests
+from tools import video_analysis_tool
+def test_youtube_video_question():
+    api_url = "https://agents-course-unit4-scoring.hf.space"
+    questions_url = f"{api_url}/questions"
+    print("=== Testing YouTube Video Question ===")
+    # 1. Fetch questions
+    print("1. Fetching questions...")
+    try:
+        response = requests.get(questions_url, timeout=15)
+        response.raise_for_status()
+        questions_data = response.json()
+        print(f"✅ Fetched {len(questions_data)} questions")
+    except Exception as e:
+        print(f"❌ Failed to fetch questions: {e}")
+        return
+    # 2. Find a question with a YouTube link in the question text or file_name
+    youtube_question = None
+    for i, question in enumerate(questions_data):
+        qtext = question.get('question', '').lower()
+        fname = question.get('file_name', '').lower()
+        if 'youtube.com' in qtext or 'youtu.be' in qtext or 'youtube.com' in fname or 'youtu.be' in fname:
+            youtube_question = (i, question)
+            break
+    if not youtube_question:
+        print("❌ No YouTube video questions found.")
+        return
+    idx, question = youtube_question
+    question_text = question.get('question')
+    file_name = question.get('file_name', '')
+    print(f"\n2. Found YouTube video question {idx+1}:")
+    print(f"   Question: {question_text[:120]}...")
+    print(f"   File name: {file_name}")
+    # 3. Extract YouTube URL
+    # Try to find a YouTube URL in the question text or file_name
+    import re
+    yt_url = None
+    yt_pattern = r'(https?://(?:www\.)?(?:youtube\.com|youtu\.be)[^\s]*)'
+    match = re.search(yt_pattern, question_text)
+    if match:
+        yt_url = match.group(1)
+    elif file_name and ('youtube.com' in file_name or 'youtu.be' in file_name):
+        yt_url = file_name
+    if not yt_url:
+        print("❌ Could not extract YouTube URL from question.")
+        return
+    print(f"3. YouTube URL: {yt_url}")
+    # 4. Analyze the video
+    print("4. Analyzing video with video_analysis_tool...")
+    result = video_analysis_tool.invoke(yt_url)
+    print(f"5. Tool result:")
+    print(f"   {result[:500]}...")
+    print("\n✅ YouTube video analysis test complete!")
+if __name__ == "__main__":
+    test_youtube_video_question()

tools.py CHANGED Viewed

@@ -290,23 +290,64 @@ def image_recognition(img_path: str) -> str:
     try:
         if not os.path.exists(img_path):
             return f"Error: Image file not found at {img_path}"
         if not os.getenv("OPENAI_API_KEY"):
             return "OpenAI API key not found. Please set OPENAI_API_KEY in your environment variables."
-        vision_llm = ChatOpenAI(model="gpt-4o")
-        with open(img_path, "rb") as image_file:
-            image_bytes = image_file.read()
-        image_base64 = base64.b64encode(image_bytes).decode("utf-8")
-        message = [
-            HumanMessage(
-                content=[
-                    {"type": "text", "text": "Describe the image or extract all the text from this image. Return only the description or extracted text, no explanations."},
-                    {"type": "image_url", "image_url": {
-                        "url": f"data:image/png;base64,{image_base64}"}},
-                ]
-            )
-        ]
-        response = vision_llm.invoke(message)
-        return response.content.strip()
     except Exception as e:
         return f"Error analyzing image: {str(e)}"
@@ -608,9 +649,14 @@ def analyze_excel_file(file_path: str, query: str) -> str:
         return f"Error analyzing Excel file: {str(e)}"
 analyze_excel_file_tool = Tool(
     name="analyze_excel_file_tool",
-    func=analyze_excel_file,
     description="Analyze an Excel file using pandas and answer a question about it."
 )

     try:
         if not os.path.exists(img_path):
             return f"Error: Image file not found at {img_path}"
         if not os.getenv("OPENAI_API_KEY"):
             return "OpenAI API key not found. Please set OPENAI_API_KEY in your environment variables."
+        # Get image info first
+        try:
+            img = Image.open(img_path)
+            image_info = f"Image: {img.size[0]}x{img.size[1]} pixels, mode: {img.mode}"
+        except Exception as e:
+            image_info = f"Image info error: {str(e)}"
+        # Try vision model
+        try:
+            vision_llm = ChatOpenAI(model="gpt-4o", temperature=0)
+            with open(img_path, "rb") as image_file:
+                image_bytes = image_file.read()
+            image_base64 = base64.b64encode(image_bytes).decode("utf-8")
+            message = [
+                HumanMessage(
+                    content=[
+                        {"type": "text", "text": "Describe what you see in this image in detail. If there's text, extract it. If it's a chess position, describe the board state and pieces."},
+                        {"type": "image_url", "image_url": {
+                            "url": f"data:image/png;base64,{image_base64}"}},
+                    ]
+                )
+            ]
+            response = vision_llm.invoke(message)
+            vision_result = response.content.strip()
+            # Check if we got a content policy response
+            if "sorry" in vision_result.lower() and "can't assist" in vision_result.lower():
+                # Fallback to OCR
+                try:
+                    import pytesseract
+                    text = pytesseract.image_to_string(img).strip()
+                    if text:
+                        return f"{image_info}\n\nOCR extracted text:\n{text}"
+                    else:
+                        return f"{image_info}\n\nVision model blocked. OCR found no text."
+                except ImportError:
+                    return f"{image_info}\n\nVision model blocked. OCR not available."
+            else:
+                return f"{image_info}\n\nVision analysis:\n{vision_result}"
+        except Exception as vision_error:
+            # Fallback to OCR if vision fails
+            try:
+                import pytesseract
+                text = pytesseract.image_to_string(img).strip()
+                if text:
+                    return f"{image_info}\n\nVision failed, OCR extracted text:\n{text}"
+                else:
+                    return f"{image_info}\n\nVision failed: {str(vision_error)}. OCR found no text."
+            except ImportError:
+                return f"{image_info}\n\nVision failed: {str(vision_error)}. OCR not available."
     except Exception as e:
         return f"Error analyzing image: {str(e)}"
         return f"Error analyzing Excel file: {str(e)}"
+def analyze_excel_file_simple(file_path: str) -> str:
+    """Wrapper for analyze_excel_file that uses a default query."""
+    return analyze_excel_file(file_path, "Analyze this spreadsheet")
 analyze_excel_file_tool = Tool(
     name="analyze_excel_file_tool",
+    func=analyze_excel_file_simple,
     description="Analyze an Excel file using pandas and answer a question about it."
 )