Spaces:
Build error
Build error
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, HTTPException
|
| 2 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 3 |
+
from pydantic import BaseModel
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.common.by import By
|
| 6 |
+
from selenium.webdriver.chrome.options import Options
|
| 7 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 8 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 9 |
+
from selenium.common.exceptions import TimeoutException
|
| 10 |
+
import time
|
| 11 |
+
import logging
|
| 12 |
+
import os
|
| 13 |
+
|
| 14 |
+
# Module-wide logging: root logger at INFO, plus a logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The ASGI application object that the route decorators below attach to.
app = FastAPI()

# CORS: every origin, method and header is accepted.
# NOTE(review): the FastAPI CORS docs state that allow_origins/allow_methods/
# allow_headers must not be ["*"] when allow_credentials=True. Confirm whether
# credentialed requests are actually needed; if not, allow_credentials can be
# dropped. Left unchanged here to preserve behavior.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 28 |
+
|
| 29 |
+
class VideoRequest(BaseModel):
    """Request body for POST /transcript."""

    # Address of the video page to scrape; checked by the endpoint before use.
    url: str
|
| 31 |
+
|
| 32 |
+
class TranscriptResponse(BaseModel):
    """Response body returned by POST /transcript.

    All four fields are required at construction time; the endpoint passes
    ``None`` explicitly for whichever of ``transcript``/``error`` does not apply.
    """

    # True when a transcript was extracted, False when the attempt failed.
    success: bool
    # Transcript lines in page order, or None on failure.
    transcript: list[str] | None
    # Human-readable failure description, or None on success.
    error: str | None
    # Seconds elapsed while handling the request.
    processing_time: float
|
| 37 |
+
|
| 38 |
+
def init_driver():
    """Create and return a headless Chrome WebDriver.

    The caller owns the returned driver and is responsible for calling
    ``quit()`` on it.
    """
    chrome_options = Options()

    # Command-line switches passed to the Chrome process, in the original order.
    for flag in (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--log-level=3",
    ):
        chrome_options.add_argument(flag)

    # Turn off the automation switch and the automation extension.
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    # For Hugging Face Spaces
    chrome_options.binary_location = "/usr/bin/google-chrome"

    driver = webdriver.Chrome(options=chrome_options)
    return driver
|
| 51 |
+
|
| 52 |
+
# Hosts accepted as YouTube pages; any subdomain (www., m., music., ...) also matches.
_YOUTUBE_HOSTS = ("youtube.com", "youtu.be")


def _is_youtube_url(url: str) -> bool:
    """Return True when *url* is an http(s) URL whose host is a YouTube domain."""
    from urllib.parse import urlparse  # local import keeps this block self-contained

    try:
        parsed = urlparse(url)
        host = (parsed.hostname or "").lower()
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. broken IPv6 literals).
        return False
    if parsed.scheme not in ("http", "https"):
        return False
    # Compare the parsed host, not the raw string: a substring test would
    # accept e.g. "http://evil.example/?youtube.com".
    return any(host == domain or host.endswith("." + domain) for domain in _YOUTUBE_HOSTS)


def _scrape_transcript(video_url: str) -> list[str]:
    """Open *video_url* in headless Chrome and return its non-empty transcript lines.

    This is blocking Selenium code and must not be called on the event loop.

    Raises:
        TimeoutException: an expected page element did not appear in time.
    """
    driver = init_driver()
    try:
        logger.info("Processing URL: %s", video_url)
        driver.get(video_url)

        # Handle cookie consent if it appears; its absence is not an error.
        try:
            cookie_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), 'Accept all')]"))
            )
            cookie_button.click()
            logger.info("Accepted cookies")
        except TimeoutException:
            pass

        # Click more button (clicked via JavaScript rather than a native click).
        more_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "expand"))
        )
        driver.execute_script("arguments[0].click();", more_button)

        # Click transcript button
        transcript_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label='Show transcript']"))
        )
        driver.execute_script("arguments[0].click();", transcript_button)

        # Wait for transcript
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.ID, "segments-container"))
        )

        # Extract transcript: one lookup per segment. find_elements returns an
        # empty list instead of raising, so a segment without a text child is
        # skipped rather than failing the whole request.
        lines = []
        for segment in driver.find_elements(By.CSS_SELECTOR, "div.ytd-transcript-segment-renderer"):
            text_nodes = segment.find_elements(By.CLASS_NAME, "segment-text")
            if not text_nodes:
                continue
            text = text_nodes[0].text.strip()
            if text:
                lines.append(text)
        return lines
    finally:
        # Always release the browser process, on success and on failure.
        driver.quit()


@app.post("/transcript", response_model=TranscriptResponse)
async def get_transcript(request: VideoRequest):
    """Scrape the transcript of a YouTube video and return it line by line.

    Args:
        request: body carrying the video page URL.

    Returns:
        TranscriptResponse with ``success=True`` and the transcript lines, or
        ``success=False`` and an error message when scraping timed out or
        failed unexpectedly.

    Raises:
        HTTPException: 400 when the URL is not an http(s) YouTube URL;
            404 when the page yielded no transcript text.
    """
    import asyncio  # local import keeps this block self-contained

    # Monotonic clock: elapsed time is unaffected by wall-clock adjustments.
    started = time.monotonic()

    video_url = request.url
    # Validation lives outside the try block so the HTTPException reaches the
    # client as a real 400 instead of being folded into a 200 error payload.
    if not _is_youtube_url(video_url):
        raise HTTPException(status_code=400, detail="Invalid YouTube URL")

    try:
        # Selenium is blocking; run it in a worker thread so the event loop
        # (and the health check) stays responsive during the scrape.
        transcript = await asyncio.to_thread(_scrape_transcript, video_url)
    except TimeoutException as e:
        logger.error("Timeout: %s", e)
        return TranscriptResponse(
            success=False,
            transcript=None,
            error="Timed out waiting for page elements",
            processing_time=time.monotonic() - started,
        )
    except Exception as e:
        # Top-level boundary: log with traceback, report the failure in-band.
        logger.exception("Error: %s", e)
        return TranscriptResponse(
            success=False,
            transcript=None,
            error=str(e),
            processing_time=time.monotonic() - started,
        )

    if not transcript:
        raise HTTPException(status_code=404, detail="No transcript available")

    return TranscriptResponse(
        success=True,
        transcript=transcript,
        error=None,
        processing_time=time.monotonic() - started,
    )
|
| 127 |
+
|
| 128 |
+
@app.get("/")
async def health_check():
    """Report that the service is up; returns a fixed status payload."""
    payload = {"status": "OK", "message": "YouTube Transcript API is running"}
    return payload
|
| 131 |
+
|
| 132 |
+
if __name__ == "__main__":
    # Direct execution: serve the app with uvicorn on all interfaces.
    import uvicorn

    # PORT environment variable overrides the default of 7860.
    listen_port = int(os.getenv("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=listen_port)
|