hamza2923 committed on
Commit
566327c
·
verified ·
1 Parent(s): 1b250e1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +134 -0
app.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from pydantic import BaseModel
4
+ from selenium import webdriver
5
+ from selenium.webdriver.common.by import By
6
+ from selenium.webdriver.chrome.options import Options
7
+ from selenium.webdriver.support.ui import WebDriverWait
8
+ from selenium.webdriver.support import expected_conditions as EC
9
+ from selenium.common.exceptions import TimeoutException
10
+ import time
11
+ import logging
12
+ import os
13
+
14
# Application instance that the route decorators in this file register against.
app = FastAPI()

# Configure CORS
# The API is open to every origin. Credentials are disabled because a wildcard
# origin must not be combined with credentialed requests: the endpoints in this
# file use no cookies or auth headers, and allowing credentials together with
# "*" would let any site issue credentialed cross-origin calls.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
28
+
29
class VideoRequest(BaseModel):
    """Request body for the transcript endpoint."""

    # Address of the video page whose transcript should be fetched.
    url: str
31
+
32
class TranscriptResponse(BaseModel):
    """Response body returned by the transcript endpoint."""

    # True when a transcript was extracted, False when an error was reported.
    success: bool
    # Transcript lines in page order; None when extraction failed.
    transcript: list[str] | None
    # Human-readable failure description; None on success.
    error: str | None
    # Wall-clock time spent handling the request (difference of time.time() values).
    processing_time: float
37
+
38
def init_driver(binary_location=None):
    """Create a headless Chrome WebDriver.

    Args:
        binary_location: Optional path to the Chrome executable. When omitted,
            the ``CHROME_BINARY`` environment variable is used if set,
            otherwise ``/usr/bin/google-chrome`` (the path used for
            Hugging Face Spaces).

    Returns:
        A ``selenium.webdriver.Chrome`` instance. The caller is responsible
        for calling ``quit()`` on it.
    """
    options = Options()
    # Command-line switches passed to Chrome; the sandbox and /dev/shm switches
    # are presumably needed for the container environment -- confirm if the
    # deployment target changes.
    for switch in (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--log-level=3",
    ):
        options.add_argument(switch)
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)

    # For Hugging Face Spaces (default); overridable so the same code can run
    # where Chrome is installed elsewhere.
    options.binary_location = (
        binary_location
        or os.getenv("CHROME_BINARY")
        or "/usr/bin/google-chrome"
    )
    return webdriver.Chrome(options=options)
51
+
52
def _is_youtube_url(url):
    """Return True when *url* is an http(s) URL whose host is a YouTube domain."""
    from urllib.parse import urlparse

    try:
        parsed = urlparse(url)
        host = (parsed.hostname or "").lower()
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. broken IPv6 literals).
        return False
    if parsed.scheme not in ("http", "https"):
        return False
    # Compare the parsed host rather than searching the raw string, so that
    # "youtube.com" appearing in a path or query string does not pass.
    return host in ("youtu.be", "youtube.com") or host.endswith(".youtube.com")


def _scrape_transcript(video_url):
    """Load *video_url* in headless Chrome and return its non-empty transcript lines.

    This function blocks while the browser works and is meant to be run in a
    worker thread. The driver is always shut down before returning.

    Raises:
        TimeoutException: when an expected page element does not appear in time.
    """
    driver = init_driver()
    try:
        logger.info("Processing URL: %s", video_url)
        driver.get(video_url)

        # Handle cookie consent if it appears
        try:
            cookie_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//*[contains(text(), 'Accept all')]")
                )
            )
            cookie_button.click()
            logger.info("Accepted cookies")
        except TimeoutException:
            # Best effort: no consent dialog showed up within 5 seconds.
            pass

        # Click more button
        more_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "expand"))
        )
        driver.execute_script("arguments[0].click();", more_button)

        # Click transcript button
        transcript_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, "button[aria-label='Show transcript']")
            )
        )
        driver.execute_script("arguments[0].click();", transcript_button)

        # Wait for transcript
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.ID, "segments-container"))
        )

        # Extract transcript; read each segment's text once, since every
        # lookup is a separate round trip to the browser.
        segments = driver.find_elements(
            By.CSS_SELECTOR, "div.ytd-transcript-segment-renderer"
        )
        lines = []
        for segment in segments:
            text = segment.find_element(By.CLASS_NAME, "segment-text").text.strip()
            if text:
                lines.append(text)
        return lines
    finally:
        driver.quit()


@app.post("/transcript", response_model=TranscriptResponse)
async def get_transcript(request: VideoRequest):
    """Fetch the transcript of a YouTube video by driving a headless browser.

    Args:
        request: Body carrying the video ``url``.

    Returns:
        A ``TranscriptResponse``. On success it holds the transcript lines; on
        a scraping failure ``success`` is False and ``error`` describes the
        problem. ``processing_time`` is the elapsed ``time.time()`` difference.

    Raises:
        HTTPException: 400 when the URL is not an http(s) YouTube URL, 404 when
            the page yields no transcript text. These are raised outside the
            broad exception handler so the status codes reach the client.
    """
    import asyncio

    start_time = time.time()
    video_url = request.url
    if not _is_youtube_url(video_url):
        raise HTTPException(status_code=400, detail="Invalid YouTube URL")

    try:
        # Selenium is blocking; run it in a worker thread so the event loop
        # keeps serving other requests in the meantime.
        transcript = await asyncio.to_thread(_scrape_transcript, video_url)
    except TimeoutException as e:
        logger.error("Timeout: %s", e)
        return TranscriptResponse(
            success=False,
            transcript=None,
            error="Timed out waiting for page elements",
            processing_time=time.time() - start_time,
        )
    except Exception as e:
        # Top-level boundary: report the failure to the client and keep the
        # traceback in the log.
        logger.exception("Error: %s", e)
        return TranscriptResponse(
            success=False,
            transcript=None,
            error=str(e),
            processing_time=time.time() - start_time,
        )

    if not transcript:
        raise HTTPException(status_code=404, detail="No transcript available")

    return TranscriptResponse(
        success=True,
        transcript=transcript,
        error=None,
        processing_time=time.time() - start_time,
    )
127
+
128
@app.get("/")
async def health_check():
    """Report that the service is up."""
    status_payload = {
        "status": "OK",
        "message": "YouTube Transcript API is running",
    }
    return status_payload
131
+
132
if __name__ == "__main__":
    import uvicorn

    # Listen on all interfaces; the port comes from the PORT environment
    # variable and falls back to 7860 when it is unset.
    listen_port = int(os.getenv("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=listen_port)