# Snapshot of revision 8d9faa6 (2,226 bytes); file-viewer line-number gutter removed.
from langchain.tools import tool
try:
from youtube_transcript_api import YouTubeTranscriptApi
except Exception:
YouTubeTranscriptApi = None
import re
from urllib.parse import urlparse, parse_qs
def _extract_video_id(url_or_id: str) -> str | None:
s = (url_or_id or "").strip()
if re.fullmatch(r"[A-Za-z0-9_-]{11}", s):
return s
u = urlparse(s)
# youtu.be/<id>
if u.netloc.endswith("youtu.be"):
vid = u.path.strip("/").split("/")[0]
return vid if re.fullmatch(r"[A-Za-z0-9_-]{11}", vid) else None
# watch?v=<id>
qs = parse_qs(u.query or "")
if "v" in qs:
vid = qs["v"][0]
return vid if re.fullmatch(r"[A-Za-z0-9_-]{11}", vid) else None
# /embed/<id>, /shorts/<id>, /v/<id>
for pref in ("/embed/", "/shorts/", "/v/"):
if u.path.startswith(pref):
vid = u.path[len(pref):].split("/")[0]
return vid if re.fullmatch(r"[A-Za-z0-9_-]{11}", vid) else None
return None
@tool
def extract_youtube_transcript(url: str, chars: int = 1000) -> str:
    """
    Simple YouTube transcript fetcher.
    Input:
    - url: Regular YouTube URL (or the 11-char video_id).
    - chars: Return the first `chars` characters of the transcript.
    Output:
    - String with the transcript (trimmed to `chars`), or an error string:
    "yt_error:<reason>"
    """
    # NOTE(review): the docstring wording is kept as-is on purpose -- `@tool`
    # presumably exposes it as the tool description, so editing it would
    # change what the calling model sees. Confirm before rewording.
    #
    # Error strings produced here:
    #   "yt_error:missing_dependency"  - youtube_transcript_api not importable
    #   "yt_error:id_not_found"        - no video id could be extracted
    #   "yt_error:<ExcType>:<message>" - anything raised while fetching/joining
    if YouTubeTranscriptApi is None:
        return "yt_error:missing_dependency"

    # Keep the "always returns a string" contract even when the input cannot
    # be parsed at all (malformed URL, non-string argument).
    try:
        vid = _extract_video_id(url)
    except (ValueError, TypeError, AttributeError):
        vid = None
    if not vid:
        return "yt_error:id_not_found"

    try:
        # Validate `chars` before any network access; negative values clamp
        # to 0 and therefore yield an empty string.
        limit = max(0, int(chars))

        api = YouTubeTranscriptApi()
        fetch = getattr(api, "fetch", None)
        if fetch is not None:
            # New API returns a list of FetchedTranscriptSnippet objects
            snippets = fetch(vid)
        else:
            # NOTE(review): releases without `fetch` are expected to expose
            # `get_transcript`, returning a list of dicts -- confirm against
            # the pinned library version. If it is missing too, the
            # AttributeError is reported through the handler below.
            snippets = YouTubeTranscriptApi.get_transcript(vid)

        parts = []
        for snippet in snippets:
            # Support both object (new) and dict (old) shapes
            text = getattr(snippet, "text", None)
            if text is None and isinstance(snippet, dict):
                text = snippet.get("text")
            if not text:
                continue
            # Flatten embedded line breaks so the result is a single line.
            cleaned = text.replace("\n", " ").strip()
            if cleaned:
                parts.append(cleaned)

        return " ".join(parts)[:limit]
    except Exception as e:
        # Tool boundary: report the failure as data instead of raising.
        return f"yt_error:{type(e).__name__}:{e}"