Initial commit of OpenClaw agent Chloe
This commit is contained in:
81
skills/youtube-watcher/scripts/get_transcript.py
Normal file
81
skills/youtube-watcher/scripts/get_transcript.py
Normal file
@@ -0,0 +1,81 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
def clean_vtt(content: str) -> str:
|
||||
"""
|
||||
Clean WebVTT content to plain text.
|
||||
Removes headers, timestamps, and duplicate lines.
|
||||
"""
|
||||
lines = content.splitlines()
|
||||
text_lines = []
|
||||
seen = set()
|
||||
|
||||
timestamp_pattern = re.compile(r'\d{2}:\d{2}:\d{2}\.\d{3}\s-->\s\d{2}:\d{2}:\d{2}\.\d{3}')
|
||||
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
if not line or line == 'WEBVTT' or line.isdigit():
|
||||
continue
|
||||
if timestamp_pattern.match(line):
|
||||
continue
|
||||
if line.startswith('NOTE') or line.startswith('STYLE'):
|
||||
continue
|
||||
|
||||
if text_lines and text_lines[-1] == line:
|
||||
continue
|
||||
|
||||
line = re.sub(r'<[^>]+>', '', line)
|
||||
|
||||
text_lines.append(line)
|
||||
|
||||
return '\n'.join(text_lines)
|
||||
|
||||
def get_transcript(url: str):
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
cmd = [
|
||||
"yt-dlp",
|
||||
"--write-subs",
|
||||
"--write-auto-subs",
|
||||
"--skip-download",
|
||||
"--sub-lang", "en",
|
||||
"--output", "subs",
|
||||
url
|
||||
]
|
||||
|
||||
try:
|
||||
subprocess.run(cmd, cwd=temp_dir, check=True, capture_output=True)
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"Error running yt-dlp: {e.stderr.decode()}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
except FileNotFoundError:
|
||||
print("Error: yt-dlp not found. Please install it.", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
temp_path = Path(temp_dir)
|
||||
vtt_files = list(temp_path.glob("*.vtt"))
|
||||
|
||||
if not vtt_files:
|
||||
print("No subtitles found.", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
vtt_file = vtt_files[0]
|
||||
|
||||
content = vtt_file.read_text(encoding='utf-8')
|
||||
clean_text = clean_vtt(content)
|
||||
print(clean_text)
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Fetch YouTube transcript.")
|
||||
parser.add_argument("url", help="YouTube video URL")
|
||||
args = parser.parse_args()
|
||||
|
||||
get_transcript(args.url)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user