# Import Libraries import os import gc import torch import numpy as np import uuid import pycountry import yt_dlp import gradio as gr from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig from huggingface_hub import login from pydub import AudioSegment from faster_whisper import WhisperModel # Setup YouTube Cookies from Environment def setup_cookies(): """Write cookies from environment variable to cookies.txt file""" cookies_content = os.getenv('YOUTUBE_COOKIES') if cookies_content: with open('cookies.txt', 'w') as f: f.write(cookies_content) print("✅ Cookies loaded successfully") return True else: print("⚠️ No cookies found in environment - YouTube downloads may fail") return False # Call cookie setup when app starts setup_cookies() # Hugging Face Login Setup hf_token = os.getenv('HF_TOKEN') if hf_token: login(hf_token, add_to_git_credential=True) # Model names LLAMA = "meta-llama/Llama-3.2-3B-Instruct" QWEN = "Qwen/Qwen3-4B-Instruct-2507" PHI = "microsoft/Phi-4-mini-instruct" DEEPSEEK = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" Gemma = 'google/gemma-3-4b-it' # YouTube Download Function def _download_if_youtube(source): if "youtube.com" in source or "youtu.be" in source: unique = str(uuid.uuid4())[:8] filename = f"audio_{unique}.%(ext)s" ydl_opts = { "format": "bestaudio/best", "outtmpl": filename, "quiet": True, "extractor_args": {"youtube": {"player_client": ["default"]}}, "cookiefile": "cookies.txt", } with yt_dlp.YoutubeDL(ydl_opts) as ydl: info = ydl.extract_info(source, download=True) return ydl.prepare_filename(info) else: return source # Convert to WAV def _to_wav(path): unique = str(uuid.uuid4())[:8] wav_path = f"audio_{unique}.wav" AudioSegment.from_file(path).export(wav_path, format="wav") return wav_path # Transcription Function def transcription_whisper(source): torch.cuda.empty_cache() gc.collect() device = "cuda" if torch.cuda.is_available() else "cpu" compute = "float16" if device == "cuda" else "int8" model = WhisperModel('medium', device=device, compute_type=compute) file_path = _download_if_youtube(source) wav_path = _to_wav(file_path) segments, info = model.transcribe(wav_path) result = [] formatted_output = "**TRANSCRIPTION**\n" + "="*50 + "\n\n" for seg in segments: result.append({ "start": seg.start, "end": seg.end, "text": seg.text.strip() }) formatted_output += f"[{seg.start:.2f}s - {seg.end:.2f}s]\n{seg.text.strip()}\n\n" del model gc.collect() torch.cuda.empty_cache() return formatted_output, result # Prompts system_prompt = """ You are an expert assistant that generates clear, concise, and well-structured Minutes of Meeting (MOM) documents from raw meeting transcripts. Your output must be in clean Markdown format (without code blocks) and must include: - **Meeting Summary:** A brief overview of the meeting context, agenda, and participants (if mentioned). - **Key Discussion Points:** Major topics, decisions, or debates. - **Takeaways:** Important insights, conclusions, and agreements. - **Action Items:** Actionable tasks with responsible owners and deadlines (e.g., "John will prepare the project plan by Monday"). Guidelines: - Write in professional, easy-to-read language. - Summarize meaningfully; avoid filler words or irrelevant content. - Omit transcription artifacts (e.g., "um", "okay", "yeah"). - Do not include timestamps. - Maintain a formal and factual tone while being concise. - Focus entirely on clarity, structure, and readability. """ def user_prompt_for(source): formatted_output, segments = transcription_whisper(source) transcript_text = " ".join(seg["text"] for seg in segments) user_prompt = f""" Please write well-structured **Minutes of Meeting (MOM)** in Markdown format (without code blocks), including: - **Summary:** Include attendees, location, and date if mentioned. - **Key Discussion Points:** List the main topics or discussions. - **Takeaways:** Summaries of conclusions or insights. - **Action Items:** Tasks with clear owners and deadlines. Transcription: {transcript_text} """ return user_prompt def messages_for(source): messages = [ {'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': user_prompt_for(source)} ] return messages # Quantization Config quant_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4' ) # Generate MOM / Summarization def generate(model_name, source): messages = messages_for(source) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) tokenizer.pad_token = tokenizer.eos_token inputs = tokenizer.apply_chat_template(messages, return_tensors='pt', add_generation_prompt=True).to('cuda') model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto', quantization_config=quant_config) streamer = TextStreamer(tokenizer) outputs = model.generate(inputs, streamer=streamer, max_new_tokens=5000) result = tokenizer.decode(outputs[0], skip_special_tokens=True) mom_output = result if '<|start_header_id|>assistant<|end_header_id|>' in mom_output: mom_output = mom_output.split('<|start_header_id|>assistant<|end_header_id|>')[-1] elif 'assistant' in mom_output: parts = mom_output.split('assistant') if len(parts) > 1: mom_output = parts[-1] mom_output = mom_output.replace('<|eot_id|>', '').replace('<|end_header_id|>', '').strip() if '**Minutes of Meeting' in mom_output: mom_output = mom_output.split('**Minutes of Meeting')[1] mom_output = '**Minutes of Meeting' + mom_output elif '**MINUTES' in mom_output: mom_output = mom_output.split('**MINUTES')[1] mom_output = '**MINUTES' + mom_output del model, inputs, tokenizer, outputs gc.collect() torch.cuda.empty_cache() yield mom_output.strip() # Translation Functions : Valid Language or Not def valid_language(lang): return bool( pycountry.languages.get(name=lang.capitalize()) or pycountry.languages.get(alpha_2=lang.lower()) or pycountry.languages.get(alpha_3=lang.lower()) ) # Translate Prompts system_prompt_translate = "You are a translation assistant. Given a target language and some content, translate the content accurately into that language, preserving meaning, tone, and style, and return only the translated text. Also maintain proper format." def user_prompt_translate(source, lang): if not valid_language(lang): return f"Invalid language: {lang}. Please provide a valid language name or code." transcript_text, _ = transcription_whisper(source) lines = transcript_text.split('\n') text_lines = [] for line in lines: if line.startswith('**') or line.startswith('=') or line.startswith('[') or not line.strip(): continue text_lines.append(line.strip()) transcript_text = " ".join(text_lines) max_chars = 3000 if len(transcript_text) > max_chars: transcript_text = transcript_text[:max_chars] + "..." user_prompt = f"""Translate the following text into {lang}. Instructions: - Provide ONLY the translation in {lang} - Do NOT add any explanations or comments - Preserve the original meaning and tone - Keep formatting simple and clean Text to translate: {transcript_text} {lang} translation:""" return user_prompt def messages_for_translate(source, lang): messages = [ {'role': 'system', 'content': system_prompt_translate}, {'role': 'user', 'content': user_prompt_translate(source, lang)} ] return messages def translate_transcribe(model_name, source, lang): messages = messages_for_translate(source, lang) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) tokenizer.pad_token = tokenizer.eos_token inputs = tokenizer.apply_chat_template(messages, return_tensors='pt', add_generation_prompt=True).to('cuda') model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto', quantization_config=quant_config) streamer = TextStreamer(tokenizer) outputs = model.generate(inputs, streamer=streamer, max_new_tokens=5000) result = tokenizer.decode(outputs[0], skip_special_tokens=True) translate_output = result if '<|start_header_id|>assistant<|end_header_id|>' in translate_output: translate_output = translate_output.split('<|start_header_id|>assistant<|end_header_id|>')[-1] elif 'assistant' in translate_output: parts = translate_output.split('assistant') if len(parts) > 1: translate_output = parts[-1] translate_output = translate_output.replace('<|eot_id|>', '').replace('<|end_header_id|>', '').strip() if 'translation:' in translate_output.lower(): translate_output = translate_output.split('translation:')[-1].strip() if "Here's an edited version:" in translate_output: translate_output = translate_output.split("Here's an edited version:")[0].strip() translate_output = translate_output.replace('assistant', '').strip() # Format into paragraphs sentences = translate_output.split('. ') paragraphs = [] current_para = [] sentence_count = 0 for sentence in sentences: current_para.append(sentence.strip()) sentence_count += 1 if sentence_count >= 4: paragraphs.append('. '.join(current_para) + '.') current_para = [] sentence_count = 0 if current_para: paragraphs.append('. '.join(current_para) + ('.' if not current_para[-1].endswith('.') else '')) formatted_output = '\n\n'.join(paragraphs) del model, inputs, tokenizer, outputs gc.collect() torch.cuda.empty_cache() yield formatted_output def translate_transcribe_gemma(Gemma, source, lang): messages = [{'role': 'user', 'content': user_prompt_translate(source, lang)}] tokenizer = AutoTokenizer.from_pretrained(Gemma, trust_remote_code=True) tokenizer.pad_token = tokenizer.eos_token inputs = tokenizer.apply_chat_template(messages, return_tensors='pt', add_generation_prompt=True).to('cuda') model = AutoModelForCausalLM.from_pretrained(Gemma, device_map='auto', quantization_config=quant_config) streamer = TextStreamer(tokenizer) outputs = model.generate(inputs, streamer=streamer, max_new_tokens=5000) result = tokenizer.decode(outputs[0], skip_special_tokens=True) translate_output = result if '<|start_header_id|>assistant<|end_header_id|>' in translate_output: translate_output = translate_output.split('<|start_header_id|>assistant<|end_header_id|>')[-1] elif 'assistant' in translate_output: parts = translate_output.split('assistant') if len(parts) > 1: translate_output = parts[-1] translate_output = translate_output.replace('<|eot_id|>', '').replace('<|end_header_id|>', '').strip() if 'translation:' in translate_output.lower(): translate_output = translate_output.split('translation:')[-1].strip() if "Here's an edited version:" in translate_output: translate_output = translate_output.split("Here's an edited version:")[0].strip() translate_output = translate_output.replace('assistant', '').strip() del model, inputs, tokenizer, outputs gc.collect() torch.cuda.empty_cache() yield translate_output # Optimization Functions for MOM def optimize(model_name, source): if model_name == 'LLAMA': result = generate(LLAMA, source) elif model_name == 'PHI': result = generate(PHI, source) elif model_name == 'QWEN': result = generate(QWEN, source) elif model_name == 'DEEPSEEK': result = generate(DEEPSEEK, source) for chunk in result: yield chunk # Optimization Functions for Translation def optimize_translate(model_name, source, lang): if model_name == 'LLAMA': translate = translate_transcribe(LLAMA, source, lang) elif model_name == 'PHI': translate = translate_transcribe(PHI, source, lang) elif model_name == 'QWEN': translate = translate_transcribe(QWEN, source, lang) elif model_name == 'DEEPSEEK': translate = translate_transcribe(DEEPSEEK, source, lang) elif model_name == 'Gemma': translate = translate_transcribe_gemma(Gemma, source, lang) for chunk_tr in translate: yield chunk_tr # Helper Functon for Gradio UI def get_source_input(file, link): if file is not None: return file.name if hasattr(file, 'name') else file return link if link else "" # CSS Styling css = """ #file-box { min-height: 500px !important; } #file-box button { height: 100% !important; width: 100% !important; display: flex !important; flex-direction: column !important; align-items: center !important; justify-content: center !important; margin: 0 !important; padding: 0 !important; } #box { min-height: 550px !important; } """ # Gradio Interface with gr.Blocks(css=css) as ui: gr.Markdown("## Transcription & MOM Generator & Translator") gr.Markdown(""" ### 📌 Note: YouTube Link Support Due to YouTube's bot protection, only **direct file uploads** are guaranteed to work. YouTube links may fail without authentication cookies. **Workaround:** Upload your audio/video file directly for best results. """) with gr.Row(): with gr.Column(scale=2): input_file = gr.File(label="Upload Audio/Video", file_types=["audio", "video"], elem_id="file-box") input_link = gr.Textbox(label="YouTube Link (optional)", lines=2) with gr.Column(scale=2): output_transcription = gr.Textbox(label="Transcription", lines=25, elem_id='box') transcribe = gr.Button("Transcribe", variant="primary", scale=2) with gr.Column(scale=2): output_summary = gr.Textbox(label="MOM Output", lines=25, elem_id='box') summarize = gr.Button("Summarize", variant="secondary", scale=2) with gr.Column(scale=2): output_translate = gr.Textbox(label='Translation Output', lines=20) language_input = gr.Textbox(label="Target Language", value="English", lines=1) translate = gr.Button('Translate', scale=2) with gr.Row(): model = gr.Dropdown( ["LLAMA", "PHI", "QWEN", "DEEPSEEK", 'Gemma'], label="Choose Your Model", value="LLAMA" ) # Wrapper functions to handle generators properly def summarize_wrapper(model, file, link): source = get_source_input(file, link) for result in optimize(model, source): yield result def translate_wrapper(model, file, link, lang): source = get_source_input(file, link) for result in optimize_translate(model, source, lang): yield result # Event handlers with file or link support transcribe.click( fn=lambda file, link: transcription_whisper(get_source_input(file, link))[0], inputs=[input_file, input_link], outputs=[output_transcription] ) summarize.click( fn=summarize_wrapper, inputs=[model, input_file, input_link], outputs=[output_summary] ) translate.click( fn=translate_wrapper, inputs=[model, input_file, input_link, language_input], outputs=[output_translate] ) # Launch the app if __name__ == "__main__": ui.launch(server_name="0.0.0.0", server_port=7860)