import streamlit as st
from PIL import Image
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import soundfile as sf
import torch
# Initialize image captioning pipeline with pretrained model
# Model source: Hugging Face Model Hub
_image_caption_pipeline = pipeline(
    task="image-to-text",
    model="noamrot/FuseCap_Image_Captioning"
)
# Global model configuration constants
_MODEL_NAME = "Qwen/Qwen3-1.7B"
_THINKING_TOKEN_ID = 151668  # id of Qwen3's </think> token, which separates the thinking trace from the final content
# Initialize model components once
_tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
_model = AutoModelForCausalLM.from_pretrained(
    _MODEL_NAME,
    torch_dtype="auto",
    device_map="auto"
)

# Initialize TTS components once to avoid reloading
_SPEECH_PIPELINE = pipeline("text-to-speech", model="microsoft/speecht5_tts")
_EMBEDDINGS_DATASET = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
_DEFAULT_SPEAKER_EMBEDDING = torch.tensor(_EMBEDDINGS_DATASET[7306]["xvector"]).unsqueeze(0)
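# A hedged note: Streamlit re-runs this script from the top on every user
# interaction, so the module-level loads above can repeat. Assuming a Streamlit
# version that ships st.cache_resource (1.18+), each loader could be cached so
# the heavy objects are built once per process, e.g.:
#
#     @st.cache_resource
#     def _load_caption_pipeline():
#         return pipeline(task="image-to-text", model="noamrot/FuseCap_Image_Captioning")
#
#     _image_caption_pipeline = _load_caption_pipeline()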
def generate_image_caption(input_image):
    """
    Generate a textual description for an input image using a pretrained model.

    Args:
        input_image (Union[PIL.Image.Image, str]): Image to process. Can be either:
            - A PIL Image object
            - A string containing a filesystem path to an image file

    Returns:
        str: Generated caption text in natural language

    Example:
        >>> from PIL import Image
        >>> img = Image.open("photo.jpg")
        >>> caption = generate_image_caption(img)
        >>> print(f"Caption: {caption}")
    """
    # Process image through the captioning pipeline
    inference_results = _image_caption_pipeline(input_image)

    # Extract text from the first (and only) result dictionary
    caption_text = inference_results[0]['generated_text']

    return caption_text
def generate_story_content(system_prompt: str, user_prompt: str) -> str:
    """
    Generates a children's story based on provided system and user prompts.

    Args:
        system_prompt: Defines the assistant's role and writing constraints
        user_prompt: Describes the story scenario and specific elements to include

    Returns:
        Generated story text without any thinking process metadata

    Raises:
        RuntimeError: If text generation fails at any stage

    Example:
        >>> story = generate_story_content(
        ...     "You are a helpful children's author...",
        ...     "Kids playing with dogs in a sunny meadow..."
        ... )
    """
    try:
        # Prepare chat message structure
        conversation_history = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        # Format input using the model-specific template; enable_thinking=False
        # asks Qwen3 to answer directly instead of emitting a reasoning trace first
        formatted_input = _tokenizer.apply_chat_template(
            conversation_history,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False
        )

        # Tokenize and prepare model inputs
        model_inputs = _tokenizer(
            [formatted_input],
            return_tensors="pt"
        ).to(_model.device)

        # Generate text completion
        generated_sequences = _model.generate(
            **model_inputs,
            max_new_tokens=1000
        )

        # Process and clean output
        return _process_generated_output(
            generated_sequences,
            model_inputs.input_ids
        )

    except Exception as error:
        raise RuntimeError(f"Story generation failed: {str(error)}") from error
def _process_generated_output(generated_sequences: torch.Tensor, input_ids: torch.Tensor) -> str:
    """
    Processes raw model output to extract final content.

    Args:
        generated_sequences: Raw output sequences from model generation
        input_ids: Original input token IDs used for generation

    Returns:
        Cleaned final content text
    """
    # Extract new tokens, excluding the original prompt
    new_tokens = generated_sequences[0][len(input_ids[0]):].tolist()

    # Find separation point between thinking and final content
    separation_index = _find_thinking_separation(new_tokens)

    # Decode and clean final content
    return _tokenizer.decode(
        new_tokens[separation_index:],
        skip_special_tokens=True
    ).strip("\n")
def _find_thinking_separation(token_sequence: list) -> int:
    """
    Locates the boundary between thinking process and final content.

    Args:
        token_sequence: List of generated token IDs

    Returns:
        Index position marking the start of final content
    """
    try:
        # Search from the end for the </think> separation token
        reverse_position = token_sequence[::-1].index(_THINKING_TOKEN_ID)
        return len(token_sequence) - reverse_position
    except ValueError:
        return 0  # Token not found: treat the whole output as final content
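# Worked example of the boundary search above (ids other than 151668 are made
# up for illustration): with new_tokens = [11, 22, 151668, 33, 44], the call
# [44, 33, 151668, 22, 11].index(151668) returns 2, so the function returns
# 5 - 2 = 3, the index of 33, the first token after </think>; decoding then
# keeps only [33, 44].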
def generate_audio_from_story(story_text: str, output_path: str = "output.wav") -> str:
    """
    Convert text story to speech audio file using text-to-speech synthesis.

    Args:
        story_text: Input story text to synthesize
        output_path: Path to save generated audio (default: 'output.wav')

    Returns:
        Path to generated audio file

    Raises:
        ValueError: For empty/invalid input text
        RuntimeError: If audio generation fails

    Example:
        >>> generate_audio_from_story("Children playing in the park", "story_audio.wav")
        'story_audio.wav'
    """
    # Validate input text
    if not isinstance(story_text, str) or not story_text.strip():
        raise ValueError("Input story text must be a non-empty string")

    try:
        # Generate speech with default speaker profile
        speech_output = _SPEECH_PIPELINE(
            story_text,
            forward_params={"speaker_embeddings": _DEFAULT_SPEAKER_EMBEDDING}
        )

        # Save audio to WAV file
        sf.write(
            output_path,
            speech_output["audio"],
            samplerate=speech_output["sampling_rate"]
        )
        return output_path

    except Exception as error:
        raise RuntimeError(f"Audio synthesis failed: {str(error)}") from error
# App title
st.title("Best Story Teller")

# Intro text
st.write("Upload a picture and start your journey of creativity and imagination")

# File uploaders for image and audio
uploaded_image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
uploaded_audio = st.file_uploader("Upload an audio file", type=["mp3", "wav", "ogg"])  # currently unused

# Caption the image, write a story about it, then read the story aloud
if uploaded_image is not None:
    with st.spinner("Loading image..."):
        image = Image.open(uploaded_image)
        st.image(image, caption="Uploaded Image", use_column_width=True)
    with st.spinner("Captioning image..."):
        caption_from_file = generate_image_caption(image)
    with st.spinner("Adding some magic and imagination..."):
        system_prompt = (
            "You are a helpful children's story writer. You should directly generate "
            "a simple, educational and interesting story of no more than 150 words."
        )
        user_prompt = caption_from_file
        story = generate_story_content(system_prompt, user_prompt)
        st.write(story)
    with st.spinner("Finding the best voice actor..."):
        generated_audio = generate_audio_from_story(story, "childrens_story.wav")
        st.audio(generated_audio)
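# A possible extension (st.download_button is a standard Streamlit widget; the
# file name below is just an example): let users keep the generated narration.
#
#     with open(generated_audio, "rb") as wav_file:
#         st.download_button(
#             label="Download story audio",
#             data=wav_file,
#             file_name="childrens_story.wav",
#             mime="audio/wav",
#         )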