Spaces:

GeorgiosIoannouCoder
/

cuny-tech-prep-tutorial-1

Running

App Files Files Community

GeorgiosIoannouCoder committed on Feb 8, 2024

Commit

29bab07

verified ·

1 Parent(s): 75f74a2

Create app.py

Browse files

Files changed (1) hide show

app.py +191 -0

app.py ADDED Viewed

	@@ -0,0 +1,191 @@

+#############################################################################################################################
+# Filename   : app.py
+# Description: A Streamlit application to turn an image to audio story.
+# Author     : Georgios Ioannou
+#
+# Copyright © 2024 by Georgios Ioannou
+#############################################################################################################################
+# Import libraries.
+import os  # Load environment variable(s).
+import requests  # Send HTTP POST request to Hugging Face models for inference.
+import streamlit as st  # Build the GUI of the application.
+from dotenv import find_dotenv, load_dotenv  # Load environment variables.
+from langchain.chat_models import ChatOpenAI  # Access to OpenAI gpt-3.5-turbo model.
+from langchain.chains import LLMChain  # Chain to run queries against LLMs.
+# A prompt template. It accepts a set of parameters from the user that can be used to generate a prompt for a language model.
+from langchain.prompts import PromptTemplate
+from transformers import pipeline  # Access to Hugging Face models.
+#############################################################################################################################
+# Load environment variable(s).
+load_dotenv(find_dotenv())
+HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
+#############################################################################################################################
+# Function to apply local CSS.
+def local_css(file_name):
+    with open(file_name) as f:
+        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
+#############################################################################################################################
+# Return the text generated by the model for the image.
+# Using pipeline.
+def img_to_text(image_path):
+    # https://huggingface.co/tasks
+    # Task used here : "image-to-text".
+    # Model used here: "Salesforce/blip-image-captioning-base".
+    # Backup model: "nlpconnect/vit-gpt2-image-captioning".
+    image_to_text = pipeline(
+        "image-to-text", model="Salesforce/blip-image-captioning-base"
+    )
+    # image_to_text = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
+    scenario = image_to_text(image_path)[0]["generated_text"]
+    return scenario
+#############################################################################################################################
+# Return the story generated by the model for the scenario.
+# Using Langchain.
+def generate_story(scenario, personality):
+    # Model used here: "gpt-3.5-turbo".
+    # The template can be customized to meet one's needs such as:
+    # Generate a story and generate lyrics of a song.
+    template = """
+    You are a story teller.
+    You must sound like {personality}.
+    The story should be less than 50 words.
+    Generate a story based on the above constraints and the following scenario: {scenario}.
+    """
+    prompt = PromptTemplate(
+        template=template, input_variables=["scenario", "personality"]
+    )
+    story_llm = LLMChain(
+        llm=ChatOpenAI(
+            model_name="gpt-3.5-turbo", temperature=0
+        ),  # Increasing the temperature, the model becomes more creative and takes longer for inference.
+        prompt=prompt,
+        verbose=True,  # Print intermediate values to the console.
+    )
+    story = story_llm.predict(
+        scenario=scenario, personality=personality
+    )  # Format prompt with kwargs and pass to LLM.
+    return story
+#############################################################################################################################
+# Return the speech generated by the model for the story.
+# Using inference api.
+def text_to_speech(story):
+    # Model used here: "espnet/kan-bayashi_ljspeech_vits.
+    # Backup model: "facebook/mms-tts-eng".
+    API_URL = (
+        "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
+    )
+    # API_URL = "https://api-inference.huggingface.co/models/facebook/mms-tts-eng"
+    headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
+    payload = {"inputs": story}
+    response = requests.post(API_URL, headers=headers, json=payload)
+    with open("audio.flac", "wb") as file:
+        file.write(response.content)
+#############################################################################################################################
+# Main function to create the Streamlit web application.
+def main():
+    try:
+        # Page title and favicon.
+        st.set_page_config(page_title="Image To Audio Story", page_icon="🖼️")
+        # Load CSS.
+        local_css("styles/style.css")
+        # Title.
+        title = f"""<h1 align="center" style="font-family: monospace; font-size: 2.1rem; margin-top: -6rem">
+                    Turn Image to Audio Story</h1>"""
+        st.markdown(title, unsafe_allow_html=True)
+        # Define the personalities for the dropdown menu.
+        personalities = [
+            "Donald Trump",
+            "Abraham Lincoln",
+            "Aristotle",
+            "Cardi B",
+            "Kanye West",
+        ]
+        personality = st.selectbox("Select a personality:", personalities)
+        # Upload an image.
+        uploaded_file = st.file_uploader("Choose an image:")
+        if uploaded_file is not None:
+            # Display the uploaded image.
+            bytes_data = uploaded_file.getvalue()
+            with open(uploaded_file.name, "wb") as file:
+                file.write(bytes_data)
+            st.image(uploaded_file, caption="Uploaded Image.", use_column_width=True)
+            with st.spinner(text="Model Inference..."): # Spinner to keep the application interactive.
+                # Model inference.
+                scenario = img_to_text(uploaded_file.name)
+                story = generate_story(scenario=scenario, personality=personality)
+                text_to_speech(story)
+                # Display the scenario and story.
+                with st.expander("Scenario"):
+                    st.write(scenario)
+                with st.expander("Story"):
+                    st.write(story)
+            # Display the audio.
+            st.audio("audio.flac")
+    except Exception as e:
+        # Display any errors.
+        st.error(e)
+#############################################################################################################################
+# Build the application only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()