# Spaces: ngeb25 / SpotChatbot — Running — File size: 5,692 Bytes

"""

Spot: The Spotify Chatbot
IAT360 Final Project

By Nathan Gebreab (301582871) & EmXi Vo (301600699)

Spot is a chatbot built on Meta's Llama-3.2-3B-Instruct model. It uses
RAG (Retrieval-Augmented Generation) to give the user song recommendations
based on their input prompt. Through RAG, Spot can search a dataset of
approximately 30,000 Spotify songs and their descriptive parameters in order to
find the best recommendations.

Links to Model (Authentication from Meta Required): 
https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct
https://www.llama.com/llama-downloads/

Link to Dataset (created by Joakim Arvidsson):
https://www.kaggle.com/datasets/joebeachcapital/30000-spotify-songs

"""

import torch
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import warnings
import gradio as gr
from huggingface_hub import InferenceClient


# Silence library warnings for the rest of the process.
warnings.filterwarnings('ignore')

# Hugging Face model identifier used for the generated intro message.
model_id = "meta-llama/Llama-3.2-3B-Instruct"


def _song_to_document(row):
    """Render one dataset row as the text block that is embedded and later shown to the user."""
    return f"""Song: {row['track_name']},
        Album: {row['track_album_name']},
        Album Release Date: {row['track_album_release_date']},
        Artist: {row['track_artist']}, 
        Playlist Genre: {row['playlist_genre']},
        Playlist Subgenre: {row['playlist_subgenre']},
        Danceability: {row['danceability']},
        Energy: {row['energy']},
        Key: {row['key']},
        Loudness: {row['loudness']},
        Mode: {row['mode']},
        Speechiness: {row['speechiness']},
        Acousticness: {row['acousticness']},
        Instrumentalness: {row['instrumentalness']},
        Liveness: {row['liveness']},
        Valence: {row['valence']},
        Tempo: {row['tempo']},
        Duration: {row['duration_ms']}
        """


# Read the Spotify dataset once at start-up, keeping a single row per
# (track name, artist) pair.
print("Loading Spotify songs database...")
spotify_df = pd.read_csv('spotify_songs.csv').drop_duplicates(
    subset=["track_name", "track_artist"]
)

# One text document per remaining song.
documents = spotify_df.apply(_song_to_document, axis=1).tolist()

# Embed every document up front so a query only needs one extra encode call.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedding_model.encode(documents, show_progress_bar=False)

# Lookup table pairing each document with its embedding vector.
_lookup_columns = {
    "Document": documents,
    "Embedding": list(embeddings),
}
df = pd.DataFrame(_lookup_columns)

print("Database loaded! Ready to chat.\n")

def retrieve_with_pandas(query, top_k=10):
    """Return the stored song documents most similar to ``query``.

    The query is embedded with the module-level ``embedding_model`` and scored
    against every vector in the module-level ``df`` using cosine similarity.

    Args:
        query: Free-text user request.
        top_k: Maximum number of rows to return.

    Returns:
        A DataFrame with ``Document`` and ``Similarity`` columns, ordered from
        most to least similar and limited to ``top_k`` rows.
    """
    if df.empty:
        return pd.DataFrame({"Document": [], "Similarity": []})

    query_embedding = np.asarray(embedding_model.encode([query])[0])

    # Stack the per-row vectors into one 2-D array so every similarity comes
    # from a single matrix product instead of a Python-level call per row.
    matrix = np.vstack(df["Embedding"].to_numpy())
    dots = matrix @ query_embedding
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_embedding)

    # A zero-length vector has no direction; score it 0 rather than dividing
    # by zero and producing NaN.
    scores = np.zeros(len(matrix), dtype=float)
    valid = norms > 0
    scores[valid] = dots[valid] / norms[valid]

    # Rank on a two-column copy: the shared frame stays read-only, so
    # overlapping requests cannot overwrite each other's scores.
    ranked = df[["Document"]].assign(Similarity=scores)
    return ranked.sort_values(by="Similarity", ascending=False).head(top_k)

# Text-generation pipeline shared by every request; built on first use.
_intro_llm = None


def _get_intro_llm():
    """Return the shared text-generation pipeline, creating it on the first call."""
    global _intro_llm
    if _intro_llm is None:
        _intro_llm = pipeline(
            "text-generation",
            model=model_id,
            dtype=torch.bfloat16,
            device_map="auto",
        )
    return _intro_llm


def generate_intro(query):
    """Generate a short, friendly lead-in message for the recommendation list.

    Args:
        query: The user's message.

    Returns:
        The assistant's reply text with surrounding whitespace removed.
    """
    # Each fragment ends with a space so adjacent sentences do not run
    # together when Python joins the literals.
    system_prompt = (
        "You are Spot, a friendly music recommendation chatbot. "
        "Respond to the user in 1–3 natural sentences. "
        "Do NOT list songs. Do NOT number anything. Do NOT name any songs. Do NOT name any artists. Do NOT name any musicians. Do NOT name any famous works. "
        "Just give a short, warm and friendly message that leads into the list of recommended songs"
    )

    # Passing role-tagged messages lets the pipeline apply the model's own
    # chat template instead of a hand-assembled prompt string.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query},
    ]

    # NOTE(review): temperature=2.0 is kept from the original; it is a very
    # high sampling temperature — confirm it is intentional.
    outputs = _get_intro_llm()(
        messages,
        max_new_tokens=60,
        do_sample=True,
        temperature=2.0,
    )

    # With chat input, "generated_text" is the full conversation; the last
    # entry is the newly generated assistant turn.
    return outputs[0]["generated_text"][-1]["content"].strip()

def num_requested_songs(query, default=3, max_songs=10):
    """Extract how many songs the user asked for.

    The first whitespace-separated word that is a whole number (after removing
    surrounding punctuation) is used.

    Args:
        query: The user's message.
        default: Count returned when the message contains no number.
        max_songs: Upper bound applied to the requested count.

    Returns:
        The requested count clamped to the range 1..``max_songs``, or
        ``default`` when no number is found.
    """
    for word in query.split():
        token = word.strip(".,;:!?()[]{}\"'")
        # isdecimal() only accepts characters int() can parse; isdigit() also
        # accepts e.g. "²", which makes int() raise ValueError.
        if not token.isdecimal():
            continue
        try:
            count = int(token)
        except ValueError:
            # Extremely long digit runs can exceed int()'s conversion limit.
            continue
        # A request for 0 would otherwise produce an empty recommendation list.
        return max(1, min(count, max_songs))
    return default

def generate_response(query, num_songs):
    """Build the chatbot reply: a generated intro followed by a numbered song list.

    Args:
        query: The user's message.
        num_songs: How many songs to retrieve and list.

    Returns:
        The reply text, with the intro, a heading line, and one numbered line
        per retrieved song.
    """
    intro = generate_intro(query)
    retrieved = retrieve_with_pandas(query, top_k=num_songs)

    # Collapse any newlines and indentation inside a document so every song
    # sits on a single numbered line.
    songs = [
        f"{position}. {' '.join(str(document).split())}"
        for position, document in enumerate(retrieved["Document"], start=1)
    ]

    # Assemble line by line so the reply carries no indentation from this
    # file's source layout (the old template indented only the first song).
    return "\n".join([intro, "", "Here are my recommendations:", "", *songs])

def respond(
    message,
    history: list[dict[str, str]],
):
    """Chat handler: answer one user message.

    Args:
        message: The text the user just sent.
        history: Previous conversation turns; not used by this handler.

    Returns:
        A farewell for quit-style messages, a prompt to type something for
        blank messages, otherwise the generated recommendation reply.
    """
    # Normalise once so padded input such as "quit " or " Exit" is recognised.
    text = message.strip()

    if not text:
        return "Please ask me something!"

    if text.lower() in {'quit', 'exit', 'bye', 'goodbye'}:
        return "Thanks for chatting!"

    return generate_response(text, num_requested_songs(text))
 

# Introductory text passed to the chat interface as its description.
_DESCRIPTION = """
    Hello! My name's Spot and I'm here to give song recommendations!
    
    You can request a specific song, or just let me know how you're feeling!
    
    *Type 'quit' or 'exit' to end the conversation.*
    """

# Example prompts passed to the chat interface.
_EXAMPLE_PROMPTS = [
    "Give me 8 upbeat songs",
    "Show me 5 chill songs for studying",
    "Recommend songs by Drake",
    "I want something energetic",
]

# Chat UI backed by the `respond` handler.
chatbot = gr.ChatInterface(
    fn=respond,
    title="Spot: The Spotify Chatbot",
    description=_DESCRIPTION,
    examples=_EXAMPLE_PROMPTS,
    theme="glass",
)

# Page layout: a sidebar holding the login button, then the chat interface.
with gr.Blocks() as demo:
    with gr.Sidebar():
        gr.LoginButton()
    chatbot.render()


# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()