Jayashree Sridhar committed
Commit · 0f20f6a · 1 Parent(s): d39c478
converted numpy array

agents/tools/voice_tools.py +108 -25
agents/tools/voice_tools.py
CHANGED
@@ -12,32 +12,114 @@ from crewai.tools import BaseTool
 
 
 
-class MultilingualVoiceProcessor:
+# class MultilingualVoiceProcessor:
 
+#     def __init__(self, model_name="openai/whisper-base", device=None):
+#         cache_dir = os.getenv("TRANSFORMERS_CACHE", None)
+#         if device is None:
+#             device = 0 if torch.cuda.is_available() else -1
+
+#         # Load model and processor with cache_dir
+#         processor = AutoProcessor.from_pretrained(model_name, cache_dir=cache_dir)
+#         model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name, cache_dir=cache_dir)
+
+#         # Create the pipeline, DO NOT PASS cache_dir here
+#         # self.pipe = pipeline(
+#         #     "automatic-speech-recognition",
+#         #     model=model,
+#         #     tokenizer=processor,
+#         #     feature_extractor=processor,
+#         #     device=device,
+#         #     generate_kwargs={"task": "transcribe", "return_timestamps": False},
+#         # )
+#         self.pipe = pipeline(
+#             "automatic-speech-recognition",
+#             model=model_name,
+#             device=device,
+#             generate_kwargs={"task": "transcribe", "return_timestamps": False},
+#         )
+
+#     async def transcribe(self, audio_data: np.ndarray, language: str = None):
+#         with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmp_wav:
+#             sf.write(tmp_wav.name, audio_data, samplerate=16000)
+#             extra = {"language": language} if language else {}
+#             result = self.pipe(tmp_wav.name, **extra)
+#             text = result['text']
+#             return text, language or "unknown"
+
+#     async def synthesize(self, text, language: str = "en", voice_type: str = "normal"):
+#         raise NotImplementedError("Use gTTS or edge-tts as before.")
+# class TranscribeAudioTool(BaseTool):
+#     name: str = "transcribe_audio"
+#     description: str = "Transcribe audio to text and detect language."
+#     model_config = {"arbitrary_types_allowed": True}
+#     #_vp: MultilingualVoiceProcessor = PrivateAttr()
+#     def __init__(self, config=None):
+#         super().__init__()
+#         self.vp = MultilingualVoiceProcessor()
+#     def _run(self, audio_data: np.ndarray, language=None):
+#         text, detected_lang = asyncio.run(self.vp.transcribe(audio_data, language))
+#         return {"text": text, "language": detected_lang}
+
+# class DetectEmotionTool(BaseTool):
+#     name: str = "detect_emotion"
+#     description: str = "Detect the emotional state from text."
+#     model_config = {"arbitrary_types_allowed": True}
+#     def __init__(self, config=None):
+#         super().__init__()
+#     def _run(self, text: str):
+#         model = TinyGPT2Model()
+#         prompt = f'Analyse emotions in: "{text}". Format: JSON with primary_emotion, intensity, feelings, concerns.'
+#         response = model.generate(prompt)
+#         return {"primary_emotion": "detected_emotion",
+#                 "intensity": "medium",
+#                 "feelings": ["feeling1"],
+#                 "concerns": ["concern1"]}
+
+# class GenerateReflectiveQuestionsTool(BaseTool):
+#     name: str = "generate_reflective_questions"
+#     description: str = "Generate reflective questions."
+#     model_config = {"arbitrary_types_allowed": True}
+#     def __init__(self, config=None):
+#         super().__init__()
+#     def _run(self, context: dict):
+#         emotion = context.get("primary_emotion", "neutral")
+#         questions_map = {
+#             "anxiety": ["What triggers your anxiety?", "How do you cope?"],
+#             "sadness": ["What helps when you feel sad?", "Who can you talk to?"]
+#         }
+#         return questions_map.get(emotion, [
+#             "How are you feeling?",
+#             "What feels important now?"
+#         ])
+
+# class VoiceTools:
+#     def __init__(self, config=None):
+#         self.transcribe_audio = TranscribeAudioTool(config)
+#         self.detect_emotion = DetectEmotionTool(config)
+#         self.generate_reflective_questions = GenerateReflectiveQuestionsTool(config)
+import numpy as np
+import asyncio
+from typing import List, Optional
+from models.tinygpt2_model import TinyGPT2Model
+from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq
+import os
+import tempfile
+import soundfile as sf
+import torch
+from crewai.tools import BaseTool
+
+class MultilingualVoiceProcessor:
     def __init__(self, model_name="openai/whisper-base", device=None):
         cache_dir = os.getenv("TRANSFORMERS_CACHE", None)
         if device is None:
             device = 0 if torch.cuda.is_available() else -1
-
-        # Load model and processor with cache_dir
-        processor = AutoProcessor.from_pretrained(model_name, cache_dir=cache_dir)
-        model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name, cache_dir=cache_dir)
-
-        # Create the pipeline, DO NOT PASS cache_dir here
-        # self.pipe = pipeline(
-        #     "automatic-speech-recognition",
-        #     model=model,
-        #     tokenizer=processor,
-        #     feature_extractor=processor,
-        #     device=device,
-        #     generate_kwargs={"task": "transcribe", "return_timestamps": False},
-        # )
         self.pipe = pipeline(
-            "automatic-speech-recognition",
-            model=model_name,
-            device=device,
-            generate_kwargs={"task": "transcribe", "return_timestamps": False},
-        )
+            "automatic-speech-recognition",
+            model=model_name,
+            device=device,
+            generate_kwargs={"task": "transcribe", "return_timestamps": False},
+        )
 
     async def transcribe(self, audio_data: np.ndarray, language: str = None):
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmp_wav:
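A note on the hunk above: cache_dir is an argument of from_pretrained, not of pipeline() (hence the "DO NOT PASS cache_dir here" comment), so the retained code simply hands pipeline() the checkpoint name and lets it do the loading; the TRANSFORMERS_CACHE environment variable the module reads still controls where weights are cached. If the commented-out component-based variant were ever revived, the processor's tokenizer and feature extractor should be passed separately rather than the whole processor object for both. A rough sketch of that variant, assuming the same whisper-base checkpoint (this is not part of the commit):

import os
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

model_name = "openai/whisper-base"
cache_dir = os.getenv("TRANSFORMERS_CACHE", None)

# Explicit loading is what honours cache_dir; pipeline() itself does not accept it.
processor = AutoProcessor.from_pretrained(model_name, cache_dir=cache_dir)
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name, cache_dir=cache_dir)

# Pass the tokenizer and feature extractor individually -- handing the whole
# processor to both arguments, as the commented-out code did, is not what
# pipeline() expects.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=0 if torch.cuda.is_available() else -1,
    generate_kwargs={"task": "transcribe", "return_timestamps": False},
)

Keeping the single model=model_name form, as the commit does, is the simpler choice when no custom loading is needed.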
@@ -49,16 +131,17 @@
 
     async def synthesize(self, text, language: str = "en", voice_type: str = "normal"):
         raise NotImplementedError("Use gTTS or edge-tts as before.")
+
 class TranscribeAudioTool(BaseTool):
     name: str = "transcribe_audio"
     description: str = "Transcribe audio to text and detect language."
     model_config = {"arbitrary_types_allowed": True}
-    #_vp: MultilingualVoiceProcessor = PrivateAttr()
     def __init__(self, config=None):
         super().__init__()
         self.vp = MultilingualVoiceProcessor()
-    def _run(self, audio_data: np.ndarray, language=None):
-        text, detected_lang = asyncio.run(self.vp.transcribe(audio_data, language))
+    def _run(self, audio_data: List[float], language: Optional[str] = None):
+        audio_np = np.array(audio_data, dtype=np.float32)
+        text, detected_lang = asyncio.run(self.vp.transcribe(audio_np, language))
         return {"text": text, "language": detected_lang}
 
 class DetectEmotionTool(BaseTool):
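This hunk is the substance of the commit message: _run now accepts a plain List[float] and builds a float32 ndarray itself before calling transcribe, so the tool can be invoked with an ordinary JSON-serializable list instead of an np.ndarray. A minimal sketch of the new calling convention; the sample values are made up, and calling _run directly is only for illustration (CrewAI normally invokes the tool through its own run mechanism):

from agents.tools.voice_tools import TranscribeAudioTool

tool = TranscribeAudioTool()

# Audio now arrives as a plain list of float samples (e.g. 16 kHz mono),
# which survives JSON serialization, unlike an np.ndarray.
samples = [0.0, 0.012, -0.008, 0.031]  # illustrative values, not real speech

# Internally _run does: audio_np = np.array(audio_data, dtype=np.float32)
# and then asyncio.run(self.vp.transcribe(audio_np, language)).
result = tool._run(audio_data=samples, language="en")
print(result["text"], result["language"])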
@@ -67,7 +150,7 @@ class DetectEmotionTool(BaseTool):
     model_config = {"arbitrary_types_allowed": True}
     def __init__(self, config=None):
         super().__init__()
-    def _run(self, text: str):
+    def _run(self, text: str):
         model = TinyGPT2Model()
         prompt = f'Analyse emotions in: "{text}". Format: JSON with primary_emotion, intensity, feelings, concerns.'
         response = model.generate(prompt)

@@ -82,7 +165,7 @@ class GenerateReflectiveQuestionsTool(BaseTool):
     model_config = {"arbitrary_types_allowed": True}
     def __init__(self, config=None):
         super().__init__()
-    def _run(self, context: dict):
+    def _run(self, context: dict):
         emotion = context.get("primary_emotion", "neutral")
         questions_map = {
             "anxiety": ["What triggers your anxiety?", "How do you cope?"],