Jayashree Sridhar committed
Commit · 0f20f6a · 1 Parent(s): d39c478
converted numpy array

agents/tools/voice_tools.py +108 -25
agents/tools/voice_tools.py
CHANGED
@@ -12,32 +12,114 @@ from crewai.tools import BaseTool
 
 
 
-class MultilingualVoiceProcessor:
+# class MultilingualVoiceProcessor:
 
+#     def __init__(self, model_name="openai/whisper-base", device=None):
+#         cache_dir = os.getenv("TRANSFORMERS_CACHE", None)
+#         if device is None:
+#             device = 0 if torch.cuda.is_available() else -1
+
+#         # Load model and processor with cache_dir
+#         processor = AutoProcessor.from_pretrained(model_name, cache_dir=cache_dir)
+#         model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name, cache_dir=cache_dir)
+
+#         # Create the pipeline, DO NOT PASS cache_dir here
+#         # self.pipe = pipeline(
+#         #     "automatic-speech-recognition",
+#         #     model=model,
+#         #     tokenizer=processor,
+#         #     feature_extractor=processor,
+#         #     device=device,
+#         #     generate_kwargs={"task": "transcribe", "return_timestamps": False},
+#         # )
+#         self.pipe = pipeline(
+#             "automatic-speech-recognition",
+#             model=model_name,
+#             device=device,
+#             generate_kwargs={"task": "transcribe", "return_timestamps": False},
+#         )
+
+#     async def transcribe(self, audio_data: np.ndarray, language: str = None):
+#         with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmp_wav:
+#             sf.write(tmp_wav.name, audio_data, samplerate=16000)
+#             extra = {"language": language} if language else {}
+#             result = self.pipe(tmp_wav.name, **extra)
+#             text = result['text']
+#             return text, language or "unknown"
+
+#     async def synthesize(self, text, language: str = "en", voice_type: str = "normal"):
+#         raise NotImplementedError("Use gTTS or edge-tts as before.")
+# class TranscribeAudioTool(BaseTool):
+#     name: str = "transcribe_audio"
+#     description: str = "Transcribe audio to text and detect language."
+#     model_config = {"arbitrary_types_allowed": True}
+#     #_vp: MultilingualVoiceProcessor = PrivateAttr()
+#     def __init__(self, config=None):
+#         super().__init__()
+#         self.vp = MultilingualVoiceProcessor()
+#     def _run(self, audio_data: np.ndarray, language=None):
+#         text, detected_lang = asyncio.run(self.vp.transcribe(audio_data, language))
+#         return {"text": text, "language": detected_lang}
+
+# class DetectEmotionTool(BaseTool):
+#     name: str = "detect_emotion"
+#     description: str = "Detect the emotional state from text."
+#     model_config = {"arbitrary_types_allowed": True}
+#     def __init__(self, config=None):
+#         super().__init__()
+#     def _run(self, text: str):
+#         model = TinyGPT2Model()
+#         prompt = f'Analyse emotions in: "{text}". Format: JSON with primary_emotion, intensity, feelings, concerns.'
+#         response = model.generate(prompt)
+#         return {"primary_emotion": "detected_emotion",
+#                 "intensity": "medium",
+#                 "feelings": ["feeling1"],
+#                 "concerns": ["concern1"]}
+
+# class GenerateReflectiveQuestionsTool(BaseTool):
+#     name: str = "generate_reflective_questions"
+#     description: str = "Generate reflective questions."
+#     model_config = {"arbitrary_types_allowed": True}
+#     def __init__(self, config=None):
+#         super().__init__()
+#     def _run(self, context: dict):
+#         emotion = context.get("primary_emotion", "neutral")
+#         questions_map = {
+#             "anxiety": ["What triggers your anxiety?", "How do you cope?"],
+#             "sadness": ["What helps when you feel sad?", "Who can you talk to?"]
+#         }
+#         return questions_map.get(emotion, [
+#             "How are you feeling?",
+#             "What feels important now?"
+#         ])
+
+# class VoiceTools:
+#     def __init__(self, config=None):
+#         self.transcribe_audio = TranscribeAudioTool(config)
+#         self.detect_emotion = DetectEmotionTool(config)
+#         self.generate_reflective_questions = GenerateReflectiveQuestionsTool(config)
+import numpy as np
+import asyncio
+from typing import List, Optional
+from models.tinygpt2_model import TinyGPT2Model
+from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq
+import os
+import tempfile
+import soundfile as sf
+import torch
+from crewai.tools import BaseTool
+
+class MultilingualVoiceProcessor:
     def __init__(self, model_name="openai/whisper-base", device=None):
         cache_dir = os.getenv("TRANSFORMERS_CACHE", None)
         if device is None:
             device = 0 if torch.cuda.is_available() else -1
-
-        # Load model and processor with cache_dir
-        processor = AutoProcessor.from_pretrained(model_name, cache_dir=cache_dir)
-        model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name, cache_dir=cache_dir)
-
-        # Create the pipeline, DO NOT PASS cache_dir here
-        # self.pipe = pipeline(
-        #     "automatic-speech-recognition",
-        #     model=model,
-        #     tokenizer=processor,
-        #     feature_extractor=processor,
-        #     device=device,
-        #     generate_kwargs={"task": "transcribe", "return_timestamps": False},
-        # )
         self.pipe = pipeline(
-            "automatic-speech-recognition",
-            model=model_name,
-            device=device,
-            generate_kwargs={"task": "transcribe", "return_timestamps": False},
-        )
+            "automatic-speech-recognition",
+            model=model_name,
+            device=device,
+            generate_kwargs={"task": "transcribe", "return_timestamps": False},
+        )
 
     async def transcribe(self, audio_data: np.ndarray, language: str = None):
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmp_wav:
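A note on the hunk above: cache_dir is an argument of from_pretrained, not of pipeline() (hence the "DO NOT PASS cache_dir here" comment), so the retained code simply hands pipeline() the checkpoint name and lets it do the loading; the TRANSFORMERS_CACHE environment variable the module reads still controls where weights are cached. If the commented-out component-based variant were ever revived, the processor's tokenizer and feature extractor should be passed separately rather than the whole processor object for both. A rough sketch of that variant, assuming the same whisper-base checkpoint (this is not part of the commit):

import os
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

model_name = "openai/whisper-base"
cache_dir = os.getenv("TRANSFORMERS_CACHE", None)

# Explicit loading is what honours cache_dir; pipeline() itself does not accept it.
processor = AutoProcessor.from_pretrained(model_name, cache_dir=cache_dir)
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name, cache_dir=cache_dir)

# Pass the tokenizer and feature extractor individually -- handing the whole
# processor to both arguments, as the commented-out code did, is not what
# pipeline() expects.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=0 if torch.cuda.is_available() else -1,
    generate_kwargs={"task": "transcribe", "return_timestamps": False},
)

Keeping the single model=model_name form, as the commit does, is the simpler choice when no custom loading is needed.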
@@ -49,16 +131,17 @@
 
     async def synthesize(self, text, language: str = "en", voice_type: str = "normal"):
         raise NotImplementedError("Use gTTS or edge-tts as before.")
+
 class TranscribeAudioTool(BaseTool):
     name: str = "transcribe_audio"
     description: str = "Transcribe audio to text and detect language."
     model_config = {"arbitrary_types_allowed": True}
-    #_vp: MultilingualVoiceProcessor = PrivateAttr()
     def __init__(self, config=None):
         super().__init__()
         self.vp = MultilingualVoiceProcessor()
-    def _run(self, audio_data: np.ndarray, language=None):
-        text, detected_lang = asyncio.run(self.vp.transcribe(audio_data, language))
+    def _run(self, audio_data: List[float], language: Optional[str] = None):
+        audio_np = np.array(audio_data, dtype=np.float32)
+        text, detected_lang = asyncio.run(self.vp.transcribe(audio_np, language))
         return {"text": text, "language": detected_lang}
 
 class DetectEmotionTool(BaseTool):
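This hunk is the substance of the commit message: _run now accepts a plain List[float] and builds a float32 ndarray itself before calling transcribe, so the tool can be invoked with an ordinary JSON-serializable list instead of an np.ndarray. A minimal sketch of the new calling convention; the sample values are made up, and calling _run directly is only for illustration (CrewAI normally invokes the tool through its own run mechanism):

from agents.tools.voice_tools import TranscribeAudioTool

tool = TranscribeAudioTool()

# Audio now arrives as a plain list of float samples (e.g. 16 kHz mono),
# which survives JSON serialization, unlike an np.ndarray.
samples = [0.0, 0.012, -0.008, 0.031]  # illustrative values, not real speech

# Internally _run does: audio_np = np.array(audio_data, dtype=np.float32)
# and then asyncio.run(self.vp.transcribe(audio_np, language)).
result = tool._run(audio_data=samples, language="en")
print(result["text"], result["language"])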
@@ -67,7 +150,7 @@ class DetectEmotionTool(BaseTool):
     model_config = {"arbitrary_types_allowed": True}
     def __init__(self, config=None):
         super().__init__()
-    def _run(self, text: str):
+    def _run(self, text: str):
         model = TinyGPT2Model()
         prompt = f'Analyse emotions in: "{text}". Format: JSON with primary_emotion, intensity, feelings, concerns.'
         response = model.generate(prompt)

@@ -82,7 +165,7 @@ class GenerateReflectiveQuestionsTool(BaseTool):
     model_config = {"arbitrary_types_allowed": True}
     def __init__(self, config=None):
         super().__init__()
-    def _run(self, context: dict):
+    def _run(self, context: dict):
         emotion = context.get("primary_emotion", "neutral")
         questions_map = {
             "anxiety": ["What triggers your anxiety?", "How do you cope?"],