Jayashree Sridhar committed on
Commit 0f20f6a · 1 Parent(s): d39c478

converted numpy array

Files changed (1)
  1. agents/tools/voice_tools.py +108 -25
agents/tools/voice_tools.py CHANGED
@@ -12,32 +12,114 @@ from crewai.tools import BaseTool



- class MultilingualVoiceProcessor:
+ # class MultilingualVoiceProcessor:

+ #     def __init__(self, model_name="openai/whisper-base", device=None):
+ #         cache_dir = os.getenv("TRANSFORMERS_CACHE", None)
+ #         if device is None:
+ #             device = 0 if torch.cuda.is_available() else -1
+
+ #         # Load model and processor with cache_dir
+ #         processor = AutoProcessor.from_pretrained(model_name, cache_dir=cache_dir)
+ #         model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name, cache_dir=cache_dir)
+
+ #         # Create the pipeline, DO NOT PASS cache_dir here
+ #         # self.pipe = pipeline(
+ #         #     "automatic-speech-recognition",
+ #         #     model=model,
+ #         #     tokenizer=processor,
+ #         #     feature_extractor=processor,
+ #         #     device=device,
+ #         #     generate_kwargs={"task": "transcribe", "return_timestamps": False},
+ #         # )
+ #         self.pipe = pipeline(
+ #             "automatic-speech-recognition",
+ #             model=model_name,
+ #             device=device,
+ #             generate_kwargs={"task": "transcribe", "return_timestamps": False},
+ #         )
+
+ #     async def transcribe(self, audio_data: np.ndarray, language: str = None):
+ #         with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmp_wav:
+ #             sf.write(tmp_wav.name, audio_data, samplerate=16000)
+ #             extra = {"language": language} if language else {}
+ #             result = self.pipe(tmp_wav.name, **extra)
+ #             text = result['text']
+ #             return text, language or "unknown"
+
+ #     async def synthesize(self, text, language: str = "en", voice_type: str = "normal"):
+ #         raise NotImplementedError("Use gTTS or edge-tts as before.")
+ # class TranscribeAudioTool(BaseTool):
+ #     name: str = "transcribe_audio"
+ #     description: str = "Transcribe audio to text and detect language."
+ #     model_config = {"arbitrary_types_allowed": True}
+ #     #_vp: MultilingualVoiceProcessor = PrivateAttr()
+ #     def __init__(self, config=None):
+ #         super().__init__()
+ #         self.vp = MultilingualVoiceProcessor()
+ #     def _run(self, audio_data: np.ndarray, language=None):
+ #         text, detected_lang = asyncio.run(self.vp.transcribe(audio_data, language))
+ #         return {"text": text, "language": detected_lang}
+
+ # class DetectEmotionTool(BaseTool):
+ #     name: str = "detect_emotion"
+ #     description: str = "Detect the emotional state from text."
+ #     model_config = {"arbitrary_types_allowed": True}
+ #     def __init__(self, config=None):
+ #         super().__init__()
+ #     def _run(self, text: str):
+ #         model = TinyGPT2Model()
+ #         prompt = f'Analyse emotions in: "{text}". Format: JSON with primary_emotion, intensity, feelings, concerns.'
+ #         response = model.generate(prompt)
+ #         return {"primary_emotion": "detected_emotion",
+ #                 "intensity": "medium",
+ #                 "feelings": ["feeling1"],
+ #                 "concerns": ["concern1"]}
+
+ # class GenerateReflectiveQuestionsTool(BaseTool):
+ #     name: str = "generate_reflective_questions"
+ #     description: str = "Generate reflective questions."
+ #     model_config = {"arbitrary_types_allowed": True}
+ #     def __init__(self, config=None):
+ #         super().__init__()
+ #     def _run(self, context: dict):
+ #         emotion = context.get("primary_emotion", "neutral")
+ #         questions_map = {
+ #             "anxiety": ["What triggers your anxiety?", "How do you cope?"],
+ #             "sadness": ["What helps when you feel sad?", "Who can you talk to?"]
+ #         }
+ #         return questions_map.get(emotion, [
+ #             "How are you feeling?",
+ #             "What feels important now?"
+ #         ])
+
+ # class VoiceTools:
+ #     def __init__(self, config=None):
+ #         self.transcribe_audio = TranscribeAudioTool(config)
+ #         self.detect_emotion = DetectEmotionTool(config)
+ #         self.generate_reflective_questions = GenerateReflectiveQuestionsTool(config)
+ import numpy as np
+ import asyncio
+ from typing import List, Optional
+ from models.tinygpt2_model import TinyGPT2Model
+ from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq
+ import os
+ import tempfile
+ import soundfile as sf
+ import torch
+ from crewai.tools import BaseTool
+
+ class MultilingualVoiceProcessor:
    def __init__(self, model_name="openai/whisper-base", device=None):
        cache_dir = os.getenv("TRANSFORMERS_CACHE", None)
        if device is None:
            device = 0 if torch.cuda.is_available() else -1
-
-         # Load model and processor with cache_dir
-         processor = AutoProcessor.from_pretrained(model_name, cache_dir=cache_dir)
-         model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name, cache_dir=cache_dir)
-
-         # Create the pipeline, DO NOT PASS cache_dir here
-         # self.pipe = pipeline(
-         #     "automatic-speech-recognition",
-         #     model=model,
-         #     tokenizer=processor,
-         #     feature_extractor=processor,
-         #     device=device,
-         #     generate_kwargs={"task": "transcribe", "return_timestamps": False},
-         # )
        self.pipe = pipeline(
-             "automatic-speech-recognition",
-             model=model_name,
-             device=device,
-             generate_kwargs={"task": "transcribe", "return_timestamps": False},
-         )
+             "automatic-speech-recognition",
+             model=model_name,
+             device=device,
+             generate_kwargs={"task": "transcribe", "return_timestamps": False},
+         )

    async def transcribe(self, audio_data: np.ndarray, language: str = None):
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmp_wav:
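The hunk above replaces the explicit AutoProcessor/AutoModelForSpeechSeq2Seq loading with a pipeline built directly from the model name, and re-adds the imports and class definitions below the now commented-out old module. A minimal smoke test of the refactored processor might look like the sketch below; the one-second buffer of silence and the 16 kHz rate are illustrative assumptions, not part of the commit.

import asyncio
import numpy as np

# Hypothetical smoke test (not in this commit): run a 1-second, 16 kHz mono
# float32 buffer through the Whisper ASR pipeline wrapper defined above.
vp = MultilingualVoiceProcessor(model_name="openai/whisper-base")
silence = np.zeros(16000, dtype=np.float32)
text, lang = asyncio.run(vp.transcribe(silence))
print(text, lang)  # lang is "unknown" when no language hint is supplied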
@@ -49,16 +131,17 @@ class MultilingualVoiceProcessor:

    async def synthesize(self, text, language: str = "en", voice_type: str = "normal"):
        raise NotImplementedError("Use gTTS or edge-tts as before.")
+
class TranscribeAudioTool(BaseTool):
    name: str = "transcribe_audio"
    description: str = "Transcribe audio to text and detect language."
    model_config = {"arbitrary_types_allowed": True}
-     #_vp: MultilingualVoiceProcessor = PrivateAttr()
    def __init__(self, config=None):
        super().__init__()
        self.vp = MultilingualVoiceProcessor()
-     def _run(self, audio_data: np.ndarray, language=None):
-         text, detected_lang = asyncio.run(self.vp.transcribe(audio_data, language))
+     def _run(self, audio_data: List[float], language: Optional[str] = None):
+         audio_np = np.array(audio_data, dtype=np.float32)
+         text, detected_lang = asyncio.run(self.vp.transcribe(audio_np, language))
        return {"text": text, "language": detected_lang}

class DetectEmotionTool(BaseTool):
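This hunk is what the commit message refers to: presumably because CrewAI tool arguments arrive as JSON-serializable values, _run now accepts a plain List[float] and converts it to a float32 NumPy array before handing it to the processor. The conversion in isolation, with made-up sample values:

import numpy as np

# The crux of the commit (sample values are invented for illustration):
audio_list = [0.0, 0.02, -0.01, 0.03]               # JSON-friendly floats from the tool call
audio_np = np.array(audio_list, dtype=np.float32)   # what _run now does internally
assert audio_np.dtype == np.float32
# _run then forwards audio_np to MultilingualVoiceProcessor.transcribe().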
@@ -67,7 +150,7 @@ class DetectEmotionTool(BaseTool):
    model_config = {"arbitrary_types_allowed": True}
    def __init__(self, config=None):
        super().__init__()
-     def _run(self, text: str):
+     def _run(self, text: str):
        model = TinyGPT2Model()
        prompt = f'Analyse emotions in: "{text}". Format: JSON with primary_emotion, intensity, feelings, concerns.'
        response = model.generate(prompt)
@@ -82,7 +165,7 @@ class GenerateReflectiveQuestionsTool(BaseTool):
    model_config = {"arbitrary_types_allowed": True}
    def __init__(self, config=None):
        super().__init__()
-     def _run(self, context: dict):
+     def _run(self, context: dict):
        emotion = context.get("primary_emotion", "neutral")
        questions_map = {
            "anxiety": ["What triggers your anxiety?", "How do you cope?"],
 