wasmdashai committed
Commit 39576d6 · verified · 1 Parent(s): b485fef

Update app.py

Files changed (1): app.py (+38 −99)

app.py CHANGED
```diff
@@ -1,79 +1,24 @@
-from logging import error
-import gradio as gr
-import spaces
-import torch
 from transformers import AutoTokenizer, VitsModel
-import os
 import numpy as np
 import noisereduce as nr
-import torch.nn as nn
-from typing import Optional, Iterator

 # Read the access token from the Space Secrets
-token = os.getenv("acees-token")  # make sure you used the same name in Settings → Repository secrets

-# Dictionary for caching the loaded models
 models = {}

-# Pick the device (CUDA if available, otherwise CPU)
 device = "cuda" if torch.cuda.is_available() else "cpu"

-
 # Noise-reduction helper
 def remove_noise_nr(audio_data, sr=16000):
     return nr.reduce_noise(y=audio_data, hop_length=256, sr=sr)

-
-# Inference function (streaming / non-streaming)
-def _inference_forward_stream(
-    self,
-    input_ids: Optional[torch.Tensor] = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    speaker_embeddings: Optional[torch.Tensor] = None,
-    chunk_size: int = 32,
-    is_streaming: bool = True
-) -> Iterator[torch.Tensor]:
-
-    padding_mask = attention_mask.unsqueeze(-1).float() if attention_mask is not None else torch.ones_like(input_ids).unsqueeze(-1).float()
-    text_encoder_output = self.text_encoder(input_ids=input_ids, padding_mask=padding_mask, attention_mask=attention_mask)
-    hidden_states = text_encoder_output[0].transpose(1, 2)
-    input_padding_mask = padding_mask.transpose(1, 2)
-
-    log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings)
-    length_scale = 1.0 / self.speaking_rate
-    duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale)
-    predicted_lengths = torch.clamp_min(torch.sum(duration, [1,2]), 1).long()
-
-    indices = torch.arange(predicted_lengths.max(), device=predicted_lengths.device)
-    output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
-    output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)
-
-    attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1)
-    batch_size, _, output_length, input_length = attn_mask.shape
-    cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1)
-    indices = torch.arange(output_length, device=duration.device)
-    valid_indices = indices.unsqueeze(0) < cum_duration
-    valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length)
-    padded_indices = valid_indices - nn.functional.pad(valid_indices, [0,0,1,0,0,0])[:, :-1]
-    attn = padded_indices.unsqueeze(1).transpose(2,3) * attn_mask
-
-    prior_means = text_encoder_output[1]
-    prior_log_variances = text_encoder_output[2]
-    prior_latents = prior_means + torch.randn_like(prior_means) * torch.exp(prior_log_variances) * self.noise_scale
-    latents = self.flow(prior_latents, output_padding_mask, speaker_embeddings, reverse=True)
-    spectrogram = latents * output_padding_mask
-
-    if is_streaming:
-        for i in range(0, spectrogram.size(-1), chunk_size):
-            with torch.no_grad():
-                wav = self.decoder(spectrogram[:,:,i:i+chunk_size], speaker_embeddings)
-                yield wav.squeeze().cpu().numpy()
-    else:
-        with torch.no_grad():
-            wav = self.decoder(spectrogram, speaker_embeddings)
-            yield wav.squeeze().cpu().numpy()
-
-
 # Load the model + tokenizer with the access token
 def get_model(name_model):
     global models
@@ -90,41 +35,35 @@ def get_model(name_model):
     tokenizer = AutoTokenizer.from_pretrained(name_model, token=token)
     return models[name_model], tokenizer

-# Default text
-TXT = "السلام عليكم ورحمة الله وبركاته يا هلا وسهلا ومراحب بالغالي"  # Arabic greeting used as the default input
-
-
-# Text-to-speech function
-def modelspeech(text=TXT, name_model="wasmdashai/vits-ar-sa-huba-v2", speaking_rate=16000):
-    model, tokenizer = get_model(name_model)
-    inputs = tokenizer(text, return_tensors="pt").to(device)  # runs on CPU or GPU, whichever is available
-    model.speaking_rate = speaking_rate
-    with torch.no_grad():
-        outputs = model(**inputs)
-    waveform = outputs.waveform[0].cpu().numpy()
-    return model.config.sampling_rate, remove_noise_nr(waveform)
-
-
-# Gradio interface
-model_choices = gr.Dropdown(
-    choices=[
-        "wasmdashai/vits-ar-sa-huba-v1",
-        "wasmdashai/vits-ar-sa-huba-v2",
-        "wasmdashai/vits-ar-sa-A",
-        "wasmdashai/vits-ar-ye-sa",
-        "wasmdashai/vits-ar-sa-M-v1",
-        "wasmdashai/vits-en-v1"
-    ],
-    label="اختر النموذج",  # "Choose the model"
-    value="wasmdashai/vits-ar-sa-huba-v2"
-)
-
-demo = gr.Interface(
-    fn=modelspeech,
-    inputs=["text", model_choices, gr.Slider(0.1, 1, step=0.1, value=0.8)],
-    outputs=["audio"]
-)
-
-demo.queue()
-demo.launch(server_name="0.0.0.0", server_port=7860)
```
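A note on the removed code: `_inference_forward_stream` takes `self` and calls `self.text_encoder`, `self.flow`, and `self.decoder`, yet the old file never shows it being attached to a model. Below is a minimal sketch of how it could have been wired up, assuming `types.MethodType` binding; the binding mechanism and the sample text are not part of this diff.

```python
# Hypothetical wiring for the removed streaming helper (not shown in the commit):
# bind the module-level generator as a method on a loaded VitsModel instance.
import types

model, tokenizer = get_model("wasmdashai/vits-ar-sa-huba-v2")
model._inference_forward_stream = types.MethodType(_inference_forward_stream, model)

inputs = tokenizer("يا هلا وسهلا", return_tensors="pt").to(device)
with torch.no_grad():
    # Each yielded chunk is a numpy waveform segment decoded from up to
    # `chunk_size` spectrogram frames, suitable for incremental playback.
    for chunk in model._inference_forward_stream(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        chunk_size=32,
        is_streaming=True,
    ):
        print(chunk.shape)
```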
 
```diff
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
 from transformers import AutoTokenizer, VitsModel
+import torch
 import numpy as np
+import os
 import noisereduce as nr

 # Read the access token from the Space Secrets
+token = os.getenv("acees-token")

+# Cache of loaded models
 models = {}

+# Pick the device
 device = "cuda" if torch.cuda.is_available() else "cpu"

 # Noise-reduction helper
 def remove_noise_nr(audio_data, sr=16000):
     return nr.reduce_noise(y=audio_data, hop_length=256, sr=sr)

 # Load the model + tokenizer with the access token
 def get_model(name_model):
     global models
@@ -90,41 +35,35 @@ def get_model(name_model):
     tokenizer = AutoTokenizer.from_pretrained(name_model, token=token)
     return models[name_model], tokenizer

+# Request body for the POST endpoint
+class TTSRequest(BaseModel):
+    text: str
+    name_model: str = "wasmdashai/vits-ar-sa-huba-v2"
+    speaking_rate: int = 16000
+
+# Create the application
+app = FastAPI(title="VITS TTS API")
+
+# Health-check route
+@app.get("/")
+def home():
+    return {"message": "FastAPI VITS TTS service is running"}
+
+# Text-to-speech route
+@app.post("/predict/")
+def modelspeech(req: TTSRequest):
+    try:
+        model, tokenizer = get_model(req.name_model)
+        inputs = tokenizer(req.text, return_tensors="pt").to(device)
+        model.speaking_rate = req.speaking_rate
+        with torch.no_grad():
+            outputs = model(**inputs)
+        waveform = outputs.waveform[0].cpu().numpy()
+        audio = remove_noise_nr(waveform)
+        return {
+            "sampling_rate": model.config.sampling_rate,
+            "audio": audio.tolist()  # convert the numpy array to a plain list
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
```
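The rewritten file only defines `app`; no launch call survives the diff, so the Space presumably starts the server externally, e.g. `uvicorn app:app --host 0.0.0.0 --port 7860` (an assumption — the start command is not part of this commit, though 7860 matches the port the old Gradio app bound). A minimal client sketch against the new `/predict/` endpoint, which returns the waveform as a JSON list alongside its sampling rate:

```python
# Hypothetical client for the new endpoint (not part of the commit).
# Assumes the service listens on localhost:7860; soundfile is just one way
# to write the returned samples to disk.
import numpy as np
import requests
import soundfile as sf

resp = requests.post(
    "http://localhost:7860/predict/",
    json={"text": "السلام عليكم", "name_model": "wasmdashai/vits-ar-sa-huba-v2"},
    timeout=300,  # the first call can be slow while the model downloads
)
resp.raise_for_status()
payload = resp.json()

# The endpoint serializes the waveform with .tolist(); rebuild the array.
audio = np.asarray(payload["audio"], dtype=np.float32)
sf.write("output.wav", audio, payload["sampling_rate"])
```

Shipping raw samples as JSON keeps the endpoint simple but inflates payloads; returning WAV bytes via a `StreamingResponse` would be a natural follow-up.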
69