Update main.py
main.py  CHANGED
@@ -1,29 +1,30 @@
-import
+from ultralyticsplus import YOLO
+from typing import Optional, Union
 
-from ultralytics import YOLO
-from base64 import b64encode
-from speech_recognition import AudioFile, Recognizer
-import numpy as np
 from scipy.spatial import distance as dist
-
-
+import time
 from fastapi import FastAPI, File, UploadFile, Form
 from fastapi.responses import StreamingResponse
 from fastapi.middleware.gzip import GZipMiddleware
-
-from utils import tts, read_image_file, pil_to_base64, base64_to_pil, get_hist
-from huggingface_hub import hf_hub_download
-
 from io import BytesIO
+from utils import tts, stt, read_image_file, pil_to_base64, base64_to_pil, get_hist, ffmpeg_read
 import zipfile
+import soundfile as sf
+import openai
 
-
-model = YOLO(
-
+# Config for camera picture
+model = YOLO('ultralyticsplus/yolov8s')
 CLASS = model.model.names
+ZIP = False
 default_bot_voice = "おはいようございます"
 area_threshold = 0.3
-
+
+# Config for human input
+prompt_template = "私はあなたに、Detomo社が作ったロボットのように振る舞ってほしいです。あなたの名前はアイサツです。"\
+    "あなたのミッションは、子供たちが他の子供たちに挨拶する自信を持ち、幸せになることを助けることです。"\
+    "質問には簡単な方法でしか答えないようにし、明示的に要求されない限り、追加情報を提供しないでください。"
+system_prompt = [{"role": "system", "content": prompt_template}]
+openai.api_key = os.environ["OPENAI_API_KEY"]
 
 app = FastAPI()
 app.add_middleware(GZipMiddleware, minimum_size=1000)
@@ -34,15 +35,18 @@ def read_root():
     return {"Message": "Application startup complete"}
 
 
-@app.
-
+@app.get("/client_settings/")
+def client_settings_api():
+    return {"camera_picture_period": 5}
+
+
+@app.post("/camera_picture/")
+async def camera_picture_api(
         file: UploadFile = File(...),
-
-        last_seen: Optional[str] = Form(None),
+        last_seen: Optional[Union[str, UploadFile]] = Form(None),
 ):
     # parameters
     total_time = time.time()
-    start_time = time.time()
     most_close = 0
     out_img = None
     diff_value = 0.5
@@ -63,27 +67,34 @@ async def predict_api(
         if area_rate >= most_close:
             out_img = image.crop(tuple(box)).resize((64, 64))
             most_close = area_rate
-
+
+    # check detect people or not
+    if out_img is None:
+        return {
+            "status": "No face detected",
+            "text": None,
+            "voice": None,
+            "image": None
+        }
+    else:
+        if ZIP:
+            image_bot_path = pil_to_base64(out_img, encode=False)
+        else:
+            image_bot_path = pil_to_base64(out_img, encode=True)
 
     # check with previous image if have
-    start_time = time.time()
     if last_seen is not None:
         if type(last_seen) == str:
             last_seen = base64_to_pil(last_seen)
         else:
             last_seen = read_image_file(await last_seen.read())
-
-
-
-
+        diff_value = dist.euclidean(get_hist(out_img), get_hist(last_seen))
+        print(f"Distance: {most_close}. Different value: {diff_value}")
+
     # return results
-    start_time = time.time()
-    print(f"Distance: {most_close}. Different value: {diff_value}")
     if most_close >= area_threshold and diff_value >= 0.5:
         if ZIP:
-            voice_bot_path = tts(default_bot_voice, language="ja")
-            image_bot_path = pil_to_base64(out_img)
-            print("Voice time", time.time() - start_time)
+            voice_bot_path = tts(default_bot_voice, language="ja", encode=False)
             io = BytesIO()
             zip_filename = "final_archive.zip"
             with zipfile.ZipFile(io, mode='w', compression=zipfile.ZIP_DEFLATED) as zf:
@@ -98,12 +109,43 @@ async def predict_api(
             )
         else:
             voice_bot_path = tts(default_bot_voice, language="ja", encode=True)
-            image_bot_path = pil_to_base64(out_img, encode=True)
-            print("Voice time", time.time() - start_time)
         print("Total time", time.time() - total_time)
         return {
+            "status": "New people",
+            "text": default_bot_voice,
            "voice": voice_bot_path,
            "image": image_bot_path
        }
    else:
-
+        print("Total time", time.time() - total_time)
+        return {
+            "status": "Old people",
+            "text": None,
+            "voice": None,
+            "image": image_bot_path,
+        }
+
+
+@app.post("/human_input/")
+async def human_input_api(
+        input_data: Union[str, bytes],
+        temperature: float = 0.7,
+        max_tokens: int = 1000,
+):
+    print("Input data type", type(input_data))
+    if type(input_data) != str:
+        upload_audio = ffmpeg_read(input_data, sampling_rate=24000)
+        sf.write('temp.wav', upload_audio, 24000, subtype='PCM_16')
+        text = stt('temp.wav')
+    else:
+        text = input_data
+    prompt_msg = {"role": "user", "content": text}
+    messages = system_prompt + [prompt_msg]
+    completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages, temperature=temperature,
+                                              max_tokens=max_tokens)
+    print(completion['usage']['total_tokens'])
+    return {
+        "human_text": str(text),
+        "robot_text": completion.choices[0].message.content,
+        "robot_voice": tts(completion.choices[0].message.content, language="ja", encode=True)
+    }
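
For reference, a minimal client sketch (not part of the commit) showing how the camera-side endpoints added here might be called. It assumes the app is served locally, e.g. with "uvicorn main:app --port 8000", and that the requests package is installed; the base URL, the frame.jpg path, and the form-encoding of last_seen are illustrative assumptions, not taken from the repo. The /human_input/ endpoint is left out because how FastAPI binds its bare input_data: Union[str, bytes] parameter (query string vs. request body) is not clear from the diff alone.

# Client-side sketch (illustrative only; assumes the API runs at http://localhost:8000).
import requests

BASE_URL = "http://localhost:8000"  # assumed local deployment

# 1. Read the polling period exposed by the new /client_settings/ endpoint.
settings = requests.get(f"{BASE_URL}/client_settings/").json()
print(settings)  # expected: {"camera_picture_period": 5}

# 2. Send one camera frame to /camera_picture/.
#    "frame.jpg" is a hypothetical local image file.
with open("frame.jpg", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/camera_picture/",
        files={"file": ("frame.jpg", f, "image/jpeg")},
    )
result = resp.json()
print(result["status"])  # "No face detected", "New people", or "Old people"

# 3. On a later frame, pass the previously returned crop back as last_seen
#    (a base64 string form field), so the server can compute diff_value.
if result.get("image"):
    with open("frame.jpg", "rb") as f:
        resp = requests.post(
            f"{BASE_URL}/camera_picture/",
            files={"file": ("frame.jpg", f, "image/jpeg")},
            data={"last_seen": result["image"]},
        )
    print(resp.json()["status"])

The second request mirrors the loop a camera client would run every camera_picture_period seconds, feeding each returned crop back so the endpoint can distinguish "New people" from "Old people".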
|