blackhole1218 committed
Commit 427a0a4 · Parent: 0ed13a6

fix: remove ConsumedSentence logic - allow same sentence to be used multiple times


- Remove consumed sentence tracking (same sentence can now be voted on multiple times)
- This allows more accurate ELO ratings, since each sentence can accumulate more votes (see the sketch below)
- Simplify cache and random sentence selection logic
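
As a rough illustration of the rating rationale, the sketch below applies a textbook Elo update once per vote. It is not the arena's actual rating code: the elo_update helper, the K-factor of 32, and the 1500 starting scores are assumptions made for this example. The point is only that once the one-vote-per-sentence restriction is gone, every additional comparison on a sentence feeds another rating update instead of being rejected, which narrows the uncertainty of each model's rating.

    # Minimal sketch of a standard Elo update; NOT the arena's rating code.
    # K = 32 and the 1500 starting ratings are illustrative assumptions.
    def elo_update(winner: float, loser: float, k: float = 32.0) -> tuple[float, float]:
        expected_win = 1.0 / (1.0 + 10 ** ((loser - winner) / 400))
        return winner + k * (1.0 - expected_win), loser - k * (1.0 - expected_win)

    model_a, model_b = 1500.0, 1500.0
    # Hypothetical votes; several may now come from the very same sentence,
    # and each one still moves the ratings.
    for outcome in ["a", "a", "b", "a"]:
        if outcome == "a":
            model_a, model_b = elo_update(model_a, model_b)
        else:
            model_b, model_a = elo_update(model_b, model_a)
    print(round(model_a), round(model_b))  # ratings diverge as votes accumulate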

Files changed (3)
  1. app.py +26 -59
  2. ko_prompts.json +3 -50
  3. models.py +0 -8
app.py CHANGED
@@ -64,10 +64,6 @@ from flask import (
 )
 from flask_login import LoginManager, current_user
 from models import *
-from models import (
-    hash_sentence, is_sentence_consumed, mark_sentence_consumed,
-    get_unconsumed_sentences, get_consumed_sentences_count, get_random_unconsumed_sentence
-)
 from auth import auth, init_oauth, is_admin
 from admin import admin
 from security import is_vote_allowed, check_user_security_score, detect_coordinated_voting
@@ -480,12 +476,11 @@ def _generate_cache_entry_task(sentence):
     # Select a new sentence if not provided (for replacement)
     with tts_cache_lock:
         cached_keys = set(tts_cache.keys())
-        # Get unconsumed sentences that are also not already cached
-        unconsumed_sentences = get_unconsumed_sentences(all_harvard_sentences)
-        available_sentences = [s for s in unconsumed_sentences if s not in cached_keys]
+        # Get sentences that are not already cached
+        available_sentences = [s for s in all_harvard_sentences if s not in cached_keys]
         if not available_sentences:
-            app.logger.warning("No more unconsumed sentences available for caching. All sentences have been consumed.")
-            return
+            # All sentences are cached, pick any random one
+            available_sentences = all_harvard_sentences
         sentence = random.choice(available_sentences)
 
     # app.logger.info removed duplicate log
@@ -524,8 +519,6 @@ def _generate_cache_entry_task(sentence):
             "audio_b": audio_b_path,
            "created_at": datetime.utcnow(),
         }
-        # Mark sentence as consumed for cache usage
-        mark_sentence_consumed(sentence, usage_type='cache')
         app.logger.info(f"Successfully cached entry for: '{sentence[:50]}...'")
     elif sentence in tts_cache:
         app.logger.warning(f"Sentence '{sentence[:50]}...' already re-cached. Discarding new generation.")
@@ -550,19 +543,18 @@ def _generate_cache_entry_task(sentence):
 
 
 def update_initial_sentences():
-    """Update initial sentences to only include unconsumed ones."""
+    """Update initial sentences for random selection."""
     global initial_sentences
     try:
-        unconsumed_for_initial = get_unconsumed_sentences(all_harvard_sentences)
-        if unconsumed_for_initial:
-            initial_sentences = random.sample(unconsumed_for_initial, min(len(unconsumed_for_initial), 500))
-            print(f"Updated initial sentences with {len(initial_sentences)} unconsumed sentences")
+        if all_harvard_sentences:
+            initial_sentences = random.sample(all_harvard_sentences, min(len(all_harvard_sentences), 500))
+            print(f"Updated initial sentences with {len(initial_sentences)} sentences")
         else:
-            print("Warning: No unconsumed sentences available for initial selection, disabling fallback")
-            initial_sentences = []  # No fallback to consumed sentences
+            print("Warning: No sentences available for initial selection")
+            initial_sentences = []
     except Exception as e:
-        print(f"Error updating initial sentences: {e}, disabling fallback for security")
-        initial_sentences = []  # No fallback to consumed sentences
+        print(f"Error updating initial sentences: {e}")
+        initial_sentences = []
 
 
 def initialize_tts_cache():
@@ -573,16 +565,11 @@ def initialize_tts_cache():
         app.logger.error("Harvard sentences not loaded. Cannot initialize cache.")
         return
 
-    # Update initial sentences with unconsumed ones
+    # Update initial sentences
     update_initial_sentences()
 
-    # Only use unconsumed sentences for initial cache population
-    unconsumed_sentences = get_unconsumed_sentences(all_harvard_sentences)
-    if not unconsumed_sentences:
-        app.logger.error("No unconsumed sentences available for cache initialization. Cache will remain empty.")
-        app.logger.warning("WARNING: All sentences from the dataset have been consumed. No new TTS generations will be possible.")
-        return
-    initial_selection = random.sample(unconsumed_sentences, min(len(unconsumed_sentences), TTS_CACHE_SIZE))
+    # Select random sentences for initial cache population
+    initial_selection = random.sample(all_harvard_sentences, min(len(all_harvard_sentences), TTS_CACHE_SIZE))
     app.logger.info(f"Initializing TTS cache with {len(initial_selection)} sentences...")
 
     for sentence in initial_selection:
@@ -613,14 +600,6 @@ def generate_tts():
     # Check if text contains Korean (at least 30% Korean characters)
     if not is_korean_text(text):
        return jsonify({"error": "한국어 텍스트를 입력해주세요. 최소 30% 이상의 한국어가 포함되어야 합니다."}), 400
-
-    # Check if sentence has already been consumed
-    if is_sentence_consumed(text):
-        remaining_count = len(get_unconsumed_sentences(all_harvard_sentences))
-        if remaining_count == 0:
-            return jsonify({"error": "This sentence has already been used and no unconsumed sentences remain. All sentences from the dataset have been consumed."}), 400
-        else:
-            return jsonify({"error": f"This sentence has already been used. Please select a different sentence. {remaining_count} sentences remain available."}), 400
 
     # --- Cache Check ---
     cache_hit = False
@@ -1196,47 +1175,35 @@ def toggle_leaderboard_visibility():
 
 @app.route("/api/tts/cached-sentences")
 def get_cached_sentences():
-    """Returns a list of unconsumed sentences available for random selection."""
-    # Get unconsumed sentences from the full pool (not just cached ones)
-    unconsumed_sentences = get_unconsumed_sentences(all_harvard_sentences)
+    """Returns a list of sentences available for random selection."""
+    sentences = all_harvard_sentences.copy()
 
     # Limit the response size to avoid overwhelming the frontend
     max_sentences = 1000
-    if len(unconsumed_sentences) > max_sentences:
-        import random
-        unconsumed_sentences = random.sample(unconsumed_sentences, max_sentences)
+    if len(sentences) > max_sentences:
+        sentences = random.sample(sentences, max_sentences)
 
-    return jsonify(unconsumed_sentences)
+    return jsonify(sentences)
 
 
 @app.route("/api/tts/sentence-stats")
 def get_sentence_stats():
-    """Returns statistics about sentence consumption."""
+    """Returns statistics about available sentences."""
     total_sentences = len(all_harvard_sentences)
-    consumed_count = get_consumed_sentences_count()
-    remaining_count = total_sentences - consumed_count
 
     return jsonify({
         "total_sentences": total_sentences,
-        "consumed_sentences": consumed_count,
-        "remaining_sentences": remaining_count,
-        "consumption_percentage": round((consumed_count / total_sentences) * 100, 2) if total_sentences > 0 else 0
+        "available_sentences": total_sentences
     })
 
 
 @app.route("/api/tts/random-sentence")
 def get_random_sentence():
-    """Returns a random unconsumed sentence."""
-    random_sentence = get_random_unconsumed_sentence(all_harvard_sentences)
-    if random_sentence:
-        return jsonify({"sentence": random_sentence})
+    """Returns a random sentence."""
+    if all_harvard_sentences:
+        return jsonify({"sentence": random.choice(all_harvard_sentences)})
     else:
-        total_sentences = len(all_harvard_sentences)
-        consumed_count = get_consumed_sentences_count()
-        return jsonify({
-            "error": "No unconsumed sentences available",
-            "details": f"All {total_sentences} sentences have been consumed ({consumed_count} total consumed)"
-        }), 404
+        return jsonify({"error": "No sentences available"}), 404
 
 
 def get_weighted_random_models(
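
For readers skimming the hunks above, the simplified selection logic in _generate_cache_entry_task reduces to the standalone sketch below. The pick_sentence_for_cache wrapper and the three placeholder sentences are illustrative only; tts_cache, tts_cache_lock, and all_harvard_sentences stand in for the module-level state in app.py.

    import random
    import threading

    # Illustrative stand-ins for the module-level state in app.py.
    all_harvard_sentences = ["문장 하나.", "문장 둘.", "문장 셋."]
    tts_cache = {}
    tts_cache_lock = threading.Lock()

    def pick_sentence_for_cache() -> str:
        """Prefer sentences not yet cached; once everything is cached, reuse the full pool."""
        with tts_cache_lock:
            cached_keys = set(tts_cache.keys())
            available = [s for s in all_harvard_sentences if s not in cached_keys]
            if not available:
                # With consumption tracking removed, repetition replaces the old hard stop.
                available = all_harvard_sentences
            return random.choice(available)

    print(pick_sentence_for_cache())
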
ko_prompts.json CHANGED
@@ -1,55 +1,8 @@
 {
   "prompts": [
-    "안녕하세요, 오늘 날씨가 정말 좋네요.",
-    "지금 몇 시예요? 약속 시간에 늦을 것 같아요.",
-    "오늘 저녁에 뭐 먹을까요? 치킨이 땡기는데.",
-    "주말에 시간 되시면 같이 영화 보러 갈래요?",
-    "커피 한 잔 하실래요? 제가 살게요.",
-    "회의는 오후 세 시에 시작합니다. 자료 준비해 주세요.",
-    "이번 분기 매출이 전년 대비 이십 퍼센트 증가했습니다.",
-    "고객님, 문의하신 내용 확인 후 답변드리겠습니다.",
-    "프로젝트 마감일이 다음 주 금요일입니다.",
-    "미팅 일정을 조율하고 싶은데, 언제가 편하신가요?",
-    "채널톡 고객센터입니다. 무엇을 도와드릴까요?",
-    "주문하신 상품은 내일 오전 중으로 배송될 예정입니다.",
-    "불편을 드려 죄송합니다. 바로 처리해 드리겠습니다.",
-    "결제가 정상적으로 완료되었습니다. 감사합니다.",
-    "반품 신청이 접수되었습니다. 삼 영업일 내에 처리됩니다.",
-    "서울의 현재 기온은 섭씨 이십오 도입니다.",
-    "다음 정류장은 강남역입니다. 내리실 분은 준비해 주세요.",
-    "오늘의 환율은 달러당 천삼백원입니다.",
-    "이 제품의 가격은 삼만구천원입니다.",
-    "영업시간은 오전 아홉 시부터 오후 여섯 시까지입니다.",
-    "정말 기쁜 소식이에요! 축하드려요!",
-    "걱정하지 마세요, 다 잘 될 거예요.",
-    "오랜만이에요! 그동안 잘 지내셨어요?",
-    "정말 감사합니다. 덕분에 큰 도움이 됐어요.",
-    "아쉽지만 다음 기회에 뵙겠습니다.",
-    "문을 열려면 버튼을 눌러주세요.",
-    "왼쪽으로 돌아서 직진하시면 됩니다.",
-    "앱을 설치하고 회원가입을 진행해 주세요.",
-    "비밀번호는 여덟 자리 이상으로 설정해 주세요.",
-    "첨부파일을 확인하시고 서명해 주세요.",
-    "오늘 주요 뉴스를 전해드리겠습니다.",
-    "정부가 새로운 정책을 발표했습니다.",
-    "국내 반도체 수출이 사상 최대치를 기록했습니다.",
-    "내일 전국적으로 비가 내릴 예정입니다.",
-    "올해 출생률이 역대 최저를 기록했습니다.",
-    "오늘 수업에서는 인공지능의 기초를 배워보겠습니다.",
-    "이 문제의 정답은 삼번입니다.",
-    "다음 시간까지 과제를 제출해 주세요.",
-    "질문이 있으시면 언제든지 물어보세요.",
-    "복습은 학습의 가장 중요한 부분입니다.",
-    "이번 주 인기 영화 순위를 알려드릴게요.",
-    "새 앨범이 음원 차트 일위를 차지했습니다.",
-    "오늘 경기에서 한국팀이 이겼습니다!",
-    "다음 에피소드가 정말 기대돼요.",
-    "이 노래 가사가 정말 마음에 들어요.",
-    "인공지능 기술이 빠르게 발전하고 있습니다.",
-    "스마트폰 배터리를 절약하는 방법을 알려드릴게요.",
-    "이 앱은 무료로 다운로드할 수 있습니다.",
-    "시스템 업데이트가 완료되었습니다.",
-    "클라우드에 파일이 자동으로 저장됩니다."
+    "여러 상용 TTS가 이미 존재하지만, 저희가 자체 모델을 만드는 이유는 명확합니다.",
+    "안녕하세요 채널톡 AI 에이전트 알프입니다.",
+    "안녕하세요 채널톡 한국어 TTS 아레나입니다!"
   ]
 }
 
models.py CHANGED
@@ -314,14 +314,6 @@ def record_vote(user_id, text, chosen_model_id, rejected_model_id, model_type,
 
     db.session.add_all([chosen_history, rejected_history])
 
-    # Mark sentence as consumed AFTER successful vote recording (only for dataset sentences that count)
-    if counts_for_public and sentence_origin == 'dataset':
-        try:
-            mark_sentence_consumed(text, usage_type='voted')
-        except Exception as e:
-            # If consumption marking fails, log but don't fail the vote
-            logging.error(f"Failed to mark sentence as consumed after vote: {str(e)}")
-
     db.session.commit()
 
     return vote, None