blackhole1218 committed
Commit 427a0a4 · Parent: 0ed13a6

fix: remove ConsumedSentence logic - allow same sentence to be used multiple times


- Remove consumed sentence tracking (same sentence can now be voted on multiple times)
- This allows more accurate ELO ratings, since each sentence can accumulate more votes (see the sketch below)
- Simplify cache and random sentence selection logic
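
As a rough illustration of the rating rationale, the sketch below applies a textbook Elo update once per vote. It is not the arena's actual rating code: the elo_update helper, the K-factor of 32, and the 1500 starting scores are assumptions made for this example. The point is only that once the one-vote-per-sentence restriction is gone, every additional comparison on a sentence feeds another rating update instead of being rejected, which narrows the uncertainty of each model's rating.

    # Minimal sketch of a standard Elo update; NOT the arena's rating code.
    # K = 32 and the 1500 starting ratings are illustrative assumptions.
    def elo_update(winner: float, loser: float, k: float = 32.0) -> tuple[float, float]:
        expected_win = 1.0 / (1.0 + 10 ** ((loser - winner) / 400))
        return winner + k * (1.0 - expected_win), loser - k * (1.0 - expected_win)

    model_a, model_b = 1500.0, 1500.0
    # Hypothetical votes; several may now come from the very same sentence,
    # and each one still moves the ratings.
    for outcome in ["a", "a", "b", "a"]:
        if outcome == "a":
            model_a, model_b = elo_update(model_a, model_b)
        else:
            model_b, model_a = elo_update(model_b, model_a)
    print(round(model_a), round(model_b))  # ratings diverge as votes accumulate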

Files changed (3)
  1. app.py +26 -59
  2. ko_prompts.json +3 -50
  3. models.py +0 -8
app.py CHANGED
@@ -64,10 +64,6 @@ from flask import (
 )
 from flask_login import LoginManager, current_user
 from models import *
-from models import (
-    hash_sentence, is_sentence_consumed, mark_sentence_consumed,
-    get_unconsumed_sentences, get_consumed_sentences_count, get_random_unconsumed_sentence
-)
 from auth import auth, init_oauth, is_admin
 from admin import admin
 from security import is_vote_allowed, check_user_security_score, detect_coordinated_voting
@@ -480,12 +476,11 @@ def _generate_cache_entry_task(sentence):
     # Select a new sentence if not provided (for replacement)
     with tts_cache_lock:
         cached_keys = set(tts_cache.keys())
-        # Get unconsumed sentences that are also not already cached
-        unconsumed_sentences = get_unconsumed_sentences(all_harvard_sentences)
-        available_sentences = [s for s in unconsumed_sentences if s not in cached_keys]
+        # Get sentences that are not already cached
+        available_sentences = [s for s in all_harvard_sentences if s not in cached_keys]
         if not available_sentences:
-            app.logger.warning("No more unconsumed sentences available for caching. All sentences have been consumed.")
-            return
+            # All sentences are cached, pick any random one
+            available_sentences = all_harvard_sentences
         sentence = random.choice(available_sentences)
 
     # app.logger.info removed duplicate log
@@ -524,8 +519,6 @@ def _generate_cache_entry_task(sentence):
             "audio_b": audio_b_path,
            "created_at": datetime.utcnow(),
         }
-        # Mark sentence as consumed for cache usage
-        mark_sentence_consumed(sentence, usage_type='cache')
         app.logger.info(f"Successfully cached entry for: '{sentence[:50]}...'")
     elif sentence in tts_cache:
         app.logger.warning(f"Sentence '{sentence[:50]}...' already re-cached. Discarding new generation.")
@@ -550,19 +543,18 @@ def _generate_cache_entry_task(sentence):
 
 
 def update_initial_sentences():
-    """Update initial sentences to only include unconsumed ones."""
+    """Update initial sentences for random selection."""
     global initial_sentences
     try:
-        unconsumed_for_initial = get_unconsumed_sentences(all_harvard_sentences)
-        if unconsumed_for_initial:
-            initial_sentences = random.sample(unconsumed_for_initial, min(len(unconsumed_for_initial), 500))
-            print(f"Updated initial sentences with {len(initial_sentences)} unconsumed sentences")
+        if all_harvard_sentences:
+            initial_sentences = random.sample(all_harvard_sentences, min(len(all_harvard_sentences), 500))
+            print(f"Updated initial sentences with {len(initial_sentences)} sentences")
         else:
-            print("Warning: No unconsumed sentences available for initial selection, disabling fallback")
-            initial_sentences = []  # No fallback to consumed sentences
+            print("Warning: No sentences available for initial selection")
+            initial_sentences = []
     except Exception as e:
-        print(f"Error updating initial sentences: {e}, disabling fallback for security")
-        initial_sentences = []  # No fallback to consumed sentences
+        print(f"Error updating initial sentences: {e}")
+        initial_sentences = []
 
 
 def initialize_tts_cache():
@@ -573,16 +565,11 @@ def initialize_tts_cache():
         app.logger.error("Harvard sentences not loaded. Cannot initialize cache.")
         return
 
-    # Update initial sentences with unconsumed ones
+    # Update initial sentences
     update_initial_sentences()
 
-    # Only use unconsumed sentences for initial cache population
-    unconsumed_sentences = get_unconsumed_sentences(all_harvard_sentences)
-    if not unconsumed_sentences:
-        app.logger.error("No unconsumed sentences available for cache initialization. Cache will remain empty.")
-        app.logger.warning("WARNING: All sentences from the dataset have been consumed. No new TTS generations will be possible.")
-        return
-    initial_selection = random.sample(unconsumed_sentences, min(len(unconsumed_sentences), TTS_CACHE_SIZE))
+    # Select random sentences for initial cache population
+    initial_selection = random.sample(all_harvard_sentences, min(len(all_harvard_sentences), TTS_CACHE_SIZE))
     app.logger.info(f"Initializing TTS cache with {len(initial_selection)} sentences...")
 
     for sentence in initial_selection:
@@ -613,14 +600,6 @@ def generate_tts():
     # Check if text contains Korean (at least 30% Korean characters)
     if not is_korean_text(text):
        return jsonify({"error": "한국어 텍스트를 입력해주세요. 최소 30% 이상의 한국어가 포함되어야 합니다."}), 400
-
-    # Check if sentence has already been consumed
-    if is_sentence_consumed(text):
-        remaining_count = len(get_unconsumed_sentences(all_harvard_sentences))
-        if remaining_count == 0:
-            return jsonify({"error": "This sentence has already been used and no unconsumed sentences remain. All sentences from the dataset have been consumed."}), 400
-        else:
-            return jsonify({"error": f"This sentence has already been used. Please select a different sentence. {remaining_count} sentences remain available."}), 400
 
     # --- Cache Check ---
     cache_hit = False
@@ -1196,47 +1175,35 @@ def toggle_leaderboard_visibility():
 
 @app.route("/api/tts/cached-sentences")
 def get_cached_sentences():
-    """Returns a list of unconsumed sentences available for random selection."""
-    # Get unconsumed sentences from the full pool (not just cached ones)
-    unconsumed_sentences = get_unconsumed_sentences(all_harvard_sentences)
+    """Returns a list of sentences available for random selection."""
+    sentences = all_harvard_sentences.copy()
 
     # Limit the response size to avoid overwhelming the frontend
     max_sentences = 1000
-    if len(unconsumed_sentences) > max_sentences:
-        import random
-        unconsumed_sentences = random.sample(unconsumed_sentences, max_sentences)
+    if len(sentences) > max_sentences:
+        sentences = random.sample(sentences, max_sentences)
 
-    return jsonify(unconsumed_sentences)
+    return jsonify(sentences)
 
 
 @app.route("/api/tts/sentence-stats")
 def get_sentence_stats():
-    """Returns statistics about sentence consumption."""
+    """Returns statistics about available sentences."""
     total_sentences = len(all_harvard_sentences)
-    consumed_count = get_consumed_sentences_count()
-    remaining_count = total_sentences - consumed_count
 
     return jsonify({
         "total_sentences": total_sentences,
-        "consumed_sentences": consumed_count,
-        "remaining_sentences": remaining_count,
-        "consumption_percentage": round((consumed_count / total_sentences) * 100, 2) if total_sentences > 0 else 0
+        "available_sentences": total_sentences
     })
 
 
 @app.route("/api/tts/random-sentence")
 def get_random_sentence():
-    """Returns a random unconsumed sentence."""
-    random_sentence = get_random_unconsumed_sentence(all_harvard_sentences)
-    if random_sentence:
-        return jsonify({"sentence": random_sentence})
+    """Returns a random sentence."""
+    if all_harvard_sentences:
+        return jsonify({"sentence": random.choice(all_harvard_sentences)})
     else:
-        total_sentences = len(all_harvard_sentences)
-        consumed_count = get_consumed_sentences_count()
-        return jsonify({
-            "error": "No unconsumed sentences available",
-            "details": f"All {total_sentences} sentences have been consumed ({consumed_count} total consumed)"
-        }), 404
+        return jsonify({"error": "No sentences available"}), 404
 
 
 def get_weighted_random_models(
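
For readers skimming the hunks above, the simplified selection logic in _generate_cache_entry_task reduces to the standalone sketch below. The pick_sentence_for_cache wrapper and the three placeholder sentences are illustrative only; tts_cache, tts_cache_lock, and all_harvard_sentences stand in for the module-level state in app.py.

    import random
    import threading

    # Illustrative stand-ins for the module-level state in app.py.
    all_harvard_sentences = ["문장 하나.", "문장 둘.", "문장 셋."]
    tts_cache = {}
    tts_cache_lock = threading.Lock()

    def pick_sentence_for_cache() -> str:
        """Prefer sentences not yet cached; once everything is cached, reuse the full pool."""
        with tts_cache_lock:
            cached_keys = set(tts_cache.keys())
            available = [s for s in all_harvard_sentences if s not in cached_keys]
            if not available:
                # With consumption tracking removed, repetition replaces the old hard stop.
                available = all_harvard_sentences
            return random.choice(available)

    print(pick_sentence_for_cache())
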
ko_prompts.json CHANGED
@@ -1,55 +1,8 @@
 {
   "prompts": [
-    "안녕하세요, 오늘 날씨가 정말 좋네요.",
-    "지금 몇 시예요? 약속 시간에 늦을 것 같아요.",
-    "오늘 저녁에 뭐 먹을까요? 치킨이 땡기는데.",
-    "주말에 시간 되시면 같이 영화 보러 갈래요?",
-    "커피 한 잔 하실래요? 제가 살게요.",
-    "회의는 오후 세 시에 시작합니다. 자료 준비해 주세요.",
-    "이번 분기 매출이 전년 대비 이십 퍼센트 증가했습니다.",
-    "고객님, 문의하신 내용 확인 후 답변드리겠습니다.",
-    "프로젝트 마감일이 다음 주 금요일입니다.",
-    "미팅 일정을 조율하고 싶은데, 언제가 편하신가요?",
-    "채널톡 고객센터입니다. 무엇을 도와드릴까요?",
-    "주문하신 상품은 내일 오전 중으로 배송될 예정입니다.",
-    "불편을 드려 죄송합니다. 바로 처리해 드리겠습니다.",
-    "결제가 정상적으로 완료되었습니다. 감사합니다.",
-    "반품 신청이 접수되었습니다. 삼 영업일 내에 처리됩니다.",
-    "서울의 현재 기온은 섭씨 이십오 도입니다.",
-    "다음 정류장은 강남역입니다. 내리실 분은 준비해 주세요.",
-    "오늘의 환율은 달러당 천삼백원입니다.",
-    "이 제품의 가격은 삼만구천원입니다.",
-    "영업시간은 오전 아홉 시부터 오후 여섯 시까지입니다.",
-    "정말 기쁜 소식이에요! 축하드려요!",
-    "걱정하지 마세요, 다 잘 될 거예요.",
-    "오랜만이에요! 그동안 잘 지내셨어요?",
-    "정말 감사합니다. 덕분에 큰 도움이 됐어요.",
-    "아쉽지만 다음 기회에 뵙겠습니다.",
-    "문을 열려면 버튼을 눌러주세요.",
-    "왼쪽으로 돌아서 직진하시면 됩니다.",
-    "앱을 설치하고 회원가입을 진행해 주세요.",
-    "비밀번호는 여덟 자리 이상으로 설정해 주세요.",
-    "첨부파일을 확인하시고 서명해 주세요.",
-    "오늘 주요 뉴스를 전해드리겠습니다.",
-    "정부가 새로운 정책을 발표했습니다.",
-    "국내 반도체 수출이 사상 최대치를 기록했습니다.",
-    "내일 전국적으로 비가 내릴 예정입니다.",
-    "올해 출생률이 역대 최저를 기록했습니다.",
-    "오늘 수업에서는 인공지능의 기초를 배워보겠습니다.",
-    "이 문제의 정답은 삼번입니다.",
-    "다음 시간까지 과제를 제출해 주세요.",
-    "질문이 있으시면 언제든지 물어보세요.",
-    "복습은 학습의 가장 중요한 부분입니다.",
-    "이번 주 인기 영화 순위를 알려드릴게요.",
-    "새 앨범이 음원 차트 일위를 차지했습니다.",
-    "오늘 경기에서 한국팀이 이겼습니다!",
-    "다음 에피소드가 정말 기대돼요.",
-    "이 노래 가사가 정말 마음에 들어요.",
-    "인공지능 기술이 빠르게 발전하고 있습니다.",
-    "스마트폰 배터리를 절약하는 방법을 알려드릴게요.",
-    "이 앱은 무료로 다운로드할 수 있습니다.",
-    "시스템 업데이트가 완료되었습니다.",
-    "클라우드에 파일이 자동으로 저장됩니다."
+    "여러 상용 TTS가 이미 존재하지만, 저희가 자체 모델을 만드는 이유는 명확합니다.",
+    "안녕하세요 채널톡 AI 에이전트 알프입니다.",
+    "안녕하세요 채널톡 한국어 TTS 아레나입니다!"
   ]
 }
 
models.py CHANGED
@@ -314,14 +314,6 @@ def record_vote(user_id, text, chosen_model_id, rejected_model_id, model_type,
 
     db.session.add_all([chosen_history, rejected_history])
 
-    # Mark sentence as consumed AFTER successful vote recording (only for dataset sentences that count)
-    if counts_for_public and sentence_origin == 'dataset':
-        try:
-            mark_sentence_consumed(text, usage_type='voted')
-        except Exception as e:
-            # If consumption marking fails, log but don't fail the vote
-            logging.error(f"Failed to mark sentence as consumed after vote: {str(e)}")
-
     db.session.commit()
 
     return vote, None