Rulga committed on
Commit
56a8632
·
1 Parent(s): b47770f

Refactor ChatEvaluator class; simplify initialization, enhance annotation handling, and improve error logging

Browse files
Files changed (1) hide show
  1. src/analytics/chat_evaluator.py +76 -65
src/analytics/chat_evaluator.py CHANGED
@@ -9,31 +9,50 @@ from typing import List, Dict, Any, Tuple, Optional
9
  import pandas as pd
10
  from src.knowledge_base.dataset import DatasetManager
11
  from huggingface_hub import HfApi
 
 
 
 
12
 
13
  class ChatEvaluator:
14
- def __init__(self,
15
- dataset_manager: Optional[DatasetManager] = None,
16
- hf_token: str = None,
17
- dataset_id: str = None,
18
- chat_history_path: str = None):
19
  """
20
  Initialize chat evaluator
21
 
22
  Args:
23
- dataset_manager: Dataset manager for retrieving chat history
24
- hf_token: Hugging Face token for uploading annotations
25
- dataset_id: Hugging Face dataset ID
26
  """
27
- self.dataset_manager = dataset_manager or DatasetManager()
28
- self.hf_token = hf_token
29
- self.dataset_id = dataset_id
30
- self.chat_history_path = chat_history_path
31
- self.annotations_dir = os.path.join(os.path.dirname(chat_history_path), "annotations") if chat_history_path else None
32
 
33
- # Create annotations directory if it doesn't exist
34
- if self.annotations_dir:
35
- os.makedirs(self.annotations_dir, exist_ok=True)
36
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  def get_chat_history(self) -> List[Dict[str, Any]]:
38
  """
39
  Get all chat history data from dataset
@@ -130,9 +149,6 @@ class ChatEvaluator:
130
  Returns:
131
  (success, message)
132
  """
133
- if not self.annotations_dir:
134
- return False, "Annotations directory not configured"
135
-
136
  try:
137
  # Create annotation object
138
  annotation = {
@@ -146,61 +162,55 @@ class ChatEvaluator:
146
  }
147
 
148
  # Create filename with conversation_id
149
- filename = f"annotation_{conversation_id}.json"
150
- filepath = os.path.join(self.annotations_dir, filename)
151
 
152
- # Save to local file
153
- with open(filepath, 'w', encoding='utf-8') as f:
154
- json.dump(annotation, f, ensure_ascii=False, indent=2)
155
 
156
- # Upload to HuggingFace dataset if configured
157
- if self.hf_token and self.dataset_id:
158
- try:
159
- api = HfApi(token=self.hf_token)
160
-
161
- # Extract just the directory name from annotations_dir
162
- dir_name = os.path.basename(self.annotations_dir)
163
- target_path = f"{dir_name}/{filename}"
164
-
165
- # Upload the file to the dataset
166
- api.upload_file(
167
- path_or_fileobj=filepath,
168
- path_in_repo=target_path,
169
- repo_id=self.dataset_id,
170
- repo_type="dataset"
171
- )
172
-
173
- except Exception as e:
174
- return True, f"Saved locally but failed to upload to dataset: {str(e)}"
175
 
176
  return True, "Annotation saved successfully"
 
177
  except Exception as e:
178
- return False, f"Error saving annotation: {str(e)}"
 
179
 
180
  def get_annotations(self) -> List[Dict[str, Any]]:
181
  """
182
- Get all saved annotations
183
-
184
- Returns:
185
- List of annotation objects
186
  """
187
- if not self.annotations_dir or not os.path.exists(self.annotations_dir):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
  return []
189
-
190
- annotations = []
191
- for filename in os.listdir(self.annotations_dir):
192
- if filename.startswith("annotation_") and filename.endswith(".json"):
193
- try:
194
- filepath = os.path.join(self.annotations_dir, filename)
195
- with open(filepath, 'r', encoding='utf-8') as f:
196
- annotation = json.load(f)
197
- annotations.append(annotation)
198
- except Exception as e:
199
- print(f"Error loading annotation {filename}: {str(e)}")
200
-
201
- # Sort by timestamp (newest first)
202
- annotations.sort(key=lambda x: x.get("timestamp", ""), reverse=True)
203
- return annotations
204
 
205
  def get_annotation_by_conversation_id(self, conversation_id: str) -> Optional[Dict[str, Any]]:
206
  """
@@ -325,3 +335,4 @@ class ChatEvaluator:
325
 
326
 
327
 
 
 
9
  import pandas as pd
10
  from src.knowledge_base.dataset import DatasetManager
11
  from huggingface_hub import HfApi
12
+ import io
13
+ import logging
14
+
15
+ logger = logging.getLogger(__name__)
16
 
17
  class ChatEvaluator:
18
def __init__(self, hf_token: str = None, dataset_id: str = None):
    """
    Initialize chat evaluator

    Builds the Hugging Face API client and then makes a best-effort check
    that the annotations folder is present in the dataset repository.
    A failure of that check is logged and does not abort construction.

    Args:
        hf_token: Hugging Face token; when empty, the HF_TOKEN
            environment variable is used instead
        dataset_id: Dataset ID on Hugging Face; when empty, the
            "Rulga/status-law-knowledge-base" dataset is used
    """
    # Path for annotations in the dataset
    self.annotations_path = "annotations"

    # Explicit arguments win; the fallbacks are only consulted when the
    # argument is missing or empty.
    token = hf_token if hf_token else os.getenv('HF_TOKEN')
    repo = dataset_id if dataset_id else "Rulga/status-law-knowledge-base"

    self.hf_token = token
    self.dataset_id = repo
    self.api = HfApi(token=token)

    # Ensure annotations directory exists in dataset
    try:
        self._ensure_annotations_dir()
    except Exception as e:
        logger.error(f"Failed to ensure annotations directory: {e}")
38
+
39
def _ensure_annotations_dir(self):
    """Ensure annotations directory exists in the dataset

    The repository listing contains file paths only, so the folder is
    considered present when at least one listed path lies under
    ``<annotations_path>/``. When none does, an empty ``.gitkeep``
    placeholder is uploaded to create the folder.

    Raises:
        Exception: any error raised while listing the repository or
            uploading the placeholder is logged and re-raised.
    """
    folder_prefix = f"{self.annotations_path}/"
    try:
        # Check if directory exists
        files = self.api.list_repo_files(self.dataset_id, repo_type="dataset")
        if any(path.startswith(folder_prefix) for path in files):
            return

        # Create empty file to initialize directory. The payload is
        # passed as bytes: upload_file is documented to take a local
        # path, bytes or a binary file object, not a text stream.
        self.api.upload_file(
            path_or_fileobj=b"",
            path_in_repo=f"{folder_prefix}.gitkeep",
            repo_id=self.dataset_id,
            repo_type="dataset"
        )
    except Exception as e:
        logger.error(f"Error ensuring annotations directory: {e}")
        raise
55
+
56
  def get_chat_history(self) -> List[Dict[str, Any]]:
57
  """
58
  Get all chat history data from dataset
 
149
  Returns:
150
  (success, message)
151
  """
 
 
 
152
  try:
153
  # Create annotation object
154
  annotation = {
 
162
  }
163
 
164
  # Create filename with conversation_id
165
+ filename = f"{self.annotations_path}/annotation_{conversation_id}.json"
 
166
 
167
+ # Convert to JSON string
168
+ json_content = json.dumps(annotation, ensure_ascii=False, indent=2)
 
169
 
170
+ # Upload to dataset
171
+ self.api.upload_file(
172
+ path_or_fileobj=io.StringIO(json_content),
173
+ path_in_repo=filename,
174
+ repo_id=self.dataset_id,
175
+ repo_type="dataset"
176
+ )
 
 
 
 
 
 
 
 
 
 
 
 
177
 
178
  return True, "Annotation saved successfully"
179
+
180
  except Exception as e:
181
+ logger.error(f"Error saving annotation: {e}")
182
+ return False, f"Failed to save annotation: {str(e)}"
183
 
184
  def get_annotations(self) -> List[Dict[str, Any]]:
185
  """
186
+ Get all saved annotations from dataset
 
 
 
187
  """
188
+ try:
189
+ annotations = []
190
+ files = self.api.list_repo_files(self.dataset_id, repo_type="dataset")
191
+
192
+ for file in files:
193
+ if file.startswith(f"{self.annotations_path}/annotation_") and file.endswith(".json"):
194
+ try:
195
+ # Download and parse annotation file
196
+ content = self.api.hf_hub_download(
197
+ repo_id=self.dataset_id,
198
+ filename=file,
199
+ repo_type="dataset"
200
+ )
201
+ with open(content, 'r', encoding='utf-8') as f:
202
+ annotation = json.load(f)
203
+ annotations.append(annotation)
204
+ except Exception as e:
205
+ logger.error(f"Error loading annotation {file}: {e}")
206
+
207
+ # Sort by timestamp (newest first)
208
+ annotations.sort(key=lambda x: x.get("timestamp", ""), reverse=True)
209
+ return annotations
210
+
211
+ except Exception as e:
212
+ logger.error(f"Error getting annotations: {e}")
213
  return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
 
215
  def get_annotation_by_conversation_id(self, conversation_id: str) -> Optional[Dict[str, Any]]:
216
  """
 
335
 
336
 
337
 
338
+