HMWCS committed on
Commit
8f8cb24
·
verified ·
1 Parent(s): 187dd30

Update classifier.py

Browse files
Files changed (1) hide show
  1. classifier.py +113 -52
classifier.py CHANGED
@@ -170,24 +170,48 @@ class GarbageClassifier:
170
  def _extract_classification(self, response: str) -> str:
171
  """Extract the main classification from the response with enhanced logic"""
172
  response_lower = response.lower()
173
-
174
  # Strong indicators that this is NOT garbage - check these first
175
  non_garbage_indicators = [
176
  "unable to classify",
177
  "cannot classify",
178
- "not garbage",
179
  "not waste",
180
  "not trash",
181
- "person", "people", "human", "face", "man", "woman",
182
- "living", "alive", "animal", "pet", "dog", "cat",
183
- "functioning", "in use", "working", "operational",
184
- "furniture", "appliance", "electronic device",
185
- "building", "house", "room", "landscape",
186
- "vehicle", "car", "truck", "bike",
187
- "elon musk", "celebrity", "famous person",
188
- "portrait", "photo of a person"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  ]
190
-
191
  # Check for explicit statements about not being garbage
192
  non_garbage_phrases = [
193
  "this is not",
@@ -198,101 +222,136 @@ class GarbageClassifier:
198
  "shows a person",
199
  "image of a person",
200
  "human being",
201
- "living creature"
202
  ]
203
-
204
  # First priority: Check for strong non-garbage indicators
205
  if any(indicator in response_lower for indicator in non_garbage_indicators):
206
  return "Unable to classify"
207
-
208
  # Second priority: Check for phrases indicating it's not garbage
209
  if any(phrase in response_lower for phrase in non_garbage_phrases):
210
  return "Unable to classify"
211
-
212
  # Third priority: Look for reasoning that explicitly says it's not waste/garbage
213
  reasoning_against_waste = [
214
  "cannot be classified as waste",
215
  "should not be classified as",
216
  "not appropriate to classify",
217
  "does not belong to any waste category",
218
- "is not waste material"
219
  ]
220
-
221
  if any(phrase in response_lower for phrase in reasoning_against_waste):
222
  return "Unable to classify"
223
-
224
  # Only if none of the above conditions are met, then look for garbage categories
225
  categories = self.knowledge.get_categories()
226
  waste_categories = [cat for cat in categories if cat != "Unable to classify"]
227
-
228
  # Look for exact category matches
229
  for category in waste_categories:
230
  if category.lower() in response_lower:
231
  # Double check - make sure the context is positive
232
  category_index = response_lower.find(category.lower())
233
- context_before = response_lower[max(0, category_index-50):category_index]
234
- context_after = response_lower[category_index:category_index+50]
235
-
 
 
236
  # If there are negation words around the category, skip it
237
- negation_words = ["not", "cannot", "unable", "doesn't", "isn't", "won't", "shouldn't"]
238
- if any(neg in context_before or neg in context_after for neg in negation_words):
 
 
 
 
 
 
 
 
 
 
 
239
  continue
240
-
241
  return category
242
-
243
  # Look for key terms only if no explicit non-garbage indicators were found
244
  category_keywords = {
245
  "Recyclable Waste": [
246
- "recyclable", "recycle", "plastic bottle", "aluminum can",
247
- "cardboard box", "glass bottle", "metal can"
 
 
 
 
 
248
  ],
249
  "Food/Kitchen Waste": [
250
- "food scraps", "fruit peel", "vegetable waste", "leftovers",
251
- "organic waste", "kitchen waste"
 
 
 
 
252
  ],
253
  "Hazardous Waste": [
254
- "battery", "chemical container", "medicine bottle",
255
- "paint can", "toxic material"
 
 
 
256
  ],
257
- "Other Waste": [
258
- "cigarette butt", "ceramic piece", "dust", "general waste"
259
- ]
260
  }
261
-
262
  for category, keywords in category_keywords.items():
263
  if any(keyword in response_lower for keyword in keywords):
264
  return category
265
-
266
  # Default to "Unable to classify" if nothing clear is found
267
  return "Unable to classify"
268
 
269
  def _extract_reasoning(self, response: str) -> str:
270
  """Extract only the reasoning content, removing all formatting markers"""
271
  import re
272
-
273
  # Remove all formatting markers
274
  cleaned_response = response.replace("**Classification**:", "")
275
  cleaned_response = cleaned_response.replace("**Reasoning**:", "")
276
- cleaned_response = re.sub(r'\*\*.*?\*\*:', '', cleaned_response) # Remove any **text**: patterns
277
- cleaned_response = cleaned_response.replace("**", "") # Remove remaining ** markers
278
-
 
 
 
 
279
  # Split into lines and process
280
- lines = cleaned_response.split('\n')
281
  reasoning_parts = []
282
-
283
  for line in lines:
284
  line = line.strip()
285
  # Skip empty lines and lines that look like classification categories
286
- if line and not line in ["Recyclable Waste", "Food/Kitchen Waste", "Hazardous Waste", "Other Waste", "Unable to classify"]:
 
 
 
 
 
 
287
  # Skip lines that are just category names
288
  if line not in self.knowledge.get_categories():
289
  reasoning_parts.append(line)
290
-
291
  # Join the reasoning parts
292
- reasoning = ' '.join(reasoning_parts).strip()
293
-
294
  # If we still have structured format markers, try a different approach
295
- if reasoning.startswith("Classification:") or reasoning.startswith("Reasoning:"):
 
 
296
  # Split by common patterns and take the reasoning part
297
  if "Reasoning:" in reasoning:
298
  reasoning = reasoning.split("Reasoning:")[-1].strip()
@@ -301,12 +360,14 @@ class GarbageClassifier:
301
  parts = reasoning.split(":", 1)
302
  if len(parts) > 1:
303
  reasoning = parts[1].strip()
304
-
305
  # Clean up any remaining artifacts
306
- reasoning = re.sub(r'^[A-Za-z\s]+:', '', reasoning).strip() # Remove "Category:" type prefixes
307
-
 
 
308
  return reasoning if reasoning else "Analysis not available"
309
 
310
  def get_categories_info(self):
311
  """Get information about all categories"""
312
- return self.knowledge.get_category_descriptions()
 
170
  def _extract_classification(self, response: str) -> str:
171
  """Extract the main classification from the response with enhanced logic"""
172
  response_lower = response.lower()
173
+
174
  # Strong indicators that this is NOT garbage - check these first
175
  non_garbage_indicators = [
176
  "unable to classify",
177
  "cannot classify",
178
+ "not garbage",
179
  "not waste",
180
  "not trash",
181
+ "person",
182
+ "people",
183
+ "human",
184
+ "face",
185
+ "man",
186
+ "woman",
187
+ "living",
188
+ "alive",
189
+ "animal",
190
+ "pet",
191
+ "dog",
192
+ "cat",
193
+ "functioning",
194
+ "in use",
195
+ "working",
196
+ "operational",
197
+ "furniture",
198
+ "appliance",
199
+ "electronic device",
200
+ "building",
201
+ "house",
202
+ "room",
203
+ "landscape",
204
+ "vehicle",
205
+ "car",
206
+ "truck",
207
+ "bike",
208
+ "elon musk",
209
+ "celebrity",
210
+ "famous person",
211
+ "portrait",
212
+ "photo of a person",
213
  ]
214
+
215
  # Check for explicit statements about not being garbage
216
  non_garbage_phrases = [
217
  "this is not",
 
222
  "shows a person",
223
  "image of a person",
224
  "human being",
225
+ "living creature",
226
  ]
227
+
228
  # First priority: Check for strong non-garbage indicators
229
  if any(indicator in response_lower for indicator in non_garbage_indicators):
230
  return "Unable to classify"
231
+
232
  # Second priority: Check for phrases indicating it's not garbage
233
  if any(phrase in response_lower for phrase in non_garbage_phrases):
234
  return "Unable to classify"
235
+
236
  # Third priority: Look for reasoning that explicitly says it's not waste/garbage
237
  reasoning_against_waste = [
238
  "cannot be classified as waste",
239
  "should not be classified as",
240
  "not appropriate to classify",
241
  "does not belong to any waste category",
242
+ "is not waste material",
243
  ]
244
+
245
  if any(phrase in response_lower for phrase in reasoning_against_waste):
246
  return "Unable to classify"
247
+
248
  # Only if none of the above conditions are met, then look for garbage categories
249
  categories = self.knowledge.get_categories()
250
  waste_categories = [cat for cat in categories if cat != "Unable to classify"]
251
+
252
  # Look for exact category matches
253
  for category in waste_categories:
254
  if category.lower() in response_lower:
255
  # Double check - make sure the context is positive
256
  category_index = response_lower.find(category.lower())
257
+ context_before = response_lower[
258
+ max(0, category_index - 50) : category_index
259
+ ]
260
+ context_after = response_lower[category_index : category_index + 50]
261
+
262
  # If there are negation words around the category, skip it
263
+ negation_words = [
264
+ "not",
265
+ "cannot",
266
+ "unable",
267
+ "doesn't",
268
+ "isn't",
269
+ "won't",
270
+ "shouldn't",
271
+ ]
272
+ if any(
273
+ neg in context_before or neg in context_after
274
+ for neg in negation_words
275
+ ):
276
  continue
277
+
278
  return category
279
+
280
  # Look for key terms only if no explicit non-garbage indicators were found
281
  category_keywords = {
282
  "Recyclable Waste": [
283
+ "recyclable",
284
+ "recycle",
285
+ "plastic bottle",
286
+ "aluminum can",
287
+ "cardboard box",
288
+ "glass bottle",
289
+ "metal can",
290
  ],
291
  "Food/Kitchen Waste": [
292
+ "food scraps",
293
+ "fruit peel",
294
+ "vegetable waste",
295
+ "leftovers",
296
+ "organic waste",
297
+ "kitchen waste",
298
  ],
299
  "Hazardous Waste": [
300
+ "battery",
301
+ "chemical container",
302
+ "medicine bottle",
303
+ "paint can",
304
+ "toxic material",
305
  ],
306
+ "Other Waste": ["cigarette butt", "ceramic piece", "dust", "general waste"],
 
 
307
  }
308
+
309
  for category, keywords in category_keywords.items():
310
  if any(keyword in response_lower for keyword in keywords):
311
  return category
312
+
313
  # Default to "Unable to classify" if nothing clear is found
314
  return "Unable to classify"
315
 
316
  def _extract_reasoning(self, response: str) -> str:
317
  """Extract only the reasoning content, removing all formatting markers"""
318
  import re
319
+
320
  # Remove all formatting markers
321
  cleaned_response = response.replace("**Classification**:", "")
322
  cleaned_response = cleaned_response.replace("**Reasoning**:", "")
323
+ cleaned_response = re.sub(
324
+ r"\*\*.*?\*\*:", "", cleaned_response
325
+ ) # Remove any **text**: patterns
326
+ cleaned_response = cleaned_response.replace(
327
+ "**", ""
328
+ ) # Remove remaining ** markers
329
+
330
  # Split into lines and process
331
+ lines = cleaned_response.split("\n")
332
  reasoning_parts = []
333
+
334
  for line in lines:
335
  line = line.strip()
336
  # Skip empty lines and lines that look like classification categories
337
+ if line and not line in [
338
+ "Recyclable Waste",
339
+ "Food/Kitchen Waste",
340
+ "Hazardous Waste",
341
+ "Other Waste",
342
+ "Unable to classify",
343
+ ]:
344
  # Skip lines that are just category names
345
  if line not in self.knowledge.get_categories():
346
  reasoning_parts.append(line)
347
+
348
  # Join the reasoning parts
349
+ reasoning = " ".join(reasoning_parts).strip()
350
+
351
  # If we still have structured format markers, try a different approach
352
+ if reasoning.startswith("Classification:") or reasoning.startswith(
353
+ "Reasoning:"
354
+ ):
355
  # Split by common patterns and take the reasoning part
356
  if "Reasoning:" in reasoning:
357
  reasoning = reasoning.split("Reasoning:")[-1].strip()
 
360
  parts = reasoning.split(":", 1)
361
  if len(parts) > 1:
362
  reasoning = parts[1].strip()
363
+
364
  # Clean up any remaining artifacts
365
+ reasoning = re.sub(
366
+ r"^[A-Za-z\s]+:", "", reasoning
367
+ ).strip() # Remove "Category:" type prefixes
368
+
369
  return reasoning if reasoning else "Analysis not available"
370
 
371
  def get_categories_info(self):
372
  """Get information about all categories"""
373
+ return self.knowledge.get_category_descriptions()