Spaces:
Running on Zero
Update classifier.py
Browse files
- classifier.py +113 -52
classifier.py
CHANGED
|
@@ -170,24 +170,48 @@ class GarbageClassifier:
|
|
| 170 |
def _extract_classification(self, response: str) -> str:
|
| 171 |
"""Extract the main classification from the response with enhanced logic"""
|
| 172 |
response_lower = response.lower()
|
| 173 |
-
|
| 174 |
# Strong indicators that this is NOT garbage - check these first
|
| 175 |
non_garbage_indicators = [
|
| 176 |
"unable to classify",
|
| 177 |
"cannot classify",
|
| 178 |
-
"not garbage",
|
| 179 |
"not waste",
|
| 180 |
"not trash",
|
| 181 |
-
"person",
|
| 182 |
-
"
|
| 183 |
-
"
|
| 184 |
-
"
|
| 185 |
-
"
|
| 186 |
-
"
|
| 187 |
-
"
|
| 188 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
]
|
| 190 |
-
|
| 191 |
# Check for explicit statements about not being garbage
|
| 192 |
non_garbage_phrases = [
|
| 193 |
"this is not",
|
|
@@ -198,101 +222,136 @@ class GarbageClassifier:
|
|
| 198 |
"shows a person",
|
| 199 |
"image of a person",
|
| 200 |
"human being",
|
| 201 |
-
"living creature"
|
| 202 |
]
|
| 203 |
-
|
| 204 |
# First priority: Check for strong non-garbage indicators
|
| 205 |
if any(indicator in response_lower for indicator in non_garbage_indicators):
|
| 206 |
return "Unable to classify"
|
| 207 |
-
|
| 208 |
# Second priority: Check for phrases indicating it's not garbage
|
| 209 |
if any(phrase in response_lower for phrase in non_garbage_phrases):
|
| 210 |
return "Unable to classify"
|
| 211 |
-
|
| 212 |
# Third priority: Look for reasoning that explicitly says it's not waste/garbage
|
| 213 |
reasoning_against_waste = [
|
| 214 |
"cannot be classified as waste",
|
| 215 |
"should not be classified as",
|
| 216 |
"not appropriate to classify",
|
| 217 |
"does not belong to any waste category",
|
| 218 |
-
"is not waste material"
|
| 219 |
]
|
| 220 |
-
|
| 221 |
if any(phrase in response_lower for phrase in reasoning_against_waste):
|
| 222 |
return "Unable to classify"
|
| 223 |
-
|
| 224 |
# Only if none of the above conditions are met, then look for garbage categories
|
| 225 |
categories = self.knowledge.get_categories()
|
| 226 |
waste_categories = [cat for cat in categories if cat != "Unable to classify"]
|
| 227 |
-
|
| 228 |
# Look for exact category matches
|
| 229 |
for category in waste_categories:
|
| 230 |
if category.lower() in response_lower:
|
| 231 |
# Double check - make sure the context is positive
|
| 232 |
category_index = response_lower.find(category.lower())
|
| 233 |
-
context_before = response_lower[
|
| 234 |
-
|
| 235 |
-
|
|
|
|
|
|
|
| 236 |
# If there are negation words around the category, skip it
|
| 237 |
-
negation_words = [
|
| 238 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
continue
|
| 240 |
-
|
| 241 |
return category
|
| 242 |
-
|
| 243 |
# Look for key terms only if no explicit non-garbage indicators were found
|
| 244 |
category_keywords = {
|
| 245 |
"Recyclable Waste": [
|
| 246 |
-
"recyclable",
|
| 247 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
],
|
| 249 |
"Food/Kitchen Waste": [
|
| 250 |
-
"food scraps",
|
| 251 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
],
|
| 253 |
"Hazardous Waste": [
|
| 254 |
-
"battery",
|
| 255 |
-
"
|
|
|
|
|
|
|
|
|
|
| 256 |
],
|
| 257 |
-
"Other Waste": [
|
| 258 |
-
"cigarette butt", "ceramic piece", "dust", "general waste"
|
| 259 |
-
]
|
| 260 |
}
|
| 261 |
-
|
| 262 |
for category, keywords in category_keywords.items():
|
| 263 |
if any(keyword in response_lower for keyword in keywords):
|
| 264 |
return category
|
| 265 |
-
|
| 266 |
# Default to "Unable to classify" if nothing clear is found
|
| 267 |
return "Unable to classify"
|
| 268 |
|
| 269 |
def _extract_reasoning(self, response: str) -> str:
|
| 270 |
"""Extract only the reasoning content, removing all formatting markers"""
|
| 271 |
import re
|
| 272 |
-
|
| 273 |
# Remove all formatting markers
|
| 274 |
cleaned_response = response.replace("**Classification**:", "")
|
| 275 |
cleaned_response = cleaned_response.replace("**Reasoning**:", "")
|
| 276 |
-
cleaned_response = re.sub(
|
| 277 |
-
|
| 278 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
# Split into lines and process
|
| 280 |
-
lines = cleaned_response.split(
|
| 281 |
reasoning_parts = []
|
| 282 |
-
|
| 283 |
for line in lines:
|
| 284 |
line = line.strip()
|
| 285 |
# Skip empty lines and lines that look like classification categories
|
| 286 |
-
if line and not line in [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 287 |
# Skip lines that are just category names
|
| 288 |
if line not in self.knowledge.get_categories():
|
| 289 |
reasoning_parts.append(line)
|
| 290 |
-
|
| 291 |
# Join the reasoning parts
|
| 292 |
-
reasoning =
|
| 293 |
-
|
| 294 |
# If we still have structured format markers, try a different approach
|
| 295 |
-
if reasoning.startswith("Classification:") or reasoning.startswith(
|
|
|
|
|
|
|
| 296 |
# Split by common patterns and take the reasoning part
|
| 297 |
if "Reasoning:" in reasoning:
|
| 298 |
reasoning = reasoning.split("Reasoning:")[-1].strip()
|
|
@@ -301,12 +360,14 @@ class GarbageClassifier:
|
|
| 301 |
parts = reasoning.split(":", 1)
|
| 302 |
if len(parts) > 1:
|
| 303 |
reasoning = parts[1].strip()
|
| 304 |
-
|
| 305 |
# Clean up any remaining artifacts
|
| 306 |
-
reasoning = re.sub(
|
| 307 |
-
|
|
|
|
|
|
|
| 308 |
return reasoning if reasoning else "Analysis not available"
|
| 309 |
|
| 310 |
def get_categories_info(self):
|
| 311 |
"""Get information about all categories"""
|
| 312 |
-
return self.knowledge.get_category_descriptions()
|
|
|
|
| 170 |
def _extract_classification(self, response: str) -> str:
|
| 171 |
"""Extract the main classification from the response with enhanced logic"""
|
| 172 |
response_lower = response.lower()
|
| 173 |
+
|
| 174 |
# Strong indicators that this is NOT garbage - check these first
|
| 175 |
non_garbage_indicators = [
|
| 176 |
"unable to classify",
|
| 177 |
"cannot classify",
|
| 178 |
+
"not garbage",
|
| 179 |
"not waste",
|
| 180 |
"not trash",
|
| 181 |
+
"person",
|
| 182 |
+
"people",
|
| 183 |
+
"human",
|
| 184 |
+
"face",
|
| 185 |
+
"man",
|
| 186 |
+
"woman",
|
| 187 |
+
"living",
|
| 188 |
+
"alive",
|
| 189 |
+
"animal",
|
| 190 |
+
"pet",
|
| 191 |
+
"dog",
|
| 192 |
+
"cat",
|
| 193 |
+
"functioning",
|
| 194 |
+
"in use",
|
| 195 |
+
"working",
|
| 196 |
+
"operational",
|
| 197 |
+
"furniture",
|
| 198 |
+
"appliance",
|
| 199 |
+
"electronic device",
|
| 200 |
+
"building",
|
| 201 |
+
"house",
|
| 202 |
+
"room",
|
| 203 |
+
"landscape",
|
| 204 |
+
"vehicle",
|
| 205 |
+
"car",
|
| 206 |
+
"truck",
|
| 207 |
+
"bike",
|
| 208 |
+
"elon musk",
|
| 209 |
+
"celebrity",
|
| 210 |
+
"famous person",
|
| 211 |
+
"portrait",
|
| 212 |
+
"photo of a person",
|
| 213 |
]
|
| 214 |
+
|
| 215 |
# Check for explicit statements about not being garbage
|
| 216 |
non_garbage_phrases = [
|
| 217 |
"this is not",
|
|
|
|
| 222 |
"shows a person",
|
| 223 |
"image of a person",
|
| 224 |
"human being",
|
| 225 |
+
"living creature",
|
| 226 |
]
|
| 227 |
+
|
| 228 |
# First priority: Check for strong non-garbage indicators
|
| 229 |
if any(indicator in response_lower for indicator in non_garbage_indicators):
|
| 230 |
return "Unable to classify"
|
| 231 |
+
|
| 232 |
# Second priority: Check for phrases indicating it's not garbage
|
| 233 |
if any(phrase in response_lower for phrase in non_garbage_phrases):
|
| 234 |
return "Unable to classify"
|
| 235 |
+
|
| 236 |
# Third priority: Look for reasoning that explicitly says it's not waste/garbage
|
| 237 |
reasoning_against_waste = [
|
| 238 |
"cannot be classified as waste",
|
| 239 |
"should not be classified as",
|
| 240 |
"not appropriate to classify",
|
| 241 |
"does not belong to any waste category",
|
| 242 |
+
"is not waste material",
|
| 243 |
]
|
| 244 |
+
|
| 245 |
if any(phrase in response_lower for phrase in reasoning_against_waste):
|
| 246 |
return "Unable to classify"
|
| 247 |
+
|
| 248 |
# Only if none of the above conditions are met, then look for garbage categories
|
| 249 |
categories = self.knowledge.get_categories()
|
| 250 |
waste_categories = [cat for cat in categories if cat != "Unable to classify"]
|
| 251 |
+
|
| 252 |
# Look for exact category matches
|
| 253 |
for category in waste_categories:
|
| 254 |
if category.lower() in response_lower:
|
| 255 |
# Double check - make sure the context is positive
|
| 256 |
category_index = response_lower.find(category.lower())
|
| 257 |
+
context_before = response_lower[
|
| 258 |
+
max(0, category_index - 50) : category_index
|
| 259 |
+
]
|
| 260 |
+
context_after = response_lower[category_index : category_index + 50]
|
| 261 |
+
|
| 262 |
# If there are negation words around the category, skip it
|
| 263 |
+
negation_words = [
|
| 264 |
+
"not",
|
| 265 |
+
"cannot",
|
| 266 |
+
"unable",
|
| 267 |
+
"doesn't",
|
| 268 |
+
"isn't",
|
| 269 |
+
"won't",
|
| 270 |
+
"shouldn't",
|
| 271 |
+
]
|
| 272 |
+
if any(
|
| 273 |
+
neg in context_before or neg in context_after
|
| 274 |
+
for neg in negation_words
|
| 275 |
+
):
|
| 276 |
continue
|
| 277 |
+
|
| 278 |
return category
|
| 279 |
+
|
| 280 |
# Look for key terms only if no explicit non-garbage indicators were found
|
| 281 |
category_keywords = {
|
| 282 |
"Recyclable Waste": [
|
| 283 |
+
"recyclable",
|
| 284 |
+
"recycle",
|
| 285 |
+
"plastic bottle",
|
| 286 |
+
"aluminum can",
|
| 287 |
+
"cardboard box",
|
| 288 |
+
"glass bottle",
|
| 289 |
+
"metal can",
|
| 290 |
],
|
| 291 |
"Food/Kitchen Waste": [
|
| 292 |
+
"food scraps",
|
| 293 |
+
"fruit peel",
|
| 294 |
+
"vegetable waste",
|
| 295 |
+
"leftovers",
|
| 296 |
+
"organic waste",
|
| 297 |
+
"kitchen waste",
|
| 298 |
],
|
| 299 |
"Hazardous Waste": [
|
| 300 |
+
"battery",
|
| 301 |
+
"chemical container",
|
| 302 |
+
"medicine bottle",
|
| 303 |
+
"paint can",
|
| 304 |
+
"toxic material",
|
| 305 |
],
|
| 306 |
+
"Other Waste": ["cigarette butt", "ceramic piece", "dust", "general waste"],
|
|
|
|
|
|
|
| 307 |
}
|
| 308 |
+
|
| 309 |
for category, keywords in category_keywords.items():
|
| 310 |
if any(keyword in response_lower for keyword in keywords):
|
| 311 |
return category
|
| 312 |
+
|
| 313 |
# Default to "Unable to classify" if nothing clear is found
|
| 314 |
return "Unable to classify"
|
| 315 |
|
| 316 |
def _extract_reasoning(self, response: str) -> str:
|
| 317 |
"""Extract only the reasoning content, removing all formatting markers"""
|
| 318 |
import re
|
| 319 |
+
|
| 320 |
# Remove all formatting markers
|
| 321 |
cleaned_response = response.replace("**Classification**:", "")
|
| 322 |
cleaned_response = cleaned_response.replace("**Reasoning**:", "")
|
| 323 |
+
cleaned_response = re.sub(
|
| 324 |
+
r"\*\*.*?\*\*:", "", cleaned_response
|
| 325 |
+
) # Remove any **text**: patterns
|
| 326 |
+
cleaned_response = cleaned_response.replace(
|
| 327 |
+
"**", ""
|
| 328 |
+
) # Remove remaining ** markers
|
| 329 |
+
|
| 330 |
# Split into lines and process
|
| 331 |
+
lines = cleaned_response.split("\n")
|
| 332 |
reasoning_parts = []
|
| 333 |
+
|
| 334 |
for line in lines:
|
| 335 |
line = line.strip()
|
| 336 |
# Skip empty lines and lines that look like classification categories
|
| 337 |
+
if line and not line in [
|
| 338 |
+
"Recyclable Waste",
|
| 339 |
+
"Food/Kitchen Waste",
|
| 340 |
+
"Hazardous Waste",
|
| 341 |
+
"Other Waste",
|
| 342 |
+
"Unable to classify",
|
| 343 |
+
]:
|
| 344 |
# Skip lines that are just category names
|
| 345 |
if line not in self.knowledge.get_categories():
|
| 346 |
reasoning_parts.append(line)
|
| 347 |
+
|
| 348 |
# Join the reasoning parts
|
| 349 |
+
reasoning = " ".join(reasoning_parts).strip()
|
| 350 |
+
|
| 351 |
# If we still have structured format markers, try a different approach
|
| 352 |
+
if reasoning.startswith("Classification:") or reasoning.startswith(
|
| 353 |
+
"Reasoning:"
|
| 354 |
+
):
|
| 355 |
# Split by common patterns and take the reasoning part
|
| 356 |
if "Reasoning:" in reasoning:
|
| 357 |
reasoning = reasoning.split("Reasoning:")[-1].strip()
|
|
|
|
| 360 |
parts = reasoning.split(":", 1)
|
| 361 |
if len(parts) > 1:
|
| 362 |
reasoning = parts[1].strip()
|
| 363 |
+
|
| 364 |
# Clean up any remaining artifacts
|
| 365 |
+
reasoning = re.sub(
|
| 366 |
+
r"^[A-Za-z\s]+:", "", reasoning
|
| 367 |
+
).strip() # Remove "Category:" type prefixes
|
| 368 |
+
|
| 369 |
return reasoning if reasoning else "Analysis not available"
|
| 370 |
|
| 371 |
def get_categories_info(self):
|
| 372 |
"""Get information about all categories"""
|
| 373 |
+
return self.knowledge.get_category_descriptions()
|