# csam_patterns / app.py — Hugging Face Space (author: LPX55, commit 840b896)
import re
from typing import List, Tuple
import gradio as gr
class ContentSafetyChecker:
    """Multi-layer heuristic filter for CSAM-related text prompts.

    Each ``check_*`` method is an independent detection layer; all layers
    are combined by :meth:`is_content_suspicious`.  The heuristics
    deliberately favour recall over precision, so benign prompts may be
    flagged (false positives are accepted by design in this domain).
    """

    def __init__(self):
        # Pre-compile regex patterns once so repeated checks stay cheap.
        self.csam_patterns = self._compile_csam_patterns()
        self.suspicious_combinations = self._compile_suspicious_combinations()

    def _compile_csam_patterns(self) -> List[re.Pattern]:
        """Compile the CSAM detection regexes (all case-insensitive)."""
        patterns = [
            # Direct CSAM references
            r'\b(csam|child.*sexual|minor.*abuse|illegal.*content)',
            # Age term + sexual content, in either order
            r'\b(child|kid|baby|toddler|infant|minor|youngster|youth|teen).*\b(sex|nude|naked|pussy|penis|vagina)',
            r'\b(sex|nude|naked|porn|erotic|seductive|lust|breast|genital|penis|vagina).*\b(child|kid|baby|toddler|infant|minor|youngster|youth|teen)',
            # Numerical age (0-17) with sexual content
            r'\b(?:age|aged?|years? old)\s*[=\-:]\s*(?:[0-9]|1[0-7])\b.*\b(sex|nude|naked|porn|erotic)',
            r'\b([0-9]|1[0-7])\s*(?:years? old|yo|y/o)\b.*\b(sex|nude|naked|porn|erotic)',
            # Numbers 3-17 directly followed by sexual content terms
            r'\b([3-9]|1[0-7])\s*[,\-]?\s*(sex|nude|naked|porn|erotic|seductive|explicit)',
            # Descriptive age limits ("under 16", "below 12", ...)
            r'\b(under\s*(1[0-8]|[0-9])|below\s*(1[0-8]|[0-9])|less than\s*(1[0-8]|[0-9]))\s*years?\s*[,\-]?\s*(sex|nude|naked|porn|erotic)',
            # Known abusive scenarios / terminology
            r'\b(pedophil|lolita|underage.*sex|minor.*sex|illegal.*sex)',
            # Grooming language
            r'\b(inappropriate.*adult.*child|adult.*minor.*relationship|groom)',
        ]
        return [re.compile(pattern, re.IGNORECASE) for pattern in patterns]

    def _compile_suspicious_combinations(self) -> List[Tuple[List[str], List[str], int]]:
        """Return (age_terms, sexual_terms, max_token_distance) triples.

        Entries may be multi-word phrases (e.g. 'young teen');
        :meth:`check_word_proximity` matches those as consecutive tokens.
        """
        return [
            # Age indicators + sexual content
            (['4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14',
              '15', '16', '17',
              'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
              'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
              'sixteen', 'seventeen',
              'young teen', 'minor', 'child', 'kid', 'baby'],
             ['nude', 'naked', 'sex', 'porn', 'erotic', 'seductive',
              'penis', 'vagina', 'penetration', 'cock', 'pussy'],
             8),
            (['adult'],
             ['pedo', 'loli', 'secret', 'rape'],
             5),
        ]

    def check_pattern_matching(self, prompt: str) -> bool:
        """First line of defense: any pre-compiled regex hit flags."""
        for pattern in self.csam_patterns:
            if pattern.search(prompt):
                print(f"🚨 Pattern matched: {pattern.pattern}")
                return True
        return False

    def check_numerical_age_context(self, prompt: str) -> bool:
        """Check for numerical ages in suspicious contexts.

        Flags (a) any bare number in the minor range (3-17) when a sexual
        term occurs anywhere in the prompt, and (b) age-limit phrases such
        as "under 16" followed later in the prompt by a sexual term.
        """
        prompt_lower = prompt.lower()
        # Suspicious sexual content terms
        sexual_terms = ['nude', 'naked', 'sex', 'porn', 'erotic', 'seductive', 'explicit', 'undress', 'penis', 'vagina', 'pussy', 'cock']

        # (a) bare numbers in the minor range; int() cannot fail on a \d+
        # match, so no try/except is needed here.
        for num_str in re.findall(r'\b\d+\b', prompt_lower):
            age = int(num_str)
            if 3 <= age <= 17 and any(term in prompt_lower for term in sexual_terms):
                print(f"🚨 Suspicious age with sexual content: {age}")
                return True
        # NOTE: the original additionally re-tested "N year old" phrases
        # here, but its condition was identical to the check above and
        # could therefore never fire — dead code, removed.

        # (b) age-limit phrases followed (anywhere later) by a sexual term.
        age_limit_phrases = [
            r'\bunder\s*(1[0-8]|[0-9])\b',
            r'\bbelow\s*(1[0-8]|[0-9])\b',
            r'\bless than\s*(1[0-8]|[0-9])\b',
            r'\b[0-9]{1,2}\s*and under\b'
        ]
        for phrase_pattern in age_limit_phrases:
            match = re.search(phrase_pattern, prompt_lower)  # single search (was run twice)
            if match:
                remaining_text = prompt_lower[match.end():]
                if any(term in remaining_text for term in sexual_terms):
                    print(f"🚨 Age limit phrase with sexual content: {match.group()}")
                    return True
        return False

    def check_age_sexual_combinations(self, prompt: str) -> bool:
        """Age term + explicit number, with sexual content within ±50 chars."""
        prompt_lower = prompt.lower()
        age_sexual_patterns = [
            r'(child|kid|baby|toddler|infant|minor|youngster|teen|youth)\s*(?:of|is|was)?\s*(\d{1,2})\s*(years?\s*old)?',
            r'(\d{1,2})\s*(years?\s*old|yo|y/o)?\s*(child|kid|baby|toddler|infant|minor|young)',
        ]
        sexual_content_patterns = ['nude', 'naked', 'sex', 'porn', 'erotic', 'explicit', 'sexual', 'penis', 'vagina', 'pussy', 'cock']
        for age_pattern in age_sexual_patterns:
            for match in re.finditer(age_pattern, prompt_lower):
                # The age number sits in a different capture group for each
                # pattern; take the first all-digit group.
                age = next((int(g) for g in match.groups() if g and g.isdigit()), None)
                if age is not None and 3 <= age <= 17:  # minor age range
                    # Look for sexual content within 50 chars either side.
                    start_pos = max(0, match.start() - 50)
                    end_pos = min(len(prompt_lower), match.end() + 50)
                    context = prompt_lower[start_pos:end_pos]
                    if any(term in context for term in sexual_content_patterns):
                        print(f"🚨 Age-sexual content combination detected: age {age}")
                        return True
        return False

    def check_word_proximity(self, prompt: str) -> bool:
        """Second line: age terms near sexual terms (token distance).

        Fixes two defects of the original implementation:
        * multi-word entries such as 'young teen' are matched as runs of
          consecutive tokens (previously they could never match an entry
          of a whitespace-split token list);
        * every occurrence of each term is considered, not only the first
          occurrence returned by list.index().
        """
        tokens = prompt.lower().split()

        def _positions(term: str) -> List[int]:
            # Start indices where the (possibly multi-word) term occurs.
            parts = term.split()
            span = len(parts)
            return [i for i in range(len(tokens) - span + 1)
                    if tokens[i:i + span] == parts]

        for age_words, sexual_words, distance_threshold in self.suspicious_combinations:
            for age_word in age_words:
                age_positions = _positions(age_word)
                if not age_positions:
                    continue
                for sexual_word in sexual_words:
                    for s_idx in _positions(sexual_word):
                        if any(abs(a_idx - s_idx) <= distance_threshold
                               for a_idx in age_positions):
                            return True
        return False

    def simple_age_check(self, prompt: str) -> bool:
        """Fallback check: any number 3-17 plus any sexual term anywhere."""
        prompt_lower = prompt.lower()  # hoisted; was recomputed per number
        sexual_terms = ['nude', 'naked', 'sex', 'porn', 'erotic', 'seductive', 'explicit', 'penis', 'vagina', 'pussy', 'cock']
        if not any(term in prompt_lower for term in sexual_terms):
            return False
        return any(3 <= int(num) <= 17 for num in re.findall(r'\b\d+\b', prompt))

    def check_suspicious_phrases(self, prompt: str) -> bool:
        """Third line: exact suspicious phrase substrings."""
        suspicious_phrases = [
            'young model', 'teen photoshoot', 'minor content',
            'child art', 'kids only', 'no adults allowed',
            'private session', 'secret photos', 'hidden content'
        ]
        prompt_lower = prompt.lower()
        return any(phrase in prompt_lower for phrase in suspicious_phrases)

    def check_evasion_patterns(self, prompt: str) -> bool:
        """Fourth line: crude obfuscation probes, gated on age terms.

        NOTE(review): these regexes are extremely broad (the 'n..d' probe
        matches ordinary words like "and"); they only fire when an
        age-related word is also present, but false positives remain
        likely — tune with care.
        """
        evasion_indicators = [
            r'p\w*is',
            r'p\w*sy',
            r's\w*x',
            r'n\w*d\w*',
        ]
        age_indicators = ['old', 'teen', 'baby', 'kid', 'child', 'minor', 'young']
        for pattern in evasion_indicators:
            if re.search(pattern, prompt, re.IGNORECASE):
                # Only flag when combined with an age-related term.
                if any(indicator in prompt.lower() for indicator in age_indicators):
                    return True
        return False

    def check_numerical_age_indicators(self, prompt: str) -> bool:
        """Fifth line: minor-range numbers combined with sexual indicators.

        NOTE(review): 'off' is matched as a bare substring and occurs in
        many benign words ("office") — probably over-broad; confirm intent.
        """
        prompt_lower = prompt.lower()  # hoisted out of the per-number loop
        sexual_indicators = ['nude', 'naked', 'sex', 'porn', 'erotic', 'nudify', 'off', 'penis', 'cock', 'pussy', 'vagina', 'insert']
        if not any(indicator in prompt_lower for indicator in sexual_indicators):
            return False
        # Flag ages typically associated with minors.
        return any(3 <= int(num_str) <= 17 for num_str in re.findall(r'\b\d+\b', prompt))

    def check_contextual_red_flags(self, prompt: str) -> bool:
        """Sixth line: suspicious context pattern plus a trigger word."""
        red_flag_contexts = [
            # Suspicious session types
            (r'(private|secret|hidden|exclusive).*photo', ['nude', 'naked', 'photos']),
            # Suspicious content descriptions
            (r'(special|unique|exclusive|view|angle|pov|shot).*content', ['young', 'minor', 'teen'])
        ]
        prompt_lower = prompt.lower()
        for context_pattern, suspicious_words in red_flag_contexts:
            if re.search(context_pattern, prompt_lower):
                if any(word in prompt_lower for word in suspicious_words):
                    return True
        return False

    def is_content_suspicious(self, prompt: str) -> Tuple[bool, str]:
        """Run every layer in order; return (flagged, human-readable reason).

        A layer that raises is logged and skipped so that one buggy check
        cannot disable the whole filter.
        """
        checks = [
            (self.check_pattern_matching, "Pattern matching detected suspicious content"),
            (self.check_word_proximity, "Suspicious word proximity detected"),
            (self.check_suspicious_phrases, "Suspicious phrases detected"),
            (self.check_evasion_patterns, "Potential evasion patterns detected"),
            (self.check_numerical_age_indicators, "Suspicious age indicators with sexual content"),
            (self.check_contextual_red_flags, "Contextual red flags detected"),
            (self.simple_age_check, "Simple age check detected suspicious content"),
            (self.check_numerical_age_context, "Numerical age context check detected suspicious content"),
            (self.check_age_sexual_combinations, "Age-sexual combination check detected suspicious content")
        ]
        for check_func, message in checks:
            try:
                if check_func(prompt):
                    return True, message
            except Exception as e:
                print(f"Warning: Safety check {check_func.__name__} failed: {e}")
                continue
        return False, "Content appears safe"
# Enhanced safety function
def comprehensive_safety_check(prompt: str) -> Tuple[bool, str]:
    """Multi-layer safety checking with fallback mechanisms.

    Returns ``(blocked, reason)``.  Any internal failure fails closed:
    the prompt is blocked rather than allowed through.
    """
    try:
        # A fresh checker per call keeps every evaluation stateless.
        checker = ContentSafetyChecker()
        flagged, why = checker.is_content_suspicious(prompt)
        if flagged:
            return True, why

        # Cheap structural heuristics, run only when the main pass is clean.
        fallback_checks = (
            lambda p: len(p) > 1000,       # unusually long prompt (potential obfuscation)
            lambda p: p.count('"') > 20,   # excessive quoting (potential code injection)
            lambda p: '||' in p or '&&' in p,  # shell command operators
            lambda p: any(marker in p for marker in ('<script', 'javascript:', 'onload=')),  # basic XSS markers
        )
        for index, probe in enumerate(fallback_checks, start=1):
            try:
                hit = probe(prompt)
            except Exception:
                continue
            if hit:
                return True, f"Fallback safety check #{index} triggered"
        return False, "All safety checks passed"
    except Exception as e:
        print(f"Safety check failed, erring on caution: {e}")
        return True, "Safety system error - content blocked for caution"
# Usage in your application
def test_prompt(original_prompt: str) -> str:
    """Run the safety pipeline on *original_prompt*; return the prompt to use.

    Blocked prompts are appended to ``safety_blocks.log`` (best effort)
    and replaced by a neutral fallback prompt.
    """
    is_blocked, reason = comprehensive_safety_check(original_prompt)
    if is_blocked:
        print(f"🚨 SAFETY BLOCK: {reason} - Prompt: {original_prompt[:100]}...")
        # Log the blocked content (for monitoring); never let logging
        # failures break the request.
        try:
            # encoding pinned: prompts may contain non-ASCII, which raised
            # UnicodeEncodeError on platforms with a non-UTF-8 default.
            with open("safety_blocks.log", "a", encoding="utf-8") as f:
                f.write(f"{reason}: {original_prompt}\n")
        except Exception:
            print("Warning: Could not write to safety log")
        return "A professional and appropriate image editing task"  # Safe fallback
    # If safe, proceed with normal enhancement
    return f"Regex safety check passed: {original_prompt}"
# ---------------------------------------------------------------------------
# Gradio demo UI for exercising the safety checker interactively.
# ---------------------------------------------------------------------------
import gradio as gr


def process_prompt(prompt):
    """Run the full safety pipeline on *prompt* and format the verdict."""
    if not prompt or not prompt.strip():
        return "Please enter a prompt to test."
    is_blocked, reason = comprehensive_safety_check(prompt)
    if is_blocked:
        return f"🚨 BLOCKED: {reason}\n\nOriginal prompt: {prompt}"
    return f"✅ SAFE: {reason}\n\nOriginal prompt: {prompt}"


def clear_inputs():
    """Reset both the input box and the result box."""
    return "", ""


with gr.Blocks(title="Content Safety Checker | Testing Zone") as demo:
    gr.Markdown("""
# 🛡️ Regex-based Content Safety Checker
Some of you guys need to seek help...
""")
    with gr.Row():
        with gr.Column(scale=1):
            input_prompt = gr.Textbox(
                label="Test Prompt",
                placeholder="Type a prompt to check for safety...",
                lines=5,
                max_lines=10,
            )
            check_button = gr.Button("Check Safety", variant="primary")
            clear_button = gr.Button("Clear", variant="secondary")
        with gr.Column(scale=2):
            output_result = gr.Textbox(
                label="Safety Check Result",
                interactive=False,
                lines=10,
                max_lines=15,
            )
            safety_info = gr.HTML(
                value="<div style='padding:15px; margin-top:15px; background: #f8f9fa; border-radius: 8px;'>"
                "<h4>ℹ️ How it works:</h4>"
                "<ul>"
                "<li>Multi-layer safety checking system</li>"
                "<li>Detects age-related terms combined with sexual content</li>"
                "<li>Identifies potential CSAM/illegal content patterns</li>"
                "<li>Checks for evasion techniques and suspicious combinations</li>"
                "</ul>"
                "</div>"
            )

    # Clicking the button and pressing Enter both trigger the same check.
    for trigger in (check_button.click, input_prompt.submit):
        trigger(fn=process_prompt, inputs=input_prompt, outputs=output_result)

    clear_button.click(
        fn=clear_inputs,
        inputs=None,
        outputs=[input_prompt, output_result],
    )

demo.launch()