onkar-waghmode committed on
Commit ee07330 · 1 Parent(s): 51ef0d5
Files changed (3)
  1. app v.1.py +593 -0
  2. app.py +532 -197
  3. requirements.txt +2 -0
app v.1.py ADDED
@@ -0,0 +1,593 @@
1
+ import gradio as gr
2
+ import random
3
+ import nltk
4
+ import re
5
+ import spacy
6
+ from nltk.corpus import wordnet, stopwords
7
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
8
+ from sentence_transformers import SentenceTransformer
9
+ import torch
10
+ import numpy as np
11
+ from typing import List, Dict, Tuple
12
+ import logging
13
+ from transformers import pipeline
14
+
15
+
16
+ # Setup logging
17
+ logging.basicConfig(level=logging.INFO)
18
+ logger = logging.getLogger(__name__)
19
+
20
+ # Download NLTK data
21
+ print("Downloading NLTK data...")
22
+ for data in ['punkt','punkt_tab', 'wordnet', 'averaged_perceptron_tagger', 'stopwords', 'omw-1.4', 'averaged_perceptron_tagger_eng']:
23
+ try:
24
+ nltk.data.find(f'{data}')
25
+ except:
26
+ nltk.download(data, quiet=True)
27
+
28
+ # Load models globally
29
+ print("Loading models...")
30
+ device = "cuda" if torch.cuda.is_available() else "cpu"
31
+ print(f"Using device: {device}")
32
+
33
+ t5_tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
34
+ t5_model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws")
35
+ t5_model.to(device)
36
+
37
+
38
+ similarity_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)
39
+ nlp = spacy.load("en_core_web_sm")
40
+
41
+
42
+ ai_detector_pipe = pipeline("text-classification", model="Hello-SimpleAI/chatgpt-detector-roberta")
43
+
44
+ print("Models loaded successfully!")
45
+
46
+
47
+
48
+ # ============================================================================
49
+ # STAGE 1: PARAPHRASING WITH T5 MODEL
50
+ # ============================================================================
51
+ def paraphrase_text(text: str, max_length: int = 512, num_beams: int = 4,
52
+ temperature: float = 0.7, top_p: float = 0.9,
53
+ repetition_penalty: float = 1.2, length_penalty: float = 1.0) -> str:
54
+ """Paraphrase text using T5 model"""
55
+ try:
56
+ input_text = f"paraphrase: {text.strip()}"
57
+ inputs = t5_tokenizer(input_text, return_tensors="pt",
58
+ max_length=512, truncation=True, padding=True).to(device)
59
+
60
+ with torch.no_grad():
61
+ outputs = t5_model.generate(
62
+ **inputs,
63
+ max_length=max_length,
64
+ num_beams=num_beams,
65
+ num_return_sequences=1,
66
+ temperature=temperature,
67
+ do_sample=True if temperature > 0 else False,
68
+ top_p=top_p,
69
+ repetition_penalty=repetition_penalty,
70
+ length_penalty=length_penalty,
71
+ early_stopping=True
72
+ )
73
+
74
+ result = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
75
+ return result.strip()
76
+
77
+ except Exception as e:
78
+ logger.warning(f"Paraphrasing failed: {e}. Returning original text.")
79
+ return text
80
+
81
+ def paraphrase_long_text(text: str, max_length: int = 512, num_beams: int = 4,
82
+ temperature: float = 0.7, top_p: float = 0.9,
83
+ repetition_penalty: float = 1.2, length_penalty: float = 1.0) -> str:
84
+ """Handle long texts by breaking them into chunks"""
85
+ sentences = nltk.sent_tokenize(text)
86
+ paraphrased_sentences = []
87
+ current_chunk = ""
88
+
89
+ for sentence in sentences:
90
+ if len((current_chunk + " " + sentence).split()) > 80:
91
+ if current_chunk:
92
+ paraphrased = paraphrase_text(current_chunk, max_length, num_beams,
93
+ temperature, top_p, repetition_penalty, length_penalty)
94
+ paraphrased_sentences.append(paraphrased)
95
+ current_chunk = sentence
96
+ else:
97
+ current_chunk += " " + sentence if current_chunk else sentence
98
+
99
+ if current_chunk:
100
+ paraphrased = paraphrase_text(current_chunk, max_length, num_beams,
101
+ temperature, top_p, repetition_penalty, length_penalty)
102
+ paraphrased_sentences.append(paraphrased)
103
+
104
+ return " ".join(paraphrased_sentences)
105
+
106
+ # ============================================================================
107
+ # STAGE 2: SYNONYM REPLACEMENT
108
+ # ============================================================================
109
+ def get_synonyms(word: str, pos: str, max_synonyms: int = 3) -> List[str]:
110
+ """Get WordNet synonyms"""
111
+ pos_mapping = {
112
+ 'NN': wordnet.NOUN, 'NNS': wordnet.NOUN, 'NNP': wordnet.NOUN, 'NNPS': wordnet.NOUN,
113
+ 'VB': wordnet.VERB, 'VBD': wordnet.VERB, 'VBG': wordnet.VERB, 'VBN': wordnet.VERB,
114
+ 'VBP': wordnet.VERB, 'VBZ': wordnet.VERB,
115
+ 'JJ': wordnet.ADJ, 'JJR': wordnet.ADJ, 'JJS': wordnet.ADJ,
116
+ 'RB': wordnet.ADV, 'RBR': wordnet.ADV, 'RBS': wordnet.ADV
117
+ }
118
+
119
+ wn_pos = pos_mapping.get(pos, wordnet.NOUN)
120
+ synsets = wordnet.synsets(word.lower(), pos=wn_pos)
121
+
122
+ if not synsets:
123
+ synsets = wordnet.synsets(word.lower())
124
+
125
+ synonyms = []
126
+ for synset in synsets[:max_synonyms]:
127
+ for lemma in synset.lemmas()[:5]:
128
+ syn = lemma.name().replace('_', ' ')
129
+ if len(syn.split()) == 1 and syn.lower() != word.lower():
130
+ synonyms.append(syn)
131
+
132
+ return list(set(synonyms))
133
+
134
+ def synonym_replace(text: str, prob: float = 0.3, min_word_length: int = 3,
135
+ max_synonyms: int = 3) -> str:
136
+ """Replace words with synonyms"""
137
+ from nltk import pos_tag, word_tokenize
138
+
139
+ stop_words = set(stopwords.words('english'))
140
+ words = word_tokenize(text)
141
+ pos_tags = pos_tag(words)
142
+ new_words = []
143
+
144
+ for word, pos in pos_tags:
145
+ if not word.isalpha():
146
+ new_words.append(word)
147
+ continue
148
+
149
+ if word.lower() in stop_words or len(word) <= min_word_length:
150
+ new_words.append(word)
151
+ continue
152
+
153
+ if random.random() > prob:
154
+ new_words.append(word)
155
+ continue
156
+
157
+ synonyms = get_synonyms(word, pos, max_synonyms)
158
+ candidates = [s for s in synonyms if s.lower() != word.lower()]
159
+
160
+ if candidates:
161
+ replacement = random.choice(candidates)
162
+ new_words.append(replacement)
163
+ else:
164
+ new_words.append(word)
165
+
166
+ return ' '.join(new_words)
167
+
168
+ # ============================================================================
169
+ # STAGE 3: ACADEMIC DISCOURSE
170
+ # ============================================================================
171
+ def add_academic_discourse(text: str, hedge_prob: float = 0.2, booster_prob: float = 0.15,
172
+ connector_prob: float = 0.25, starter_prob: float = 0.1) -> str:
173
+ """Add academic discourse elements"""
174
+
175
+ contractions = {
176
+ "don't": "do not", "doesn't": "does not", "didn't": "did not",
177
+ "can't": "cannot", "couldn't": "could not", "shouldn't": "should not",
178
+ "wouldn't": "would not", "won't": "will not", "aren't": "are not",
179
+ "isn't": "is not", "wasn't": "was not", "weren't": "were not",
180
+ "haven't": "have not", "hasn't": "has not", "hadn't": "had not",
181
+ "I'm": "I am", "I've": "I have", "I'll": "I will", "I'd": "I would",
182
+ "you're": "you are", "you've": "you have", "you'll": "you will",
183
+ "we're": "we are", "we've": "we have", "we'll": "we will",
184
+ "they're": "they are", "they've": "they have", "they'll": "they will",
185
+ "it's": "it is", "that's": "that is", "there's": "there is", "what's": "what is"
186
+ }
187
+
188
+ hedges = [
189
+ "it appears that", "it is possible that", "the results suggest",
190
+ "it seems that", "there is evidence that", "it may be the case that",
191
+ "to some extent", "in general terms", "one could argue that"
192
+ ]
193
+
194
+ boosters = [
195
+ "clearly", "indeed", "in fact", "undoubtedly",
196
+ "without doubt", "it is evident that", "there is no question that"
197
+ ]
198
+
199
+ connectors = {
200
+ "contrast": ["however", "on the other hand", "in contrast", "nevertheless"],
201
+ "addition": ["moreover", "furthermore", "in addition", "what is more"],
202
+ "cause_effect": ["therefore", "thus", "as a result", "consequently", "hence"],
203
+ "example": ["for instance", "for example", "to illustrate"],
204
+ "conclusion": ["in conclusion", "overall", "in summary", "to sum up"]
205
+ }
206
+
207
+ sentence_starters = [
208
+ "It is important to note that",
209
+ "A key implication is that",
210
+ "The evidence indicates that",
211
+ "The findings suggest that",
212
+ "This demonstrates that",
213
+ "It should be emphasized that",
214
+ "From these observations, it follows that"
215
+ ]
216
+
217
+ # Expand contractions
218
+ for contraction, expansion in contractions.items():
219
+ pattern = re.compile(r'\b' + re.escape(contraction) + r'\b', re.IGNORECASE)
220
+ text = pattern.sub(expansion, text)
221
+
222
+ sentences = nltk.sent_tokenize(text)
223
+ modified = []
224
+
225
+ for i, sent in enumerate(sentences):
226
+ # Add hedge
227
+ if random.random() < hedge_prob and i > 0:
228
+ hedge = random.choice(hedges)
229
+ sent = f"{hedge}, {sent[0].lower() + sent[1:]}"
230
+
231
+ # Add booster
232
+ elif random.random() < booster_prob:
233
+ booster = random.choice(boosters)
234
+ sent = f"{booster.capitalize()}, {sent}"
235
+
236
+ # Add starter
237
+ elif random.random() < starter_prob and i > 0:
238
+ starter = random.choice(sentence_starters)
239
+ sent = f"{starter} {sent[0].lower() + sent[1:]}"
240
+
241
+ # Add connector
242
+ if i > 0 and random.random() < connector_prob:
243
+ conn_type = random.choice(list(connectors.keys()))
244
+ connector = random.choice(connectors[conn_type])
245
+ sent = f"{connector.capitalize()}, {sent[0].lower() + sent[1:]}"
246
+
247
+ modified.append(sent)
248
+
249
+ return ' '.join(modified)
250
+
251
+ # ============================================================================
252
+ # STAGE 4: SENTENCE STRUCTURE VARIATION
253
+ # ============================================================================
254
+ def vary_sentence_structure(text: str, split_prob: float = 0.4, merge_prob: float = 0.3,
255
+ min_split_length: int = 20, max_merge_length: int = 10) -> str:
256
+ """Vary sentence structure"""
257
+
258
+ connectors = {
259
+ "contrast": ["however", "nevertheless", "nonetheless", "in contrast"],
260
+ "addition": ["moreover", "furthermore", "in addition", "what is more"],
261
+ "cause_effect": ["therefore", "thus", "consequently", "as a result"],
262
+ "example": ["for example", "for instance", "to illustrate"],
263
+ "conclusion": ["in conclusion", "overall", "in summary"]
264
+ }
265
+
266
+ all_connectors = {c.lower() for group in connectors.values() for c in group}
267
+
268
+ def already_has_connector(sentence: str) -> bool:
269
+ lower_sent = sentence.strip().lower()
270
+ return any(lower_sent.startswith(conn) for conn in all_connectors)
271
+
272
+ def choose_connector_type(prev_sent: str, curr_sent: str) -> str:
273
+ curr_lower = curr_sent.lower()
274
+
275
+ if any(phrase in curr_lower for phrase in ["such as", "including", "for instance"]):
276
+ return "example"
277
+ elif curr_lower.startswith(("but", "although", "however")):
278
+ return "contrast"
279
+ elif any(phrase in curr_lower for phrase in ["because", "due to", "as a result"]):
280
+ return "cause_effect"
281
+
282
+ # Semantic similarity fallback
283
+ if prev_sent:
284
+ emb = similarity_model.encode([prev_sent, curr_sent])
285
+ score = np.dot(emb[0], emb[1]) / (np.linalg.norm(emb[0]) * np.linalg.norm(emb[1]))
286
+ return "addition" if score > 0.6 else "contrast"
287
+
288
+ return "addition"
289
+
290
+ doc = nlp(text)
291
+ sentences = list(doc.sents)
292
+ modified = []
293
+
294
+ for idx, sent in enumerate(sentences):
295
+ sent_text = sent.text.strip()
296
+ words = sent_text.split()
297
+
298
+ # Split long sentences
299
+ if len(words) > min_split_length and random.random() < split_prob:
300
+ split_points = [tok.i - sent.start for tok in sent if tok.dep_ in ("cc", "mark")]
301
+ if split_points:
302
+ split_point = random.choice(split_points)
303
+ tokens = list(sent)
304
+ if 0 < split_point < len(tokens):
305
+ first = ' '.join([t.text for t in tokens[:split_point]]).strip()
306
+ second = ' '.join([t.text for t in tokens[split_point+1:]]).strip()
307
+ if first and second and len(second.split()) > 3:
308
+ if random.random() < 0.5 and not already_has_connector(second):
309
+ conn_type = choose_connector_type(first, second)
310
+ connector = random.choice(connectors[conn_type])
311
+ second = f"{connector.capitalize()}, {second[0].lower() + second[1:]}"
312
+ modified.extend([first + '.', second])
313
+ continue
314
+
315
+ # Merge short sentences
316
+ if (modified and len(words) < max_merge_length and
317
+ len(modified[-1].split()) < max_merge_length and random.random() < merge_prob):
318
+ prev_sent = modified[-1]
319
+ if not already_has_connector(sent_text):
320
+ conn_type = choose_connector_type(prev_sent, sent_text)
321
+ connector = random.choice(connectors[conn_type])
322
+ combined = f"{prev_sent.rstrip('.')}; {connector}, {sent_text[0].lower() + sent_text[1:]}"
323
+ modified[-1] = combined
324
+ continue
325
+
326
+ modified.append(sent_text)
327
+
328
+ return ' '.join(modified)
329
+
330
+ # ============================================================================
331
+ # QUALITY CHECK
332
+ # ============================================================================
333
+ def calculate_similarity(text1: str, text2: str) -> float:
334
+ """Calculate semantic similarity between two texts"""
335
+ try:
336
+ embeddings = similarity_model.encode([text1.strip(), text2.strip()])
337
+ similarity = float(np.dot(embeddings[0], embeddings[1]) / (
338
+ np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1])
339
+ ))
340
+ similarity = round(similarity*100, 2)
341
+ return similarity
342
+ except Exception as e:
343
+ logger.error(f"Similarity calculation failed: {e}")
344
+ return 0.0
345
+
346
+
347
+ # ============================================================================
348
+ # AI Detection
349
+ # ============================================================================
350
+ def predict_ai_content(text):
351
+ if not text or not text.strip():
352
+ return "No input provided", 0.0
353
+
354
+ try:
355
+ result = ai_detector_pipe(text)
356
+ if isinstance(result, list) and len(result) > 0:
357
+ res = result[0]
358
+ ai_content_label = res.get('label', 'Unknown')
359
+ ai_content_score = round(float(res.get('score', 0)) * 100, 2)
360
+ return ai_content_label, ai_content_score
361
+ else:
362
+ return "Invalid response", 0.0
363
+ except Exception as e:
364
+ print(f"Error in prediction: {e}")
365
+ return "Error", 0.0
366
+
367
+
368
+ # ============================================================================
369
+ # MAIN HUMANIZER FUNCTION
370
+ # ============================================================================
371
+ def humanize_text(
372
+ input_text: str,
373
+ # Stage toggles
374
+ enable_stage1: bool,
375
+ enable_stage2: bool,
376
+ enable_stage3: bool,
377
+ enable_stage4: bool,
378
+ # Stage 1 parameters
379
+ temperature: float,
380
+ top_p: float,
381
+ num_beams: int,
382
+ max_length: int,
383
+ repetition_penalty: float,
384
+ length_penalty: float,
385
+ # Stage 2 parameters
386
+ synonym_prob: float,
387
+ min_word_length: int,
388
+ max_synonyms: int,
389
+ # Stage 3 parameters
390
+ hedge_prob: float,
391
+ booster_prob: float,
392
+ connector_prob: float,
393
+ starter_prob: float,
394
+ # Stage 4 parameters
395
+ split_prob: float,
396
+ merge_prob: float,
397
+ min_split_length: int,
398
+ max_merge_length: int
399
+ ):
400
+ """Main humanizer function that processes text through all enabled stages"""
401
+
402
+ if not input_text.strip():
403
+ return "", 0.0, "Please enter some text to humanize."
404
+
405
+ try:
406
+ result = input_text
407
+ stages_applied = []
408
+
409
+ # Stage 1: Paraphrasing
410
+ if enable_stage1:
411
+ word_count = len(result.split())
412
+ if word_count > 100:
413
+ result = paraphrase_long_text(result, max_length, num_beams, temperature,
414
+ top_p, repetition_penalty, length_penalty)
415
+ else:
416
+ result = paraphrase_text(result, max_length, num_beams, temperature,
417
+ top_p, repetition_penalty, length_penalty)
418
+ stages_applied.append("Paraphrasing")
419
+
420
+ # Stage 2: Synonym Replacement
421
+ if enable_stage2:
422
+ result = synonym_replace(result, synonym_prob, min_word_length, max_synonyms)
423
+ stages_applied.append("Synonym Replacement")
424
+
425
+ # Stage 3: Academic Discourse
426
+ if enable_stage3:
427
+ result = add_academic_discourse(result, hedge_prob, booster_prob,
428
+ connector_prob, starter_prob)
429
+ stages_applied.append("Academic Discourse")
430
+
431
+ # Stage 4: Sentence Structure
432
+ if enable_stage4:
433
+ result = vary_sentence_structure(result, split_prob, merge_prob,
434
+ min_split_length, max_merge_length)
435
+ stages_applied.append("Sentence Structure")
436
+
437
+ # Calculate similarity
438
+ similarity = calculate_similarity(input_text, result)
439
+ ai_content_label_generated, ai_content_score_generated = predict_ai_content(result)
440
+ ai_content_label_input, ai_content_score_input = predict_ai_content(input_text)
441
+
442
+ # Generate status message
443
+ if not stages_applied:
444
+ status = "⚠️ No stages enabled. Please enable at least one stage."
445
+ else:
446
+ status = f"✅ Successfully applied: {', '.join(stages_applied)}"
447
+
448
+ return result, similarity, status,ai_content_label_generated, ai_content_score_generated,ai_content_label_input, ai_content_score_input
449
+
450
+ except Exception as e:
451
+ logger.error(f"Error in humanization: {e}")
452
+ import traceback
453
+ traceback.print_exc()
454
+ return "", 0.0, f"❌ Error: {str(e)}"
455
+
456
+ # ============================================================================
457
+ # GRADIO INTERFACE
458
+ # ============================================================================
459
+ def create_gradio_interface():
460
+ """Create the Gradio interface"""
461
+
462
+ with gr.Blocks(theme=gr.themes.Soft(), title="Neural Humanizer") as demo:
463
+ gr.Markdown(
464
+ """
465
+ # ✍️ Neural Humanizer
466
+ Transform AI-generated text into natural, human-like language with precision, style, and control.
467
+ """
468
+ )
469
+
470
+ with gr.Row():
471
+ with gr.Column(scale=2):
472
+ input_text = gr.Textbox(
473
+ label="Input Text",
474
+ placeholder="Enter your text here to humanize...",
475
+ lines=10
476
+ )
477
+
478
+ with gr.Row():
479
+ submit_btn = gr.Button("🚀 Transform Text", variant="primary", size="lg")
480
+ clear_btn = gr.Button("🔄 Clear", size="lg")
481
+
482
+
483
+ output_text = gr.Textbox(
484
+ label="Humanized Output",
485
+ lines=10,
486
+ interactive=False
487
+ )
488
+
489
+ with gr.Row():
490
+ gr.Markdown("### Semantic Similarity & Status")
491
+
492
+ with gr.Row():
493
+ similarity_output = gr.Number(label="Content Similarity (%)", precision=2)
494
+ status_output = gr.Textbox(label="Status",interactive=False,lines=2, max_lines=10)
495
+
496
+ with gr.Row():
497
+ gr.Markdown("### Given Input Text Analysis")
498
+
499
+ with gr.Row():
500
+ ai_content_label_input = gr.Textbox(
501
+ label="Detected Content Type",
502
+ interactive=False,
503
+ lines=2,
504
+ max_lines=10
505
+ )
506
+ ai_content_score_input = gr.Number(
507
+ label="Model Confidence (%)",
508
+ precision=2,
509
+ interactive=False
510
+ )
511
+
512
+ with gr.Row():
513
+ gr.Markdown("### Humanized Text Analysis")
514
+
515
+ with gr.Row():
516
+ ai_content_label_generated = gr.Textbox(
517
+ label="Detected Content Type",
518
+ interactive=False,
519
+ lines=2,
520
+ max_lines=10
521
+ )
522
+
523
+ ai_content_score_generated = gr.Number(
524
+ label="Model Confidence (%)",
525
+ precision=2,
526
+ interactive=False
527
+ )
528
+
529
+
530
+
531
+ with gr.Column(scale=1):
532
+ gr.Markdown("## 🎛️ Pipeline Configuration")
533
+
534
+ with gr.Accordion("Stage Selection", open=True):
535
+ enable_stage1 = gr.Checkbox(label="Stage 1: Paraphrasing (T5)", value=True)
536
+ enable_stage2 = gr.Checkbox(label="Stage 2: Lexical Diversification", value=True)
537
+ enable_stage3 = gr.Checkbox(label="Stage 3: Discourse Enrichment", value=True)
538
+ enable_stage4 = gr.Checkbox(label="Stage 4: Structural Variation", value=True)
539
+
540
+
541
+ with gr.Accordion("Stage 1: Paraphrasing Parameters", open=False):
542
+ temperature = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
543
+ top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
544
+ num_beams = gr.Slider(1, 10, value=4, step=1, label="Num Beams")
545
+ max_length = gr.Slider(128, 1024, value=512, step=64, label="Max Length")
546
+ repetition_penalty = gr.Slider(1.0, 2.0, value=1.2, step=0.1, label="Repetition Penalty")
547
+ length_penalty = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Length Penalty")
548
+
549
+ with gr.Accordion("Stage 2: Synonym Replacement Parameters", open=False):
550
+ synonym_prob = gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Replacement Probability")
551
+ min_word_length = gr.Slider(2, 8, value=3, step=1, label="Min Word Length")
552
+ max_synonyms = gr.Slider(1, 10, value=3, step=1, label="Max Synonyms")
553
+
554
+ with gr.Accordion("Stage 3: Academic Discourse Parameters", open=False):
555
+ hedge_prob = gr.Slider(0.0, 0.5, value=0.2, step=0.05, label="Hedge Probability")
556
+ booster_prob = gr.Slider(0.0, 0.5, value=0.15, step=0.05, label="Booster Probability")
557
+ connector_prob = gr.Slider(0.0, 0.5, value=0.25, step=0.05, label="Connector Probability")
558
+ starter_prob = gr.Slider(0.0, 0.3, value=0.1, step=0.05, label="Starter Probability")
559
+
560
+ with gr.Accordion("Stage 4: Sentence Structure Parameters", open=False):
561
+ split_prob = gr.Slider(0.0, 1.0, value=0.4, step=0.05, label="Split Probability")
562
+ merge_prob = gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Merge Probability")
563
+ min_split_length = gr.Slider(10, 40, value=20, step=5, label="Min Split Length (words)")
564
+ max_merge_length = gr.Slider(5, 20, value=10, step=1, label="Max Merge Length (words)")
565
+
566
+ # Event handlers
567
+ submit_btn.click(
568
+ fn=humanize_text,
569
+ inputs=[
570
+ input_text,
571
+ enable_stage1, enable_stage2, enable_stage3, enable_stage4,
572
+ temperature, top_p, num_beams, max_length, repetition_penalty, length_penalty,
573
+ synonym_prob, min_word_length, max_synonyms,
574
+ hedge_prob, booster_prob, connector_prob, starter_prob,
575
+ split_prob, merge_prob, min_split_length, max_merge_length
576
+ ],
577
+ outputs=[output_text, similarity_output, status_output, ai_content_label_generated, ai_content_score_generated, ai_content_label_input, ai_content_score_input]
578
+ )
579
+
580
+ clear_btn.click(
581
+ fn=lambda: ("", "", 0.0, "","", 0.0, "", 0.0),
582
+ inputs=[],
583
+ outputs=[input_text, output_text, similarity_output, status_output, ai_content_label_generated, ai_content_score_generated, ai_content_label_input, ai_content_score_input]
584
+ )
585
+
586
+ return demo
587
+
588
+ # ============================================================================
589
+ # LAUNCH
590
+ # ============================================================================
591
+ if __name__ == "__main__":
592
+ demo = create_gradio_interface()
593
+ demo.launch(share=True, server_name="0.0.0.0", server_port=7860)
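
A quick way to sanity-check the four-stage pipeline in app v.1.py is to call the stage functions directly before launching the UI. The snippet below is an illustrative sketch only (it is not part of this commit, and the sample string is made up); it assumes it runs in the same interpreter session after the models above have finished loading.

# Illustrative smoke test for the app v.1.py pipeline (not part of this commit).
sample = "AI systems can't reason the way humans do, but they are improving quickly."
step1 = paraphrase_text(sample)                        # Stage 1: T5 paraphrase
step2 = synonym_replace(step1, prob=0.3)               # Stage 2: WordNet synonym swaps
step3 = add_academic_discourse(step2, hedge_prob=0.2)  # Stage 3: hedges, boosters, connectors
step4 = vary_sentence_structure(step3)                 # Stage 4: sentence splits and merges
print("similarity (%):", calculate_similarity(sample, step4))
print("AI detector:", predict_ai_content(step4))
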
app.py CHANGED
@@ -4,18 +4,20 @@ import nltk
4
  import re
5
  import spacy
6
  from nltk.corpus import wordnet, stopwords
 
 
7
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
8
- from sentence_transformers import SentenceTransformer
9
  import torch
10
  import numpy as np
11
- from typing import List, Dict, Tuple
12
- import logging
13
  from transformers import pipeline
 
 
14
 
 
15
 
16
- # Setup logging
17
- logging.basicConfig(level=logging.INFO)
18
- logger = logging.getLogger(__name__)
19
 
20
  # Download NLTK data
21
  print("Downloading NLTK data...")
@@ -34,7 +36,7 @@ t5_tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
34
  t5_model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws")
35
  t5_model.to(device)
36
 
37
-
38
  similarity_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)
39
  nlp = spacy.load("en_core_web_sm")
40
 
@@ -103,229 +105,549 @@ def paraphrase_long_text(text: str, max_length: int = 512, num_beams: int = 4,
103
 
104
  return " ".join(paraphrased_sentences)
105
 
 
106
  # ============================================================================
107
- # STAGE 2: SYNONYM REPLACEMENT
108
  # ============================================================================
109
- def get_synonyms(word: str, pos: str, max_synonyms: int = 3) -> List[str]:
110
- """Get WordNet synonyms"""
111
- pos_mapping = {
112
- 'NN': wordnet.NOUN, 'NNS': wordnet.NOUN, 'NNP': wordnet.NOUN, 'NNPS': wordnet.NOUN,
113
- 'VB': wordnet.VERB, 'VBD': wordnet.VERB, 'VBG': wordnet.VERB, 'VBN': wordnet.VERB,
114
- 'VBP': wordnet.VERB, 'VBZ': wordnet.VERB,
115
- 'JJ': wordnet.ADJ, 'JJR': wordnet.ADJ, 'JJS': wordnet.ADJ,
116
- 'RB': wordnet.ADV, 'RBR': wordnet.ADV, 'RBS': wordnet.ADV
117
- }
118
-
119
- wn_pos = pos_mapping.get(pos, wordnet.NOUN)
120
- synsets = wordnet.synsets(word.lower(), pos=wn_pos)
121
-
122
- if not synsets:
123
- synsets = wordnet.synsets(word.lower())
124
-
125
- synonyms = []
126
- for synset in synsets[:max_synonyms]:
127
- for lemma in synset.lemmas()[:5]:
128
- syn = lemma.name().replace('_', ' ')
129
- if len(syn.split()) == 1 and syn.lower() != word.lower():
130
- synonyms.append(syn)
131
-
132
- return list(set(synonyms))
133
 
134
- def synonym_replace(text: str, prob: float = 0.3, min_word_length: int = 3,
135
- max_synonyms: int = 3) -> str:
136
- """Replace words with synonyms"""
137
- from nltk import pos_tag, word_tokenize
138
-
139
- stop_words = set(stopwords.words('english'))
140
- words = word_tokenize(text)
141
- pos_tags = pos_tag(words)
142
- new_words = []
143
-
144
- for word, pos in pos_tags:
145
- if not word.isalpha():
146
- new_words.append(word)
147
- continue
148
 
149
- if word.lower() in stop_words or len(word) <= min_word_length:
150
- new_words.append(word)
151
- continue
 
 
 
 
 
 
152
 
153
- if random.random() > prob:
154
- new_words.append(word)
155
- continue
156
 
157
- synonyms = get_synonyms(word, pos, max_synonyms)
158
- candidates = [s for s in synonyms if s.lower() != word.lower()]
159
 
160
- if candidates:
161
- replacement = random.choice(candidates)
162
- new_words.append(replacement)
163
- else:
164
- new_words.append(word)
 
 
 
 
165
 
166
- return ' '.join(new_words)
167
 
168
  # ============================================================================
169
- # STAGE 3: ACADEMIC DISCOURSE
170
  # ============================================================================
171
- def add_academic_discourse(text: str, hedge_prob: float = 0.2, booster_prob: float = 0.15,
172
- connector_prob: float = 0.25, starter_prob: float = 0.1) -> str:
173
- """Add academic discourse elements"""
174
-
175
- contractions = {
176
- "don't": "do not", "doesn't": "does not", "didn't": "did not",
177
- "can't": "cannot", "couldn't": "could not", "shouldn't": "should not",
178
- "wouldn't": "would not", "won't": "will not", "aren't": "are not",
179
- "isn't": "is not", "wasn't": "was not", "weren't": "were not",
180
- "haven't": "have not", "hasn't": "has not", "hadn't": "had not",
181
- "I'm": "I am", "I've": "I have", "I'll": "I will", "I'd": "I would",
182
- "you're": "you are", "you've": "you have", "you'll": "you will",
183
- "we're": "we are", "we've": "we have", "we'll": "we will",
184
- "they're": "they are", "they've": "they have", "they'll": "they will",
185
- "it's": "it is", "that's": "that is", "there's": "there is", "what's": "what is"
186
- }
187
-
188
- hedges = [
189
- "it appears that", "it is possible that", "the results suggest",
190
- "it seems that", "there is evidence that", "it may be the case that",
191
- "to some extent", "in general terms", "one could argue that"
192
- ]
193
-
194
- boosters = [
195
- "clearly", "indeed", "in fact", "undoubtedly",
196
- "without doubt", "it is evident that", "there is no question that"
197
- ]
198
-
199
- connectors = {
200
- "contrast": ["however", "on the other hand", "in contrast", "nevertheless"],
201
- "addition": ["moreover", "furthermore", "in addition", "what is more"],
202
- "cause_effect": ["therefore", "thus", "as a result", "consequently", "hence"],
203
- "example": ["for instance", "for example", "to illustrate"],
204
- "conclusion": ["in conclusion", "overall", "in summary", "to sum up"]
205
- }
206
-
207
- sentence_starters = [
208
- "It is important to note that",
209
- "A key implication is that",
210
- "The evidence indicates that",
211
- "The findings suggest that",
212
- "This demonstrates that",
213
- "It should be emphasized that",
214
- "From these observations, it follows that"
215
- ]
 
216
 
217
- # Expand contractions
218
- for contraction, expansion in contractions.items():
219
- pattern = re.compile(r'\b' + re.escape(contraction) + r'\b', re.IGNORECASE)
220
- text = pattern.sub(expansion, text)
221
 
222
- sentences = nltk.sent_tokenize(text)
223
- modified = []
 
 
 
 
224
 
225
- for i, sent in enumerate(sentences):
226
- # Add hedge
227
- if random.random() < hedge_prob and i > 0:
228
- hedge = random.choice(hedges)
229
- sent = f"{hedge}, {sent[0].lower() + sent[1:]}"
230
-
231
- # Add booster
232
- elif random.random() < booster_prob:
233
- booster = random.choice(boosters)
234
- sent = f"{booster.capitalize()}, {sent}"
235
-
236
- # Add starter
237
- elif random.random() < starter_prob and i > 0:
238
- starter = random.choice(sentence_starters)
239
- sent = f"{starter} {sent[0].lower() + sent[1:]}"
240
-
241
- # Add connector
242
- if i > 0 and random.random() < connector_prob:
243
- conn_type = random.choice(list(connectors.keys()))
244
- connector = random.choice(connectors[conn_type])
245
- sent = f"{connector.capitalize()}, {sent[0].lower() + sent[1:]}"
246
 
247
- modified.append(sent)
248
 
249
- return ' '.join(modified)
250
 
251
  # ============================================================================
252
  # STAGE 4: SENTENCE STRUCTURE VARIATION
253
  # ============================================================================
254
- def vary_sentence_structure(text: str, split_prob: float = 0.4, merge_prob: float = 0.3,
255
- min_split_length: int = 20, max_merge_length: int = 10) -> str:
256
- """Vary sentence structure"""
257
-
 
 
 
 
 
 
 
 
258
  connectors = {
259
  "contrast": ["however", "nevertheless", "nonetheless", "in contrast"],
260
- "addition": ["moreover", "furthermore", "in addition", "what is more"],
261
  "cause_effect": ["therefore", "thus", "consequently", "as a result"],
262
  "example": ["for example", "for instance", "to illustrate"],
263
  "conclusion": ["in conclusion", "overall", "in summary"]
264
  }
265
-
266
  all_connectors = {c.lower() for group in connectors.values() for c in group}
267
-
268
- def already_has_connector(sentence: str) -> bool:
269
- lower_sent = sentence.strip().lower()
270
- return any(lower_sent.startswith(conn) for conn in all_connectors)
271
-
 
 
 
 
 
 
272
  def choose_connector_type(prev_sent: str, curr_sent: str) -> str:
273
  curr_lower = curr_sent.lower()
274
-
275
- if any(phrase in curr_lower for phrase in ["such as", "including", "for instance"]):
 
276
  return "example"
277
- elif curr_lower.startswith(("but", "although", "however")):
278
  return "contrast"
279
- elif any(phrase in curr_lower for phrase in ["because", "due to", "as a result"]):
280
  return "cause_effect"
281
-
282
- # Semantic similarity fallback
283
- if prev_sent:
284
- emb = similarity_model.encode([prev_sent, curr_sent])
285
- score = np.dot(emb[0], emb[1]) / (np.linalg.norm(emb[0]) * np.linalg.norm(emb[1]))
286
- return "addition" if score > 0.6 else "contrast"
287
-
288
- return "addition"
289
-
290
  doc = nlp(text)
291
- sentences = list(doc.sents)
292
  modified = []
293
-
294
- for idx, sent in enumerate(sentences):
295
- sent_text = sent.text.strip()
296
- words = sent_text.split()
297
-
298
- # Split long sentences
299
  if len(words) > min_split_length and random.random() < split_prob:
300
- split_points = [tok.i - sent.start for tok in sent if tok.dep_ in ("cc", "mark")]
301
- if split_points:
302
- split_point = random.choice(split_points)
303
- tokens = list(sent)
304
- if 0 < split_point < len(tokens):
305
- first = ' '.join([t.text for t in tokens[:split_point]]).strip()
306
- second = ' '.join([t.text for t in tokens[split_point+1:]]).strip()
307
- if first and second and len(second.split()) > 3:
308
- if random.random() < 0.5 and not already_has_connector(second):
309
- conn_type = choose_connector_type(first, second)
310
- connector = random.choice(connectors[conn_type])
311
- second = f"{connector.capitalize()}, {second[0].lower() + second[1:]}"
312
- modified.extend([first + '.', second])
 
313
  continue
314
-
315
- # Merge short sentences
316
- if (modified and len(words) < max_merge_length and
317
- len(modified[-1].split()) < max_merge_length and random.random() < merge_prob):
318
- prev_sent = modified[-1]
319
- if not already_has_connector(sent_text):
320
- conn_type = choose_connector_type(prev_sent, sent_text)
321
- connector = random.choice(connectors[conn_type])
322
- combined = f"{prev_sent.rstrip('.')}; {connector}, {sent_text[0].lower() + sent_text[1:]}"
323
- modified[-1] = combined
324
- continue
325
-
326
- modified.append(sent_text)
327
 
328
- return ' '.join(modified)
 
 
 
 
329
 
330
  # ============================================================================
331
  # QUALITY CHECK
@@ -399,6 +721,8 @@ def humanize_text(
399
  ):
400
  """Main humanizer function that processes text through all enabled stages"""
401
 
 
 
402
  if not input_text.strip():
403
  return "", 0.0, "Please enter some text to humanize."
404
 
@@ -419,13 +743,21 @@ def humanize_text(
419
 
420
  # Stage 2: Synonym Replacement
421
  if enable_stage2:
422
- result = synonym_replace(result, synonym_prob, min_word_length, max_synonyms)
 
 
 
 
 
 
 
423
  stages_applied.append("Synonym Replacement")
424
 
425
  # Stage 3: Academic Discourse
426
  if enable_stage3:
427
- result = add_academic_discourse(result, hedge_prob, booster_prob,
428
- connector_prob, starter_prob)
 
429
  stages_applied.append("Academic Discourse")
430
 
431
  # Stage 4: Sentence Structure
@@ -434,6 +766,10 @@ def humanize_text(
434
  min_split_length, max_merge_length)
435
  stages_applied.append("Sentence Structure")
436
 
 
 
 
 
437
  # Calculate similarity
438
  similarity = calculate_similarity(input_text, result)
439
  ai_content_label_generated, ai_content_score_generated = predict_ai_content(result)
@@ -448,7 +784,6 @@ def humanize_text(
448
  return result, similarity, status,ai_content_label_generated, ai_content_score_generated,ai_content_label_input, ai_content_score_input
449
 
450
  except Exception as e:
451
- logger.error(f"Error in humanization: {e}")
452
  import traceback
453
  traceback.print_exc()
454
  return "", 0.0, f"❌ Error: {str(e)}"
 
4
  import re
5
  import spacy
6
  from nltk.corpus import wordnet, stopwords
7
+ from nltk import pos_tag, word_tokenize
8
+ from sklearn.metrics.pairwise import cosine_similarity
9
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
10
+ from sentence_transformers import SentenceTransformer, CrossEncoder, util
11
  import torch
12
  import numpy as np
13
+ from typing import List, Dict, Tuple,Optional
 
14
  from transformers import pipeline
15
+ import google.generativeai as genai
16
+ import json
17
 
18
+ genai.configure(api_key="AIzaSyBpAvPOI4rOWIIP80XYrd0R8U6kwrWv8t4")
19
 
20
+ model = genai.GenerativeModel("gemini-2.5-flash-lite")
 
 
21
 
22
  # Download NLTK data
23
  print("Downloading NLTK data...")
 
36
  t5_model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws")
37
  t5_model.to(device)
38
 
39
+ nli_model = SentenceTransformer("cross-encoder/nli-deberta-v3-base")
40
  similarity_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)
41
  nlp = spacy.load("en_core_web_sm")
42
 
 
105
 
106
  return " ".join(paraphrased_sentences)
107
 
108
+
109
  # ============================================================================
110
+ # CONTEXTUAL SYNONYM REPLACEMENT
111
  # ============================================================================
 
112
 
113
+ class ContextualSynonymReplacer:
114
+ def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
115
+ """Initialize with sentence transformer for contextual similarity"""
116
+ self.model = SentenceTransformer(model_name)
117
+ self.stop_words = set(stopwords.words('english'))
 
 
 
 
 
 
 
 
 
118
 
119
+ def get_synonyms(self, word: str, pos: str, max_synonyms: int = 5) -> List[str]:
120
+ """Get WordNet synonyms with POS filtering"""
121
+ pos_mapping = {
122
+ 'NN': wordnet.NOUN, 'NNS': wordnet.NOUN, 'NNP': wordnet.NOUN, 'NNPS': wordnet.NOUN,
123
+ 'VB': wordnet.VERB, 'VBD': wordnet.VERB, 'VBG': wordnet.VERB, 'VBN': wordnet.VERB,
124
+ 'VBP': wordnet.VERB, 'VBZ': wordnet.VERB,
125
+ 'JJ': wordnet.ADJ, 'JJR': wordnet.ADJ, 'JJS': wordnet.ADJ,
126
+ 'RB': wordnet.ADV, 'RBR': wordnet.ADV, 'RBS': wordnet.ADV
127
+ }
128
 
129
+ wn_pos = pos_mapping.get(pos, wordnet.NOUN)
130
+ synsets = wordnet.synsets(word.lower(), pos=wn_pos)
 
131
 
132
+ if not synsets:
133
+ synsets = wordnet.synsets(word.lower())
134
 
135
+ synonyms = []
136
+ for synset in synsets[:max_synonyms]:
137
+ for lemma in synset.lemmas():
138
+ syn = lemma.name().replace('_', ' ')
139
+ # Only single words, different from original
140
+ if len(syn.split()) == 1 and syn.lower() != word.lower():
141
+ synonyms.append(syn)
142
+
143
+ return list(set(synonyms))
144
 
145
+ def get_contextual_similarity(self, original_sentence: str,
146
+ modified_sentences: List[str]) -> np.ndarray:
147
+ """Calculate semantic similarity between original and modified sentences"""
148
+ all_sentences = [original_sentence] + modified_sentences
149
+ embeddings = self.model.encode(all_sentences)
150
+
151
+ # Compute similarity between original and all modified versions
152
+ similarities = cosine_similarity([embeddings[0]], embeddings[1:])[0]
153
+ return similarities
154
+
155
+ def select_best_synonym(self, word: str, synonyms: List[str],
156
+ context: str, word_idx: int,
157
+ words: List[str]) -> str:
158
+ """Select synonym that maintains contextual meaning"""
159
+ if not synonyms:
160
+ return word
161
+
162
+ # Create original sentence
163
+ original_sentence = ' '.join(words)
164
+
165
+ # Create candidate sentences with each synonym
166
+ candidate_sentences = []
167
+ for syn in synonyms:
168
+ modified_words = words.copy()
169
+ modified_words[word_idx] = syn
170
+ candidate_sentences.append(' '.join(modified_words))
171
+
172
+ # Calculate contextual similarities
173
+ similarities = self.get_contextual_similarity(original_sentence, candidate_sentences)
174
+
175
+ # Filter synonyms with high similarity (> threshold)
176
+ similarity_threshold = 0.85
177
+ valid_candidates = [
178
+ (syn, sim) for syn, sim in zip(synonyms, similarities)
179
+ if sim >= similarity_threshold
180
+ ]
181
+
182
+ if not valid_candidates:
183
+ # If no candidates meet threshold, return original word
184
+ return word
185
+
186
+ # Return synonym with highest similarity
187
+ best_synonym = max(valid_candidates, key=lambda x: x[1])[0]
188
+ return best_synonym
189
+
190
+ def synonym_replace(self, text: str, prob: float = 0.3,
191
+ min_word_length: int = 3,
192
+ max_synonyms: int = 5) -> str:
193
+ """Replace words with contextually appropriate synonyms"""
194
+ words = word_tokenize(text)
195
+ pos_tags = pos_tag(words)
196
+ new_words = words.copy()
197
+
198
+ for idx, (word, pos) in enumerate(pos_tags):
199
+ # Skip non-alphabetic tokens
200
+ if not word.isalpha():
201
+ continue
202
+
203
+ # Skip stopwords and short words
204
+ if word.lower() in self.stop_words or len(word) <= min_word_length:
205
+ continue
206
+
207
+ # Random probability check
208
+ if random.random() > prob:
209
+ continue
210
+
211
+ # Get candidate synonyms
212
+ synonyms = self.get_synonyms(word, pos, max_synonyms)
213
+
214
+ if synonyms:
215
+ # Select best contextual synonym
216
+ best_syn = self.select_best_synonym(
217
+ word, synonyms, text, idx, words
218
+ )
219
+ new_words[idx] = best_syn
220
+
221
+ return ' '.join(new_words)
222
+
223
 
224
  # ============================================================================
225
+ # IMPROVED ACADEMIC DISCOURSE TRANSFORMATION
226
  # ============================================================================
227
+
228
+ class AcademicDiscourseTransformer:
229
+ def __init__(self):
230
+ self.contractions = {
231
+ "don't": "do not", "doesn't": "does not", "didn't": "did not",
232
+ "can't": "cannot", "couldn't": "could not", "shouldn't": "should not",
233
+ "wouldn't": "would not", "won't": "will not", "aren't": "are not",
234
+ "isn't": "is not", "wasn't": "was not", "weren't": "were not",
235
+ "haven't": "have not", "hasn't": "has not", "hadn't": "had not",
236
+ "I'm": "I am", "I've": "I have", "I'll": "I will", "I'd": "I would",
237
+ "you're": "you are", "you've": "you have", "you'll": "you will",
238
+ "we're": "we are", "we've": "we have", "we'll": "we will",
239
+ "they're": "they are", "they've": "they have", "they'll": "they will",
240
+ "it's": "it is", "that's": "that is", "there's": "there is",
241
+ "what's": "what is"
242
+ }
243
+
244
+ self.hedges = [
245
+ "it appears that", "it is possible that", "the results suggest",
246
+ "it seems that", "there is evidence that", "it may be the case that",
247
+ "to some extent", "in general terms", "one could argue that",
248
+ "arguably", "potentially"
249
+ ]
250
+
251
+ self.boosters = [
252
+ "clearly", "indeed", "in fact", "undoubtedly",
253
+ "without doubt", "it is evident that", "there is no question that",
254
+ "certainly", "definitely", "obviously"
255
+ ]
256
+
257
+ self.connectors = {
258
+ "contrast": ["however", "on the other hand", "in contrast",
259
+ "nevertheless", "nonetheless", "conversely"],
260
+ "addition": ["moreover", "furthermore", "in addition", "additionally",
261
+ "what is more", "besides"],
262
+ "cause_effect": ["therefore", "thus", "as a result", "consequently",
263
+ "hence", "accordingly"],
264
+ "example": ["for instance", "for example", "to illustrate", "namely"],
265
+ "emphasis": ["notably", "particularly", "especially", "significantly"],
266
+ "conclusion": ["in conclusion", "overall", "in summary", "to sum up",
267
+ "in brief"]
268
+ }
269
+
270
+ self.sentence_starters = [
271
+ "It is important to note that",
272
+ "A key implication is that",
273
+ "The evidence indicates that",
274
+ "The findings suggest that",
275
+ "This demonstrates that",
276
+ "It should be emphasized that",
277
+ "From these observations, it follows that",
278
+ "It is worth noting that"
279
+ ]
280
+
281
+ # Sentence classification patterns
282
+ self.claim_patterns = [
283
+ r'\b(introduce|present|propose|develop|create|build|design)\b',
284
+ r'\b(this (paper|study|work|research))\b',
285
+ r'\b(we (introduce|present|propose|develop))\b'
286
+ ]
287
+
288
+ self.evidence_patterns = [
289
+ r'\b(results? (show|indicate|demonstrate|reveal))\b',
290
+ r'\b(findings? (suggest|indicate|show))\b',
291
+ r'\b(data (show|indicate|demonstrate))\b',
292
+ r'\b(experiments? (show|demonstrate|reveal))\b',
293
+ r'\b(analysis (shows?|indicates?|demonstrates?))\b'
294
+ ]
295
+
296
+ self.interpretation_patterns = [
297
+ r'\b(implies? that|suggests? that|indicates? that)\b',
298
+ r'\b(can be (interpreted|understood|seen))\b',
299
+ r'\b(may (be|indicate|suggest))\b'
300
+ ]
301
+
302
+ def classify_sentence(self, sentence: str) -> str:
303
+ """Classify sentence by its academic function"""
304
+ sent_lower = sentence.lower()
305
+
306
+ # Check for claims/contributions
307
+ if any(re.search(pattern, sent_lower) for pattern in self.claim_patterns):
308
+ return 'claim'
309
+
310
+ # Check for evidence/results
311
+ if any(re.search(pattern, sent_lower) for pattern in self.evidence_patterns):
312
+ return 'evidence'
313
+
314
+ # Check for interpretations
315
+ if any(re.search(pattern, sent_lower) for pattern in self.interpretation_patterns):
316
+ return 'interpretation'
317
+
318
+ return 'general'
319
 
320
+ def detect_semantic_relationship(self, prev_sent: str, curr_sent: str) -> Optional[str]:
321
+ """Detect semantic relationship between consecutive sentences"""
322
+ prev_lower = prev_sent.lower()
323
+ curr_lower = curr_sent.lower()
324
+
325
+ # Contrast indicators
326
+ contrast_words = ['however', 'but', 'although', 'while', 'whereas', 'despite']
327
+ if any(word in curr_lower for word in contrast_words):
328
+ return 'contrast'
329
+
330
+ # Addition/continuation indicators
331
+ addition_words = ['also', 'additionally', 'moreover', 'furthermore']
332
+ if any(word in curr_lower for word in addition_words):
333
+ return 'addition'
334
+
335
+ # Cause-effect indicators
336
+ causal_words = ['therefore', 'thus', 'consequently', 'as a result', 'because']
337
+ if any(word in curr_lower for word in causal_words):
338
+ return 'cause_effect'
339
+
340
+ # Example indicators
341
+ example_words = ['for example', 'for instance', 'such as', 'including']
342
+ if any(word in curr_lower for word in example_words):
343
+ return 'example'
344
+
345
+ # Check for negative/positive sentiment shift (basic heuristic)
346
+ negative_words = ['not', 'no', 'never', 'without', 'lacking', 'failed', 'limitation']
347
+ positive_words = ['successful', 'effective', 'improved', 'enhanced', 'benefit']
348
+
349
+ prev_negative = any(word in prev_lower for word in negative_words)
350
+ curr_negative = any(word in curr_lower for word in negative_words)
351
+
352
+ if prev_negative != curr_negative:
353
+ return 'contrast'
354
+
355
+ return None
356
 
357
+ def expand_contractions(self, text: str) -> str:
358
+ """Expand contractions to formal academic language"""
359
+ for contraction, expansion in self.contractions.items():
360
+ pattern = re.compile(r'\b' + re.escape(contraction) + r'\b', re.IGNORECASE)
361
+ text = pattern.sub(expansion, text)
362
+ return text
363
 
364
+ def apply_transformation(self, sentence: str, transform_type: str,
365
+ connector_type: Optional[str] = None) -> str:
366
+ """Apply a single transformation to a sentence"""
367
+ # Ensure sentence starts with capital letter
368
+ if not sentence[0].isupper():
369
+ sentence = sentence[0].upper() + sentence[1:]
 
 
 
 
370
 
371
+ if transform_type == 'hedge':
372
+ hedge = random.choice(self.hedges)
373
+ # Insert hedge after first word or phrase
374
+ return f"{hedge.capitalize()}, {sentence[0].lower() + sentence[1:]}"
375
+
376
+ elif transform_type == 'booster':
377
+ booster = random.choice(self.boosters)
378
+ return f"{booster.capitalize()}, {sentence}"
379
+
380
+ elif transform_type == 'starter':
381
+ starter = random.choice(self.sentence_starters)
382
+ return f"{starter} {sentence[0].lower() + sentence[1:]}"
383
+
384
+ elif transform_type == 'connector' and connector_type:
385
+ connector = random.choice(self.connectors[connector_type])
386
+ return f"{connector.capitalize()}, {sentence[0].lower() + sentence[1:]}"
387
+
388
+ return sentence
389
 
390
+ def add_academic_discourse(self, text: str,
391
+ transformation_prob: float = 0.3) -> str:
392
+ """
393
+ Add academic discourse markers with context awareness
394
+
395
+ Args:
396
+ text: Input text
397
+ transformation_prob: Overall probability of transforming a sentence
398
+ """
399
+ # Expand contractions first
400
+ text = self.expand_contractions(text)
401
+
402
+ # Split into sentences
403
+ sentences = nltk.sent_tokenize(text)
404
+ modified_sentences = []
405
+
406
+ for i, sent in enumerate(sentences):
407
+ # Classify sentence
408
+ sent_type = self.classify_sentence(sent)
409
+
410
+ # Determine if transformation should be applied
411
+ if random.random() > transformation_prob:
412
+ modified_sentences.append(sent)
413
+ continue
414
+
415
+ # Choose transformation based on sentence type and position
416
+ transform_type = None
417
+ connector_type = None
418
+
419
+ if i == 0:
420
+ # First sentence: avoid connectors
421
+ if sent_type == 'claim':
422
+ transform_type = random.choice(['booster', 'starter', None])
423
+ else:
424
+ transform_type = random.choice(['starter', None])
425
+
426
+ else:
427
+ # Get previous sentence for context
428
+ prev_sent = sentences[i-1]
429
+ relationship = self.detect_semantic_relationship(prev_sent, sent)
430
+
431
+ if relationship:
432
+ # Use appropriate connector
433
+ transform_type = 'connector'
434
+ connector_type = relationship
435
+
436
+ elif sent_type == 'claim':
437
+ # Claims: prefer boosters or starters
438
+ transform_type = random.choice(['booster', 'starter', None])
439
+
440
+ elif sent_type == 'evidence':
441
+ # Evidence: avoid hedges (data should be certain)
442
+ transform_type = random.choice(['booster', None])
443
+
444
+ elif sent_type == 'interpretation':
445
+ # Interpretations: can use hedges
446
+ transform_type = random.choice(['hedge', 'starter', None])
447
+
448
+ else:
449
+ # General sentences: balanced approach
450
+ transform_type = random.choice([
451
+ 'hedge', 'booster', 'starter', 'connector', None
452
+ ])
453
+ if transform_type == 'connector':
454
+ connector_type = random.choice(list(self.connectors.keys()))
455
+
456
+ # Apply transformation
457
+ if transform_type:
458
+ sent = self.apply_transformation(sent, transform_type, connector_type)
459
+
460
+ modified_sentences.append(sent)
461
+
462
+ return ' '.join(modified_sentences)
463
+
464
 
465
  # ============================================================================
466
  # STAGE 4: SENTENCE STRUCTURE VARIATION
467
  # ============================================================================
468
+ def vary_sentence_structure(
469
+ text: str,
470
+ split_prob: float = 0.4,
471
+ merge_prob: float = 0.3,
472
+ min_split_length: int = 20,
473
+ max_merge_length: int = 10
474
+ ) -> str:
475
+ """
476
+ Enhance sentence structure variation using NLI inference +
477
+ semantic similarity to preserve academic integrity.
478
+ """
479
+
480
  connectors = {
481
  "contrast": ["however", "nevertheless", "nonetheless", "in contrast"],
482
+ "addition": ["moreover", "furthermore", "in addition", "what is more", "also"],
483
  "cause_effect": ["therefore", "thus", "consequently", "as a result"],
484
  "example": ["for example", "for instance", "to illustrate"],
485
  "conclusion": ["in conclusion", "overall", "in summary"]
486
  }
487
+
488
  all_connectors = {c.lower() for group in connectors.values() for c in group}
489
+
490
+ def already_has_connector(s: str) -> bool:
491
+ s = s.strip().lower()
492
+ return any(s.startswith(c) for c in all_connectors)
493
+
494
+ def sentence_is_fragment(s: str) -> bool:
495
+ doc = nlp(s)
496
+ has_verb = any(t.pos_ in ("VERB", "AUX") for t in doc)
497
+ has_subj = any(t.dep_ in ("nsubj", "nsubjpass") for t in doc)
498
+ return not (has_verb and has_subj)
499
+
500
  def choose_connector_type(prev_sent: str, curr_sent: str) -> str:
501
  curr_lower = curr_sent.lower()
502
+
503
+ # Rule-based first
504
+ if any(x in curr_lower for x in ["such as", "for instance", "including"]):
505
  return "example"
506
+ if curr_lower.startswith(("however", "although", "but", "nevertheless")):
507
  return "contrast"
508
+ if any(x in curr_lower for x in ["therefore", "thus", "as a result", "because"]):
509
  return "cause_effect"
510
+
511
+ # === NLI inference ===
512
+ try:
513
+ logits = nli_model.predict([(prev_sent, curr_sent)])[0]
514
+ contradiction, neutral, entailment = logits
515
+
516
+ if contradiction > 0.40:
517
+ return "contrast"
518
+ if entailment > 0.40:
519
+ if "because" in curr_lower:
520
+ return "cause_effect"
521
+ return "addition"
522
+ except:
523
+ pass # fail safe
524
+
525
+ # === Similarity fallback ===
526
+ emb = similarity_model.encode([prev_sent, curr_sent], convert_to_tensor=True)
527
+ sim = util.cos_sim(emb[0], emb[1]).item()
528
+
529
+ return "addition" if sim >= 0.55 else "contrast"
530
+
531
+ def add_connector(prev, curr):
532
+ ctype = choose_connector_type(prev, curr)
533
+ connector = random.choice(connectors[ctype])
534
+ return f"{connector.capitalize()}, {curr[0].lower() + curr[1:]}"
535
+
536
  doc = nlp(text)
537
+ sents = [s.text.strip() for s in doc.sents]
538
  modified = []
539
+
540
+ for sent in sents:
541
+ words = sent.split()
542
+
543
+ # SPLIT
 
544
  if len(words) > min_split_length and random.random() < split_prob:
545
+ sent_doc = nlp(sent)  # parse this sentence alone so token indices start at 0
546
+ split_positions = [tok.i for tok in sent_doc if tok.dep_ in ("cc", "mark")]
547
+
548
+ if split_positions:
549
+ sp = random.choice(split_positions)
550
+ tokens = list(sent_doc)  # reuse the parse from above
551
+ if 0 < sp < len(tokens):
552
+ first = " ".join(t.text for t in tokens[:sp]).strip()
553
+ second = " ".join(t.text for t in tokens[sp+1:]).strip()
554
+
555
+ if first and second and not sentence_is_fragment(second):
556
+ if not already_has_connector(second) and random.random() < 0.5:
557
+ second = add_connector(first, second)
558
+ modified.extend([first + ".", second])
559
  continue
560
+
561
+ # MERGE
562
+ if (modified
563
+ and len(words) < max_merge_length
564
+ and len(modified[-1].split()) < max_merge_length
565
+ and random.random() < merge_prob):
566
+
567
+ prev = modified[-1]
568
+ if not already_has_connector(sent):
569
+ merged_clause = add_connector(prev, sent)
570
+
571
+ if prev.endswith("."):
572
+ merged = prev[:-1] + f"; {merged_clause[0].lower() + merged_clause[1:]}"
573
+ else:
574
+ merged = prev + f", {merged_clause.lower()}"
575
+
576
+ if not sentence_is_fragment(sent):
577
+ modified[-1] = merged
578
+ continue
579
+
580
+ modified.append(sent)
581
+
582
+ # Clean + Capitalize sentences
583
+ out = " ".join(modified)
584
+ out = re.sub(r"\s+", " ", out).strip()
585
+ out = ". ".join(s.strip().capitalize() for s in out.split(".") if s.strip()) + "."
586
+
587
+ return out
588
+
589
+
590
+ # ============================================================================
591
+ # LLM Refinement with Gemini
592
+ # ============================================================================
593
+
594
+ GEMINI_VALIDATION_PROMPT = """
595
+ You will be given two texts: an 'Original' text and a 'Transformed' text. The 'Transformed' text is a poor modification of the 'Original', containing grammatical errors, misspellings, and inappropriate synonyms.
596
+
597
+ Your task is to:
598
+
599
+ 1. Compare the 'Transformed' text word-by-word against the 'Original' text.
600
+ 2. Identify every word in the 'Transformed' text that is incorrect or a poor substitute.
601
+ 3. Categorize these into:
602
+ - "irrelevant_incorrect"
603
+ - "inappropriate_synonyms"
604
+ 4. For each, return a JSON dictionary with
605
+ "transformed_word" : "correct_word_from_original"
606
+
607
+ ### Output Format ###
608
+ {
609
+ "irrelevant_incorrect": { "bad_word": "correct_word", ... },
610
+ "inappropriate_synonyms": { "bad_word": "correct_word", ... }
611
+ }
612
+
613
+ ### Text ###
614
+ Original:
615
+ <<<ORIGINAL_TEXT>>>
616
+
617
+ Transformed:
618
+ <<<TRANSFORMED_TEXT>>>
619
+ """
620
+
621
+ def validateText(original,transformed):
622
+ # ------------------- Build Prompt -------------------
623
+ prompt = GEMINI_VALIDATION_PROMPT \
624
+ .replace("<<<ORIGINAL_TEXT>>>", original) \
625
+ .replace("<<<TRANSFORMED_TEXT>>>", transformed)
626
+
627
+ # ------------------- Query Gemini -------------------
628
+ response = model.generate_content(prompt)
629
+ result = response.text
630
+
631
+ print("\n\n### Gemini Output ###\n", result)
632
+
633
+ try:
634
+ corrections = json.loads(result)
635
+ except:
636
+ # sometimes model adds markdown, brackets etc. optional cleaning
637
+ cleaned = re.sub(r"```json|```", "", result).strip()
638
+ corrections = json.loads(cleaned)
639
+
640
+ irrelevant = corrections.get("irrelevant_incorrect", {})
641
+ synonyms = corrections.get("inappropriate_synonyms", {})
642
+
643
+ # ------------------- Update Transformed Text -------------------
644
+ updated_text = transformed
645
 
646
+ for wrong, right in {**irrelevant, **synonyms}.items():
647
+ updated_text = re.sub(rf"\b{re.escape(wrong)}\b", right, updated_text)
648
+
649
+ print("\n\n### Updated Text After Gemini ###\n", updated_text)
650
+ return updated_text
651
 
652
  # ============================================================================
653
  # QUALITY CHECK
 
721
  ):
722
  """Main humanizer function that processes text through all enabled stages"""
723
 
724
+ original = input_text
725
+
726
  if not input_text.strip():
727
  return "", 0.0, "Please enter some text to humanize."
728
 
 
743
 
744
  # Stage 2: Synonym Replacement
745
  if enable_stage2:
746
+ replacer = ContextualSynonymReplacer()
747
+ random.seed(42) # For reproducibility
748
+ result = replacer.synonym_replace(
749
+ result,
750
+ prob=0.3,
751
+ min_word_length=3,
752
+ max_synonyms=5
753
+ )
754
  stages_applied.append("Synonym Replacement")
755
 
756
  # Stage 3: Academic Discourse
757
  if enable_stage3:
758
+ transformer = AcademicDiscourseTransformer()
759
+ random.seed(42)
760
+ result = transformer.add_academic_discourse(result, transformation_prob=0.4)
761
  stages_applied.append("Academic Discourse")
762
 
763
  # Stage 4: Sentence Structure
 
766
  min_split_length, max_merge_length)
767
  stages_applied.append("Sentence Structure")
768
 
769
+
770
+ # LLM Review
771
+ result = validateText(original,result)
772
+
773
  # Calculate similarity
774
  similarity = calculate_similarity(input_text, result)
775
  ai_content_label_generated, ai_content_score_generated = predict_ai_content(result)
 
784
  return result, similarity, status,ai_content_label_generated, ai_content_score_generated,ai_content_label_input, ai_content_score_input
785
 
786
  except Exception as e:
 
787
  import traceback
788
  traceback.print_exc()
789
  return "", 0.0, f"❌ Error: {str(e)}"
requirements.txt CHANGED
@@ -6,4 +6,6 @@ sentencepiece>=0.1.99
  torch>=2.2.0
  numpy>=1.26.4
  sentence-transformers>=2.6.0
+ google-generativeai
+ scikit-learn
  https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
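
After installing the updated requirements, a short import check helps confirm that the two new dependencies resolve alongside the existing stack. This is an illustrative sketch, not part of the commit:

# Illustrative dependency check (run after installing requirements.txt).
import sklearn                        # provided by scikit-learn, used for cosine_similarity in app.py
import google.generativeai as genai   # provided by google-generativeai, used for the Gemini step
import torch, spacy, nltk, transformers, sentence_transformers

print("spaCy pipeline:", spacy.load("en_core_web_sm").meta["name"])
print("CUDA available:", torch.cuda.is_available())
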