Mostafa174 committed on
Commit
450a421
·
1 Parent(s): c1f9b9f

Initial commit

Browse files
Files changed (2) hide show
  1. app.py +218 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import numpy as np
4
+ from scipy.special import expit
5
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
6
+ from PyPDF2 import PdfReader
7
+ from docx import Document
8
+
9
# Load Model and Tokenizer
#
# These run at import time, so the tokenizer and model are created once and
# shared by every request handled by the functions below.

MODEL = "cardiffnlp/tweet-topic-21-multi"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# Lookup from class index to label name, taken from the model's config; the
# analyzers index it with the position of each score in the logits vector.
class_mapping = model.config.id2label
15
+
16
+
17
+ # Text Analyzer
18
+
19
def analyze_topics(text):
    """Detect the topics of a piece of text with the module-level classifier.

    The text is tokenized (truncated to at most 512 tokens), scored by
    ``model``, and every label whose sigmoid score is at least 0.5 is
    reported together with that score.

    Args:
        text: The text to classify. ``None``, empty, or whitespace-only
            input is rejected before the tokenizer or model is touched.

    Returns:
        A newline-separated list of ``"• <topic> (<score>)"`` entries, or a
        short message when there is nothing to analyze or no label reaches
        the threshold.
    """
    # Guard first so blank submissions never reach the tokenizer/model.
    if text is None or not text.strip():
        return "Please enter some text first."

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    outputs = model(**inputs)

    # Each label is scored independently: sigmoid per logit, then a 0.5 cut.
    scores = expit(outputs.logits[0].detach().numpy())

    detected_topics = [
        f"• {class_mapping[i]} ({score:.2f})"
        for i, score in enumerate(scores)
        if score >= 0.5
    ]

    if not detected_topics:
        return "No specific topics detected."
    return "\n".join(detected_topics)
38
+
39
+
40
+ # Document Analyzer Helpers
41
+
42
def extract_text_from_file(file_path):
    """Return the plain text contained in a PDF, DOCX, or TXT file.

    The format is chosen from the file extension, compared
    case-insensitively.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text with leading and trailing whitespace removed.
        The result can be empty when nothing could be extracted.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.docx``, or ``.txt``.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".pdf":
        reader = PdfReader(file_path)
        # Extract every page exactly once, then drop pages that yielded
        # nothing (None or an empty string).
        page_texts = (page.extract_text() for page in reader.pages)
        text = " ".join(page_text for page_text in page_texts if page_text)
    elif ext == ".docx":
        doc = Document(file_path)
        text = "\n".join(p.text for p in doc.paragraphs)
    elif ext == ".txt":
        # "utf-8-sig" drops a leading BOM if present and otherwise behaves
        # like UTF-8; errors="replace" keeps files with stray non-UTF-8
        # bytes readable instead of raising UnicodeDecodeError.
        with open(file_path, "r", encoding="utf-8-sig", errors="replace") as f:
            text = f.read()
    else:
        raise ValueError("Unsupported file format. Please upload a PDF, DOCX, or TXT file.")

    return text.strip()
58
+
59
+
60
def analyze_document(file):
    """Detect the topics discussed in an uploaded document.

    The document text is split into 400-word chunks, each chunk is scored
    by the module-level ``model``, and every label whose sigmoid score is
    at least 0.5 in some chunk is collected. Each collected label is
    reported with the mean of its above-threshold scores, highest first.

    Args:
        file: The uploaded file, either an object exposing the path as
            ``.name`` or the path itself; ``None`` when nothing was
            uploaded.

    Returns:
        A newline-separated list of
        ``"• <topic> (avg confidence: <score>)"`` entries, or a short
        message when no file was given, the format is unsupported, no text
        could be read, or no label reaches the threshold.
    """
    if file is None:
        return "Please upload a document first."

    # Accept both an object carrying the path in ``.name`` and a bare path.
    file_path = getattr(file, "name", file)
    try:
        text = extract_text_from_file(file_path)
    except ValueError as exc:
        # Report the problem (e.g. unsupported extension) in the output box,
        # like the other user-facing messages, instead of letting it escape.
        return str(exc)
    if not text:
        return "No readable text found in document."

    # Split into chunks for large docs. A 400-word chunk may still tokenize
    # to more than 512 tokens, in which case its tail is truncated below.
    words = text.split()
    chunks = [" ".join(words[i:i + 400]) for i in range(0, len(words), 400)]

    all_detected_topics = {}

    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512)
        outputs = model(**inputs)
        # Each label is scored independently: sigmoid per logit, 0.5 cut.
        scores = expit(outputs.logits[0].detach().numpy())

        for i, score in enumerate(scores):
            if score >= 0.5:
                all_detected_topics.setdefault(class_mapping[i], []).append(score)

    if not all_detected_topics:
        return "No specific topics detected in document."

    # Rank on the numeric means themselves rather than on values parsed back
    # out of the formatted (rounded) strings.
    ranked = sorted(
        ((topic, float(np.mean(confs))) for topic, confs in all_detected_topics.items()),
        key=lambda item: item[1],
        reverse=True,
    )
    return "\n".join(
        f"• {topic} (avg confidence: {average:.2f})" for topic, average in ranked
    )
96
# Custom stylesheet handed to gr.TabbedInterface through its css= argument.
# NOTE(review): no component in this file is given the "output-textbox" class,
# so that rule presumably never matches - confirm or attach it via elem_classes.
# NOTE(review): the ".svelte-..." selectors look like generated class names;
# verify they still match the Gradio version actually installed.
css = """
/* --- Global Layout --- */
body {
background-color: #1a1a1a !important;
color: #f5f5f5 !important;
font-family: 'Inter', sans-serif !important;
margin: 0 !important;
padding: 0 !important;
}

/* Full width */
#root, .gradio-container, .main {
max-width: 100% !important;
width: 100% !important;
background-color: #1a1a1a !important;
margin: 0 !important;
padding: 0 !important;
border: none !important;
box-shadow: none !important;
}

/* Headings and Labels */
h1, h2, h3, label {
color: #ff9900 !important;
font-weight: 600 !important;
}

/* Text Inputs */
textarea, input {
background-color: #2a2a2a !important;
color: #f5f5f5 !important;
border: 1px solid #3a3a3a !important;
border-radius: 10px !important;
padding: 12px !important;
}




/* Buttons */
button {
background-color: #ff9900 !important;
color: #1a1a1a !important;
font-weight: 600 !important;
border-radius: 8px !important;
border: none !important;
padding: 8px 16px !important;
transition: 0.25s ease-in-out;
}
button:hover {
background-color: #ffb84d !important;
}

/* Output textbox */
.output-textbox {
background-color: #252525 !important;
color: #ffd480 !important;
border: 1px solid #3a3a3a !important;
border-radius: 10px !important;
box-shadow: inset 0 0 6px rgba(255,153,0,0.1);
}

/* Tabs */
.tabitem.svelte-1ipelgc {
background-color: #1a1a1a !important;
color: #ffb84d !important;
}
.tabitem.svelte-1ipelgc.selected {
background-color: #ff9900 !important;
color: #1a1a1a !important;
font-weight: 700 !important;
}

/* Footer */
.footer, .svelte-1xdkkgx, .wrap.svelte-1ipelgc {
background: none !important;
border: none !important;
box-shadow: none !important;
color: #888 !important;
text-align: center !important;
}
"""
178
+
179
# -------------------------
# Gradio Interface
# -------------------------

# Tab 1: a multi-line textbox whose content is passed to analyze_topics;
# the returned string is shown in the output textbox. Three example inputs
# are provided.
tweet_tab = gr.Interface(
    fn=analyze_topics,
    inputs=gr.Textbox(
        label="📝 Enter Text",
        placeholder="Type or paste text here...",
        lines=4
    ),
    outputs=gr.Textbox(label="🎯 Detected Topics"),
    examples=[
        ["Just watched the new Marvel movie, it was amazing!"],
        ["Bitcoin prices are going up again!"],
        ["Climate change is affecting polar bears."],
    ],
    title="💬 Text Topic Analyzer",
    description="Analyze short texts or tweets to detect underlying topics using CardiffNLP’s Tweet Topic model.",
)

# Tab 2: a file upload passed to analyze_document; the returned string is
# shown in the output textbox.
document_tab = gr.Interface(
    fn=analyze_document,
    inputs=gr.File(label="📄 Upload Document (PDF, DOCX, or TXT)"),
    outputs=gr.Textbox(label="📘 Detected Topics"),
    title="📄 Document Topic Analyzer",
    description="Upload a document and let the AI detect key topics discussed inside.",
)

# Combine both interfaces into one tabbed app; tab names are given in the
# same order as the interfaces, and the custom css string and an orange
# Base theme are applied to the whole app.
app = gr.TabbedInterface(
    [tweet_tab, document_tab],
    ["💬 Text Analyzer", "📄 Document Analyzer"],
    title="🧠 AI Topic Analyzer",
    css=css,
    theme=gr.themes.Base(primary_hue="orange", secondary_hue="orange"),
)

# Start the server only when this file is run as a script, not on import.
if __name__ == "__main__":
    app.launch()
218
+
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ transformers>=4.30.0
3
+ torch>=2.0.0
4
+ numpy>=1.21.0
5
+ scipy>=1.7.0
6
+ PyPDF2>=3.0.0
7
+ python-docx>=0.8.11