# Spaces: Sleeping — scraped web-UI header, not part of the program.
| import pandas as pd | |
| import numpy as np | |
| import re | |
| import string | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk.stem import PorterStemmer, WordNetLemmatizer | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.base import BaseEstimator, TransformerMixin | |
| from sklearn.pipeline import Pipeline | |
# Download necessary NLTK data files used by TextPreprocessor below.
nltk.download('stopwords')
nltk.download('wordnet')
# NOTE(review): NLTK >= 3.6.6 also needs the 'omw-1.4' corpus for
# WordNetLemmatizer; without it lemmatize() raises LookupError.
nltk.download('omw-1.4')
# Custom transformer for text preprocessing.
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Clean raw documents for vectorization.

    Lowercases, strips punctuation and digits, removes English stopwords,
    and lemmatizes the remaining tokens. Stateless: ``fit`` learns nothing,
    ``transform`` returns a list of cleaned strings suitable as input to a
    text vectorizer (e.g. TfidfVectorizer) in an sklearn Pipeline.
    """

    def __init__(self):
        # English stopword list; requires nltk.download('stopwords').
        self.stop_words = set(stopwords.words('english'))
        # WordNet-based lemmatizer; requires nltk.download('wordnet').
        self.lemmatizer = WordNetLemmatizer()
        # Hoist loop-invariant regexes: compiled once here instead of
        # being rebuilt/looked up on every call to preprocess_text.
        self._punct_re = re.compile(f'[{re.escape(string.punctuation)}]')
        self._digit_re = re.compile(r'\d+')

    def preprocess_text(self, text):
        """Return a cleaned version of a single document.

        Non-string input (e.g. NaN coming from a pandas column) is coerced
        to the empty string rather than raising AttributeError on .lower().
        """
        if not isinstance(text, str):
            return ''
        # Lowercase, then strip punctuation and digits.
        text = text.lower()
        text = self._punct_re.sub('', text)
        text = self._digit_re.sub('', text)
        # Whitespace tokenization; drop stopwords and lemmatize the rest.
        words = [self.lemmatizer.lemmatize(word)
                 for word in text.split()
                 if word not in self.stop_words]
        # Join words back into a single string.
        return ' '.join(words)

    def fit(self, X, y=None):
        """No-op: this transformer learns nothing from the data."""
        return self

    def transform(self, X, y=None):
        """Apply preprocess_text to every document in X; returns a list."""
        return [self.preprocess_text(text) for text in X]
# Model pipeline: text cleaning -> TF-IDF features -> random-forest classifier.
pipeline = Pipeline(
    steps=[
        ('preprocessor', TextPreprocessor()),
        ('vectorizer', TfidfVectorizer()),
        ('classifier', RandomForestClassifier()),
    ]
)