Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| import re | |
| from typing import Dict, List, Tuple, Optional | |
| import warnings | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| warnings.filterwarnings('ignore') | |
| class RussianFeatureExtractor: | |
| """Исправленная версия экстрактора признаков с работающим composite_quality_score""" | |
| def __init__(self, use_heavy_models: bool = False): | |
| print("Инициализация исправленного экстрактора признаков...") | |
| self.use_heavy_models = use_heavy_models | |
| self.sbert_model = None | |
| # Инициализация моделей | |
| self._initialize_models() | |
| # Списки ключевых слов | |
| self.greeting_words = ['здравствуйте', 'привет', 'добрый', 'здравствуй', 'доброе', 'приветствую'] | |
| self.question_words = ['как', 'что', 'где', 'когда', 'почему', 'можно', 'сколько', 'какой', 'какая'] | |
| self.descriptive_words = ['вижу', 'изображен', 'находится', 'делает', 'одет', 'стоит', 'сидит'] | |
| self.connector_words = ['потому что', 'поэтому', 'так как', 'например', 'кроме того'] | |
| self.emotional_words = ['красиво', 'интересно', 'замечательно', 'прекрасно', 'нравится'] | |
| self.spatial_words = ['слева', 'справа', 'вверху', 'внизу', 'рядом', 'около'] | |
| print("✅ Инициализация завершена!") | |
| def _initialize_models(self): | |
| """Инициализация моделей""" | |
| if self.use_heavy_models: | |
| print("ℹ️ Тяжелые модели отключены для стабильности") | |
| print("ℹ️ Используем легкие методы (TF-IDF)") | |
| def clean_text(self, text: str) -> str: | |
| """Очистка текста""" | |
| if pd.isna(text): | |
| return "" | |
| text = str(text) | |
| text = re.sub(r'<[^>]+>', '', text) | |
| text = re.sub(r'[^\w\sа-яА-ЯёЁ.,!?;:()-]', '', text) | |
| text = re.sub(r'\s+', ' ', text).strip() | |
| return text | |
| def extract_basic_features(self, text: str) -> Dict[str, float]: | |
| """Базовые текстовые признаки""" | |
| text_clean = self.clean_text(text) | |
| if not text_clean: | |
| return { | |
| 'text_length': 0, 'word_count': 0, 'sentence_count': 0, | |
| 'avg_word_length': 0, 'lexical_diversity': 0, | |
| 'has_questions': 0, 'has_exclamations': 0 | |
| } | |
| # Базовые метрики | |
| words = re.findall(r'\b[а-яёa-z]+\b', text_clean.lower()) | |
| sentences = [s.strip() for s in re.split(r'[.!?]+', text_clean) if s.strip()] | |
| word_count = len(words) | |
| text_length = len(text_clean) | |
| sentence_count = len(sentences) | |
| features = { | |
| 'text_length': text_length, | |
| 'word_count': word_count, | |
| 'sentence_count': sentence_count, | |
| 'avg_word_length': sum(len(w) for w in words) / max(word_count, 1), | |
| 'lexical_diversity': len(set(words)) / max(word_count, 1), | |
| 'has_questions': int('?' in text_clean), | |
| 'has_exclamations': int('!' in text_clean), | |
| } | |
| return features | |
| def extract_semantic_features(self, question: str, answer: str) -> Dict[str, float]: | |
| """Семантические признаки""" | |
| question_clean = self.clean_text(question) | |
| answer_clean = self.clean_text(answer) | |
| features = { | |
| 'keyword_overlap': 0.0, | |
| 'response_relevance': 0.0 | |
| } | |
| if not answer_clean or not question_clean: | |
| return features | |
| try: | |
| # Упрощенный анализ ключевых слов | |
| question_words = set(re.findall(r'\b[а-яё]+\b', question_clean.lower())) | |
| answer_words = set(re.findall(r'\b[а-яё]+\b', answer_clean.lower())) | |
| if question_words: | |
| common_words = question_words.intersection(answer_words) | |
| features['keyword_overlap'] = len(common_words) / max(len(question_words), 1) | |
| features['response_relevance'] = min(1.0, len(answer_words) / max(len(question_words), 1)) | |
| except Exception as e: | |
| print(f"Ошибка семантических признаков: {e}") | |
| return features | |
| def extract_grammar_features(self, text: str) -> Dict[str, float]: | |
| """Грамматические признаки""" | |
| text_clean = self.clean_text(text) | |
| features = { | |
| 'grammar_quality': 0.5, # Базовая оценка | |
| 'has_punctuation': 0.0, | |
| 'sentence_completeness': 0.0 | |
| } | |
| if not text_clean: | |
| return features | |
| sentences = [s.strip() for s in re.split(r'[.!?]+', text_clean) if s.strip()] | |
| words = text_clean.split() | |
| if sentences: | |
| # Проверка пунктуации | |
| features['has_punctuation'] = 1.0 if any(mark in text_clean for mark in '.!?') else 0.0 | |
| # Полнота предложений | |
| complete_sentences = sum(1 for s in sentences if len(s.split()) >= 3) | |
| features['sentence_completeness'] = complete_sentences / max(len(sentences), 1) | |
| # Улучшенная эвристика грамматического качества | |
| grammar_score = 0.0 | |
| grammar_score += features['has_punctuation'] * 0.3 | |
| grammar_score += features['sentence_completeness'] * 0.4 | |
| # Дополнительные эвристики | |
| if len(words) > 5: | |
| avg_sentence_len = len(words) / len(sentences) | |
| if 5 <= avg_sentence_len <= 20: | |
| grammar_score += 0.2 | |
| elif avg_sentence_len > 20: | |
| grammar_score += 0.1 | |
| features['grammar_quality'] = min(1.0, grammar_score) | |
| return features | |
| def extract_style_features(self, text: str) -> Dict[str, float]: | |
| """Стилистические признаки""" | |
| text_clean = self.clean_text(text).lower() | |
| features = { | |
| 'has_greeting': 0.0, | |
| 'has_description': 0.0, | |
| 'has_connectors': 0.0, | |
| 'has_emotional_words': 0.0, | |
| 'style_score': 0.0 | |
| } | |
| if not text_clean: | |
| return features | |
| # Стилистические маркеры | |
| features.update({ | |
| 'has_greeting': float(any(greet in text_clean for greet in self.greeting_words)), | |
| 'has_description': float(any(desc in text_clean for desc in self.descriptive_words)), | |
| 'has_connectors': float(any(conn in text_clean for conn in self.connector_words)), | |
| 'has_emotional_words': float(any(emot in text_clean for emot in self.emotional_words)), | |
| }) | |
| # Оценка стиля | |
| style_indicators = sum([ | |
| features['has_greeting'], | |
| features['has_connectors'], | |
| features['has_emotional_words'] | |
| ]) | |
| features['style_score'] = min(1.0, style_indicators / 3) | |
| return features | |
| def extract_quality_features(self, text: str, question_type: int) -> Dict[str, float]: | |
| """Признаки качества ответа""" | |
| text_clean = self.clean_text(text) | |
| words = text_clean.split() | |
| word_count = len(words) | |
| features = { | |
| 'answer_length_sufficiency': min(1.0, word_count / 30), # Нормализованная длина | |
| 'content_richness': 0.0, | |
| 'engagement_level': 0.0 | |
| } | |
| if not text_clean: | |
| return features | |
| # Богатство контента (лексическое разнообразие + длина) | |
| lexical_diversity = len(set(words)) / max(word_count, 1) | |
| features['content_richness'] = min(1.0, (lexical_diversity + features['answer_length_sufficiency']) / 2) | |
| # Уровень вовлеченности | |
| engagement = 0.0 | |
| engagement += features['answer_length_sufficiency'] * 0.4 | |
| engagement += lexical_diversity * 0.3 | |
| engagement += (1.0 if '?' in text_clean else 0.0) * 0.3 | |
| features['engagement_level'] = engagement | |
| return features | |
| def extract_all_features(self, row: pd.Series) -> Dict[str, float]: | |
| """Извлечение всех признаков - ИСПРАВЛЕННАЯ ВЕРСИЯ""" | |
| try: | |
| # Безопасное извлечение данных | |
| question = row.get('Текст вопроса', row.get('Вопрос', '')) | |
| answer = row.get('Транскрибация ответа', row.get('Транскрипт', row.get('Ответ', ''))) | |
| question_type = row.get('№ вопроса', row.get('Тип вопроса', 1)) | |
| try: | |
| question_type = int(question_type) | |
| except: | |
| question_type = 1 | |
| features = {} | |
| # 1. Базовые признаки (надежные) | |
| basic_features = self.extract_basic_features(answer) | |
| features.update(basic_features) | |
| # 2. Семантические признаки | |
| semantic_features = self.extract_semantic_features(question, answer) | |
| features.update(semantic_features) | |
| # 3. Грамматические признаки | |
| grammar_features = self.extract_grammar_features(answer) | |
| features.update(grammar_features) | |
| # 4. Стилистические признаки | |
| style_features = self.extract_style_features(answer) | |
| features.update(style_features) | |
| # 5. Признаки качества | |
| quality_features = self.extract_quality_features(answer, question_type) | |
| features.update(quality_features) | |
| # 6. Тип вопроса | |
| features['question_type'] = float(question_type) | |
| # 7. ИСПРАВЛЕННЫЙ композитный показатель | |
| features['composite_quality_score'] = self._calculate_quality_score(features) | |
| return features | |
| except Exception as e: | |
| print(f"❌ Ошибка при извлечении признаков: {e}") | |
| # Возвращаем базовые признаки | |
| return self._get_fallback_features() | |
| def _calculate_quality_score(self, features: Dict[str, float]) -> float: | |
| """ИСПРАВЛЕННЫЙ расчет качества ответа""" | |
| # Веса для разных категорий | |
| weights = { | |
| # Семантика и релевантность (35%) | |
| 'keyword_overlap': 0.20, | |
| 'response_relevance': 0.15, | |
| # Грамматика и структура (25%) | |
| 'grammar_quality': 0.15, | |
| 'sentence_completeness': 0.10, | |
| # Стиль и вовлеченность (25%) | |
| 'style_score': 0.10, | |
| 'engagement_level': 0.15, | |
| # Содержание (15%) | |
| 'content_richness': 0.15 | |
| } | |
| total_score = 0.0 | |
| total_weight = 0.0 | |
| for feature, weight in weights.items(): | |
| if feature in features: | |
| value = features[feature] | |
| total_score += value * weight | |
| total_weight += weight | |
| # Нормализация на случай отсутствующих признаков | |
| if total_weight > 0: | |
| final_score = total_score / total_weight | |
| else: | |
| final_score = 0.5 # нейтральная оценка | |
| return min(1.0, max(0.0, final_score)) | |
| def _get_fallback_features(self) -> Dict[str, float]: | |
| """Базовые признаки при ошибке""" | |
| return { | |
| 'text_length': 0, 'word_count': 0, 'sentence_count': 0, | |
| 'avg_word_length': 0, 'lexical_diversity': 0, | |
| 'has_questions': 0, 'has_exclamations': 0, | |
| 'keyword_overlap': 0, 'response_relevance': 0, | |
| 'grammar_quality': 0.5, 'has_punctuation': 0, 'sentence_completeness': 0, | |
| 'has_greeting': 0, 'has_description': 0, 'has_connectors': 0, | |
| 'has_emotional_words': 0, 'style_score': 0, | |
| 'answer_length_sufficiency': 0, 'content_richness': 0, 'engagement_level': 0, | |
| 'question_type': 1, 'composite_quality_score': 0.5 | |
| } | |
| def extract_features_for_dataframe(self, df: pd.DataFrame, sample_size: int = None) -> pd.DataFrame: | |
| """Извлечение признаков для датафрейма""" | |
| if sample_size and sample_size < len(df): | |
| df = df.sample(sample_size, random_state=42) | |
| print(f"Взята выборка: {len(df)} строк") | |
| print(f"Извлечение признаков для {len(df)} строк...") | |
| features_list = [] | |
| successful = 0 | |
| for idx, row in df.iterrows(): | |
| if idx % 50 == 0 and idx > 0: | |
| print(f"Обработано {idx}/{len(df)} строк...") | |
| try: | |
| features = self.extract_all_features(row) | |
| features['original_index'] = idx | |
| features_list.append(features) | |
| successful += 1 | |
| except Exception as e: | |
| print(f"❌ Ошибка в строке {idx}: {e}") | |
| continue | |
| if features_list: | |
| features_df = pd.DataFrame(features_list) | |
| features_df.set_index('original_index', inplace=True) | |
| success_rate = successful / len(df) | |
| print(f"✅ Извлечение завершено! Успешно: {successful}/{len(df)} ({success_rate:.1%})") | |
| return features_df | |
| else: | |
| print("❌ Не удалось извлечь признаки") | |
| return pd.DataFrame() | |
| # Быстрая функция для тестирования | |
| def extract_quick_features(text: str) -> Dict[str, float]: | |
| extractor = RussianFeatureExtractor() | |
| return extractor.extract_basic_features(text) | |
| if __name__ == "__main__": | |
| # Тест исправленной версии | |
| extractor = RussianFeatureExtractor() | |
| test_data = { | |
| 'Текст вопроса': ['Расскажите о вашем городе'], | |
| 'Транскрибация ответа': ['Привет! Я живу в Москве. Это большой и красивый город с множеством парков и музеев.'], | |
| '№ вопроса': [1] | |
| } | |
| test_df = pd.DataFrame(test_data) | |
| features = extractor.extract_all_features(test_df.iloc[0]) | |
| print("🎯 ТЕСТ ИСПРАВЛЕННОЙ ВЕРСИИ:") | |
| print(f"Композитный показатель: {features['composite_quality_score']:.3f}") | |
| print(f"Грамматическое качество: {features['grammar_quality']:.3f}") | |
| print(f"Стилевой показатель: {features['style_score']:.3f}") | |
| print(f"Количество слов: {features['word_count']}") |