Spaces:

lidiiakarmanova
/

exam-evaluator

Sleeping

KarmanovaLidiia

Initial clean commit for HF Space (models via Git LFS)

bcb314a about 1 month ago

3.65 kB

	# src/features.py
	from __future__ import annotations
	import re
	from pathlib import Path
	from typing import List

	import numpy as np
	import pandas as pd


	_WORD_RE = re.compile(r"[A-Za-zА-Яа-яЁё0-9]+", re.UNICODE)
	_SENT_SPLIT_RE = re.compile(r"[.!?]+[\s\n]+")
	_PUNCT_RE = re.compile(r"[^\w\s]", re.UNICODE)


	def _tokenize_words(text: str) -> List[str]:
	if not isinstance(text, str) or not text:
	return []
	return _WORD_RE.findall(text)


	def _split_sentences(text: str) -> List[str]:
	if not isinstance(text, str) or not text.strip():
	return []
	parts = _SENT_SPLIT_RE.split(text.strip())
	return [p.strip() for p in parts if p.strip()]


	def build_baseline_features(df: pd.DataFrame) -> pd.DataFrame:
	"""
	Принимает датафрейм со столбцами:
	- question_number
	- question_text
	- answer_text
	- score (может отсутствовать на инференсе, тогда создадим NaN)

	Возвращает df с базовыми признаками:
	ans_len_chars, ans_len_words, ans_n_sents, ans_avg_sent_len,
	ans_ttr, ans_short_sent_rt, ans_punct_rt, q_len_words, has_intro
	"""

	out = df.copy()

	if "score" not in out.columns:
	out["score"] = np.nan

	# гарантируем строки
	out["question_text"] = out["question_text"].fillna("").astype(str)
	out["answer_text"] = out["answer_text"].fillna("").astype(str)

	# длины/токены
	ans_words = out["answer_text"].apply(_tokenize_words)
	q_words = out["question_text"].apply(_tokenize_words)
	ans_sents = out["answer_text"].apply(_split_sentences)

	out["ans_len_chars"] = out["answer_text"].str.len()
	out["ans_len_words"] = ans_words.apply(len).astype(int)

	out["ans_n_sents"] = ans_sents.apply(len).astype(int)
	out["ans_avg_sent_len"] = (
	out["ans_len_words"] / out["ans_n_sents"].replace({0: np.nan})
	).fillna(0).astype(float)

	# Type-Token Ratio
	def _ttr(ws: List[str]) -> float:
	return 0.0 if not ws else len(set(map(str.lower, ws))) / float(len(ws))

	out["ans_ttr"] = ans_words.apply(_ttr).astype(float)

	# доля коротких предложений (<= 5 слов)
	def _short_rate(sents: List[str]) -> float:
	if not sents:
	return 0.0
	cnt = 0
	for s in sents:
	if len(_tokenize_words(s)) <= 5:
	cnt += 1
	return cnt / float(len(sents))

	out["ans_short_sent_rt"] = ans_sents.apply(_short_rate).astype(float)

	# доля пунктуации в ответе
	def _punct_ratio(text: str) -> float:
	if not text:
	return 0.0
	punct = len(_PUNCT_RE.findall(text))
	return punct / float(len(text))

	out["ans_punct_rt"] = out["answer_text"].apply(_punct_ratio).astype(float)

	# длина вопроса в словах
	out["q_len_words"] = q_words.apply(len).astype(int)

	# наличие вводной части — простая эвристика
	out["has_intro"] = out["answer_text"].str.contains(
	r"\b(во-первых\|например\|сначала\|итак\|сперва\|прежде всего)\b",
	case=False, na=False
	).astype(float)

	# порядок колонок
	cols = [
	"question_number", "question_text", "answer_text", "score",
	"ans_len_chars", "ans_len_words", "ans_n_sents", "ans_avg_sent_len",
	"ans_ttr", "ans_short_sent_rt", "ans_punct_rt", "q_len_words", "has_intro",
	]
	cols = [c for c in cols if c in out.columns]
	return out[cols]