# TODO: this module needs refactoring to merge into GraphGen framework
"""Evaluate the quality of the generated text using various metrics"""

import argparse
import json
import os

import pandas as pd
from dotenv import load_dotenv

from graphgen.bases.datatypes import QAPair

from .models import LengthEvaluator, MTLDEvaluator, RewardEvaluator, UniEvaluator
from .utils import logger, set_logger
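
# Example invocation (a sketch, not a documented CLI; the module path
# "graphgen.evaluate" is an assumption based on the package-relative imports above):
#   python -m graphgen.evaluate --folder cache/data --output cache/output \
#       --reward OpenAssistant/reward-model-deberta-v3-large-v2 --uni MingZhong/unieval-sum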

module_dir = os.path.abspath(os.path.dirname(__file__))
set_logger(os.path.join(module_dir, "cache", "logs", "evaluate.log"))

load_dotenv()


def evaluate_length(corpus, tokenizer_name):
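    """Return the average length score of the corpus under the given tokenizer."""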
    length_evaluator = LengthEvaluator(tokenizer_name=tokenizer_name)
    logger.info("Length evaluator loaded")
    scores = length_evaluator.get_average_score(corpus)
    logger.info("Length scores: %s", scores)
    return scores


def evaluate_mtld(corpus):
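    """Return the average and min/max MTLD (lexical diversity) scores for the corpus."""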
    mtld_evaluator = MTLDEvaluator()
    logger.info("MTLD evaluator loaded")
    scores = mtld_evaluator.get_average_score(corpus)
    logger.info("MTLD scores: %s", scores)
    min_max_scores = mtld_evaluator.get_min_max_score(corpus)
    logger.info("MTLD min max scores: %s", min_max_scores)
    return scores, min_max_scores


def evaluate_reward(corpus, reward_model_names):
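    """Score the corpus with each reward model in turn, freeing GPU memory between models."""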
    scores = []
    for reward_name in reward_model_names:
        reward_evaluator = RewardEvaluator(reward_name=reward_name)
        logger.info("Loaded reward model: %s", reward_name)
        average_score = reward_evaluator.get_average_score(corpus)
        logger.info("%s scores: %s", reward_name, average_score)
        min_max_scores = reward_evaluator.get_min_max_score(corpus)
        logger.info("%s min max scores: %s", reward_name, min_max_scores)
        scores.append(
            {
                "reward_name": reward_name.split("/")[-1],
                "score": average_score,
                "min_max_scores": min_max_scores,
            }
        )
        del reward_evaluator
        clean_gpu_cache()
    return scores


def evaluate_uni(corpus, uni_model_name):
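    """Run UniEval on the corpus and return the average and min/max scores
    for naturalness, coherence, and understandability.
    """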
    uni_evaluator = UniEvaluator(model_name=uni_model_name)
    logger.info("Uni evaluator loaded with model %s", uni_model_name)
    uni_scores = uni_evaluator.get_average_score(corpus)
    for key, value in uni_scores.items():
        logger.info("Uni %s scores: %s", key, value)
    min_max_scores = uni_evaluator.get_min_max_score(corpus)
    for key, value in min_max_scores.items():
        logger.info("Uni %s min max scores: %s", key, value)
    del uni_evaluator
    clean_gpu_cache()
    return (
        uni_scores["naturalness"],
        uni_scores["coherence"],
        uni_scores["understandability"],
        min_max_scores["naturalness"],
        min_max_scores["coherence"],
        min_max_scores["understandability"],
    )


def clean_gpu_cache():
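    """Free cached GPU memory, if CUDA is available."""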
    import torch

    if torch.cuda.is_available():
        torch.cuda.empty_cache()


if __name__ == "__main__":
    import torch.multiprocessing as mp

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--folder", type=str, default="cache/data", help="folder to load data from"
    )
    parser.add_argument(
        "--output", type=str, default="cache/output", help="directory to save output"
    )

    parser.add_argument(
        "--tokenizer", type=str, default="cl100k_base", help="tokenizer name"
    )
    parser.add_argument(
        "--reward",
        type=str,
        default="OpenAssistant/reward-model-deberta-v3-large-v2",
        help="comma-separated list of reward model names",
    )
    parser.add_argument(
        "--uni", type=str, default="MingZhong/unieval-sum", help="UniEval model name"
    )

    args = parser.parse_args()

    if not os.path.exists(args.folder):
        raise ValueError(f"Folder {args.folder} does not exist")

    os.makedirs(args.output, exist_ok=True)

    reward_models = args.reward.split(",")

    results = []

    logger.info("Loading data from %s", args.folder)
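    # CUDA cannot be re-initialized in a forked subprocess, so use "spawn" here.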
    mp.set_start_method("spawn")

    for file in os.listdir(args.folder):
        if file.endswith(".json"):
            logger.info("Processing %s", file)
            with open(os.path.join(args.folder, file), "r", encoding="utf-8") as f:
                data = json.load(f)
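            # Each input JSON file is expected to map an id to an object with
            # "question" and "answer" fields.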
            data = [
                QAPair(question=data[key]["question"], answer=data[key]["answer"])
                for key in data
            ]

            length_scores = evaluate_length(data, args.tokenizer)
            mtld_scores, min_max_mtld_scores = evaluate_mtld(data)
            reward_scores = evaluate_reward(data, reward_models)
            (
                uni_naturalness_scores,
                uni_coherence_scores,
                uni_understandability_scores,
                min_max_uni_naturalness_scores,
                min_max_uni_coherence_scores,
                min_max_uni_understandability_scores,
            ) = evaluate_uni(data, args.uni)

            result = {
                "file": file,
                "number": len(data),
                "length": length_scores,
                "mtld": mtld_scores,
                "mtld_min_max": min_max_mtld_scores,
                "uni_naturalness": uni_naturalness_scores,
                "uni_coherence": uni_coherence_scores,
                "uni_understandability": uni_understandability_scores,
                "uni_naturalness_min_max": min_max_uni_naturalness_scores,
                "uni_coherence_min_max": min_max_uni_coherence_scores,
                "uni_understandability_min_max": min_max_uni_understandability_scores,
            }
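            # One result column per reward model (short name), plus its min/max scores.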
            for reward_score in reward_scores:
                result[reward_score["reward_name"]] = reward_score["score"]
                result[f"{reward_score['reward_name']}_min_max"] = reward_score[
                    "min_max_scores"
                ]

            results.append(result)

    results = pd.DataFrame(results)
    results.to_csv(os.path.join(args.output, "evaluation.csv"), index=False)