mirror of
https://github.com/karma-riuk/crab-webapp.git
synced 2025-07-05 14:18:12 +02:00
now using dataset and doing a better job with bleu
calculations and paraphrases
This commit is contained in:
@ -1,38 +1,39 @@
|
|||||||
# utils/process_data.py
|
|
||||||
import json
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
from utils.handlers import get_build_handler
|
||||||
from .paths import get_project_path
|
from .paths import get_project_path
|
||||||
from sacrebleu import sentence_bleu as bleu
|
from sacrebleu import sentence_bleu as bleu
|
||||||
|
from utils.dataset import ArchiveState, Dataset
|
||||||
|
|
||||||
|
REFERENCE_MAP = Dataset.from_json(
|
||||||
|
str(get_project_path('../data/dataset.json'))
|
||||||
|
).build_reference_map()
|
||||||
|
|
||||||
def build_reference_map(dataset_path: str) -> dict[str, list[str]]:
|
ARCHIVES_ROOT = str(get_project_path('../data/archives'))
|
||||||
ref_map = {}
|
|
||||||
data = json.loads(open(dataset_path).read())
|
|
||||||
for entry in data['entries']:
|
|
||||||
id_ = entry['metadata']['id']
|
|
||||||
comments = entry['comments']
|
|
||||||
ref_map[id_] = [c['body'] for c in comments]
|
|
||||||
return ref_map
|
|
||||||
|
|
||||||
|
|
||||||
REFERENCE_MAP = build_reference_map(str(get_project_path('../data/dataset.json')))
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate_comments(answers: dict[str, str], percent_cb):
|
def evaluate_comments(answers: dict[str, str], percent_cb):
|
||||||
total = len(answers)
|
total = len(answers)
|
||||||
results = {}
|
results = {}
|
||||||
|
print(REFERENCE_MAP.keys())
|
||||||
for i, (id_, gen) in enumerate(answers.items(), 1):
|
for i, (id_, gen) in enumerate(answers.items(), 1):
|
||||||
if id_ not in REFERENCE_MAP:
|
if id_ not in REFERENCE_MAP:
|
||||||
print(f'id: "{id_}" is not present in the dataset', file=sys.stderr)
|
print(f'id: "{id_}" is not present in the dataset', file=sys.stderr)
|
||||||
continue
|
continue
|
||||||
paraphrases = REFERENCE_MAP[id_]
|
entry = REFERENCE_MAP[id_]
|
||||||
max_score = 0.0
|
max_score = 0
|
||||||
scores = []
|
scores = []
|
||||||
for p in paraphrases:
|
for p in [entry.comments[0].body] + entry.comments[0].paraphrases:
|
||||||
score = bleu(gen, [p]).score
|
score = round(bleu(gen, [p]).score, 2)
|
||||||
scores.append(score)
|
scores.append(score)
|
||||||
max_score = max(max_score, score)
|
max_score = max(max_score, score)
|
||||||
results[id_] = {'max_bleu_score': max_score, 'bleu_scores': scores, 'proposed_comment': gen}
|
|
||||||
|
print(scores)
|
||||||
|
results[id_] = {
|
||||||
|
'max_bleu_score': max_score,
|
||||||
|
'bleu_scores': scores,
|
||||||
|
'proposed_comment': gen,
|
||||||
|
}
|
||||||
percent_cb(int(i / total * 100))
|
percent_cb(int(i / total * 100))
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user