From f5bdfd1a1bdb98090f27fd40d914dc89f5672691 Mon Sep 17 00:00:00 2001 From: Karma Riuk Date: Tue, 10 Jun 2025 20:45:51 +0200 Subject: [PATCH] the input to code refinement now ignores paraphrases --- dataset.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/dataset.py b/dataset.py index 69a07f1..d65a062 100644 --- a/dataset.py +++ b/dataset.py @@ -111,12 +111,28 @@ class CommentGenEntry: diffs=entry.diffs_before, ) +@dataclass +class CodeRefinementComment: + body: str + file: str + from_: int + to: int + + @classmethod + def from_comment(cls, comment: Comment) -> "CodeRefinementComment": + return cls( + body=comment.body, + file=comment.file, + from_=comment.from_, + to=comment.to, + ) + @dataclass class CodeRefinementEntry: id: str files: Dict[str, str] # filename -> file content diffs: Dict[str, str] # filename -> diff, diffs between the opening of the PR and the comment - comments: List[Comment] + comments: List[CodeRefinementComment] @staticmethod def from_entry(entry: DatasetEntry) -> "CodeRefinementEntry": @@ -124,7 +140,7 @@ class CodeRefinementEntry: id=entry.metadata.id, files={fname: fdata.content_before_pr for fname, fdata in entry.files.items()}, diffs=entry.diffs_before, - comments=entry.comments, + comments=[CodeRefinementComment.from_comment(c) for c in entry.comments], ) # fmt: on