from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional
import argparse
import json
import os

from utils import prompt_yes_no

# fmt: off
@dataclass
class FileData:
    is_code_related: bool
    coverage: Dict[str, float]  # jacoco-report -> coverage
    content_before_pr: str = ""
    content_after_pr: str = ""


@dataclass
class Comment:
    body: str
    file: str
    from_: int
    to: int


@dataclass
class Selection:
    comment_suggests_change: bool
    diff_after_address_change: Optional[bool]
    good: Optional[bool]


@dataclass
class Metadata:
    repo: str  # the name of the repo, with style XXX/YYY
    pr_number: int
    pr_title: str
    pr_body: str
    merge_commit_sha: str  # to checkout for the tests
    successful: bool = True
    build_system: str = ""
    reason_for_failure: str = ""
    last_cmd_error_msg: str = ""
    selection: Optional[Selection] = None


@dataclass
class DatasetEntry:
    metadata: Metadata
    # filename -> file data, files before the PR (before the first PR commits)
    files: Dict[str, FileData]
    # filename -> diff, diffs between the opening of the PR and the comment
    diffs_before: Dict[str, str]
    comments: List[Comment]
    # filename -> diff, changes after the comment
    diffs_after: Dict[str, str]


@dataclass
class CommentGenEntry:
    id: int
    files: Dict[str, str]  # filename -> file content
    diffs: Dict[str, str]  # filename -> diff, diffs between the opening of the PR and the comment

    @staticmethod
    def from_entry(entry: DatasetEntry, id: int) -> "CommentGenEntry":
        return CommentGenEntry(
            id=id,
            files={fname: fdata.content_before_pr for fname, fdata in entry.files.items()},
            diffs=entry.diffs_before,
        )


@dataclass
class CodeRefinementEntry:
    id: int
    files: Dict[str, str]  # filename -> file content
    diffs: Dict[str, str]  # filename -> diff, diffs between the opening of the PR and the comment
    comments: List[Comment]

    @staticmethod
    def from_entry(entry: DatasetEntry, id: int) -> "CodeRefinementEntry":
        return CodeRefinementEntry(
            id=id,
            files={fname: fdata.content_before_pr for fname, fdata in entry.files.items()},
            diffs=entry.diffs_before,
            comments=entry.comments,
        )


class OutputType(Enum):
    FULL = "full"
    CODE_REFINEMENT = "code_refinement"
    COMMENT_GEN = "comment_gen"
# fmt: on


@dataclass
class Dataset:
    entries: List[DatasetEntry] = field(default_factory=list)

    def __len__(self) -> int:
        return sum(1 for entry in self.entries if entry.metadata.successful)

    def to_json(self, filename: str, type_: OutputType = OutputType.FULL) -> None:
        """Serialize the dataset to a JSON file."""
        entries_to_dump = self.entries

        if type_ == OutputType.COMMENT_GEN:
            entries_to_dump = [
                entry
                for entry in self.entries
                if entry.metadata.selection and entry.metadata.selection.comment_suggests_change
            ]
        elif type_ == OutputType.CODE_REFINEMENT:
            entries_to_dump = [
                entry
                for entry in self.entries
                if entry.metadata.selection and entry.metadata.selection.diff_after_address_change
            ]

        entry_counter = -1
        to_dump = Dataset(entries=entries_to_dump)

        def transform_entry(entry: DatasetEntry | Dataset | Any) -> dict | list:
            # Leaf objects (Metadata, Comment, FileData, ...) are serialized
            # through their __dict__; json.dump calls this for anything it
            # cannot serialize natively.
            if not isinstance(entry, (DatasetEntry, Dataset)):
                return entry.__dict__

            if type_ == OutputType.FULL:
                return entry.__dict__

            if isinstance(entry, Dataset):
                return entry.entries

            # Each dumped entry gets a sequential id.
            nonlocal entry_counter
            entry_counter += 1
            if type_ == OutputType.COMMENT_GEN:
                return CommentGenEntry.from_entry(entry, entry_counter).__dict__
            if type_ == OutputType.CODE_REFINEMENT:
                return CodeRefinementEntry.from_entry(entry, entry_counter).__dict__

        with open(filename, "w", encoding="utf-8") as f:
            json.dump(to_dump, f, default=transform_entry, indent=4)

    @staticmethod
    def from_json(filename: str, keep_still_in_progress: bool = False) -> "Dataset":
        """Deserialize a dataset from a JSON file produced by to_json."""
        with open(filename, "r", encoding="utf-8") as f:
            print(f"Loading dataset from {filename}...", end=" ", flush=True)
            data = json.load(f)
            print("Done")

        entries = []
        for entry_data in data["entries"]:
            metadata_data = entry_data["metadata"]
            selection_data = metadata_data["selection"] if "selection" in metadata_data else None
            selection = Selection(**selection_data) if selection_data else None
            metadata_data["selection"] = selection
            metadata = Metadata(**metadata_data)
            # Skip entries that were interrupted mid-processing, unless the
            # caller explicitly asks to keep them.
            if (
                not keep_still_in_progress
                and metadata.reason_for_failure == "Was still being processed"
            ):
                continue

            files = {fname: FileData(**fdata) for fname, fdata in entry_data["files"].items()}
            comments = [Comment(**comment) for comment in entry_data["comments"]]

            entry = DatasetEntry(
                metadata=metadata,
                files=files,
                diffs_before=entry_data["diffs_before"],
                comments=comments,
                diffs_after=entry_data["diffs_after"],
            )
            entries.append(entry)

        return Dataset(entries=entries)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Dataset class")
    parser.add_argument(
        "-f",
        "--filename",
        type=str,
        required=True,
        help="Path to the JSON file to load",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        default="output.json",
        help="Path to the output JSON file",
    )
    parser.add_argument(
        "-t",
        "--output_type",
        choices=[mode.value for mode in OutputType],
        default=OutputType.FULL.value,
        help="Type of output to generate",
    )
    args = parser.parse_args()

    dataset = Dataset.from_json(args.filename)
    print(f"Loaded {len(dataset)} entries from {args.filename}")

    if os.path.exists(args.output):
        overwrite = prompt_yes_no(
            f"Output file {args.output} already exists. Do you want to overwrite it?"
        )
        if not overwrite:
            print("Exiting without saving.")
            exit(0)

    print(f"Saving dataset to {args.output}...", end=" ", flush=True)
    dataset.to_json(args.output, OutputType(args.output_type))
    print("Done")
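
# Minimal programmatic usage sketch, beyond the CLI above. The module name
# "dataset" and the file paths are assumptions for illustration, not part of
# this module:
#
#     from dataset import Dataset, OutputType
#
#     ds = Dataset.from_json("dataset.json")      # assumes a file written by to_json
#     print(len(ds))                              # counts only successful entries
#     ds.to_json("comment_gen.json", OutputType.COMMENT_GEN)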