Mirror of https://github.com/karma-riuk/crab.git (synced 2025-07-05 05:28:13 +02:00)
created the dataclasses for the "triplets"
dataset.py (new file, 51 lines added)
@@ -0,0 +1,51 @@
from dataclasses import dataclass, field
from typing import List
import json


@dataclass
class FileData:
    path: str
    content: str  # not sure about this, maybe we should just keep the path and extract the contents dynamically (no idea)


@dataclass
class Metadata:
    repo: str  # the name of the repo, in the form XXX/YYY
    pr_number: int
    merge_commit_sha: str  # to checkout for the tests
    successful: bool
    reason_for_failure: str
    last_cmd_error_msg: str


@dataclass
class DatasetEntry:
    metadata: Metadata
    files: List[FileData]  # files before the PR (before the first PR commits)
    diffs_before: List[str]  # diffs between the opening of the PR and the comment
    comment: str
    diffs_after: List[str]  # changes after the comment


@dataclass
class Dataset:
    entries: List[DatasetEntry] = field(default_factory=list)

    def to_json(self, filename: str):
        """Serialize the dataset to a JSON file"""
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(self, f, default=lambda o: o.__dict__, indent=4)

    @staticmethod
    def from_json(filename: str) -> "Dataset":
        """Load the dataset from a JSON file"""
        with open(filename, "r", encoding="utf-8") as f:
            data = json.load(f)
            return Dataset(
                entries=[
                    DatasetEntry(
                        metadata=Metadata(**entry["metadata"]),
                        files=[FileData(**file) for file in entry["files"]],
                        diffs_before=entry["diffs_before"],
                        comment=entry["comment"],
                        diffs_after=entry["diffs_after"]
                    ) for entry in data["entries"]
                ]
            )
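For context, a minimal round-trip sketch of how these dataclasses might be used; the repository name, PR number, SHA, file paths, diffs, and comment below are made-up placeholders, not values from the real dataset:

# Hypothetical usage sketch: every value here is a placeholder, not real data.
entry = DatasetEntry(
    metadata=Metadata(
        repo="owner/project",          # placeholder, XXX/YYY style
        pr_number=1,
        merge_commit_sha="0000000",
        successful=True,
        reason_for_failure="",
        last_cmd_error_msg="",
    ),
    files=[FileData(path="src/Main.java", content="class Main {}")],
    diffs_before=["<diff between the PR opening and the comment>"],
    comment="Consider renaming this class.",
    diffs_after=["<diff of the changes made after the comment>"],
)

dataset = Dataset(entries=[entry])
dataset.to_json("dataset.json")              # nested dataclasses serialized via __dict__
restored = Dataset.from_json("dataset.json")
assert restored.entries[0].comment == entry.comment

The `default=lambda o: o.__dict__` hook is what lets `json.dump` walk the nested dataclasses without a custom encoder, and `from_json` rebuilds the typed objects from the resulting plain dictionaries.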