first draft of migration to augment the data

This commit is contained in:
Karma Riuk
2025-03-28 10:22:29 +01:00
parent 0ed221acb8
commit 649043d9f0

View File

@ -1,7 +1,7 @@
from dataclasses import dataclass, field
from typing import Dict, List
import json, os
from github import Github
from collections import defaultdict
@ -11,6 +11,19 @@ class FileData:
path: str
content: str = ""  # Not sure about this, maybe we should just keep the path and extract the contents dynamically (boh)
@dataclass
class FileData_new:
    """New-format per-file record (draft migration target of FileData)."""

    # Whether the file counts as code for the dataset — presumably set
    # during dataset construction; TODO confirm exact semantics.
    is_code_related: bool
    coverage: Dict[str, float]  # jacoco-report -> coverage
    # Snapshot of the file contents before the PR's first commit.
    content_before_pr: str = ""
    # Snapshot of the file contents after the PR.
    content_after_pr: str = ""
@dataclass
class Comment:
    """A single PR review comment and the line range it refers to."""

    # Text of the review comment.
    body: str
    # Path of the file the comment is attached to.
    file: str
    # First commented line ("from" is a Python keyword, hence the trailing
    # underscore); equals `to` for single-line comments (see fix_metadata).
    from_: int
    # Last commented line.
    to: int
@dataclass
class Metadata:
@ -24,6 +37,18 @@ class Metadata:
reason_for_failure: str = ""
last_cmd_error_msg: str = ""
@dataclass
class Metadata_new:
    """New-format metadata describing the PR a dataset entry came from."""

    repo: str  # the name of the repo, with style XXX/YYY
    pr_number: int
    pr_title: str
    pr_body: str
    merge_commit_sha: str  # to checkout for the tests
    # Processing-status fields below mirror the legacy Metadata class.
    successful: bool = True
    build_system: str = ""
    reason_for_failure: str = ""
    last_cmd_error_msg: str = ""
@dataclass
class DatasetEntry:
@ -33,6 +58,14 @@ class DatasetEntry:
comments: List[str]
diffs_after: Dict[str, str]  # filename -> diff, changes after the comment
@dataclass
class DatasetEntry_new:
    """New-format dataset entry (draft migration target of DatasetEntry)."""

    # NOTE(review): typed as the legacy Metadata — should this be
    # Metadata_new? Confirm before finishing the migration.
    metadata: Metadata
    files: Dict[str, FileData_new]  # filename -> file data, files before the PR (before the first PR commits)
    diffs_before: Dict[str, str]  # filename -> diff, diffs between the opening of the PR and the comment
    comments: List[Comment]
    diffs_after: Dict[str, str]  # filename -> diff, changes after the comment
# fmt: on
@dataclass
@ -50,7 +83,9 @@ class Dataset:
@staticmethod
def from_json(filename: str, keep_still_in_progress: bool = False) -> "Dataset":
with open(filename) as f:
print(f"Loading dataset from {filename}...", end="")
data = json.load(f)
print("Done")
entries = []
for entry_data in data["entries"]:
@ -74,3 +109,56 @@ class Dataset:
entries.append(entry)
return Dataset(entries=entries)
@dataclass
class Dataset_new:
    """New-format dataset: a collection of DatasetEntry_new records."""

    entries: List[DatasetEntry_new] = field(default_factory=list)

    def __len__(self) -> int:
        # Only entries whose processing succeeded count towards the size.
        return sum(1 for entry in self.entries if entry.metadata.successful)

    def to_json(self, filename: str):
        """Serialize the dataset to a JSON file"""
        with open(filename, "w", encoding="utf-8") as f:
            # default=lambda o: o.__dict__ serializes the nested dataclasses
            # without needing a custom encoder.
            json.dump(self, f, default=lambda o: o.__dict__, indent=4)

    @staticmethod
    def from_json(filename: str, keep_still_in_progress: bool = False) -> "Dataset_new":
        """Deserialization of the new format is not implemented yet.

        Raises:
            NotImplementedError: always.
        """
        # Fixed the return annotation: this constructs a Dataset_new,
        # not a legacy Dataset.
        raise NotImplementedError("This method is not implemented yet")
def migrate(dataset: Dataset) -> Dataset_new:
    # TODO: migration logic not written yet — this first-draft stub ignores
    # `dataset` and returns an empty new-format dataset.
    ret = Dataset_new()
    return ret
def fix_metadata(metadata_data: dict, gh=None) -> None:
    """Backfill missing fields on a legacy metadata dict, in place.

    Looks up the PR on GitHub and fills in ``pr_body``, ``pr_title`` and
    ``commented_lines_from_to`` when they are absent; fields already present
    are left untouched.

    Args:
        metadata_data: legacy metadata dict with at least ``repo``
            (``"owner/name"`` style) and ``pr_number`` keys.
        gh: GitHub client to use; defaults to the module-level ``g``
            (kept for backward compatibility with existing callers).
    """
    if gh is None:
        gh = g  # module-level client created in the __main__ block
    repo = gh.get_repo(metadata_data["repo"])
    pr = repo.get_pull(metadata_data["pr_number"])
    if "pr_body" not in metadata_data:
        # BUG FIX: the draft had body/title swapped — pr_body comes from pr.body.
        metadata_data["pr_body"] = pr.body
    if "pr_title" not in metadata_data:
        metadata_data["pr_title"] = pr.title
    if "commented_lines_from_to" not in metadata_data:
        metadata_data["commented_lines_from_to"] = {}
        for comment in pr.get_review_comments():
            to = comment.line
            from_ = comment.start_line
            if from_ is None:
                # Single-line review comments have no start_line.
                from_ = to
            metadata_data["commented_lines_from_to"][comment.body] = {
                "from": from_,
                "to": to,
            }
if __name__ == "__main__":
    # `g` intentionally stays module-global: fix_metadata() reads it.
    g = Github(os.environ["GITHUB_AUTH_TOKEN_CRAB"])

    # Load the legacy dataset, migrate it to the new schema, and persist it.
    old_dataset = Dataset.from_json("dataset.json")
    migrated = migrate(old_dataset)
    print("done, saving...")
    migrated.to_json("dataset.new.json")