diff --git a/dataset.py b/dataset.py
index b02b452..3edf306 100644
--- a/dataset.py
+++ b/dataset.py
@@ -4,7 +4,7 @@ import json
 
 # fmt: off
 @dataclass
-class FileData_new:
+class FileData:
     is_code_related: bool
     coverage: Dict[str, float] # jacoco-report -> coverage
     content_before_pr: str = ""
@@ -18,7 +18,7 @@ class Comment:
     to: int
 
 @dataclass
-class Metadata_new:
+class Metadata:
     repo: str # the name of the repo, with style XXX/YYY
     pr_number: int
     pr_title: str
@@ -31,9 +31,9 @@ class Metadata_new:
 
 
 @dataclass
-class DatasetEntry_new:
-    metadata: Metadata_new
-    files: Dict[str, FileData_new] # filename -> file data, files before the PR (before the first PR commits)
+class DatasetEntry:
+    metadata: Metadata
+    files: Dict[str, FileData] # filename -> file data, files before the PR (before the first PR commits)
     diffs_before: Dict[str, str] # filename -> diff, diffs between the opening of the PR and the comment
     comments: List[Comment]
     diffs_after: Dict[str, str] # filename -> diff, changes after the comment
@@ -41,8 +41,8 @@ class DatasetEntry_new:
 # fmt: on
 
 @dataclass
-class Dataset_new:
-    entries: List[DatasetEntry_new] = field(default_factory=list)
+class Dataset:
+    entries: List[DatasetEntry] = field(default_factory=list)
 
     def __len__(self) -> int:
         return sum(1 for entry in self.entries if entry.metadata.successful)
@@ -53,7 +53,7 @@ class Dataset_new:
             json.dump(self, f, default=lambda o: o.__dict__, indent=4)
 
     @staticmethod
-    def from_json(filename: str, keep_still_in_progress: bool = False) -> "Dataset_new":
+    def from_json(filename: str, keep_still_in_progress: bool = False) -> "Dataset":
         with open(filename, "r", encoding="utf-8") as f:
             print(f"Loading dataset from {filename}...", end="")
             data = json.load(f)
@@ -62,7 +62,7 @@ class Dataset_new:
         entries = []
         for entry_data in data["entries"]:
             metadata_data = entry_data["metadata"]
-            metadata = Metadata_new(**metadata_data)
+            metadata = Metadata(**metadata_data)
 
             if (
                 not keep_still_in_progress
@@ -70,11 +70,11 @@ class Dataset_new:
             ):
                 continue
 
-            files = {fname: FileData_new(**fdata) for fname, fdata in entry_data["files"].items()}
+            files = {fname: FileData(**fdata) for fname, fdata in entry_data["files"].items()}
 
             comments = [Comment(**comment) for comment in entry_data["comments"]]
 
-            entry = DatasetEntry_new(
+            entry = DatasetEntry(
                 metadata=metadata,
                 files=files,
                 diffs_before=entry_data["diffs_before"],
@@ -83,4 +83,4 @@ class DatasetEntry_new:
             )
             entries.append(entry)
 
-        return Dataset_new(entries=entries)
+        return Dataset(entries=entries)
diff --git a/pull_requests.py b/pull_requests.py
index e91a71f..5bf4196 100644
--- a/pull_requests.py
+++ b/pull_requests.py
@@ -12,10 +12,10 @@ from datetime import datetime
 
 from dataset import (
     Comment,
-    Dataset_new,
-    DatasetEntry_new,
-    FileData_new,
-    Metadata_new,
+    Dataset,
+    DatasetEntry,
+    FileData,
+    Metadata,
 )
 from errors import (
     CantCheckoutCommitError,
@@ -137,7 +137,7 @@ def try_read_file(fname: str) -> str:
         return "File listed in PR is a directory (likely a submodule), to be ignored"
 
 
-def get_files(pr: PullRequest, repo: Repository, repo_path: str) -> dict[str, FileData_new]:
+def get_files(pr: PullRequest, repo: Repository, repo_path: str) -> dict[str, FileData]:
     ret = {}
     for file in pr.get_files():
         try:
@@ -167,7 +167,7 @@ def get_files(pr: PullRequest, repo: Repository, repo_path: str) -> dict[str, Fi
         checkout(repo_path, pr)
         contents_after = try_read_file(os.path.join(repo_path, file.filename))
 
-        ret[file.filename] = FileData_new(
+        ret[file.filename] = FileData(
            is_code_related=file.filename.endswith('.java'),
            coverage={},
            content_before_pr=contents_before,
@@ -196,16 +196,16 @@ def get_comments(pr: PullRequest) -> list[Comment]:
 def process_pull(
     repo: Repository,
     pr: PullRequest,
-    dataset: Dataset_new,
+    dataset: Dataset,
     repos_dir: str,
-    cache: dict[str, dict[int, DatasetEntry_new]] = {},
+    cache: dict[str, dict[int, DatasetEntry]] = {},
 ):
     if pr.number in cache.get(repo.full_name, set()):
         dataset.entries.append(cache[repo.full_name][pr.number])
         return
 
-    entry = DatasetEntry_new(
-        metadata=Metadata_new(
+    entry = DatasetEntry(
+        metadata=Metadata(
             repo.full_name,
             pr.number,
             pr.title,
@@ -321,9 +321,9 @@ def process_pull(
 
 def process_repo(
     repo_name: str,
-    dataset: Dataset_new,
+    dataset: Dataset,
     repos_dir: str,
-    cache: dict[str, dict[int, DatasetEntry_new]] = {},
+    cache: dict[str, dict[int, DatasetEntry]] = {},
 ):
     repo = g.get_repo(repo_name)
     if repo.full_name in cache:
@@ -348,9 +348,9 @@ def process_repo(
 
 def process_repos(
     df: pd.DataFrame,
-    dataset: Dataset_new,
+    dataset: Dataset,
     repos_dir: str,
-    cache: dict[str, dict[int, DatasetEntry_new]] = {},
+    cache: dict[str, dict[int, DatasetEntry]] = {},
 ):
     """
     Processes the repos in the given csv file, extracting the good ones and
@@ -378,9 +378,9 @@ def process_repos(
 
 
 def only_inject_jacoco(
-    dataset: Dataset_new,
+    dataset: Dataset,
     repos_dir: str,
-    cache: dict[str, dict[int, DatasetEntry_new]] = {},
+    cache: dict[str, dict[int, DatasetEntry]] = {},
 ):
     n_successfull_injections = 0
     n_tried_injections = 0
@@ -468,13 +468,13 @@ if __name__ == "__main__":
     if args.only_repo is not None:
         df = df.loc[df["name"] == args.only_repo]
 
-    cache: dict[str, dict[int, DatasetEntry_new]] = defaultdict(dict)
+    cache: dict[str, dict[int, DatasetEntry]] = defaultdict(dict)
     if args.cache is not None:
-        cache_dataset = Dataset_new.from_json(args.cache)
+        cache_dataset = Dataset.from_json(args.cache)
         for cache_entry in cache_dataset.entries:
             cache[cache_entry.metadata.repo][cache_entry.metadata.pr_number] = cache_entry
 
-    dataset = Dataset_new()
+    dataset = Dataset()
     try:
         if args.only_inject_jacoco:
             only_inject_jacoco(dataset, args.repos, cache)