now using only the new dataset version

Commit 669049b7a4 (parent 35bd296c7c)
Author: Karma Riuk
Date: 2025-03-31 14:17:36 +02:00
4 changed files with 309 additions and 131 deletions


@@ -131,7 +131,7 @@ class Dataset_new:
         json.dump(self, f, default=lambda o: o.__dict__, indent=4)

     @staticmethod
-    def from_json(filename: str, keep_still_in_progress: bool = False) -> "Dataset":
+    def from_json(filename: str, keep_still_in_progress: bool = False) -> "Dataset_new":
         raise NotImplementedError("This method is not implemented yet")

errors.py (new file, 45 lines)

@@ -0,0 +1,45 @@
from abc import ABC


class SetupException(Exception, ABC):
    reason_for_failure: str


class NoDiffsBeforeError(SetupException):
    reason_for_failure = "Couldn't get the diffs before the first commit"


class NoDiffsAfterError(SetupException):
    reason_for_failure = "Couldn't get the diffs after the last commit"


class CantCloneRepoError(SetupException):
    reason_for_failure = "Couldn't clone the repository"


class CantEnsureFullHistoryError(SetupException):
    reason_for_failure = "Couldn't ensure the full history of the repo (fetch --unshallow)"


class CantFetchPRError(SetupException):
    reason_for_failure = "Couldn't fetch the PR's merge commit"


class CantCheckoutCommitError(SetupException):
    reason_for_failure = (
        "Couldn't checkout the PR's merge commit (even after fetching the pull/<pr_number>/merge)"
    )


class MultipleFilesError(SetupException):
    reason_for_failure = (
        "When requesting the contents of a file, a list of ContentFile was returned"
    )


class NotValidDirectory(SetupException):
    reason_for_failure = "The directory is not valid"


class CantFindBuildFile(SetupException):
    reason_for_failure = "Couldn't find the build file in the directory"

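Every concrete subclass overrides the class-level reason_for_failure string, so code that catches the base SetupException can record a stable, human-readable reason without branching on the exception type, while str(e) keeps the underlying details. A minimal sketch of that idea, using only the classes defined above (the path in the example is an illustrative placeholder):

from errors import CantFindBuildFile, SetupException

def describe(exc: SetupException) -> str:
    # The short reason comes from the class attribute; the message carries the specifics.
    return f"{exc.reason_for_failure}: {exc}"

try:
    raise CantFindBuildFile("no pom.xml or build.gradle under /tmp/example-repo")
except SetupException as e:
    print(describe(e))  # Couldn't find the build file in the directory: no pom.xml or build.gradle under /tmp/example-repo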

@@ -1,10 +1,12 @@
 from abc import ABC, abstractmethod
-import os, re, docker, signal, sys, javalang
+import os, re, docker, signal, javalang
 from bs4 import BeautifulSoup
-from typing import Iterable, Optional, Tuple, Iterator
+from typing import Iterable, Tuple, Iterator
 import xml.etree.ElementTree as ET
 from javalang.tree import PackageDeclaration
+from errors import CantFindBuildFile, NotValidDirectory

 REPORT_SIZE_THRESHOLD = 400  # less than 400 bytes (characters), we don't care about it
@@ -77,7 +79,9 @@ class BuildHandler(ABC):
                 raise FailedToCompileError(output)
         except TimeoutError:
             self.updates["compiled_successfully"] = False
-            self.updates["error_msg"] = "Compile process killed due to exceeding the 1-hour time limit"
+            self.updates[
+                "error_msg"
+            ] = "Compile process killed due to exceeding the 1-hour time limit"
         finally:
             signal.alarm(0)  # Cancel the alarm
@@ -135,7 +139,9 @@ class BuildHandler(ABC):
         candidates = []
         for coverage_report_path in self.get_jacoco_report_paths():
             if not os.path.exists(coverage_report_path):
-                raise NoCoverageReportFound(f"Coverage report file '{coverage_report_path}' does not exist")
+                raise NoCoverageReportFound(
+                    f"Coverage report file '{coverage_report_path}' does not exist"
+                )
             fully_qualified_class = self._extract_fully_qualified_class(filename)
             candidates.append({"report_file": coverage_report_path, "fqc": fully_qualified_class})
@@ -220,7 +226,7 @@ class BuildHandler(ABC):
 class MavenHandler(BuildHandler):
-    def __init__(self, repo_path: str, build_file: str, updates: dict) -> None:
+    def __init__(self, repo_path: str, build_file: str, updates: dict = {}) -> None:
         super().__init__(repo_path, build_file, updates)
         self.base_cmd = "mvn -B -Dstyle.color=never -Dartifact.download.skip=true"
         # -B (Batch Mode): Runs Maven in non-interactive mode, reducing output and removing download progress bars.
@@ -265,7 +271,9 @@ class MavenHandler(BuildHandler):
             self.updates["n_tests_failed"] += failures
             self.updates["n_tests_errors"] += errors
             self.updates["n_tests_skipped"] += skipped
-            self.updates["n_tests_passed"] += tests_run - (failures + errors)  # Calculate passed tests
+            self.updates["n_tests_passed"] += tests_run - (
+                failures + errors
+            )  # Calculate passed tests

     def get_jacoco_report_paths(self) -> Iterable[str]:
         found_at_least_one = False
@@ -329,7 +337,7 @@ class MavenHandler(BuildHandler):
 class GradleHandler(BuildHandler):
-    def __init__(self, repo_path: str, build_file: str, updates: dict) -> None:
+    def __init__(self, repo_path: str, build_file: str, updates: dict = {}) -> None:
         super().__init__(repo_path, build_file, updates)
         self.base_cmd = "gradle --no-daemon --console=plain"
@@ -360,7 +368,9 @@ class GradleHandler(BuildHandler):
         test_results_path = os.path.join(self.path, "build/reports/tests/test/index.html")
         if not os.path.exists(test_results_path):
-            raise NoTestResultsToExtractError("No test results found (prolly a repo with sub-projects)")
+            raise NoTestResultsToExtractError(
+                "No test results found (prolly a repo with sub-projects)"
+            )

         # Load the HTML file
         with open(test_results_path, "r") as file:
@@ -374,7 +384,9 @@ class GradleHandler(BuildHandler):
             # counter_div = test_div.find("div", class_="counter")
             counter_div = test_div.select_one("div.counter")
             if counter_div is None:
-                raise NoTestResultsToExtractError("No test results found (not div.counter for tests)")
+                raise NoTestResultsToExtractError(
+                    "No test results found (not div.counter for tests)"
+                )

             self.updates["n_tests"] = int(counter_div.text.strip())
@@ -386,12 +398,16 @@ class GradleHandler(BuildHandler):
             # counter_div = failures_div.find("div", class_="counter")
             counter_div = failures_div.select_one("div.counter")
             if counter_div is None:
-                raise NoTestResultsToExtractError("No test results found (not div.counter for failures)")
+                raise NoTestResultsToExtractError(
+                    "No test results found (not div.counter for failures)"
+                )

             self.updates["n_tests_failed"] = int(counter_div.text.strip())

             # Calculate passed tests
-            self.updates["n_tests_passed"] = self.updates["n_tests"] - self.updates["n_tests_failed"]
+            self.updates["n_tests_passed"] = (
+                self.updates["n_tests"] - self.updates["n_tests_failed"]
+            )

     def get_jacoco_report_paths(self) -> Iterable[str]:
         found_at_least_one = False
@@ -577,7 +593,7 @@ def get_coverage_for_file(xml_file: str, target_fully_qualified_class: str, base
         return -1

-def get_build_handler(root: str, repo: str, updates: dict, verbose: bool = False) -> Optional[BuildHandler]:
+def get_build_handler(root: str, repo: str, verbose: bool = False) -> BuildHandler:
     """
     Get the path to the build file of a repository. The build file is either a
     `pom.xml`, `build.gradle`, or `build.xml` file.
@@ -592,38 +608,33 @@ def get_build_handler(root: str, repo: str, updates: dict, verbose: bool = False
     path = os.path.join(root, repo)
     # Check if the given path is a directory
     if not os.path.isdir(path):
-        error_msg = f"The path {path} is not a valid directory."
-        print(error_msg, file=sys.stderr)
-        updates["error_msg"] = error_msg
-        return None
+        raise NotValidDirectory(f"The path {path} is not a valid directory.")

     to_keep = ["pom.xml", "build.gradle"]
     for entry in os.scandir(path):
         if entry.is_file() and entry.name in to_keep:
             if verbose:
                 print(f"Found {entry.name} in {repo} root, so keeping it and returning")
-            updates["depth_of_build_file"] = 0
             if entry.name == "build.gradle":
-                updates["build_system"] = "gradle"
-                return GradleHandler(path, entry.name, updates)
+                return GradleHandler(path, entry.name)
             else:
-                updates["build_system"] = "maven"
-                return MavenHandler(path, entry.name, updates)
+                return MavenHandler(path, entry.name)

-    # List files in the immediate subdirectories
-    for entry in os.scandir(path):
-        if entry.is_dir():
-            for sub_entry in os.scandir(entry.path):
-                if sub_entry.is_file() and sub_entry.name in to_keep:
-                    if verbose:
-                        print(f"Found {sub_entry.name} in {repo} first level, so keeping it and returning")
-                    updates["depth_of_build_file"] = 1
-                    if entry.name == "build.gradle":
-                        updates["build_system"] = "gradle"
-                        return GradleHandler(path, os.path.join(entry.name, sub_entry.name), updates)
-                    else:
-                        updates["build_system"] = "maven"
-                        return MavenHandler(path, os.path.join(entry.name, sub_entry.name), updates)
-
-    updates["error_msg"] = "No build file found"
-    return None
+    raise CantFindBuildFile(f"Couldn't find any of {to_keep} build files in the directory '{path}'")
+    # # List files in the immediate subdirectories
+    # for entry in os.scandir(path):
+    #     if entry.is_dir():
+    #         for sub_entry in os.scandir(entry.path):
+    #             if sub_entry.is_file() and sub_entry.name in to_keep:
+    #                 if verbose:
+    #                     print(f"Found {sub_entry.name} in {repo} first level, so keeping it and returning")
+    #                 updates["depth_of_build_file"] = 1
+    #                 if entry.name == "build.gradle":
+    #                     updates["build_system"] = "gradle"
+    #                     return GradleHandler(path, os.path.join(entry.name, sub_entry.name), updates)
+    #                 else:
+    #                     updates["build_system"] = "maven"
+    #                     return MavenHandler(path, os.path.join(entry.name, sub_entry.name), updates)
+    # updates["error_msg"] = "No build file found"
+    # return None
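With this rewrite, get_build_handler no longer takes an updates dict or returns None on failure: it either returns a BuildHandler or raises NotValidDirectory / CantFindBuildFile. A hedged sketch of the new calling convention (the root directory and repository name below are illustrative placeholders):

from errors import CantFindBuildFile, NotValidDirectory
from handlers import get_build_handler

try:
    build_handler = get_build_handler("repos", "some-org/some-repo", verbose=True)
    print("Build system:", build_handler.get_type())
except (NotValidDirectory, CantFindBuildFile) as e:
    # Both inherit from SetupException, so the caller can log e.reason_for_failure uniformly.
    print("Setup failed:", e.reason_for_failure)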


@@ -1,6 +1,8 @@
 from collections import defaultdict
 import argparse, os, subprocess, docker
 from typing import Any, Callable
+from github.Commit import Commit
+from github.ContentFile import ContentFile
 from github.PullRequest import PullRequest
 from github.Repository import Repository
 import pandas as pd
@@ -8,7 +10,26 @@ from github import Github, GithubException
 from tqdm import tqdm
 from datetime import datetime
-from dataset import Dataset, DatasetEntry, FileData, Metadata
+from dataset import (
+    Comment,
+    Dataset,
+    Dataset_new,
+    DatasetEntry,
+    DatasetEntry_new,
+    FileData,
+    FileData_new,
+    Metadata,
+    Metadata_new,
+)
+from errors import (
+    CantCheckoutCommitError,
+    CantEnsureFullHistoryError,
+    CantFetchPRError,
+    MultipleFilesError,
+    NoDiffsAfterError,
+    NoDiffsBeforeError,
+    SetupException,
+)
 from handlers import HandlerException, get_build_handler
 from utils import has_only_1_comment, move_github_logging_to_file, clone, run_git_cmd
@@ -54,115 +75,216 @@ def reset_repo_to_latest_commit(repo_path: str) -> None:
     run_git_cmd(["reset", "--hard", current_branch], repo_path)

+
+def get_diffs_before(repo: Repository, pr: PullRequest) -> dict[str, str]:
+    comments = list(pr.get_review_comments())
+    comments.sort(key=lambda comment: comment.created_at)
+    first_comment = comments[0]
+    try:
+        return {
+            file.filename: file.patch
+            for file in repo.compare(pr.base.sha, first_comment.commit_id).files
+        }
+    except GithubException as e:
+        raise NoDiffsBeforeError(e)
+
+
+def get_diffs_after(repo: Repository, pr: PullRequest) -> dict[str, str]:
+    comments = list(pr.get_review_comments())
+    comments.sort(key=lambda comment: comment.created_at)
+
+    first_commit_after_comment = None
+    commits = list(pr.get_commits())
+    commits.sort(key=lambda commit: commit.commit.author.date)
+    for commit in commits:
+        if commit.commit.author.date > comments[0].created_at:
+            first_commit_after_comment = commit
+            break
+
+    assert first_commit_after_comment is not None, "No commit after the comment"
+
+    try:
+        return {
+            file.filename: file.patch
+            for file in repo.compare(first_commit_after_comment.sha, pr.base.sha).files
+        }
+    except GithubException as e:
+        raise NoDiffsAfterError(e)
+
+
+def checkout(repo_path: str, pr: PullRequest) -> None:
+    try:
+        ensure_full_history(repo_path)
+    except subprocess.CalledProcessError as e:
+        raise CantEnsureFullHistoryError(e.stderr)
+
+    try:
+        run_git_cmd(["checkout", pr.merge_commit_sha], repo_path)
+    except subprocess.CalledProcessError:
+        try:
+            run_git_cmd(["fetch", "origin", f"pull/{pr.number}/merge"], repo_path)
+        except subprocess.CalledProcessError as e:
+            raise CantFetchPRError(e.stderr)
+
+        try:
+            run_git_cmd(["checkout", pr.merge_commit_sha], repo_path)
+        except subprocess.CalledProcessError as e:
+            raise CantCheckoutCommitError(e.stderr)
+
+
+def try_read_file(fname: str) -> str:
+    if not os.path.exists(fname):
+        return ""  # file was removed after the PR
+    try:
+        with open(fname, "r", encoding="utf-8") as f:
+            return f.read()
+    except UnicodeDecodeError:
+        return "Binary file (from filesystem), to be ignored"
+    except IsADirectoryError:
+        return "File listed in PR is a directory (likely a submodule), to be ignored"
+
+
+def get_files(pr: PullRequest, repo: Repository, repo_path: str) -> dict[str, FileData_new]:
+    ret = {}
+    for file in pr.get_files():
+        try:
+            contents = repo.get_contents(file.filename, ref=pr.base.sha)
+            assert isinstance(
+                contents, ContentFile
+            ), f"Multiple files with the same name {file.filename} in base sha {pr.base.sha} ({contents})"
+            contents_before = contents.decoded_content.decode()
+        except AssertionError as e:
+            raise MultipleFilesError(e)
+        except UnicodeError as e:
+            contents_before = "Binary content (from API), to be ignored"
+        except Exception as e:
+            contents_before = ""  # file didn't exist before the PR
+
+        try:
+            contents = repo.get_contents(file.filename, ref=pr.merge_commit_sha)
+            assert isinstance(
+                contents, ContentFile
+            ), f"Multiple files with the same name {file.filename} in merge commit sha {pr.base.sha} ({contents})"
+            contents_after = contents.decoded_content.decode()
+        except AssertionError as e:
+            raise MultipleFilesError(e)
+        except UnicodeError as e:
+            contents_after = "Binary content (from API), to be ignored"
+        except Exception as e:
+            checkout(repo_path, pr)
+            contents_after = try_read_file(os.path.join(repo_path, file.filename))
+
+        ret[file.filename] = FileData_new(
+            is_code_related=file.filename.endswith('.java'),
+            coverage={},
+            content_before_pr=contents_before,
+            content_after_pr=contents_after,
+        )
+
+    return ret
+
+
+def get_comments(pr: PullRequest) -> list[Comment]:
+    ret = []
+    for comment in pr.get_review_comments():
+        comment = Comment(
+            body=comment.body,
+            file=comment.path,
+            from_=comment.start_line if comment.start_line else comment.line,
+            to=comment.line,
+        )
+        if comment.from_ is None or comment.to is None:
+            comment.to = comment.original_line
+            comment.from_ = comment.original_start_line
+        ret.append(comment)
+    return ret
+
+
 def process_pull(
     repo: Repository,
     pr: PullRequest,
-    dataset: Dataset,
+    dataset: Dataset_new,
     repos_dir: str,
-    cache: dict[str, dict[int, DatasetEntry]] = {},
+    cache: dict[str, dict[int, DatasetEntry_new]] = {},
 ):
     if pr.number in cache.get(repo.full_name, set()):
         dataset.entries.append(cache[repo.full_name][pr.number])
         return

-    commits = list(pr.get_commits())
-    if not commits:
-        return  # No commits, skip processing
-
-    first_commit = commits[0]
-    last_commit = commits[-1]
-
-    try:
-        diffs_before = {
-            file.filename: file.patch for file in repo.compare(pr.base.sha, first_commit.sha).files
-        }
-    except GithubException as e:
-        return
+    entry = DatasetEntry_new(
+        metadata=Metadata_new(
+            repo.full_name,
+            pr.number,
+            pr.title,
+            pr.body,
+            pr.merge_commit_sha,
+            reason_for_failure="Was still being processed",
+        ),
+        files={},
+        diffs_before={},
+        comments=[],
+        diffs_after={},
+    )
+    dataset.entries.append(entry)

     comments = list(pr.get_review_comments())
     assert len(comments) == 1
     comment = comments[0]
-    comment_text = comment.body
     commented_file_path = comment.path

-    try:
-        diffs_after = {
-            file.filename: file.patch
-            for file in repo.compare(first_commit.sha, last_commit.sha).files
-        }
-    except GithubException as e:
-        return
-
-    entry = DatasetEntry(
-        metadata=Metadata(
-            repo.full_name,
-            pr.number,
-            pr.merge_commit_sha,
-            {comment_text: commented_file_path},
-            reason_for_failure="Was still being processed",
-        ),
-        files={file.filename: FileData(file.filename) for file in pr.get_files()},
-        diffs_before=diffs_before,
-        comments=[comment_text],
-        diffs_after=diffs_after,
-    )
-    dataset.entries.append(entry)
-
     repo_path = os.path.join(repos_dir, repo.full_name)

-    updates = {}
-    if not clone(repo.full_name, repos_dir, updates):
-        entry.metadata.last_cmd_error_msg = updates["error_msg"]
-        entry.metadata.reason_for_failure = "Couldn't clone the repo successfully"
-        entry.metadata.successful = False
-
-    def _try_cmd(action: Callable[[], Any], reason_for_failure: str) -> bool:
-        """
-        Tries a command, and if it fails, it sets the metadata of the entry.
-        """
+    build_handler = None
+
+    setup_steps = [
+        (
+            "Getting diffs before the first commit...",
+            lambda: entry.diffs_before.update(get_diffs_before(repo, pr)),
+        ),
+        (
+            "Getting diffs after the first commit...",
+            lambda: entry.diffs_after.update(get_diffs_after(repo, pr)),
+        ),
+        ("Cloning the repo...", lambda: clone(repo.full_name, repos_dir)),
+        (
+            "Getting the files...",
+            lambda: entry.files.update(get_files(pr, repo, repo_path)),
+        ),
+        (
+            "Getting the comments...",
+            lambda: entry.comments.extend(get_comments(pr)),
+        ),
+        ("Checkout out merge commit...", lambda: checkout(repo_path, pr)),
+    ]
+    with tqdm(total=len(setup_steps), desc="Setting up PR", leave=False) as pbar:
+        for message, action in setup_steps:
+            pbar.set_postfix(
+                {
+                    "doing": message,
+                    "started at": datetime.now().strftime("%d/%m, %H:%M:%S"),
+                }
+            )
             try:
-            # return action()
                 action()
-        except subprocess.CalledProcessError as e:
-            entry.metadata.last_cmd_error_msg = f"{e.stderr}"
-            entry.metadata.reason_for_failure = reason_for_failure
+            except SetupException as e:
+                entry.metadata.last_cmd_error_msg = str(e)
+                entry.metadata.reason_for_failure = e.reason_for_failure
                 entry.metadata.successful = False
-            # raise e
-        return entry.metadata.successful
-
-    if not _try_cmd(
-        lambda: ensure_full_history(repo_path),
-        "Couldn't ensure the full history of the repo (fetch --unshallow)",
-    ):
                 return
+            pbar.update(1)

     try:
-        run_git_cmd(["checkout", pr.merge_commit_sha], repo_path)
-    except subprocess.CalledProcessError:
-        if not _try_cmd(
-            lambda: run_git_cmd(["fetch", "origin", f"pull/{pr.number}/merge"], repo_path),
-            "Couldn't fetch the PR's merge commit",
-        ):
-            return
-
-        if not _try_cmd(
-            lambda: run_git_cmd(["checkout", pr.merge_commit_sha], repo_path),
-            "Coudln't checkout the PR's merge commit (even after fetching the pull/<pr_number>/merge)",
-        ):
-            return
-
-    build_handler = get_build_handler(repos_dir, repo.full_name, updates)
-    if build_handler is None:
-        entry.metadata.last_cmd_error_msg = updates["error_msg"]
-        entry.metadata.reason_for_failure = "Couldn't get the build handler"
-        entry.metadata.successful = False
-        return
-
+        build_handler = get_build_handler(repos_dir, repo.full_name)
     entry.metadata.build_system = build_handler.get_type()
     build_handler.set_client(docker_client)
+    except SetupException as e:
+        entry.metadata.last_cmd_error_msg = str(e)
+        entry.metadata.reason_for_failure = e.reason_for_failure
+        entry.metadata.successful = False
+        return

     def _check_coverages():
         for coverage_file, coverage in build_handler.check_coverage(commented_file_path):
-            entry.metadata.commented_files_coverages[commented_file_path][coverage_file] = coverage
+            entry.files[commented_file_path].coverage[coverage_file] = coverage

     steps = [
         ("Checking for tests...", build_handler.check_for_tests),
@@ -197,9 +319,9 @@ def process_pull(
 def process_repo(
     repo_name: str,
-    dataset: Dataset,
+    dataset: Dataset_new,
     repos_dir: str,
-    cache: dict[str, dict[int, DatasetEntry]] = {},
+    cache: dict[str, dict[int, DatasetEntry_new]] = {},
 ):
     repo = g.get_repo(repo_name)
     if repo.full_name in cache:
@@ -224,9 +346,9 @@ def process_repo(
 def process_repos(
     df: pd.DataFrame,
-    dataset: Dataset,
+    dataset: Dataset_new,
     repos_dir: str,
-    cache: dict[str, dict[int, DatasetEntry]] = {},
+    cache: dict[str, dict[int, DatasetEntry_new]] = {},
 ):
     """
     Processes the repos in the given csv file, extracting the good ones and
@@ -254,9 +376,9 @@ def process_repos(
 def only_inject_jacoco(
-    dataset: Dataset,
+    dataset: Dataset_new,
     repos_dir: str,
-    cache: dict[str, dict[int, DatasetEntry]] = {},
+    cache: dict[str, dict[int, DatasetEntry_new]] = {},
 ):
     n_successfull_injections = 0
     n_tried_injections = 0
@@ -344,13 +466,13 @@ if __name__ == "__main__":
     if args.only_repo is not None:
         df = df.loc[df["name"] == args.only_repo]

-    cache: dict[str, dict[int, DatasetEntry]] = defaultdict(dict)
+    cache: dict[str, dict[int, DatasetEntry_new]] = defaultdict(dict)
     if args.cache is not None:
-        cache_dataset = Dataset.from_json(args.cache)
+        cache_dataset = Dataset_new.from_json(args.cache)
         for cache_entry in cache_dataset.entries:
             cache[cache_entry.metadata.repo][cache_entry.metadata.pr_number] = cache_entry

-    dataset = Dataset()
+    dataset = Dataset_new()
     try:
         if args.only_inject_jacoco:
             only_inject_jacoco(dataset, args.repos, cache)
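The rewritten process_pull drives its setup through a list of (message, thunk) pairs and a tqdm progress bar, so any step that raises a SetupException is recorded on the entry's metadata in one place. A condensed, self-contained sketch of that control flow, with dummy steps and a plain dict standing in for the entry's metadata (both are stand-ins, not the real objects from this commit):

from datetime import datetime
from tqdm import tqdm

class SetupException(Exception):
    reason_for_failure = "step failed"

def run_setup(steps, metadata):
    with tqdm(total=len(steps), desc="Setting up PR", leave=False) as pbar:
        for message, action in steps:
            pbar.set_postfix({"doing": message, "started at": datetime.now().strftime("%d/%m, %H:%M:%S")})
            try:
                action()
            except SetupException as e:
                # A failed step is recorded once, with its class-level reason, and processing stops.
                metadata["last_cmd_error_msg"] = str(e)
                metadata["reason_for_failure"] = e.reason_for_failure
                metadata["successful"] = False
                return
            pbar.update(1)

meta = {"successful": True}
run_setup([("Cloning the repo...", lambda: None), ("Getting the files...", lambda: None)], meta)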