From 669049b7a4d6c87eca3f52b3aad3842029852ccf Mon Sep 17 00:00:00 2001 From: Karma Riuk Date: Mon, 31 Mar 2025 14:17:36 +0200 Subject: [PATCH] now using only the new dataset version --- dataset.py | 2 +- errors.py | 45 +++++++ handlers.py | 85 +++++++------ pull_requests.py | 308 +++++++++++++++++++++++++++++++++-------------- 4 files changed, 309 insertions(+), 131 deletions(-) create mode 100644 errors.py diff --git a/dataset.py b/dataset.py index 9c4ad3e..2c99d6b 100644 --- a/dataset.py +++ b/dataset.py @@ -131,7 +131,7 @@ class Dataset_new: json.dump(self, f, default=lambda o: o.__dict__, indent=4) @staticmethod - def from_json(filename: str, keep_still_in_progress: bool = False) -> "Dataset": + def from_json(filename: str, keep_still_in_progress: bool = False) -> "Dataset_new": raise NotImplementedError("This method is not implemented yet") diff --git a/errors.py b/errors.py new file mode 100644 index 0000000..ee2db1d --- /dev/null +++ b/errors.py @@ -0,0 +1,45 @@ +from abc import ABC + + +class SetupException(Exception, ABC): +    reason_for_failure: str + + +class NoDiffsBeforeError(SetupException): +    reason_for_failure = "Couldn't get the diffs before the first commit" + + +class NoDiffsAfterError(SetupException): +    reason_for_failure = "Couldn't get the diffs after the last commit" + + +class CantCloneRepoError(SetupException): +    reason_for_failure = "Couldn't clone the repository" + + +class CantEnsureFullHistoryError(SetupException): +    reason_for_failure = "Couldn't ensure the full history of the repo (fetch --unshallow)" + + +class CantFetchPRError(SetupException): +    reason_for_failure = "Couldn't fetch the PR's merge commit" + + +class CantCheckoutCommitError(SetupException): +    reason_for_failure = ( +        "Couldn't checkout the PR's merge commit (even after fetching the pull//merge)" +    ) + + +class MultipleFilesError(SetupException): +    reason_for_failure = ( +        "When requesting the contents of a file, a list of ContentFile was returned" +    ) + + +class NotValidDirectory(SetupException): +    reason_for_failure = "The directory is not valid" + + +class CantFindBuildFile(SetupException): +    reason_for_failure = "Couldn't find the build file in the directory" diff --git a/handlers.py b/handlers.py index 74a9086..dbe4d99 100644 --- a/handlers.py +++ b/handlers.py @@ -1,10 +1,12 @@ from abc import ABC, abstractmethod -import os, re, docker, signal, sys, javalang +import os, re, docker, signal, javalang from bs4 import BeautifulSoup -from typing import Iterable, Optional, Tuple, Iterator +from typing import Iterable, Tuple, Iterator import xml.etree.ElementTree as ET from javalang.tree import PackageDeclaration +from errors import CantFindBuildFile, NotValidDirectory + REPORT_SIZE_THRESHOLD = 400 # less than 400 bytes (charcaters), we don't care about it @@ -77,7 +79,9 @@ class BuildHandler(ABC): raise FailedToCompileError(output) except TimeoutError: self.updates["compiled_successfully"] = False - self.updates["error_msg"] = "Compile process killed due to exceeding the 1-hour time limit" + self.updates[ + "error_msg" + ] = "Compile process killed due to exceeding the 1-hour time limit" finally: signal.alarm(0) # Cancel the alarm @@ -135,7 +139,9 @@ class BuildHandler(ABC): candidates = [] for coverage_report_path in self.get_jacoco_report_paths(): if not os.path.exists(coverage_report_path): - raise NoCoverageReportFound(f"Coverage report file '{coverage_report_path}' does not exist") + raise NoCoverageReportFound( + f"Coverage report file '{coverage_report_path}' does not exist" + )
fully_qualified_class = self._extract_fully_qualified_class(filename) candidates.append({"report_file": coverage_report_path, "fqc": fully_qualified_class}) @@ -220,7 +226,7 @@ class BuildHandler(ABC): class MavenHandler(BuildHandler): - def __init__(self, repo_path: str, build_file: str, updates: dict) -> None: + def __init__(self, repo_path: str, build_file: str, updates: dict = {}) -> None: super().__init__(repo_path, build_file, updates) self.base_cmd = "mvn -B -Dstyle.color=never -Dartifact.download.skip=true" # -B (Batch Mode): Runs Maven in non-interactive mode, reducing output and removing download progress bars. @@ -265,7 +271,9 @@ class MavenHandler(BuildHandler): self.updates["n_tests_failed"] += failures self.updates["n_tests_errors"] += errors self.updates["n_tests_skipped"] += skipped - self.updates["n_tests_passed"] += tests_run - (failures + errors) # Calculate passed tests + self.updates["n_tests_passed"] += tests_run - ( + failures + errors + ) # Calculate passed tests def get_jacoco_report_paths(self) -> Iterable[str]: found_at_least_one = False @@ -329,7 +337,7 @@ class MavenHandler(BuildHandler): class GradleHandler(BuildHandler): - def __init__(self, repo_path: str, build_file: str, updates: dict) -> None: + def __init__(self, repo_path: str, build_file: str, updates: dict = {}) -> None: super().__init__(repo_path, build_file, updates) self.base_cmd = "gradle --no-daemon --console=plain" @@ -360,7 +368,9 @@ class GradleHandler(BuildHandler): test_results_path = os.path.join(self.path, "build/reports/tests/test/index.html") if not os.path.exists(test_results_path): - raise NoTestResultsToExtractError("No test results found (prolly a repo with sub-projects)") + raise NoTestResultsToExtractError( + "No test results found (prolly a repo with sub-projects)" + ) # Load the HTML file with open(test_results_path, "r") as file: @@ -374,7 +384,9 @@ class GradleHandler(BuildHandler): # counter_div = test_div.find("div", class_="counter") counter_div = test_div.select_one("div.counter") if counter_div is None: - raise NoTestResultsToExtractError("No test results found (not div.counter for tests)") + raise NoTestResultsToExtractError( + "No test results found (not div.counter for tests)" + ) self.updates["n_tests"] = int(counter_div.text.strip()) @@ -386,12 +398,16 @@ class GradleHandler(BuildHandler): # counter_div = failures_div.find("div", class_="counter") counter_div = failures_div.select_one("div.counter") if counter_div is None: - raise NoTestResultsToExtractError("No test results found (not div.counter for failures)") + raise NoTestResultsToExtractError( + "No test results found (not div.counter for failures)" + ) self.updates["n_tests_failed"] = int(counter_div.text.strip()) # Calculate passed tests - self.updates["n_tests_passed"] = self.updates["n_tests"] - self.updates["n_tests_failed"] + self.updates["n_tests_passed"] = ( + self.updates["n_tests"] - self.updates["n_tests_failed"] + ) def get_jacoco_report_paths(self) -> Iterable[str]: found_at_least_one = False @@ -577,7 +593,7 @@ def get_coverage_for_file(xml_file: str, target_fully_qualified_class: str, base return -1 -def get_build_handler(root: str, repo: str, updates: dict, verbose: bool = False) -> Optional[BuildHandler]: +def get_build_handler(root: str, repo: str, verbose: bool = False) -> BuildHandler: """ Get the path to the build file of a repository. The build file is either a `pom.xml`, `build.gradle`, or `build.xml` file. 
@@ -592,38 +608,33 @@ def get_build_handler(root: str, repo: str, updates: dict, verbose: bool = False path = os.path.join(root, repo) # Check if the given path is a directory if not os.path.isdir(path): - error_msg = f"The path {path} is not a valid directory." - print(error_msg, file=sys.stderr) - updates["error_msg"] = error_msg - return None + raise NotValidDirectory(f"The path {path} is not a valid directory.") to_keep = ["pom.xml", "build.gradle"] for entry in os.scandir(path): if entry.is_file() and entry.name in to_keep: if verbose: print(f"Found {entry.name} in {repo} root, so keeping it and returning") - updates["depth_of_build_file"] = 0 if entry.name == "build.gradle": - updates["build_system"] = "gradle" - return GradleHandler(path, entry.name, updates) + return GradleHandler(path, entry.name) else: - updates["build_system"] = "maven" - return MavenHandler(path, entry.name, updates) + return MavenHandler(path, entry.name) - # List files in the immediate subdirectories - for entry in os.scandir(path): - if entry.is_dir(): - for sub_entry in os.scandir(entry.path): - if sub_entry.is_file() and sub_entry.name in to_keep: - if verbose: - print(f"Found {sub_entry.name} in {repo} first level, so keeping it and returning") - updates["depth_of_build_file"] = 1 - if entry.name == "build.gradle": - updates["build_system"] = "gradle" - return GradleHandler(path, os.path.join(entry.name, sub_entry.name), updates) - else: - updates["build_system"] = "maven" - return MavenHandler(path, os.path.join(entry.name, sub_entry.name), updates) + raise CantFindBuildFile(f"Couldn't find any of {to_keep} build files in the directory '{path}'") + # # List files in the immediate subdirectories + # for entry in os.scandir(path): + # if entry.is_dir(): + # for sub_entry in os.scandir(entry.path): + # if sub_entry.is_file() and sub_entry.name in to_keep: + # if verbose: + # print(f"Found {sub_entry.name} in {repo} first level, so keeping it and returning") + # updates["depth_of_build_file"] = 1 + # if entry.name == "build.gradle": + # updates["build_system"] = "gradle" + # return GradleHandler(path, os.path.join(entry.name, sub_entry.name), updates) + # else: + # updates["build_system"] = "maven" + # return MavenHandler(path, os.path.join(entry.name, sub_entry.name), updates) - updates["error_msg"] = "No build file found" - return None + # updates["error_msg"] = "No build file found" + # return None diff --git a/pull_requests.py b/pull_requests.py index bd6ef1c..a971901 100644 --- a/pull_requests.py +++ b/pull_requests.py @@ -1,6 +1,8 @@ from collections import defaultdict import argparse, os, subprocess, docker from typing import Any, Callable +from github.Commit import Commit +from github.ContentFile import ContentFile from github.PullRequest import PullRequest from github.Repository import Repository import pandas as pd @@ -8,7 +10,26 @@ from github import Github, GithubException from tqdm import tqdm from datetime import datetime -from dataset import Dataset, DatasetEntry, FileData, Metadata +from dataset import ( + Comment, + Dataset, + Dataset_new, + DatasetEntry, + DatasetEntry_new, + FileData, + FileData_new, + Metadata, + Metadata_new, +) +from errors import ( + CantCheckoutCommitError, + CantEnsureFullHistoryError, + CantFetchPRError, + MultipleFilesError, + NoDiffsAfterError, + NoDiffsBeforeError, + SetupException, +) from handlers import HandlerException, get_build_handler from utils import has_only_1_comment, move_github_logging_to_file, clone, run_git_cmd @@ -54,115 +75,216 @@ def 
reset_repo_to_latest_commit(repo_path: str) -> None: run_git_cmd(["reset", "--hard", current_branch], repo_path) +def get_diffs_before(repo: Repository, pr: PullRequest) -> dict[str, str]: + comments = list(pr.get_review_comments()) + comments.sort(key=lambda comment: comment.created_at) + first_comment = comments[0] + try: + return { + file.filename: file.patch + for file in repo.compare(pr.base.sha, first_comment.commit_id).files + } + except GithubException as e: + raise NoDiffsBeforeError(e) + + +def get_diffs_after(repo: Repository, pr: PullRequest) -> dict[str, str]: + comments = list(pr.get_review_comments()) + comments.sort(key=lambda comment: comment.created_at) + first_commit_after_comment = None + commits = list(pr.get_commits()) + commits.sort(key=lambda commit: commit.commit.author.date) + for commit in commits: + if commit.commit.author.date > comments[0].created_at: + first_commit_after_comment = commit + break + + assert first_commit_after_comment is not None, "No commit after the comment" + + try: + return { + file.filename: file.patch + for file in repo.compare(first_commit_after_comment.sha, pr.base.sha).files + } + except GithubException as e: + raise NoDiffsAfterError(e) + + +def checkout(repo_path: str, pr: PullRequest) -> None: + try: + ensure_full_history(repo_path) + except subprocess.CalledProcessError as e: + raise CantEnsureFullHistoryError(e.stderr) + + try: + run_git_cmd(["checkout", pr.merge_commit_sha], repo_path) + except subprocess.CalledProcessError: + try: + run_git_cmd(["fetch", "origin", f"pull/{pr.number}/merge"], repo_path) + except subprocess.CalledProcessError as e: + raise CantFetchPRError(e.stderr) + + try: + run_git_cmd(["checkout", pr.merge_commit_sha], repo_path) + except subprocess.CalledProcessError as e: + raise CantCheckoutCommitError(e.stderr) + + +def try_read_file(fname: str) -> str: + if not os.path.exists(fname): + return "" # file was removed after the PR + try: + with open(fname, "r", encoding="utf-8") as f: + return f.read() + except UnicodeDecodeError: + return "Binary file (from filesystem), to be ignored" + except IsADirectoryError: + return "File listed in PR is a directory (likely a submodule), to be ignored" + + +def get_files(pr: PullRequest, repo: Repository, repo_path: str) -> dict[str, FileData_new]: + ret = {} + for file in pr.get_files(): + try: + contents = repo.get_contents(file.filename, ref=pr.base.sha) + assert isinstance( + contents, ContentFile + ), f"Multiple files with the same name {file.filename} in base sha {pr.base.sha} ({contents})" + contents_before = contents.decoded_content.decode() + except AssertionError as e: + raise MultipleFilesError(e) + except UnicodeError as e: + contents_before = "Binary content (from API), to be ignored" + except Exception as e: + contents_before = "" # file didn't exist before the PR + + try: + contents = repo.get_contents(file.filename, ref=pr.merge_commit_sha) + assert isinstance( + contents, ContentFile + ), f"Multiple files with the same name {file.filename} in merge commit sha {pr.base.sha} ({contents})" + contents_after = contents.decoded_content.decode() + except AssertionError as e: + raise MultipleFilesError(e) + except UnicodeError as e: + contents_after = "Binary content (from API), to be ignored" + except Exception as e: + checkout(repo_path, pr) + contents_after = try_read_file(os.path.join(repo_path, file.filename)) + + ret[file.filename] = FileData_new( + is_code_related=file.filename.endswith('.java'), + coverage={}, + content_before_pr=contents_before, + 
content_after_pr=contents_after, + ) + + return ret + + +def get_comments(pr: PullRequest) -> list[Comment]: +    ret = [] +    for review_comment in pr.get_review_comments(): +        comment = Comment( +            body=review_comment.body, +            file=review_comment.path, +            from_=review_comment.start_line if review_comment.start_line else review_comment.line, +            to=review_comment.line, +        ) +        if comment.from_ is None or comment.to is None: +            comment.to = review_comment.original_line +            comment.from_ = review_comment.original_start_line +        ret.append(comment) +    return ret + + def process_pull( repo: Repository, pr: PullRequest, - dataset: Dataset, + dataset: Dataset_new, repos_dir: str, - cache: dict[str, dict[int, DatasetEntry]] = {}, + cache: dict[str, dict[int, DatasetEntry_new]] = {}, ): if pr.number in cache.get(repo.full_name, set()): dataset.entries.append(cache[repo.full_name][pr.number]) return - commits = list(pr.get_commits()) - if not commits: - return # No commits, skip processing - - first_commit = commits[0] - last_commit = commits[-1] - - try: - diffs_before = { - file.filename: file.patch for file in repo.compare(pr.base.sha, first_commit.sha).files - } - except GithubException as e: - return + entry = DatasetEntry_new( + metadata=Metadata_new( + repo.full_name, + pr.number, + pr.title, + pr.body, + pr.merge_commit_sha, + reason_for_failure="Was still being processed", + ), + files={}, + diffs_before={}, + comments=[], + diffs_after={}, + ) + dataset.entries.append(entry) comments = list(pr.get_review_comments()) assert len(comments) == 1 comment = comments[0] - comment_text = comment.body commented_file_path = comment.path - try: - diffs_after = { - file.filename: file.patch - for file in repo.compare(first_commit.sha, last_commit.sha).files - } - except GithubException as e: - return - - entry = DatasetEntry( - metadata=Metadata( - repo.full_name, - pr.number, - pr.merge_commit_sha, - {comment_text: commented_file_path}, - reason_for_failure="Was still being processed", - ), - files={file.filename: FileData(file.filename) for file in pr.get_files()}, - diffs_before=diffs_before, - comments=[comment_text], - diffs_after=diffs_after, - ) - dataset.entries.append(entry) repo_path = os.path.join(repos_dir, repo.full_name) - updates = {} - if not clone(repo.full_name, repos_dir, updates): - entry.metadata.last_cmd_error_msg = updates["error_msg"] - entry.metadata.reason_for_failure = "Couldn't clone the repo successfully" - entry.metadata.successful = False + build_handler = None - def _try_cmd(action: Callable[[], Any], reason_for_failure: str) -> bool: - """ - Tries a command, and if it fails, it sets the metadata of the entry.
- """ - try: - # return action() - action() - except subprocess.CalledProcessError as e: - entry.metadata.last_cmd_error_msg = f"{e.stderr}" - entry.metadata.reason_for_failure = reason_for_failure - entry.metadata.successful = False - # raise e - return entry.metadata.successful + setup_steps = [ + ( + "Getting diffs before the first commit...", + lambda: entry.diffs_before.update(get_diffs_before(repo, pr)), + ), + ( + "Getting diffs after the first commit...", + lambda: entry.diffs_after.update(get_diffs_after(repo, pr)), + ), + ("Cloning the repo...", lambda: clone(repo.full_name, repos_dir)), + ( + "Getting the files...", + lambda: entry.files.update(get_files(pr, repo, repo_path)), + ), + ( + "Getting the comments...", + lambda: entry.comments.extend(get_comments(pr)), + ), + ("Checkout out merge commit...", lambda: checkout(repo_path, pr)), + ] - if not _try_cmd( - lambda: ensure_full_history(repo_path), - "Couldn't ensure the full history of the repo (fetch --unshallow)", - ): - return + with tqdm(total=len(setup_steps), desc="Setting up PR", leave=False) as pbar: + for message, action in setup_steps: + pbar.set_postfix( + { + "doing": message, + "started at": datetime.now().strftime("%d/%m, %H:%M:%S"), + } + ) + try: + action() + except SetupException as e: + entry.metadata.last_cmd_error_msg = str(e) + entry.metadata.reason_for_failure = e.reason_for_failure + entry.metadata.successful = False + return + pbar.update(1) try: - run_git_cmd(["checkout", pr.merge_commit_sha], repo_path) - except subprocess.CalledProcessError: - if not _try_cmd( - lambda: run_git_cmd(["fetch", "origin", f"pull/{pr.number}/merge"], repo_path), - "Couldn't fetch the PR's merge commit", - ): - return - - if not _try_cmd( - lambda: run_git_cmd(["checkout", pr.merge_commit_sha], repo_path), - "Coudln't checkout the PR's merge commit (even after fetching the pull//merge)", - ): - return - - build_handler = get_build_handler(repos_dir, repo.full_name, updates) - if build_handler is None: - entry.metadata.last_cmd_error_msg = updates["error_msg"] - entry.metadata.reason_for_failure = "Couldn't get the build handler" + build_handler = get_build_handler(repos_dir, repo.full_name) + entry.metadata.build_system = build_handler.get_type() + build_handler.set_client(docker_client) + except SetupException as e: + entry.metadata.last_cmd_error_msg = str(e) + entry.metadata.reason_for_failure = e.reason_for_failure entry.metadata.successful = False return - entry.metadata.build_system = build_handler.get_type() - build_handler.set_client(docker_client) def _check_coverages(): for coverage_file, coverage in build_handler.check_coverage(commented_file_path): - entry.metadata.commented_files_coverages[commented_file_path][coverage_file] = coverage + entry.files[commented_file_path].coverage[coverage_file] = coverage steps = [ ("Checking for tests...", build_handler.check_for_tests), @@ -197,9 +319,9 @@ def process_pull( def process_repo( repo_name: str, - dataset: Dataset, + dataset: Dataset_new, repos_dir: str, - cache: dict[str, dict[int, DatasetEntry]] = {}, + cache: dict[str, dict[int, DatasetEntry_new]] = {}, ): repo = g.get_repo(repo_name) if repo.full_name in cache: @@ -224,9 +346,9 @@ def process_repo( def process_repos( df: pd.DataFrame, - dataset: Dataset, + dataset: Dataset_new, repos_dir: str, - cache: dict[str, dict[int, DatasetEntry]] = {}, + cache: dict[str, dict[int, DatasetEntry_new]] = {}, ): """ Processes the repos in the given csv file, extracting the good ones and @@ -254,9 +376,9 @@ def 
process_repos( def only_inject_jacoco( - dataset: Dataset, + dataset: Dataset_new, repos_dir: str, - cache: dict[str, dict[int, DatasetEntry]] = {}, + cache: dict[str, dict[int, DatasetEntry_new]] = {}, ): n_successfull_injections = 0 n_tried_injections = 0 @@ -344,13 +466,13 @@ if __name__ == "__main__": if args.only_repo is not None: df = df.loc[df["name"] == args.only_repo] - cache: dict[str, dict[int, DatasetEntry]] = defaultdict(dict) + cache: dict[str, dict[int, DatasetEntry_new]] = defaultdict(dict) if args.cache is not None: - cache_dataset = Dataset.from_json(args.cache) + cache_dataset = Dataset_new.from_json(args.cache) for cache_entry in cache_dataset.entries: cache[cache_entry.metadata.repo][cache_entry.metadata.pr_number] = cache_entry - dataset = Dataset() + dataset = Dataset_new() try: if args.only_inject_jacoco: only_inject_jacoco(dataset, args.repos, cache)
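For reference, a minimal usage sketch of the new setup-error handling (an editor's sketch under assumptions, not code from this patch): it assumes errors.py and handlers.py are importable as-is, and the repos directory and repository name below are placeholders. With this patch, get_build_handler() raises NotValidDirectory or CantFindBuildFile instead of returning None, and every setup failure is a SetupException subclass carrying a class-level reason_for_failure string, so a single except clause is enough to record why setup failed.

from errors import SetupException
from handlers import get_build_handler


def detect_build_system(repos_dir: str, repo_name: str) -> str:
    """Returns the detected build system, or the reason the setup failed."""
    try:
        # Raises NotValidDirectory / CantFindBuildFile (both SetupException) on failure.
        handler = get_build_handler(repos_dir, repo_name)
    except SetupException as e:
        return f"setup failed: {e.reason_for_failure}"
    # get_type() reports which build system the handler wraps (e.g. "maven" or "gradle").
    return handler.get_type()


if __name__ == "__main__":
    # Placeholder directory and repo name; point these at a locally cloned repository.
    print(detect_build_system("./repos", "some-org/some-repo"))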
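Similarly, the rewritten process_pull() drives its setup as a list of (progress message, thunk) pairs and stops at the first SetupException, copying its reason_for_failure onto the entry's metadata. The same pattern in isolation, as a hedged sketch (Step and run_steps are hypothetical names, not defined in the patch):

from typing import Callable

from errors import SetupException

Step = tuple[str, Callable[[], None]]


def run_steps(steps: list[Step]) -> tuple[bool, str]:
    # Runs each step in order and stops at the first SetupException,
    # mirroring how process_pull() fills metadata.reason_for_failure.
    for _message, action in steps:
        try:
            action()
        except SetupException as e:
            return False, e.reason_for_failure
    return True, ""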