From 669049b7a4d6c87eca3f52b3aad3842029852ccf Mon Sep 17 00:00:00 2001 From: Karma Riuk Date: Mon, 31 Mar 2025 14:17:36 +0200 Subject: [PATCH] now using only the new dataset version --- dataset.py | 2 +- errors.py | 45 +++++++ handlers.py | 85 +++++++------ pull_requests.py | 308 +++++++++++++++++++++++++++++++++-------------- 4 files changed, 309 insertions(+), 131 deletions(-) create mode 100644 errors.py diff --git a/dataset.py b/dataset.py index 9c4ad3e..2c99d6b 100644 --- a/dataset.py +++ b/dataset.py @@ -131,7 +131,7 @@ class Dataset_new: json.dump(self, f, default=lambda o: o.__dict__, indent=4) @staticmethod - def from_json(filename: str, keep_still_in_progress: bool = False) -> "Dataset": + def from_json(filename: str, keep_still_in_progress: bool = False) -> "Dataset_new": raise NotImplementedError("This method is not implemented yet") diff --git a/errors.py b/errors.py new file mode 100644 index 0000000..ee2db1d --- /dev/null +++ b/errors.py @@ -0,0 +1,45 @@ +from abc import ABC + + +class SetupException(Exception, ABC): +    reason_for_failure: str + + +class NoDiffsBeforeError(SetupException): +    reason_for_failure = "Couldn't get the diffs before the first commit" + + +class NoDiffsAfterError(SetupException): +    reason_for_failure = "Couldn't get the diffs after the last commit" + + +class CantCloneRepoError(SetupException): +    reason_for_failure = "Couldn't clone the repository" + + +class CantEnsureFullHistoryError(SetupException): +    reason_for_failure = "Couldn't ensure the full history of the repo (fetch --unshallow)" + + +class CantFetchPRError(SetupException): +    reason_for_failure = "Couldn't fetch the PR's merge commit" + + +class CantCheckoutCommitError(SetupException): +    reason_for_failure = ( +        "Couldn't checkout the PR's merge commit (even after fetching the pull//merge)" +    ) + + +class MultipleFilesError(SetupException): +    reason_for_failure = ( +        "When requesting the contents of a file, a list of ContentFile was returned" +    ) + + +class NotValidDirectory(SetupException): +    reason_for_failure = "The directory is not valid" + + +class CantFindBuildFile(SetupException): +    reason_for_failure = "Couldn't find the build file in the directory" diff --git a/handlers.py b/handlers.py index 74a9086..dbe4d99 100644 --- a/handlers.py +++ b/handlers.py @@ -1,10 +1,12 @@ from abc import ABC, abstractmethod -import os, re, docker, signal, sys, javalang +import os, re, docker, signal, javalang from bs4 import BeautifulSoup -from typing import Iterable, Optional, Tuple, Iterator +from typing import Iterable, Tuple, Iterator import xml.etree.ElementTree as ET from javalang.tree import PackageDeclaration +from errors import CantFindBuildFile, NotValidDirectory + REPORT_SIZE_THRESHOLD = 400 # less than 400 bytes (charcaters), we don't care about it @@ -77,7 +79,9 @@ class BuildHandler(ABC): raise FailedToCompileError(output) except TimeoutError: self.updates["compiled_successfully"] = False - self.updates["error_msg"] = "Compile process killed due to exceeding the 1-hour time limit" + self.updates[ + "error_msg" + ] = "Compile process killed due to exceeding the 1-hour time limit" finally: signal.alarm(0) # Cancel the alarm @@ -135,7 +139,9 @@ class BuildHandler(ABC): candidates = [] for coverage_report_path in self.get_jacoco_report_paths(): if not os.path.exists(coverage_report_path): - raise NoCoverageReportFound(f"Coverage report file '{coverage_report_path}' does not exist") + raise NoCoverageReportFound( + f"Coverage report file '{coverage_report_path}' does not exist" + )
fully_qualified_class = self._extract_fully_qualified_class(filename) candidates.append({"report_file": coverage_report_path, "fqc": fully_qualified_class}) @@ -220,7 +226,7 @@ class BuildHandler(ABC): class MavenHandler(BuildHandler): - def __init__(self, repo_path: str, build_file: str, updates: dict) -> None: + def __init__(self, repo_path: str, build_file: str, updates: dict = {}) -> None: super().__init__(repo_path, build_file, updates) self.base_cmd = "mvn -B -Dstyle.color=never -Dartifact.download.skip=true" # -B (Batch Mode): Runs Maven in non-interactive mode, reducing output and removing download progress bars. @@ -265,7 +271,9 @@ class MavenHandler(BuildHandler): self.updates["n_tests_failed"] += failures self.updates["n_tests_errors"] += errors self.updates["n_tests_skipped"] += skipped - self.updates["n_tests_passed"] += tests_run - (failures + errors) # Calculate passed tests + self.updates["n_tests_passed"] += tests_run - ( + failures + errors + ) # Calculate passed tests def get_jacoco_report_paths(self) -> Iterable[str]: found_at_least_one = False @@ -329,7 +337,7 @@ class MavenHandler(BuildHandler): class GradleHandler(BuildHandler): - def __init__(self, repo_path: str, build_file: str, updates: dict) -> None: + def __init__(self, repo_path: str, build_file: str, updates: dict = {}) -> None: super().__init__(repo_path, build_file, updates) self.base_cmd = "gradle --no-daemon --console=plain" @@ -360,7 +368,9 @@ class GradleHandler(BuildHandler): test_results_path = os.path.join(self.path, "build/reports/tests/test/index.html") if not os.path.exists(test_results_path): - raise NoTestResultsToExtractError("No test results found (prolly a repo with sub-projects)") + raise NoTestResultsToExtractError( + "No test results found (prolly a repo with sub-projects)" + ) # Load the HTML file with open(test_results_path, "r") as file: @@ -374,7 +384,9 @@ class GradleHandler(BuildHandler): # counter_div = test_div.find("div", class_="counter") counter_div = test_div.select_one("div.counter") if counter_div is None: - raise NoTestResultsToExtractError("No test results found (not div.counter for tests)") + raise NoTestResultsToExtractError( + "No test results found (not div.counter for tests)" + ) self.updates["n_tests"] = int(counter_div.text.strip()) @@ -386,12 +398,16 @@ class GradleHandler(BuildHandler): # counter_div = failures_div.find("div", class_="counter") counter_div = failures_div.select_one("div.counter") if counter_div is None: - raise NoTestResultsToExtractError("No test results found (not div.counter for failures)") + raise NoTestResultsToExtractError( + "No test results found (not div.counter for failures)" + ) self.updates["n_tests_failed"] = int(counter_div.text.strip()) # Calculate passed tests - self.updates["n_tests_passed"] = self.updates["n_tests"] - self.updates["n_tests_failed"] + self.updates["n_tests_passed"] = ( + self.updates["n_tests"] - self.updates["n_tests_failed"] + ) def get_jacoco_report_paths(self) -> Iterable[str]: found_at_least_one = False @@ -577,7 +593,7 @@ def get_coverage_for_file(xml_file: str, target_fully_qualified_class: str, base return -1 -def get_build_handler(root: str, repo: str, updates: dict, verbose: bool = False) -> Optional[BuildHandler]: +def get_build_handler(root: str, repo: str, verbose: bool = False) -> BuildHandler: """ Get the path to the build file of a repository. The build file is either a `pom.xml`, `build.gradle`, or `build.xml` file. 
@@ -592,38 +608,33 @@ def get_build_handler(root: str, repo: str, updates: dict, verbose: bool = False path = os.path.join(root, repo) # Check if the given path is a directory if not os.path.isdir(path): - error_msg = f"The path {path} is not a valid directory." - print(error_msg, file=sys.stderr) - updates["error_msg"] = error_msg - return None + raise NotValidDirectory(f"The path {path} is not a valid directory.") to_keep = ["pom.xml", "build.gradle"] for entry in os.scandir(path): if entry.is_file() and entry.name in to_keep: if verbose: print(f"Found {entry.name} in {repo} root, so keeping it and returning") - updates["depth_of_build_file"] = 0 if entry.name == "build.gradle": - updates["build_system"] = "gradle" - return GradleHandler(path, entry.name, updates) + return GradleHandler(path, entry.name) else: - updates["build_system"] = "maven" - return MavenHandler(path, entry.name, updates) + return MavenHandler(path, entry.name) - # List files in the immediate subdirectories - for entry in os.scandir(path): - if entry.is_dir(): - for sub_entry in os.scandir(entry.path): - if sub_entry.is_file() and sub_entry.name in to_keep: - if verbose: - print(f"Found {sub_entry.name} in {repo} first level, so keeping it and returning") - updates["depth_of_build_file"] = 1 - if entry.name == "build.gradle": - updates["build_system"] = "gradle" - return GradleHandler(path, os.path.join(entry.name, sub_entry.name), updates) - else: - updates["build_system"] = "maven" - return MavenHandler(path, os.path.join(entry.name, sub_entry.name), updates) + raise CantFindBuildFile(f"Couldn't find any of {to_keep} build files in the directory '{path}'") + # # List files in the immediate subdirectories + # for entry in os.scandir(path): + # if entry.is_dir(): + # for sub_entry in os.scandir(entry.path): + # if sub_entry.is_file() and sub_entry.name in to_keep: + # if verbose: + # print(f"Found {sub_entry.name} in {repo} first level, so keeping it and returning") + # updates["depth_of_build_file"] = 1 + # if entry.name == "build.gradle": + # updates["build_system"] = "gradle" + # return GradleHandler(path, os.path.join(entry.name, sub_entry.name), updates) + # else: + # updates["build_system"] = "maven" + # return MavenHandler(path, os.path.join(entry.name, sub_entry.name), updates) - updates["error_msg"] = "No build file found" - return None + # updates["error_msg"] = "No build file found" + # return None diff --git a/pull_requests.py b/pull_requests.py index bd6ef1c..a971901 100644 --- a/pull_requests.py +++ b/pull_requests.py @@ -1,6 +1,8 @@ from collections import defaultdict import argparse, os, subprocess, docker from typing import Any, Callable +from github.Commit import Commit +from github.ContentFile import ContentFile from github.PullRequest import PullRequest from github.Repository import Repository import pandas as pd @@ -8,7 +10,26 @@ from github import Github, GithubException from tqdm import tqdm from datetime import datetime -from dataset import Dataset, DatasetEntry, FileData, Metadata +from dataset import ( + Comment, + Dataset, + Dataset_new, + DatasetEntry, + DatasetEntry_new, + FileData, + FileData_new, + Metadata, + Metadata_new, +) +from errors import ( + CantCheckoutCommitError, + CantEnsureFullHistoryError, + CantFetchPRError, + MultipleFilesError, + NoDiffsAfterError, + NoDiffsBeforeError, + SetupException, +) from handlers import HandlerException, get_build_handler from utils import has_only_1_comment, move_github_logging_to_file, clone, run_git_cmd @@ -54,115 +75,216 @@ def 
reset_repo_to_latest_commit(repo_path: str) -> None: run_git_cmd(["reset", "--hard", current_branch], repo_path) +def get_diffs_before(repo: Repository, pr: PullRequest) -> dict[str, str]: + comments = list(pr.get_review_comments()) + comments.sort(key=lambda comment: comment.created_at) + first_comment = comments[0] + try: + return { + file.filename: file.patch + for file in repo.compare(pr.base.sha, first_comment.commit_id).files + } + except GithubException as e: + raise NoDiffsBeforeError(e) + + +def get_diffs_after(repo: Repository, pr: PullRequest) -> dict[str, str]: + comments = list(pr.get_review_comments()) + comments.sort(key=lambda comment: comment.created_at) + first_commit_after_comment = None + commits = list(pr.get_commits()) + commits.sort(key=lambda commit: commit.commit.author.date) + for commit in commits: + if commit.commit.author.date > comments[0].created_at: + first_commit_after_comment = commit + break + + assert first_commit_after_comment is not None, "No commit after the comment" + + try: + return { + file.filename: file.patch + for file in repo.compare(first_commit_after_comment.sha, pr.base.sha).files + } + except GithubException as e: + raise NoDiffsAfterError(e) + + +def checkout(repo_path: str, pr: PullRequest) -> None: + try: + ensure_full_history(repo_path) + except subprocess.CalledProcessError as e: + raise CantEnsureFullHistoryError(e.stderr) + + try: + run_git_cmd(["checkout", pr.merge_commit_sha], repo_path) + except subprocess.CalledProcessError: + try: + run_git_cmd(["fetch", "origin", f"pull/{pr.number}/merge"], repo_path) + except subprocess.CalledProcessError as e: + raise CantFetchPRError(e.stderr) + + try: + run_git_cmd(["checkout", pr.merge_commit_sha], repo_path) + except subprocess.CalledProcessError as e: + raise CantCheckoutCommitError(e.stderr) + + +def try_read_file(fname: str) -> str: + if not os.path.exists(fname): + return "" # file was removed after the PR + try: + with open(fname, "r", encoding="utf-8") as f: + return f.read() + except UnicodeDecodeError: + return "Binary file (from filesystem), to be ignored" + except IsADirectoryError: + return "File listed in PR is a directory (likely a submodule), to be ignored" + + +def get_files(pr: PullRequest, repo: Repository, repo_path: str) -> dict[str, FileData_new]: + ret = {} + for file in pr.get_files(): + try: + contents = repo.get_contents(file.filename, ref=pr.base.sha) + assert isinstance( + contents, ContentFile + ), f"Multiple files with the same name {file.filename} in base sha {pr.base.sha} ({contents})" + contents_before = contents.decoded_content.decode() + except AssertionError as e: + raise MultipleFilesError(e) + except UnicodeError as e: + contents_before = "Binary content (from API), to be ignored" + except Exception as e: + contents_before = "" # file didn't exist before the PR + + try: + contents = repo.get_contents(file.filename, ref=pr.merge_commit_sha) + assert isinstance( + contents, ContentFile + ), f"Multiple files with the same name {file.filename} in merge commit sha {pr.base.sha} ({contents})" + contents_after = contents.decoded_content.decode() + except AssertionError as e: + raise MultipleFilesError(e) + except UnicodeError as e: + contents_after = "Binary content (from API), to be ignored" + except Exception as e: + checkout(repo_path, pr) + contents_after = try_read_file(os.path.join(repo_path, file.filename)) + + ret[file.filename] = FileData_new( + is_code_related=file.filename.endswith('.java'), + coverage={}, + content_before_pr=contents_before, + 
content_after_pr=contents_after, + ) + + return ret + + +def get_comments(pr: PullRequest) -> list[Comment]: +    ret = [] +    for review_comment in pr.get_review_comments(): +        comment = Comment( +            body=review_comment.body, +            file=review_comment.path, +            from_=review_comment.start_line if review_comment.start_line else review_comment.line, +            to=review_comment.line, +        ) +        if comment.from_ is None or comment.to is None: +            comment.to = review_comment.original_line +            comment.from_ = review_comment.original_start_line +        ret.append(comment) +    return ret + + def process_pull( repo: Repository, pr: PullRequest, - dataset: Dataset, + dataset: Dataset_new, repos_dir: str, - cache: dict[str, dict[int, DatasetEntry]] = {}, + cache: dict[str, dict[int, DatasetEntry_new]] = {}, ): if pr.number in cache.get(repo.full_name, set()): dataset.entries.append(cache[repo.full_name][pr.number]) return - commits = list(pr.get_commits()) - if not commits: - return # No commits, skip processing - - first_commit = commits[0] - last_commit = commits[-1] - - try: - diffs_before = { - file.filename: file.patch for file in repo.compare(pr.base.sha, first_commit.sha).files - } - except GithubException as e: - return + entry = DatasetEntry_new( + metadata=Metadata_new( + repo.full_name, + pr.number, + pr.title, + pr.body, + pr.merge_commit_sha, + reason_for_failure="Was still being processed", + ), + files={}, + diffs_before={}, + comments=[], + diffs_after={}, + ) + dataset.entries.append(entry) comments = list(pr.get_review_comments()) assert len(comments) == 1 comment = comments[0] - comment_text = comment.body commented_file_path = comment.path - try: - diffs_after = { - file.filename: file.patch - for file in repo.compare(first_commit.sha, last_commit.sha).files - } - except GithubException as e: - return - - entry = DatasetEntry( - metadata=Metadata( - repo.full_name, - pr.number, - pr.merge_commit_sha, - {comment_text: commented_file_path}, - reason_for_failure="Was still being processed", - ), - files={file.filename: FileData(file.filename) for file in pr.get_files()}, - diffs_before=diffs_before, - comments=[comment_text], - diffs_after=diffs_after, - ) - dataset.entries.append(entry) repo_path = os.path.join(repos_dir, repo.full_name) - updates = {} - if not clone(repo.full_name, repos_dir, updates): - entry.metadata.last_cmd_error_msg = updates["error_msg"] - entry.metadata.reason_for_failure = "Couldn't clone the repo successfully" - entry.metadata.successful = False + build_handler = None - def _try_cmd(action: Callable[[], Any], reason_for_failure: str) -> bool: - """ - Tries a command, and if it fails, it sets the metadata of the entry.
- """ - try: - # return action() - action() - except subprocess.CalledProcessError as e: - entry.metadata.last_cmd_error_msg = f"{e.stderr}" - entry.metadata.reason_for_failure = reason_for_failure - entry.metadata.successful = False - # raise e - return entry.metadata.successful + setup_steps = [ + ( + "Getting diffs before the first commit...", + lambda: entry.diffs_before.update(get_diffs_before(repo, pr)), + ), + ( + "Getting diffs after the first commit...", + lambda: entry.diffs_after.update(get_diffs_after(repo, pr)), + ), + ("Cloning the repo...", lambda: clone(repo.full_name, repos_dir)), + ( + "Getting the files...", + lambda: entry.files.update(get_files(pr, repo, repo_path)), + ), + ( + "Getting the comments...", + lambda: entry.comments.extend(get_comments(pr)), + ), + ("Checkout out merge commit...", lambda: checkout(repo_path, pr)), + ] - if not _try_cmd( - lambda: ensure_full_history(repo_path), - "Couldn't ensure the full history of the repo (fetch --unshallow)", - ): - return + with tqdm(total=len(setup_steps), desc="Setting up PR", leave=False) as pbar: + for message, action in setup_steps: + pbar.set_postfix( + { + "doing": message, + "started at": datetime.now().strftime("%d/%m, %H:%M:%S"), + } + ) + try: + action() + except SetupException as e: + entry.metadata.last_cmd_error_msg = str(e) + entry.metadata.reason_for_failure = e.reason_for_failure + entry.metadata.successful = False + return + pbar.update(1) try: - run_git_cmd(["checkout", pr.merge_commit_sha], repo_path) - except subprocess.CalledProcessError: - if not _try_cmd( - lambda: run_git_cmd(["fetch", "origin", f"pull/{pr.number}/merge"], repo_path), - "Couldn't fetch the PR's merge commit", - ): - return - - if not _try_cmd( - lambda: run_git_cmd(["checkout", pr.merge_commit_sha], repo_path), - "Coudln't checkout the PR's merge commit (even after fetching the pull//merge)", - ): - return - - build_handler = get_build_handler(repos_dir, repo.full_name, updates) - if build_handler is None: - entry.metadata.last_cmd_error_msg = updates["error_msg"] - entry.metadata.reason_for_failure = "Couldn't get the build handler" + build_handler = get_build_handler(repos_dir, repo.full_name) + entry.metadata.build_system = build_handler.get_type() + build_handler.set_client(docker_client) + except SetupException as e: + entry.metadata.last_cmd_error_msg = str(e) + entry.metadata.reason_for_failure = e.reason_for_failure entry.metadata.successful = False return - entry.metadata.build_system = build_handler.get_type() - build_handler.set_client(docker_client) def _check_coverages(): for coverage_file, coverage in build_handler.check_coverage(commented_file_path): - entry.metadata.commented_files_coverages[commented_file_path][coverage_file] = coverage + entry.files[commented_file_path].coverage[coverage_file] = coverage steps = [ ("Checking for tests...", build_handler.check_for_tests), @@ -197,9 +319,9 @@ def process_pull( def process_repo( repo_name: str, - dataset: Dataset, + dataset: Dataset_new, repos_dir: str, - cache: dict[str, dict[int, DatasetEntry]] = {}, + cache: dict[str, dict[int, DatasetEntry_new]] = {}, ): repo = g.get_repo(repo_name) if repo.full_name in cache: @@ -224,9 +346,9 @@ def process_repo( def process_repos( df: pd.DataFrame, - dataset: Dataset, + dataset: Dataset_new, repos_dir: str, - cache: dict[str, dict[int, DatasetEntry]] = {}, + cache: dict[str, dict[int, DatasetEntry_new]] = {}, ): """ Processes the repos in the given csv file, extracting the good ones and @@ -254,9 +376,9 @@ def 
process_repos( def only_inject_jacoco( - dataset: Dataset, + dataset: Dataset_new, repos_dir: str, - cache: dict[str, dict[int, DatasetEntry]] = {}, + cache: dict[str, dict[int, DatasetEntry_new]] = {}, ): n_successfull_injections = 0 n_tried_injections = 0 @@ -344,13 +466,13 @@ if __name__ == "__main__": if args.only_repo is not None: df = df.loc[df["name"] == args.only_repo] - cache: dict[str, dict[int, DatasetEntry]] = defaultdict(dict) + cache: dict[str, dict[int, DatasetEntry_new]] = defaultdict(dict) if args.cache is not None: - cache_dataset = Dataset.from_json(args.cache) + cache_dataset = Dataset_new.from_json(args.cache) for cache_entry in cache_dataset.entries: cache[cache_entry.metadata.repo][cache_entry.metadata.pr_number] = cache_entry - dataset = Dataset() + dataset = Dataset_new() try: if args.only_inject_jacoco: only_inject_jacoco(dataset, args.repos, cache)
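For reference, a minimal usage sketch of the new setup-error handling (an editor's sketch under assumptions, not code from this patch): it assumes errors.py and handlers.py are importable as-is, and the repos directory and repository name below are placeholders. With this patch, get_build_handler() raises NotValidDirectory or CantFindBuildFile instead of returning None, and every setup failure is a SetupException subclass carrying a class-level reason_for_failure string, so a single except clause is enough to record why setup failed.

from errors import SetupException
from handlers import get_build_handler


def detect_build_system(repos_dir: str, repo_name: str) -> str:
    """Returns the detected build system, or the reason the setup failed."""
    try:
        # Raises NotValidDirectory / CantFindBuildFile (both SetupException) on failure.
        handler = get_build_handler(repos_dir, repo_name)
    except SetupException as e:
        return f"setup failed: {e.reason_for_failure}"
    # get_type() reports which build system the handler wraps (e.g. "maven" or "gradle").
    return handler.get_type()


if __name__ == "__main__":
    # Placeholder directory and repo name; point these at a locally cloned repository.
    print(detect_build_system("./repos", "some-org/some-repo"))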
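Similarly, the rewritten process_pull() drives its setup as a list of (progress message, thunk) pairs and stops at the first SetupException, copying its reason_for_failure onto the entry's metadata. The same pattern in isolation, as a hedged sketch (Step and run_steps are hypothetical names, not defined in the patch):

from typing import Callable

from errors import SetupException

Step = tuple[str, Callable[[], None]]


def run_steps(steps: list[Step]) -> tuple[bool, str]:
    # Runs each step in order and stops at the first SetupException,
    # mirroring how process_pull() fills metadata.reason_for_failure.
    for _message, action in steps:
        try:
            action()
        except SetupException as e:
            return False, e.reason_for_failure
    return True, ""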