now using only the new dataset version

Commit 669049b7a4 (parent 35bd296c7c)
Author: Karma Riuk
Date: 2025-03-31 14:17:36 +02:00
4 changed files with 309 additions and 131 deletions


@@ -131,7 +131,7 @@ class Dataset_new:
         json.dump(self, f, default=lambda o: o.__dict__, indent=4)

     @staticmethod
-    def from_json(filename: str, keep_still_in_progress: bool = False) -> "Dataset":
+    def from_json(filename: str, keep_still_in_progress: bool = False) -> "Dataset_new":
         raise NotImplementedError("This method is not implemented yet")

errors.py (new file, 45 lines)

@@ -0,0 +1,45 @@
from abc import ABC


class SetupException(Exception, ABC):
    reason_for_failure: str


class NoDiffsBeforeError(SetupException):
    reason_for_failure = "Couldn't get the diffs before the first commit"


class NoDiffsAfterError(SetupException):
    reason_for_failure = "Couldn't get the diffs after the last commit"


class CantCloneRepoError(SetupException):
    reason_for_failure = "Couldn't clone the repository"


class CantEnsureFullHistoryError(SetupException):
    reason_for_failure = "Couldn't ensure the full history of the repo (fetch --unshallow)"


class CantFetchPRError(SetupException):
    reason_for_failure = "Couldn't fetch the PR's merge commit"


class CantCheckoutCommitError(SetupException):
    reason_for_failure = (
        "Couldn't checkout the PR's merge commit (even after fetching the pull/<pr_number>/merge)"
    )


class MultipleFilesError(SetupException):
    reason_for_failure = (
        "When requesting the contents of a file, a list of ContentFile was returned"
    )


class NotValidDirectory(SetupException):
    reason_for_failure = "The directory is not valid"


class CantFindBuildFile(SetupException):
    reason_for_failure = "Couldn't find the build file in the directory"

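Every concrete subclass overrides the class-level reason_for_failure string, so code that catches the base SetupException can record a stable, human-readable reason without branching on the exception type, while str(e) keeps the underlying details. A minimal sketch of that idea, using only the classes defined above (the path in the example is an illustrative placeholder):

from errors import CantFindBuildFile, SetupException

def describe(exc: SetupException) -> str:
    # The short reason comes from the class attribute; the message carries the specifics.
    return f"{exc.reason_for_failure}: {exc}"

try:
    raise CantFindBuildFile("no pom.xml or build.gradle under /tmp/example-repo")
except SetupException as e:
    print(describe(e))  # Couldn't find the build file in the directory: no pom.xml or build.gradle under /tmp/example-repo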

@@ -1,10 +1,12 @@
 from abc import ABC, abstractmethod
-import os, re, docker, signal, sys, javalang
+import os, re, docker, signal, javalang
 from bs4 import BeautifulSoup
-from typing import Iterable, Optional, Tuple, Iterator
+from typing import Iterable, Tuple, Iterator
 import xml.etree.ElementTree as ET
 from javalang.tree import PackageDeclaration
+from errors import CantFindBuildFile, NotValidDirectory

 REPORT_SIZE_THRESHOLD = 400  # less than 400 bytes (characters), we don't care about it
@@ -77,7 +79,9 @@ class BuildHandler(ABC):
                 raise FailedToCompileError(output)
         except TimeoutError:
             self.updates["compiled_successfully"] = False
-            self.updates["error_msg"] = "Compile process killed due to exceeding the 1-hour time limit"
+            self.updates[
+                "error_msg"
+            ] = "Compile process killed due to exceeding the 1-hour time limit"
         finally:
             signal.alarm(0)  # Cancel the alarm
@@ -135,7 +139,9 @@ class BuildHandler(ABC):
         candidates = []
         for coverage_report_path in self.get_jacoco_report_paths():
             if not os.path.exists(coverage_report_path):
-                raise NoCoverageReportFound(f"Coverage report file '{coverage_report_path}' does not exist")
+                raise NoCoverageReportFound(
+                    f"Coverage report file '{coverage_report_path}' does not exist"
+                )
             fully_qualified_class = self._extract_fully_qualified_class(filename)
             candidates.append({"report_file": coverage_report_path, "fqc": fully_qualified_class})
@@ -220,7 +226,7 @@ class BuildHandler(ABC):
 class MavenHandler(BuildHandler):
-    def __init__(self, repo_path: str, build_file: str, updates: dict) -> None:
+    def __init__(self, repo_path: str, build_file: str, updates: dict = {}) -> None:
         super().__init__(repo_path, build_file, updates)
         self.base_cmd = "mvn -B -Dstyle.color=never -Dartifact.download.skip=true"
         # -B (Batch Mode): Runs Maven in non-interactive mode, reducing output and removing download progress bars.
@@ -265,7 +271,9 @@ class MavenHandler(BuildHandler):
             self.updates["n_tests_failed"] += failures
             self.updates["n_tests_errors"] += errors
             self.updates["n_tests_skipped"] += skipped
-            self.updates["n_tests_passed"] += tests_run - (failures + errors)  # Calculate passed tests
+            self.updates["n_tests_passed"] += tests_run - (
+                failures + errors
+            )  # Calculate passed tests

     def get_jacoco_report_paths(self) -> Iterable[str]:
         found_at_least_one = False
@@ -329,7 +337,7 @@ class MavenHandler(BuildHandler):
 class GradleHandler(BuildHandler):
-    def __init__(self, repo_path: str, build_file: str, updates: dict) -> None:
+    def __init__(self, repo_path: str, build_file: str, updates: dict = {}) -> None:
         super().__init__(repo_path, build_file, updates)
         self.base_cmd = "gradle --no-daemon --console=plain"
@@ -360,7 +368,9 @@ class GradleHandler(BuildHandler):
         test_results_path = os.path.join(self.path, "build/reports/tests/test/index.html")
         if not os.path.exists(test_results_path):
-            raise NoTestResultsToExtractError("No test results found (prolly a repo with sub-projects)")
+            raise NoTestResultsToExtractError(
+                "No test results found (prolly a repo with sub-projects)"
+            )

         # Load the HTML file
         with open(test_results_path, "r") as file:
@@ -374,7 +384,9 @@ class GradleHandler(BuildHandler):
             # counter_div = test_div.find("div", class_="counter")
             counter_div = test_div.select_one("div.counter")
             if counter_div is None:
-                raise NoTestResultsToExtractError("No test results found (not div.counter for tests)")
+                raise NoTestResultsToExtractError(
+                    "No test results found (not div.counter for tests)"
+                )

             self.updates["n_tests"] = int(counter_div.text.strip())
@@ -386,12 +398,16 @@ class GradleHandler(BuildHandler):
             # counter_div = failures_div.find("div", class_="counter")
             counter_div = failures_div.select_one("div.counter")
             if counter_div is None:
-                raise NoTestResultsToExtractError("No test results found (not div.counter for failures)")
+                raise NoTestResultsToExtractError(
+                    "No test results found (not div.counter for failures)"
+                )

             self.updates["n_tests_failed"] = int(counter_div.text.strip())

             # Calculate passed tests
-            self.updates["n_tests_passed"] = self.updates["n_tests"] - self.updates["n_tests_failed"]
+            self.updates["n_tests_passed"] = (
+                self.updates["n_tests"] - self.updates["n_tests_failed"]
+            )

     def get_jacoco_report_paths(self) -> Iterable[str]:
         found_at_least_one = False
@@ -577,7 +593,7 @@ def get_coverage_for_file(xml_file: str, target_fully_qualified_class: str, base
         return -1

-def get_build_handler(root: str, repo: str, updates: dict, verbose: bool = False) -> Optional[BuildHandler]:
+def get_build_handler(root: str, repo: str, verbose: bool = False) -> BuildHandler:
     """
     Get the path to the build file of a repository. The build file is either a
     `pom.xml`, `build.gradle`, or `build.xml` file.
@@ -592,38 +608,33 @@ def get_build_handler(root: str, repo: str, updates: dict, verbose: bool = False
     path = os.path.join(root, repo)
     # Check if the given path is a directory
     if not os.path.isdir(path):
-        error_msg = f"The path {path} is not a valid directory."
-        print(error_msg, file=sys.stderr)
-        updates["error_msg"] = error_msg
-        return None
+        raise NotValidDirectory(f"The path {path} is not a valid directory.")

     to_keep = ["pom.xml", "build.gradle"]
     for entry in os.scandir(path):
         if entry.is_file() and entry.name in to_keep:
             if verbose:
                 print(f"Found {entry.name} in {repo} root, so keeping it and returning")
-            updates["depth_of_build_file"] = 0
             if entry.name == "build.gradle":
-                updates["build_system"] = "gradle"
-                return GradleHandler(path, entry.name, updates)
+                return GradleHandler(path, entry.name)
             else:
-                updates["build_system"] = "maven"
-                return MavenHandler(path, entry.name, updates)
+                return MavenHandler(path, entry.name)

-    # List files in the immediate subdirectories
-    for entry in os.scandir(path):
-        if entry.is_dir():
-            for sub_entry in os.scandir(entry.path):
-                if sub_entry.is_file() and sub_entry.name in to_keep:
-                    if verbose:
-                        print(f"Found {sub_entry.name} in {repo} first level, so keeping it and returning")
-                    updates["depth_of_build_file"] = 1
-                    if entry.name == "build.gradle":
-                        updates["build_system"] = "gradle"
-                        return GradleHandler(path, os.path.join(entry.name, sub_entry.name), updates)
-                    else:
-                        updates["build_system"] = "maven"
-                        return MavenHandler(path, os.path.join(entry.name, sub_entry.name), updates)
-
-    updates["error_msg"] = "No build file found"
-    return None
+    raise CantFindBuildFile(f"Couldn't find any of {to_keep} build files in the directory '{path}'")
+    # # List files in the immediate subdirectories
+    # for entry in os.scandir(path):
+    #     if entry.is_dir():
+    #         for sub_entry in os.scandir(entry.path):
+    #             if sub_entry.is_file() and sub_entry.name in to_keep:
+    #                 if verbose:
+    #                     print(f"Found {sub_entry.name} in {repo} first level, so keeping it and returning")
+    #                 updates["depth_of_build_file"] = 1
+    #                 if entry.name == "build.gradle":
+    #                     updates["build_system"] = "gradle"
+    #                     return GradleHandler(path, os.path.join(entry.name, sub_entry.name), updates)
+    #                 else:
+    #                     updates["build_system"] = "maven"
+    #                     return MavenHandler(path, os.path.join(entry.name, sub_entry.name), updates)
+    # updates["error_msg"] = "No build file found"
+    # return None
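With this rewrite, get_build_handler no longer takes an updates dict or returns None on failure: it either returns a BuildHandler or raises NotValidDirectory / CantFindBuildFile. A hedged sketch of the new calling convention (the root directory and repository name below are illustrative placeholders):

from errors import CantFindBuildFile, NotValidDirectory
from handlers import get_build_handler

try:
    build_handler = get_build_handler("repos", "some-org/some-repo", verbose=True)
    print("Build system:", build_handler.get_type())
except (NotValidDirectory, CantFindBuildFile) as e:
    # Both inherit from SetupException, so the caller can log e.reason_for_failure uniformly.
    print("Setup failed:", e.reason_for_failure)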


@@ -1,6 +1,8 @@
 from collections import defaultdict
 import argparse, os, subprocess, docker
 from typing import Any, Callable
+from github.Commit import Commit
+from github.ContentFile import ContentFile
 from github.PullRequest import PullRequest
 from github.Repository import Repository
 import pandas as pd
@@ -8,7 +10,26 @@ from github import Github, GithubException
 from tqdm import tqdm
 from datetime import datetime
-from dataset import Dataset, DatasetEntry, FileData, Metadata
+from dataset import (
+    Comment,
+    Dataset,
+    Dataset_new,
+    DatasetEntry,
+    DatasetEntry_new,
+    FileData,
+    FileData_new,
+    Metadata,
+    Metadata_new,
+)
+from errors import (
+    CantCheckoutCommitError,
+    CantEnsureFullHistoryError,
+    CantFetchPRError,
+    MultipleFilesError,
+    NoDiffsAfterError,
+    NoDiffsBeforeError,
+    SetupException,
+)
 from handlers import HandlerException, get_build_handler
 from utils import has_only_1_comment, move_github_logging_to_file, clone, run_git_cmd
@@ -54,115 +75,216 @@ def reset_repo_to_latest_commit(repo_path: str) -> None:
     run_git_cmd(["reset", "--hard", current_branch], repo_path)

+
+def get_diffs_before(repo: Repository, pr: PullRequest) -> dict[str, str]:
+    comments = list(pr.get_review_comments())
+    comments.sort(key=lambda comment: comment.created_at)
+    first_comment = comments[0]
+    try:
+        return {
+            file.filename: file.patch
+            for file in repo.compare(pr.base.sha, first_comment.commit_id).files
+        }
+    except GithubException as e:
+        raise NoDiffsBeforeError(e)
+
+
+def get_diffs_after(repo: Repository, pr: PullRequest) -> dict[str, str]:
+    comments = list(pr.get_review_comments())
+    comments.sort(key=lambda comment: comment.created_at)
+
+    first_commit_after_comment = None
+    commits = list(pr.get_commits())
+    commits.sort(key=lambda commit: commit.commit.author.date)
+    for commit in commits:
+        if commit.commit.author.date > comments[0].created_at:
+            first_commit_after_comment = commit
+            break
+
+    assert first_commit_after_comment is not None, "No commit after the comment"
+
+    try:
+        return {
+            file.filename: file.patch
+            for file in repo.compare(first_commit_after_comment.sha, pr.base.sha).files
+        }
+    except GithubException as e:
+        raise NoDiffsAfterError(e)
+
+
+def checkout(repo_path: str, pr: PullRequest) -> None:
+    try:
+        ensure_full_history(repo_path)
+    except subprocess.CalledProcessError as e:
+        raise CantEnsureFullHistoryError(e.stderr)
+
+    try:
+        run_git_cmd(["checkout", pr.merge_commit_sha], repo_path)
+    except subprocess.CalledProcessError:
+        try:
+            run_git_cmd(["fetch", "origin", f"pull/{pr.number}/merge"], repo_path)
+        except subprocess.CalledProcessError as e:
+            raise CantFetchPRError(e.stderr)
+
+        try:
+            run_git_cmd(["checkout", pr.merge_commit_sha], repo_path)
+        except subprocess.CalledProcessError as e:
+            raise CantCheckoutCommitError(e.stderr)
+
+
+def try_read_file(fname: str) -> str:
+    if not os.path.exists(fname):
+        return ""  # file was removed after the PR
+    try:
+        with open(fname, "r", encoding="utf-8") as f:
+            return f.read()
+    except UnicodeDecodeError:
+        return "Binary file (from filesystem), to be ignored"
+    except IsADirectoryError:
+        return "File listed in PR is a directory (likely a submodule), to be ignored"
+
+
+def get_files(pr: PullRequest, repo: Repository, repo_path: str) -> dict[str, FileData_new]:
+    ret = {}
+    for file in pr.get_files():
+        try:
+            contents = repo.get_contents(file.filename, ref=pr.base.sha)
+            assert isinstance(
+                contents, ContentFile
+            ), f"Multiple files with the same name {file.filename} in base sha {pr.base.sha} ({contents})"
+            contents_before = contents.decoded_content.decode()
+        except AssertionError as e:
+            raise MultipleFilesError(e)
+        except UnicodeError as e:
+            contents_before = "Binary content (from API), to be ignored"
+        except Exception as e:
+            contents_before = ""  # file didn't exist before the PR
+
+        try:
+            contents = repo.get_contents(file.filename, ref=pr.merge_commit_sha)
+            assert isinstance(
+                contents, ContentFile
+            ), f"Multiple files with the same name {file.filename} in merge commit sha {pr.base.sha} ({contents})"
+            contents_after = contents.decoded_content.decode()
+        except AssertionError as e:
+            raise MultipleFilesError(e)
+        except UnicodeError as e:
+            contents_after = "Binary content (from API), to be ignored"
+        except Exception as e:
+            checkout(repo_path, pr)
+            contents_after = try_read_file(os.path.join(repo_path, file.filename))
+
+        ret[file.filename] = FileData_new(
+            is_code_related=file.filename.endswith('.java'),
+            coverage={},
+            content_before_pr=contents_before,
+            content_after_pr=contents_after,
+        )
+
+    return ret
+
+
+def get_comments(pr: PullRequest) -> list[Comment]:
+    ret = []
+    for comment in pr.get_review_comments():
+        comment = Comment(
+            body=comment.body,
+            file=comment.path,
+            from_=comment.start_line if comment.start_line else comment.line,
+            to=comment.line,
+        )
+        if comment.from_ is None or comment.to is None:
+            comment.to = comment.original_line
+            comment.from_ = comment.original_start_line
+        ret.append(comment)
+    return ret
+
+
 def process_pull(
     repo: Repository,
     pr: PullRequest,
-    dataset: Dataset,
+    dataset: Dataset_new,
     repos_dir: str,
-    cache: dict[str, dict[int, DatasetEntry]] = {},
+    cache: dict[str, dict[int, DatasetEntry_new]] = {},
 ):
     if pr.number in cache.get(repo.full_name, set()):
         dataset.entries.append(cache[repo.full_name][pr.number])
         return

-    commits = list(pr.get_commits())
-    if not commits:
-        return  # No commits, skip processing
-
-    first_commit = commits[0]
-    last_commit = commits[-1]
-
-    try:
-        diffs_before = {
-            file.filename: file.patch for file in repo.compare(pr.base.sha, first_commit.sha).files
-        }
-    except GithubException as e:
-        return
+    entry = DatasetEntry_new(
+        metadata=Metadata_new(
+            repo.full_name,
+            pr.number,
+            pr.title,
+            pr.body,
+            pr.merge_commit_sha,
+            reason_for_failure="Was still being processed",
+        ),
+        files={},
+        diffs_before={},
+        comments=[],
+        diffs_after={},
+    )
+    dataset.entries.append(entry)

     comments = list(pr.get_review_comments())
     assert len(comments) == 1
     comment = comments[0]
-    comment_text = comment.body
     commented_file_path = comment.path

-    try:
-        diffs_after = {
-            file.filename: file.patch
-            for file in repo.compare(first_commit.sha, last_commit.sha).files
-        }
-    except GithubException as e:
-        return
-
-    entry = DatasetEntry(
-        metadata=Metadata(
-            repo.full_name,
-            pr.number,
-            pr.merge_commit_sha,
-            {comment_text: commented_file_path},
-            reason_for_failure="Was still being processed",
-        ),
-        files={file.filename: FileData(file.filename) for file in pr.get_files()},
-        diffs_before=diffs_before,
-        comments=[comment_text],
-        diffs_after=diffs_after,
-    )
-    dataset.entries.append(entry)
-
     repo_path = os.path.join(repos_dir, repo.full_name)

-    updates = {}
-    if not clone(repo.full_name, repos_dir, updates):
-        entry.metadata.last_cmd_error_msg = updates["error_msg"]
-        entry.metadata.reason_for_failure = "Couldn't clone the repo successfully"
-        entry.metadata.successful = False
-
-    def _try_cmd(action: Callable[[], Any], reason_for_failure: str) -> bool:
-        """
-        Tries a command, and if it fails, it sets the metadata of the entry.
-        """
+    build_handler = None
+
+    setup_steps = [
+        (
+            "Getting diffs before the first commit...",
+            lambda: entry.diffs_before.update(get_diffs_before(repo, pr)),
+        ),
+        (
+            "Getting diffs after the first commit...",
+            lambda: entry.diffs_after.update(get_diffs_after(repo, pr)),
+        ),
+        ("Cloning the repo...", lambda: clone(repo.full_name, repos_dir)),
+        (
+            "Getting the files...",
+            lambda: entry.files.update(get_files(pr, repo, repo_path)),
+        ),
+        (
+            "Getting the comments...",
+            lambda: entry.comments.extend(get_comments(pr)),
+        ),
+        ("Checkout out merge commit...", lambda: checkout(repo_path, pr)),
+    ]
+    with tqdm(total=len(setup_steps), desc="Setting up PR", leave=False) as pbar:
+        for message, action in setup_steps:
+            pbar.set_postfix(
+                {
+                    "doing": message,
+                    "started at": datetime.now().strftime("%d/%m, %H:%M:%S"),
+                }
+            )
             try:
-            # return action()
                 action()
-        except subprocess.CalledProcessError as e:
-            entry.metadata.last_cmd_error_msg = f"{e.stderr}"
-            entry.metadata.reason_for_failure = reason_for_failure
+            except SetupException as e:
+                entry.metadata.last_cmd_error_msg = str(e)
+                entry.metadata.reason_for_failure = e.reason_for_failure
                 entry.metadata.successful = False
-            # raise e
-        return entry.metadata.successful
-
-    if not _try_cmd(
-        lambda: ensure_full_history(repo_path),
-        "Couldn't ensure the full history of the repo (fetch --unshallow)",
-    ):
                 return
+            pbar.update(1)

     try:
-        run_git_cmd(["checkout", pr.merge_commit_sha], repo_path)
-    except subprocess.CalledProcessError:
-        if not _try_cmd(
-            lambda: run_git_cmd(["fetch", "origin", f"pull/{pr.number}/merge"], repo_path),
-            "Couldn't fetch the PR's merge commit",
-        ):
-            return
-
-        if not _try_cmd(
-            lambda: run_git_cmd(["checkout", pr.merge_commit_sha], repo_path),
-            "Coudln't checkout the PR's merge commit (even after fetching the pull/<pr_number>/merge)",
-        ):
-            return
-
-    build_handler = get_build_handler(repos_dir, repo.full_name, updates)
-    if build_handler is None:
-        entry.metadata.last_cmd_error_msg = updates["error_msg"]
-        entry.metadata.reason_for_failure = "Couldn't get the build handler"
-        entry.metadata.successful = False
-        return
-
+        build_handler = get_build_handler(repos_dir, repo.full_name)
     entry.metadata.build_system = build_handler.get_type()
     build_handler.set_client(docker_client)
+    except SetupException as e:
+        entry.metadata.last_cmd_error_msg = str(e)
+        entry.metadata.reason_for_failure = e.reason_for_failure
+        entry.metadata.successful = False
+        return

     def _check_coverages():
         for coverage_file, coverage in build_handler.check_coverage(commented_file_path):
-            entry.metadata.commented_files_coverages[commented_file_path][coverage_file] = coverage
+            entry.files[commented_file_path].coverage[coverage_file] = coverage

     steps = [
         ("Checking for tests...", build_handler.check_for_tests),
@@ -197,9 +319,9 @@ def process_pull(
 def process_repo(
     repo_name: str,
-    dataset: Dataset,
+    dataset: Dataset_new,
     repos_dir: str,
-    cache: dict[str, dict[int, DatasetEntry]] = {},
+    cache: dict[str, dict[int, DatasetEntry_new]] = {},
 ):
     repo = g.get_repo(repo_name)
     if repo.full_name in cache:
@@ -224,9 +346,9 @@ def process_repo(
 def process_repos(
     df: pd.DataFrame,
-    dataset: Dataset,
+    dataset: Dataset_new,
     repos_dir: str,
-    cache: dict[str, dict[int, DatasetEntry]] = {},
+    cache: dict[str, dict[int, DatasetEntry_new]] = {},
 ):
     """
     Processes the repos in the given csv file, extracting the good ones and
@@ -254,9 +376,9 @@ def process_repos(
 def only_inject_jacoco(
-    dataset: Dataset,
+    dataset: Dataset_new,
     repos_dir: str,
-    cache: dict[str, dict[int, DatasetEntry]] = {},
+    cache: dict[str, dict[int, DatasetEntry_new]] = {},
 ):
     n_successfull_injections = 0
     n_tried_injections = 0
@@ -344,13 +466,13 @@ if __name__ == "__main__":
     if args.only_repo is not None:
         df = df.loc[df["name"] == args.only_repo]

-    cache: dict[str, dict[int, DatasetEntry]] = defaultdict(dict)
+    cache: dict[str, dict[int, DatasetEntry_new]] = defaultdict(dict)
     if args.cache is not None:
-        cache_dataset = Dataset.from_json(args.cache)
+        cache_dataset = Dataset_new.from_json(args.cache)
         for cache_entry in cache_dataset.entries:
             cache[cache_entry.metadata.repo][cache_entry.metadata.pr_number] = cache_entry

-    dataset = Dataset()
+    dataset = Dataset_new()
     try:
         if args.only_inject_jacoco:
             only_inject_jacoco(dataset, args.repos, cache)
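The rewritten process_pull drives its setup through a list of (message, thunk) pairs and a tqdm progress bar, so any step that raises a SetupException is recorded on the entry's metadata in one place. A condensed, self-contained sketch of that control flow, with dummy steps and a plain dict standing in for the entry's metadata (both are stand-ins, not the real objects from this commit):

from datetime import datetime
from tqdm import tqdm

class SetupException(Exception):
    reason_for_failure = "step failed"

def run_setup(steps, metadata):
    with tqdm(total=len(steps), desc="Setting up PR", leave=False) as pbar:
        for message, action in steps:
            pbar.set_postfix({"doing": message, "started at": datetime.now().strftime("%d/%m, %H:%M:%S")})
            try:
                action()
            except SetupException as e:
                # A failed step is recorded once, with its class-level reason, and processing stops.
                metadata["last_cmd_error_msg"] = str(e)
                metadata["reason_for_failure"] = e.reason_for_failure
                metadata["successful"] = False
                return
            pbar.update(1)

meta = {"successful": True}
run_setup([("Cloning the repo...", lambda: None), ("Getting the files...", lambda: None)], meta)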