Commit: now using only the new dataset version
@@ -131,7 +131,7 @@ class Dataset_new:
         json.dump(self, f, default=lambda o: o.__dict__, indent=4)
 
     @staticmethod
-    def from_json(filename: str, keep_still_in_progress: bool = False) -> "Dataset":
+    def from_json(filename: str, keep_still_in_progress: bool = False) -> "Dataset_new":
         raise NotImplementedError("This method is not implemented yet")
 
 

errors.py (new file, 45 lines)
@@ -0,0 +1,45 @@
+from abc import ABC
+
+
+class SetupException(Exception, ABC):
+    reason_for_failure: str
+
+
+class NoDiffsBeforeError(SetupException):
+    reason_for_failure = "Couldn't get the diffs before the first commit"
+
+
+class NoDiffsAfterError(SetupException):
+    reason_for_failure = "Couldn't get the diffs after the last commit"
+
+
+class CantCloneRepoError(SetupException):
+    reason_for_failure = "Couldn't clone the repository"
+
+
+class CantEnsureFullHistoryError(SetupException):
+    reason_for_failure = "Couldn't ensure the full history of the repo (fetch --unshallow)"
+
+
+class CantFetchPRError(SetupException):
+    reason_for_failure = "Couldn't fetch the PR's merge commit"
+
+
+class CantCheckoutCommitError(SetupException):
+    reason_for_failure = (
+        "Couldn't checkout the PR's merge commit (even after fetching the pull/<pr_number>/merge)"
+    )
+
+
+class MultipleFilesError(SetupException):
+    reason_for_failure = (
+        "When requesting the contents of a file, a list of ContentFile was returned"
+    )
+
+
+class NotValidDirectory(SetupException):
+    reason_for_failure = "The directory is not valid"
+
+
+class CantFindBuildFile(SetupException):
+    reason_for_failure = "Couldn't find the build file in the directory"
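
For orientation, here is a minimal sketch (not code from the repository) of how this hierarchy is meant to be consumed, matching the pattern the reworked pull_requests.py uses further down: a setup step raises a specific SetupException subclass, and the caller records the class-level reason_for_failure plus the exception detail. The clone_repo helper and the URL are made up for illustration.

from errors import CantCloneRepoError, SetupException


def clone_repo(url: str) -> None:
    # Hypothetical setup step; any failure surfaces as a SetupException subclass.
    raise CantCloneRepoError(f"git clone failed for {url}")


try:
    clone_repo("https://github.com/example/repo.git")
except SetupException as e:
    # reason_for_failure is the class-level summary; str(e) carries the specific detail.
    print(e.reason_for_failure, "-", e)
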

handlers.py (85 changed lines)
@@ -1,10 +1,12 @@
 from abc import ABC, abstractmethod
-import os, re, docker, signal, sys, javalang
+import os, re, docker, signal, javalang
 from bs4 import BeautifulSoup
-from typing import Iterable, Optional, Tuple, Iterator
+from typing import Iterable, Tuple, Iterator
 import xml.etree.ElementTree as ET
 from javalang.tree import PackageDeclaration
 
+from errors import CantFindBuildFile, NotValidDirectory
+
 REPORT_SIZE_THRESHOLD = 400 # less than 400 bytes (characters), we don't care about it
 
 
@@ -77,7 +79,9 @@ class BuildHandler(ABC):
                 raise FailedToCompileError(output)
         except TimeoutError:
             self.updates["compiled_successfully"] = False
-            self.updates["error_msg"] = "Compile process killed due to exceeding the 1-hour time limit"
+            self.updates[
+                "error_msg"
+            ] = "Compile process killed due to exceeding the 1-hour time limit"
         finally:
             signal.alarm(0) # Cancel the alarm
 
@@ -135,7 +139,9 @@ class BuildHandler(ABC):
         candidates = []
         for coverage_report_path in self.get_jacoco_report_paths():
             if not os.path.exists(coverage_report_path):
-                raise NoCoverageReportFound(f"Coverage report file '{coverage_report_path}' does not exist")
+                raise NoCoverageReportFound(
+                    f"Coverage report file '{coverage_report_path}' does not exist"
+                )
 
             fully_qualified_class = self._extract_fully_qualified_class(filename)
             candidates.append({"report_file": coverage_report_path, "fqc": fully_qualified_class})
@@ -220,7 +226,7 @@ class BuildHandler(ABC):
 
 
 class MavenHandler(BuildHandler):
-    def __init__(self, repo_path: str, build_file: str, updates: dict) -> None:
+    def __init__(self, repo_path: str, build_file: str, updates: dict = {}) -> None:
         super().__init__(repo_path, build_file, updates)
         self.base_cmd = "mvn -B -Dstyle.color=never -Dartifact.download.skip=true"
         # -B (Batch Mode): Runs Maven in non-interactive mode, reducing output and removing download progress bars.
@@ -265,7 +271,9 @@ class MavenHandler(BuildHandler):
                 self.updates["n_tests_failed"] += failures
                 self.updates["n_tests_errors"] += errors
                 self.updates["n_tests_skipped"] += skipped
-                self.updates["n_tests_passed"] += tests_run - (failures + errors) # Calculate passed tests
+                self.updates["n_tests_passed"] += tests_run - (
+                    failures + errors
+                ) # Calculate passed tests
 
     def get_jacoco_report_paths(self) -> Iterable[str]:
         found_at_least_one = False
@@ -329,7 +337,7 @@ class MavenHandler(BuildHandler):
 
 
 class GradleHandler(BuildHandler):
-    def __init__(self, repo_path: str, build_file: str, updates: dict) -> None:
+    def __init__(self, repo_path: str, build_file: str, updates: dict = {}) -> None:
         super().__init__(repo_path, build_file, updates)
         self.base_cmd = "gradle --no-daemon --console=plain"
 
@@ -360,7 +368,9 @@ class GradleHandler(BuildHandler):
 
         test_results_path = os.path.join(self.path, "build/reports/tests/test/index.html")
         if not os.path.exists(test_results_path):
-            raise NoTestResultsToExtractError("No test results found (prolly a repo with sub-projects)")
+            raise NoTestResultsToExtractError(
+                "No test results found (prolly a repo with sub-projects)"
+            )
 
         # Load the HTML file
         with open(test_results_path, "r") as file:
@@ -374,7 +384,9 @@ class GradleHandler(BuildHandler):
         # counter_div = test_div.find("div", class_="counter")
         counter_div = test_div.select_one("div.counter")
         if counter_div is None:
-            raise NoTestResultsToExtractError("No test results found (not div.counter for tests)")
+            raise NoTestResultsToExtractError(
+                "No test results found (not div.counter for tests)"
+            )
 
         self.updates["n_tests"] = int(counter_div.text.strip())
 
@@ -386,12 +398,16 @@ class GradleHandler(BuildHandler):
         # counter_div = failures_div.find("div", class_="counter")
         counter_div = failures_div.select_one("div.counter")
         if counter_div is None:
-            raise NoTestResultsToExtractError("No test results found (not div.counter for failures)")
+            raise NoTestResultsToExtractError(
+                "No test results found (not div.counter for failures)"
+            )
 
         self.updates["n_tests_failed"] = int(counter_div.text.strip())
 
         # Calculate passed tests
-        self.updates["n_tests_passed"] = self.updates["n_tests"] - self.updates["n_tests_failed"]
+        self.updates["n_tests_passed"] = (
+            self.updates["n_tests"] - self.updates["n_tests_failed"]
+        )
 
     def get_jacoco_report_paths(self) -> Iterable[str]:
         found_at_least_one = False
@@ -577,7 +593,7 @@ def get_coverage_for_file(xml_file: str, target_fully_qualified_class: str, base
         return -1
 
 
-def get_build_handler(root: str, repo: str, updates: dict, verbose: bool = False) -> Optional[BuildHandler]:
+def get_build_handler(root: str, repo: str, verbose: bool = False) -> BuildHandler:
     """
     Get the path to the build file of a repository. The build file is either a
     `pom.xml`, `build.gradle`, or `build.xml` file.
@@ -592,38 +608,33 @@ def get_build_handler(root: str, repo: str, updates: dict, verbose: bool = False
     path = os.path.join(root, repo)
     # Check if the given path is a directory
     if not os.path.isdir(path):
-        error_msg = f"The path {path} is not a valid directory."
-        print(error_msg, file=sys.stderr)
-        updates["error_msg"] = error_msg
-        return None
+        raise NotValidDirectory(f"The path {path} is not a valid directory.")
 
     to_keep = ["pom.xml", "build.gradle"]
     for entry in os.scandir(path):
         if entry.is_file() and entry.name in to_keep:
             if verbose:
                 print(f"Found {entry.name} in {repo} root, so keeping it and returning")
-            updates["depth_of_build_file"] = 0
             if entry.name == "build.gradle":
-                updates["build_system"] = "gradle"
-                return GradleHandler(path, entry.name, updates)
+                return GradleHandler(path, entry.name)
             else:
-                updates["build_system"] = "maven"
-                return MavenHandler(path, entry.name, updates)
+                return MavenHandler(path, entry.name)
 
-    # List files in the immediate subdirectories
-    for entry in os.scandir(path):
-        if entry.is_dir():
-            for sub_entry in os.scandir(entry.path):
-                if sub_entry.is_file() and sub_entry.name in to_keep:
-                    if verbose:
-                        print(f"Found {sub_entry.name} in {repo} first level, so keeping it and returning")
-                    updates["depth_of_build_file"] = 1
-                    if entry.name == "build.gradle":
-                        updates["build_system"] = "gradle"
-                        return GradleHandler(path, os.path.join(entry.name, sub_entry.name), updates)
-                    else:
-                        updates["build_system"] = "maven"
-                        return MavenHandler(path, os.path.join(entry.name, sub_entry.name), updates)
+    raise CantFindBuildFile(f"Couldn't find any of {to_keep} build files in the directory '{path}'")
+    # # List files in the immediate subdirectories
+    # for entry in os.scandir(path):
+    #     if entry.is_dir():
+    #         for sub_entry in os.scandir(entry.path):
+    #             if sub_entry.is_file() and sub_entry.name in to_keep:
+    #                 if verbose:
+    #                     print(f"Found {sub_entry.name} in {repo} first level, so keeping it and returning")
+    #                 updates["depth_of_build_file"] = 1
+    #                 if entry.name == "build.gradle":
+    #                     updates["build_system"] = "gradle"
+    #                     return GradleHandler(path, os.path.join(entry.name, sub_entry.name), updates)
+    #                 else:
+    #                     updates["build_system"] = "maven"
+    #                     return MavenHandler(path, os.path.join(entry.name, sub_entry.name), updates)
 
-    updates["error_msg"] = "No build file found"
-    return None
+    # updates["error_msg"] = "No build file found"
+    # return None
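
A note on the new get_build_handler contract: instead of reporting failures through an updates dict and returning None, it now raises NotValidDirectory or CantFindBuildFile, so callers can treat build-handler discovery like any other setup step. A hedged usage sketch, assuming the paths below, which are placeholders and not from the repository:

from errors import SetupException
from handlers import get_build_handler

try:
    # "./repos" and "apache/commons-lang" are placeholder values for illustration.
    handler = get_build_handler("./repos", "apache/commons-lang")
    print("build system:", handler.get_type())
except SetupException as e:
    print("setup failed:", e.reason_for_failure)
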

pull_requests.py (302 changed lines)
@@ -1,6 +1,8 @@
 from collections import defaultdict
 import argparse, os, subprocess, docker
 from typing import Any, Callable
+from github.Commit import Commit
+from github.ContentFile import ContentFile
 from github.PullRequest import PullRequest
 from github.Repository import Repository
 import pandas as pd
@@ -8,7 +10,26 @@ from github import Github, GithubException
 from tqdm import tqdm
 from datetime import datetime
 
-from dataset import Dataset, DatasetEntry, FileData, Metadata
+from dataset import (
+    Comment,
+    Dataset,
+    Dataset_new,
+    DatasetEntry,
+    DatasetEntry_new,
+    FileData,
+    FileData_new,
+    Metadata,
+    Metadata_new,
+)
+from errors import (
+    CantCheckoutCommitError,
+    CantEnsureFullHistoryError,
+    CantFetchPRError,
+    MultipleFilesError,
+    NoDiffsAfterError,
+    NoDiffsBeforeError,
+    SetupException,
+)
 from handlers import HandlerException, get_build_handler
 from utils import has_only_1_comment, move_github_logging_to_file, clone, run_git_cmd
 
@@ -54,115 +75,216 @@ def reset_repo_to_latest_commit(repo_path: str) -> None:
     run_git_cmd(["reset", "--hard", current_branch], repo_path)
 
 
+def get_diffs_before(repo: Repository, pr: PullRequest) -> dict[str, str]:
+    comments = list(pr.get_review_comments())
+    comments.sort(key=lambda comment: comment.created_at)
+    first_comment = comments[0]
+    try:
+        return {
+            file.filename: file.patch
+            for file in repo.compare(pr.base.sha, first_comment.commit_id).files
+        }
+    except GithubException as e:
+        raise NoDiffsBeforeError(e)
+
+
+def get_diffs_after(repo: Repository, pr: PullRequest) -> dict[str, str]:
+    comments = list(pr.get_review_comments())
+    comments.sort(key=lambda comment: comment.created_at)
+    first_commit_after_comment = None
+    commits = list(pr.get_commits())
+    commits.sort(key=lambda commit: commit.commit.author.date)
+    for commit in commits:
+        if commit.commit.author.date > comments[0].created_at:
+            first_commit_after_comment = commit
+            break
+
+    assert first_commit_after_comment is not None, "No commit after the comment"
+
+    try:
+        return {
+            file.filename: file.patch
+            for file in repo.compare(first_commit_after_comment.sha, pr.base.sha).files
+        }
+    except GithubException as e:
+        raise NoDiffsAfterError(e)
+
+
+def checkout(repo_path: str, pr: PullRequest) -> None:
+    try:
+        ensure_full_history(repo_path)
+    except subprocess.CalledProcessError as e:
+        raise CantEnsureFullHistoryError(e.stderr)
+
+    try:
+        run_git_cmd(["checkout", pr.merge_commit_sha], repo_path)
+    except subprocess.CalledProcessError:
+        try:
+            run_git_cmd(["fetch", "origin", f"pull/{pr.number}/merge"], repo_path)
+        except subprocess.CalledProcessError as e:
+            raise CantFetchPRError(e.stderr)
+
+        try:
+            run_git_cmd(["checkout", pr.merge_commit_sha], repo_path)
+        except subprocess.CalledProcessError as e:
+            raise CantCheckoutCommitError(e.stderr)
+
+
+def try_read_file(fname: str) -> str:
+    if not os.path.exists(fname):
+        return "" # file was removed after the PR
+    try:
+        with open(fname, "r", encoding="utf-8") as f:
+            return f.read()
+    except UnicodeDecodeError:
+        return "Binary file (from filesystem), to be ignored"
+    except IsADirectoryError:
+        return "File listed in PR is a directory (likely a submodule), to be ignored"
+
+
+def get_files(pr: PullRequest, repo: Repository, repo_path: str) -> dict[str, FileData_new]:
+    ret = {}
+    for file in pr.get_files():
+        try:
+            contents = repo.get_contents(file.filename, ref=pr.base.sha)
+            assert isinstance(
+                contents, ContentFile
+            ), f"Multiple files with the same name {file.filename} in base sha {pr.base.sha} ({contents})"
+            contents_before = contents.decoded_content.decode()
+        except AssertionError as e:
+            raise MultipleFilesError(e)
+        except UnicodeError as e:
+            contents_before = "Binary content (from API), to be ignored"
+        except Exception as e:
+            contents_before = "" # file didn't exist before the PR
+
+        try:
+            contents = repo.get_contents(file.filename, ref=pr.merge_commit_sha)
+            assert isinstance(
+                contents, ContentFile
+            ), f"Multiple files with the same name {file.filename} in merge commit sha {pr.base.sha} ({contents})"
+            contents_after = contents.decoded_content.decode()
+        except AssertionError as e:
+            raise MultipleFilesError(e)
+        except UnicodeError as e:
+            contents_after = "Binary content (from API), to be ignored"
+        except Exception as e:
+            checkout(repo_path, pr)
+            contents_after = try_read_file(os.path.join(repo_path, file.filename))
+
+        ret[file.filename] = FileData_new(
+            is_code_related=file.filename.endswith('.java'),
+            coverage={},
+            content_before_pr=contents_before,
+            content_after_pr=contents_after,
+        )
+
+    return ret
+
+
+def get_comments(pr: PullRequest) -> list[Comment]:
+    ret = []
+    for comment in pr.get_review_comments():
+        comment = Comment(
+            body=comment.body,
+            file=comment.path,
+            from_=comment.start_line if comment.start_line else comment.line,
+            to=comment.line,
+        )
+        if comment.from_ is None or comment.to is None:
+            comment.to = comment.original_line
+            comment.from_ = comment.original_start_line
+        ret.append(comment)
+    return ret
+
+
 def process_pull(
     repo: Repository,
     pr: PullRequest,
-    dataset: Dataset,
+    dataset: Dataset_new,
     repos_dir: str,
-    cache: dict[str, dict[int, DatasetEntry]] = {},
+    cache: dict[str, dict[int, DatasetEntry_new]] = {},
 ):
     if pr.number in cache.get(repo.full_name, set()):
         dataset.entries.append(cache[repo.full_name][pr.number])
         return
 
-    commits = list(pr.get_commits())
-    if not commits:
-        return # No commits, skip processing
-
-    first_commit = commits[0]
-    last_commit = commits[-1]
-
-    try:
-        diffs_before = {
-            file.filename: file.patch for file in repo.compare(pr.base.sha, first_commit.sha).files
-        }
-    except GithubException as e:
-        return
+    entry = DatasetEntry_new(
+        metadata=Metadata_new(
+            repo.full_name,
+            pr.number,
+            pr.title,
+            pr.body,
+            pr.merge_commit_sha,
+            reason_for_failure="Was still being processed",
+        ),
+        files={},
+        diffs_before={},
+        comments=[],
+        diffs_after={},
+    )
+    dataset.entries.append(entry)
 
     comments = list(pr.get_review_comments())
     assert len(comments) == 1
     comment = comments[0]
-    comment_text = comment.body
     commented_file_path = comment.path
 
-    try:
-        diffs_after = {
-            file.filename: file.patch
-            for file in repo.compare(first_commit.sha, last_commit.sha).files
-        }
-    except GithubException as e:
-        return
-
-    entry = DatasetEntry(
-        metadata=Metadata(
-            repo.full_name,
-            pr.number,
-            pr.merge_commit_sha,
-            {comment_text: commented_file_path},
-            reason_for_failure="Was still being processed",
-        ),
-        files={file.filename: FileData(file.filename) for file in pr.get_files()},
-        diffs_before=diffs_before,
-        comments=[comment_text],
-        diffs_after=diffs_after,
-    )
-    dataset.entries.append(entry)
-
     repo_path = os.path.join(repos_dir, repo.full_name)
 
-    updates = {}
-    if not clone(repo.full_name, repos_dir, updates):
-        entry.metadata.last_cmd_error_msg = updates["error_msg"]
-        entry.metadata.reason_for_failure = "Couldn't clone the repo successfully"
-        entry.metadata.successful = False
+    build_handler = None
 
-    def _try_cmd(action: Callable[[], Any], reason_for_failure: str) -> bool:
-        """
-        Tries a command, and if it fails, it sets the metadata of the entry.
-        """
-        try:
-            # return action()
-            action()
-        except subprocess.CalledProcessError as e:
-            entry.metadata.last_cmd_error_msg = f"{e.stderr}"
-            entry.metadata.reason_for_failure = reason_for_failure
-            entry.metadata.successful = False
-            # raise e
-        return entry.metadata.successful
-
-    if not _try_cmd(
-        lambda: ensure_full_history(repo_path),
-        "Couldn't ensure the full history of the repo (fetch --unshallow)",
-    ):
-        return
-
-    try:
-        run_git_cmd(["checkout", pr.merge_commit_sha], repo_path)
-    except subprocess.CalledProcessError:
-        if not _try_cmd(
-            lambda: run_git_cmd(["fetch", "origin", f"pull/{pr.number}/merge"], repo_path),
-            "Couldn't fetch the PR's merge commit",
-        ):
-            return
-
-        if not _try_cmd(
-            lambda: run_git_cmd(["checkout", pr.merge_commit_sha], repo_path),
-            "Coudln't checkout the PR's merge commit (even after fetching the pull/<pr_number>/merge)",
-        ):
-            return
-
-    build_handler = get_build_handler(repos_dir, repo.full_name, updates)
-    if build_handler is None:
-        entry.metadata.last_cmd_error_msg = updates["error_msg"]
-        entry.metadata.reason_for_failure = "Couldn't get the build handler"
-        entry.metadata.successful = False
-        return
-    entry.metadata.build_system = build_handler.get_type()
-    build_handler.set_client(docker_client)
+    setup_steps = [
+        (
+            "Getting diffs before the first commit...",
+            lambda: entry.diffs_before.update(get_diffs_before(repo, pr)),
+        ),
+        (
+            "Getting diffs after the first commit...",
+            lambda: entry.diffs_after.update(get_diffs_after(repo, pr)),
+        ),
+        ("Cloning the repo...", lambda: clone(repo.full_name, repos_dir)),
+        (
+            "Getting the files...",
+            lambda: entry.files.update(get_files(pr, repo, repo_path)),
+        ),
+        (
+            "Getting the comments...",
+            lambda: entry.comments.extend(get_comments(pr)),
+        ),
+        ("Checkout out merge commit...", lambda: checkout(repo_path, pr)),
+    ]
+
+    with tqdm(total=len(setup_steps), desc="Setting up PR", leave=False) as pbar:
+        for message, action in setup_steps:
+            pbar.set_postfix(
+                {
+                    "doing": message,
+                    "started at": datetime.now().strftime("%d/%m, %H:%M:%S"),
+                }
+            )
+            try:
+                action()
+            except SetupException as e:
+                entry.metadata.last_cmd_error_msg = str(e)
+                entry.metadata.reason_for_failure = e.reason_for_failure
+                entry.metadata.successful = False
+                return
+            pbar.update(1)
+
+    try:
+        build_handler = get_build_handler(repos_dir, repo.full_name)
+        entry.metadata.build_system = build_handler.get_type()
+        build_handler.set_client(docker_client)
+    except SetupException as e:
+        entry.metadata.last_cmd_error_msg = str(e)
+        entry.metadata.reason_for_failure = e.reason_for_failure
+        entry.metadata.successful = False
+        return
 
     def _check_coverages():
         for coverage_file, coverage in build_handler.check_coverage(commented_file_path):
-            entry.metadata.commented_files_coverages[commented_file_path][coverage_file] = coverage
+            entry.files[commented_file_path].coverage[coverage_file] = coverage
 
     steps = [
         ("Checking for tests...", build_handler.check_for_tests),
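
The setup_steps rework above replaces the old _try_cmd helper with a declarative list of (message, action) pairs: steps run in order, and the first SetupException aborts the PR while its reason_for_failure lands on the entry's metadata. A self-contained sketch of that control flow follows; StepResult, run_setup_steps, and fail are stand-ins for illustration, not code from the repository.

from dataclasses import dataclass
from typing import Callable

from errors import CantFetchPRError, SetupException


@dataclass
class StepResult:  # stand-in for entry.metadata
    successful: bool = True
    reason_for_failure: str = ""
    last_cmd_error_msg: str = ""


def run_setup_steps(steps: list[tuple[str, Callable[[], None]]]) -> StepResult:
    # Same control flow as process_pull: run steps in order, stop at the first SetupException.
    result = StepResult()
    for message, action in steps:
        try:
            action()
        except SetupException as e:
            result.last_cmd_error_msg = str(e)
            result.reason_for_failure = e.reason_for_failure
            result.successful = False
            break
    return result


def fail() -> None:
    raise CantFetchPRError("simulated git failure")


print(run_setup_steps([("ok step...", lambda: None), ("failing step...", fail)]).reason_for_failure)
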
@@ -197,9 +319,9 @@ def process_pull(
 
 def process_repo(
     repo_name: str,
-    dataset: Dataset,
+    dataset: Dataset_new,
     repos_dir: str,
-    cache: dict[str, dict[int, DatasetEntry]] = {},
+    cache: dict[str, dict[int, DatasetEntry_new]] = {},
 ):
     repo = g.get_repo(repo_name)
     if repo.full_name in cache:
@@ -224,9 +346,9 @@ def process_repo(
 
 def process_repos(
     df: pd.DataFrame,
-    dataset: Dataset,
+    dataset: Dataset_new,
     repos_dir: str,
-    cache: dict[str, dict[int, DatasetEntry]] = {},
+    cache: dict[str, dict[int, DatasetEntry_new]] = {},
 ):
     """
     Processes the repos in the given csv file, extracting the good ones and
@@ -254,9 +376,9 @@ def process_repos(
 
 
 def only_inject_jacoco(
-    dataset: Dataset,
+    dataset: Dataset_new,
     repos_dir: str,
-    cache: dict[str, dict[int, DatasetEntry]] = {},
+    cache: dict[str, dict[int, DatasetEntry_new]] = {},
 ):
     n_successfull_injections = 0
     n_tried_injections = 0
@@ -344,13 +466,13 @@ if __name__ == "__main__":
     if args.only_repo is not None:
         df = df.loc[df["name"] == args.only_repo]
 
-    cache: dict[str, dict[int, DatasetEntry]] = defaultdict(dict)
+    cache: dict[str, dict[int, DatasetEntry_new]] = defaultdict(dict)
     if args.cache is not None:
-        cache_dataset = Dataset.from_json(args.cache)
+        cache_dataset = Dataset_new.from_json(args.cache)
         for cache_entry in cache_dataset.entries:
             cache[cache_entry.metadata.repo][cache_entry.metadata.pr_number] = cache_entry
 
-    dataset = Dataset()
+    dataset = Dataset_new()
     try:
         if args.only_inject_jacoco:
             only_inject_jacoco(dataset, args.repos, cache)