mirror of https://github.com/karma-riuk/crab.git
synced 2025-07-05 05:28:13 +02:00

formatted code

dataset.py (19 lines changed)
@@ -2,23 +2,27 @@ from dataclasses import dataclass, field
 from typing import Dict, List
 import json


 # fmt: off
 @dataclass
 class FileData:
     path: str
     content: str = "" # Not sure about this, maybe we should just keep the path and extract the contents dynamically (boh)


 @dataclass
 class Metadata:
     repo: str # the name of the repo, with style XXX/YYY
     pr_number: int
     merge_commit_sha: str # to checkout for the tests
-    commented_files : Dict[str, str] # comment -> filename
+    commented_files: Dict[str, str] # comment -> filename
     commented_files_coverages: Dict[str, Dict[str, float]] = field(default_factory=dict) # filename -> jacoco-report -> coverage
     successful: bool = True
     build_system: str = ""
     reason_for_failure: str = ""
     last_cmd_error_msg: str = ""


 @dataclass
 class DatasetEntry:
     metadata: Metadata
@@ -27,6 +31,8 @@ class DatasetEntry:
     comments: List[str]
     diffs_after: Dict[str, str] # filename -> diff, changes after the comment


 # fmt: on
 @dataclass
 class Dataset:
     entries: List[DatasetEntry] = field(default_factory=list)
@@ -48,19 +54,20 @@ class Dataset:
         for entry_data in data["entries"]:
             metadata_data = entry_data["metadata"]
             metadata = Metadata(**metadata_data)
-            if not keep_still_in_progress and metadata.reason_for_failure == "Was still being processed":
+            if (
+                not keep_still_in_progress
+                and metadata.reason_for_failure == "Was still being processed"
+            ):
                 continue

-            files = {
-                fname: FileData(**fdata) for fname, fdata in entry_data["files"].items()
-            }
+            files = {fname: FileData(**fdata) for fname, fdata in entry_data["files"].items()}

             entry = DatasetEntry(
                 metadata=metadata,
                 files=files,
                 diffs_before=entry_data["diffs_before"],
                 comments=entry_data["comments"],
-                diffs_after=entry_data["diffs_after"]
+                diffs_after=entry_data["diffs_after"],
             )
             entries.append(entry)
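For consumers of the resulting dataset.json, a minimal usage sketch of the loader above; Dataset.from_json's exact signature is not part of this diff, so the path argument and the keep_still_in_progress flag (which the body clearly consults) are assumptions:

# Hypothetical usage, assuming dataset.py is importable and from_json takes a path.
from dataset import Dataset

dataset = Dataset.from_json("dataset.json", keep_still_in_progress=False)
print(len(dataset.entries))  # number of DatasetEntry objects that were kept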
pull_requests.py (129 lines changed)
@@ -27,13 +27,17 @@ def get_good_projects(csv_file: str) -> pd.DataFrame:
     df = pd.read_csv(csv_file)
     return df.loc[(df['good_repo_for_crab'] == True) & (df['n_tests'] > 0)]


 def is_pull_good(pull: PullRequest, verbose: bool = False):
     return (
         has_only_1_comment(pull.get_commits(), pull.get_review_comments(), verbose=verbose)
         and pull.user.type != "Bot"
     )

-def get_good_prs(repo: Repository, cache: dict[str, dict[int, DatasetEntry]] = {}) -> list[PullRequest]:
+
+def get_good_prs(
+    repo: Repository, cache: dict[str, dict[int, DatasetEntry]] = {}
+) -> list[PullRequest]:
     good_prs = []

     potenially_good_prs = repo.get_pulls(state="closed")
@@ -42,7 +46,11 @@ def get_good_prs(repo: Repository, cache: dict[str, dict[int, DatasetEntry]] = {
     if number_of_prs == 0:
         return []

-    with tqdm(total=number_of_prs, desc=f"Extracting good PRs from {repo.full_name}", leave=False) as pbar:
+    with tqdm(
+        total=number_of_prs,
+        desc=f"Extracting good PRs from {repo.full_name}",
+        leave=False,
+    ) as pbar:
         for pr in potenially_good_prs:
             pbar.set_postfix({"new good found": len(good_prs), "pr_number": pr.number})
             if pr.merged_at is None or pr.number in cache.get(repo.full_name, set()):
@@ -54,8 +62,15 @@ def get_good_prs(repo: Repository, cache: dict[str, dict[int, DatasetEntry]] = {

     return good_prs
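Taken together, the helpers above suggest a driver along these lines; this is a sketch only, assuming the CSV produced by clone_repos.py has name, good_repo_for_crab and n_tests columns (as the filter above implies) and that g is the authenticated github.Github client created at the bottom of the script:

# Illustrative driver, not the script's real entry point (see __main__ below).
good_projects = get_good_projects("results.csv")  # rows with good_repo_for_crab and n_tests > 0
for _, row in good_projects.iterrows():
    repo = g.get_repo(row["name"])
    prs = get_good_prs(repo)  # merged PRs that pass is_pull_good
    print(repo.full_name, len(prs))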


 def run_git_cmd(cmd: list[str], repo_path: str) -> subprocess.CompletedProcess:
-    return subprocess.run(["git", "-C", repo_path] + cmd, check=True, capture_output=True, text=True)
+    return subprocess.run(
+        ["git", "-C", repo_path] + cmd,
+        check=True,
+        capture_output=True,
+        text=True,
+    )


 def ensure_full_history(repo_path: str) -> None:
     result = run_git_cmd(["rev-parse", "--is-shallow-repository"], repo_path)
@@ -63,11 +78,19 @@ def ensure_full_history(repo_path: str) -> None:
     if result.stdout.strip() == "true":
         run_git_cmd(["fetch", "--unshallow"], repo_path)


 def reset_repo_to_latest_commit(repo_path: str) -> None:
     current_branch = run_git_cmd(["rev-parse", "--abbrev-ref", "HEAD"], repo_path).stdout.strip()
     run_git_cmd(["reset", "--hard", current_branch], repo_path)

-def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset, repos_dir: str, cache: dict[str, dict[int, DatasetEntry]] = {}):
+
+def process_pull(
+    repo: Repository,
+    pr: PullRequest,
+    dataset: Dataset,
+    repos_dir: str,
+    cache: dict[str, dict[int, DatasetEntry]] = {},
+):
     if pr.number in cache.get(repo.full_name, set()):
         dataset.entries.append(cache[repo.full_name][pr.number])
         return
@@ -80,7 +103,9 @@ def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset, repos_dir:
     last_commit = commits[-1]

     try:
-        diffs_before = {file.filename: file.patch for file in repo.compare(pr.base.sha, first_commit.sha).files}
+        diffs_before = {
+            file.filename: file.patch for file in repo.compare(pr.base.sha, first_commit.sha).files
+        }
     except GithubException as e:
         return

@@ -91,12 +116,21 @@ def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset, repos_dir:
     commented_file_path = comment.path

     try:
-        diffs_after = {file.filename: file.patch for file in repo.compare(first_commit.sha, last_commit.sha).files}
+        diffs_after = {
+            file.filename: file.patch
+            for file in repo.compare(first_commit.sha, last_commit.sha).files
+        }
     except GithubException as e:
         return

     entry = DatasetEntry(
-        metadata=Metadata(repo.full_name, pr.number, pr.merge_commit_sha, {comment_text: commented_file_path}, reason_for_failure="Was still being processed"),
+        metadata=Metadata(
+            repo.full_name,
+            pr.number,
+            pr.merge_commit_sha,
+            {comment_text: commented_file_path},
+            reason_for_failure="Was still being processed",
+        ),
         files={file.filename: FileData(file.filename) for file in pr.get_files()},
         diffs_before=diffs_before,
         comments=[comment_text],
@@ -126,16 +160,25 @@ def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset, repos_dir:
             # raise e
         return entry.metadata.successful

-    if not _try_cmd(lambda: ensure_full_history(repo_path), "Couldn't ensure the full history of the repo (fetch --unshallow)"):
+    if not _try_cmd(
+        lambda: ensure_full_history(repo_path),
+        "Couldn't ensure the full history of the repo (fetch --unshallow)",
+    ):
         return

     try:
         run_git_cmd(["checkout", pr.merge_commit_sha], repo_path)
     except subprocess.CalledProcessError:
-        if not _try_cmd(lambda: run_git_cmd(["fetch", "origin", f"pull/{pr.number}/merge"], repo_path), "Couldn't fetch the PR's merge commit"):
+        if not _try_cmd(
+            lambda: run_git_cmd(["fetch", "origin", f"pull/{pr.number}/merge"], repo_path),
+            "Couldn't fetch the PR's merge commit",
+        ):
             return

-        if not _try_cmd(lambda: run_git_cmd(["checkout", pr.merge_commit_sha], repo_path), "Coudln't checkout the PR's merge commit (even after fetching the pull/<pr_number>/merge)"):
+        if not _try_cmd(
+            lambda: run_git_cmd(["checkout", pr.merge_commit_sha], repo_path),
+            "Coudln't checkout the PR's merge commit (even after fetching the pull/<pr_number>/merge)",
+        ):
             return

     build_handler = get_build_handler(repos_dir, repo.full_name, updates)
@@ -162,7 +205,12 @@ def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset, repos_dir:
     with build_handler, tqdm(total=len(steps), desc="Processing PR", leave=False) as pbar:
         try:
             for message, action in steps:
-                pbar.set_postfix({"doing": message, "started at": datetime.now().strftime("%d/%m, %H:%M:%S")})
+                pbar.set_postfix(
+                    {
+                        "doing": message,
+                        "started at": datetime.now().strftime("%d/%m, %H:%M:%S"),
+                    }
+                )
                 action()
                 pbar.update(1)
         except HandlerException as e:
@@ -177,7 +225,13 @@ def process_pull(repo: Repository, pr: PullRequest, dataset: Dataset, repos_dir:
     entry.metadata.reason_for_failure = "" # was set to 'still processing', since it's done being processed and was successful, there are no reasons for failure
     dataset.to_json(args.output)
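The _try_cmd helper used by the checkout logic is defined earlier inside process_pull (which is why it can touch entry) and its body is collapsed out of this diff; judging only from its call sites and the visible tail (# raise e, return entry.metadata.successful), it plausibly looks like this hypothetical reconstruction, not the committed code:

# Hypothetical sketch of the collapsed helper, inferred from its call sites only.
def _try_cmd(action, reason_for_failure: str) -> bool:
    try:
        action()
    except Exception as e:
        entry.metadata.successful = False
        entry.metadata.reason_for_failure = reason_for_failure
        entry.metadata.last_cmd_error_msg = str(e)
        # raise e
    return entry.metadata.successful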

-def process_repo(repo_name: str, dataset: Dataset, repos_dir: str, cache: dict[str, dict[int, DatasetEntry]] = {}):
+
+def process_repo(
+    repo_name: str,
+    dataset: Dataset,
+    repos_dir: str,
+    cache: dict[str, dict[int, DatasetEntry]] = {},
+):
     repo = g.get_repo(repo_name)
     if repo.full_name in cache:
         dataset.entries.extend(cache[repo.full_name].values())
@@ -191,7 +245,13 @@ def process_repo(repo_name: str, dataset: Dataset, repos_dir: str, cache: dict[s
         pbar.set_postfix({"pr": pr.number})
         process_pull(repo, pr, dataset, repos_dir, cache)

-def process_repos(df: pd.DataFrame, dataset: Dataset, repos_dir: str, cache: dict[str, dict[int, DatasetEntry]] = {}):
+
+def process_repos(
+    df: pd.DataFrame,
+    dataset: Dataset,
+    repos_dir: str,
+    cache: dict[str, dict[int, DatasetEntry]] = {},
+):
     """
     Processes the repos in the given csv file, extracting the good ones and
     creating the "triplets" for the dataset.
@@ -206,23 +266,50 @@ def process_repos(df: pd.DataFrame, dataset: Dataset, repos_dir: str, cache: dic
         for _, row in df.iterrows():
             repo_name = row["name"]
             assert isinstance(repo_name, str)
-            pbar.set_postfix({
+            pbar.set_postfix(
+                {
                     "repo": repo_name,
                     "started at": datetime.now().strftime("%d/%m, %H:%M:%S"),
-                "# triplets": f"{len(dataset)}/{len(dataset.entries)} ({len(dataset)/len(dataset.entries) if len(dataset.entries) > 0 else 0:.2%})"
-            })
+                    "# triplets": f"{len(dataset)}/{len(dataset.entries)} ({len(dataset)/len(dataset.entries) if len(dataset.entries) > 0 else 0:.2%})",
+                }
+            )
             process_repo(repo_name, dataset, repos_dir, cache)
             pbar.update(1)
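The cache threaded through process_pull, process_repo and process_repos maps a repo's full name to the entries already extracted for its PR numbers (dict[str, dict[int, DatasetEntry]]). A hedged sketch of how the --cache option might rebuild it from a previous run's output, assuming the same Dataset.from_json signature as in the dataset.py sketch above:

# Hypothetical helper for the --cache flag; the field names come from Metadata above.
def build_cache(previous_output: str) -> dict[str, dict[int, DatasetEntry]]:
    cache: dict[str, dict[int, DatasetEntry]] = {}
    for entry in Dataset.from_json(previous_output, keep_still_in_progress=True).entries:
        cache.setdefault(entry.metadata.repo, {})[entry.metadata.pr_number] = entry
    return cache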


 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Creates the triplets for the CRAB dataset.')
-    parser.add_argument('csv_file', type=str, help='The csv file containing the projects (the results from clone_repos.py).')
-    parser.add_argument('-o', '--output', type=str, default="./dataset.json", help='The file in which the dataset will be contained. Default is "./dataset.json"')
-    parser.add_argument('-r', '--repos', type=str, default="./results/", help='The directory in which the repos were cloned (will be cloned if they aren\'t there already). Default: "./results/"')
-    parser.add_argument('-c', '--cache', type=str, help="The name of the output file from another run of this script. This is for when the script unexpectedly got interrupted and you want to resume from where you left off.")
+    parser.add_argument(
+        'csv_file',
+        type=str,
+        help='The csv file containing the projects (the results from clone_repos.py).',
+    )
+    parser.add_argument(
+        '-o',
+        '--output',
+        type=str,
+        default="./dataset.json",
+        help='The file in which the dataset will be contained. Default is "./dataset.json"',
+    )
+    parser.add_argument(
+        '-r',
+        '--repos',
+        type=str,
+        default="./results/",
+        help='The directory in which the repos were cloned (will be cloned if they aren\'t there already). Default: "./results/"',
+    )
+    parser.add_argument(
+        '-c',
+        '--cache',
+        type=str,
+        help="The name of the output file from another run of this script. This is for when the script unexpectedly got interrupted and you want to resume from where you left off.",
+    )
     # parser.add_argument('-v', '--verbose', action='store_true', help='Prints the number of good projects.')
-    parser.add_argument("--only-repo", type=str, help="If this argument is not provided, all the repos in the '--repos' csv will be processed. If instead you want to run the script on a single repo (for testing purposes mainly) provide a string of form 'XXX/YYY' to this argument, where XXX is the owner of the repo and YYY is the name of the repo")
+    parser.add_argument(
+        "--only-repo",
+        type=str,
+        help="If this argument is not provided, all the repos in the '--repos' csv will be processed. If instead you want to run the script on a single repo (for testing purposes mainly) provide a string of form 'XXX/YYY' to this argument, where XXX is the owner of the repo and YYY is the name of the repo",
+    )

     args = parser.parse_args()
     g = Github(os.environ["GITHUB_AUTH_TOKEN_CRAB"])
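For reference, with the arguments defined above a typical invocation would look like `python pull_requests.py good_projects.csv -o dataset.json -r ./results/ --only-repo owner/name` (the file and repo names here are illustrative), with GITHUB_AUTH_TOKEN_CRAB exported in the environment beforehand.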