Mirror of https://github.com/karma-riuk/crab.git, synced 2025-07-05 05:28:13 +02:00
now the rows actually get updated
clone_repos.py (138 changed lines)
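Why this fixes the bug in the commit title: the previous code mutated the `row` Series that `pandas.DataFrame.apply` hands to `process_row`, but that Series is not a view into the frame, so the assignments never reached the DataFrame or the CSV written from it. The commit switches to returning a plain dict of updates and writing the values back explicitly. A minimal, self-contained sketch of the pitfall and of the new pattern (toy data and names, not from the repo):

import pandas as pd

df = pd.DataFrame({"name": ["a/b", "c/d"], "error_msg": [None, None]})

def process(row: pd.Series) -> None:
    row["error_msg"] = "oops"  # mutates the object apply passed in, not the frame

df.apply(process, axis=1)
print(df["error_msg"].tolist())  # [None, None] -- the writes were silently lost

def process_returning_updates(row: pd.Series) -> dict:
    return {"error_msg": "oops"}  # report what should change instead of mutating

for i, row in df.iterrows():
    for col, value in process_returning_updates(row).items():
        df.at[i, col] = value  # explicit write-back into the frame
print(df["error_msg"].tolist())  # ['oops', 'oops']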
@@ -9,7 +9,7 @@ EXCLUSION_LIST = [
     "edmcouncil/idmp",
 ]
 
-def clone(repo: str, dest: str, row: pd.Series, force: bool = False, verbose: bool = False) -> None:
+def clone(repo: str, dest: str, updates: dict, force: bool = False, verbose: bool = False) -> None:
     """
     Clones a GitHub repository into a local directory.
 
@@ -28,16 +28,16 @@ def clone(repo: str, dest: str, row: pd.Series, force: bool = False, verbose: bo
         stderr=subprocess.PIPE
     )
     if proc.returncode != 0:
-        row["successfully_cloned"] = False
+        updates["successfully_cloned"] = False
         print(f"Failed to clone {repo}", file=sys.stderr)
         print(f"Error message was:", file=sys.stderr)
         error_msg = proc.stderr.decode()
         print(error_msg, file=sys.stderr)
-        row["error_msg"] = error_msg
+        updates["error_msg"] = error_msg
     else:
-        row["successfully_cloned"] = True
+        updates["successfully_cloned"] = True
 
-def get_build_file(root: str, repo: str, row: pd.Series, verbose: bool = False):
+def get_build_file(root: str, repo: str, updates: dict, verbose: bool = False):
     """
     Get the path to the build file of a repository. The build file is either a
     `pom.xml`, `build.gradle`, or `build.xml` file.
@@ -54,14 +54,14 @@ def get_build_file(root: str, repo: str, row: pd.Series, verbose: bool = False):
     if not os.path.isdir(path):
         error_msg = f"The path {path} is not a valid directory."
         print(error_msg, file=sys.stderr)
-        row["error_msg"] = error_msg
+        updates["error_msg"] = error_msg
         return None
 
     to_keep = ["pom.xml", "build.gradle", "build.xml"]
     for entry in os.scandir(path):
         if entry.is_file() and entry.name in to_keep:
             if verbose: print(f"Found {entry.name} in {repo} root, so keeping it and returning")
-            row["depth_of_build_file"] = 0
+            updates["depth_of_build_file"] = 0
             return os.path.join(path, entry.name)
 
     # List files in the immediate subdirectories
@@ -70,24 +70,24 @@ def get_build_file(root: str, repo: str, row: pd.Series, verbose: bool = False):
         for sub_entry in os.scandir(entry.path):
             if sub_entry.is_file() and sub_entry.name in to_keep:
                 if verbose: print(f"Found {sub_entry.name} in {repo} first level, so keeping it and returning")
-                row["depth_of_build_file"] = 1
+                updates["depth_of_build_file"] = 1
                 return os.path.join(path, entry.name, sub_entry.name)
 
-    row["error_msg"] = "No build file found"
+    updates["error_msg"] = "No build file found"
     return None
 
-def has_tests(path: str, build_file: str, row: pd.Series) -> bool:
+def has_tests(path: str, build_file: str, updates: dict) -> bool:
     with open(build_file, "r") as f:
         content = f.read()
 
     for library in ["junit", "testng", "mockito"]:
         if library in content:
-            row["detected_source_of_tests"] = library + " library in build file"
+            updates["detected_source_of_tests"] = library + " library in build file"
             return True
 
     for keyword in ["testImplementation", "functionalTests", "bwc_tests_enabled"]:
         if keyword in content:
-            row["detected_source_of_tests"] = keyword + " keyword in build file"
+            updates["detected_source_of_tests"] = keyword + " keyword in build file"
             return False
 
     test_dirs = [
@@ -98,10 +98,10 @@ def has_tests(path: str, build_file: str, row: pd.Series) -> bool:
     ]
     for td in test_dirs:
         if os.path.exists(os.path.join(path, td)):
-            row["detected_source_of_tests"] = td + " dir exists in repo"
+            updates["detected_source_of_tests"] = td + " dir exists in repo"
             return True
 
-    row["error_msg"] = "No tests found"
+    updates["error_msg"] = "No tests found"
     return False
 
 def remove_dir(dir: str) -> None:
@@ -117,37 +117,47 @@ def remove_dir(dir: str) -> None:
     shutil.rmtree(parent)
 
 
-def process_row(row, dest: str, force: bool = False, verbose: bool = False):
-    repo = row["name"]
-    if repo in EXCLUSION_LIST:
-        row["error_msg"] = "Repo in exclusion list"
-        if verbose: print(f"Skipping {repo}, in exclusion list")
-        return
-
-    if force:
-        clone(repo, dest, row, verbose=verbose)
-
-    repo_path = os.path.join(dest, repo)
-    if not os.path.exists(repo_path):
-        row["error_msg"] = "Repo not cloned"
-        return
-
-    build_file = get_build_file(dest, repo, row)
-    if build_file is None:
-        if verbose: print(f"Removing {repo}, no build file")
-        remove_dir(repo_path)
-        return
-
-    if not has_tests(repo_path, build_file, row):
-        if verbose: print(f"Removing {repo}, no test suites")
-        remove_dir(repo_path)
-        return
-    # if verbose: print(f"Keeping {repo}")
-
-    # Check for compilation and tests
-
-    # If repo was not removed, then it is a good repo
-    row["good_repo_for_crab"] = True
+def process_row(row, dest: str, force: bool = False, verbose: bool = False) -> dict:
+    updates = {} # Dictionary to store updates
+    with tqdm(total=3, leave=False) as pbar:
+        repo = row["name"]
+        if repo in EXCLUSION_LIST:
+            updates["error_msg"] = "Repo in exclusion list"
+            if verbose: print(f"Skipping {repo}, in exclusion list")
+            return updates
+
+        pbar.set_postfix_str("Cloning...")
+        if force:
+            clone(repo, dest, updates, verbose=verbose)
+        pbar.update(1)
+
+        repo_path = os.path.join(dest, repo)
+        if not os.path.exists(repo_path):
+            updates["error_msg"] = "Repo not cloned"
+            return updates
+
+        pbar.set_postfix_str("Getting build file...")
+        build_file = get_build_file(dest, repo, updates)
+        if build_file is None:
+            if verbose: print(f"Removing {repo}, no build file")
+            remove_dir(repo_path)
+            return updates
+        pbar.update(1)
+
+        # Check for compilation and tests
+        pbar.set_postfix_str("Checking for tests...")
+        # if not has_tests(repo_path, build_file, updates):
+        #     if verbose: print(f"Removing {repo}, no test suites")
+        #     remove_dir(repo_path)
+        #     return
+        # if verbose: print(f"Keeping {repo}")
+        pbar.update(1)
+
+        # If repo was not removed, then it is a good repo
+        updates["good_repo_for_crab"] = True
+        return updates
 
 
 def clone_repos(file: str, dest: str, force: bool =False, verbose: bool = False) -> None:
     """
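A side note on the progress reporting introduced in `process_row` above: the per-repo bar is created with `leave=False`, so it is erased once its three steps finish, while the outer bar added in the next hunk ticks once per repository. A minimal sketch of that nesting (dummy work and illustrative names only, not code from the repo):

from tqdm import tqdm
import time

repos = ["a/b", "c/d", "e/f"]

with tqdm(total=len(repos)) as outer:                  # one tick per repository
    for repo in repos:
        with tqdm(total=3, leave=False) as inner:      # per-repo steps, erased when done
            for step in ("Cloning...", "Getting build file...", "Checking for tests..."):
                inner.set_postfix_str(step)
                time.sleep(0.1)                        # stand-in for the real work
                inner.update(1)
        outer.set_postfix({"repo": repo}, refresh=True)
        outer.update(1)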
@@ -162,26 +172,48 @@ def clone_repos(file: str, dest: str, force: bool =False, verbose: bool = False)
     if verbose: print(f"Reading CSV file {file}")
     df = pd.read_csv(file)
 
-    df["successfully_cloned"] = None
-    df["build_system"] = None
-    df["depth_of_build_file"] = None
-    df["detected_source_of_tests"] = None
-    df["error_msg"] = None
-    df["good_repo_for_crab"] = False
-    df["n_tests"] = None
-    df["n_tests_with_grep"] = None
-    df["n_tests_passed"] = None
-    df["n_tests_failed"] = None
-    df["n_tests_skipped"] = None
+    # drop all columns besides the name
+    df = df[["name"]]
+
+    updates_list = [] # Collect updates in a list
 
+    good_repos = 0
     try:
         if verbose: print("Processing repositories")
-        df.progress_apply(lambda row: process_row(row, dest, force=force, verbose=verbose), axis=1)
+        with tqdm(total=len(df)) as pbar:
+            for i, row in df.iterrows():
+                updates = process_row(row, dest, force=force, verbose=verbose)
+                if "good_repo_for_crab" in updates and updates["good_repo_for_crab"]:
+                    good_repos += 1
+                pbar.update(1)
+                pbar.set_postfix({"repo": row["name"], "good_repos": good_repos}, refresh=True)
+                updates_list.append((i, updates)) # Collect updates
     except KeyboardInterrupt:
         print("Keyboard interrupt detected. Stopping the processing of the repos...")
 
+    # Create columns for the new data
+    df = df.assign(
+        successfully_cloned=None,
+        build_system=None,
+        depth_of_build_file=None,
+        detected_source_of_tests=None,
+        error_msg=None,
+        good_repo_for_crab=False,
+        n_tests=None,
+        n_tests_with_grep=None,
+        n_tests_passed=None,
+        n_tests_failed=None,
+        n_tests_skipped=None
+    )
+
+    # Set the new data
+    for index, updates in updates_list:
+        for col, value in updates.items():
+            df.at[index, col] = value # Batch updates to avoid fragmentation
+
     if verbose: print("Writing results...")
-    df.to_csv("results.csv.gz", index=False)
+    df.to_csv("results.csv", index=False)
 
 
 if __name__ == "__main__":
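For completeness, a standalone sketch (illustrative values only, not data from the repo) of the write-back phase added at the end of `clone_repos`: the result columns are created in a single `assign` call rather than inserted one by one, and each collected `(index, updates)` pair is then applied cell by cell with `df.at`:

import pandas as pd

df = pd.DataFrame({"name": ["apache/foo", "apache/bar"]})

# Create every result column up front in one call instead of repeated insertions.
df = df.assign(successfully_cloned=None, error_msg=None, good_repo_for_crab=False)

updates_list = [
    (0, {"successfully_cloned": True, "good_repo_for_crab": True}),
    (1, {"successfully_cloned": False, "error_msg": "Failed to clone"}),
]

for index, updates in updates_list:
    for col, value in updates.items():
        df.at[index, col] = value  # label-based scalar write into an existing column

print(df)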