mirror of
https://github.com/karma-riuk/crab.git
synced 2025-07-05 13:38:12 +02:00
now the rows actually get updated
This commit is contained in:
136
clone_repos.py
136
clone_repos.py
@ -9,7 +9,7 @@ EXCLUSION_LIST = [
|
|||||||
"edmcouncil/idmp",
|
"edmcouncil/idmp",
|
||||||
]
|
]
|
||||||
|
|
||||||
def clone(repo: str, dest: str, row: pd.Series, force: bool = False, verbose: bool = False) -> None:
|
def clone(repo: str, dest: str, updates: dict, force: bool = False, verbose: bool = False) -> None:
|
||||||
"""
|
"""
|
||||||
Clones a GitHub repository into a local directory.
|
Clones a GitHub repository into a local directory.
|
||||||
|
|
||||||
@ -28,16 +28,16 @@ def clone(repo: str, dest: str, row: pd.Series, force: bool = False, verbose: bo
|
|||||||
stderr=subprocess.PIPE
|
stderr=subprocess.PIPE
|
||||||
)
|
)
|
||||||
if proc.returncode != 0:
|
if proc.returncode != 0:
|
||||||
row["successfully_cloned"] = False
|
updates["successfully_cloned"] = False
|
||||||
print(f"Failed to clone {repo}", file=sys.stderr)
|
print(f"Failed to clone {repo}", file=sys.stderr)
|
||||||
print(f"Error message was:", file=sys.stderr)
|
print(f"Error message was:", file=sys.stderr)
|
||||||
error_msg = proc.stderr.decode()
|
error_msg = proc.stderr.decode()
|
||||||
print(error_msg, file=sys.stderr)
|
print(error_msg, file=sys.stderr)
|
||||||
row["error_msg"] = error_msg
|
updates["error_msg"] = error_msg
|
||||||
else:
|
else:
|
||||||
row["successfully_cloned"] = True
|
updates["successfully_cloned"] = True
|
||||||
|
|
||||||
def get_build_file(root: str, repo: str, row: pd.Series, verbose: bool = False):
|
def get_build_file(root: str, repo: str, updates: dict, verbose: bool = False):
|
||||||
"""
|
"""
|
||||||
Get the path to the build file of a repository. The build file is either a
|
Get the path to the build file of a repository. The build file is either a
|
||||||
`pom.xml`, `build.gradle`, or `build.xml` file.
|
`pom.xml`, `build.gradle`, or `build.xml` file.
|
||||||
@ -54,14 +54,14 @@ def get_build_file(root: str, repo: str, row: pd.Series, verbose: bool = False):
|
|||||||
if not os.path.isdir(path):
|
if not os.path.isdir(path):
|
||||||
error_msg = f"The path {path} is not a valid directory."
|
error_msg = f"The path {path} is not a valid directory."
|
||||||
print(error_msg, file=sys.stderr)
|
print(error_msg, file=sys.stderr)
|
||||||
row["error_msg"] = error_msg
|
updates["error_msg"] = error_msg
|
||||||
return None
|
return None
|
||||||
|
|
||||||
to_keep = ["pom.xml", "build.gradle", "build.xml"]
|
to_keep = ["pom.xml", "build.gradle", "build.xml"]
|
||||||
for entry in os.scandir(path):
|
for entry in os.scandir(path):
|
||||||
if entry.is_file() and entry.name in to_keep:
|
if entry.is_file() and entry.name in to_keep:
|
||||||
if verbose: print(f"Found {entry.name} in {repo} root, so keeping it and returning")
|
if verbose: print(f"Found {entry.name} in {repo} root, so keeping it and returning")
|
||||||
row["depth_of_build_file"] = 0
|
updates["depth_of_build_file"] = 0
|
||||||
return os.path.join(path, entry.name)
|
return os.path.join(path, entry.name)
|
||||||
|
|
||||||
# List files in the immediate subdirectories
|
# List files in the immediate subdirectories
|
||||||
@ -70,24 +70,24 @@ def get_build_file(root: str, repo: str, row: pd.Series, verbose: bool = False):
|
|||||||
for sub_entry in os.scandir(entry.path):
|
for sub_entry in os.scandir(entry.path):
|
||||||
if sub_entry.is_file() and sub_entry.name in to_keep:
|
if sub_entry.is_file() and sub_entry.name in to_keep:
|
||||||
if verbose: print(f"Found {sub_entry.name} in {repo} first level, so keeping it and returning")
|
if verbose: print(f"Found {sub_entry.name} in {repo} first level, so keeping it and returning")
|
||||||
row["depth_of_build_file"] = 1
|
updates["depth_of_build_file"] = 1
|
||||||
return os.path.join(path, entry.name, sub_entry.name)
|
return os.path.join(path, entry.name, sub_entry.name)
|
||||||
|
|
||||||
row["error_msg"] = "No build file found"
|
updates["error_msg"] = "No build file found"
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def has_tests(path: str, build_file: str, row: pd.Series) -> bool:
|
def has_tests(path: str, build_file: str, updates: dict) -> bool:
|
||||||
with open(build_file, "r") as f:
|
with open(build_file, "r") as f:
|
||||||
content = f.read()
|
content = f.read()
|
||||||
|
|
||||||
for library in ["junit", "testng", "mockito"]:
|
for library in ["junit", "testng", "mockito"]:
|
||||||
if library in content:
|
if library in content:
|
||||||
row["detected_source_of_tests"] = library + " library in build file"
|
updates["detected_source_of_tests"] = library + " library in build file"
|
||||||
return True
|
return True
|
||||||
|
|
||||||
for keyword in ["testImplementation", "functionalTests", "bwc_tests_enabled"]:
|
for keyword in ["testImplementation", "functionalTests", "bwc_tests_enabled"]:
|
||||||
if keyword in content:
|
if keyword in content:
|
||||||
row["detected_source_of_tests"] = keyword + " keyword in build file"
|
updates["detected_source_of_tests"] = keyword + " keyword in build file"
|
||||||
return False
|
return False
|
||||||
|
|
||||||
test_dirs = [
|
test_dirs = [
|
||||||
@ -98,10 +98,10 @@ def has_tests(path: str, build_file: str, row: pd.Series) -> bool:
|
|||||||
]
|
]
|
||||||
for td in test_dirs:
|
for td in test_dirs:
|
||||||
if os.path.exists(os.path.join(path, td)):
|
if os.path.exists(os.path.join(path, td)):
|
||||||
row["detected_source_of_tests"] = td + " dir exists in repo"
|
updates["detected_source_of_tests"] = td + " dir exists in repo"
|
||||||
return True
|
return True
|
||||||
|
|
||||||
row["error_msg"] = "No tests found"
|
updates["error_msg"] = "No tests found"
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def remove_dir(dir: str) -> None:
|
def remove_dir(dir: str) -> None:
|
||||||
@ -117,37 +117,47 @@ def remove_dir(dir: str) -> None:
|
|||||||
shutil.rmtree(parent)
|
shutil.rmtree(parent)
|
||||||
|
|
||||||
|
|
||||||
def process_row(row, dest: str, force: bool = False, verbose: bool = False):
|
def process_row(row, dest: str, force: bool = False, verbose: bool = False) -> dict:
|
||||||
repo = row["name"]
|
updates = {} # Dictionary to store updates
|
||||||
if repo in EXCLUSION_LIST:
|
with tqdm(total=3, leave=False) as pbar:
|
||||||
row["error_msg"] = "Repo in exclusion list"
|
repo = row["name"]
|
||||||
if verbose: print(f"Skipping {repo}, in exclusion list")
|
if repo in EXCLUSION_LIST:
|
||||||
return
|
updates["error_msg"] = "Repo in exclusion list"
|
||||||
|
if verbose: print(f"Skipping {repo}, in exclusion list")
|
||||||
|
return updates
|
||||||
|
|
||||||
if force:
|
pbar.set_postfix_str("Cloning...")
|
||||||
clone(repo, dest, row, verbose=verbose)
|
if force:
|
||||||
|
clone(repo, dest, updates, verbose=verbose)
|
||||||
|
pbar.update(1)
|
||||||
|
|
||||||
repo_path = os.path.join(dest, repo)
|
repo_path = os.path.join(dest, repo)
|
||||||
if not os.path.exists(repo_path):
|
if not os.path.exists(repo_path):
|
||||||
row["error_msg"] = "Repo not cloned"
|
updates["error_msg"] = "Repo not cloned"
|
||||||
return
|
return updates
|
||||||
|
|
||||||
build_file = get_build_file(dest, repo, row)
|
pbar.set_postfix_str("Getting build file...")
|
||||||
if build_file is None:
|
build_file = get_build_file(dest, repo, updates)
|
||||||
if verbose: print(f"Removing {repo}, no build file")
|
if build_file is None:
|
||||||
remove_dir(repo_path)
|
if verbose: print(f"Removing {repo}, no build file")
|
||||||
return
|
remove_dir(repo_path)
|
||||||
|
return updates
|
||||||
|
pbar.update(1)
|
||||||
|
|
||||||
if not has_tests(repo_path, build_file, row):
|
|
||||||
if verbose: print(f"Removing {repo}, no test suites")
|
|
||||||
remove_dir(repo_path)
|
|
||||||
return
|
|
||||||
# if verbose: print(f"Keeping {repo}")
|
|
||||||
|
|
||||||
# Check for compilation and tests
|
pbar.set_postfix_str("Checking for tests...")
|
||||||
|
# if not has_tests(repo_path, build_file, updates):
|
||||||
|
# if verbose: print(f"Removing {repo}, no test suites")
|
||||||
|
# remove_dir(repo_path)
|
||||||
|
# return
|
||||||
|
# if verbose: print(f"Keeping {repo}")
|
||||||
|
pbar.update(1)
|
||||||
|
|
||||||
# If repo was not removed, then it is a good repo
|
# Check for compilation and tests
|
||||||
row["good_repo_for_crab"] = True
|
|
||||||
|
# If repo was not removed, then it is a good repo
|
||||||
|
updates["good_repo_for_crab"] = True
|
||||||
|
return updates
|
||||||
|
|
||||||
def clone_repos(file: str, dest: str, force: bool =False, verbose: bool = False) -> None:
|
def clone_repos(file: str, dest: str, force: bool =False, verbose: bool = False) -> None:
|
||||||
"""
|
"""
|
||||||
@ -162,26 +172,48 @@ def clone_repos(file: str, dest: str, force: bool =False, verbose: bool = False)
|
|||||||
if verbose: print(f"Reading CSV file {file}")
|
if verbose: print(f"Reading CSV file {file}")
|
||||||
df = pd.read_csv(file)
|
df = pd.read_csv(file)
|
||||||
|
|
||||||
df["successfully_cloned"] = None
|
# drop all columns besides the name
|
||||||
df["build_system"] = None
|
df = df[["name"]]
|
||||||
df["depth_of_build_file"] = None
|
|
||||||
df["detected_source_of_tests"] = None
|
|
||||||
df["error_msg"] = None
|
|
||||||
df["good_repo_for_crab"] = False
|
|
||||||
df["n_tests"] = None
|
|
||||||
df["n_tests_with_grep"] = None
|
|
||||||
df["n_tests_passed"] = None
|
|
||||||
df["n_tests_failed"] = None
|
|
||||||
df["n_tests_skipped"] = None
|
|
||||||
|
|
||||||
|
updates_list = [] # Collect updates in a list
|
||||||
|
|
||||||
|
good_repos = 0
|
||||||
try:
|
try:
|
||||||
if verbose: print("Processing repositories")
|
if verbose: print("Processing repositories")
|
||||||
df.progress_apply(lambda row: process_row(row, dest, force=force, verbose=verbose), axis=1)
|
with tqdm(total=len(df)) as pbar:
|
||||||
|
for i, row in df.iterrows():
|
||||||
|
updates = process_row(row, dest, force=force, verbose=verbose)
|
||||||
|
if "good_repo_for_crab" in updates and updates["good_repo_for_crab"]:
|
||||||
|
good_repos += 1
|
||||||
|
pbar.update(1)
|
||||||
|
pbar.set_postfix({"repo": row["name"], "good_repos": good_repos}, refresh=True)
|
||||||
|
updates_list.append((i, updates)) # Collect updates
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
print("Keyboard interrupt detected. Stopping the processing of the repos...")
|
print("Keyboard interrupt detected. Stopping the processing of the repos...")
|
||||||
|
|
||||||
|
|
||||||
|
# Create columns for the new data
|
||||||
|
df = df.assign(
|
||||||
|
successfully_cloned=None,
|
||||||
|
build_system=None,
|
||||||
|
depth_of_build_file=None,
|
||||||
|
detected_source_of_tests=None,
|
||||||
|
error_msg=None,
|
||||||
|
good_repo_for_crab=False,
|
||||||
|
n_tests=None,
|
||||||
|
n_tests_with_grep=None,
|
||||||
|
n_tests_passed=None,
|
||||||
|
n_tests_failed=None,
|
||||||
|
n_tests_skipped=None
|
||||||
|
)
|
||||||
|
|
||||||
|
# Set the new data
|
||||||
|
for index, updates in updates_list:
|
||||||
|
for col, value in updates.items():
|
||||||
|
df.at[index, col] = value # Batch updates to avoid fragmentation
|
||||||
|
|
||||||
if verbose: print("Writing results...")
|
if verbose: print("Writing results...")
|
||||||
df.to_csv("results.csv.gz", index=False)
|
df.to_csv("results.csv", index=False)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
Reference in New Issue
Block a user