now the rows actually get updated

Author: Karma Riuk
Date:   2025-02-28 15:25:56 +01:00
parent: a4be07c04e
commit: b918c0044c


@@ -9,7 +9,7 @@ EXCLUSION_LIST = [
     "edmcouncil/idmp",
 ]
 
-def clone(repo: str, dest: str, row: pd.Series, force: bool = False, verbose: bool = False) -> None:
+def clone(repo: str, dest: str, updates: dict, force: bool = False, verbose: bool = False) -> None:
     """
     Clones a GitHub repository into a local directory.
@@ -28,16 +28,16 @@ def clone(repo: str, dest: str, row: pd.Series, force: bool = False, verbose: bo
         stderr=subprocess.PIPE
     )
     if proc.returncode != 0:
-        row["successfully_cloned"] = False
+        updates["successfully_cloned"] = False
         print(f"Failed to clone {repo}", file=sys.stderr)
         print(f"Error message was:", file=sys.stderr)
         error_msg = proc.stderr.decode()
         print(error_msg, file=sys.stderr)
-        row["error_msg"] = error_msg
+        updates["error_msg"] = error_msg
     else:
-        row["successfully_cloned"] = True
+        updates["successfully_cloned"] = True
 
-def get_build_file(root: str, repo: str, row: pd.Series, verbose: bool = False):
+def get_build_file(root: str, repo: str, updates: dict, verbose: bool = False):
     """
     Get the path to the build file of a repository. The build file is either a
     `pom.xml`, `build.gradle`, or `build.xml` file.
@@ -54,14 +54,14 @@ def get_build_file(root: str, repo: str, row: pd.Series, verbose: bool = False):
     if not os.path.isdir(path):
         error_msg = f"The path {path} is not a valid directory."
         print(error_msg, file=sys.stderr)
-        row["error_msg"] = error_msg
+        updates["error_msg"] = error_msg
         return None
 
     to_keep = ["pom.xml", "build.gradle", "build.xml"]
     for entry in os.scandir(path):
         if entry.is_file() and entry.name in to_keep:
             if verbose: print(f"Found {entry.name} in {repo} root, so keeping it and returning")
-            row["depth_of_build_file"] = 0
+            updates["depth_of_build_file"] = 0
             return os.path.join(path, entry.name)
 
     # List files in the immediate subdirectories
@@ -70,24 +70,24 @@ def get_build_file(root: str, repo: str, row: pd.Series, verbose: bool = False):
             for sub_entry in os.scandir(entry.path):
                 if sub_entry.is_file() and sub_entry.name in to_keep:
                     if verbose: print(f"Found {sub_entry.name} in {repo} first level, so keeping it and returning")
-                    row["depth_of_build_file"] = 1
+                    updates["depth_of_build_file"] = 1
                     return os.path.join(path, entry.name, sub_entry.name)
 
-    row["error_msg"] = "No build file found"
+    updates["error_msg"] = "No build file found"
     return None
 
-def has_tests(path: str, build_file: str, row: pd.Series) -> bool:
+def has_tests(path: str, build_file: str, updates: dict) -> bool:
     with open(build_file, "r") as f:
         content = f.read()
 
     for library in ["junit", "testng", "mockito"]:
         if library in content:
-            row["detected_source_of_tests"] = library + " library in build file"
+            updates["detected_source_of_tests"] = library + " library in build file"
             return True
 
     for keyword in ["testImplementation", "functionalTests", "bwc_tests_enabled"]:
         if keyword in content:
-            row["detected_source_of_tests"] = keyword + " keyword in build file"
+            updates["detected_source_of_tests"] = keyword + " keyword in build file"
             return False
 
     test_dirs = [
@@ -98,10 +98,10 @@ def has_tests(path: str, build_file: str, row: pd.Series) -> bool:
     ]
     for td in test_dirs:
         if os.path.exists(os.path.join(path, td)):
-            row["detected_source_of_tests"] = td + " dir exists in repo"
+            updates["detected_source_of_tests"] = td + " dir exists in repo"
             return True
 
-    row["error_msg"] = "No tests found"
+    updates["error_msg"] = "No tests found"
     return False
 
 def remove_dir(dir: str) -> None:
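Note on the signature change above: the helpers now receive a plain `updates: dict` instead of a `pd.Series` row. A dict is passed by reference, so anything the helper writes into it is visible to the caller that created it. A minimal sketch (the `helper` name is illustrative, not from the repo):

    def helper(updates: dict) -> None:
        # writes land in the caller's dict, no return value needed
        updates["error_msg"] = "No build file found"

    updates = {}
    helper(updates)
    print(updates)  # {'error_msg': 'No build file found'}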
@@ -117,37 +117,47 @@ def remove_dir(dir: str) -> None:
         shutil.rmtree(parent)
 
-def process_row(row, dest: str, force: bool = False, verbose: bool = False):
+def process_row(row, dest: str, force: bool = False, verbose: bool = False) -> dict:
+    updates = {} # Dictionary to store updates
+    with tqdm(total=3, leave=False) as pbar:
         repo = row["name"]
         if repo in EXCLUSION_LIST:
-            row["error_msg"] = "Repo in exclusion list"
+            updates["error_msg"] = "Repo in exclusion list"
             if verbose: print(f"Skipping {repo}, in exclusion list")
-            return
+            return updates
 
+        pbar.set_postfix_str("Cloning...")
         if force:
-            clone(repo, dest, row, verbose=verbose)
+            clone(repo, dest, updates, verbose=verbose)
+        pbar.update(1)
 
         repo_path = os.path.join(dest, repo)
         if not os.path.exists(repo_path):
-            row["error_msg"] = "Repo not cloned"
-            return
+            updates["error_msg"] = "Repo not cloned"
+            return updates
 
-        build_file = get_build_file(dest, repo, row)
+        pbar.set_postfix_str("Getting build file...")
+        build_file = get_build_file(dest, repo, updates)
         if build_file is None:
             if verbose: print(f"Removing {repo}, no build file")
             remove_dir(repo_path)
-            return
+            return updates
+        pbar.update(1)
 
-        if not has_tests(repo_path, build_file, row):
-            if verbose: print(f"Removing {repo}, no test suites")
-            remove_dir(repo_path)
-            return
+        pbar.set_postfix_str("Checking for tests...")
+        # if not has_tests(repo_path, build_file, updates):
+        #     if verbose: print(f"Removing {repo}, no test suites")
+        #     remove_dir(repo_path)
+        #     return
 
         # if verbose: print(f"Keeping {repo}")
+        pbar.update(1)
 
         # Check for compilation and tests
 
         # If repo was not removed, then it is a good repo
-        row["good_repo_for_crab"] = True
+        updates["good_repo_for_crab"] = True
+        return updates
 
 def clone_repos(file: str, dest: str, force: bool =False, verbose: bool = False) -> None:
     """
@@ -162,26 +172,48 @@ def clone_repos(file: str, dest: str, force: bool =False, verbose: bool = False)
     if verbose: print(f"Reading CSV file {file}")
     df = pd.read_csv(file)
 
-    df["successfully_cloned"] = None
-    df["build_system"] = None
-    df["depth_of_build_file"] = None
-    df["detected_source_of_tests"] = None
-    df["error_msg"] = None
-    df["good_repo_for_crab"] = False
-    df["n_tests"] = None
-    df["n_tests_with_grep"] = None
-    df["n_tests_passed"] = None
-    df["n_tests_failed"] = None
-    df["n_tests_skipped"] = None
+    # drop all columns besides the name
+    df = df[["name"]]
+
+    updates_list = [] # Collect updates in a list
+    good_repos = 0
 
     try:
         if verbose: print("Processing repositories")
-        df.progress_apply(lambda row: process_row(row, dest, force=force, verbose=verbose), axis=1)
+        with tqdm(total=len(df)) as pbar:
+            for i, row in df.iterrows():
+                updates = process_row(row, dest, force=force, verbose=verbose)
+                if "good_repo_for_crab" in updates and updates["good_repo_for_crab"]:
+                    good_repos += 1
+                pbar.update(1)
+                pbar.set_postfix({"repo": row["name"], "good_repos": good_repos}, refresh=True)
+                updates_list.append((i, updates)) # Collect updates
     except KeyboardInterrupt:
         print("Keyboard interrupt detected. Stopping the processing of the repos...")
 
+    # Create columns for the new data
+    df = df.assign(
+        successfully_cloned=None,
+        build_system=None,
+        depth_of_build_file=None,
+        detected_source_of_tests=None,
+        error_msg=None,
+        good_repo_for_crab=False,
+        n_tests=None,
+        n_tests_with_grep=None,
+        n_tests_passed=None,
+        n_tests_failed=None,
+        n_tests_skipped=None
+    )
+
+    # Set the new data
+    for index, updates in updates_list:
+        for col, value in updates.items():
+            df.at[index, col] = value # Batch updates to avoid fragmentation
+
     if verbose: print("Writing results...")
-    df.to_csv("results.csv.gz", index=False)
+    df.to_csv("results.csv", index=False)
 
 if __name__ == "__main__":
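The write-back in `clone_repos` follows a collect-then-assign pattern: create every result column once with `df.assign`, then fill individual cells from the collected `(index, updates)` pairs with `df.at`. A minimal sketch of the same pattern on dummy data:

    import pandas as pd

    df = pd.DataFrame({"name": ["owner/repo-a", "owner/repo-b"]})
    updates_list = [(0, {"good_repo_for_crab": True}),
                    (1, {"error_msg": "Repo in exclusion list"})]

    df = df.assign(good_repo_for_crab=False, error_msg=None)  # columns created in one go
    for index, updates in updates_list:
        for col, value in updates.items():
            df.at[index, col] = value  # scalar, label-based assignment

    print(df)

The inline comment in the diff suggests the motivation: creating the columns up front and batching the cell writes avoids the DataFrame fragmentation that repeated single-column insertions can cause.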